author    maxim-yurchuk <[email protected]>  2024-10-09 12:29:46 +0300
committer maxim-yurchuk <[email protected]>  2024-10-09 13:14:22 +0300
commit    9731d8a4bb7ee2cc8554eaf133bb85498a4c7d80 (patch)
tree      a8fb3181d5947c0d78cf402aa56e686130179049 /contrib/python/pandas/py2
parent    a44b779cd359f06c3ebbef4ec98c6b38609d9d85 (diff)
publishFullContrib: true for ydb
<HIDDEN_URL> commit_hash:c82a80ac4594723cebf2c7387dec9c60217f603e
Diffstat (limited to 'contrib/python/pandas/py2')
-rw-r--r--contrib/python/pandas/py2/pandas/_libs/algos_common_helper.pxi.in138
-rw-r--r--contrib/python/pandas/py2/pandas/_libs/algos_rank_helper.pxi.in368
-rw-r--r--contrib/python/pandas/py2/pandas/_libs/algos_take_helper.pxi.in298
-rw-r--r--contrib/python/pandas/py2/pandas/_libs/groupby_helper.pxi.in824
-rw-r--r--contrib/python/pandas/py2/pandas/_libs/hashtable_class_helper.pxi.in1109
-rw-r--r--contrib/python/pandas/py2/pandas/_libs/hashtable_func_helper.pxi.in352
-rw-r--r--contrib/python/pandas/py2/pandas/_libs/index_class_helper.pxi.in74
-rw-r--r--contrib/python/pandas/py2/pandas/_libs/intervaltree.pxi.in413
-rw-r--r--contrib/python/pandas/py2/pandas/_libs/sparse_op_helper.pxi.in308
-rw-r--r--contrib/python/pandas/py2/pandas/_libs/src/headers/ms_inttypes.h305
-rw-r--r--contrib/python/pandas/py2/pandas/conftest.py677
-rw-r--r--contrib/python/pandas/py2/pandas/tests/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/api/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/api/test_api.py165
-rw-r--r--contrib/python/pandas/py2/pandas/tests/api/test_types.py42
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arithmetic/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arithmetic/conftest.py192
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arithmetic/test_datetime64.py2334
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arithmetic/test_numeric.py1076
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arithmetic/test_object.py314
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arithmetic/test_period.py1213
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arithmetic/test_timedelta64.py1977
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/common.py10
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/conftest.py13
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_algos.py142
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_analytics.py303
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_api.py508
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_constructors.py586
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_dtypes.py177
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_indexing.py264
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_missing.py87
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_operators.py331
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_repr.py529
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_sorting.py124
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_subclass.py25
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_warnings.py31
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/interval/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/interval/test_interval.py68
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/interval/test_ops.py82
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/sparse/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_arithmetics.py538
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_array.py1203
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_dtype.py161
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_libsparse.py605
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/test_array.py256
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/test_datetimelike.py657
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/test_datetimes.py292
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/test_integer.py713
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/test_numpy.py206
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/test_period.py317
-rw-r--r--contrib/python/pandas/py2/pandas/tests/arrays/test_timedeltas.py161
-rw-r--r--contrib/python/pandas/py2/pandas/tests/computation/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/computation/test_compat.py47
-rw-r--r--contrib/python/pandas/py2/pandas/tests/computation/test_eval.py1924
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/cast/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_from_scalar.py22
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_ndarray.py20
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_object_arr.py22
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_convert_objects.py15
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_downcast.py82
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_find_common_type.py108
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_infer_datetimelike.py22
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_infer_dtype.py160
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/test_common.py653
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/test_concat.py53
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/test_dtypes.py890
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/test_generic.py95
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/test_inference.py1333
-rw-r--r--contrib/python/pandas/py2/pandas/tests/dtypes/test_missing.py498
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/arrow/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/arrow/bool.py144
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/arrow/test_bool.py68
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/__init__.py56
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/base.py10
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/casting.py23
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/constructors.py77
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/dtype.py91
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/getitem.py248
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/groupby.py83
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/interface.py68
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/io.py23
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/methods.py341
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/missing.py132
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/ops.py166
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/printing.py44
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/reduce.py61
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/reshaping.py271
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/base/setitem.py189
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/conftest.py110
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/decimal/__init__.py4
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/decimal/array.py166
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/decimal/test_decimal.py401
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/json/__init__.py3
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/json/array.py199
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/json/test_json.py304
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/numpy_/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/numpy_/conftest.py38
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/numpy_/test_numpy.py182
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/numpy_/test_numpy_nested.py286
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/test_categorical.py243
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/test_common.py86
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/test_datetime.py237
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/test_external_block.py76
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/test_integer.py224
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/test_interval.py162
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/test_period.py166
-rw-r--r--contrib/python/pandas/py2/pandas/tests/extension/test_sparse.py370
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/common.py141
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/conftest.py221
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_alter_axes.py1444
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_analytics.py2393
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_api.py534
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_apply.py1154
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_arithmetic.py636
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_asof.py126
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_axis_select_reindex.py1159
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_block_internals.py587
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_combine_concat.py863
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_constructors.py2316
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_convert_to.py504
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_dtypes.py989
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_duplicates.py466
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_indexing.py3684
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_join.py185
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_missing.py863
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_mutate_columns.py280
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_nonunique_indexes.py477
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_operators.py802
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_period.py147
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_quantile.py384
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_query_eval.py1040
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_rank.py318
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_replace.py1111
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_repr_info.py523
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_reshape.py968
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_sort_values_level_as_str.py96
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_sorting.py670
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_subclass.py573
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_timeseries.py899
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_timezones.py198
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_to_csv.py1234
-rw-r--r--contrib/python/pandas/py2/pandas/tests/frame/test_validate.py32
-rw-r--r--contrib/python/pandas/py2/pandas/tests/generic/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/generic/test_frame.py271
-rw-r--r--contrib/python/pandas/py2/pandas/tests/generic/test_generic.py1029
-rw-r--r--contrib/python/pandas/py2/pandas/tests/generic/test_label_or_level_utils.py406
-rw-r--r--contrib/python/pandas/py2/pandas/tests/generic/test_panel.py59
-rw-r--r--contrib/python/pandas/py2/pandas/tests/generic/test_series.py247
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/aggregate/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_aggregate.py289
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_cython.py218
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_other.py514
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/conftest.py78
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_apply.py542
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_bin_groupby.py157
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_categorical.py936
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_counting.py224
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_filters.py588
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_function.py1143
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_groupby.py1746
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_grouping.py838
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_index_as_string.py68
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_nth.py416
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_rank.py306
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_timegrouper.py652
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_transform.py847
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_value_counts.py76
-rw-r--r--contrib/python/pandas/py2/pandas/tests/groupby/test_whitelist.py297
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/common.py928
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/conftest.py49
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimelike.py101
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_arithmetic.py109
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_astype.py343
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_construction.py794
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_date_range.py842
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_datetime.py436
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_datetimelike.py31
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_formats.py221
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_indexing.py612
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_misc.py312
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_missing.py52
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_ops.py498
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_partial_slicing.py388
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_scalar_compat.py280
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_setops.py500
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_timezones.py1161
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_tools.py1841
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/interval/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/interval/test_astype.py206
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/interval/test_construction.py389
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval.py1246
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_new.py271
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_range.py316
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_tree.py184
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/conftest.py56
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_analytics.py328
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_astype.py32
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_compat.py131
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_constructor.py577
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_contains.py97
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_conversion.py224
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_copy.py93
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_drop.py128
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_duplicates.py278
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_equivalence.py221
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_format.py132
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_get_set.py454
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_indexing.py375
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_integrity.py293
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_join.py96
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_missing.py129
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_monotonic.py213
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_names.py124
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_partial_indexing.py98
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_reindex.py108
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_reshape.py126
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_set_ops.py372
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/multi/test_sorting.py266
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_arithmetic.py108
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_asfreq.py152
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_astype.py126
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_construction.py519
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_formats.py220
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_indexing.py637
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_ops.py329
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_partial_slicing.py132
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_period.py578
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_period_range.py95
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_scalar_compat.py18
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_setops.py281
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/period/test_tools.py345
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/test_base.py2587
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/test_category.py1161
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/test_common.py343
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/test_frozen.py109
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/test_numeric.py1091
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/test_range.py887
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_arithmetic.py265
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_astype.py110
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_construction.py199
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_formats.py96
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_indexing.py338
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_ops.py281
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_partial_slicing.py85
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_scalar_compat.py64
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_setops.py75
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_timedelta.py335
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_timedelta_range.py79
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_tools.py182
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/common.py307
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/conftest.py20
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/interval/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/interval/test_interval.py267
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/interval/test_interval_new.py246
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/conftest.py31
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_chaining_and_caching.py65
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_datetime.py22
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_getitem.py237
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_iloc.py151
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_indexing_slow.py89
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_ix.py56
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_loc.py378
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_multiindex.py86
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_panel.py103
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_partial.py183
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_set_ops.py42
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_setitem.py439
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_slice.py576
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_sorted.py92
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_xs.py237
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_callable.py268
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_categorical.py717
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_chaining_and_caching.py402
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_coercion.py939
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_datetime.py315
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_floats.py898
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_iloc.py677
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_indexing.py1015
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_indexing_engines.py169
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_indexing_slow.py17
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_ix.py314
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_loc.py767
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_panel.py214
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_partial.py620
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_scalar.py207
-rw-r--r--contrib/python/pandas/py2/pandas/tests/indexing/test_timedelta.py97
-rw-r--r--contrib/python/pandas/py2/pandas/tests/internals/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/internals/test_internals.py1296
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/conftest.py90
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/data/feather-0_3_1.featherbin0 -> 672 bytes
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/formats/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/formats/test_console.py92
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/formats/test_css.py187
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/formats/test_eng_formatting.py196
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/formats/test_format.py2794
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/formats/test_printing.py204
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/formats/test_style.py1315
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/formats/test_to_csv.py563
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/formats/test_to_excel.py278
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/formats/test_to_html.py602
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/formats/test_to_latex.py737
-rwxr-xr-xcontrib/python/pandas/py2/pandas/tests/io/generate_legacy_storage_files.py369
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/json/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/json/test_compression.py120
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/json/test_json_table_schema.py580
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/json/test_normalize.py462
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/json/test_pandas.py1274
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/json/test_readlines.py172
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/json/test_ujson.py1129
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/common.py9
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/data/frame.mpbin0 -> 309 bytes
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_buffer.py21
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_case.py115
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_except.py39
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_extension.py63
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_format.py91
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_limits.py109
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_obj.py74
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_pack.py162
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_read_size.py71
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_seq.py47
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_sequnpack.py104
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_subtype.py26
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_unpack.py67
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/msgpack/test_unpack_raw.py30
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/conftest.py85
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/data/items.jsonl2
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/data/tar_csv.tarbin0 -> 10240 bytes
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_c_parser_only.py591
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_comment.py136
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_common.py1946
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_compression.py154
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_converters.py158
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_dialect.py135
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_dtypes.py514
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_header.py428
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_index_col.py152
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_mangle_dupes.py119
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_multi_thread.py145
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_na_values.py441
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_network.py204
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_parse_dates.py849
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_python_parser_only.py301
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_quoting.py158
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_read_fwf.py580
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_skiprows.py222
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_textreader.py353
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_unsupported.py140
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/parser/test_usecols.py534
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/sas/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/sas/test_sas.py25
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/sas/test_sas7bdat.py227
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/sas/test_xport.py146
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_clipboard.py227
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_common.py357
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_compression.py116
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_date_converters.py43
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_excel.py2566
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_feather.py158
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_gbq.py153
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_gcs.py72
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_html.py1161
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_packers.py954
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_parquet.py541
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_pickle.py481
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_pytables.py5691
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_s3.py29
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_sql.py2708
-rw-r--r--contrib/python/pandas/py2/pandas/tests/io/test_stata.py1613
-rw-r--r--contrib/python/pandas/py2/pandas/tests/plotting/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/plotting/common.py544
-rw-r--r--contrib/python/pandas/py2/pandas/tests/plotting/test_boxplot_method.py385
-rw-r--r--contrib/python/pandas/py2/pandas/tests/plotting/test_converter.py346
-rw-r--r--contrib/python/pandas/py2/pandas/tests/plotting/test_datetimelike.py1563
-rw-r--r--contrib/python/pandas/py2/pandas/tests/plotting/test_frame.py3003
-rw-r--r--contrib/python/pandas/py2/pandas/tests/plotting/test_groupby.py75
-rw-r--r--contrib/python/pandas/py2/pandas/tests/plotting/test_hist_method.py439
-rw-r--r--contrib/python/pandas/py2/pandas/tests/plotting/test_misc.py356
-rw-r--r--contrib/python/pandas/py2/pandas/tests/plotting/test_series.py903
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reductions/__init__.py4
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reductions/test_reductions.py1159
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reductions/test_stat_reductions.py202
-rw-r--r--contrib/python/pandas/py2/pandas/tests/resample/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/resample/conftest.py142
-rw-r--r--contrib/python/pandas/py2/pandas/tests/resample/test_base.py228
-rw-r--r--contrib/python/pandas/py2/pandas/tests/resample/test_datetime_index.py1530
-rw-r--r--contrib/python/pandas/py2/pandas/tests/resample/test_period_index.py759
-rw-r--r--contrib/python/pandas/py2/pandas/tests/resample/test_resample_api.py544
-rw-r--r--contrib/python/pandas/py2/pandas/tests/resample/test_resampler_grouper.py260
-rw-r--r--contrib/python/pandas/py2/pandas/tests/resample/test_time_grouper.py287
-rw-r--r--contrib/python/pandas/py2/pandas/tests/resample/test_timedelta.py128
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/merge/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/merge/test_join.py880
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge.py1609
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_asof.py1038
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_index_as_string.py177
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_ordered.py103
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/merge/test_multi.py668
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/test_concat.py2600
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/test_cut.py458
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/test_melt.py718
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/test_pivot.py1798
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/test_qcut.py199
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/test_reshape.py621
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/test_union_categoricals.py346
-rw-r--r--contrib/python/pandas/py2/pandas/tests/reshape/test_util.py53
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/interval/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/interval/test_interval.py225
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/interval/test_ops.py60
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/period/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/period/test_asfreq.py747
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/period/test_period.py1495
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/test_nat.py341
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timedelta/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_arithmetic.py691
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_construction.py210
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_formats.py28
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_timedelta.py715
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timestamp/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_arithmetic.py117
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_comparisons.py168
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_rendering.py96
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_timestamp.py964
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_timezones.py389
-rw-r--r--contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_unary_ops.py364
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/common.py31
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/conftest.py42
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/indexing/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/indexing/conftest.py8
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/indexing/test_alter_index.py564
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/indexing/test_boolean.py634
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/indexing/test_callable.py33
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/indexing/test_datetime.py714
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/indexing/test_iloc.py37
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/indexing/test_indexing.py840
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/indexing/test_loc.py168
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/indexing/test_numeric.py259
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_alter_axes.py347
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_analytics.py1499
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_api.py719
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_apply.py667
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_arithmetic.py172
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_asof.py174
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_block_internals.py43
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_combine_concat.py373
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_constructors.py1266
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_datetime_values.py556
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_dtypes.py518
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_duplicates.py148
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_internals.py343
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_io.py267
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_missing.py1374
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_operators.py756
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_period.py166
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_quantile.py195
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_rank.py506
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_replace.py296
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_repr.py484
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_sorting.py266
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_subclass.py108
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_timeseries.py1099
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_timezones.py366
-rw-r--r--contrib/python/pandas/py2/pandas/tests/series/test_validate.py19
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/common.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/frame/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/frame/conftest.py115
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/frame/test_analytics.py39
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/frame/test_apply.py105
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/frame/test_frame.py1369
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/frame/test_indexing.py109
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/frame/test_to_csv.py21
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/frame/test_to_from_scipy.py185
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/series/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/series/test_indexing.py111
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/series/test_series.py1523
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/test_combine_concat.py462
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/test_format.py135
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/test_groupby.py70
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/test_indexing.py1058
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/test_pivot.py52
-rw-r--r--contrib/python/pandas/py2/pandas/tests/sparse/test_reshape.py42
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_algos.py1881
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_base.py1351
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_common.py118
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_compat.py98
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_config.py433
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_downstream.py135
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_errors.py74
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_expressions.py463
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_join.py236
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_lib.py207
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_multilevel.py2063
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_nanops.py1059
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_panel.py2621
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_register_accessor.py89
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_sorting.py435
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_strings.py3426
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_take.py468
-rw-r--r--contrib/python/pandas/py2/pandas/tests/test_window.py4073
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tools/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tools/test_numeric.py440
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tseries/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tseries/offsets/__init__.py1
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tseries/offsets/common.py25
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tseries/offsets/conftest.py21
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_fiscal.py657
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_offsets.py3143
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_offsets_properties.py108
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_ticks.py320
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_yqm_offsets.py1027
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tseries/test_frequencies.py793
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tseries/test_holiday.py382
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_api.py40
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_array_to_datetime.py156
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_ccalendar.py25
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_conversion.py68
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_libfrequencies.py100
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_liboffsets.py174
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_normalize_date.py18
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_parse_iso8601.py62
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_parsing.py186
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_period_asfreq.py87
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_timedeltas.py29
-rw-r--r--contrib/python/pandas/py2/pandas/tests/tslibs/test_timezones.py101
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/__init__.py0
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/conftest.py26
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_assert_almost_equal.py350
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_assert_categorical_equal.py92
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_assert_extension_array_equal.py102
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_assert_frame_equal.py209
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_assert_index_equal.py179
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_assert_interval_array_equal.py80
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_assert_numpy_array_equal.py177
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_assert_series_equal.py185
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_deprecate.py63
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_deprecate_kwarg.py93
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_hashing.py328
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_locale.py94
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_move.py79
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_safe_import.py45
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_util.py127
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_validate_args.py76
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_validate_args_and_kwargs.py105
-rw-r--r--contrib/python/pandas/py2/pandas/tests/util/test_validate_kwargs.py72
565 files changed, 232945 insertions, 0 deletions
diff --git a/contrib/python/pandas/py2/pandas/_libs/algos_common_helper.pxi.in b/contrib/python/pandas/py2/pandas/_libs/algos_common_helper.pxi.in
new file mode 100644
index 00000000000..91599fa223b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/algos_common_helper.pxi.in
@@ -0,0 +1,138 @@
+"""
+Template for each `dtype` helper function using 1-d template
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+{{py:
+
+# name, c_type, dest_type
+dtypes = [('float64', 'float64_t', 'float64_t'),
+ ('float32', 'float32_t', 'float32_t'),
+ ('int8', 'int8_t', 'float32_t'),
+ ('int16', 'int16_t', 'float32_t'),
+ ('int32', 'int32_t', 'float64_t'),
+ ('int64', 'int64_t', 'float64_t')]
+
+def get_dispatch(dtypes):
+
+ for name, c_type, dest_type, in dtypes:
+ dest_name = dest_type[:-2] # i.e. strip "_t"
+ yield name, c_type, dest_type, dest_name
+
+}}
+
+{{for name, c_type, dest_type, dest_name
+ in get_dispatch(dtypes)}}
+
+
+def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr,
+ ndarray[{{dest_type}}, ndim=2] out,
+ Py_ssize_t periods, int axis):
+ cdef:
+ Py_ssize_t i, j, sx, sy
+
+ sx, sy = (<object>arr).shape
+ if arr.flags.f_contiguous:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for j in range(sy):
+ for i in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for j in range(start, stop):
+ for i in range(sx):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+ else:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for i in range(start, stop):
+ for j in range(sy):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for i in range(sx):
+ for j in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+
+{{endfor}}
+
+# ----------------------------------------------------------------------
+# ensure_dtype
+# ----------------------------------------------------------------------
+
+cdef int PLATFORM_INT = (<ndarray>np.arange(0, dtype=np.intp)).descr.type_num
+
+
+def ensure_platform_int(object arr):
+ # GH3033, GH1392
+ # platform int is the size of the int pointer, e.g. np.intp
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == PLATFORM_INT:
+ return arr
+ else:
+ return arr.astype(np.intp)
+ else:
+ return np.array(arr, dtype=np.intp)
+
+
+def ensure_object(object arr):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_OBJECT:
+ return arr
+ else:
+ return arr.astype(np.object_)
+ else:
+ return np.array(arr, dtype=np.object_)
+
+{{py:
+
+# name, c_type, dtype
+dtypes = [('float64', 'FLOAT64', 'float64'),
+ ('float32', 'FLOAT32', 'float32'),
+ ('int8', 'INT8', 'int8'),
+ ('int16', 'INT16', 'int16'),
+ ('int32', 'INT32', 'int32'),
+ ('int64', 'INT64', 'int64'),
+ ('uint8', 'UINT8', 'uint8'),
+ ('uint16', 'UINT16', 'uint16'),
+ ('uint32', 'UINT32', 'uint32'),
+ ('uint64', 'UINT64', 'uint64'),
+ # ('platform_int', 'INT', 'int_'),
+ # ('object', 'OBJECT', 'object_'),
+]
+
+def get_dispatch(dtypes):
+
+ for name, c_type, dtype in dtypes:
+ yield name, c_type, dtype
+}}
+
+{{for name, c_type, dtype in get_dispatch(dtypes)}}
+
+
+def ensure_{{name}}(object arr, copy=True):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_{{c_type}}:
+ return arr
+ else:
+ return arr.astype(np.{{dtype}}, copy=copy)
+ else:
+ return np.array(arr, dtype=np.{{dtype}})
+
+{{endfor}}
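
Editor's aside (not part of the diff): the .pxi.in files in this commit are Tempita templates. The {{py: ...}} block runs at code-generation time, and each {{for ...}}/{{endfor}} stanza above is expanded once per dtype into the .pxi file that Cython actually compiles. As a hedged sketch of what one generated ensure_* helper does, here is the same logic in plain NumPy; the name ensure_int64_sketch is ours, and the behavior is read off the template source above rather than any pandas API:

    import numpy as np

    def ensure_int64_sketch(arr, copy=True):
        # Already an int64 ndarray: hand it back untouched (no copy).
        if isinstance(arr, np.ndarray):
            if arr.dtype == np.int64:
                return arr
            # ndarray of another dtype: cast, honoring the copy flag.
            return arr.astype(np.int64, copy=copy)
        # Not an ndarray (list, tuple, ...): materialize one.
        return np.array(arr, dtype=np.int64)

    # ensure_int64_sketch([1, 2, 3]) -> array([1, 2, 3]), dtype int64

The generated code additionally compares the C-level type number ((<ndarray>arr).descr.type_num) rather than Python dtype objects, keeping the already-correct-dtype check cheap.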
diff --git a/contrib/python/pandas/py2/pandas/_libs/algos_rank_helper.pxi.in b/contrib/python/pandas/py2/pandas/_libs/algos_rank_helper.pxi.in
new file mode 100644
index 00000000000..5dac94394c7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/algos_rank_helper.pxi.in
@@ -0,0 +1,368 @@
+"""
+Template for each `dtype` helper function for rank
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# rank_1d, rank_2d
+# ----------------------------------------------------------------------
+
+{{py:
+
+# dtype ctype pos_nan_value neg_nan_value
+dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'),
+ ('float64', 'float64_t', 'np.inf', '-np.inf'),
+ ('uint64', 'uint64_t', '', ''),
+ ('int64', 'int64_t', 'np.iinfo(np.int64).max',
+ 'np.iinfo(np.int64).min')]
+
+}}
+
+{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}}
+
+
+def rank_1d_{{dtype}}(object in_arr, ties_method='average',
+ ascending=True, na_option='keep', pct=False):
+ """
+ Fast NaN-friendly version of scipy.stats.rankdata
+ """
+
+ cdef:
+ Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0
+
+ {{if dtype == 'object'}}
+ ndarray sorted_data, values
+ {{else}}
+ ndarray[{{ctype}}] sorted_data, values
+ {{endif}}
+
+ ndarray[float64_t] ranks
+ ndarray[int64_t] argsorted
+ ndarray[uint8_t, cast=True] sorted_mask
+
+ {{if dtype == 'uint64'}}
+ {{ctype}} val
+ {{else}}
+ {{ctype}} val, nan_value
+ {{endif}}
+
+ float64_t sum_ranks = 0
+ int tiebreak = 0
+ bint keep_na = 0
+ bint isnan
+ float64_t count = 0.0
+ tiebreak = tiebreakers[ties_method]
+
+ {{if dtype == 'float64'}}
+ values = np.asarray(in_arr).copy()
+ {{elif dtype == 'object'}}
+ values = np.array(in_arr, copy=True)
+
+ if values.dtype != np.object_:
+ values = values.astype('O')
+ {{else}}
+ values = np.asarray(in_arr)
+ {{endif}}
+
+ keep_na = na_option == 'keep'
+
+ {{if dtype == 'object'}}
+ mask = missing.isnaobj(values)
+ {{elif dtype == 'float64'}}
+ mask = np.isnan(values)
+ {{elif dtype == 'int64'}}
+ mask = values == NPY_NAT
+
+ # create copy in case of NPY_NAT
+ # values are mutated inplace
+ if mask.any():
+ values = values.copy()
+ {{endif}}
+
+ # double sort first by mask and then by values to ensure nan values are
+ # either at the beginning or the end. mask/(~mask) controls padding at
+ # tail or the head
+ {{if dtype != 'uint64'}}
+ if ascending ^ (na_option == 'top'):
+ nan_value = {{pos_nan_value}}
+ order = (values, mask)
+ else:
+ nan_value = {{neg_nan_value}}
+ order = (values, ~mask)
+ np.putmask(values, mask, nan_value)
+ {{else}}
+ mask = np.zeros(shape=len(values), dtype=bool)
+ order = (values, mask)
+ {{endif}}
+
+ n = len(values)
+ ranks = np.empty(n, dtype='f8')
+
+ {{if dtype == 'object'}}
+ _as = np.lexsort(keys=order)
+ {{else}}
+ if tiebreak == TIEBREAK_FIRST:
+ # need to use a stable sort here
+ _as = np.lexsort(keys=order)
+ if not ascending:
+ tiebreak = TIEBREAK_FIRST_DESCENDING
+ else:
+ _as = np.lexsort(keys=order)
+ {{endif}}
+
+ if not ascending:
+ _as = _as[::-1]
+
+ sorted_data = values.take(_as)
+ sorted_mask = mask.take(_as)
+ _indices = np.diff(sorted_mask.astype(int)).nonzero()[0]
+ non_na_idx = _indices[0] if len(_indices) > 0 else -1
+ argsorted = _as.astype('i8')
+
+ {{if dtype == 'object'}}
+ if True:
+ {{else}}
+ with nogil:
+ {{endif}}
+ # TODO: why does the 2d version not have a nogil block?
+ for i in range(n):
+ sum_ranks += i + 1
+ dups += 1
+
+ {{if dtype == 'object'}}
+ val = util.get_value_at(sorted_data, i)
+ {{else}}
+ val = sorted_data[i]
+ {{endif}}
+
+ {{if dtype != 'uint64'}}
+ isnan = sorted_mask[i]
+ if isnan and keep_na:
+ ranks[argsorted[i]] = NaN
+ continue
+ {{endif}}
+
+ count += 1.0
+
+ {{if dtype == 'object'}}
+ if (i == n - 1 or
+ are_diff(util.get_value_at(sorted_data, i + 1), val) or
+ i == non_na_idx):
+ {{else}}
+ if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx):
+ {{endif}}
+
+ if tiebreak == TIEBREAK_AVERAGE:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = sum_ranks / dups
+ elif tiebreak == TIEBREAK_MIN:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = i - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = i + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ {{if dtype == 'object'}}
+ raise ValueError('first not supported for '
+ 'non-numeric data')
+ {{else}}
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = j + 1
+ {{endif}}
+ elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = 2 * i - j - dups + 2
+ elif tiebreak == TIEBREAK_DENSE:
+ total_tie_count += 1
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = total_tie_count
+ sum_ranks = dups = 0
+ if pct:
+ if tiebreak == TIEBREAK_DENSE:
+ return ranks / total_tie_count
+ else:
+ return ranks / count
+ else:
+ return ranks
+
+
+def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
+ ascending=True, na_option='keep', pct=False):
+ """
+ Fast NaN-friendly version of scipy.stats.rankdata
+ """
+
+ cdef:
+ Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
+
+ {{if dtype == 'object'}}
+ Py_ssize_t infs
+ {{endif}}
+
+ ndarray[float64_t, ndim=2] ranks
+ {{if dtype == 'int64' or dtype == 'uint64'}}
+ ndarray[{{ctype}}, ndim=2, cast=True] values
+ {{else}}
+ ndarray[{{ctype}}, ndim=2] values
+ {{endif}}
+
+ ndarray[int64_t, ndim=2] argsorted
+
+ {{if dtype == 'uint64'}}
+ {{ctype}} val
+ {{else}}
+ {{ctype}} val, nan_value
+ {{endif}}
+
+ float64_t sum_ranks = 0
+ int tiebreak = 0
+ bint keep_na = 0
+ float64_t count = 0.0
+
+ tiebreak = tiebreakers[ties_method]
+
+ keep_na = na_option == 'keep'
+
+ in_arr = np.asarray(in_arr)
+
+ if axis == 0:
+ values = in_arr.T.copy()
+ else:
+ values = in_arr.copy()
+
+ {{if dtype == 'object'}}
+ if values.dtype != np.object_:
+ values = values.astype('O')
+ {{endif}}
+
+ {{if dtype != 'uint64'}}
+ if ascending ^ (na_option == 'top'):
+ nan_value = {{pos_nan_value}}
+ else:
+ nan_value = {{neg_nan_value}}
+
+ {{if dtype == 'object'}}
+ mask = missing.isnaobj2d(values)
+ {{elif dtype == 'float64'}}
+ mask = np.isnan(values)
+ {{elif dtype == 'int64'}}
+ mask = values == NPY_NAT
+ {{endif}}
+
+ np.putmask(values, mask, nan_value)
+ {{endif}}
+
+ n, k = (<object>values).shape
+ ranks = np.empty((n, k), dtype='f8')
+
+ {{if dtype == 'object'}}
+ try:
+ _as = values.argsort(1)
+ except TypeError:
+ values = in_arr
+ for i in range(len(values)):
+ ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method,
+ ascending=ascending, pct=pct)
+ if axis == 0:
+ return ranks.T
+ else:
+ return ranks
+ {{else}}
+ if tiebreak == TIEBREAK_FIRST:
+ # need to use a stable sort here
+ _as = values.argsort(axis=1, kind='mergesort')
+ if not ascending:
+ tiebreak = TIEBREAK_FIRST_DESCENDING
+ else:
+ _as = values.argsort(1)
+ {{endif}}
+
+ if not ascending:
+ _as = _as[:, ::-1]
+
+ values = _take_2d_{{dtype}}(values, _as)
+ argsorted = _as.astype('i8')
+
+ for i in range(n):
+ {{if dtype == 'object'}}
+ dups = sum_ranks = infs = 0
+ {{else}}
+ dups = sum_ranks = 0
+ {{endif}}
+
+ total_tie_count = 0
+ count = 0.0
+ for j in range(k):
+ {{if dtype != 'object'}}
+ sum_ranks += j + 1
+ dups += 1
+ {{endif}}
+
+ val = values[i, j]
+
+ {{if dtype != 'uint64'}}
+ {{if dtype == 'object'}}
+ if (val is nan_value) and keep_na:
+ {{else}}
+ if (val == nan_value) and keep_na:
+ {{endif}}
+ ranks[i, argsorted[i, j]] = NaN
+
+ {{if dtype == 'object'}}
+ infs += 1
+ {{endif}}
+
+ continue
+ {{endif}}
+
+ count += 1.0
+
+ {{if dtype == 'object'}}
+ sum_ranks += (j - infs) + 1
+ dups += 1
+ {{endif}}
+
+ {{if dtype == 'object'}}
+ if j == k - 1 or are_diff(values[i, j + 1], val):
+ {{else}}
+ if j == k - 1 or values[i, j + 1] != val:
+ {{endif}}
+ if tiebreak == TIEBREAK_AVERAGE:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = sum_ranks / dups
+ elif tiebreak == TIEBREAK_MIN:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = j - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = j + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ {{if dtype == 'object'}}
+ raise ValueError('first not supported '
+ 'for non-numeric data')
+ {{else}}
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = z + 1
+ {{endif}}
+ elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
+ elif tiebreak == TIEBREAK_DENSE:
+ total_tie_count += 1
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = total_tie_count
+ sum_ranks = dups = 0
+ if pct:
+ if tiebreak == TIEBREAK_DENSE:
+ ranks[i, :] /= total_tie_count
+ else:
+ ranks[i, :] /= count
+ if axis == 0:
+ return ranks.T
+ else:
+ return ranks
+
+{{endfor}}
diff --git a/contrib/python/pandas/py2/pandas/_libs/algos_take_helper.pxi.in b/contrib/python/pandas/py2/pandas/_libs/algos_take_helper.pxi.in
new file mode 100644
index 00000000000..2fea8b17fd9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/algos_take_helper.pxi.in
@@ -0,0 +1,298 @@
+"""
+Template for each `dtype` helper function for take
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# take_1d, take_2d
+# ----------------------------------------------------------------------
+
+{{py:
+
+# name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil
+dtypes = [
+ ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True, True),
+ ('bool', 'object', 'uint8_t', 'object',
+ 'True if ', ' > 0 else False', False, False),
+ ('int8', 'int8', 'int8_t', 'int8_t', '', '', True, False),
+ ('int8', 'int32', 'int8_t', 'int32_t', '', '', False, True),
+ ('int8', 'int64', 'int8_t', 'int64_t', '', '', False, True),
+ ('int8', 'float64', 'int8_t', 'float64_t', '', '', False, True),
+ ('int16', 'int16', 'int16_t', 'int16_t', '', '', True, True),
+ ('int16', 'int32', 'int16_t', 'int32_t', '', '', False, True),
+ ('int16', 'int64', 'int16_t', 'int64_t', '', '', False, True),
+ ('int16', 'float64', 'int16_t', 'float64_t', '', '', False, True),
+ ('int32', 'int32', 'int32_t', 'int32_t', '', '', True, True),
+ ('int32', 'int64', 'int32_t', 'int64_t', '', '', False, True),
+ ('int32', 'float64', 'int32_t', 'float64_t', '', '', False, True),
+ ('int64', 'int64', 'int64_t', 'int64_t', '', '', True, True),
+ ('int64', 'float64', 'int64_t', 'float64_t', '', '', False, True),
+ ('float32', 'float32', 'float32_t', 'float32_t', '', '', True, True),
+ ('float32', 'float64', 'float32_t', 'float64_t', '', '', False, True),
+ ('float64', 'float64', 'float64_t', 'float64_t', '', '', True, True),
+ ('object', 'object', 'object', 'object', '', '', False, False)]
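+# preval/postval wrap the element access in the generated take; e.g. the
+# (bool -> object) pair above renders "True if values[idx] > 0 else False"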
+
+
+def get_dispatch(dtypes):
+
+ inner_take_1d_template = """
+ cdef:
+ Py_ssize_t i, n, idx
+ %(c_type_out)s fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ %(nogil_str)s
+ %(tab)sfor i in range(n):
+ %(tab)s idx = indexer[i]
+ %(tab)s if idx == -1:
+ %(tab)s out[i] = fv
+ %(tab)s else:
+ %(tab)s out[i] = %(preval)svalues[idx]%(postval)s
+"""
+
+ inner_take_2d_axis0_template = """\
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ %(c_type_out)s fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF %(can_copy)s:
+ cdef:
+ %(c_type_out)s *v
+ %(c_type_out)s *o
+
+    # GH3130: fast path - when rows are contiguous and wide enough,
+    # copy each whole row with a single memmove instead of element-wise
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(%(c_type_out)s) and
+ sizeof(%(c_type_out)s) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(%(c_type_out)s) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = %(preval)svalues[idx, j]%(postval)s
+"""
+
+ inner_take_2d_axis1_template = """\
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ %(c_type_out)s fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = %(preval)svalues[i, idx]%(postval)s
+"""
+
+ for (name, dest, c_type_in, c_type_out, preval, postval,
+ can_copy, nogil) in dtypes:
+ if nogil:
+ nogil_str = "with nogil:"
+ tab = ' '
+ else:
+ nogil_str = ''
+ tab = ''
+
+ args = dict(name=name, dest=dest, c_type_in=c_type_in,
+ c_type_out=c_type_out, preval=preval, postval=postval,
+ can_copy=can_copy, nogil_str=nogil_str, tab=tab)
+
+ inner_take_1d = inner_take_1d_template % args
+ inner_take_2d_axis0 = inner_take_2d_axis0_template % args
+ inner_take_2d_axis1 = inner_take_2d_axis1_template % args
+
+ yield (name, dest, c_type_in, c_type_out, preval, postval, can_copy,
+ inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1)
+
+}}
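+# Each rendered inner body is spliced in twice below: once into the
+# memoryview fast path and once into the readonly-ndarray fallback.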
+
+
+{{for name, dest, c_type_in, c_type_out, preval, postval, can_copy,
+ inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1
+ in get_dispatch(dtypes)}}
+
+
+cdef inline take_1d_{{name}}_{{dest}}_memview({{c_type_in}}[:] values,
+ int64_t[:] indexer,
+ {{c_type_out}}[:] out,
+ fill_value=np.nan):
+
+
+{{inner_take_1d}}
+
+
+def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values,
+ int64_t[:] indexer,
+ {{c_type_out}}[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_{{name}}_{{dest}}_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+{{inner_take_1d}}
+
+
+cdef inline take_2d_axis0_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values,
+ int64_t[:] indexer,
+ {{c_type_out}}[:, :] out,
+ fill_value=np.nan):
+{{inner_take_2d_axis0}}
+
+
+def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
+ ndarray[int64_t] indexer,
+ {{c_type_out}}[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_{{name}}_{{dest}}_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+{{inner_take_2d_axis0}}
+
+
+cdef inline take_2d_axis1_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values,
+ int64_t[:] indexer,
+ {{c_type_out}}[:, :] out,
+ fill_value=np.nan):
+{{inner_take_2d_axis1}}
+
+
+def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
+ ndarray[int64_t] indexer,
+ {{c_type_out}}[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_{{name}}_{{dest}}_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+{{inner_take_2d_axis1}}
+
+
+def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
+ indexer,
+ ndarray[{{c_type_out}}, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ {{c_type_out}} fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = {{preval}}values[idx, idx1[j]]{{postval}}
+
+{{endfor}}
+
+# ----------------------------------------------------------------------
+# take_2d internal function
+# ----------------------------------------------------------------------
+
+ctypedef fused take_t:
+ float64_t
+ uint64_t
+ int64_t
+ object
+
+
+cdef _take_2d(ndarray[take_t, ndim=2] values, object idx):
+ cdef:
+ Py_ssize_t i, j, N, K
+ ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx
+ ndarray[take_t, ndim=2] result
+ object val
+
+ N, K = (<object>values).shape
+
+ if take_t is object:
+ # evaluated at compile-time
+ result = values.copy()
+ else:
+ result = np.empty_like(values)
+
+ for i in range(N):
+ for j in range(K):
+ result[i, j] = values[i, indexer[i, j]]
+ return result
+
+
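+# indexing the fused-type function selects a concrete specialization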
+_take_2d_object = _take_2d[object]
+_take_2d_float64 = _take_2d[float64_t]
+_take_2d_int64 = _take_2d[int64_t]
+_take_2d_uint64 = _take_2d[uint64_t]
diff --git a/contrib/python/pandas/py2/pandas/_libs/groupby_helper.pxi.in b/contrib/python/pandas/py2/pandas/_libs/groupby_helper.pxi.in
new file mode 100644
index 00000000000..abac9f14784
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/groupby_helper.pxi.in
@@ -0,0 +1,824 @@
+"""
+Template for each `dtype` helper function using groupby
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+cdef extern from "numpy/npy_math.h":
+ float64_t NAN "NPY_NAN"
+_int64_max = np.iinfo(np.int64).max
+
+# ----------------------------------------------------------------------
+# group_add, group_prod, group_var, group_mean, group_ohlc
+# ----------------------------------------------------------------------
+
+{{py:
+
+# name, c_type
+dtypes = [('float64', 'float64_t'),
+ ('float32', 'float32_t')]
+
+def get_dispatch(dtypes):
+
+ for name, c_type in dtypes:
+ yield name, c_type
+}}
+
+{{for name, c_type in get_dispatch(dtypes)}}
+
+
+def group_add_{{name}}(ndarray[{{c_type}}, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[{{c_type}}, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=0):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ {{c_type}} val, count
+ ndarray[{{c_type}}, ndim=2] sumx, nobs
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ sumx = np.zeros_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
+ sumx[lab, j] += val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] < min_count:
+ out[i, j] = NAN
+ else:
+ out[i, j] = sumx[i, j]
+
+
+def group_prod_{{name}}(ndarray[{{c_type}}, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[{{c_type}}, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=0):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ {{c_type}} val, count
+ ndarray[{{c_type}}, ndim=2] prodx, nobs
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ prodx = np.ones_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
+ prodx[lab, j] *= val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] < min_count:
+ out[i, j] = NAN
+ else:
+ out[i, j] = prodx[i, j]
+
+
+def group_var_{{name}}(ndarray[{{c_type}}, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[{{c_type}}, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ {{c_type}} val, ct, oldmean
+ ndarray[{{c_type}}, ndim=2] nobs, mean
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ mean = np.zeros_like(out)
+
+ N, K = (<object>values).shape
+
+ out[:, :] = 0.0
+
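+    # Welford's streaming algorithm: update the running mean and the
+    # accumulated sum of squared differences in one pass; dividing by
+    # (count - 1) below yields the sample variance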
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
+ oldmean = mean[lab, j]
+ mean[lab, j] += (val - oldmean) / nobs[lab, j]
+ out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
+
+ for i in range(ncounts):
+ for j in range(K):
+ ct = nobs[i, j]
+ if ct < 2:
+ out[i, j] = NAN
+ else:
+ out[i, j] /= (ct - 1)
+# TODO: support passing bin edges instead of labels
+
+
+def group_mean_{{name}}(ndarray[{{c_type}}, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[{{c_type}}, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ {{c_type}} val, count
+ ndarray[{{c_type}}, ndim=2] sumx, nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ sumx = np.zeros_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
+ sumx[lab, j] += val
+
+ for i in range(ncounts):
+ for j in range(K):
+ count = nobs[i, j]
+ if nobs[i, j] == 0:
+ out[i, j] = NAN
+ else:
+ out[i, j] = sumx[i, j] / count
+
+
+def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[{{c_type}}, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab
+ {{c_type}} val, count
+ Py_ssize_t ngroups = len(counts)
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if len(labels) == 0:
+ return
+
+ N, K = (<object>values).shape
+
+ if out.shape[1] != 4:
+ raise ValueError('Output array must have 4 columns')
+
+ if K > 1:
+ raise NotImplementedError("Argument 'values' must have only "
+ "one dimension")
+ out[:] = np.nan
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab == -1:
+ continue
+
+ counts[lab] += 1
+ val = values[i, 0]
+ if val != val:
+ continue
+
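+            # a NaN slot means the group is not seeded yet: the first
+            # valid value initializes open/high/low/close; later values
+            # update high/low and overwrite close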
+ if out[lab, 0] != out[lab, 0]:
+ out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
+ else:
+ out[lab, 1] = max(out[lab, 1], val)
+ out[lab, 2] = min(out[lab, 2], val)
+ out[lab, 3] = val
+
+{{endfor}}
+
+# ----------------------------------------------------------------------
+# group_nth, group_last, group_rank
+# ----------------------------------------------------------------------
+
+{{py:
+
+# name, c_type, nan_val
+dtypes = [('float64', 'float64_t', 'NAN'),
+ ('float32', 'float32_t', 'NAN'),
+ ('int64', 'int64_t', 'NPY_NAT'),
+ ('object', 'object', 'NAN')]
+
+def get_dispatch(dtypes):
+
+ for name, c_type, nan_val in dtypes:
+
+ yield name, c_type, nan_val
+}}
+
+
+{{for name, c_type, nan_val in get_dispatch(dtypes)}}
+
+
+def group_last_{{name}}(ndarray[{{c_type}}, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[{{c_type}}, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ {{c_type}} val
+ ndarray[{{c_type}}, ndim=2] resx
+ ndarray[int64_t, ndim=2] nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros((<object>out).shape, dtype=np.int64)
+ {{if name == 'object'}}
+ resx = np.empty((<object>out).shape, dtype=object)
+ {{else}}
+ resx = np.empty_like(out)
+ {{endif}}
+
+ N, K = (<object>values).shape
+
+ {{if name == "object"}}
+ if True: # make templating happy
+ {{else}}
+ with nogil:
+ {{endif}}
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val and val != {{nan_val}}:
+ nobs[lab, j] += 1
+ resx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = {{nan_val}}
+ else:
+ out[i, j] = resx[i, j]
+
+
+def group_nth_{{name}}(ndarray[{{c_type}}, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[{{c_type}}, ndim=2] values,
+ ndarray[int64_t] labels, int64_t rank,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ {{c_type}} val
+ ndarray[{{c_type}}, ndim=2] resx
+ ndarray[int64_t, ndim=2] nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros((<object>out).shape, dtype=np.int64)
+ {{if name=='object'}}
+ resx = np.empty((<object>out).shape, dtype=object)
+ {{else}}
+ resx = np.empty_like(out)
+ {{endif}}
+
+ N, K = (<object>values).shape
+
+ {{if name == "object"}}
+ if True: # make templating happy
+ {{else}}
+ with nogil:
+ {{endif}}
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val and val != {{nan_val}}:
+ nobs[lab, j] += 1
+ if nobs[lab, j] == rank:
+ resx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = {{nan_val}}
+ else:
+ out[i, j] = resx[i, j]
+
+
+{{if name != 'object'}}
+
+
+def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
+ ndarray[{{c_type}}, ndim=2] values,
+ ndarray[int64_t] labels,
+ bint is_datetimelike, object ties_method,
+ bint ascending, bint pct, object na_option):
+ """
+ Provides the rank of values within each group.
+
+ Parameters
+ ----------
+    out : array of float64_t values to which this method will write its
+        results
+    values : array of {{c_type}} values to be ranked
+    labels : array containing the unique label for each group, ordered to
+        match the corresponding records in `values`
+ is_datetimelike : bool, default False
+ unused in this method but provided for call compatibility with other
+ Cython transformations
+ ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+ 'average'
+ * average: average rank of group
+ * min: lowest rank in group
+ * max: highest rank in group
+ * first: ranks assigned in order they appear in the array
+ * dense: like 'min', but rank always increases by 1 between groups
+    ascending : boolean, default True
+        If False, ranks are assigned from high (1) to low (N)
+    pct : boolean, default False
+        Compute percentage rank of data within each group
+ na_option : {'keep', 'top', 'bottom'}, default 'keep'
+ * keep: leave NA values where they are
+ * top: smallest rank if ascending
+ * bottom: smallest rank if descending
+
+ Notes
+ -----
+ This method modifies the `out` parameter rather than returning an object
+ """
+ cdef:
+ TiebreakEnumType tiebreak
+ Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0
+ Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
+ ndarray[int64_t] _as
+ ndarray[float64_t, ndim=2] grp_sizes
+ ndarray[{{c_type}}] masked_vals
+ ndarray[uint8_t] mask
+ bint keep_na
+ {{c_type}} nan_fill_val
+
+ tiebreak = tiebreakers[ties_method]
+ keep_na = na_option == 'keep'
+ N, K = (<object>values).shape
+ grp_sizes = np.ones_like(out)
+
+    # Copy values into a new array so that missing data can be filled in
+    # via the mask without obscuring where the missing values sit in the
+    # original values array
+ masked_vals = np.array(values[:, 0], copy=True)
+ {{if name == 'int64'}}
+ mask = (masked_vals == {{nan_val}}).astype(np.uint8)
+ {{else}}
+ mask = np.isnan(masked_vals).astype(np.uint8)
+ {{endif}}
+
+ if ascending ^ (na_option == 'top'):
+ {{if name == 'int64'}}
+ nan_fill_val = np.iinfo(np.int64).max
+ {{else}}
+ nan_fill_val = np.inf
+ {{endif}}
+ order = (masked_vals, mask, labels)
+ else:
+ {{if name == 'int64'}}
+ nan_fill_val = np.iinfo(np.int64).min
+ {{else}}
+ nan_fill_val = -np.inf
+ {{endif}}
+ order = (masked_vals, ~mask, labels)
+ np.putmask(masked_vals, mask, nan_fill_val)
+
+    # np.lexsort sorts on its last key first, so this sorts primarily by
+    # labels, then by the mask, then by the actual values. Each label
+    # corresponds to a different group; the mask separates missing values
+    # from valid ones before the sort falls back to the values themselves
+ _as = np.lexsort(order).astype(np.int64, copy=False)
+
+ if not ascending:
+ _as = _as[::-1]
+
+ with nogil:
+        # Loop over the length of the value array. Each index i can be
+        # looked up in the _as array computed above, which gives the
+        # position of the i-th sorted value in the original
+        # values / masked_vals arrays
+ for i in range(N):
+            # dups and sum_ranks are incremented on each iteration while
+            # the value / group remains the same, and are reset when
+            # either changes; both feed the tiebreaker calculations
+ dups += 1
+ sum_ranks += i - grp_start + 1
+
+            # Update out only when there is a transition of values or labels.
+            # When a new value or group is encountered, go back dups steps
+            # (the number of occurrences of the current value) and assign
+            # the ranks based on the starting index of the current group
+            # (grp_start) and the current index
+ if (i == N - 1 or
+ (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+ (mask[_as[i]] ^ mask[_as[i+1]]) or
+ (labels[_as[i]] != labels[_as[i+1]])):
+ # if keep_na, check for missing values and assign back
+ # to the result where appropriate
+ if keep_na and mask[_as[i]]:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = NaN
+ grp_na_count = dups
+ elif tiebreak == TIEBREAK_AVERAGE:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = sum_ranks / <float64_t>dups
+ elif tiebreak == TIEBREAK_MIN:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = i - grp_start - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = i - grp_start + 1
+ elif tiebreak == TIEBREAK_FIRST:
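+                    # 'first' ranks ties in order of appearance; for
+                    # descending order the traversal was reversed, so
+                    # mirror j across the tie block to preserve it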
+ for j in range(i - dups + 1, i + 1):
+ if ascending:
+ out[_as[j], 0] = j + 1 - grp_start
+ else:
+ out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start
+ elif tiebreak == TIEBREAK_DENSE:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = grp_vals_seen
+
+                # Look ahead to the next value (using the sort order in
+                # _as). If it differs from the current value, reset dups
+                # and sum_ranks for the upcoming value. The conditional
+                # also handles NaN equality and the end of iteration
+ if (i == N - 1 or
+ (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+ (mask[_as[i]] ^ mask[_as[i+1]])):
+ dups = sum_ranks = 0
+ grp_vals_seen += 1
+ grp_tie_count += 1
+
+            # Similar to the previous conditional, check now if we are
+            # moving to a new group. If so, record the index where the new
+            # group starts so the tiebreaker calculations can subtract it
+            # from their position. Fill in the size of each group
+            # encountered (used by the pct calculations later), and reset
+            # the counters that track dups
+ if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+ if tiebreak != TIEBREAK_DENSE:
+ for j in range(grp_start, i + 1):
+ grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+ grp_na_count)
+ else:
+ for j in range(grp_start, i + 1):
+ grp_sizes[_as[j], 0] = (grp_tie_count -
+ (grp_na_count > 0))
+ dups = sum_ranks = 0
+ grp_na_count = 0
+ grp_tie_count = 0
+ grp_start = i + 1
+ grp_vals_seen = 1
+
+ if pct:
+ for i in range(N):
+            # NaN values are excluded from percentage rankings,
+            # so their percentage rank is NaN as well.
+ if out[i, 0] != out[i, 0] or out[i, 0] == NAN:
+ out[i, 0] = NAN
+ elif grp_sizes[i, 0] != 0:
+ out[i, 0] = out[i, 0] / grp_sizes[i, 0]
+{{endif}}
+{{endfor}}
+
+
+# ----------------------------------------------------------------------
+# group_min, group_max
+# ----------------------------------------------------------------------
+
+# TODO: consider implementing for more dtypes
+ctypedef fused groupby_t:
+ float64_t
+ float32_t
+ int64_t
+
+
+def group_max(ndarray[groupby_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[groupby_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ groupby_t val, count, nan_val
+ ndarray[groupby_t, ndim=2] maxx, nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+
+ maxx = np.empty_like(out)
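+    # int64 has no NaN, so NPY_NAT serves as the missing-value sentinel;
+    # the running max is seeded with -_int64_max rather than INT64_MIN,
+    # since INT64_MIN is NPY_NAT itself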
+ if groupby_t is int64_t:
+ # Note: evaluated at compile-time
+ maxx[:] = -_int64_max
+ nan_val = NPY_NAT
+ else:
+ maxx[:] = -np.inf
+ nan_val = NAN
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if groupby_t is int64_t:
+ if val != nan_val:
+ nobs[lab, j] += 1
+ if val > maxx[lab, j]:
+ maxx[lab, j] = val
+ else:
+ if val == val and val != nan_val:
+ nobs[lab, j] += 1
+ if val > maxx[lab, j]:
+ maxx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = nan_val
+ else:
+ out[i, j] = maxx[i, j]
+
+
+def group_min(ndarray[groupby_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[groupby_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ groupby_t val, count, nan_val
+ ndarray[groupby_t, ndim=2] minx, nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+
+ minx = np.empty_like(out)
+ if groupby_t is int64_t:
+ minx[:] = _int64_max
+ nan_val = NPY_NAT
+ else:
+ minx[:] = np.inf
+ nan_val = NAN
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if groupby_t is int64_t:
+ if val != nan_val:
+ nobs[lab, j] += 1
+ if val < minx[lab, j]:
+ minx[lab, j] = val
+ else:
+ if val == val and val != nan_val:
+ nobs[lab, j] += 1
+ if val < minx[lab, j]:
+ minx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = nan_val
+ else:
+ out[i, j] = minx[i, j]
+
+
+def group_cummin(ndarray[groupby_t, ndim=2] out,
+ ndarray[groupby_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ bint is_datetimelike):
+ """
+ Only transforms on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, size
+ groupby_t val, mval
+ ndarray[groupby_t, ndim=2] accum
+ int64_t lab
+
+ N, K = (<object>values).shape
+ accum = np.empty_like(values)
+ if groupby_t is int64_t:
+ accum[:] = _int64_max
+ else:
+ accum[:] = np.inf
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+
+ if lab < 0:
+ continue
+ for j in range(K):
+ val = values[i, j]
+
+                # missing values: NPY_NAT passes through for datetimelike
+                # int64; NaN floats are skipped (val == val is False)
+ if groupby_t is int64_t:
+ if is_datetimelike and val == NPY_NAT:
+ out[i, j] = NPY_NAT
+ else:
+ mval = accum[lab, j]
+ if val < mval:
+ accum[lab, j] = mval = val
+ out[i, j] = mval
+ else:
+ if val == val:
+ mval = accum[lab, j]
+ if val < mval:
+ accum[lab, j] = mval = val
+ out[i, j] = mval
+
+
+def group_cummax(ndarray[groupby_t, ndim=2] out,
+ ndarray[groupby_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ bint is_datetimelike):
+ """
+ Only transforms on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, size
+ groupby_t val, mval
+ ndarray[groupby_t, ndim=2] accum
+ int64_t lab
+
+ N, K = (<object>values).shape
+ accum = np.empty_like(values)
+ if groupby_t is int64_t:
+ accum[:] = -_int64_max
+ else:
+ accum[:] = -np.inf
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+
+ if lab < 0:
+ continue
+ for j in range(K):
+ val = values[i, j]
+
+ if groupby_t is int64_t:
+ if is_datetimelike and val == NPY_NAT:
+ out[i, j] = NPY_NAT
+ else:
+ mval = accum[lab, j]
+ if val > mval:
+ accum[lab, j] = mval = val
+ out[i, j] = mval
+ else:
+ if val == val:
+ mval = accum[lab, j]
+ if val > mval:
+ accum[lab, j] = mval = val
+ out[i, j] = mval
diff --git a/contrib/python/pandas/py2/pandas/_libs/hashtable_class_helper.pxi.in b/contrib/python/pandas/py2/pandas/_libs/hashtable_class_helper.pxi.in
new file mode 100644
index 00000000000..eac35588b6f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/hashtable_class_helper.pxi.in
@@ -0,0 +1,1109 @@
+"""
+Template for each `dtype` helper function for hashtable
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+
+# ----------------------------------------------------------------------
+# VectorData
+# ----------------------------------------------------------------------
+
+{{py:
+
+# name, dtype, arg
+# the generated StringVector is not actually used, but is included
+# for completeness (ObjectVector is used instead for uniques in
+# hashtables)
+
+dtypes = [('Float64', 'float64', 'float64_t'),
+ ('Int64', 'int64', 'int64_t'),
+ ('String', 'string', 'char *'),
+ ('UInt64', 'uint64', 'uint64_t')]
+}}
+
+{{for name, dtype, arg in dtypes}}
+
+
+{{if dtype != 'int64'}}
+
+ctypedef struct {{name}}VectorData:
+ {{arg}} *data
+ Py_ssize_t n, m
+
+{{endif}}
+
+
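+# note: append_data_* performs no bounds check; callers must check
+# needs_resize() first (as the Vector.append methods below do)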
+cdef inline void append_data_{{dtype}}({{name}}VectorData *data,
+ {{arg}} x) nogil:
+
+ data.data[data.n] = x
+ data.n += 1
+
+{{endfor}}
+
+ctypedef fused vector_data:
+ Int64VectorData
+ UInt64VectorData
+ Float64VectorData
+ StringVectorData
+
+cdef inline bint needs_resize(vector_data *data) nogil:
+ return data.n == data.m
+
+# ----------------------------------------------------------------------
+# Vector
+# ----------------------------------------------------------------------
+
+{{py:
+
+# name, dtype, arg, idtype
+dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'),
+ ('UInt64', 'uint64', 'uint64_t', 'np.uint64'),
+ ('Int64', 'int64', 'int64_t', 'np.int64')]
+
+}}
+
+{{for name, dtype, arg, idtype in dtypes}}
+
+cdef class {{name}}Vector:
+
+ {{if dtype != 'int64'}}
+ cdef:
+ bint external_view_exists
+ {{name}}VectorData *data
+ ndarray ao
+ {{endif}}
+
+ def __cinit__(self):
+ self.data = <{{name}}VectorData *>PyMem_Malloc(
+ sizeof({{name}}VectorData))
+ if not self.data:
+ raise MemoryError()
+ self.external_view_exists = False
+ self.data.n = 0
+ self.data.m = _INIT_VEC_CAP
+ self.ao = np.empty(self.data.m, dtype={{idtype}})
+ self.data.data = <{{arg}}*>self.ao.data
+
+ cdef resize(self):
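+        # grow capacity geometrically (4x) so appends stay amortized O(1)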
+ self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+ self.ao.resize(self.data.m, refcheck=False)
+ self.data.data = <{{arg}}*>self.ao.data
+
+ def __dealloc__(self):
+ if self.data is not NULL:
+ PyMem_Free(self.data)
+ self.data = NULL
+
+ def __len__(self):
+ return self.data.n
+
+ cpdef to_array(self):
+ if self.data.m != self.data.n:
+ if self.external_view_exists:
+ # should never happen
+ raise ValueError("should have raised on append()")
+ self.ao.resize(self.data.n, refcheck=False)
+ self.data.m = self.data.n
+ self.external_view_exists = True
+ return self.ao
+
+ cdef inline void append(self, {{arg}} x):
+
+ if needs_resize(self.data):
+ if self.external_view_exists:
+ raise ValueError("external reference but "
+ "Vector.resize() needed")
+ self.resize()
+
+ append_data_{{dtype}}(self.data, x)
+
+ cdef extend(self, const {{arg}}[:] x):
+ for i in range(len(x)):
+ self.append(x[i])
+
+{{endfor}}
+
+cdef class StringVector:
+
+ cdef:
+ StringVectorData *data
+ bint external_view_exists
+
+ def __cinit__(self):
+ self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
+ if not self.data:
+ raise MemoryError()
+ self.external_view_exists = False
+ self.data.n = 0
+ self.data.m = _INIT_VEC_CAP
+ self.data.data = <char **>malloc(self.data.m * sizeof(char *))
+ if not self.data.data:
+ raise MemoryError()
+
+ cdef resize(self):
+ cdef:
+ char **orig_data
+ Py_ssize_t i, m
+
+ m = self.data.m
+ self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+
+ orig_data = self.data.data
+ self.data.data = <char **>malloc(self.data.m * sizeof(char *))
+ if not self.data.data:
+ raise MemoryError()
+ for i in range(m):
+ self.data.data[i] = orig_data[i]
+
+ def __dealloc__(self):
+ if self.data is not NULL:
+ if self.data.data is not NULL:
+ free(self.data.data)
+ PyMem_Free(self.data)
+ self.data = NULL
+
+ def __len__(self):
+ return self.data.n
+
+ def to_array(self):
+ cdef:
+ ndarray ao
+ Py_ssize_t n
+ object val
+
+ ao = np.empty(self.data.n, dtype=np.object)
+ for i in range(self.data.n):
+ val = self.data.data[i]
+ ao[i] = val
+ self.external_view_exists = True
+ self.data.m = self.data.n
+ return ao
+
+ cdef inline void append(self, char *x):
+
+ if needs_resize(self.data):
+ self.resize()
+
+ append_data_string(self.data, x)
+
+ cdef extend(self, ndarray[:] x):
+ for i in range(len(x)):
+ self.append(x[i])
+
+
+cdef class ObjectVector:
+
+ cdef:
+ PyObject **data
+ Py_ssize_t n, m
+ ndarray ao
+ bint external_view_exists
+
+ def __cinit__(self):
+ self.external_view_exists = False
+ self.n = 0
+ self.m = _INIT_VEC_CAP
+ self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
+ self.data = <PyObject**>self.ao.data
+
+ def __len__(self):
+ return self.n
+
+ cdef inline append(self, object obj):
+ if self.n == self.m:
+ if self.external_view_exists:
+ raise ValueError("external reference but "
+ "Vector.resize() needed")
+ self.m = max(self.m * 2, _INIT_VEC_CAP)
+ self.ao.resize(self.m, refcheck=False)
+ self.data = <PyObject**>self.ao.data
+
+ Py_INCREF(obj)
+ self.data[self.n] = <PyObject*>obj
+ self.n += 1
+
+ def to_array(self):
+ if self.m != self.n:
+ if self.external_view_exists:
+ raise ValueError("should have raised on append()")
+ self.ao.resize(self.n, refcheck=False)
+ self.m = self.n
+ self.external_view_exists = True
+ return self.ao
+
+ cdef extend(self, ndarray[:] x):
+ for i in range(len(x)):
+ self.append(x[i])
+
+# ----------------------------------------------------------------------
+# HashTable
+# ----------------------------------------------------------------------
+
+
+cdef class HashTable:
+
+ pass
+
+{{py:
+
+# name, dtype, float_group, default_na_value
+dtypes = [('Float64', 'float64', True, 'np.nan'),
+ ('UInt64', 'uint64', False, 0),
+ ('Int64', 'int64', False, 'NPY_NAT')]
+
+}}
+
+
+{{for name, dtype, float_group, default_na_value in dtypes}}
+
+cdef class {{name}}HashTable(HashTable):
+
+ def __cinit__(self, int64_t size_hint=1):
+ self.table = kh_init_{{dtype}}()
+ if size_hint is not None:
+ size_hint = min(size_hint, _SIZE_HINT_LIMIT)
+ kh_resize_{{dtype}}(self.table, size_hint)
+
+ def __len__(self):
+ return self.table.size
+
+ def __dealloc__(self):
+ if self.table is not NULL:
+ kh_destroy_{{dtype}}(self.table)
+ self.table = NULL
+
+ def __contains__(self, object key):
+ cdef khiter_t k
+ k = kh_get_{{dtype}}(self.table, key)
+ return k != self.table.n_buckets
+
+ def sizeof(self, deep=False):
+ """ return the size of my table in bytes """
+ return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
+ sizeof(Py_ssize_t) + # vals
+ sizeof(uint32_t)) # flags
+
+ cpdef get_item(self, {{dtype}}_t val):
+ cdef khiter_t k
+ k = kh_get_{{dtype}}(self.table, val)
+ if k != self.table.n_buckets:
+ return self.table.vals[k]
+ else:
+ raise KeyError(val)
+
+ cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
+ cdef:
+ khiter_t k
+ int ret = 0
+
+ k = kh_put_{{dtype}}(self.table, key, &ret)
+ self.table.keys[k] = key
+ if kh_exist_{{dtype}}(self.table, k):
+ self.table.vals[k] = val
+ else:
+ raise KeyError(key)
+
+ @cython.boundscheck(False)
+ def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ {{dtype}}_t key
+ khiter_t k
+
+ with nogil:
+ for i in range(n):
+ key = keys[i]
+ k = kh_put_{{dtype}}(self.table, key, &ret)
+ self.table.vals[k] = <Py_ssize_t>values[i]
+
+ @cython.boundscheck(False)
+ def map_locations(self, ndarray[{{dtype}}_t, ndim=1] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ {{dtype}}_t val
+ khiter_t k
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+ k = kh_put_{{dtype}}(self.table, val, &ret)
+ self.table.vals[k] = i
+
+ @cython.boundscheck(False)
+ def lookup(self, const {{dtype}}_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ {{dtype}}_t val
+ khiter_t k
+ int64_t[:] locs = np.empty(n, dtype=np.int64)
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+ k = kh_get_{{dtype}}(self.table, val)
+ if k != self.table.n_buckets:
+ locs[i] = self.table.vals[k]
+ else:
+ locs[i] = -1
+
+ return np.asarray(locs)
+
+ @cython.boundscheck(False)
+ @cython.wraparound(False)
+ def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, bint ignore_na=False,
+ bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[{{dtype}}]
+ Array of values of which unique will be calculated
+ uniques : {{name}}Vector
+ Vector into which uniques will be written
+ count_prior : Py_ssize_t, default 0
+ Number of existing entries in uniques
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+ ignore_na : boolean, default False
+ Whether NA-values should be ignored for calculating the uniques. If
+ True, the labels corresponding to missing values will be set to
+ na_sentinel.
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[{{dtype}}]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse=True)
+ The labels from values to uniques
+ """
+ cdef:
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
+ int64_t[:] labels
+ int ret = 0
+ {{dtype}}_t val, na_value2
+ khiter_t k
+ {{name}}VectorData *ud
+ bint use_na_value
+
+ if return_inverse:
+ labels = np.empty(n, dtype=np.int64)
+ ud = uniques.data
+ use_na_value = na_value is not None
+
+ if use_na_value:
+            # We need na_value2 because we want to allow users to
+            # *optionally* specify an NA sentinel *of the correct* type.
+            # We use None to make it optional, which requires `object` type
+            # for the parameter. To please the compiler, we use na_value2,
+            # which is only used if it is actually *specified*.
+ na_value2 = <{{dtype}}_t>na_value
+ else:
+ na_value2 = {{default_na_value}}
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+
+ if ignore_na and (val != val
+ or (use_na_value and val == na_value2)):
+ # if missing values do not count as unique values (i.e. if
+ # ignore_na is True), skip the hashtable entry for them,
+ # and replace the corresponding label with na_sentinel
+ labels[i] = na_sentinel
+ continue
+
+ k = kh_get_{{dtype}}(self.table, val)
+
+ if k == self.table.n_buckets:
+ # k hasn't been seen yet
+ k = kh_put_{{dtype}}(self.table, val, &ret)
+
+ if needs_resize(ud):
+ with gil:
+ if uniques.external_view_exists:
+ raise ValueError("external reference to "
+ "uniques held, but "
+ "Vector.resize() needed")
+ uniques.resize()
+ append_data_{{dtype}}(ud, val)
+ if return_inverse:
+ self.table.vals[k] = count
+ labels[i] = count
+ count += 1
+ elif return_inverse:
+ # k falls into a previous bucket
+ # only relevant in case we need to construct the inverse
+ idx = self.table.vals[k]
+ labels[i] = idx
+
+ if return_inverse:
+ return uniques.to_array(), np.asarray(labels)
+ return uniques.to_array()
+
+ def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[{{dtype}}]
+ Array of values of which unique will be calculated
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[{{dtype}}]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse)
+ The labels from values to uniques
+ """
+ uniques = {{name}}Vector()
+ return self._unique(values, uniques, ignore_na=False,
+ return_inverse=return_inverse)
+
+ def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Missing values are not included in the "uniques" for this method.
+ The labels for any missing values will be set to "na_sentinel"
+
+ Parameters
+ ----------
+ values : ndarray[{{dtype}}]
+ Array of values of which unique will be calculated
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+
+ Returns
+ -------
+ uniques : ndarray[{{dtype}}]
+ Unique values of input, not sorted
+ labels : ndarray[int64]
+ The labels from values to uniques
+ """
+ uniques_vector = {{name}}Vector()
+ return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+ na_value=na_value, ignore_na=True,
+ return_inverse=True)
+
+ def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
+ na_sentinel=na_sentinel, na_value=na_value,
+ ignore_na=True, return_inverse=True)
+ return labels
+
+ @cython.boundscheck(False)
+ def get_labels_groupby(self, const {{dtype}}_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int64_t[:] labels
+ Py_ssize_t idx, count = 0
+ int ret = 0
+ {{dtype}}_t val
+ khiter_t k
+ {{name}}Vector uniques = {{name}}Vector()
+ {{name}}VectorData *ud
+
+ labels = np.empty(n, dtype=np.int64)
+ ud = uniques.data
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+
+                # specific for groupby: negative codes mark missing
+                # entries and map straight to the -1 label
+ {{if dtype != 'uint64'}}
+ if val < 0:
+ labels[i] = -1
+ continue
+ {{endif}}
+
+ k = kh_get_{{dtype}}(self.table, val)
+ if k != self.table.n_buckets:
+ idx = self.table.vals[k]
+ labels[i] = idx
+ else:
+ k = kh_put_{{dtype}}(self.table, val, &ret)
+ self.table.vals[k] = count
+
+ if needs_resize(ud):
+ with gil:
+ uniques.resize()
+ append_data_{{dtype}}(ud, val)
+ labels[i] = count
+ count += 1
+
+ arr_uniques = uniques.to_array()
+
+ return np.asarray(labels), arr_uniques
+
+{{endfor}}
+
+
+cdef class StringHashTable(HashTable):
+ # these by-definition *must* be strings
+ # or a sentinel np.nan / None missing value
+ na_string_sentinel = '__nan__'
+
+ def __init__(self, int64_t size_hint=1):
+ self.table = kh_init_str()
+ if size_hint is not None:
+ size_hint = min(size_hint, _SIZE_HINT_LIMIT)
+ kh_resize_str(self.table, size_hint)
+
+ def __dealloc__(self):
+ if self.table is not NULL:
+ kh_destroy_str(self.table)
+ self.table = NULL
+
+ def sizeof(self, deep=False):
+ """ return the size of my table in bytes """
+ return self.table.n_buckets * (sizeof(char *) + # keys
+ sizeof(Py_ssize_t) + # vals
+ sizeof(uint32_t)) # flags
+
+ cpdef get_item(self, object val):
+ cdef:
+ khiter_t k
+ const char *v
+ v = util.get_c_string(val)
+
+ k = kh_get_str(self.table, v)
+ if k != self.table.n_buckets:
+ return self.table.vals[k]
+ else:
+ raise KeyError(val)
+
+ cpdef set_item(self, object key, Py_ssize_t val):
+ cdef:
+ khiter_t k
+ int ret = 0
+ const char *v
+
+ v = util.get_c_string(val)
+
+ k = kh_put_str(self.table, v, &ret)
+ self.table.keys[k] = key
+ if kh_exist_str(self.table, k):
+ self.table.vals[k] = val
+ else:
+ raise KeyError(key)
+
+ @cython.boundscheck(False)
+ def get_indexer(self, ndarray[object] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
+ int64_t *resbuf = <int64_t*>labels.data
+ khiter_t k
+ kh_str_t *table = self.table
+ const char *v
+ const char **vecs
+
+ vecs = <const char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+ v = util.get_c_string(val)
+ vecs[i] = v
+
+ with nogil:
+ for i in range(n):
+ k = kh_get_str(table, vecs[i])
+ if k != table.n_buckets:
+ resbuf[i] = table.vals[k]
+ else:
+ resbuf[i] = -1
+
+ free(vecs)
+ return labels
+
+ @cython.boundscheck(False)
+ def lookup(self, ndarray[object] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ const char *v
+ khiter_t k
+ int64_t[:] locs = np.empty(n, dtype=np.int64)
+
+ # these by-definition *must* be strings
+ vecs = <char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+
+ if isinstance(val, (str, unicode)):
+ v = util.get_c_string(val)
+ else:
+ v = util.get_c_string(self.na_string_sentinel)
+ vecs[i] = v
+
+ with nogil:
+ for i in range(n):
+ v = vecs[i]
+ k = kh_get_str(self.table, v)
+ if k != self.table.n_buckets:
+ locs[i] = self.table.vals[k]
+ else:
+ locs[i] = -1
+
+ free(vecs)
+ return np.asarray(locs)
+
+ @cython.boundscheck(False)
+ def map_locations(self, ndarray[object] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ const char *v
+ const char **vecs
+ khiter_t k
+
+ # these by-definition *must* be strings
+ vecs = <const char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+
+ if isinstance(val, (str, unicode)):
+ v = util.get_c_string(val)
+ else:
+ v = util.get_c_string(self.na_string_sentinel)
+ vecs[i] = v
+
+ with nogil:
+ for i in range(n):
+ v = vecs[i]
+ k = kh_put_str(self.table, v, &ret)
+ self.table.vals[k] = i
+ free(vecs)
+
+ @cython.boundscheck(False)
+ @cython.wraparound(False)
+ def _unique(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, bint ignore_na=False,
+ bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ uniques : ObjectVector
+ Vector into which uniques will be written
+ count_prior : Py_ssize_t, default 0
+ Number of existing entries in uniques
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then any value
+ that is not a string is considered missing. If na_value is
+ not None, then _additionally_ any value "val" satisfying
+ val == na_value is considered missing.
+ ignore_na : boolean, default False
+ Whether NA-values should be ignored for calculating the uniques. If
+ True, the labels corresponding to missing values will be set to
+ na_sentinel.
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse=True)
+ The labels from values to uniques
+ """
+ cdef:
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
+ int64_t[:] labels
+ int64_t[:] uindexer
+ int ret = 0
+ object val
+ const char *v
+ const char **vecs
+ khiter_t k
+ bint use_na_value
+
+ if return_inverse:
+ labels = np.zeros(n, dtype=np.int64)
+ uindexer = np.empty(n, dtype=np.int64)
+ use_na_value = na_value is not None
+
+ # assign pointers and pre-filter out missing (if ignore_na)
+ vecs = <const char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+
+ if (ignore_na
+ and (not isinstance(val, (str, unicode))
+ or (use_na_value and val == na_value))):
+ # if missing values do not count as unique values (i.e. if
+ # ignore_na is True), we can skip the actual value, and
+ # replace the label with na_sentinel directly
+ labels[i] = na_sentinel
+ else:
+ # if ignore_na is False, we also stringify NaN/None/etc.
+ v = util.get_c_string(val)
+ vecs[i] = v
+
+ # compute
+ with nogil:
+ for i in range(n):
+ if ignore_na and labels[i] == na_sentinel:
+ # skip entries for ignored missing values (see above)
+ continue
+
+ v = vecs[i]
+ k = kh_get_str(self.table, v)
+ if k == self.table.n_buckets:
+ # k hasn't been seen yet
+ k = kh_put_str(self.table, v, &ret)
+ uindexer[count] = i
+ if return_inverse:
+ self.table.vals[k] = count
+ labels[i] = <int64_t>count
+ count += 1
+ elif return_inverse:
+ # k falls into a previous bucket
+ # only relevant in case we need to construct the inverse
+ idx = self.table.vals[k]
+ labels[i] = <int64_t>idx
+
+ free(vecs)
+
+ # uniques
+ for i in range(count):
+ uniques.append(values[uindexer[i]])
+
+ if return_inverse:
+ return uniques.to_array(), np.asarray(labels)
+ return uniques.to_array()
+
+ def unique(self, ndarray[object] values, bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse)
+ The labels from values to uniques
+ """
+ uniques = ObjectVector()
+ return self._unique(values, uniques, ignore_na=False,
+ return_inverse=return_inverse)
+
+ def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Missing values are not included in the "uniques" for this method.
+ The labels for any missing values will be set to "na_sentinel"
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then any value
+ that is not a string is considered missing. If na_value is
+ not None, then _additionally_ any value "val" satisfying
+ val == na_value is considered missing.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64]
+ The labels from values to uniques
+ """
+ uniques_vector = ObjectVector()
+ return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+ na_value=na_value, ignore_na=True,
+ return_inverse=True)
+
+ def get_labels(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
+ na_sentinel=na_sentinel, na_value=na_value,
+ ignore_na=True, return_inverse=True)
+ return labels
+
+
+cdef class PyObjectHashTable(HashTable):
+
+ def __init__(self, int64_t size_hint=1):
+ self.table = kh_init_pymap()
+ if size_hint is not None:
+ size_hint = min(size_hint, _SIZE_HINT_LIMIT)
+ kh_resize_pymap(self.table, size_hint)
+
+ def __dealloc__(self):
+ if self.table is not NULL:
+ kh_destroy_pymap(self.table)
+ self.table = NULL
+
+ def __len__(self):
+ return self.table.size
+
+ def __contains__(self, object key):
+ cdef khiter_t k
+ hash(key)
+
+ k = kh_get_pymap(self.table, <PyObject*>key)
+ return k != self.table.n_buckets
+
+ def sizeof(self, deep=False):
+ """ return the size of my table in bytes """
+ return self.table.n_buckets * (sizeof(PyObject *) + # keys
+ sizeof(Py_ssize_t) + # vals
+ sizeof(uint32_t)) # flags
+
+ cpdef get_item(self, object val):
+ cdef khiter_t k
+
+ k = kh_get_pymap(self.table, <PyObject*>val)
+ if k != self.table.n_buckets:
+ return self.table.vals[k]
+ else:
+ raise KeyError(val)
+
+ cpdef set_item(self, object key, Py_ssize_t val):
+ cdef:
+ khiter_t k
+ int ret = 0
+ char* buf
+
+ hash(key)
+
+ k = kh_put_pymap(self.table, <PyObject*>key, &ret)
+ # self.table.keys[k] = key
+ if kh_exist_pymap(self.table, k):
+ self.table.vals[k] = val
+ else:
+ raise KeyError(key)
+
+ def map_locations(self, ndarray[object] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ khiter_t k
+
+ for i in range(n):
+ val = values[i]
+ hash(val)
+
+ k = kh_put_pymap(self.table, <PyObject*>val, &ret)
+ self.table.vals[k] = i
+
+ def lookup(self, ndarray[object] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ khiter_t k
+ int64_t[:] locs = np.empty(n, dtype=np.int64)
+
+ for i in range(n):
+ val = values[i]
+ hash(val)
+
+ k = kh_get_pymap(self.table, <PyObject*>val)
+ if k != self.table.n_buckets:
+ locs[i] = self.table.vals[k]
+ else:
+ locs[i] = -1
+
+ return np.asarray(locs)
+
+ @cython.boundscheck(False)
+ @cython.wraparound(False)
+ def _unique(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, bint ignore_na=False,
+ bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ uniques : ObjectVector
+ Vector into which uniques will be written
+ count_prior : Py_ssize_t, default 0
+ Number of existing entries in uniques
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then None _plus_
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+ ignore_na : boolean, default False
+ Whether NA-values should be ignored for calculating the uniques. If
+ True, the labels corresponding to missing values will be set to
+ na_sentinel.
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse=True)
+ The labels from values to uniques
+ """
+ cdef:
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
+ int64_t[:] labels
+ int ret = 0
+ object val
+ khiter_t k
+ bint use_na_value
+
+ if return_inverse:
+ labels = np.empty(n, dtype=np.int64)
+ use_na_value = na_value is not None
+
+ for i in range(n):
+ val = values[i]
+ hash(val)
+
+ if ignore_na and ((val != val or val is None)
+ or (use_na_value and val == na_value)):
+ # if missing values do not count as unique values (i.e. if
+ # ignore_na is True), skip the hashtable entry for them, and
+ # replace the corresponding label with na_sentinel
+ labels[i] = na_sentinel
+ continue
+
+ k = kh_get_pymap(self.table, <PyObject*>val)
+ if k == self.table.n_buckets:
+ # k hasn't been seen yet
+ k = kh_put_pymap(self.table, <PyObject*>val, &ret)
+ uniques.append(val)
+ if return_inverse:
+ self.table.vals[k] = count
+ labels[i] = count
+ count += 1
+ elif return_inverse:
+ # k falls into a previous bucket
+ # only relevant in case we need to construct the inverse
+ idx = self.table.vals[k]
+ labels[i] = idx
+
+ if return_inverse:
+ return uniques.to_array(), np.asarray(labels)
+ return uniques.to_array()
+
+ def unique(self, ndarray[object] values, bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse)
+ The labels from values to uniques
+ """
+ uniques = ObjectVector()
+ return self._unique(values, uniques, ignore_na=False,
+ return_inverse=return_inverse)
+
+ def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Missing values are not included in the "uniques" for this method.
+ The labels for any missing values will be set to "na_sentinel"
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then None _plus_
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64]
+ The labels from values to uniques
+ """
+ uniques_vector = ObjectVector()
+ return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+ na_value=na_value, ignore_na=True,
+ return_inverse=True)
+
+ def get_labels(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
+ na_sentinel=na_sentinel, na_value=na_value,
+ ignore_na=True, return_inverse=True)
+ return labels
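
For orientation, the khash-backed `_unique`/`factorize` machinery above reduces to the following pure-Python sketch, with a dict standing in for the hash table; the function name and the simplified NA handling are illustrative, not part of the pandas API.

```python
# Minimal sketch of the factorize logic: one pass, dict as the hash table.
def factorize_sketch(values, na_sentinel=-1):
    table = {}                # value -> position in `uniques`
    uniques, labels = [], []
    for val in values:
        if val is None or val != val:    # NA check: None, or NaN (val != val)
            labels.append(na_sentinel)   # NAs are excluded from uniques
            continue
        if val not in table:
            table[val] = len(uniques)    # first sighting: new unique
            uniques.append(val)
        labels.append(table[val])
    return uniques, labels

# factorize_sketch(['a', 'b', 'a', None]) -> (['a', 'b'], [0, 1, 0, -1])
```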
diff --git a/contrib/python/pandas/py2/pandas/_libs/hashtable_func_helper.pxi.in b/contrib/python/pandas/py2/pandas/_libs/hashtable_func_helper.pxi.in
new file mode 100644
index 00000000000..80d864c65d0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/hashtable_func_helper.pxi.in
@@ -0,0 +1,352 @@
+"""
+Template for each `dtype` helper function for hashtable
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# VectorData
+# ----------------------------------------------------------------------
+
+{{py:
+
+# dtype, ttype
+dtypes = [('float64', 'float64', 'float64_t'),
+ ('uint64', 'uint64', 'uint64_t'),
+ ('object', 'pymap', 'object'),
+ ('int64', 'int64', 'int64_t')]
+
+}}
+
+{{for dtype, ttype, scalar in dtypes}}
+
+
+{{if dtype == 'object'}}
+cdef build_count_table_{{dtype}}(ndarray[{{dtype}}] values,
+ kh_{{ttype}}_t *table, bint dropna):
+{{else}}
+cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
+ kh_{{ttype}}_t *table, bint dropna):
+{{endif}}
+ cdef:
+ khiter_t k
+ Py_ssize_t i, n = len(values)
+
+ {{scalar}} val
+
+ int ret = 0
+
+ {{if dtype == 'object'}}
+ kh_resize_{{ttype}}(table, n // 10)
+
+ for i in range(n):
+ val = values[i]
+
+ if not checknull(val) or not dropna:
+ k = kh_get_{{ttype}}(table, <PyObject*>val)
+ if k != table.n_buckets:
+ table.vals[k] += 1
+ else:
+ k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
+ table.vals[k] = 1
+ {{else}}
+ with nogil:
+ kh_resize_{{ttype}}(table, n)
+
+ for i in range(n):
+ val = values[i]
+
+ {{if dtype == 'float64'}}
+ if val == val or not dropna:
+ {{else}}
+ if True:
+ {{endif}}
+ k = kh_get_{{ttype}}(table, val)
+ if k != table.n_buckets:
+ table.vals[k] += 1
+ else:
+ k = kh_put_{{ttype}}(table, val, &ret)
+ table.vals[k] = 1
+ {{endif}}
+
+
+{{if dtype == 'object'}}
+cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
+{{else}}
+cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna):
+{{endif}}
+ cdef:
+ Py_ssize_t i = 0
+ kh_{{ttype}}_t *table
+
+ {{if dtype != 'object'}}
+ {{dtype}}_t[:] result_keys
+ int64_t[:] result_counts
+ {{endif}}
+
+ Py_ssize_t k
+
+ table = kh_init_{{ttype}}()
+ {{if dtype == 'object'}}
+ build_count_table_{{dtype}}(values, table, 1)
+ {{else}}
+ build_count_table_{{dtype}}(values, table, dropna)
+ {{endif}}
+
+ result_keys = np.empty(table.n_occupied, dtype=np.{{dtype}})
+ result_counts = np.zeros(table.n_occupied, dtype=np.int64)
+
+ {{if dtype == 'object'}}
+ for k in range(table.n_buckets):
+ if kh_exist_{{ttype}}(table, k):
+ result_keys[i] = <{{dtype}}>table.keys[k]
+ result_counts[i] = table.vals[k]
+ i += 1
+ {{else}}
+ with nogil:
+ for k in range(table.n_buckets):
+ if kh_exist_{{ttype}}(table, k):
+ result_keys[i] = table.keys[k]
+ result_counts[i] = table.vals[k]
+ i += 1
+ {{endif}}
+
+ kh_destroy_{{ttype}}(table)
+
+ {{if dtype == 'object'}}
+ return result_keys, result_counts
+ {{else}}
+ return np.asarray(result_keys), np.asarray(result_counts)
+ {{endif}}
+
+
+{{if dtype == 'object'}}
+
+
+def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
+{{else}}
+
+
+def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
+{{endif}}
+ cdef:
+ int ret = 0
+ {{if dtype != 'object'}}
+ {{dtype}}_t value
+ {{endif}}
+ Py_ssize_t k, i, n = len(values)
+ kh_{{ttype}}_t *table = kh_init_{{ttype}}()
+ ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+
+ kh_resize_{{ttype}}(table, min(n, _SIZE_HINT_LIMIT))
+
+ if keep not in ('last', 'first', False):
+ raise ValueError('keep must be either "first", "last" or False')
+
+ if keep == 'last':
+ {{if dtype == 'object'}}
+ for i from n > i >= 0:
+ kh_put_{{ttype}}(table, <PyObject*>values[i], &ret)
+ out[i] = ret == 0
+ {{else}}
+ with nogil:
+ for i from n > i >= 0:
+ kh_put_{{ttype}}(table, values[i], &ret)
+ out[i] = ret == 0
+ {{endif}}
+ elif keep == 'first':
+ {{if dtype == 'object'}}
+ for i in range(n):
+ kh_put_{{ttype}}(table, <PyObject*>values[i], &ret)
+ out[i] = ret == 0
+ {{else}}
+ with nogil:
+ for i in range(n):
+ kh_put_{{ttype}}(table, values[i], &ret)
+ out[i] = ret == 0
+ {{endif}}
+ else:
+ {{if dtype == 'object'}}
+ for i in range(n):
+ value = values[i]
+ k = kh_get_{{ttype}}(table, <PyObject*>value)
+ if k != table.n_buckets:
+ out[table.vals[k]] = 1
+ out[i] = 1
+ else:
+ k = kh_put_{{ttype}}(table, <PyObject*>value, &ret)
+ table.keys[k] = <PyObject*>value
+ table.vals[k] = i
+ out[i] = 0
+ {{else}}
+ with nogil:
+ for i in range(n):
+ value = values[i]
+ k = kh_get_{{ttype}}(table, value)
+ if k != table.n_buckets:
+ out[table.vals[k]] = 1
+ out[i] = 1
+ else:
+ k = kh_put_{{ttype}}(table, value, &ret)
+ table.keys[k] = value
+ table.vals[k] = i
+ out[i] = 0
+ {{endif}}
+ kh_destroy_{{ttype}}(table)
+ return out
+
+
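The three `keep` modes of `duplicated_*` above amount to the following pure-Python sketch; a set/dict stands in for the khash table, and the names are illustrative.

```python
def duplicated_sketch(values, keep='first'):
    n = len(values)
    out = [False] * n
    if keep in ('first', 'last'):
        seen = set()
        order = range(n) if keep == 'first' else range(n - 1, -1, -1)
        for i in order:                   # the kept occurrence is seen first
            out[i] = values[i] in seen
            seen.add(values[i])
    else:  # keep=False: flag every member of a duplicated group
        first_row = {}                    # value -> row of first occurrence
        for i, val in enumerate(values):
            if val in first_row:
                out[first_row[val]] = True
                out[i] = True
            else:
                first_row[val] = i
    return out

# duplicated_sketch([1, 2, 1, 3, 1], keep=False)
# -> [True, False, True, False, True]
```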
+# ----------------------------------------------------------------------
+# Membership
+# ----------------------------------------------------------------------
+
+
+{{if dtype == 'object'}}
+
+
+def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values):
+{{else}}
+
+
+def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values):
+{{endif}}
+
+ """
+ Return boolean of values in arr on an
+ element by-element basis
+
+ Parameters
+ ----------
+ arr : {{dtype}} ndarray
+ values : {{dtype}} ndarray
+
+ Returns
+ -------
+ boolean ndarry len of (arr)
+ """
+ cdef:
+ Py_ssize_t i, n, k
+ int ret = 0
+ ndarray[uint8_t] result
+ {{scalar}} val
+ kh_{{ttype}}_t *table = kh_init_{{ttype}}()
+
+ # construct the table
+ n = len(values)
+ kh_resize_{{ttype}}(table, min(n, len(values)))
+
+ {{if dtype == 'object'}}
+ for i in range(n):
+ kh_put_{{ttype}}(table, <PyObject*>values[i], &ret)
+ {{else}}
+ with nogil:
+ for i in range(n):
+ kh_put_{{ttype}}(table, values[i], &ret)
+ {{endif}}
+
+ # test membership
+ n = len(arr)
+ result = np.empty(n, dtype=np.uint8)
+
+ {{if dtype == 'object'}}
+ for i in range(n):
+ val = arr[i]
+ k = kh_get_{{ttype}}(table, <PyObject*>val)
+ result[i] = (k != table.n_buckets)
+ {{else}}
+ with nogil:
+ for i in range(n):
+ val = arr[i]
+ k = kh_get_{{ttype}}(table, val)
+ result[i] = (k != table.n_buckets)
+ {{endif}}
+
+ kh_destroy_{{ttype}}(table)
+ return result.view(np.bool_)
+
+{{endfor}}
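
`ismember_*` above is, semantically, a set-membership test; a hedged pure-Python equivalent:

```python
# Build the lookup table from `values`, then probe it once per element.
def ismember_sketch(arr, values):
    table = set(values)
    return [val in table for val in arr]

# ismember_sketch([1, 2, 3], [2, 3, 4]) -> [False, True, True]
```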
+
+
+# ----------------------------------------------------------------------
+# Mode Computations
+# ----------------------------------------------------------------------
+
+{{py:
+
+# dtype, ctype, table_type, npy_dtype
+dtypes = [('float64', 'float64_t', 'float64', 'float64'),
+ ('int64', 'int64_t', 'int64', 'int64'),
+ ('uint64', 'uint64_t', 'uint64', 'uint64'),
+ ('object', 'object', 'pymap', 'object_')]
+}}
+
+{{for dtype, ctype, table_type, npy_dtype in dtypes}}
+
+
+
+{{if dtype == 'object'}}
+
+
+def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
+{{else}}
+
+
+def mode_{{dtype}}({{ctype}}[:] values, bint dropna):
+{{endif}}
+ cdef:
+ int count, max_count = 1
+ int j = -1 # so you can do +=
+ Py_ssize_t k
+ kh_{{table_type}}_t *table
+ ndarray[{{ctype}}] modes
+
+ table = kh_init_{{table_type}}()
+ build_count_table_{{dtype}}(values, table, dropna)
+
+ modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
+
+ {{if dtype != 'object'}}
+ with nogil:
+ for k in range(table.n_buckets):
+ if kh_exist_{{table_type}}(table, k):
+ count = table.vals[k]
+ if count == max_count:
+ j += 1
+ elif count > max_count:
+ max_count = count
+ j = 0
+ else:
+ continue
+
+ modes[j] = table.keys[k]
+ {{else}}
+ for k in range(table.n_buckets):
+ if kh_exist_{{table_type}}(table, k):
+ count = table.vals[k]
+
+ if count == max_count:
+ j += 1
+ elif count > max_count:
+ max_count = count
+ j = 0
+ else:
+ continue
+
+ modes[j] = <object>table.keys[k]
+ {{endif}}
+
+ kh_destroy_{{table_type}}(table)
+
+ return modes[:j + 1]
+
+{{endfor}}
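
The mode loop above keeps every value whose count ties the running maximum; a compact sketch (the ordering of tied modes differs from the khash bucket order):

```python
from collections import Counter

def mode_sketch(values):
    if not values:
        return []
    counts = Counter(values)
    max_count = max(counts.values())
    return [val for val, cnt in counts.items() if cnt == max_count]

# mode_sketch([1, 2, 2, 3, 3]) -> [2, 3]
```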
diff --git a/contrib/python/pandas/py2/pandas/_libs/index_class_helper.pxi.in b/contrib/python/pandas/py2/pandas/_libs/index_class_helper.pxi.in
new file mode 100644
index 00000000000..3c9a096e7ec
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/index_class_helper.pxi.in
@@ -0,0 +1,74 @@
+"""
+Template for functions of IndexEngine subclasses.
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# IndexEngine Subclass Methods
+# ----------------------------------------------------------------------
+
+{{py:
+
+# name, dtype, ctype, hashtable_name, hashtable_dtype
+dtypes = [('Float64', 'float64', 'float64_t', 'Float64', 'float64'),
+ ('Float32', 'float32', 'float32_t', 'Float64', 'float64'),
+ ('Int64', 'int64', 'int64_t', 'Int64', 'int64'),
+ ('Int32', 'int32', 'int32_t', 'Int64', 'int64'),
+ ('Int16', 'int16', 'int16_t', 'Int64', 'int64'),
+ ('Int8', 'int8', 'int8_t', 'Int64', 'int64'),
+ ('UInt64', 'uint64', 'uint64_t', 'UInt64', 'uint64'),
+ ('UInt32', 'uint32', 'uint32_t', 'UInt64', 'uint64'),
+ ('UInt16', 'uint16', 'uint16_t', 'UInt64', 'uint64'),
+ ('UInt8', 'uint8', 'uint8_t', 'UInt64', 'uint64'),
+ ]
+}}
+
+{{for name, dtype, ctype, hashtable_name, hashtable_dtype in dtypes}}
+
+
+cdef class {{name}}Engine(IndexEngine):
+
+ cdef _make_hash_table(self, n):
+ return _hash.{{hashtable_name}}HashTable(n)
+
+ {{if name not in {'Float64', 'Float32'} }}
+ cdef _check_type(self, object val):
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+ {{endif}}
+
+ cpdef _call_map_locations(self, values):
+ # self.mapping is of type {{hashtable_name}}HashTable,
+ # so convert dtype of values
+ self.mapping.map_locations(algos.ensure_{{hashtable_dtype}}(values))
+
+ cdef _get_index_values(self):
+ return algos.ensure_{{dtype}}(self.vgetter())
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ ndarray[{{ctype}}] values
+ int count = 0
+
+ {{if name not in {'Float64', 'Float32'} }}
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+ {{endif}}
+
+ # A view is needed for some subclasses, such as PeriodEngine:
+ values = self._get_index_values().view('{{dtype}}')
+ indexer = values == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
+
+{{endfor}}
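
`_maybe_get_bool_indexer` above has three outcomes; a hedged NumPy sketch of the same contract (the function name is illustrative):

```python
import numpy as np

def maybe_get_bool_indexer_sketch(values, val):
    indexer = values == val          # boolean mask over the index values
    found = np.where(indexer)[0]
    if len(found) > 1:
        return indexer               # non-unique match: return the mask
    if len(found) == 1:
        return int(found[0])         # unique match: integer position
    raise KeyError(val)              # no match

# maybe_get_bool_indexer_sketch(np.array([1, 2, 1]), 1)
# -> array([ True, False,  True])
```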
diff --git a/contrib/python/pandas/py2/pandas/_libs/intervaltree.pxi.in b/contrib/python/pandas/py2/pandas/_libs/intervaltree.pxi.in
new file mode 100644
index 00000000000..196841f35ed
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/intervaltree.pxi.in
@@ -0,0 +1,413 @@
+"""
+Template for intervaltree
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+ctypedef fused scalar_t:
+ float64_t
+ float32_t
+ int64_t
+ int32_t
+ uint64_t
+
+# ----------------------------------------------------------------------
+# IntervalTree
+# ----------------------------------------------------------------------
+
+cdef class IntervalTree(IntervalMixin):
+ """A centered interval tree
+
+    Based on the algorithm described on Wikipedia:
+    http://en.wikipedia.org/wiki/Interval_tree
+
+    We are emulating the IndexEngine interface.
+ """
+ cdef:
+ readonly object left, right, root, dtype
+ readonly str closed
+ object _is_overlapping, _left_sorter, _right_sorter
+
+ def __init__(self, left, right, closed='right', leaf_size=100):
+ """
+ Parameters
+ ----------
+ left, right : np.ndarray[ndim=1]
+ Left and right bounds for each interval. Assumed to contain no
+ NaNs.
+ closed : {'left', 'right', 'both', 'neither'}, optional
+ Whether the intervals are closed on the left-side, right-side, both
+ or neither. Defaults to 'right'.
+ leaf_size : int, optional
+ Parameter that controls when the tree switches from creating nodes
+ to brute-force search. Tune this parameter to optimize query
+ performance.
+ """
+ if closed not in ['left', 'right', 'both', 'neither']:
+ raise ValueError("invalid option for 'closed': %s" % closed)
+
+ left = np.asarray(left)
+ right = np.asarray(right)
+ self.dtype = np.result_type(left, right)
+ self.left = np.asarray(left, dtype=self.dtype)
+ self.right = np.asarray(right, dtype=self.dtype)
+
+ indices = np.arange(len(left), dtype='int64')
+
+ self.closed = closed
+
+ # GH 23352: ensure no nan in nodes
+ mask = ~np.isnan(self.left)
+ self.left = self.left[mask]
+ self.right = self.right[mask]
+ indices = indices[mask]
+
+ node_cls = NODE_CLASSES[str(self.dtype), closed]
+ self.root = node_cls(self.left, self.right, indices, leaf_size)
+
+ @property
+ def left_sorter(self):
+ """How to sort the left labels; this is used for binary search
+ """
+ if self._left_sorter is None:
+ self._left_sorter = np.argsort(self.left)
+ return self._left_sorter
+
+ @property
+ def right_sorter(self):
+ """How to sort the right labels
+ """
+ if self._right_sorter is None:
+ self._right_sorter = np.argsort(self.right)
+ return self._right_sorter
+
+ @property
+ def is_overlapping(self):
+ """
+ Determine if the IntervalTree contains overlapping intervals.
+ Cached as self._is_overlapping.
+ """
+ if self._is_overlapping is not None:
+ return self._is_overlapping
+
+ # <= when both sides closed since endpoints can overlap
+ op = le if self.closed == 'both' else lt
+
+ # overlap if start of current interval < end of previous interval
+ # (current and previous in terms of sorted order by left/start side)
+ current = self.left[self.left_sorter[1:]]
+ previous = self.right[self.left_sorter[:-1]]
+ self._is_overlapping = bool(op(current, previous).any())
+
+ return self._is_overlapping
+
+ def get_loc(self, scalar_t key):
+ """Return all positions corresponding to intervals that overlap with
+ the given scalar key
+ """
+ result = Int64Vector()
+ self.root.query(result, key)
+ if not result.data.n:
+ raise KeyError(key)
+ return result.to_array().astype('intp')
+
+ def _get_partial_overlap(self, key_left, key_right, side):
+ """Return all positions corresponding to intervals with the given side
+ falling between the left and right bounds of an interval query
+ """
+ if side == 'left':
+ values = self.left
+ sorter = self.left_sorter
+ else:
+ values = self.right
+ sorter = self.right_sorter
+ key = [key_left, key_right]
+ i, j = values.searchsorted(key, sorter=sorter)
+ return sorter[i:j]
+
+ def get_loc_interval(self, key_left, key_right):
+ """Lookup the intervals enclosed in the given interval bounds
+
+ The given interval is presumed to have closed bounds.
+ """
+ import pandas as pd
+ left_overlap = self._get_partial_overlap(key_left, key_right, 'left')
+ right_overlap = self._get_partial_overlap(key_left, key_right, 'right')
+ enclosing = self.get_loc(0.5 * (key_left + key_right))
+ combined = np.concatenate([left_overlap, right_overlap, enclosing])
+ uniques = pd.unique(combined)
+ return uniques.astype('intp')
+
+ def get_indexer(self, scalar_t[:] target):
+ """Return the positions corresponding to unique intervals that overlap
+ with the given array of scalar targets.
+ """
+
+ # TODO: write get_indexer_intervals
+ cdef:
+ size_t old_len
+ Py_ssize_t i
+ Int64Vector result
+
+ result = Int64Vector()
+ old_len = 0
+ for i in range(len(target)):
+ self.root.query(result, target[i])
+ if result.data.n == old_len:
+ result.append(-1)
+ elif result.data.n > old_len + 1:
+ raise KeyError(
+ 'indexer does not intersect a unique set of intervals')
+ old_len = result.data.n
+ return result.to_array().astype('intp')
+
+ def get_indexer_non_unique(self, scalar_t[:] target):
+ """Return the positions corresponding to intervals that overlap with
+ the given array of scalar targets. Non-unique positions are repeated.
+ """
+ cdef:
+ size_t old_len
+ Py_ssize_t i
+ Int64Vector result, missing
+
+ result = Int64Vector()
+ missing = Int64Vector()
+ old_len = 0
+ for i in range(len(target)):
+ self.root.query(result, target[i])
+ if result.data.n == old_len:
+ result.append(-1)
+ missing.append(i)
+ old_len = result.data.n
+ return (result.to_array().astype('intp'),
+ missing.to_array().astype('intp'))
+
+ def __repr__(self):
+ return ('<IntervalTree[{dtype},{closed}]: '
+ '{n_elements} elements>'.format(
+ dtype=self.dtype, closed=self.closed,
+ n_elements=self.root.n_elements))
+
+ # compat with IndexEngine interface
+ def clear_mapping(self):
+ pass
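
The `is_overlapping` property above relies on a sorted-by-left sweep; a hedged NumPy sketch of that check (name illustrative):

```python
import numpy as np

def is_overlapping_sketch(left, right, closed='right'):
    left, right = np.asarray(left), np.asarray(right)
    sorter = np.argsort(left)
    starts = left[sorter[1:]]        # each interval's start, in sorted order
    prev_ends = right[sorter[:-1]]   # the preceding interval's end
    op = np.less_equal if closed == 'both' else np.less
    return bool(op(starts, prev_ends).any())

# Touching endpoints overlap only when both sides are closed:
# is_overlapping_sketch([0, 2], [2, 3], closed='right') -> False
# is_overlapping_sketch([0, 2], [2, 3], closed='both')  -> True
```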
+
+
+cdef take(ndarray source, ndarray indices):
+ """Take the given positions from a 1D ndarray
+ """
+ return PyArray_Take(source, indices, 0)
+
+
+cdef sort_values_and_indices(all_values, all_indices, subset):
+ indices = take(all_indices, subset)
+ values = take(all_values, subset)
+ sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT)
+ sorted_values = take(values, sorter)
+ sorted_indices = take(indices, sorter)
+ return sorted_values, sorted_indices
+
+
+# ----------------------------------------------------------------------
+# Nodes
+# ----------------------------------------------------------------------
+
+# we need specialized nodes and leaves to optimize for different dtype and
+# closed values
+
+{{py:
+
+nodes = []
+for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']:
+ for closed, cmp_left, cmp_right in [
+ ('left', '<=', '<'),
+ ('right', '<', '<='),
+ ('both', '<=', '<='),
+ ('neither', '<', '<')]:
+ cmp_left_converse = '<' if cmp_left == '<=' else '<='
+ cmp_right_converse = '<' if cmp_right == '<=' else '<='
+ nodes.append((dtype, dtype.title(),
+ closed, closed.title(),
+ cmp_left,
+ cmp_right,
+ cmp_left_converse,
+ cmp_right_converse))
+
+}}
+
+NODE_CLASSES = {}
+
+{{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right,
+ cmp_left_converse, cmp_right_converse in nodes}}
+
+cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ {{dtype_title}}Closed{{closed_title}}IntervalNode left_node, right_node
+ {{dtype}}_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ {{dtype}}_t min_left, max_right
+ readonly {{dtype}}_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[{{dtype}}_t, ndim=1] left,
+ ndarray[{{dtype}}_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # sub-nodes that cannot contain the query point. If this node has
+        # size 0, queries are cheap, so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, {{dtype}}_t[:] left, {{dtype}}_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] {{cmp_right_converse}} self.pivot:
+ left_ind.append(i)
+ elif self.pivot {{cmp_left_converse}} left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[{{dtype}}_t, ndim=1] left,
+ ndarray[{{dtype}}_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return {{dtype_title}}Closed{{closed_title}}IntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ {{dtype}}_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] {{cmp_left}} point {{cmp_right}} self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] {{cmp_left}} point:
+ break
+ result.append(indices[i])
+ if point {{cmp_right}} self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point {{cmp_right}} values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left {{cmp_left}} point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<{{dtype_title}}Closed{{closed_title}}IntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<{{dtype_title}}Closed{{closed_title}}IntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['{{dtype}}',
+ '{{closed}}'] = {{dtype_title}}Closed{{closed_title}}IntervalNode
+
+{{endfor}}
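
`classify_intervals` above partitions intervals around the pivot using the templated comparisons; for the `closed='right'` instantiation it reduces to this sketch:

```python
def classify_intervals_sketch(left, right, pivot):
    # A 'right'-closed interval (lo, hi] contains the pivot iff
    # lo < pivot <= hi; otherwise it lies entirely on one side.
    left_set, right_set, center_set = [], [], []
    for i, (lo, hi) in enumerate(zip(left, right)):
        if hi < pivot:               # entirely below the pivot
            left_set.append(i)
        elif pivot <= lo:            # entirely above the pivot
            right_set.append(i)
        else:                        # overlaps the pivot: stays at this node
            center_set.append(i)
    return left_set, right_set, center_set

# classify_intervals_sketch([0, 2, 5], [1, 6, 7], pivot=4) -> ([0], [2], [1])
```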
diff --git a/contrib/python/pandas/py2/pandas/_libs/sparse_op_helper.pxi.in b/contrib/python/pandas/py2/pandas/_libs/sparse_op_helper.pxi.in
new file mode 100644
index 00000000000..18476d2e285
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/sparse_op_helper.pxi.in
@@ -0,0 +1,308 @@
+"""
+Template for each `dtype` helper function for sparse ops
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# Sparse op
+# ----------------------------------------------------------------------
+
+ctypedef fused sparse_t:
+ float64_t
+ int64_t
+
+
+cdef inline float64_t __div__(sparse_t a, sparse_t b):
+ if b == 0:
+ if a > 0:
+ return INF
+ elif a < 0:
+ return -INF
+ else:
+ return NaN
+ else:
+ return float(a) / b
+
+
+cdef inline float64_t __truediv__(sparse_t a, sparse_t b):
+ return __div__(a, b)
+
+
+cdef inline sparse_t __mod__(sparse_t a, sparse_t b):
+ if b == 0:
+ if sparse_t is float64_t:
+ return NaN
+ else:
+ return 0
+ else:
+ return a % b
+
+
+cdef inline sparse_t __floordiv__(sparse_t a, sparse_t b):
+ if b == 0:
+ if sparse_t is float64_t:
+ return NaN
+ else:
+ return 0
+ else:
+ return a // b
+
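The division helpers above encode the sparse fill-value rule for zero denominators; in plain Python:

```python
# Division by zero yields +/-inf by the sign of the numerator; 0/0 is NaN.
def sparse_div_sketch(a, b):
    if b == 0:
        if a > 0:
            return float('inf')
        elif a < 0:
            return float('-inf')
        return float('nan')
    return float(a) / b

# sparse_div_sketch(1, 0) -> inf;  sparse_div_sketch(0, 0) -> nan
```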
+
+# ----------------------------------------------------------------------
+# sparse array op
+# ----------------------------------------------------------------------
+
+{{py:
+
+# dtype, arith_comp_group, logical_group
+dtypes = [('float64', True, False),
+ ('int64', True, True),
+ ('uint8', False, True)]
+# do not generate arithmetic / comparison template for uint8,
+# it should be done in fused types
+
+def get_op(tup):
+ assert isinstance(tup, tuple)
+ assert len(tup) == 4
+
+ opname, lval, rval, dtype = tup
+
+ ops_dict = {'add': '{0} + {1}',
+ 'sub': '{0} - {1}',
+ 'mul': '{0} * {1}',
+ 'div': '__div__({0}, {1})',
+ 'mod': '__mod__({0}, {1})',
+ 'truediv': '__truediv__({0}, {1})',
+ 'floordiv': '__floordiv__({0}, {1})',
+ 'pow': '{0} ** {1}',
+ 'eq': '{0} == {1}',
+ 'ne': '{0} != {1}',
+ 'lt': '{0} < {1}',
+ 'gt': '{0} > {1}',
+ 'le': '{0} <= {1}',
+ 'ge': '{0} >= {1}',
+
+ 'and': '{0} & {1}', # logical op
+ 'or': '{0} | {1}'}
+
+ return ops_dict[opname].format(lval, rval, dtype)
+
+
+def get_dispatch(dtypes):
+
+ ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv',
+ 'floordiv', 'pow',
+ 'eq', 'ne', 'lt', 'gt', 'le', 'ge',
+ 'and', 'or']
+
+ for opname in ops_list:
+ for dtype, arith_comp_group, logical_group in dtypes:
+
+ if opname in ('div', 'truediv'):
+ rdtype = 'float64'
+ elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
+ # comparison op
+ rdtype = 'uint8'
+ elif opname in ('and', 'or'):
+ # logical op
+ rdtype = 'uint8'
+ else:
+ rdtype = dtype
+
+ if opname in ('and', 'or'):
+ if logical_group:
+ yield opname, dtype, rdtype
+ else:
+ if arith_comp_group:
+ yield opname, dtype, rdtype
+
+}}
+
+
+{{for opname, dtype, rdtype in get_dispatch(dtypes)}}
+
+{{if opname == "pow"}}
[email protected](True) # Cython 3 matches Python pow, which isn't what we want here
+{{endif}}
+cdef inline tuple block_op_{{opname}}_{{dtype}}(ndarray x_,
+ BlockIndex xindex,
+ {{dtype}}_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ {{dtype}}_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[{{dtype}}_t, ndim=1] x, y
+ ndarray[{{rdtype}}_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.{{rdtype}})
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}}
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}}
+
+
+cdef inline tuple int_op_{{opname}}_{{dtype}}(ndarray x_, IntIndex xindex,
+ {{dtype}}_t xfill,
+ ndarray y_, IntIndex yindex,
+ {{dtype}}_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[{{dtype}}_t, ndim=1] x, y
+ ndarray[{{rdtype}}_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.{{rdtype}})
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}}
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
+ yi += 1
+
+ return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}}
+
+
+cpdef sparse_{{opname}}_{{dtype}}(ndarray[{{dtype}}_t, ndim=1] x,
+ SparseIndex xindex, {{dtype}}_t xfill,
+ ndarray[{{dtype}}_t, ndim=1] y,
+ SparseIndex yindex, {{dtype}}_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_{{opname}}_{{dtype}}(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_{{opname}}_{{dtype}}(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_{{opname}}_{{dtype}}({{dtype}}_t xfill,
+ {{dtype}}_t yfill):
+ return {{(opname, 'xfill', 'yfill', dtype) | get_op}}
+
+{{endfor}}
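
The IntIndex branch above is a two-cursor merge over sorted location arrays; a simplified pure-Python sketch (it computes the union index directly instead of calling `make_union`, and `op` stands in for the generated operator):

```python
def int_op_sketch(x, xindices, xfill, y, yindices, yfill, op):
    out_indices = sorted(set(xindices) | set(yindices))
    out, xi, yi = [], 0, 0
    for loc in out_indices:
        if xi < len(xindices) and xindices[xi] == loc:
            xval, xi = x[xi], xi + 1     # x is dense at this location
        else:
            xval = xfill                 # x is sparse here: use its fill
        if yi < len(yindices) and yindices[yi] == loc:
            yval, yi = y[yi], yi + 1
        else:
            yval = yfill
        out.append(op(xval, yval))
    return out, out_indices, op(xfill, yfill)

# import operator
# int_op_sketch([1, 2], [0, 3], 0, [10], [3], 0, operator.add)
# -> ([1, 12], [0, 3], 0)
```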
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/headers/ms_inttypes.h b/contrib/python/pandas/py2/pandas/_libs/src/headers/ms_inttypes.h
new file mode 100644
index 00000000000..1be380337b0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/headers/ms_inttypes.h
@@ -0,0 +1,305 @@
+// ISO C9x compliant inttypes.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
+// Copyright (c) 2006 Alexander Chemeris
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. The name of the author may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_INTTYPES_H_ // [
+#define _MSC_INTTYPES_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#include "ms_stdint.h"
+
+// 7.8 Format conversion of integer types
+
+typedef struct {
+ intmax_t quot;
+ intmax_t rem;
+} imaxdiv_t;
+
+// 7.8.1 Macros for format specifiers
+
+#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198
+
+// The fprintf macros for signed integers are:
+#define PRId8 "d"
+#define PRIi8 "i"
+#define PRIdLEAST8 "d"
+#define PRIiLEAST8 "i"
+#define PRIdFAST8 "d"
+#define PRIiFAST8 "i"
+
+#define PRId16 "hd"
+#define PRIi16 "hi"
+#define PRIdLEAST16 "hd"
+#define PRIiLEAST16 "hi"
+#define PRIdFAST16 "hd"
+#define PRIiFAST16 "hi"
+
+#define PRId32 "I32d"
+#define PRIi32 "I32i"
+#define PRIdLEAST32 "I32d"
+#define PRIiLEAST32 "I32i"
+#define PRIdFAST32 "I32d"
+#define PRIiFAST32 "I32i"
+
+#define PRId64 "I64d"
+#define PRIi64 "I64i"
+#define PRIdLEAST64 "I64d"
+#define PRIiLEAST64 "I64i"
+#define PRIdFAST64 "I64d"
+#define PRIiFAST64 "I64i"
+
+#define PRIdMAX "I64d"
+#define PRIiMAX "I64i"
+
+#define PRIdPTR "Id"
+#define PRIiPTR "Ii"
+
+// The fprintf macros for unsigned integers are:
+#define PRIo8 "o"
+#define PRIu8 "u"
+#define PRIx8 "x"
+#define PRIX8 "X"
+#define PRIoLEAST8 "o"
+#define PRIuLEAST8 "u"
+#define PRIxLEAST8 "x"
+#define PRIXLEAST8 "X"
+#define PRIoFAST8 "o"
+#define PRIuFAST8 "u"
+#define PRIxFAST8 "x"
+#define PRIXFAST8 "X"
+
+#define PRIo16 "ho"
+#define PRIu16 "hu"
+#define PRIx16 "hx"
+#define PRIX16 "hX"
+#define PRIoLEAST16 "ho"
+#define PRIuLEAST16 "hu"
+#define PRIxLEAST16 "hx"
+#define PRIXLEAST16 "hX"
+#define PRIoFAST16 "ho"
+#define PRIuFAST16 "hu"
+#define PRIxFAST16 "hx"
+#define PRIXFAST16 "hX"
+
+#define PRIo32 "I32o"
+#define PRIu32 "I32u"
+#define PRIx32 "I32x"
+#define PRIX32 "I32X"
+#define PRIoLEAST32 "I32o"
+#define PRIuLEAST32 "I32u"
+#define PRIxLEAST32 "I32x"
+#define PRIXLEAST32 "I32X"
+#define PRIoFAST32 "I32o"
+#define PRIuFAST32 "I32u"
+#define PRIxFAST32 "I32x"
+#define PRIXFAST32 "I32X"
+
+#define PRIo64 "I64o"
+#define PRIu64 "I64u"
+#define PRIx64 "I64x"
+#define PRIX64 "I64X"
+#define PRIoLEAST64 "I64o"
+#define PRIuLEAST64 "I64u"
+#define PRIxLEAST64 "I64x"
+#define PRIXLEAST64 "I64X"
+#define PRIoFAST64 "I64o"
+#define PRIuFAST64 "I64u"
+#define PRIxFAST64 "I64x"
+#define PRIXFAST64 "I64X"
+
+#define PRIoMAX "I64o"
+#define PRIuMAX "I64u"
+#define PRIxMAX "I64x"
+#define PRIXMAX "I64X"
+
+#define PRIoPTR "Io"
+#define PRIuPTR "Iu"
+#define PRIxPTR "Ix"
+#define PRIXPTR "IX"
+
+// The fscanf macros for signed integers are:
+#define SCNd8 "d"
+#define SCNi8 "i"
+#define SCNdLEAST8 "d"
+#define SCNiLEAST8 "i"
+#define SCNdFAST8 "d"
+#define SCNiFAST8 "i"
+
+#define SCNd16 "hd"
+#define SCNi16 "hi"
+#define SCNdLEAST16 "hd"
+#define SCNiLEAST16 "hi"
+#define SCNdFAST16 "hd"
+#define SCNiFAST16 "hi"
+
+#define SCNd32 "ld"
+#define SCNi32 "li"
+#define SCNdLEAST32 "ld"
+#define SCNiLEAST32 "li"
+#define SCNdFAST32 "ld"
+#define SCNiFAST32 "li"
+
+#define SCNd64 "I64d"
+#define SCNi64 "I64i"
+#define SCNdLEAST64 "I64d"
+#define SCNiLEAST64 "I64i"
+#define SCNdFAST64 "I64d"
+#define SCNiFAST64 "I64i"
+
+#define SCNdMAX "I64d"
+#define SCNiMAX "I64i"
+
+#ifdef _WIN64 // [
+# define SCNdPTR "I64d"
+# define SCNiPTR "I64i"
+#else // _WIN64 ][
+# define SCNdPTR "ld"
+# define SCNiPTR "li"
+#endif // _WIN64 ]
+
+// The fscanf macros for unsigned integers are:
+#define SCNo8 "o"
+#define SCNu8 "u"
+#define SCNx8 "x"
+#define SCNX8 "X"
+#define SCNoLEAST8 "o"
+#define SCNuLEAST8 "u"
+#define SCNxLEAST8 "x"
+#define SCNXLEAST8 "X"
+#define SCNoFAST8 "o"
+#define SCNuFAST8 "u"
+#define SCNxFAST8 "x"
+#define SCNXFAST8 "X"
+
+#define SCNo16 "ho"
+#define SCNu16 "hu"
+#define SCNx16 "hx"
+#define SCNX16 "hX"
+#define SCNoLEAST16 "ho"
+#define SCNuLEAST16 "hu"
+#define SCNxLEAST16 "hx"
+#define SCNXLEAST16 "hX"
+#define SCNoFAST16 "ho"
+#define SCNuFAST16 "hu"
+#define SCNxFAST16 "hx"
+#define SCNXFAST16 "hX"
+
+#define SCNo32 "lo"
+#define SCNu32 "lu"
+#define SCNx32 "lx"
+#define SCNX32 "lX"
+#define SCNoLEAST32 "lo"
+#define SCNuLEAST32 "lu"
+#define SCNxLEAST32 "lx"
+#define SCNXLEAST32 "lX"
+#define SCNoFAST32 "lo"
+#define SCNuFAST32 "lu"
+#define SCNxFAST32 "lx"
+#define SCNXFAST32 "lX"
+
+#define SCNo64 "I64o"
+#define SCNu64 "I64u"
+#define SCNx64 "I64x"
+#define SCNX64 "I64X"
+#define SCNoLEAST64 "I64o"
+#define SCNuLEAST64 "I64u"
+#define SCNxLEAST64 "I64x"
+#define SCNXLEAST64 "I64X"
+#define SCNoFAST64 "I64o"
+#define SCNuFAST64 "I64u"
+#define SCNxFAST64 "I64x"
+#define SCNXFAST64 "I64X"
+
+#define SCNoMAX "I64o"
+#define SCNuMAX "I64u"
+#define SCNxMAX "I64x"
+#define SCNXMAX "I64X"
+
+#ifdef _WIN64 // [
+# define SCNoPTR "I64o"
+# define SCNuPTR "I64u"
+# define SCNxPTR "I64x"
+# define SCNXPTR "I64X"
+#else // _WIN64 ][
+# define SCNoPTR "lo"
+# define SCNuPTR "lu"
+# define SCNxPTR "lx"
+# define SCNXPTR "lX"
+#endif // _WIN64 ]
+
+#endif // __STDC_FORMAT_MACROS ]
+
+// 7.8.2 Functions for greatest-width integer types
+
+// 7.8.2.1 The imaxabs function
+#define imaxabs _abs64
+
+// 7.8.2.2 The imaxdiv function
+
+// This is modified version of div() function from Microsoft's div.c found
+// in %MSVC.NET%\crt\src\div.c
+#ifdef STATIC_IMAXDIV // [
+static
+#else // STATIC_IMAXDIV ][
+_inline
+#endif // STATIC_IMAXDIV ]
+imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
+{
+ imaxdiv_t result;
+
+ result.quot = numer / denom;
+ result.rem = numer % denom;
+
+ if (numer < 0 && result.rem > 0) {
+ // did division wrong; must fix up
+ ++result.quot;
+ result.rem -= denom;
+ }
+
+ return result;
+}
+
+// 7.8.2.3 The strtoimax and strtoumax functions
+#define strtoimax _strtoi64
+#define strtoumax _strtoui64
+
+// 7.8.2.4 The wcstoimax and wcstoumax functions
+#define wcstoimax _wcstoi64
+#define wcstoumax _wcstoui64
+
+
+#endif // _MSC_INTTYPES_H_ ]
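
The `imaxdiv` fix-up above reconciles floored and truncating division. The same adjustment in Python, where `divmod` floors toward minus infinity while C99 truncates toward zero; the guard here is written for floored divmod and so differs slightly from the header's `numer < 0 && rem > 0` check (function name illustrative):

```python
def imaxdiv_sketch(numer, denom):
    quot, rem = divmod(numer, denom)   # floored division
    if quot < 0 and rem != 0:          # floor and trunc differ only here
        quot += 1                      # move the quotient toward zero
        rem -= denom
    return quot, rem

# imaxdiv_sketch(-7, 2) -> (-3, -1), matching C's truncating division
```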
diff --git a/contrib/python/pandas/py2/pandas/conftest.py b/contrib/python/pandas/py2/pandas/conftest.py
new file mode 100644
index 00000000000..35a6b5df35d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/conftest.py
@@ -0,0 +1,677 @@
+from datetime import date, time, timedelta
+from decimal import Decimal
+import os
+
+from dateutil.tz import tzlocal, tzutc
+import hypothesis
+from hypothesis import strategies as st
+import numpy as np
+import pytest
+from pytz import FixedOffset, utc
+
+from pandas.compat import PY3, u
+import pandas.util._test_decorators as td
+
+import pandas as pd
+
+hypothesis.settings.register_profile(
+ "ci",
+ # Hypothesis timing checks are tuned for scalars by default, so we bump
+ # them from 200ms to 500ms per test case as the global default. If this
+    # is too short for a specific test: (a) try to make it faster, and (b)
+    # if it really is slow, add `@settings(deadline=...)` with a working value,
+ # or `deadline=None` to entirely disable timeouts for that test.
+ deadline=500,
+ timeout=hypothesis.unlimited,
+ suppress_health_check=(hypothesis.HealthCheck.too_slow,)
+)
+hypothesis.settings.load_profile("ci")
+
+
+def pytest_addoption(parser):
+ parser.addoption("--skip-slow", action="store_true",
+ help="skip slow tests")
+ parser.addoption("--skip-network", action="store_true",
+ help="skip network tests")
+ parser.addoption("--skip-db", action="store_true",
+ help="skip db tests")
+ parser.addoption("--run-high-memory", action="store_true",
+ help="run high memory tests")
+ parser.addoption("--only-slow", action="store_true",
+ help="run only slow tests")
+ parser.addoption("--strict-data-files", action="store_true",
+ help="Fail if a test is skipped for missing data file.")
+
+
+def pytest_runtest_setup(item):
+ if 'slow' in item.keywords and item.config.getoption("--skip-slow"):
+ pytest.skip("skipping due to --skip-slow")
+
+ if 'slow' not in item.keywords and item.config.getoption("--only-slow"):
+ pytest.skip("skipping due to --only-slow")
+
+ if 'network' in item.keywords and item.config.getoption("--skip-network"):
+ pytest.skip("skipping due to --skip-network")
+
+ if 'db' in item.keywords and item.config.getoption("--skip-db"):
+ pytest.skip("skipping due to --skip-db")
+
+ if 'high_memory' in item.keywords and not item.config.getoption(
+ "--run-high-memory"):
+ pytest.skip(
+ "skipping high memory test since --run-high-memory was not set")
+
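A hedged usage sketch of the marker/option wiring above (the test body is illustrative):

```python
import pytest

@pytest.mark.slow
def test_expensive_path():
    # Skipped under --skip-slow; the only kind collected under --only-slow.
    assert sum(range(10 ** 6)) == 499999500000
```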
+
+# Configurations for all tests and all test modules
+
[email protected](autouse=True)
+def configure_tests():
+ pd.set_option('chained_assignment', 'raise')
+
+
+# For running doctests: make np and pd names available
+
[email protected](autouse=True)
+def add_imports(doctest_namespace):
+ doctest_namespace['np'] = np
+ doctest_namespace['pd'] = pd
+
+
[email protected](params=['bsr', 'coo', 'csc', 'csr', 'dia', 'dok', 'lil'])
+def spmatrix(request):
+ from scipy import sparse
+ return getattr(sparse, request.param + '_matrix')
+
+
[email protected](params=[0, 1, 'index', 'columns'],
+ ids=lambda x: "axis {!r}".format(x))
+def axis(request):
+ """
+ Fixture for returning the axis numbers of a DataFrame.
+ """
+ return request.param
+
+
+axis_frame = axis
+
+
[email protected](params=[0, 'index'], ids=lambda x: "axis {!r}".format(x))
+def axis_series(request):
+ """
+ Fixture for returning the axis numbers of a Series.
+ """
+ return request.param
+
+
+def ip():
+ """
+ Get an instance of IPython.InteractiveShell.
+
+ Will raise a skip if IPython is not installed.
+ """
+
+ pytest.importorskip('IPython', minversion="6.0.0")
+ from IPython.core.interactiveshell import InteractiveShell
+ return InteractiveShell()
+
+
[email protected](params=[True, False, None])
+def observed(request):
+ """ pass in the observed keyword to groupby for [True, False]
+ This indicates whether categoricals should return values for
+ values which are not in the grouper [False / None], or only values which
+ appear in the grouper [True]. [None] is supported for future compatiblity
+ if we decide to change the default (and would need to warn if this
+ parameter is not passed)"""
+ return request.param
+
+
+_all_arithmetic_operators = ['__add__', '__radd__',
+ '__sub__', '__rsub__',
+ '__mul__', '__rmul__',
+ '__floordiv__', '__rfloordiv__',
+ '__truediv__', '__rtruediv__',
+ '__pow__', '__rpow__',
+ '__mod__', '__rmod__']
+if not PY3:
+ _all_arithmetic_operators.extend(['__div__', '__rdiv__'])
+
+
[email protected](params=_all_arithmetic_operators)
+def all_arithmetic_operators(request):
+ """
+ Fixture for dunder names for common arithmetic operations
+ """
+ return request.param
+
+
+_all_numeric_reductions = ['sum', 'max', 'min',
+ 'mean', 'prod', 'std', 'var', 'median',
+ 'kurt', 'skew']
+
+
[email protected](params=_all_numeric_reductions)
+def all_numeric_reductions(request):
+ """
+ Fixture for numeric reduction names
+ """
+ return request.param
+
+
+_all_boolean_reductions = ['all', 'any']
+
+
[email protected](params=_all_boolean_reductions)
+def all_boolean_reductions(request):
+ """
+ Fixture for boolean reduction names
+ """
+ return request.param
+
+
+_cython_table = pd.core.base.SelectionMixin._cython_table.items()
+
+
[email protected](params=list(_cython_table))
+def cython_table_items(request):
+ return request.param
+
+
+def _get_cython_table_params(ndframe, func_names_and_expected):
+ """combine frame, functions from SelectionMixin._cython_table
+ keys and expected result.
+
+ Parameters
+ ----------
+ ndframe : DataFrame or Series
+    func_names_and_expected : Sequence of (func_name, expected) pairs
+        The first item is the name of an NDFrame method ('sum', 'prod', etc.);
+        the second item is the expected return value.
+
+ Returns
+ -------
+ results : list
+        List of (ndframe, func, expected) tuples
+ """
+ results = []
+ for func_name, expected in func_names_and_expected:
+ results.append((ndframe, func_name, expected))
+ results += [(ndframe, func, expected) for func, name in _cython_table
+ if name == func_name]
+ return results
+
+
[email protected](params=['__eq__', '__ne__', '__le__',
+ '__lt__', '__ge__', '__gt__'])
+def all_compare_operators(request):
+ """
+ Fixture for dunder names for common compare operations
+
+ * >=
+ * >
+ * ==
+ * !=
+ * <
+ * <=
+ """
+ return request.param
+
+
[email protected](params=[None, 'gzip', 'bz2', 'zip',
+ pytest.param('xz', marks=td.skip_if_no_lzma)])
+def compression(request):
+ """
+ Fixture for trying common compression types in compression tests
+ """
+ return request.param
+
+
[email protected](params=['gzip', 'bz2', 'zip',
+ pytest.param('xz', marks=td.skip_if_no_lzma)])
+def compression_only(request):
+ """
+    Fixture for trying common compression types in compression tests,
+    excluding the uncompressed case
+ """
+ return request.param
+
+
[email protected](params=[True, False])
+def writable(request):
+ """
+    Fixture parametrizing over whether an array is writable
+ """
+ return request.param
+
+
[email protected](scope='module')
+def datetime_tz_utc():
+ from datetime import timezone
+ return timezone.utc
+
+
+utc_objs = ['utc', 'dateutil/UTC', utc, tzutc()]
+if PY3:
+ from datetime import timezone
+ utc_objs.append(timezone.utc)
+
+
[email protected](params=utc_objs)
+def utc_fixture(request):
+ """
+ Fixture to provide variants of UTC timezone strings and tzinfo objects
+ """
+ return request.param
+
+
[email protected](params=['inner', 'outer', 'left', 'right'])
+def join_type(request):
+ """
+ Fixture for trying all types of join operations
+ """
+ return request.param
+
+
+def strict_data_files(pytestconfig):
+ return pytestconfig.getoption("--strict-data-files")
+
+
+def datapath(strict_data_files):
+ """Get the path to a data file.
+
+ Parameters
+ ----------
+ path : str
+ Path to the file, relative to ``pandas/tests/``
+
+ Returns
+ -------
+ path : path including ``pandas/tests``.
+
+ Raises
+ ------
+ ValueError
+ If the path doesn't exist and the --strict-data-files option is set.
+ """
+ BASE_PATH = os.path.join(os.path.dirname(__file__), 'tests')
+
+ def deco(*args):
+ path = os.path.join(BASE_PATH, *args)
+ if not os.path.exists(path):
+ if strict_data_files:
+ msg = "Could not find file {} and --strict-data-files is set."
+ raise ValueError(msg.format(path))
+ else:
+ msg = "Could not find {}."
+ pytest.skip(msg.format(path))
+ return path
+ return deco
+
+
+def iris(datapath):
+ """The iris dataset as a DataFrame."""
+ return pd.read_csv(datapath('data', 'iris.csv'))
+
+
[email protected](params=['nlargest', 'nsmallest'])
+def nselect_method(request):
+ """
+ Fixture for trying all nselect methods
+ """
+ return request.param
+
+
[email protected](params=['left', 'right', 'both', 'neither'])
+def closed(request):
+ """
+ Fixture for trying all interval closed parameters
+ """
+ return request.param
+
+
[email protected](params=['left', 'right', 'both', 'neither'])
+def other_closed(request):
+ """
+ Secondary closed fixture to allow parametrizing over all pairs of closed
+ """
+ return request.param
+
+
[email protected](params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')])
+def nulls_fixture(request):
+ """
+ Fixture for each null type in pandas
+ """
+ return request.param
+
+
+nulls_fixture2 = nulls_fixture # Generate cartesian product of nulls_fixture
+
+
[email protected](params=[None, np.nan, pd.NaT])
+def unique_nulls_fixture(request):
+ """
+ Fixture for each null type in pandas, each null type exactly once
+ """
+ return request.param
+
+
+# Generate cartesian product of unique_nulls_fixture:
+unique_nulls_fixture2 = unique_nulls_fixture
+
+
+TIMEZONES = [None, 'UTC', 'US/Eastern', 'Asia/Tokyo', 'dateutil/US/Pacific',
+ 'dateutil/Asia/Singapore', tzutc(), tzlocal(), FixedOffset(300),
+ FixedOffset(0), FixedOffset(-300)]
+
+
[email protected]_fixture_doc(str(TIMEZONES))
[email protected](params=TIMEZONES)
+def tz_naive_fixture(request):
+ """
+ Fixture for trying timezones including default (None): {0}
+ """
+ return request.param
+
+
[email protected]_fixture_doc(str(TIMEZONES[1:]))
[email protected](params=TIMEZONES[1:])
+def tz_aware_fixture(request):
+ """
+ Fixture for trying explicit timezones: {0}
+ """
+ return request.param
+
+
+# ----------------------------------------------------------------
+# Dtypes
+UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"]
+UNSIGNED_EA_INT_DTYPES = ["UInt8", "UInt16", "UInt32", "UInt64"]
+SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"]
+SIGNED_EA_INT_DTYPES = ["Int8", "Int16", "Int32", "Int64"]
+ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES
+ALL_EA_INT_DTYPES = UNSIGNED_EA_INT_DTYPES + SIGNED_EA_INT_DTYPES
+
+FLOAT_DTYPES = [float, "float32", "float64"]
+COMPLEX_DTYPES = [complex, "complex64", "complex128"]
+STRING_DTYPES = [str, 'str', 'U']
+
+DATETIME_DTYPES = ['datetime64[ns]', 'M8[ns]']
+TIMEDELTA_DTYPES = ['timedelta64[ns]', 'm8[ns]']
+
+BOOL_DTYPES = [bool, 'bool']
+BYTES_DTYPES = [bytes, 'bytes']
+OBJECT_DTYPES = [object, 'object']
+
+ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES
+ALL_NUMPY_DTYPES = (ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES
+ + DATETIME_DTYPES + TIMEDELTA_DTYPES + BOOL_DTYPES
+ + OBJECT_DTYPES + BYTES_DTYPES * PY3) # bytes only for PY3
+
+
[email protected](params=STRING_DTYPES)
+def string_dtype(request):
+ """Parametrized fixture for string dtypes.
+
+ * str
+ * 'str'
+ * 'U'
+ """
+ return request.param
+
+
[email protected](params=FLOAT_DTYPES)
+def float_dtype(request):
+ """
+ Parameterized fixture for float dtypes.
+
+ * float
+ * 'float32'
+ * 'float64'
+ """
+
+ return request.param
+
+
[email protected](params=COMPLEX_DTYPES)
+def complex_dtype(request):
+ """
+ Parameterized fixture for complex dtypes.
+
+ * complex
+ * 'complex64'
+ * 'complex128'
+ """
+
+ return request.param
+
+
[email protected](params=SIGNED_INT_DTYPES)
+def sint_dtype(request):
+ """
+ Parameterized fixture for signed integer dtypes.
+
+ * int
+ * 'int8'
+ * 'int16'
+ * 'int32'
+ * 'int64'
+ """
+
+ return request.param
+
+
[email protected](params=UNSIGNED_INT_DTYPES)
+def uint_dtype(request):
+ """
+ Parameterized fixture for unsigned integer dtypes.
+
+ * 'uint8'
+ * 'uint16'
+ * 'uint32'
+ * 'uint64'
+ """
+
+ return request.param
+
+
[email protected](params=ALL_INT_DTYPES)
+def any_int_dtype(request):
+ """
+ Parameterized fixture for any integer dtype.
+
+ * int
+ * 'int8'
+ * 'uint8'
+ * 'int16'
+ * 'uint16'
+ * 'int32'
+ * 'uint32'
+ * 'int64'
+ * 'uint64'
+ """
+
+ return request.param
+
+
[email protected](params=ALL_REAL_DTYPES)
+def any_real_dtype(request):
+ """
+ Parameterized fixture for any (purely) real numeric dtype.
+
+ * int
+ * 'int8'
+ * 'uint8'
+ * 'int16'
+ * 'uint16'
+ * 'int32'
+ * 'uint32'
+ * 'int64'
+ * 'uint64'
+ * float
+ * 'float32'
+ * 'float64'
+ """
+
+ return request.param
+
+
[email protected](params=ALL_NUMPY_DTYPES)
+def any_numpy_dtype(request):
+ """
+ Parameterized fixture for all numpy dtypes.
+
+ * bool
+ * 'bool'
+ * int
+ * 'int8'
+ * 'uint8'
+ * 'int16'
+ * 'uint16'
+ * 'int32'
+ * 'uint32'
+ * 'int64'
+ * 'uint64'
+ * float
+ * 'float32'
+ * 'float64'
+ * complex
+ * 'complex64'
+ * 'complex128'
+ * str
+ * 'str'
+ * 'U'
+ * bytes
+ * 'bytes'
+ * 'datetime64[ns]'
+ * 'M8[ns]'
+ * 'timedelta64[ns]'
+ * 'm8[ns]'
+ * object
+ * 'object'
+ """
+
+ return request.param
+
+
+# categoricals are handled separately
+_any_skipna_inferred_dtype = [
+ ('string', ['a', np.nan, 'c']),
+ ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]),
+ ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']),
+ ('empty', [np.nan, np.nan, np.nan]),
+ ('empty', []),
+ ('mixed-integer', ['a', np.nan, 2]),
+ ('mixed', ['a', np.nan, 2.0]),
+ ('floating', [1.0, np.nan, 2.0]),
+ ('integer', [1, np.nan, 2]),
+ ('mixed-integer-float', [1, np.nan, 2.0]),
+ ('decimal', [Decimal(1), np.nan, Decimal(2)]),
+ ('boolean', [True, np.nan, False]),
+ ('datetime64', [np.datetime64('2013-01-01'), np.nan,
+ np.datetime64('2018-01-01')]),
+ ('datetime', [pd.Timestamp('20130101'), np.nan, pd.Timestamp('20180101')]),
+ ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]),
+ # The following two dtypes are commented out due to GH 23554
+ # ('complex', [1 + 1j, np.nan, 2 + 2j]),
+ # ('timedelta64', [np.timedelta64(1, 'D'),
+ # np.nan, np.timedelta64(2, 'D')]),
+ ('timedelta', [timedelta(1), np.nan, timedelta(2)]),
+ ('time', [time(1), np.nan, time(2)]),
+ ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]),
+ ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)])]
+ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id
+
+
[email protected](params=_any_skipna_inferred_dtype, ids=ids)
+def any_skipna_inferred_dtype(request):
+ """
+ Fixture for all inferred dtypes from _libs.lib.infer_dtype
+
+ The covered (inferred) types are:
+ * 'string'
+ * 'unicode' (if PY2)
+ * 'empty'
+ * 'bytes' (if PY3)
+ * 'mixed'
+ * 'mixed-integer'
+ * 'mixed-integer-float'
+ * 'floating'
+ * 'integer'
+ * 'decimal'
+ * 'boolean'
+ * 'datetime64'
+ * 'datetime'
+ * 'date'
+ * 'timedelta'
+ * 'time'
+ * 'period'
+ * 'interval'
+
+ Returns
+ -------
+ inferred_dtype : str
+ The string for the inferred dtype from _libs.lib.infer_dtype
+ values : np.ndarray
+ An array of object dtype that will be inferred to have
+ `inferred_dtype`
+
+ Examples
+ --------
+ >>> import pandas._libs.lib as lib
+ >>>
+ >>> def test_something(any_skipna_inferred_dtype):
+ ... inferred_dtype, values = any_skipna_inferred_dtype
+ ... # will pass
+ ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype
+ """
+ inferred_dtype, values = request.param
+ values = np.array(values, dtype=object) # object dtype to avoid casting
+
+ # correctness of inference tested in tests/dtypes/test_inference.py
+ return inferred_dtype, values
+
+
[email protected](params=[getattr(pd.offsets, o) for o in pd.offsets.__all__ if
+ issubclass(getattr(pd.offsets, o), pd.offsets.Tick)])
+def tick_classes(request):
+ """
+ Fixture for Tick based datetime offsets available for a time series.
+ """
+ return request.param
+
+# ----------------------------------------------------------------
+# Global setup for tests using Hypothesis
+
+
+# Registering these strategies makes them globally available via st.from_type,
+# which is used for offsets in tests/tseries/offsets/test_offsets_properties.py
+for name in 'MonthBegin MonthEnd BMonthBegin BMonthEnd'.split():
+ cls = getattr(pd.tseries.offsets, name)
+ st.register_type_strategy(cls, st.builds(
+ cls,
+ n=st.integers(-99, 99),
+ normalize=st.booleans(),
+ ))
+
+for name in 'YearBegin YearEnd BYearBegin BYearEnd'.split():
+ cls = getattr(pd.tseries.offsets, name)
+ st.register_type_strategy(cls, st.builds(
+ cls,
+ n=st.integers(-5, 5),
+ normalize=st.booleans(),
+ month=st.integers(min_value=1, max_value=12),
+ ))
+
+for name in 'QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd'.split():
+ cls = getattr(pd.tseries.offsets, name)
+ st.register_type_strategy(cls, st.builds(
+ cls,
+ n=st.integers(-24, 24),
+ normalize=st.booleans(),
+ startingMonth=st.integers(min_value=1, max_value=12)
+ ))
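
With the strategies registered above, `st.from_type` can draw valid offset instances directly; a hedged usage sketch (the test name and assertion are illustrative, and `pd` is the pandas import from this module):

```python
from hypothesis import given, strategies as st

@given(offset=st.from_type(pd.tseries.offsets.MonthEnd))
def test_offset_addition_returns_timestamp(offset):
    # Any registered MonthEnd (any n in [-99, 99], normalized or not)
    # should still produce a Timestamp when added.
    assert isinstance(pd.Timestamp('2019-01-15') + offset, pd.Timestamp)
```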
diff --git a/contrib/python/pandas/py2/pandas/tests/__init__.py b/contrib/python/pandas/py2/pandas/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/api/__init__.py b/contrib/python/pandas/py2/pandas/tests/api/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/api/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/api/test_api.py b/contrib/python/pandas/py2/pandas/tests/api/test_api.py
new file mode 100644
index 00000000000..599ab9a3c5f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/api/test_api.py
@@ -0,0 +1,165 @@
+# -*- coding: utf-8 -*-
+import sys
+
+import pandas as pd
+from pandas import api
+from pandas.util import testing as tm
+
+
+class Base(object):
+
+ def check(self, namespace, expected, ignored=None):
+ # collect the public names in the namespace, minus any optional
+ # ignored ones, and compare against the expected list
+
+ result = sorted(f for f in dir(namespace) if not f.startswith('_'))
+ if ignored is not None:
+ result = sorted(list(set(result) - set(ignored)))
+
+ expected = sorted(expected)
+ tm.assert_almost_equal(result, expected)
+
+
+class TestPDApi(Base):
+
+ # these are optionally imported based on testing
+ # & need to be ignored
+ ignored = ['tests', 'locale', 'conftest']
+
+ # top-level sub-packages
+ lib = ['api', 'arrays', 'compat', 'core', 'errors', 'pandas',
+ 'plotting', 'test', 'testing', 'tseries',
+ 'util', 'options', 'io']
+
+ # these are already deprecated; awaiting removal
+ deprecated_modules = []
+
+ # misc
+ misc = ['IndexSlice', 'NaT']
+
+ # top-level classes
+ classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset',
+ 'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index',
+ 'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex',
+ 'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index',
+ 'Series', 'SparseArray', 'SparseDataFrame', 'SparseDtype',
+ 'SparseSeries', 'Timedelta',
+ 'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex',
+ 'CategoricalDtype', 'PeriodDtype', 'IntervalDtype',
+ 'DatetimeTZDtype',
+ 'Int8Dtype', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype',
+ 'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype',
+ ]
+
+ # these are already deprecated; awaiting removal
+ deprecated_classes = ['TimeGrouper']
+
+ # these should be deprecated in the future
+ deprecated_classes_in_future = ['Panel']
+
+ # external modules exposed in pandas namespace
+ modules = ['np', 'datetime']
+
+ # top-level functions
+ funcs = ['array', 'bdate_range', 'concat', 'crosstab', 'cut',
+ 'date_range', 'interval_range', 'eval',
+ 'factorize', 'get_dummies',
+ 'infer_freq', 'isna', 'isnull', 'lreshape',
+ 'melt', 'notna', 'notnull', 'offsets',
+ 'merge', 'merge_ordered', 'merge_asof',
+ 'period_range',
+ 'pivot', 'pivot_table', 'qcut',
+ 'show_versions', 'timedelta_range', 'unique',
+ 'value_counts', 'wide_to_long']
+
+ # top-level option funcs
+ funcs_option = ['reset_option', 'describe_option', 'get_option',
+ 'option_context', 'set_option',
+ 'set_eng_float_format']
+
+ # top-level read_* funcs
+ funcs_read = ['read_clipboard', 'read_csv', 'read_excel', 'read_fwf',
+ 'read_gbq', 'read_hdf', 'read_html', 'read_json',
+ 'read_msgpack', 'read_pickle', 'read_sas', 'read_sql',
+ 'read_sql_query', 'read_sql_table', 'read_stata',
+ 'read_table', 'read_feather', 'read_parquet']
+
+ # top-level to_* funcs
+ funcs_to = ['to_datetime', 'to_msgpack',
+ 'to_numeric', 'to_pickle', 'to_timedelta']
+
+ # top-level to deprecate in the future
+ deprecated_funcs_in_future = []
+
+ # these are already deprecated; awaiting removal
+ deprecated_funcs = []
+
+ def test_api(self):
+
+ self.check(pd,
+ self.lib + self.misc +
+ self.modules + self.deprecated_modules +
+ self.classes + self.deprecated_classes +
+ self.deprecated_classes_in_future +
+ self.funcs + self.funcs_option +
+ self.funcs_read + self.funcs_to +
+ self.deprecated_funcs_in_future +
+ self.deprecated_funcs,
+ self.ignored)
+
+
+class TestApi(Base):
+
+ allowed = ['types', 'extensions']
+
+ def test_api(self):
+
+ self.check(api, self.allowed)
+
+
+class TestTesting(Base):
+
+ funcs = ['assert_frame_equal', 'assert_series_equal',
+ 'assert_index_equal']
+
+ def test_testing(self):
+
+ from pandas import testing
+ self.check(testing, self.funcs)
+
+
+class TestTopLevelDeprecations(object):
+
+ # top-level API deprecations
+ # GH 13790
+
+ def test_TimeGrouper(self):
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ pd.TimeGrouper(freq='D')
+
+
+class TestCDateRange(object):
+
+ def test_deprecation_cdaterange(self):
+ # GH17596
+ from pandas.core.indexes.datetimes import cdate_range
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ cdate_range('2017-01-01', '2017-12-31')
+
+
+class TestCategoricalMove(object):
+
+ def test_categorical_move(self):
+ # May have been cached by another import, e.g. pickle tests.
+ sys.modules.pop("pandas.core.categorical", None)
+
+ with tm.assert_produces_warning(FutureWarning):
+ from pandas.core.categorical import Categorical # noqa
+
+ sys.modules.pop("pandas.core.categorical", None)
+
+ with tm.assert_produces_warning(FutureWarning):
+ from pandas.core.categorical import CategoricalDtype # noqa
diff --git a/contrib/python/pandas/py2/pandas/tests/api/test_types.py b/contrib/python/pandas/py2/pandas/tests/api/test_types.py
new file mode 100644
index 00000000000..235d7ecc64f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/api/test_types.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+from pandas.api import types
+from pandas.util import testing as tm
+
+from .test_api import Base
+
+
+class TestTypes(Base):
+
+ allowed = ['is_bool', 'is_bool_dtype',
+ 'is_categorical', 'is_categorical_dtype', 'is_complex',
+ 'is_complex_dtype', 'is_datetime64_any_dtype',
+ 'is_datetime64_dtype', 'is_datetime64_ns_dtype',
+ 'is_datetime64tz_dtype', 'is_dtype_equal',
+ 'is_extension_type', 'is_float', 'is_float_dtype',
+ 'is_int64_dtype', 'is_integer',
+ 'is_integer_dtype', 'is_number', 'is_numeric_dtype',
+ 'is_object_dtype', 'is_scalar', 'is_sparse',
+ 'is_string_dtype', 'is_signed_integer_dtype',
+ 'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
+ 'is_unsigned_integer_dtype',
+ 'is_period_dtype', 'is_interval', 'is_interval_dtype',
+ 'is_re', 'is_re_compilable',
+ 'is_dict_like', 'is_iterator', 'is_file_like',
+ 'is_list_like', 'is_hashable', 'is_array_like',
+ 'is_named_tuple',
+ 'pandas_dtype', 'union_categoricals', 'infer_dtype',
+ 'is_extension_array_dtype']
+ deprecated = ['is_period', 'is_datetimetz']
+ dtypes = ['CategoricalDtype', 'DatetimeTZDtype',
+ 'PeriodDtype', 'IntervalDtype']
+
+ def test_types(self):
+
+ self.check(types, self.allowed + self.dtypes + self.deprecated)
+
+ def test_deprecated_from_api_types(self):
+
+ for t in self.deprecated:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ getattr(types, t)(1)
diff --git a/contrib/python/pandas/py2/pandas/tests/arithmetic/__init__.py b/contrib/python/pandas/py2/pandas/tests/arithmetic/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arithmetic/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/arithmetic/conftest.py b/contrib/python/pandas/py2/pandas/tests/arithmetic/conftest.py
new file mode 100644
index 00000000000..671fe69750c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arithmetic/conftest.py
@@ -0,0 +1,192 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import pytest
+
+from pandas.compat import long
+
+import pandas as pd
+import pandas.util.testing as tm
+
+# ------------------------------------------------------------------
+# Helper Functions
+
+
+def id_func(x):
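+ """Generate readable fixture ids, e.g. 'DataFrame-True' or 'Index'."""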
+ if isinstance(x, tuple):
+ assert len(x) == 2
+ return x[0].__name__ + '-' + str(x[1])
+ else:
+ return x.__name__
+
+
+# ------------------------------------------------------------------
+
[email protected](params=[1, np.array(1, dtype=np.int64)])
+def one(request):
+ # zero-dim integer array behaves like an integer
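+ # (e.g. np.array(1, dtype=np.int64) + 1 == 2, and its shape is ())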
+ return request.param
+
+
+zeros = [box_cls([0] * 5, dtype=dtype)
+ for box_cls in [pd.Index, np.array]
+ for dtype in [np.int64, np.uint64, np.float64]]
+zeros.extend([np.array(0, dtype=dtype)
+ for dtype in [np.int64, np.uint64, np.float64]])
+zeros.extend([0, 0.0, long(0)])
+
+
[email protected](params=zeros)
+def zero(request):
+ # For testing division by (or of) zero for Index with length 5, this
+ # gives several scalar-zeros and length-5 vector-zeros
+ return request.param
+
+
+# ------------------------------------------------------------------
+# Vector Fixtures
+
[email protected](params=[pd.Float64Index(np.arange(5, dtype='float64')),
+ pd.Int64Index(np.arange(5, dtype='int64')),
+ pd.UInt64Index(np.arange(5, dtype='uint64')),
+ pd.RangeIndex(5)],
+ ids=lambda x: type(x).__name__)
+def numeric_idx(request):
+ """
+ Several types of numeric-dtype Index objects
+ """
+ return request.param
+
+
+# ------------------------------------------------------------------
+# Scalar Fixtures
+
[email protected](params=[pd.Timedelta('5m4s').to_pytimedelta(),
+ pd.Timedelta('5m4s'),
+ pd.Timedelta('5m4s').to_timedelta64()],
+ ids=lambda x: type(x).__name__)
+def scalar_td(request):
+ """
+ Several variants of Timedelta scalars representing 5 minutes and 4 seconds
+ """
+ return request.param
+
+
[email protected](params=[pd.offsets.Day(3),
+ pd.offsets.Hour(72),
+ pd.Timedelta(days=3).to_pytimedelta(),
+ pd.Timedelta('72:00:00'),
+ np.timedelta64(3, 'D'),
+ np.timedelta64(72, 'h')],
+ ids=lambda x: type(x).__name__)
+def three_days(request):
+ """
+ Several timedelta-like and DateOffset objects that each represent
+ a 3-day timedelta
+ """
+ return request.param
+
+
[email protected](params=[pd.offsets.Hour(2),
+ pd.offsets.Minute(120),
+ pd.Timedelta(hours=2).to_pytimedelta(),
+ pd.Timedelta(seconds=2 * 3600),
+ np.timedelta64(2, 'h'),
+ np.timedelta64(120, 'm')],
+ ids=lambda x: type(x).__name__)
+def two_hours(request):
+ """
+ Several timedelta-like and DateOffset objects that each represent
+ a 2-hour timedelta
+ """
+ return request.param
+
+
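+# DateOffset instances whose frequencies do not match any of the fixed
+# frequencies exercised by the mismatch fixtures below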
+_common_mismatch = [pd.offsets.YearBegin(2),
+ pd.offsets.MonthBegin(1),
+ pd.offsets.Minute()]
+
+
[email protected](params=[pd.Timedelta(minutes=30).to_pytimedelta(),
+ np.timedelta64(30, 's'),
+ pd.Timedelta(seconds=30)] + _common_mismatch)
+def not_hourly(request):
+ """
+ Several timedelta-like and DateOffset instances that are _not_
+ compatible with Hourly frequencies.
+ """
+ return request.param
+
+
[email protected](params=[np.timedelta64(4, 'h'),
+ pd.Timedelta(hours=23).to_pytimedelta(),
+ pd.Timedelta('23:00:00')] + _common_mismatch)
+def not_daily(request):
+ """
+ Several timedelta-like and DateOffset instances that are _not_
+ compatible with Daily frequencies.
+ """
+ return request.param
+
+
[email protected](params=[np.timedelta64(365, 'D'),
+ pd.Timedelta(days=365).to_pytimedelta(),
+ pd.Timedelta(days=365)] + _common_mismatch)
+def mismatched_freq(request):
+ """
+ Several timedelta-like and DateOffset instances that are _not_
+ compatible with Monthly or Annual frequencies.
+ """
+ return request.param
+
+
+# ------------------------------------------------------------------
+
[email protected](params=[pd.Index, pd.Series, pd.DataFrame],
+ ids=id_func)
+def box(request):
+ """
+ Several array-like containers that should have effectively identical
+ behavior with respect to arithmetic operations.
+ """
+ return request.param
+
+
[email protected](params=[pd.Index,
+ pd.Series,
+ pytest.param(pd.DataFrame,
+ marks=pytest.mark.xfail)],
+ ids=id_func)
+def box_df_fail(request):
+ """
+ Fixture equivalent to `box` fixture but xfailing the DataFrame case.
+ """
+ return request.param
+
+
[email protected](params=[(pd.Index, False),
+ (pd.Series, False),
+ (pd.DataFrame, False),
+ pytest.param((pd.DataFrame, True),
+ marks=pytest.mark.xfail)],
+ ids=id_func)
+def box_transpose_fail(request):
+ """
+ Fixture similar to `box` but testing both transpose cases for DataFrame,
+ with the transpose=True case xfailed.
+ """
+ # GH#23620
+ return request.param
+
+
[email protected](params=[pd.Index, pd.Series, pd.DataFrame, tm.to_array],
+ ids=id_func)
+def box_with_array(request):
+ """
+ Fixture to test behavior for Index, Series, DataFrame, and pandas Array
+ classes
+ """
+ return request.param
+
+
+# alias so we can use the same fixture for multiple parameters in a test
+box_with_array2 = box_with_array
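+# (a test that requests both box_with_array and box_with_array2 is run for
+# all 4 x 4 = 16 combinations of the boxed classes)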
diff --git a/contrib/python/pandas/py2/pandas/tests/arithmetic/test_datetime64.py b/contrib/python/pandas/py2/pandas/tests/arithmetic/test_datetime64.py
new file mode 100644
index 00000000000..acf4075feb9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arithmetic/test_datetime64.py
@@ -0,0 +1,2334 @@
+# -*- coding: utf-8 -*-
+# Arithmetic tests for DataFrame/Series/Index/Array classes that should
+# behave identically.
+# Specifically for datetime64 and datetime64tz dtypes
+from datetime import datetime, timedelta
+from itertools import product, starmap
+import operator
+import warnings
+
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs.tslibs.conversion import localize_pydatetime
+from pandas._libs.tslibs.offsets import shift_months
+from pandas.compat.numpy import np_datetime64_compat
+from pandas.errors import NullFrequencyError, PerformanceWarning
+
+import pandas as pd
+from pandas import (
+ DatetimeIndex, NaT, Period, Series, Timedelta, TimedeltaIndex, Timestamp,
+ date_range)
+from pandas.core.indexes.datetimes import _to_M8
+import pandas.util.testing as tm
+
+
+def assert_all(obj):
+ """
+ Test helper to call call obj.all() the appropriate number of times on
+ a Series or DataFrame.
+ """
+ if isinstance(obj, pd.DataFrame):
+ assert obj.all().all()
+ else:
+ assert obj.all()
+
+
+# ------------------------------------------------------------------
+# Comparisons
+
+class TestDatetime64DataFrameComparison(object):
+ @pytest.mark.parametrize('timestamps', [
+ [pd.Timestamp('2012-01-01 13:00:00+00:00')] * 2,
+ [pd.Timestamp('2012-01-01 13:00:00')] * 2])
+ def test_tz_aware_scalar_comparison(self, timestamps):
+ # GH#15966
+ df = pd.DataFrame({'test': timestamps})
+ expected = pd.DataFrame({'test': [False, False]})
+ tm.assert_frame_equal(df == -1, expected)
+
+ def test_dt64_nat_comparison(self):
+ # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly
+ ts = pd.Timestamp.now()
+ df = pd.DataFrame([ts, pd.NaT])
+ expected = pd.DataFrame([True, False])
+
+ result = df == ts
+ tm.assert_frame_equal(result, expected)
+
+
+class TestDatetime64SeriesComparison(object):
+ # TODO: moved from tests.series.test_operators; needs cleanup
+
+ @pytest.mark.parametrize('pair', [
+ ([pd.Timestamp('2011-01-01'), NaT, pd.Timestamp('2011-01-03')],
+ [NaT, NaT, pd.Timestamp('2011-01-03')]),
+
+ ([pd.Timedelta('1 days'), NaT, pd.Timedelta('3 days')],
+ [NaT, NaT, pd.Timedelta('3 days')]),
+
+ ([pd.Period('2011-01', freq='M'), NaT,
+ pd.Period('2011-03', freq='M')],
+ [NaT, NaT, pd.Period('2011-03', freq='M')]),
+
+ ])
+ @pytest.mark.parametrize('reverse', [True, False])
+ @pytest.mark.parametrize('box', [Series, pd.Index])
+ @pytest.mark.parametrize('dtype', [None, object])
+ def test_nat_comparisons(self, dtype, box, reverse, pair):
+ l, r = pair
+ if reverse:
+ # add lhs / rhs switched data
+ l, r = r, l
+
+ left = Series(l, dtype=dtype)
+ right = box(r, dtype=dtype)
+ # Series, Index
+
+ expected = Series([False, False, True])
+ tm.assert_series_equal(left == right, expected)
+
+ expected = Series([True, True, False])
+ tm.assert_series_equal(left != right, expected)
+
+ expected = Series([False, False, False])
+ tm.assert_series_equal(left < right, expected)
+
+ expected = Series([False, False, False])
+ tm.assert_series_equal(left > right, expected)
+
+ expected = Series([False, False, True])
+ tm.assert_series_equal(left >= right, expected)
+
+ expected = Series([False, False, True])
+ tm.assert_series_equal(left <= right, expected)
+
+ def test_comparison_invalid(self, box_with_array):
+ # GH#4968
+ # invalid date/int comparisons
+ xbox = box_with_array if box_with_array is not pd.Index else np.ndarray
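+ # comparison ops on an Index return a plain ndarray, so expected
+ # results for the Index case are boxed as ndarray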
+
+ ser = Series(range(5))
+ ser2 = Series(pd.date_range('20010101', periods=5))
+
+ ser = tm.box_expected(ser, box_with_array)
+ ser2 = tm.box_expected(ser2, box_with_array)
+
+ for (x, y) in [(ser, ser2), (ser2, ser)]:
+
+ result = x == y
+ expected = tm.box_expected([False] * 5, xbox)
+ tm.assert_equal(result, expected)
+
+ result = x != y
+ expected = tm.box_expected([True] * 5, xbox)
+ tm.assert_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ x >= y
+ with pytest.raises(TypeError):
+ x > y
+ with pytest.raises(TypeError):
+ x < y
+ with pytest.raises(TypeError):
+ x <= y
+
+ @pytest.mark.parametrize('data', [
+ [Timestamp('2011-01-01'), NaT, Timestamp('2011-01-03')],
+ [Timedelta('1 days'), NaT, Timedelta('3 days')],
+ [Period('2011-01', freq='M'), NaT, Period('2011-03', freq='M')]
+ ])
+ @pytest.mark.parametrize('dtype', [None, object])
+ def test_nat_comparisons_scalar(self, dtype, data, box_with_array):
+ if box_with_array is tm.to_array and dtype is object:
+ # don't bother testing ndarray comparison methods as this fails
+ # on older numpy versions (since they check object identity)
+ return
+
+ xbox = box_with_array if box_with_array is not pd.Index else np.ndarray
+
+ left = Series(data, dtype=dtype)
+ left = tm.box_expected(left, box_with_array)
+
+ expected = [False, False, False]
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(left == NaT, expected)
+ tm.assert_equal(NaT == left, expected)
+
+ expected = [True, True, True]
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(left != NaT, expected)
+ tm.assert_equal(NaT != left, expected)
+
+ expected = [False, False, False]
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(left < NaT, expected)
+ tm.assert_equal(NaT > left, expected)
+ tm.assert_equal(left <= NaT, expected)
+ tm.assert_equal(NaT >= left, expected)
+
+ tm.assert_equal(left > NaT, expected)
+ tm.assert_equal(NaT < left, expected)
+ tm.assert_equal(left >= NaT, expected)
+ tm.assert_equal(NaT <= left, expected)
+
+ def test_series_comparison_scalars(self):
+ series = Series(date_range('1/1/2000', periods=10))
+
+ val = datetime(2000, 1, 4)
+ result = series > val
+ expected = Series([x > val for x in series])
+ tm.assert_series_equal(result, expected)
+
+ val = series[5]
+ result = series > val
+ expected = Series([x > val for x in series])
+ tm.assert_series_equal(result, expected)
+
+ def test_dt64_ser_cmp_date_warning(self):
+ # https://github.com/pandas-dev/pandas/issues/21359
+ # Remove this test and enable the invalid test below
+ ser = pd.Series(pd.date_range('20010101', periods=10), name='dates')
+ date = ser.iloc[0].to_pydatetime().date()
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser == date
+ expected = pd.Series([True] + [False] * 9, name='dates')
+ tm.assert_series_equal(result, expected)
+ assert "Comparing Series of datetimes " in str(m[0].message)
+ assert "will not compare equal" in str(m[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser != date
+ tm.assert_series_equal(result, ~expected)
+ assert "will not compare equal" in str(m[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser <= date
+ tm.assert_series_equal(result, expected)
+ assert "a TypeError will be raised" in str(m[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser < date
+ tm.assert_series_equal(result, pd.Series([False] * 10, name='dates'))
+ assert "a TypeError will be raised" in str(m[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser >= date
+ tm.assert_series_equal(result, pd.Series([True] * 10, name='dates'))
+ assert "a TypeError will be raised" in str(m[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser > date
+ tm.assert_series_equal(result, pd.Series([False] + [True] * 9,
+ name='dates'))
+ assert "a TypeError will be raised" in str(m[0].message)
+
+ @pytest.mark.skip(reason="GH#21359")
+ def test_dt64ser_cmp_date_invalid(self, box_with_array):
+ # GH#19800 datetime.date comparison raises in order to match
+ # DatetimeIndex/Timestamp. This also matches the behavior
+ # of stdlib datetime.datetime
+
+ ser = pd.Series(pd.date_range('20010101', periods=10))  # .iloc below needs a Series
+ date = ser.iloc[0].to_pydatetime().date()
+
+ ser = tm.box_expected(ser, box_with_array)
+ assert not (ser == date).any()
+ assert (ser != date).all()
+ with pytest.raises(TypeError):
+ ser > date
+ with pytest.raises(TypeError):
+ ser < date
+ with pytest.raises(TypeError):
+ ser >= date
+ with pytest.raises(TypeError):
+ ser <= date
+
+ @pytest.mark.parametrize("left,right", [
+ ("lt", "gt"),
+ ("le", "ge"),
+ ("eq", "eq"),
+ ("ne", "ne"),
+ ])
+ def test_timestamp_compare_series(self, left, right):
+ # see gh-4982
+ # Make sure we can compare Timestamps on the right AND left hand side.
+ ser = pd.Series(pd.date_range("20010101", periods=10), name="dates")
+ s_nat = ser.copy(deep=True)
+
+ ser[0] = pd.Timestamp("nat")
+ ser[3] = pd.Timestamp("nat")
+
+ left_f = getattr(operator, left)
+ right_f = getattr(operator, right)
+
+ # No NaT
+ expected = left_f(ser, pd.Timestamp("20010109"))
+ result = right_f(pd.Timestamp("20010109"), ser)
+ tm.assert_series_equal(result, expected)
+
+ # NaT
+ expected = left_f(ser, pd.Timestamp("nat"))
+ result = right_f(pd.Timestamp("nat"), ser)
+ tm.assert_series_equal(result, expected)
+
+ # Compare to Timestamp with series containing NaT
+ expected = left_f(s_nat, pd.Timestamp("20010109"))
+ result = right_f(pd.Timestamp("20010109"), s_nat)
+ tm.assert_series_equal(result, expected)
+
+ # Compare to NaT with series containing NaT
+ expected = left_f(s_nat, pd.Timestamp("nat"))
+ result = right_f(pd.Timestamp("nat"), s_nat)
+ tm.assert_series_equal(result, expected)
+
+ def test_dt64arr_timestamp_equality(self, box_with_array):
+ # GH#11034
+ xbox = box_with_array if box_with_array is not pd.Index else np.ndarray
+
+ ser = pd.Series([pd.Timestamp('2000-01-29 01:59:00'), 'NaT'])
+ ser = tm.box_expected(ser, box_with_array)
+
+ result = ser != ser
+ expected = tm.box_expected([False, True], xbox)
+ tm.assert_equal(result, expected)
+
+ result = ser != ser[0]
+ expected = tm.box_expected([False, True], xbox)
+ tm.assert_equal(result, expected)
+
+ result = ser != ser[1]
+ expected = tm.box_expected([True, True], xbox)
+ tm.assert_equal(result, expected)
+
+ result = ser == ser
+ expected = tm.box_expected([True, False], xbox)
+ tm.assert_equal(result, expected)
+
+ result = ser == ser[0]
+ expected = tm.box_expected([True, False], xbox)
+ tm.assert_equal(result, expected)
+
+ result = ser == ser[1]
+ expected = tm.box_expected([False, False], xbox)
+ tm.assert_equal(result, expected)
+
+ @pytest.mark.parametrize('op', [operator.eq, operator.ne,
+ operator.gt, operator.ge,
+ operator.lt, operator.le])
+ def test_comparison_tzawareness_compat(self, op):
+ # GH#18162
+ dr = pd.date_range('2016-01-01', periods=6)
+ dz = dr.tz_localize('US/Pacific')
+
+ # Mixed tz-aware / tz-naive comparisons should raise TypeError
+ naive_series = Series(dr)
+ aware_series = Series(dz)
+ with pytest.raises(TypeError):
+ op(dz, naive_series)
+ with pytest.raises(TypeError):
+ op(dr, aware_series)
+
+ # TODO: implement _assert_tzawareness_compat for the reverse
+ # comparison with the Series on the left-hand side
+
+
+class TestDatetimeIndexComparisons(object):
+
+ # TODO: moved from tests.indexes.test_base; parametrize and de-duplicate
+ @pytest.mark.parametrize("op", [
+ operator.eq, operator.ne, operator.gt, operator.lt,
+ operator.ge, operator.le
+ ])
+ def test_comparators(self, op):
+ index = tm.makeDateIndex(100)
+ element = index[len(index) // 2]
+ element = _to_M8(element)
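+ # _to_M8 converts the Timestamp element to a np.datetime64 scalar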
+
+ arr = np.array(index)
+ arr_result = op(arr, element)
+ index_result = op(index, element)
+
+ assert isinstance(index_result, np.ndarray)
+ tm.assert_numpy_array_equal(arr_result, index_result)
+
+ @pytest.mark.parametrize('other', [datetime(2016, 1, 1),
+ Timestamp('2016-01-01'),
+ np.datetime64('2016-01-01')])
+ def test_dti_cmp_datetimelike(self, other, tz_naive_fixture):
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=2, tz=tz)
+ if tz is not None:
+ if isinstance(other, np.datetime64):
+ # no tzaware version available
+ return
+ other = localize_pydatetime(other, dti.tzinfo)
+
+ result = dti == other
+ expected = np.array([True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = dti > other
+ expected = np.array([False, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = dti >= other
+ expected = np.array([True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = dti < other
+ expected = np.array([False, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = dti <= other
+ expected = np.array([True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_dt64arr_cmp_non_datetime(self, tz_naive_fixture, box_with_array):
+ # GH#19301 by convention datetime.date is not considered comparable
+ # to Timestamp or DatetimeIndex. This may change in the future.
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=2, tz=tz)
+ dtarr = tm.box_expected(dti, box_with_array)
+
+ other = datetime(2016, 1, 1).date()
+ assert not (dtarr == other).any()
+ assert (dtarr != other).all()
+ with pytest.raises(TypeError):
+ dtarr < other
+ with pytest.raises(TypeError):
+ dtarr <= other
+ with pytest.raises(TypeError):
+ dtarr > other
+ with pytest.raises(TypeError):
+ dtarr >= other
+
+ @pytest.mark.parametrize('other', [None, np.nan, pd.NaT])
+ def test_dti_eq_null_scalar(self, other, tz_naive_fixture):
+ # GH#19301
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=2, tz=tz)
+ assert not (dti == other).any()
+
+ @pytest.mark.parametrize('other', [None, np.nan, pd.NaT])
+ def test_dti_ne_null_scalar(self, other, tz_naive_fixture):
+ # GH#19301
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=2, tz=tz)
+ assert (dti != other).all()
+
+ @pytest.mark.parametrize('other', [None, np.nan])
+ def test_dti_cmp_null_scalar_inequality(self, tz_naive_fixture, other,
+ box_with_array):
+ # GH#19301
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=2, tz=tz)
+ # FIXME: ValueError with transpose
+ dtarr = tm.box_expected(dti, box_with_array, transpose=False)
+
+ with pytest.raises(TypeError):
+ dtarr < other
+ with pytest.raises(TypeError):
+ dtarr <= other
+ with pytest.raises(TypeError):
+ dtarr > other
+ with pytest.raises(TypeError):
+ dtarr >= other
+
+ @pytest.mark.parametrize('dtype', [None, object])
+ def test_dti_cmp_nat(self, dtype, box_with_array):
+ if box_with_array is tm.to_array and dtype is object:
+ # don't bother testing ndarray comparison methods as this fails
+ # on older numpy versions (since they check object identity)
+ return
+
+ xbox = box_with_array if box_with_array is not pd.Index else np.ndarray
+
+ left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT,
+ pd.Timestamp('2011-01-03')])
+ right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')])
+
+ left = tm.box_expected(left, box_with_array)
+ right = tm.box_expected(right, box_with_array)
+
+ lhs, rhs = left, right
+ if dtype is object:
+ lhs, rhs = left.astype(object), right.astype(object)
+
+ result = rhs == lhs
+ expected = np.array([False, False, True])
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(result, expected)
+
+ result = lhs != rhs
+ expected = np.array([True, True, False])
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(result, expected)
+
+ expected = np.array([False, False, False])
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(lhs == pd.NaT, expected)
+ tm.assert_equal(pd.NaT == rhs, expected)
+
+ expected = np.array([True, True, True])
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(lhs != pd.NaT, expected)
+ tm.assert_equal(pd.NaT != lhs, expected)
+
+ expected = np.array([False, False, False])
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(lhs < pd.NaT, expected)
+ tm.assert_equal(pd.NaT > lhs, expected)
+
+ def test_dti_cmp_nat_behaves_like_float_cmp_nan(self):
+ fidx1 = pd.Index([1.0, np.nan, 3.0, np.nan, 5.0, 7.0])
+ fidx2 = pd.Index([2.0, 3.0, np.nan, np.nan, 6.0, 7.0])
+
+ didx1 = pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT,
+ '2014-05-01', '2014-07-01'])
+ didx2 = pd.DatetimeIndex(['2014-02-01', '2014-03-01', pd.NaT, pd.NaT,
+ '2014-06-01', '2014-07-01'])
+ darr = np.array([np_datetime64_compat('2014-02-01 00:00Z'),
+ np_datetime64_compat('2014-03-01 00:00Z'),
+ np_datetime64_compat('nat'), np.datetime64('nat'),
+ np_datetime64_compat('2014-06-01 00:00Z'),
+ np_datetime64_compat('2014-07-01 00:00Z')])
+
+ cases = [(fidx1, fidx2), (didx1, didx2), (didx1, darr)]
+
+ # Check pd.NaT is handled the same as np.nan
+ with tm.assert_produces_warning(None):
+ for idx1, idx2 in cases:
+
+ result = idx1 < idx2
+ expected = np.array([True, False, False, False, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx2 > idx1
+ expected = np.array([True, False, False, False, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 <= idx2
+ expected = np.array([True, False, False, False, True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx2 >= idx1
+ expected = np.array([True, False, False, False, True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 == idx2
+ expected = np.array([False, False, False, False, False, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 != idx2
+ expected = np.array([True, True, True, True, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ with tm.assert_produces_warning(None):
+ for idx1, val in [(fidx1, np.nan), (didx1, pd.NaT)]:
+ result = idx1 < val
+ expected = np.array([False, False, False, False, False, False])
+ tm.assert_numpy_array_equal(result, expected)
+ result = idx1 > val
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 <= val
+ tm.assert_numpy_array_equal(result, expected)
+ result = idx1 >= val
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 == val
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 != val
+ expected = np.array([True, True, True, True, True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ # Check pd.NaT is handled the same as np.nan
+ with tm.assert_produces_warning(None):
+ for idx1, val in [(fidx1, 3), (didx1, datetime(2014, 3, 1))]:
+ result = idx1 < val
+ expected = np.array([True, False, False, False, False, False])
+ tm.assert_numpy_array_equal(result, expected)
+ result = idx1 > val
+ expected = np.array([False, False, False, False, True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 <= val
+ expected = np.array([True, False, True, False, False, False])
+ tm.assert_numpy_array_equal(result, expected)
+ result = idx1 >= val
+ expected = np.array([False, False, True, False, True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 == val
+ expected = np.array([False, False, True, False, False, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 != val
+ expected = np.array([True, True, False, True, True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('op', [operator.eq, operator.ne,
+ operator.gt, operator.ge,
+ operator.lt, operator.le])
+ def test_comparison_tzawareness_compat(self, op, box_with_array):
+ # GH#18162
+ dr = pd.date_range('2016-01-01', periods=6)
+ dz = dr.tz_localize('US/Pacific')
+
+ # FIXME: ValueError with transpose
+ dr = tm.box_expected(dr, box_with_array, transpose=False)
+ dz = tm.box_expected(dz, box_with_array, transpose=False)
+
+ with pytest.raises(TypeError):
+ op(dr, dz)
+ if box_with_array is not pd.DataFrame:
+ # DataFrame op is invalid until transpose bug is fixed
+ with pytest.raises(TypeError):
+ op(dr, list(dz))
+ with pytest.raises(TypeError):
+ op(dr, np.array(list(dz), dtype=object))
+
+ with pytest.raises(TypeError):
+ op(dz, dr)
+ if box_with_array is not pd.DataFrame:
+ # DataFrame op is invalid until transpose bug is fixed
+ with pytest.raises(TypeError):
+ op(dz, list(dr))
+ with pytest.raises(TypeError):
+ op(dz, np.array(list(dr), dtype=object))
+
+ # Check that aware-aware and naive-naive comparisons do not raise
+ assert_all(dr == dr)
+ assert_all(dz == dz)
+ if box_with_array is not pd.DataFrame:
+ # DataFrame doesn't align the lists correctly unless we transpose,
+ # which we cannot do at the moment
+ assert (dr == list(dr)).all()
+ assert (dz == list(dz)).all()
+
+ # Check comparisons against scalar Timestamps
+ ts = pd.Timestamp('2000-03-14 01:59')
+ ts_tz = pd.Timestamp('2000-03-14 01:59', tz='Europe/Amsterdam')
+
+ assert_all(dr > ts)
+ with pytest.raises(TypeError):
+ op(dr, ts_tz)
+
+ assert_all(dz > ts_tz)
+ with pytest.raises(TypeError):
+ op(dz, ts)
+
+ # GH#12601: Check comparison against Timestamps and DatetimeIndex
+ with pytest.raises(TypeError):
+ op(ts, dz)
+
+ @pytest.mark.parametrize('op', [operator.eq, operator.ne,
+ operator.gt, operator.ge,
+ operator.lt, operator.le])
+ @pytest.mark.parametrize('other', [datetime(2016, 1, 1),
+ Timestamp('2016-01-01'),
+ np.datetime64('2016-01-01')])
+ def test_scalar_comparison_tzawareness(self, op, other, tz_aware_fixture,
+ box_with_array):
+ tz = tz_aware_fixture
+ dti = pd.date_range('2016-01-01', periods=2, tz=tz)
+
+ # FIXME: ValueError with transpose
+ dtarr = tm.box_expected(dti, box_with_array, transpose=False)
+
+ with pytest.raises(TypeError):
+ op(dtarr, other)
+ with pytest.raises(TypeError):
+ op(other, dtarr)
+
+ @pytest.mark.parametrize('op', [operator.eq, operator.ne,
+ operator.gt, operator.ge,
+ operator.lt, operator.le])
+ def test_nat_comparison_tzawareness(self, op):
+ # GH#19276
+ # tzaware DatetimeIndex should not raise when compared to NaT
+ dti = pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT,
+ '2014-05-01', '2014-07-01'])
+ expected = np.array([op == operator.ne] * len(dti))
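+ # comparisons with NaT are False for every op except !=, which is True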
+ result = op(dti, pd.NaT)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = op(dti.tz_localize('US/Pacific'), pd.NaT)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_dti_cmp_str(self, tz_naive_fixture):
+ # GH#22074
+ # regardless of tz, we expect these comparisons to be valid
+ tz = tz_naive_fixture
+ rng = date_range('1/1/2000', periods=10, tz=tz)
+ other = '1/1/2000'
+
+ result = rng == other
+ expected = np.array([True] + [False] * 9)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = rng != other
+ expected = np.array([False] + [True] * 9)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = rng < other
+ expected = np.array([False] * 10)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = rng <= other
+ expected = np.array([True] + [False] * 9)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = rng > other
+ expected = np.array([False] + [True] * 9)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = rng >= other
+ expected = np.array([True] * 10)
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('other', ['foo', 99, 4.0,
+ object(), timedelta(days=2)])
+ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture,
+ box_with_array):
+ # GH#22074
+ tz = tz_naive_fixture
+ xbox = box_with_array if box_with_array is not pd.Index else np.ndarray
+
+ rng = date_range('1/1/2000', periods=10, tz=tz)
+ # FIXME: ValueError with transpose
+ rng = tm.box_expected(rng, box_with_array, transpose=False)
+
+ result = rng == other
+ expected = np.array([False] * 10)
+ expected = tm.box_expected(expected, xbox, transpose=False)
+ tm.assert_equal(result, expected)
+
+ result = rng != other
+ expected = np.array([True] * 10)
+ expected = tm.box_expected(expected, xbox, transpose=False)
+ tm.assert_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ rng < other
+ with pytest.raises(TypeError):
+ rng <= other
+ with pytest.raises(TypeError):
+ rng > other
+ with pytest.raises(TypeError):
+ rng >= other
+
+ def test_dti_cmp_list(self):
+ rng = date_range('1/1/2000', periods=10)
+
+ result = rng == list(rng)
+ expected = rng == rng
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('other', [
+ pd.timedelta_range('1D', periods=10),
+ pd.timedelta_range('1D', periods=10).to_series(),
+ pd.timedelta_range('1D', periods=10).asi8.view('m8[ns]')
+ ], ids=lambda x: type(x).__name__)
+ def test_dti_cmp_tdi_tzawareness(self, other):
+ # GH#22074
+ # regression test that we _don't_ call _assert_tzawareness_compat
+ # when comparing against TimedeltaIndex
+ dti = date_range('2000-01-01', periods=10, tz='Asia/Tokyo')
+
+ result = dti == other
+ expected = np.array([False] * 10)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = dti != other
+ expected = np.array([True] * 10)
+ tm.assert_numpy_array_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ dti < other
+ with pytest.raises(TypeError):
+ dti <= other
+ with pytest.raises(TypeError):
+ dti > other
+ with pytest.raises(TypeError):
+ dti >= other
+
+ def test_dti_cmp_object_dtype(self):
+ # GH#22074
+ dti = date_range('2000-01-01', periods=10, tz='Asia/Tokyo')
+
+ other = dti.astype('O')
+
+ result = dti == other
+ expected = np.array([True] * 10)
+ tm.assert_numpy_array_equal(result, expected)
+
+ other = dti.tz_localize(None)
+ with pytest.raises(TypeError):
+ # tzawareness failure
+ dti != other
+
+ other = np.array(list(dti[:5]) + [Timedelta(days=1)] * 5)
+ result = dti == other
+ expected = np.array([True] * 5 + [False] * 5)
+ tm.assert_numpy_array_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ dti >= other
+
+
+# ------------------------------------------------------------------
+# Arithmetic
+
+class TestDatetime64Arithmetic(object):
+ # This class is intended for "finished" tests that are fully parametrized
+ # over DataFrame/Series/Index/DatetimeArray
+
+ # -------------------------------------------------------------
+ # Addition/Subtraction of timedelta-like
+
+ def test_dt64arr_add_timedeltalike_scalar(self, tz_naive_fixture,
+ two_hours, box_with_array):
+ # GH#22005, GH#22163 check DataFrame doesn't raise TypeError
+ tz = tz_naive_fixture
+
+ rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz)
+ expected = pd.date_range('2000-01-01 02:00',
+ '2000-02-01 02:00', tz=tz)
+
+ # FIXME: calling with transpose=True raises ValueError
+ rng = tm.box_expected(rng, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ result = rng + two_hours
+ tm.assert_equal(result, expected)
+
+ def test_dt64arr_iadd_timedeltalike_scalar(self, tz_naive_fixture,
+ two_hours, box_with_array):
+ tz = tz_naive_fixture
+
+ rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz)
+ expected = pd.date_range('2000-01-01 02:00',
+ '2000-02-01 02:00', tz=tz)
+
+ # FIXME: calling with transpose=True raises ValueError
+ rng = tm.box_expected(rng, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ rng += two_hours
+ tm.assert_equal(rng, expected)
+
+ def test_dt64arr_sub_timedeltalike_scalar(self, tz_naive_fixture,
+ two_hours, box_with_array):
+ tz = tz_naive_fixture
+
+ rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz)
+ expected = pd.date_range('1999-12-31 22:00',
+ '2000-01-31 22:00', tz=tz)
+
+ # FIXME: calling with transpose=True raises ValueError
+ rng = tm.box_expected(rng, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ result = rng - two_hours
+ tm.assert_equal(result, expected)
+
+ def test_dt64arr_isub_timedeltalike_scalar(self, tz_naive_fixture,
+ two_hours, box_with_array):
+ tz = tz_naive_fixture
+
+ rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz)
+ expected = pd.date_range('1999-12-31 22:00',
+ '2000-01-31 22:00', tz=tz)
+
+ # FIXME: calling with transpose=True raises ValueError
+ rng = tm.box_expected(rng, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ rng -= two_hours
+ tm.assert_equal(rng, expected)
+
+ def test_dt64arr_add_td64_scalar(self, box_with_array):
+ # scalar timedelta/np.timedelta64 objects should operate
+ # correctly with datetime64 data
+ ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')])
+
+ expected = Series([Timestamp('20130101 9:01:01'),
+ Timestamp('20130101 9:02:01')])
+
+ dtarr = tm.box_expected(ser, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = dtarr + np.timedelta64(1, 's')
+ tm.assert_equal(result, expected)
+ result = np.timedelta64(1, 's') + dtarr
+ tm.assert_equal(result, expected)
+
+ expected = Series([Timestamp('20130101 9:01:00.005'),
+ Timestamp('20130101 9:02:00.005')])
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = dtarr + np.timedelta64(5, 'ms')
+ tm.assert_equal(result, expected)
+ result = np.timedelta64(5, 'ms') + dtarr
+ tm.assert_equal(result, expected)
+
+ def test_dt64arr_add_sub_td64_nat(self, box_with_array, tz_naive_fixture):
+ # GH#23320 special handling for timedelta64("NaT")
+ tz = tz_naive_fixture
+
+ dti = pd.date_range("1994-04-01", periods=9, tz=tz, freq="QS")
+ other = np.timedelta64("NaT")
+ expected = pd.DatetimeIndex(["NaT"] * 9, tz=tz)
+
+ # FIXME: fails with transpose=True due to tz-aware DataFrame
+ # transpose bug
+ obj = tm.box_expected(dti, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ result = obj + other
+ tm.assert_equal(result, expected)
+ result = other + obj
+ tm.assert_equal(result, expected)
+ result = obj - other
+ tm.assert_equal(result, expected)
+ with pytest.raises(TypeError):
+ other - obj
+
+ def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture,
+ box_with_array):
+ if box_with_array is pd.DataFrame:
+ pytest.xfail("FIXME: ValueError with transpose; "
+ "alignment error without")
+
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=3, tz=tz)
+ tdi = pd.TimedeltaIndex(['-1 Day', '-1 Day', '-1 Day'])
+ tdarr = tdi.values
+
+ expected = pd.date_range('2015-12-31', periods=3, tz=tz)
+
+ dtarr = tm.box_expected(dti, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = dtarr + tdarr
+ tm.assert_equal(result, expected)
+ result = tdarr + dtarr
+ tm.assert_equal(result, expected)
+
+ expected = pd.date_range('2016-01-02', periods=3, tz=tz)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = dtarr - tdarr
+ tm.assert_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ tdarr - dtarr
+
+ # -----------------------------------------------------------------
+ # Subtraction of datetime-like scalars
+
+ @pytest.mark.parametrize('ts', [
+ pd.Timestamp('2013-01-01'),
+ pd.Timestamp('2013-01-01').to_pydatetime(),
+ pd.Timestamp('2013-01-01').to_datetime64()])
+ def test_dt64arr_sub_dtscalar(self, box_with_array, ts):
+ # GH#8554, GH#22163 DataFrame op should _not_ return dt64 dtype
+ idx = pd.date_range('2013-01-01', periods=3)
+ idx = tm.box_expected(idx, box_with_array)
+
+ expected = pd.TimedeltaIndex(['0 Days', '1 Day', '2 Days'])
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = idx - ts
+ tm.assert_equal(result, expected)
+
+ def test_dt64arr_sub_datetime64_not_ns(self, box_with_array):
+ # GH#7996, GH#22163 ensure non-nano datetime64 is converted to nano
+ # for DataFrame operation
+ dt64 = np.datetime64('2013-01-01')
+ assert dt64.dtype == 'datetime64[D]'
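+ # i.e. day resolution; the ops below must convert it to nanosecond
+ # resolution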
+
+ dti = pd.date_range('20130101', periods=3)
+ dtarr = tm.box_expected(dti, box_with_array)
+
+ expected = pd.TimedeltaIndex(['0 Days', '1 Day', '2 Days'])
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = dtarr - dt64
+ tm.assert_equal(result, expected)
+
+ result = dt64 - dtarr
+ tm.assert_equal(result, -expected)
+
+ def test_dt64arr_sub_timestamp(self, box_with_array):
+ ser = pd.date_range('2014-03-17', periods=2, freq='D',
+ tz='US/Eastern')
+ ts = ser[0]
+
+ # FIXME: transpose raises ValueError
+ ser = tm.box_expected(ser, box_with_array, transpose=False)
+
+ delta_series = pd.Series([np.timedelta64(0, 'D'),
+ np.timedelta64(1, 'D')])
+ expected = tm.box_expected(delta_series, box_with_array,
+ transpose=False)
+
+ tm.assert_equal(ser - ts, expected)
+ tm.assert_equal(ts - ser, -expected)
+
+ def test_dt64arr_sub_NaT(self, box_with_array):
+ # GH#18808
+ dti = pd.DatetimeIndex([pd.NaT, pd.Timestamp('19900315')])
+ ser = tm.box_expected(dti, box_with_array, transpose=False)
+
+ result = ser - pd.NaT
+ expected = pd.Series([pd.NaT, pd.NaT], dtype='timedelta64[ns]')
+ # FIXME: raises ValueError with transpose
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+ tm.assert_equal(result, expected)
+
+ dti_tz = dti.tz_localize('Asia/Tokyo')
+ ser_tz = tm.box_expected(dti_tz, box_with_array, transpose=False)
+
+ result = ser_tz - pd.NaT
+ expected = pd.Series([pd.NaT, pd.NaT], dtype='timedelta64[ns]')
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+ tm.assert_equal(result, expected)
+
+ # -------------------------------------------------------------
+ # Subtraction of datetime-like array-like
+
+ def test_dt64arr_naive_sub_dt64ndarray(self, box_with_array):
+ dti = pd.date_range('2016-01-01', periods=3, tz=None)
+ dt64vals = dti.values
+
+ dtarr = tm.box_expected(dti, box_with_array)
+
+ expected = dtarr - dtarr
+ result = dtarr - dt64vals
+ tm.assert_equal(result, expected)
+ result = dt64vals - dtarr
+ tm.assert_equal(result, expected)
+
+ def test_dt64arr_aware_sub_dt64ndarray_raises(self, tz_aware_fixture,
+ box_with_array):
+ if box_with_array is pd.DataFrame:
+ pytest.xfail("FIXME: ValueError with transpose; "
+ "alignment error without")
+
+ tz = tz_aware_fixture
+ dti = pd.date_range('2016-01-01', periods=3, tz=tz)
+ dt64vals = dti.values
+
+ dtarr = tm.box_expected(dti, box_with_array)
+
+ with pytest.raises(TypeError):
+ dtarr - dt64vals
+ with pytest.raises(TypeError):
+ dt64vals - dtarr
+
+ # -------------------------------------------------------------
+ # Addition of datetime-like others (invalid)
+
+ def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture,
+ box_with_array):
+ if box_with_array is pd.DataFrame:
+ pytest.xfail("FIXME: ValueError with transpose; "
+ "alignment error without")
+
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=3, tz=tz)
+ dt64vals = dti.values
+
+ dtarr = tm.box_expected(dti, box_with_array)
+
+ with pytest.raises(TypeError):
+ dtarr + dt64vals
+ with pytest.raises(TypeError):
+ dt64vals + dtarr
+
+ def test_dt64arr_add_timestamp_raises(self, box_with_array):
+ # GH#22163 ensure DataFrame doesn't cast Timestamp to i8
+ idx = DatetimeIndex(['2011-01-01', '2011-01-02'])
+ idx = tm.box_expected(idx, box_with_array)
+ msg = "cannot add"
+ with pytest.raises(TypeError, match=msg):
+ idx + Timestamp('2011-01-01')
+ with pytest.raises(TypeError, match=msg):
+ Timestamp('2011-01-01') + idx
+
+ # -------------------------------------------------------------
+ # Other Invalid Addition/Subtraction
+
+ @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])])
+ def test_dt64arr_add_sub_float(self, other, box_with_array):
+ dti = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D')
+ dtarr = tm.box_expected(dti, box_with_array)
+ with pytest.raises(TypeError):
+ dtarr + other
+ with pytest.raises(TypeError):
+ other + dtarr
+ with pytest.raises(TypeError):
+ dtarr - other
+ with pytest.raises(TypeError):
+ other - dtarr
+
+ @pytest.mark.parametrize('pi_freq', ['D', 'W', 'Q', 'H'])
+ @pytest.mark.parametrize('dti_freq', [None, 'D'])
+ def test_dt64arr_add_sub_parr(self, dti_freq, pi_freq,
+ box_with_array, box_with_array2):
+ # GH#20049 subtracting PeriodIndex should raise TypeError
+ dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=dti_freq)
+ pi = dti.to_period(pi_freq)
+
+ dtarr = tm.box_expected(dti, box_with_array)
+ parr = tm.box_expected(pi, box_with_array2)
+
+ with pytest.raises(TypeError):
+ dtarr + parr
+ with pytest.raises(TypeError):
+ parr + dtarr
+ with pytest.raises(TypeError):
+ dtarr - parr
+ with pytest.raises(TypeError):
+ parr - dtarr
+
+ @pytest.mark.parametrize('dti_freq', [None, 'D'])
+ def test_dt64arr_add_sub_period_scalar(self, dti_freq, box_with_array):
+ # GH#13078
+ # not supported, check TypeError
+ per = pd.Period('2011-01-01', freq='D')
+
+ idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=dti_freq)
+ dtarr = tm.box_expected(idx, box_with_array)
+
+ with pytest.raises(TypeError):
+ dtarr + per
+ with pytest.raises(TypeError):
+ per + dtarr
+ with pytest.raises(TypeError):
+ dtarr - per
+ with pytest.raises(TypeError):
+ per - dtarr
+
+
+class TestDatetime64DateOffsetArithmetic(object):
+
+ # -------------------------------------------------------------
+ # Tick DateOffsets
+
+ # TODO: parametrize over timezone?
+ def test_dt64arr_series_add_tick_DateOffset(self, box_with_array):
+ # GH#4532
+ # operate with pd.offsets
+ ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')])
+ expected = Series([Timestamp('20130101 9:01:05'),
+ Timestamp('20130101 9:02:05')])
+
+ ser = tm.box_expected(ser, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = ser + pd.offsets.Second(5)
+ tm.assert_equal(result, expected)
+
+ result2 = pd.offsets.Second(5) + ser
+ tm.assert_equal(result2, expected)
+
+ def test_dt64arr_series_sub_tick_DateOffset(self, box_with_array):
+ # GH#4532
+ # operate with pd.offsets
+ ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')])
+ expected = Series([Timestamp('20130101 9:00:55'),
+ Timestamp('20130101 9:01:55')])
+
+ ser = tm.box_expected(ser, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = ser - pd.offsets.Second(5)
+ tm.assert_equal(result, expected)
+
+ result2 = -pd.offsets.Second(5) + ser
+ tm.assert_equal(result2, expected)
+
+ with pytest.raises(TypeError):
+ pd.offsets.Second(5) - ser
+
+ @pytest.mark.parametrize('cls_name', ['Day', 'Hour', 'Minute', 'Second',
+ 'Milli', 'Micro', 'Nano'])
+ def test_dt64arr_add_sub_tick_DateOffset_smoke(self, cls_name,
+ box_with_array):
+ # GH#4532
+ # smoke tests for valid DateOffsets
+ ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')])
+ ser = tm.box_expected(ser, box_with_array)
+
+ offset_cls = getattr(pd.offsets, cls_name)
+ ser + offset_cls(5)
+ offset_cls(5) + ser
+ ser - offset_cls(5)
+
+ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array):
+ # GH#21610, GH#22163 ensure DataFrame doesn't return object-dtype
+ tz = tz_aware_fixture
+ if tz == 'US/Pacific':
+ dates = date_range('2012-11-01', periods=3, tz=tz)
+ offset = dates + pd.offsets.Hour(5)
+ assert dates[0] + pd.offsets.Hour(5) == offset[0]
+
+ dates = date_range('2010-11-01 00:00',
+ periods=3, tz=tz, freq='H')
+ expected = DatetimeIndex(['2010-11-01 05:00', '2010-11-01 06:00',
+ '2010-11-01 07:00'], freq='H', tz=tz)
+
+ # FIXME: these raise ValueError with transpose=True
+ dates = tm.box_expected(dates, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ # TODO: parametrize over the scalar being added? radd? sub?
+ offset = dates + pd.offsets.Hour(5)
+ tm.assert_equal(offset, expected)
+ offset = dates + np.timedelta64(5, 'h')
+ tm.assert_equal(offset, expected)
+ offset = dates + timedelta(hours=5)
+ tm.assert_equal(offset, expected)
+
+ # -------------------------------------------------------------
+ # RelativeDelta DateOffsets
+
+ def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array):
+ # GH#10699
+ vec = DatetimeIndex([Timestamp('2000-01-05 00:15:00'),
+ Timestamp('2000-01-31 00:23:00'),
+ Timestamp('2000-01-01'),
+ Timestamp('2000-03-31'),
+ Timestamp('2000-02-29'),
+ Timestamp('2000-12-31'),
+ Timestamp('2000-05-15'),
+ Timestamp('2001-06-15')])
+ vec = tm.box_expected(vec, box_with_array)
+ vec_items = vec.squeeze() if box_with_array is pd.DataFrame else vec
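+ # a DataFrame box holds the values in a single column; squeeze back to
+ # a Series so the pointwise list comprehensions below work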
+
+ # DateOffset relativedelta fastpath
+ relative_kwargs = [('years', 2), ('months', 5), ('days', 3),
+ ('hours', 5), ('minutes', 10), ('seconds', 2),
+ ('microseconds', 5)]
+ for i, kwd in enumerate(relative_kwargs):
+ off = pd.DateOffset(**dict([kwd]))
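+ # offset built from a single keyword, e.g. DateOffset(years=2)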
+
+ expected = DatetimeIndex([x + off for x in vec_items])
+ expected = tm.box_expected(expected, box_with_array)
+ tm.assert_equal(expected, vec + off)
+
+ expected = DatetimeIndex([x - off for x in vec_items])
+ expected = tm.box_expected(expected, box_with_array)
+ tm.assert_equal(expected, vec - off)
+
+ off = pd.DateOffset(**dict(relative_kwargs[:i + 1]))
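+ # cumulative offset combining the first i + 1 keywords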
+
+ expected = DatetimeIndex([x + off for x in vec_items])
+ expected = tm.box_expected(expected, box_with_array)
+ tm.assert_equal(expected, vec + off)
+
+ expected = DatetimeIndex([x - off for x in vec_items])
+ expected = tm.box_expected(expected, box_with_array)
+ tm.assert_equal(expected, vec - off)
+
+ with pytest.raises(TypeError):
+ off - vec
+
+ # -------------------------------------------------------------
+ # Non-Tick, Non-RelativeDelta DateOffsets
+
+ # TODO: redundant with test_dt64arr_add_sub_DateOffset? that includes
+ # tz-aware cases which this does not
+ @pytest.mark.parametrize('cls_and_kwargs', [
+ 'YearBegin', ('YearBegin', {'month': 5}),
+ 'YearEnd', ('YearEnd', {'month': 5}),
+ 'MonthBegin', 'MonthEnd',
+ 'SemiMonthEnd', 'SemiMonthBegin',
+ 'Week', ('Week', {'weekday': 3}),
+ 'Week', ('Week', {'weekday': 6}),
+ 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin',
+ 'CustomBusinessDay', 'CDay', 'CBMonthEnd',
+ 'CBMonthBegin', 'BMonthBegin', 'BMonthEnd',
+ 'BusinessHour', 'BYearBegin', 'BYearEnd',
+ 'BQuarterBegin', ('LastWeekOfMonth', {'weekday': 2}),
+ ('FY5253Quarter', {'qtr_with_extra_week': 1,
+ 'startingMonth': 1,
+ 'weekday': 2,
+ 'variation': 'nearest'}),
+ ('FY5253', {'weekday': 0, 'startingMonth': 2, 'variation': 'nearest'}),
+ ('WeekOfMonth', {'weekday': 2, 'week': 2}),
+ 'Easter', ('DateOffset', {'day': 4}),
+ ('DateOffset', {'month': 5})])
+ @pytest.mark.parametrize('normalize', [True, False])
+ @pytest.mark.parametrize('n', [0, 5])
+ def test_dt64arr_add_sub_DateOffsets(self, box_with_array,
+ n, normalize, cls_and_kwargs):
+ # GH#10699
+ # assert vectorized operation matches pointwise operations
+
+ if isinstance(cls_and_kwargs, tuple):
+ # If the cls_and_kwargs param is a tuple, the second entry holds
+ # kwargs for the offset constructor
+ cls_name, kwargs = cls_and_kwargs
+ else:
+ cls_name = cls_and_kwargs
+ kwargs = {}
+
+ if n == 0 and cls_name in ['WeekOfMonth', 'LastWeekOfMonth',
+ 'FY5253Quarter', 'FY5253']:
+ # passing n = 0 is invalid for these offset classes
+ return
+
+ vec = DatetimeIndex([Timestamp('2000-01-05 00:15:00'),
+ Timestamp('2000-01-31 00:23:00'),
+ Timestamp('2000-01-01'),
+ Timestamp('2000-03-31'),
+ Timestamp('2000-02-29'),
+ Timestamp('2000-12-31'),
+ Timestamp('2000-05-15'),
+ Timestamp('2001-06-15')])
+ vec = tm.box_expected(vec, box_with_array)
+ vec_items = vec.squeeze() if box_with_array is pd.DataFrame else vec
+
+ offset_cls = getattr(pd.offsets, cls_name)
+
+ with warnings.catch_warnings(record=True):
+ # pandas.errors.PerformanceWarning: Non-vectorized DateOffset being
+ # applied to Series or DatetimeIndex
+ # we aren't testing that here, so ignore.
+ warnings.simplefilter("ignore", PerformanceWarning)
+
+ offset = offset_cls(n, normalize=normalize, **kwargs)
+
+ expected = DatetimeIndex([x + offset for x in vec_items])
+ expected = tm.box_expected(expected, box_with_array)
+ tm.assert_equal(expected, vec + offset)
+
+ expected = DatetimeIndex([x - offset for x in vec_items])
+ expected = tm.box_expected(expected, box_with_array)
+ tm.assert_equal(expected, vec - offset)
+
+ expected = DatetimeIndex([offset + x for x in vec_items])
+ expected = tm.box_expected(expected, box_with_array)
+ tm.assert_equal(expected, offset + vec)
+
+ with pytest.raises(TypeError):
+ offset - vec
+
+ def test_dt64arr_add_sub_DateOffset(self, box_with_array):
+ # GH#10699
+ s = date_range('2000-01-01', '2000-01-31', name='a')
+ s = tm.box_expected(s, box_with_array)
+ result = s + pd.DateOffset(years=1)
+ result2 = pd.DateOffset(years=1) + s
+ exp = date_range('2001-01-01', '2001-01-31', name='a')
+ exp = tm.box_expected(exp, box_with_array)
+ tm.assert_equal(result, exp)
+ tm.assert_equal(result2, exp)
+
+ result = s - pd.DateOffset(years=1)
+ exp = date_range('1999-01-01', '1999-01-31', name='a')
+ exp = tm.box_expected(exp, box_with_array)
+ tm.assert_equal(result, exp)
+
+ s = DatetimeIndex([Timestamp('2000-01-15 00:15:00', tz='US/Central'),
+ Timestamp('2000-02-15', tz='US/Central')], name='a')
+ # FIXME: ValueError with tzaware DataFrame transpose
+ s = tm.box_expected(s, box_with_array, transpose=False)
+ result = s + pd.offsets.Day()
+ result2 = pd.offsets.Day() + s
+ exp = DatetimeIndex([Timestamp('2000-01-16 00:15:00', tz='US/Central'),
+ Timestamp('2000-02-16', tz='US/Central')],
+ name='a')
+ exp = tm.box_expected(exp, box_with_array, transpose=False)
+ tm.assert_equal(result, exp)
+ tm.assert_equal(result2, exp)
+
+ s = DatetimeIndex([Timestamp('2000-01-15 00:15:00', tz='US/Central'),
+ Timestamp('2000-02-15', tz='US/Central')], name='a')
+ s = tm.box_expected(s, box_with_array, transpose=False)
+ result = s + pd.offsets.MonthEnd()
+ result2 = pd.offsets.MonthEnd() + s
+ exp = DatetimeIndex([Timestamp('2000-01-31 00:15:00', tz='US/Central'),
+ Timestamp('2000-02-29', tz='US/Central')],
+ name='a')
+ exp = tm.box_expected(exp, box_with_array, transpose=False)
+ tm.assert_equal(result, exp)
+ tm.assert_equal(result2, exp)
+
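+    # Hedged note on the semantics exercised above (example values ours):
+    # pd.DateOffset follows dateutil relativedelta rules, so calendar
+    # arithmetic clamps to a valid date instead of spilling into the
+    # next month.
+    def _dateoffset_clamp_sketch(self):
+        # Feb 29 plus one year clamps to Feb 28 of the non-leap year
+        assert (Timestamp('2000-02-29') + pd.DateOffset(years=1) ==
+                Timestamp('2001-02-28'))
+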
+ # TODO: __sub__, __rsub__
+ def test_dt64arr_add_mixed_offset_array(self, box_with_array):
+ # GH#10699
+ # array of offsets
+ s = DatetimeIndex([Timestamp('2000-1-1'), Timestamp('2000-2-1')])
+ s = tm.box_expected(s, box_with_array)
+
+ warn = None if box_with_array is pd.DataFrame else PerformanceWarning
+ with tm.assert_produces_warning(warn,
+ clear=[pd.core.arrays.datetimelike]):
+ other = pd.Index([pd.offsets.DateOffset(years=1),
+ pd.offsets.MonthEnd()])
+ other = tm.box_expected(other, box_with_array)
+ result = s + other
+ exp = DatetimeIndex([Timestamp('2001-1-1'),
+ Timestamp('2000-2-29')])
+ exp = tm.box_expected(exp, box_with_array)
+ tm.assert_equal(result, exp)
+
+ # same offset
+ other = pd.Index([pd.offsets.DateOffset(years=1),
+ pd.offsets.DateOffset(years=1)])
+ other = tm.box_expected(other, box_with_array)
+ result = s + other
+ exp = DatetimeIndex([Timestamp('2001-1-1'),
+ Timestamp('2001-2-1')])
+ exp = tm.box_expected(exp, box_with_array)
+ tm.assert_equal(result, exp)
+
+ # TODO: overlap with test_dt64arr_add_mixed_offset_array?
+ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture,
+ box_with_array):
+ # GH#18849
+ if box_with_array is pd.DataFrame:
+ pytest.xfail("FIXME: ValueError with transpose; "
+ "alignment error without")
+
+ tz = tz_naive_fixture
+ dti = pd.date_range('2017-01-01', periods=2, tz=tz)
+ dtarr = tm.box_expected(dti, box_with_array)
+
+ other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)])
+
+ warn = None if box_with_array is pd.DataFrame else PerformanceWarning
+ with tm.assert_produces_warning(warn,
+ clear=[pd.core.arrays.datetimelike]):
+ res = dtarr + other
+ expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))],
+ name=dti.name, freq='infer')
+ expected = tm.box_expected(expected, box_with_array)
+ tm.assert_equal(res, expected)
+
+ with tm.assert_produces_warning(warn,
+ clear=[pd.core.arrays.datetimelike]):
+ res2 = other + dtarr
+ tm.assert_equal(res2, expected)
+
+ with tm.assert_produces_warning(warn,
+ clear=[pd.core.arrays.datetimelike]):
+ res = dtarr - other
+ expected = DatetimeIndex([dti[n] - other[n] for n in range(len(dti))],
+ name=dti.name, freq='infer')
+ expected = tm.box_expected(expected, box_with_array)
+ tm.assert_equal(res, expected)
+
+
+class TestDatetime64OverflowHandling(object):
+ # TODO: box + de-duplicate
+
+ def test_dt64_overflow_masking(self, box_with_array):
+ # GH#25317
+ left = Series([Timestamp('1969-12-31')])
+ right = Series([NaT])
+
+ left = tm.box_expected(left, box_with_array)
+ right = tm.box_expected(right, box_with_array)
+
+ expected = TimedeltaIndex([NaT])
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = left - right
+ tm.assert_equal(result, expected)
+
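+    # Hedged sketch of the same masking behavior (values ours): NaT is
+    # stored as the minimum int64, so pandas must mask NaT slots before its
+    # int64 overflow check; pre- and post-epoch stamps both yield NaT.
+    def _nat_masking_sketch(self):
+        for ts in [Timestamp('1969-12-31'), Timestamp('1970-01-02')]:
+            result = Series([ts]) - Series([NaT])
+            assert result.dtype == 'timedelta64[ns]'
+            assert result.isna().all()
+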
+ def test_dt64_series_arith_overflow(self):
+ # GH#12534, fixed by GH#19024
+ dt = pd.Timestamp('1700-01-31')
+ td = pd.Timedelta('20000 Days')
+ dti = pd.date_range('1949-09-30', freq='100Y', periods=4)
+ ser = pd.Series(dti)
+ with pytest.raises(OverflowError):
+ ser - dt
+ with pytest.raises(OverflowError):
+ dt - ser
+ with pytest.raises(OverflowError):
+ ser + td
+ with pytest.raises(OverflowError):
+ td + ser
+
+ ser.iloc[-1] = pd.NaT
+ expected = pd.Series(['2004-10-03', '2104-10-04', '2204-10-04', 'NaT'],
+ dtype='datetime64[ns]')
+ res = ser + td
+ tm.assert_series_equal(res, expected)
+ res = td + ser
+ tm.assert_series_equal(res, expected)
+
+ ser.iloc[1:] = pd.NaT
+ expected = pd.Series(['91279 Days', 'NaT', 'NaT', 'NaT'],
+ dtype='timedelta64[ns]')
+ res = ser - dt
+ tm.assert_series_equal(res, expected)
+ res = dt - ser
+ tm.assert_series_equal(res, -expected)
+
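+    # Minimal sketch of the bound behind these OverflowErrors (values ours):
+    # datetime64[ns] tops out at Timestamp.max (2262-04-11), so adding one
+    # more day cannot be represented.
+    def _overflow_bound_sketch(self):
+        ser = Series([Timestamp.max])
+        with pytest.raises(OverflowError):
+            ser + Timedelta('1 day')
+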
+ def test_datetimeindex_sub_timestamp_overflow(self):
+ dtimax = pd.to_datetime(['now', pd.Timestamp.max])
+ dtimin = pd.to_datetime(['now', pd.Timestamp.min])
+
+ tsneg = Timestamp('1950-01-01')
+ ts_neg_variants = [tsneg,
+ tsneg.to_pydatetime(),
+ tsneg.to_datetime64().astype('datetime64[ns]'),
+ tsneg.to_datetime64().astype('datetime64[D]')]
+
+ tspos = Timestamp('1980-01-01')
+ ts_pos_variants = [tspos,
+ tspos.to_pydatetime(),
+ tspos.to_datetime64().astype('datetime64[ns]'),
+ tspos.to_datetime64().astype('datetime64[D]')]
+
+ for variant in ts_neg_variants:
+ with pytest.raises(OverflowError):
+ dtimax - variant
+
+ expected = pd.Timestamp.max.value - tspos.value
+ for variant in ts_pos_variants:
+ res = dtimax - variant
+ assert res[1].value == expected
+
+ expected = pd.Timestamp.min.value - tsneg.value
+ for variant in ts_neg_variants:
+ res = dtimin - variant
+ assert res[1].value == expected
+
+ for variant in ts_pos_variants:
+ with pytest.raises(OverflowError):
+ dtimin - variant
+
+ def test_datetimeindex_sub_datetimeindex_overflow(self):
+ # GH#22492, GH#22508
+ dtimax = pd.to_datetime(['now', pd.Timestamp.max])
+ dtimin = pd.to_datetime(['now', pd.Timestamp.min])
+
+ ts_neg = pd.to_datetime(['1950-01-01', '1950-01-01'])
+ ts_pos = pd.to_datetime(['1980-01-01', '1980-01-01'])
+
+ # General tests
+ expected = pd.Timestamp.max.value - ts_pos[1].value
+ result = dtimax - ts_pos
+ assert result[1].value == expected
+
+ expected = pd.Timestamp.min.value - ts_neg[1].value
+ result = dtimin - ts_neg
+ assert result[1].value == expected
+
+ with pytest.raises(OverflowError):
+ dtimax - ts_neg
+
+ with pytest.raises(OverflowError):
+ dtimin - ts_pos
+
+ # Edge cases
+ tmin = pd.to_datetime([pd.Timestamp.min])
+ t1 = tmin + pd.Timedelta.max + pd.Timedelta('1us')
+ with pytest.raises(OverflowError):
+ t1 - tmin
+
+ tmax = pd.to_datetime([pd.Timestamp.max])
+ t2 = tmax + pd.Timedelta.min - pd.Timedelta('1us')
+ with pytest.raises(OverflowError):
+ tmax - t2
+
+
+class TestTimestampSeriesArithmetic(object):
+
+ def test_empty_series_add_sub(self):
+ # GH#13844
+ a = Series(dtype='M8[ns]')
+ b = Series(dtype='m8[ns]')
+ tm.assert_series_equal(a, a + b)
+ tm.assert_series_equal(a, a - b)
+ tm.assert_series_equal(a, b + a)
+ with pytest.raises(TypeError):
+ b - a
+
+ def test_operators_datetimelike(self):
+
+ # ## timedelta64 ###
+ td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+ td1.iloc[2] = np.nan
+
+ # ## datetime64 ###
+ dt1 = Series([pd.Timestamp('20111230'), pd.Timestamp('20120101'),
+ pd.Timestamp('20120103')])
+ dt1.iloc[2] = np.nan
+ dt2 = Series([pd.Timestamp('20111231'), pd.Timestamp('20120102'),
+ pd.Timestamp('20120104')])
+ dt1 - dt2
+ dt2 - dt1
+
+        # ## datetime64 with timedelta ###
+ dt1 + td1
+ td1 + dt1
+ dt1 - td1
+ # TODO: Decide if this ought to work.
+ # td1 - dt1
+
+        # ## timedelta with datetime64 ###
+ td1 + dt1
+ dt1 + td1
+
+ def test_dt64ser_sub_datetime_dtype(self):
+ ts = Timestamp(datetime(1993, 1, 7, 13, 30, 00))
+ dt = datetime(1993, 6, 22, 13, 30)
+ ser = Series([ts])
+ result = pd.to_timedelta(np.abs(ser - dt))
+ assert result.dtype == 'timedelta64[ns]'
+
+ # -------------------------------------------------------------
+ # TODO: This next block of tests came from tests.series.test_operators,
+ # needs to be de-duplicated and parametrized over `box` classes
+
+ def test_operators_datetimelike_invalid(self, all_arithmetic_operators):
+        # these all raise TypeError
+ op_str = all_arithmetic_operators
+
+ def check(get_ser, test_ser):
+
+ # check that we are getting a TypeError
+ # with 'operate' (from core/ops.py) for the ops that are not
+ # defined
+ op = getattr(get_ser, op_str, None)
+ # Previously, _validate_for_numeric_binop in core/indexes/base.py
+ # did this for us.
+ with pytest.raises(TypeError,
+ match='operate|[cC]annot|unsupported operand'):
+ op(test_ser)
+
+ # ## timedelta64 ###
+ td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+ td1.iloc[2] = np.nan
+
+ # ## datetime64 ###
+ dt1 = Series([Timestamp('20111230'), Timestamp('20120101'),
+ Timestamp('20120103')])
+ dt1.iloc[2] = np.nan
+ dt2 = Series([Timestamp('20111231'), Timestamp('20120102'),
+ Timestamp('20120104')])
+ if op_str not in ['__sub__', '__rsub__']:
+ check(dt1, dt2)
+
+        # ## datetime64 with timedelta ###
+ # TODO(jreback) __rsub__ should raise?
+ if op_str not in ['__add__', '__radd__', '__sub__']:
+ check(dt1, td1)
+
+ # 8260, 10763
+ # datetime64 with tz
+ tz = 'US/Eastern'
+ dt1 = Series(date_range('2000-01-01 09:00:00', periods=5,
+ tz=tz), name='foo')
+ dt2 = dt1.copy()
+ dt2.iloc[2] = np.nan
+ td1 = Series(pd.timedelta_range('1 days 1 min', periods=5, freq='H'))
+ td2 = td1.copy()
+ td2.iloc[1] = np.nan
+
+ if op_str not in ['__add__', '__radd__', '__sub__', '__rsub__']:
+ check(dt2, td2)
+
+ def test_sub_single_tz(self):
+ # GH#12290
+ s1 = Series([pd.Timestamp('2016-02-10', tz='America/Sao_Paulo')])
+ s2 = Series([pd.Timestamp('2016-02-08', tz='America/Sao_Paulo')])
+ result = s1 - s2
+ expected = Series([Timedelta('2days')])
+ tm.assert_series_equal(result, expected)
+ result = s2 - s1
+ expected = Series([Timedelta('-2days')])
+ tm.assert_series_equal(result, expected)
+
+ def test_dt64tz_series_sub_dtitz(self):
+ # GH#19071 subtracting tzaware DatetimeIndex from tzaware Series
+ # (with same tz) raises, fixed by #19024
+ dti = pd.date_range('1999-09-30', periods=10, tz='US/Pacific')
+ ser = pd.Series(dti)
+ expected = pd.Series(pd.TimedeltaIndex(['0days'] * 10))
+
+ res = dti - ser
+ tm.assert_series_equal(res, expected)
+ res = ser - dti
+ tm.assert_series_equal(res, expected)
+
+ def test_sub_datetime_compat(self):
+ # see GH#14088
+ s = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), pd.NaT])
+ dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc)
+ exp = Series([Timedelta('1 days'), pd.NaT])
+ tm.assert_series_equal(s - dt, exp)
+ tm.assert_series_equal(s - Timestamp(dt), exp)
+
+ def test_dt64_series_add_mixed_tick_DateOffset(self):
+ # GH#4532
+ # operate with pd.offsets
+ s = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')])
+
+ result = s + pd.offsets.Milli(5)
+ result2 = pd.offsets.Milli(5) + s
+ expected = Series([Timestamp('20130101 9:01:00.005'),
+ Timestamp('20130101 9:02:00.005')])
+ tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result2, expected)
+
+ result = s + pd.offsets.Minute(5) + pd.offsets.Milli(5)
+ expected = Series([Timestamp('20130101 9:06:00.005'),
+ Timestamp('20130101 9:07:00.005')])
+ tm.assert_series_equal(result, expected)
+
+ def test_datetime64_ops_nat(self):
+ # GH#11349
+ datetime_series = Series([NaT, Timestamp('19900315')])
+ nat_series_dtype_timestamp = Series([NaT, NaT], dtype='datetime64[ns]')
+ single_nat_dtype_datetime = Series([NaT], dtype='datetime64[ns]')
+
+ # subtraction
+ tm.assert_series_equal(-NaT + datetime_series,
+ nat_series_dtype_timestamp)
+ with pytest.raises(TypeError):
+ -single_nat_dtype_datetime + datetime_series
+
+ tm.assert_series_equal(-NaT + nat_series_dtype_timestamp,
+ nat_series_dtype_timestamp)
+ with pytest.raises(TypeError):
+ -single_nat_dtype_datetime + nat_series_dtype_timestamp
+
+ # addition
+ tm.assert_series_equal(nat_series_dtype_timestamp + NaT,
+ nat_series_dtype_timestamp)
+ tm.assert_series_equal(NaT + nat_series_dtype_timestamp,
+ nat_series_dtype_timestamp)
+
+ # -------------------------------------------------------------
+ # Invalid Operations
+ # TODO: this block also needs to be de-duplicated and parametrized
+
+ @pytest.mark.parametrize('dt64_series', [
+ Series([Timestamp('19900315'), Timestamp('19900315')]),
+ Series([pd.NaT, Timestamp('19900315')]),
+ Series([pd.NaT, pd.NaT], dtype='datetime64[ns]')])
+ @pytest.mark.parametrize('one', [1, 1.0, np.array(1)])
+ def test_dt64_mul_div_numeric_invalid(self, one, dt64_series):
+ # multiplication
+ with pytest.raises(TypeError):
+ dt64_series * one
+ with pytest.raises(TypeError):
+ one * dt64_series
+
+ # division
+ with pytest.raises(TypeError):
+ dt64_series / one
+ with pytest.raises(TypeError):
+ one / dt64_series
+
+ @pytest.mark.parametrize('op', ['__add__', '__radd__',
+ '__sub__', '__rsub__'])
+ @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo'])
+ def test_dt64_series_add_intlike(self, tz, op):
+ # GH#19123
+ dti = pd.DatetimeIndex(['2016-01-02', '2016-02-03', 'NaT'], tz=tz)
+ ser = Series(dti)
+
+ other = Series([20, 30, 40], dtype='uint8')
+
+ method = getattr(ser, op)
+ with pytest.raises(TypeError):
+ method(1)
+ with pytest.raises(TypeError):
+ method(other)
+ with pytest.raises(TypeError):
+ method(other.values)
+ with pytest.raises(TypeError):
+ method(pd.Index(other))
+
+ # -------------------------------------------------------------
+ # Timezone-Centric Tests
+
+ def test_operators_datetimelike_with_timezones(self):
+ tz = 'US/Eastern'
+ dt1 = Series(date_range('2000-01-01 09:00:00', periods=5,
+ tz=tz), name='foo')
+ dt2 = dt1.copy()
+ dt2.iloc[2] = np.nan
+
+ td1 = Series(pd.timedelta_range('1 days 1 min', periods=5, freq='H'))
+ td2 = td1.copy()
+ td2.iloc[1] = np.nan
+
+ result = dt1 + td1[0]
+ exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz)
+ tm.assert_series_equal(result, exp)
+
+ result = dt2 + td2[0]
+ exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz)
+ tm.assert_series_equal(result, exp)
+
+ # odd numpy behavior with scalar timedeltas
+ result = td1[0] + dt1
+ exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz)
+ tm.assert_series_equal(result, exp)
+
+ result = td2[0] + dt2
+ exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz)
+ tm.assert_series_equal(result, exp)
+
+ result = dt1 - td1[0]
+ exp = (dt1.dt.tz_localize(None) - td1[0]).dt.tz_localize(tz)
+ tm.assert_series_equal(result, exp)
+ with pytest.raises(TypeError):
+ td1[0] - dt1
+
+ result = dt2 - td2[0]
+ exp = (dt2.dt.tz_localize(None) - td2[0]).dt.tz_localize(tz)
+ tm.assert_series_equal(result, exp)
+ with pytest.raises(TypeError):
+ td2[0] - dt2
+
+ result = dt1 + td1
+ exp = (dt1.dt.tz_localize(None) + td1).dt.tz_localize(tz)
+ tm.assert_series_equal(result, exp)
+
+ result = dt2 + td2
+ exp = (dt2.dt.tz_localize(None) + td2).dt.tz_localize(tz)
+ tm.assert_series_equal(result, exp)
+
+ result = dt1 - td1
+ exp = (dt1.dt.tz_localize(None) - td1).dt.tz_localize(tz)
+ tm.assert_series_equal(result, exp)
+
+ result = dt2 - td2
+ exp = (dt2.dt.tz_localize(None) - td2).dt.tz_localize(tz)
+ tm.assert_series_equal(result, exp)
+
+ with pytest.raises(TypeError):
+ td1 - dt1
+ with pytest.raises(TypeError):
+ td2 - dt2
+
+
+class TestDatetimeIndexArithmetic(object):
+
+ # -------------------------------------------------------------
+ # Binary operations DatetimeIndex and int
+
+ def test_dti_add_int(self, tz_naive_fixture, one):
+ # Variants of `one` for #19012
+ tz = tz_naive_fixture
+ rng = pd.date_range('2000-01-01 09:00', freq='H',
+ periods=10, tz=tz)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = rng + one
+ expected = pd.date_range('2000-01-01 10:00', freq='H',
+ periods=10, tz=tz)
+ tm.assert_index_equal(result, expected)
+
+ def test_dti_iadd_int(self, tz_naive_fixture, one):
+ tz = tz_naive_fixture
+ rng = pd.date_range('2000-01-01 09:00', freq='H',
+ periods=10, tz=tz)
+ expected = pd.date_range('2000-01-01 10:00', freq='H',
+ periods=10, tz=tz)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ rng += one
+ tm.assert_index_equal(rng, expected)
+
+ def test_dti_sub_int(self, tz_naive_fixture, one):
+ tz = tz_naive_fixture
+ rng = pd.date_range('2000-01-01 09:00', freq='H',
+ periods=10, tz=tz)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = rng - one
+ expected = pd.date_range('2000-01-01 08:00', freq='H',
+ periods=10, tz=tz)
+ tm.assert_index_equal(result, expected)
+
+ def test_dti_isub_int(self, tz_naive_fixture, one):
+ tz = tz_naive_fixture
+ rng = pd.date_range('2000-01-01 09:00', freq='H',
+ periods=10, tz=tz)
+ expected = pd.date_range('2000-01-01 08:00', freq='H',
+ periods=10, tz=tz)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ rng -= one
+ tm.assert_index_equal(rng, expected)
+
+ # -------------------------------------------------------------
+ # __add__/__sub__ with integer arrays
+
+ @pytest.mark.parametrize('freq', ['H', 'D'])
+ @pytest.mark.parametrize('int_holder', [np.array, pd.Index])
+ def test_dti_add_intarray_tick(self, int_holder, freq):
+ # GH#19959
+ dti = pd.date_range('2016-01-01', periods=2, freq=freq)
+ other = int_holder([4, -1])
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ expected = DatetimeIndex([dti[n] + other[n]
+ for n in range(len(dti))])
+ result = dti + other
+ tm.assert_index_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = other + dti
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('freq', ['W', 'M', 'MS', 'Q'])
+ @pytest.mark.parametrize('int_holder', [np.array, pd.Index])
+ def test_dti_add_intarray_non_tick(self, int_holder, freq):
+ # GH#19959
+ dti = pd.date_range('2016-01-01', periods=2, freq=freq)
+ other = int_holder([4, -1])
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ expected = DatetimeIndex([dti[n] + other[n]
+ for n in range(len(dti))])
+
+ # tm.assert_produces_warning does not handle cases where we expect
+ # two warnings, in this case PerformanceWarning and FutureWarning.
+ # Until that is fixed, we don't catch either
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ result = dti + other
+ tm.assert_index_equal(result, expected)
+
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ result = other + dti
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('int_holder', [np.array, pd.Index])
+ def test_dti_add_intarray_no_freq(self, int_holder):
+ # GH#19959
+ dti = pd.DatetimeIndex(['2016-01-01', 'NaT', '2017-04-05 06:07:08'])
+ other = int_holder([9, 4, -1])
+ with pytest.raises(NullFrequencyError):
+ dti + other
+ with pytest.raises(NullFrequencyError):
+ other + dti
+ with pytest.raises(NullFrequencyError):
+ dti - other
+ with pytest.raises(TypeError):
+ other - dti
+
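+    # Contrast sketch (values ours): with a set freq the integer addend
+    # means n * freq, so the deprecated integer addition is well-defined;
+    # without a freq it is ambiguous, hence NullFrequencyError above.
+    def _int_addition_needs_freq_sketch(self):
+        dti = pd.date_range('2016-01-01', periods=2, freq='D')
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = dti + 1
+        expected = pd.date_range('2016-01-02', periods=2, freq='D')
+        tm.assert_index_equal(result, expected)
+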
+ # -------------------------------------------------------------
+ # Binary operations DatetimeIndex and TimedeltaIndex/array
+
+ def test_dti_add_tdi(self, tz_naive_fixture):
+ # GH#17558
+ tz = tz_naive_fixture
+ dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10)
+ tdi = pd.timedelta_range('0 days', periods=10)
+ expected = pd.date_range('2017-01-01', periods=10, tz=tz)
+
+        # add with TimedeltaIndex
+ result = dti + tdi
+ tm.assert_index_equal(result, expected)
+
+ result = tdi + dti
+ tm.assert_index_equal(result, expected)
+
+ # add with timedelta64 array
+ result = dti + tdi.values
+ tm.assert_index_equal(result, expected)
+
+ result = tdi.values + dti
+ tm.assert_index_equal(result, expected)
+
+ def test_dti_iadd_tdi(self, tz_naive_fixture):
+ # GH#17558
+ tz = tz_naive_fixture
+ dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10)
+ tdi = pd.timedelta_range('0 days', periods=10)
+ expected = pd.date_range('2017-01-01', periods=10, tz=tz)
+
+        # iadd with TimedeltaIndex
+ result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10)
+ result += tdi
+ tm.assert_index_equal(result, expected)
+
+ result = pd.timedelta_range('0 days', periods=10)
+ result += dti
+ tm.assert_index_equal(result, expected)
+
+ # iadd with timedelta64 array
+ result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10)
+ result += tdi.values
+ tm.assert_index_equal(result, expected)
+
+ result = pd.timedelta_range('0 days', periods=10)
+ result += dti
+ tm.assert_index_equal(result, expected)
+
+ def test_dti_sub_tdi(self, tz_naive_fixture):
+ # GH#17558
+ tz = tz_naive_fixture
+ dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10)
+ tdi = pd.timedelta_range('0 days', periods=10)
+ expected = pd.date_range('2017-01-01', periods=10, tz=tz, freq='-1D')
+
+ # sub with TimedeltaIndex
+ result = dti - tdi
+ tm.assert_index_equal(result, expected)
+
+ msg = 'cannot subtract .*TimedeltaArray'
+ with pytest.raises(TypeError, match=msg):
+ tdi - dti
+
+ # sub with timedelta64 array
+ result = dti - tdi.values
+ tm.assert_index_equal(result, expected)
+
+ msg = 'cannot subtract DatetimeArray from'
+ with pytest.raises(TypeError, match=msg):
+ tdi.values - dti
+
+ def test_dti_isub_tdi(self, tz_naive_fixture):
+ # GH#17558
+ tz = tz_naive_fixture
+ dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10)
+ tdi = pd.timedelta_range('0 days', periods=10)
+ expected = pd.date_range('2017-01-01', periods=10, tz=tz, freq='-1D')
+
+ # isub with TimedeltaIndex
+ result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10)
+ result -= tdi
+ tm.assert_index_equal(result, expected)
+
+ msg = 'cannot subtract .* from a TimedeltaArray'
+ with pytest.raises(TypeError, match=msg):
+ tdi -= dti
+
+ # isub with timedelta64 array
+ result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10)
+ result -= tdi.values
+ tm.assert_index_equal(result, expected)
+
+ msg = '|'.join(['cannot perform __neg__ with this index type:',
+ 'ufunc subtract cannot use operands with types',
+ 'cannot subtract DatetimeArray from'])
+ with pytest.raises(TypeError, match=msg):
+ tdi.values -= dti
+
+ # -------------------------------------------------------------
+ # Binary Operations DatetimeIndex and datetime-like
+    # TODO: A couple of other tests belong in this section. Move them in
+    # a PR where there isn't already a giant diff.
+
+ @pytest.mark.parametrize('addend', [
+ datetime(2011, 1, 1),
+ DatetimeIndex(['2011-01-01', '2011-01-02']),
+ DatetimeIndex(['2011-01-01', '2011-01-02']).tz_localize('US/Eastern'),
+ np.datetime64('2011-01-01'),
+ Timestamp('2011-01-01')
+ ], ids=lambda x: type(x).__name__)
+ @pytest.mark.parametrize('tz', [None, 'US/Eastern'])
+ def test_add_datetimelike_and_dti(self, addend, tz):
+ # GH#9631
+ dti = DatetimeIndex(['2011-01-01', '2011-01-02']).tz_localize(tz)
+ msg = ('cannot add DatetimeArray and {0}'
+ .format(type(addend).__name__)).replace('DatetimeIndex',
+ 'DatetimeArray')
+ with pytest.raises(TypeError, match=msg):
+ dti + addend
+ with pytest.raises(TypeError, match=msg):
+ addend + dti
+
+ # -------------------------------------------------------------
+
+ def test_sub_dti_dti(self):
+ # previously performed setop (deprecated in 0.16.0), now changed to
+        # return subtraction -> TimedeltaIndex (GH ...)
+
+ dti = date_range('20130101', periods=3)
+ dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern')
+ dti_tz2 = date_range('20130101', periods=3).tz_localize('UTC')
+ expected = TimedeltaIndex([0, 0, 0])
+
+ result = dti - dti
+ tm.assert_index_equal(result, expected)
+
+ result = dti_tz - dti_tz
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ dti_tz - dti
+
+ with pytest.raises(TypeError):
+ dti - dti_tz
+
+ with pytest.raises(TypeError):
+ dti_tz - dti_tz2
+
+ # isub
+ dti -= dti
+ tm.assert_index_equal(dti, expected)
+
+ # different length raises ValueError
+ dti1 = date_range('20130101', periods=3)
+ dti2 = date_range('20130101', periods=4)
+ with pytest.raises(ValueError):
+ dti1 - dti2
+
+ # NaN propagation
+ dti1 = DatetimeIndex(['2012-01-01', np.nan, '2012-01-03'])
+ dti2 = DatetimeIndex(['2012-01-02', '2012-01-03', np.nan])
+ expected = TimedeltaIndex(['1 days', np.nan, np.nan])
+ result = dti2 - dti1
+ tm.assert_index_equal(result, expected)
+
+ # -------------------------------------------------------------------
+ # TODO: Most of this block is moved from series or frame tests, needs
+ # cleanup, box-parametrization, and de-duplication
+
+ @pytest.mark.parametrize('op', [operator.add, operator.sub])
+ def test_timedelta64_equal_timedelta_supported_ops(self, op):
+ ser = Series([Timestamp('20130301'),
+ Timestamp('20130228 23:00:00'),
+ Timestamp('20130228 22:00:00'),
+ Timestamp('20130228 21:00:00')])
+
+ intervals = ['D', 'h', 'm', 's', 'us']
+
+ # TODO: unused
+ # npy16_mappings = {'D': 24 * 60 * 60 * 1000000,
+ # 'h': 60 * 60 * 1000000,
+ # 'm': 60 * 1000000,
+ # 's': 1000000,
+ # 'us': 1}
+
+ def timedelta64(*args):
+ return sum(starmap(np.timedelta64, zip(args, intervals)))
+
+ for d, h, m, s, us in product(*([range(2)] * 5)):
+ nptd = timedelta64(d, h, m, s, us)
+ pytd = timedelta(days=d, hours=h, minutes=m, seconds=s,
+ microseconds=us)
+ lhs = op(ser, nptd)
+ rhs = op(ser, pytd)
+
+ tm.assert_series_equal(lhs, rhs)
+
+ def test_ops_nat_mixed_datetime64_timedelta64(self):
+ # GH#11349
+ timedelta_series = Series([NaT, Timedelta('1s')])
+ datetime_series = Series([NaT, Timestamp('19900315')])
+ nat_series_dtype_timedelta = Series([NaT, NaT],
+ dtype='timedelta64[ns]')
+ nat_series_dtype_timestamp = Series([NaT, NaT], dtype='datetime64[ns]')
+ single_nat_dtype_datetime = Series([NaT], dtype='datetime64[ns]')
+ single_nat_dtype_timedelta = Series([NaT], dtype='timedelta64[ns]')
+
+ # subtraction
+ tm.assert_series_equal(datetime_series - single_nat_dtype_datetime,
+ nat_series_dtype_timedelta)
+
+ tm.assert_series_equal(datetime_series - single_nat_dtype_timedelta,
+ nat_series_dtype_timestamp)
+ tm.assert_series_equal(-single_nat_dtype_timedelta + datetime_series,
+ nat_series_dtype_timestamp)
+
+        # without a Series wrapping the NaT, it is ambiguous whether it is
+        # a datetime64 or timedelta64; pandas defaults to interpreting it
+        # as timedelta64
+ tm.assert_series_equal(nat_series_dtype_timestamp -
+ single_nat_dtype_datetime,
+ nat_series_dtype_timedelta)
+
+ tm.assert_series_equal(nat_series_dtype_timestamp -
+ single_nat_dtype_timedelta,
+ nat_series_dtype_timestamp)
+ tm.assert_series_equal(-single_nat_dtype_timedelta +
+ nat_series_dtype_timestamp,
+ nat_series_dtype_timestamp)
+
+ with pytest.raises(TypeError):
+ timedelta_series - single_nat_dtype_datetime
+
+ # addition
+ tm.assert_series_equal(nat_series_dtype_timestamp +
+ single_nat_dtype_timedelta,
+ nat_series_dtype_timestamp)
+ tm.assert_series_equal(single_nat_dtype_timedelta +
+ nat_series_dtype_timestamp,
+ nat_series_dtype_timestamp)
+
+ tm.assert_series_equal(nat_series_dtype_timedelta +
+ single_nat_dtype_datetime,
+ nat_series_dtype_timestamp)
+ tm.assert_series_equal(single_nat_dtype_datetime +
+ nat_series_dtype_timedelta,
+ nat_series_dtype_timestamp)
+
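+    # Hedged restatement of the wrapped cases above (helper ours): giving
+    # NaT a dtype via Series removes the ambiguity, and the result dtype
+    # follows from treating NaT as that dtype.
+    def _wrapped_nat_dtype_sketch(self):
+        ser = Series([Timestamp('1990-03-15')])
+        # datetime64-dtype NaT subtracts like a datetime -> timedelta64
+        result = ser - Series([NaT], dtype='datetime64[ns]')
+        assert result.dtype == 'timedelta64[ns]'
+        # timedelta64-dtype NaT subtracts like a timedelta -> datetime64
+        result = ser - Series([NaT], dtype='timedelta64[ns]')
+        assert result.dtype == 'datetime64[ns]'
+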
+ def test_ufunc_coercions(self):
+ idx = date_range('2011-01-01', periods=3, freq='2D', name='x')
+
+ delta = np.timedelta64(1, 'D')
+ for result in [idx + delta, np.add(idx, delta)]:
+ assert isinstance(result, DatetimeIndex)
+ exp = date_range('2011-01-02', periods=3, freq='2D', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == '2D'
+
+ for result in [idx - delta, np.subtract(idx, delta)]:
+ assert isinstance(result, DatetimeIndex)
+ exp = date_range('2010-12-31', periods=3, freq='2D', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == '2D'
+
+ delta = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D'),
+ np.timedelta64(3, 'D')])
+ for result in [idx + delta, np.add(idx, delta)]:
+ assert isinstance(result, DatetimeIndex)
+ exp = DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-08'],
+ freq='3D', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == '3D'
+
+ for result in [idx - delta, np.subtract(idx, delta)]:
+ assert isinstance(result, DatetimeIndex)
+ exp = DatetimeIndex(['2010-12-31', '2011-01-01', '2011-01-02'],
+ freq='D', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == 'D'
+
+ @pytest.mark.parametrize('names', [('foo', None, None),
+ ('baz', 'bar', None),
+ ('bar', 'bar', 'bar')])
+ @pytest.mark.parametrize('tz', [None, 'America/Chicago'])
+ def test_dti_add_series(self, tz, names):
+ # GH#13905
+ index = DatetimeIndex(['2016-06-28 05:30', '2016-06-28 05:31'],
+ tz=tz, name=names[0])
+ ser = Series([Timedelta(seconds=5)] * 2,
+ index=index, name=names[1])
+ expected = Series(index + Timedelta(seconds=5),
+ index=index, name=names[2])
+
+ # passing name arg isn't enough when names[2] is None
+ expected.name = names[2]
+ assert expected.dtype == index.dtype
+ result = ser + index
+ tm.assert_series_equal(result, expected)
+ result2 = index + ser
+ tm.assert_series_equal(result2, expected)
+
+ expected = index + Timedelta(seconds=5)
+ result3 = ser.values + index
+ tm.assert_index_equal(result3, expected)
+ result4 = index + ser.values
+ tm.assert_index_equal(result4, expected)
+
+ @pytest.mark.parametrize('names', [(None, None, None),
+ ('foo', 'bar', None),
+ ('foo', 'foo', 'foo')])
+ def test_dti_add_offset_index(self, tz_naive_fixture, names):
+ # GH#18849, GH#19744
+ tz = tz_naive_fixture
+ dti = pd.date_range('2017-01-01', periods=2, tz=tz, name=names[0])
+ other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)],
+ name=names[1])
+
+ with tm.assert_produces_warning(PerformanceWarning,
+ clear=[pd.core.arrays.datetimelike]):
+ res = dti + other
+ expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))],
+ name=names[2], freq='infer')
+ tm.assert_index_equal(res, expected)
+
+ with tm.assert_produces_warning(PerformanceWarning,
+ clear=[pd.core.arrays.datetimelike]):
+ res2 = other + dti
+ tm.assert_index_equal(res2, expected)
+
+ @pytest.mark.parametrize('names', [(None, None, None),
+ ('foo', 'bar', None),
+ ('foo', 'foo', 'foo')])
+ def test_dti_sub_offset_index(self, tz_naive_fixture, names):
+ # GH#18824, GH#19744
+ tz = tz_naive_fixture
+ dti = pd.date_range('2017-01-01', periods=2, tz=tz, name=names[0])
+ other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)],
+ name=names[1])
+
+ with tm.assert_produces_warning(PerformanceWarning,
+ clear=[pd.core.arrays.datetimelike]):
+ res = dti - other
+ expected = DatetimeIndex([dti[n] - other[n] for n in range(len(dti))],
+ name=names[2], freq='infer')
+ tm.assert_index_equal(res, expected)
+
+ @pytest.mark.parametrize('names', [(None, None, None),
+ ('foo', 'bar', None),
+ ('foo', 'foo', 'foo')])
+ def test_dti_with_offset_series(self, tz_naive_fixture, names):
+ # GH#18849
+ tz = tz_naive_fixture
+ dti = pd.date_range('2017-01-01', periods=2, tz=tz, name=names[0])
+ other = Series([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)],
+ name=names[1])
+
+ expected_add = Series([dti[n] + other[n] for n in range(len(dti))],
+ name=names[2])
+
+ with tm.assert_produces_warning(PerformanceWarning,
+ clear=[pd.core.arrays.datetimelike]):
+ res = dti + other
+ tm.assert_series_equal(res, expected_add)
+
+ with tm.assert_produces_warning(PerformanceWarning,
+ clear=[pd.core.arrays.datetimelike]):
+ res2 = other + dti
+ tm.assert_series_equal(res2, expected_add)
+
+ expected_sub = Series([dti[n] - other[n] for n in range(len(dti))],
+ name=names[2])
+
+ with tm.assert_produces_warning(PerformanceWarning,
+ clear=[pd.core.arrays.datetimelike]):
+ res3 = dti - other
+ tm.assert_series_equal(res3, expected_sub)
+
+
[email protected]('years', [-1, 0, 1])
[email protected]('months', [-2, 0, 2])
+def test_shift_months(years, months):
+ dti = DatetimeIndex([Timestamp('2000-01-05 00:15:00'),
+ Timestamp('2000-01-31 00:23:00'),
+ Timestamp('2000-01-01'),
+ Timestamp('2000-02-29'),
+ Timestamp('2000-12-31')])
+ actual = DatetimeIndex(shift_months(dti.asi8, years * 12 + months))
+
+ raw = [x + pd.offsets.DateOffset(years=years, months=months)
+ for x in dti]
+ expected = DatetimeIndex(raw)
+ tm.assert_index_equal(actual, expected)
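+
+
+# Companion sketch pinning one concrete case of the parametrized test above
+# (values ours): shift_months operates on the int64 nanosecond view and
+# clamps to month end, matching DateOffset month arithmetic.
+def _shift_months_clamp_sketch():
+    dti = DatetimeIndex([Timestamp('2000-01-31')])
+    shifted = DatetimeIndex(shift_months(dti.asi8, 1))
+    assert shifted[0] == Timestamp('2000-02-29')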
diff --git a/contrib/python/pandas/py2/pandas/tests/arithmetic/test_numeric.py b/contrib/python/pandas/py2/pandas/tests/arithmetic/test_numeric.py
new file mode 100644
index 00000000000..da1b3f1da53
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arithmetic/test_numeric.py
@@ -0,0 +1,1076 @@
+# -*- coding: utf-8 -*-
+# Arithmetic tests for DataFrame/Series/Index/Array classes that should
+# behave identically.
+# Specifically for numeric dtypes
+from decimal import Decimal
+from itertools import combinations
+import operator
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY3, Iterable
+
+import pandas as pd
+from pandas import Index, Series, Timedelta, TimedeltaIndex
+from pandas.core import ops
+import pandas.util.testing as tm
+
+# ------------------------------------------------------------------
+# Comparisons
+
+
+class TestNumericComparisons(object):
+ def test_operator_series_comparison_zerorank(self):
+ # GH#13006
+ result = np.float64(0) > pd.Series([1, 2, 3])
+ expected = 0.0 > pd.Series([1, 2, 3])
+ tm.assert_series_equal(result, expected)
+ result = pd.Series([1, 2, 3]) < np.float64(0)
+ expected = pd.Series([1, 2, 3]) < 0.0
+ tm.assert_series_equal(result, expected)
+ result = np.array([0, 1, 2])[0] > pd.Series([0, 1, 2])
+ expected = 0.0 > pd.Series([1, 2, 3])
+ tm.assert_series_equal(result, expected)
+
+ def test_df_numeric_cmp_dt64_raises(self):
+ # GH#8932, GH#22163
+ ts = pd.Timestamp.now()
+ df = pd.DataFrame({'x': range(5)})
+ with pytest.raises(TypeError):
+ df > ts
+ with pytest.raises(TypeError):
+ df < ts
+ with pytest.raises(TypeError):
+ ts < df
+ with pytest.raises(TypeError):
+ ts > df
+
+ assert not (df == ts).any().any()
+ assert (df != ts).all().all()
+
+ def test_compare_invalid(self):
+ # GH#8058
+ # ops testing
+ a = pd.Series(np.random.randn(5), name=0)
+ b = pd.Series(np.random.randn(5))
+ b.name = pd.Timestamp('2000-01-01')
+ tm.assert_series_equal(a / b, 1 / (b / a))
+
+
+# ------------------------------------------------------------------
+# Numeric dtypes Arithmetic with Timedelta Scalar
+
+class TestNumericArraylikeArithmeticWithTimedeltaLike(object):
+
+    # TODO: also check name retention
+ @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series])
+ @pytest.mark.parametrize('left', [
+ pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype)
+ for dtype in ['i1', 'i2', 'i4', 'i8',
+ 'u1', 'u2', 'u4', 'u8',
+ 'f2', 'f4', 'f8']
+ for cls in [pd.Series, pd.Index]],
+ ids=lambda x: type(x).__name__ + str(x.dtype))
+ def test_mul_td64arr(self, left, box_cls):
+ # GH#22390
+ right = np.array([1, 2, 3], dtype='m8[s]')
+ right = box_cls(right)
+
+ expected = pd.TimedeltaIndex(['10s', '40s', '90s'])
+ if isinstance(left, pd.Series) or box_cls is pd.Series:
+ expected = pd.Series(expected)
+
+ result = left * right
+ tm.assert_equal(result, expected)
+
+ result = right * left
+ tm.assert_equal(result, expected)
+
+    # TODO: also check name retention
+ @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series])
+ @pytest.mark.parametrize('left', [
+ pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype)
+ for dtype in ['i1', 'i2', 'i4', 'i8',
+ 'u1', 'u2', 'u4', 'u8',
+ 'f2', 'f4', 'f8']
+ for cls in [pd.Series, pd.Index]],
+ ids=lambda x: type(x).__name__ + str(x.dtype))
+ def test_div_td64arr(self, left, box_cls):
+ # GH#22390
+ right = np.array([10, 40, 90], dtype='m8[s]')
+ right = box_cls(right)
+
+ expected = pd.TimedeltaIndex(['1s', '2s', '3s'])
+ if isinstance(left, pd.Series) or box_cls is pd.Series:
+ expected = pd.Series(expected)
+
+ result = right / left
+ tm.assert_equal(result, expected)
+
+ result = right // left
+ tm.assert_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ left / right
+
+ with pytest.raises(TypeError):
+ left // right
+
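+    # Direction sketch (values ours): a duration divided by a number scales,
+    # while a number divided by a duration has no meaning, which is the
+    # asymmetry the raises above pin down.
+    def _td64_div_direction_sketch(self):
+        tdi = TimedeltaIndex(['10s', '20s'])
+        tm.assert_index_equal(tdi / 2, TimedeltaIndex(['5s', '10s']))
+        with pytest.raises(TypeError):
+            2 / tdi
+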
+ # TODO: de-duplicate with test_numeric_arr_mul_tdscalar
+ def test_ops_series(self):
+        # regression test for GH#8813
+ td = Timedelta('1 day')
+ other = pd.Series([1, 2])
+ expected = pd.Series(pd.to_timedelta(['1 day', '2 days']))
+ tm.assert_series_equal(expected, td * other)
+ tm.assert_series_equal(expected, other * td)
+
+ # TODO: also test non-nanosecond timedelta64 and Tick objects;
+ # see test_numeric_arr_rdiv_tdscalar for note on these failing
+ @pytest.mark.parametrize('scalar_td', [
+ Timedelta(days=1),
+ Timedelta(days=1).to_timedelta64(),
+ Timedelta(days=1).to_pytimedelta()],
+ ids=lambda x: type(x).__name__)
+ def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box):
+ # GH#19333
+ index = numeric_idx
+
+ expected = pd.timedelta_range('0 days', '4 days')
+
+ index = tm.box_expected(index, box)
+ expected = tm.box_expected(expected, box)
+
+ result = index * scalar_td
+ tm.assert_equal(result, expected)
+
+ commute = scalar_td * index
+ tm.assert_equal(commute, expected)
+
+ def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box):
+ index = numeric_idx[1:3]
+
+ expected = TimedeltaIndex(['3 Days', '36 Hours'])
+
+ index = tm.box_expected(index, box)
+ expected = tm.box_expected(expected, box)
+
+ result = three_days / index
+ tm.assert_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ index / three_days
+
+ @pytest.mark.parametrize('other', [
+ pd.Timedelta(hours=31),
+ pd.Timedelta(hours=31).to_pytimedelta(),
+ pd.Timedelta(hours=31).to_timedelta64(),
+ pd.Timedelta(hours=31).to_timedelta64().astype('m8[h]'),
+ np.timedelta64('NaT'),
+ np.timedelta64('NaT', 'D'),
+ pd.offsets.Minute(3),
+ pd.offsets.Second(0)])
+ def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box):
+ left = tm.box_expected(numeric_idx, box)
+ with pytest.raises(TypeError):
+ left + other
+ with pytest.raises(TypeError):
+ other + left
+ with pytest.raises(TypeError):
+ left - other
+ with pytest.raises(TypeError):
+ other - left
+
+
+# ------------------------------------------------------------------
+# Arithmetic
+
+class TestDivisionByZero(object):
+
+ def test_div_zero(self, zero, numeric_idx):
+ idx = numeric_idx
+
+ expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf],
+ dtype=np.float64)
+ result = idx / zero
+ tm.assert_index_equal(result, expected)
+ ser_compat = Series(idx).astype('i8') / np.array(zero).astype('i8')
+ tm.assert_series_equal(ser_compat, Series(result))
+
+ def test_floordiv_zero(self, zero, numeric_idx):
+ idx = numeric_idx
+
+ expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf],
+ dtype=np.float64)
+
+ result = idx // zero
+ tm.assert_index_equal(result, expected)
+ ser_compat = Series(idx).astype('i8') // np.array(zero).astype('i8')
+ tm.assert_series_equal(ser_compat, Series(result))
+
+ def test_mod_zero(self, zero, numeric_idx):
+ idx = numeric_idx
+
+ expected = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan],
+ dtype=np.float64)
+ result = idx % zero
+ tm.assert_index_equal(result, expected)
+ ser_compat = Series(idx).astype('i8') % np.array(zero).astype('i8')
+ tm.assert_series_equal(ser_compat, Series(result))
+
+ def test_divmod_zero(self, zero, numeric_idx):
+ idx = numeric_idx
+
+ exleft = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf],
+ dtype=np.float64)
+ exright = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan],
+ dtype=np.float64)
+
+ result = divmod(idx, zero)
+ tm.assert_index_equal(result[0], exleft)
+ tm.assert_index_equal(result[1], exright)
+
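+    # Consistency sketch (values ours): divmod by zero pairs the signed-inf
+    # quotient of // with the all-NaN remainder of %, mirroring the two
+    # tests above.
+    def _divmod_zero_sketch(self):
+        div, mod = divmod(pd.Index([-1., 0., 1.]), 0)
+        tm.assert_index_equal(div, pd.Index([-np.inf, np.nan, np.inf]))
+        tm.assert_index_equal(mod, pd.Index([np.nan, np.nan, np.nan]))
+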
+ # ------------------------------------------------------------------
+
+ @pytest.mark.parametrize('dtype2', [
+ np.int64, np.int32, np.int16, np.int8,
+ np.float64, np.float32, np.float16,
+ np.uint64, np.uint32, np.uint16, np.uint8])
+ @pytest.mark.parametrize('dtype1', [np.int64, np.float64, np.uint64])
+ def test_ser_div_ser(self, dtype1, dtype2):
+ # no longer do integer div for any ops, but deal with the 0's
+ first = Series([3, 4, 5, 8], name='first').astype(dtype1)
+ second = Series([0, 0, 0, 3], name='second').astype(dtype2)
+
+ with np.errstate(all='ignore'):
+ expected = Series(first.values.astype(np.float64) / second.values,
+ dtype='float64', name=None)
+ expected.iloc[0:3] = np.inf
+
+ result = first / second
+ tm.assert_series_equal(result, expected)
+ assert not result.equals(second / first)
+
+ def test_rdiv_zero_compat(self):
+ # GH#8674
+ zero_array = np.array([0] * 5)
+ data = np.random.randn(5)
+ expected = Series([0.] * 5)
+
+ result = zero_array / Series(data)
+ tm.assert_series_equal(result, expected)
+
+ result = Series(zero_array) / data
+ tm.assert_series_equal(result, expected)
+
+ result = Series(zero_array) / Series(data)
+ tm.assert_series_equal(result, expected)
+
+ def test_div_zero_inf_signs(self):
+ # GH#9144, inf signing
+ ser = Series([-1, 0, 1], name='first')
+ expected = Series([-np.inf, np.nan, np.inf], name='first')
+
+ result = ser / 0
+ tm.assert_series_equal(result, expected)
+
+ def test_rdiv_zero(self):
+ # GH#9144
+ ser = Series([-1, 0, 1], name='first')
+ expected = Series([0.0, np.nan, 0.0], name='first')
+
+ result = 0 / ser
+ tm.assert_series_equal(result, expected)
+
+ def test_floordiv_div(self):
+ # GH#9144
+ ser = Series([-1, 0, 1], name='first')
+
+ result = ser // 0
+ expected = Series([-np.inf, np.nan, np.inf], name='first')
+ tm.assert_series_equal(result, expected)
+
+ def test_df_div_zero_df(self):
+ # integer div, but deal with the 0's (GH#9144)
+ df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
+ result = df / df
+
+ first = pd.Series([1.0, 1.0, 1.0, 1.0])
+ second = pd.Series([np.nan, np.nan, np.nan, 1])
+ expected = pd.DataFrame({'first': first, 'second': second})
+ tm.assert_frame_equal(result, expected)
+
+ def test_df_div_zero_array(self):
+ # integer div, but deal with the 0's (GH#9144)
+ df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
+
+ first = pd.Series([1.0, 1.0, 1.0, 1.0])
+ second = pd.Series([np.nan, np.nan, np.nan, 1])
+ expected = pd.DataFrame({'first': first, 'second': second})
+
+ with np.errstate(all='ignore'):
+ arr = df.values.astype('float') / df.values
+ result = pd.DataFrame(arr, index=df.index,
+ columns=df.columns)
+ tm.assert_frame_equal(result, expected)
+
+ def test_df_div_zero_int(self):
+ # integer div, but deal with the 0's (GH#9144)
+ df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
+
+ result = df / 0
+ expected = pd.DataFrame(np.inf, index=df.index, columns=df.columns)
+ expected.iloc[0:3, 1] = np.nan
+ tm.assert_frame_equal(result, expected)
+
+ # numpy has a slightly different (wrong) treatment
+ with np.errstate(all='ignore'):
+ arr = df.values.astype('float64') / 0
+ result2 = pd.DataFrame(arr, index=df.index,
+ columns=df.columns)
+ tm.assert_frame_equal(result2, expected)
+
+ def test_df_div_zero_series_does_not_commute(self):
+ # integer div, but deal with the 0's (GH#9144)
+ df = pd.DataFrame(np.random.randn(10, 5))
+ ser = df[0]
+ res = ser / df
+ res2 = df / ser
+ assert not res.fillna(0).equals(res2.fillna(0))
+
+ # ------------------------------------------------------------------
+ # Mod By Zero
+
+ def test_df_mod_zero_df(self):
+ # GH#3590, modulo as ints
+ df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
+
+ # this is technically wrong, as the integer portion is coerced to float
+ first = pd.Series([0, 0, 0, 0], dtype='float64')
+ second = pd.Series([np.nan, np.nan, np.nan, 0])
+ expected = pd.DataFrame({'first': first, 'second': second})
+ result = df % df
+ tm.assert_frame_equal(result, expected)
+
+ def test_df_mod_zero_array(self):
+ # GH#3590, modulo as ints
+ df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
+
+ # this is technically wrong, as the integer portion is coerced to float
+ first = pd.Series([0, 0, 0, 0], dtype='float64')
+ second = pd.Series([np.nan, np.nan, np.nan, 0])
+ expected = pd.DataFrame({'first': first, 'second': second})
+
+ # numpy has a slightly different (wrong) treatment
+ with np.errstate(all='ignore'):
+ arr = df.values % df.values
+ result2 = pd.DataFrame(arr, index=df.index,
+ columns=df.columns, dtype='float64')
+ result2.iloc[0:3, 1] = np.nan
+ tm.assert_frame_equal(result2, expected)
+
+ def test_df_mod_zero_int(self):
+ # GH#3590, modulo as ints
+ df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
+
+ result = df % 0
+ expected = pd.DataFrame(np.nan, index=df.index, columns=df.columns)
+ tm.assert_frame_equal(result, expected)
+
+ # numpy has a slightly different (wrong) treatment
+ with np.errstate(all='ignore'):
+ arr = df.values.astype('float64') % 0
+ result2 = pd.DataFrame(arr, index=df.index, columns=df.columns)
+ tm.assert_frame_equal(result2, expected)
+
+ def test_df_mod_zero_series_does_not_commute(self):
+ # GH#3590, modulo as ints
+ # not commutative with series
+ df = pd.DataFrame(np.random.randn(10, 5))
+ ser = df[0]
+ res = ser % df
+ res2 = df % ser
+ assert not res.fillna(0).equals(res2.fillna(0))
+
+
+class TestMultiplicationDivision(object):
+ # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__
+ # for non-timestamp/timedelta/period dtypes
+
+ @pytest.mark.parametrize('box', [
+ pytest.param(pd.Index,
+ marks=pytest.mark.xfail(reason="Index.__div__ always "
+ "raises",
+ raises=TypeError)),
+ pd.Series,
+ pd.DataFrame
+ ], ids=lambda x: x.__name__)
+ def test_divide_decimal(self, box):
+ # resolves issue GH#9787
+ ser = Series([Decimal(10)])
+ expected = Series([Decimal(5)])
+
+ ser = tm.box_expected(ser, box)
+ expected = tm.box_expected(expected, box)
+
+ result = ser / Decimal(2)
+
+ tm.assert_equal(result, expected)
+
+ result = ser // Decimal(2)
+ tm.assert_equal(result, expected)
+
+ def test_div_equiv_binop(self):
+ # Test Series.div as well as Series.__div__
+ # float/integer issue
+ # GH#7785
+ first = Series([1, 0], name='first')
+ second = Series([-0.01, -0.02], name='second')
+ expected = Series([-0.01, -np.inf])
+
+ result = second.div(first)
+ tm.assert_series_equal(result, expected, check_names=False)
+
+ result = second / first
+ tm.assert_series_equal(result, expected)
+
+ def test_div_int(self, numeric_idx):
+ # truediv under PY3
+ idx = numeric_idx
+ result = idx / 1
+ expected = idx
+ if PY3:
+ expected = expected.astype('float64')
+ tm.assert_index_equal(result, expected)
+
+ result = idx / 2
+ expected = Index(idx.values / 2)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('op', [operator.mul, ops.rmul, operator.floordiv])
+ def test_mul_int_identity(self, op, numeric_idx, box):
+ idx = numeric_idx
+ idx = tm.box_expected(idx, box)
+
+ result = op(idx, 1)
+ tm.assert_equal(result, idx)
+
+ def test_mul_int_array(self, numeric_idx):
+ idx = numeric_idx
+ didx = idx * idx
+
+ result = idx * np.array(5, dtype='int64')
+ tm.assert_index_equal(result, idx * 5)
+
+ arr_dtype = 'uint64' if isinstance(idx, pd.UInt64Index) else 'int64'
+ result = idx * np.arange(5, dtype=arr_dtype)
+ tm.assert_index_equal(result, didx)
+
+ def test_mul_int_series(self, numeric_idx):
+ idx = numeric_idx
+ didx = idx * idx
+
+ arr_dtype = 'uint64' if isinstance(idx, pd.UInt64Index) else 'int64'
+ result = idx * Series(np.arange(5, dtype=arr_dtype))
+ tm.assert_series_equal(result, Series(didx))
+
+ def test_mul_float_series(self, numeric_idx):
+ idx = numeric_idx
+ rng5 = np.arange(5, dtype='float64')
+
+ result = idx * Series(rng5 + 0.1)
+ expected = Series(rng5 * (rng5 + 0.1))
+ tm.assert_series_equal(result, expected)
+
+ def test_mul_index(self, numeric_idx):
+ # in general not true for RangeIndex
+ idx = numeric_idx
+ if not isinstance(idx, pd.RangeIndex):
+ result = idx * idx
+ tm.assert_index_equal(result, idx ** 2)
+
+ def test_mul_datelike_raises(self, numeric_idx):
+ idx = numeric_idx
+ with pytest.raises(TypeError):
+ idx * pd.date_range('20130101', periods=5)
+
+ def test_mul_size_mismatch_raises(self, numeric_idx):
+ idx = numeric_idx
+ with pytest.raises(ValueError):
+ idx * idx[0:3]
+ with pytest.raises(ValueError):
+ idx * np.array([1, 2])
+
+ @pytest.mark.parametrize('op', [operator.pow, ops.rpow])
+ def test_pow_float(self, op, numeric_idx, box):
+ # test power calculations both ways, GH#14973
+ idx = numeric_idx
+ expected = pd.Float64Index(op(idx.values, 2.0))
+
+ idx = tm.box_expected(idx, box)
+ expected = tm.box_expected(expected, box)
+
+ result = op(idx, 2.0)
+ tm.assert_equal(result, expected)
+
+ def test_modulo(self, numeric_idx, box):
+ # GH#9244
+ idx = numeric_idx
+ expected = Index(idx.values % 2)
+
+ idx = tm.box_expected(idx, box)
+ expected = tm.box_expected(expected, box)
+
+ result = idx % 2
+ tm.assert_equal(result, expected)
+
+ def test_divmod_scalar(self, numeric_idx):
+ idx = numeric_idx
+
+ result = divmod(idx, 2)
+ with np.errstate(all='ignore'):
+ div, mod = divmod(idx.values, 2)
+
+ expected = Index(div), Index(mod)
+ for r, e in zip(result, expected):
+ tm.assert_index_equal(r, e)
+
+ def test_divmod_ndarray(self, numeric_idx):
+ idx = numeric_idx
+ other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2
+
+ result = divmod(idx, other)
+ with np.errstate(all='ignore'):
+ div, mod = divmod(idx.values, other)
+
+ expected = Index(div), Index(mod)
+ for r, e in zip(result, expected):
+ tm.assert_index_equal(r, e)
+
+ def test_divmod_series(self, numeric_idx):
+ idx = numeric_idx
+ other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2
+
+ result = divmod(idx, Series(other))
+ with np.errstate(all='ignore'):
+ div, mod = divmod(idx.values, other)
+
+ expected = Series(div), Series(mod)
+ for r, e in zip(result, expected):
+ tm.assert_series_equal(r, e)
+
+ @pytest.mark.parametrize('other', [np.nan, 7, -23, 2.718, -3.14, np.inf])
+ def test_ops_np_scalar(self, other):
+ vals = np.random.randn(5, 3)
+ f = lambda x: pd.DataFrame(x, index=list('ABCDE'),
+ columns=['jim', 'joe', 'jolie'])
+
+ df = f(vals)
+
+ tm.assert_frame_equal(df / np.array(other), f(vals / other))
+ tm.assert_frame_equal(np.array(other) * df, f(vals * other))
+ tm.assert_frame_equal(df + np.array(other), f(vals + other))
+ tm.assert_frame_equal(np.array(other) - df, f(other - vals))
+
+    # TODO: This came from tests.series.test_operators, needs cleanup
+ def test_operators_frame(self):
+ # rpow does not work with DataFrame
+ ts = tm.makeTimeSeries()
+ ts.name = 'ts'
+
+ df = pd.DataFrame({'A': ts})
+
+ tm.assert_series_equal(ts + ts, ts + df['A'],
+ check_names=False)
+ tm.assert_series_equal(ts ** ts, ts ** df['A'],
+ check_names=False)
+ tm.assert_series_equal(ts < ts, ts < df['A'],
+ check_names=False)
+ tm.assert_series_equal(ts / ts, ts / df['A'],
+ check_names=False)
+
+    # TODO: this came from tests.series.test_analytics, needs cleanup and
+ # de-duplication with test_modulo above
+ def test_modulo2(self):
+ with np.errstate(all='ignore'):
+
+ # GH#3590, modulo as ints
+ p = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
+ result = p['first'] % p['second']
+ expected = Series(p['first'].values % p['second'].values,
+ dtype='float64')
+ expected.iloc[0:3] = np.nan
+ tm.assert_series_equal(result, expected)
+
+ result = p['first'] % 0
+ expected = Series(np.nan, index=p.index, name='first')
+ tm.assert_series_equal(result, expected)
+
+ p = p.astype('float64')
+ result = p['first'] % p['second']
+ expected = Series(p['first'].values % p['second'].values)
+ tm.assert_series_equal(result, expected)
+
+ p = p.astype('float64')
+ result = p['first'] % p['second']
+ result2 = p['second'] % p['first']
+ assert not result.equals(result2)
+
+ # GH#9144
+ s = Series([0, 1])
+
+ result = s % 0
+ expected = Series([np.nan, np.nan])
+ tm.assert_series_equal(result, expected)
+
+ result = 0 % s
+ expected = Series([np.nan, 0.0])
+ tm.assert_series_equal(result, expected)
+
+
+class TestAdditionSubtraction(object):
+ # __add__, __sub__, __radd__, __rsub__, __iadd__, __isub__
+ # for non-timestamp/timedelta/period dtypes
+
+    # TODO: This came from tests.series.test_operators, needs cleanup
+ def test_arith_ops_df_compat(self):
+ # GH#1134
+ s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x')
+ s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x')
+
+ exp = pd.Series([3.0, 4.0, np.nan, np.nan],
+ index=list('ABCD'), name='x')
+ tm.assert_series_equal(s1 + s2, exp)
+ tm.assert_series_equal(s2 + s1, exp)
+
+ exp = pd.DataFrame({'x': [3.0, 4.0, np.nan, np.nan]},
+ index=list('ABCD'))
+ tm.assert_frame_equal(s1.to_frame() + s2.to_frame(), exp)
+ tm.assert_frame_equal(s2.to_frame() + s1.to_frame(), exp)
+
+ # different length
+ s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x')
+ s4 = pd.Series([2, 2, 2, 2], index=list('ABCD'), name='x')
+
+ exp = pd.Series([3, 4, 5, np.nan],
+ index=list('ABCD'), name='x')
+ tm.assert_series_equal(s3 + s4, exp)
+ tm.assert_series_equal(s4 + s3, exp)
+
+ exp = pd.DataFrame({'x': [3, 4, 5, np.nan]},
+ index=list('ABCD'))
+ tm.assert_frame_equal(s3.to_frame() + s4.to_frame(), exp)
+ tm.assert_frame_equal(s4.to_frame() + s3.to_frame(), exp)
+
+    # TODO: This came from tests.series.test_operators, needs cleanup
+ def test_series_frame_radd_bug(self):
+ # GH#353
+ vals = pd.Series(tm.rands_array(5, 10))
+ result = 'foo_' + vals
+ expected = vals.map(lambda x: 'foo_' + x)
+ tm.assert_series_equal(result, expected)
+
+ frame = pd.DataFrame({'vals': vals})
+ result = 'foo_' + frame
+ expected = pd.DataFrame({'vals': vals.map(lambda x: 'foo_' + x)})
+ tm.assert_frame_equal(result, expected)
+
+ ts = tm.makeTimeSeries()
+ ts.name = 'ts'
+
+ # really raise this time
+ now = pd.Timestamp.now().to_pydatetime()
+ with pytest.raises(TypeError):
+ now + ts
+
+ with pytest.raises(TypeError):
+ ts + now
+
+    # TODO: This came from tests.series.test_operators, needs cleanup
+ def test_datetime64_with_index(self):
+ # arithmetic integer ops with an index
+ ser = pd.Series(np.random.randn(5))
+ expected = ser - ser.index.to_series()
+ result = ser - ser.index
+ tm.assert_series_equal(result, expected)
+
+ # GH#4629
+ # arithmetic datetime64 ops with an index
+ ser = pd.Series(pd.date_range('20130101', periods=5),
+ index=pd.date_range('20130101', periods=5))
+ expected = ser - ser.index.to_series()
+ result = ser - ser.index
+ tm.assert_series_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ # GH#18850
+ result = ser - ser.index.to_period()
+
+ df = pd.DataFrame(np.random.randn(5, 2),
+ index=pd.date_range('20130101', periods=5))
+ df['date'] = pd.Timestamp('20130102')
+ df['expected'] = df['date'] - df.index.to_series()
+ df['result'] = df['date'] - df.index
+ tm.assert_series_equal(df['result'], df['expected'], check_names=False)
+
+ # TODO: taken from tests.frame.test_operators, needs cleanup
+ def test_frame_operators(self):
+ seriesd = tm.getSeriesData()
+ frame = pd.DataFrame(seriesd)
+ frame2 = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])
+
+ garbage = np.random.random(4)
+ colSeries = pd.Series(garbage, index=np.array(frame.columns))
+
+ idSum = frame + frame
+ seriesSum = frame + colSeries
+
+ for col, series in idSum.items():
+ for idx, val in series.items():
+ origVal = frame[col][idx] * 2
+ if not np.isnan(val):
+ assert val == origVal
+ else:
+ assert np.isnan(origVal)
+
+ for col, series in seriesSum.items():
+ for idx, val in series.items():
+ origVal = frame[col][idx] + colSeries[col]
+ if not np.isnan(val):
+ assert val == origVal
+ else:
+ assert np.isnan(origVal)
+
+ added = frame2 + frame2
+ expected = frame2 * 2
+ tm.assert_frame_equal(added, expected)
+
+ df = pd.DataFrame({'a': ['a', None, 'b']})
+ tm.assert_frame_equal(df + df,
+ pd.DataFrame({'a': ['aa', np.nan, 'bb']}))
+
+ # Test for issue #10181
+ for dtype in ('float', 'int64'):
+ frames = [
+ pd.DataFrame(dtype=dtype),
+ pd.DataFrame(columns=['A'], dtype=dtype),
+ pd.DataFrame(index=[0], dtype=dtype),
+ ]
+ for df in frames:
+ assert (df + df).equals(df)
+ tm.assert_frame_equal(df + df, df)
+
+ # TODO: taken from tests.series.test_operators; needs cleanup
+ def test_series_operators(self):
+ def _check_op(series, other, op, pos_only=False, check_dtype=True):
+ left = np.abs(series) if pos_only else series
+ right = np.abs(other) if pos_only else other
+
+ cython_or_numpy = op(left, right)
+ python = left.combine(right, op)
+ tm.assert_series_equal(cython_or_numpy, python,
+ check_dtype=check_dtype)
+
+ def check(series, other):
+ simple_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'mod']
+
+ for opname in simple_ops:
+ _check_op(series, other, getattr(operator, opname))
+
+ _check_op(series, other, operator.pow, pos_only=True)
+
+ _check_op(series, other, lambda x, y: operator.add(y, x))
+ _check_op(series, other, lambda x, y: operator.sub(y, x))
+ _check_op(series, other, lambda x, y: operator.truediv(y, x))
+ _check_op(series, other, lambda x, y: operator.floordiv(y, x))
+ _check_op(series, other, lambda x, y: operator.mul(y, x))
+ _check_op(series, other, lambda x, y: operator.pow(y, x),
+ pos_only=True)
+ _check_op(series, other, lambda x, y: operator.mod(y, x))
+
+ tser = tm.makeTimeSeries().rename('ts')
+ check(tser, tser * 2)
+ check(tser, tser * 0)
+ check(tser, tser[::2])
+ check(tser, 5)
+
+ def check_comparators(series, other, check_dtype=True):
+ _check_op(series, other, operator.gt, check_dtype=check_dtype)
+ _check_op(series, other, operator.ge, check_dtype=check_dtype)
+ _check_op(series, other, operator.eq, check_dtype=check_dtype)
+ _check_op(series, other, operator.lt, check_dtype=check_dtype)
+ _check_op(series, other, operator.le, check_dtype=check_dtype)
+
+ check_comparators(tser, 5)
+ check_comparators(tser, tser + 1, check_dtype=False)
+
+ # TODO: taken from tests.series.test_operators; needs cleanup
+ def test_divmod(self):
+ def check(series, other):
+ results = divmod(series, other)
+ if isinstance(other, Iterable) and len(series) != len(other):
+            # if the lengths don't match, this is the test where we use
+            # `tser[::2]`; after alignment every other element of `other`
+            # is NaN, so interleave a NaN after each value in `other_np`.
+ other_np = []
+ for n in other:
+ other_np.append(n)
+ other_np.append(np.nan)
+ else:
+ other_np = other
+ other_np = np.asarray(other_np)
+ with np.errstate(all='ignore'):
+                expecteds = divmod(series.values, other_np)
+
+ for result, expected in zip(results, expecteds):
+ # check the values, name, and index separately
+ tm.assert_almost_equal(np.asarray(result), expected)
+
+ assert result.name == series.name
+ tm.assert_index_equal(result.index, series.index)
+
+ tser = tm.makeTimeSeries().rename('ts')
+ check(tser, tser * 2)
+ check(tser, tser * 0)
+ check(tser, tser[::2])
+ check(tser, 5)
+
+
+class TestUFuncCompat(object):
+
+ @pytest.mark.parametrize('holder', [pd.Int64Index, pd.UInt64Index,
+ pd.Float64Index, pd.RangeIndex,
+ pd.Series])
+ def test_ufunc_compat(self, holder):
+ box = pd.Series if holder is pd.Series else pd.Index
+
+ if holder is pd.RangeIndex:
+ idx = pd.RangeIndex(0, 5)
+ else:
+ idx = holder(np.arange(5, dtype='int64'))
+ result = np.sin(idx)
+ expected = box(np.sin(np.arange(5, dtype='int64')))
+ tm.assert_equal(result, expected)
+
+ @pytest.mark.parametrize('holder', [pd.Int64Index, pd.UInt64Index,
+ pd.Float64Index, pd.Series])
+ def test_ufunc_coercions(self, holder):
+ idx = holder([1, 2, 3, 4, 5], name='x')
+ box = pd.Series if holder is pd.Series else pd.Index
+
+ result = np.sqrt(idx)
+ assert result.dtype == 'f8' and isinstance(result, box)
+ exp = pd.Float64Index(np.sqrt(np.array([1, 2, 3, 4, 5])), name='x')
+ exp = tm.box_expected(exp, box)
+ tm.assert_equal(result, exp)
+
+ result = np.divide(idx, 2.)
+ assert result.dtype == 'f8' and isinstance(result, box)
+ exp = pd.Float64Index([0.5, 1., 1.5, 2., 2.5], name='x')
+ exp = tm.box_expected(exp, box)
+ tm.assert_equal(result, exp)
+
+ # _evaluate_numeric_binop
+ result = idx + 2.
+ assert result.dtype == 'f8' and isinstance(result, box)
+ exp = pd.Float64Index([3., 4., 5., 6., 7.], name='x')
+ exp = tm.box_expected(exp, box)
+ tm.assert_equal(result, exp)
+
+ result = idx - 2.
+ assert result.dtype == 'f8' and isinstance(result, box)
+ exp = pd.Float64Index([-1., 0., 1., 2., 3.], name='x')
+ exp = tm.box_expected(exp, box)
+ tm.assert_equal(result, exp)
+
+ result = idx * 1.
+ assert result.dtype == 'f8' and isinstance(result, box)
+ exp = pd.Float64Index([1., 2., 3., 4., 5.], name='x')
+ exp = tm.box_expected(exp, box)
+ tm.assert_equal(result, exp)
+
+ result = idx / 2.
+ assert result.dtype == 'f8' and isinstance(result, box)
+ exp = pd.Float64Index([0.5, 1., 1.5, 2., 2.5], name='x')
+ exp = tm.box_expected(exp, box)
+ tm.assert_equal(result, exp)
+
+
+class TestObjectDtypeEquivalence(object):
+ # Tests that arithmetic operations match operations executed elementwise
+
+ @pytest.mark.parametrize('dtype', [None, object])
+ def test_numarr_with_dtype_add_nan(self, dtype, box):
+ ser = pd.Series([1, 2, 3], dtype=dtype)
+ expected = pd.Series([np.nan, np.nan, np.nan], dtype=dtype)
+
+ ser = tm.box_expected(ser, box)
+ expected = tm.box_expected(expected, box)
+
+ result = np.nan + ser
+ tm.assert_equal(result, expected)
+
+ result = ser + np.nan
+ tm.assert_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', [None, object])
+ def test_numarr_with_dtype_add_int(self, dtype, box):
+ ser = pd.Series([1, 2, 3], dtype=dtype)
+ expected = pd.Series([2, 3, 4], dtype=dtype)
+
+ ser = tm.box_expected(ser, box)
+ expected = tm.box_expected(expected, box)
+
+ result = 1 + ser
+ tm.assert_equal(result, expected)
+
+ result = ser + 1
+ tm.assert_equal(result, expected)
+
+ # TODO: moved from tests.series.test_operators; needs cleanup
+ @pytest.mark.parametrize('op', [operator.add, operator.sub, operator.mul,
+ operator.truediv, operator.floordiv])
+ def test_operators_reverse_object(self, op):
+ # GH#56
+ arr = pd.Series(np.random.randn(10), index=np.arange(10), dtype=object)
+
+ result = op(1., arr)
+ expected = op(1., arr.astype(float))
+ tm.assert_series_equal(result.astype(float), expected)
+
+
+class TestNumericArithmeticUnsorted(object):
+ # Tests in this class have been moved from type-specific test modules
+ # but not yet sorted, parametrized, and de-duplicated
+
+ def check_binop(self, ops, scalars, idxs):
+ for op in ops:
+ for a, b in combinations(idxs, 2):
+ result = op(a, b)
+ expected = op(pd.Int64Index(a), pd.Int64Index(b))
+ tm.assert_index_equal(result, expected)
+ for idx in idxs:
+ for scalar in scalars:
+ result = op(idx, scalar)
+ expected = op(pd.Int64Index(idx), scalar)
+ tm.assert_index_equal(result, expected)
+
+ def test_binops(self):
+ ops = [operator.add, operator.sub, operator.mul, operator.floordiv,
+ operator.truediv]
+ scalars = [-1, 1, 2]
+ idxs = [pd.RangeIndex(0, 10, 1), pd.RangeIndex(0, 20, 2),
+ pd.RangeIndex(-10, 10, 2), pd.RangeIndex(5, -5, -1)]
+ self.check_binop(ops, scalars, idxs)
+
+ def test_binops_pow(self):
+        # later versions of numpy don't allow raising integers to negative
+        # integer powers, so test pow separately
+ # https://github.com/numpy/numpy/pull/8127
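+        # illustration with newer numpy:
+        #   >>> np.arange(1, 3, dtype=np.int64) ** -1
+        #   ValueError: Integers to negative integer powers are not allowed.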
+ ops = [pow]
+ scalars = [1, 2]
+ idxs = [pd.RangeIndex(0, 10, 1), pd.RangeIndex(0, 20, 2)]
+ self.check_binop(ops, scalars, idxs)
+
+ # TODO: mod, divmod?
+ @pytest.mark.parametrize('op', [operator.add, operator.sub,
+ operator.mul, operator.floordiv,
+ operator.truediv, operator.pow])
+ def test_arithmetic_with_frame_or_series(self, op):
+ # check that we return NotImplemented when operating with Series
+ # or DataFrame
+ index = pd.RangeIndex(5)
+ other = pd.Series(np.random.randn(5))
+
+ expected = op(pd.Series(index), other)
+ result = op(index, other)
+ tm.assert_series_equal(result, expected)
+
+ other = pd.DataFrame(np.random.randn(2, 5))
+ expected = op(pd.DataFrame([index, index]), other)
+ result = op(index, other)
+ tm.assert_frame_equal(result, expected)
+
+ def test_numeric_compat2(self):
+ # validate that we are handling the RangeIndex overrides to numeric ops
+ # and returning RangeIndex where possible
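+        # e.g. scalar multiplication can preserve the RangeIndex type:
+        #   >>> pd.RangeIndex(0, 10, 2) * 2
+        #   RangeIndex(start=0, stop=20, step=4)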
+
+ idx = pd.RangeIndex(0, 10, 2)
+
+ result = idx * 2
+ expected = pd.RangeIndex(0, 20, 4)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ result = idx + 2
+ expected = pd.RangeIndex(2, 12, 2)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ result = idx - 2
+ expected = pd.RangeIndex(-2, 8, 2)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ # truediv under PY3
+ result = idx / 2
+
+ if PY3:
+ expected = pd.RangeIndex(0, 5, 1).astype('float64')
+ else:
+ expected = pd.RangeIndex(0, 5, 1)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ result = idx / 4
+ expected = pd.RangeIndex(0, 10, 2) / 4
+ tm.assert_index_equal(result, expected, exact=True)
+
+ result = idx // 1
+ expected = idx
+ tm.assert_index_equal(result, expected, exact=True)
+
+ # __mul__
+ result = idx * idx
+ expected = Index(idx.values * idx.values)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ # __pow__
+ idx = pd.RangeIndex(0, 1000, 2)
+ result = idx ** 2
+ expected = idx._int64index ** 2
+ tm.assert_index_equal(Index(result.values), expected, exact=True)
+
+ # __floordiv__
+ cases_exact = [
+ (pd.RangeIndex(0, 1000, 2), 2, pd.RangeIndex(0, 500, 1)),
+ (pd.RangeIndex(-99, -201, -3), -3, pd.RangeIndex(33, 67, 1)),
+ (pd.RangeIndex(0, 1000, 1), 2,
+ pd.RangeIndex(0, 1000, 1)._int64index // 2),
+ (pd.RangeIndex(0, 100, 1), 2.0,
+ pd.RangeIndex(0, 100, 1)._int64index // 2.0),
+ (pd.RangeIndex(0), 50, pd.RangeIndex(0)),
+ (pd.RangeIndex(2, 4, 2), 3, pd.RangeIndex(0, 1, 1)),
+ (pd.RangeIndex(-5, -10, -6), 4, pd.RangeIndex(-2, -1, 1)),
+ (pd.RangeIndex(-100, -200, 3), 2, pd.RangeIndex(0))]
+ for idx, div, expected in cases_exact:
+ tm.assert_index_equal(idx // div, expected, exact=True)
+
+ @pytest.mark.parametrize('dtype', [np.int64, np.float64])
+ @pytest.mark.parametrize('delta', [1, 0, -1])
+ def test_addsub_arithmetic(self, dtype, delta):
+ # GH#8142
+ delta = dtype(delta)
+ index = pd.Index([10, 11, 12], dtype=dtype)
+ result = index + delta
+ expected = pd.Index(index.values + delta, dtype=dtype)
+ tm.assert_index_equal(result, expected)
+
+ # this subtraction used to fail
+ result = index - delta
+ expected = pd.Index(index.values - delta, dtype=dtype)
+ tm.assert_index_equal(result, expected)
+
+ tm.assert_index_equal(index + index, 2 * index)
+ tm.assert_index_equal(index - index, 0 * index)
+ assert not (index - index).empty
diff --git a/contrib/python/pandas/py2/pandas/tests/arithmetic/test_object.py b/contrib/python/pandas/py2/pandas/tests/arithmetic/test_object.py
new file mode 100644
index 00000000000..29063ae3f50
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arithmetic/test_object.py
@@ -0,0 +1,314 @@
+# -*- coding: utf-8 -*-
+# Arithmetic tests for DataFrame/Series/Index/Array classes that should
+# behave identically.
+# Specifically for object dtype
+from decimal import Decimal
+import operator
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Series, Timestamp
+from pandas.core import ops
+import pandas.util.testing as tm
+
+# ------------------------------------------------------------------
+# Comparisons
+
+
+class TestObjectComparisons(object):
+
+ def test_comparison_object_numeric_nas(self):
+ ser = Series(np.random.randn(10), dtype=object)
+ shifted = ser.shift(2)
+
+ ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne']
+ for op in ops:
+ func = getattr(operator, op)
+
+ result = func(ser, shifted)
+ expected = func(ser.astype(float), shifted.astype(float))
+ tm.assert_series_equal(result, expected)
+
+ def test_object_comparisons(self):
+ ser = Series(['a', 'b', np.nan, 'c', 'a'])
+
+ result = ser == 'a'
+ expected = Series([True, False, False, False, True])
+ tm.assert_series_equal(result, expected)
+
+ result = ser < 'a'
+ expected = Series([False, False, False, False, False])
+ tm.assert_series_equal(result, expected)
+
+ result = ser != 'a'
+ expected = -(ser == 'a')
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', [None, object])
+ def test_more_na_comparisons(self, dtype):
+ left = Series(['a', np.nan, 'c'], dtype=dtype)
+ right = Series(['a', np.nan, 'd'], dtype=dtype)
+
+ result = left == right
+ expected = Series([True, False, False])
+ tm.assert_series_equal(result, expected)
+
+ result = left != right
+ expected = Series([False, True, True])
+ tm.assert_series_equal(result, expected)
+
+ result = left == np.nan
+ expected = Series([False, False, False])
+ tm.assert_series_equal(result, expected)
+
+ result = left != np.nan
+ expected = Series([True, True, True])
+ tm.assert_series_equal(result, expected)
+
+
+# ------------------------------------------------------------------
+# Arithmetic
+
+class TestArithmetic(object):
+
+ # TODO: parametrize
+ def test_pow_ops_object(self):
+ # GH#22922
+ # pow is weird with masking & 1, so testing here
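+        # (1 ** nan and nan ** 0 both evaluate to 1.0, so NaN positions
+        # cannot simply be masked out of the result)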
+ a = Series([1, np.nan, 1, np.nan], dtype=object)
+ b = Series([1, np.nan, np.nan, 1], dtype=object)
+ result = a ** b
+ expected = Series(a.values ** b.values, dtype=object)
+ tm.assert_series_equal(result, expected)
+
+ result = b ** a
+ expected = Series(b.values ** a.values, dtype=object)
+
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("op", [operator.add, ops.radd])
+ @pytest.mark.parametrize("other", ["category", "Int64"])
+ def test_add_extension_scalar(self, other, box, op):
+ # GH#22378
+ # Check that scalars satisfying is_extension_array_dtype(obj)
+ # do not incorrectly try to dispatch to an ExtensionArray operation
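+        # ("category" and "Int64" are registered dtype strings, so
+        # is_extension_array_dtype(other) is True here even though `other`
+        # is only a str)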
+
+ arr = pd.Series(['a', 'b', 'c'])
+ expected = pd.Series([op(x, other) for x in arr])
+
+ arr = tm.box_expected(arr, box)
+ expected = tm.box_expected(expected, box)
+
+ result = op(arr, other)
+ tm.assert_equal(result, expected)
+
+ @pytest.mark.parametrize('box', [
+ pytest.param(pd.Index,
+ marks=pytest.mark.xfail(reason="Does not mask nulls",
+ raises=TypeError)),
+ pd.Series,
+ pd.DataFrame
+ ], ids=lambda x: x.__name__)
+ def test_objarr_add_str(self, box):
+ ser = pd.Series(['x', np.nan, 'x'])
+ expected = pd.Series(['xa', np.nan, 'xa'])
+
+ ser = tm.box_expected(ser, box)
+ expected = tm.box_expected(expected, box)
+
+ result = ser + 'a'
+ tm.assert_equal(result, expected)
+
+ @pytest.mark.parametrize('box', [
+ pytest.param(pd.Index,
+ marks=pytest.mark.xfail(reason="Does not mask nulls",
+ raises=TypeError)),
+ pd.Series,
+ pd.DataFrame
+ ], ids=lambda x: x.__name__)
+ def test_objarr_radd_str(self, box):
+ ser = pd.Series(['x', np.nan, 'x'])
+ expected = pd.Series(['ax', np.nan, 'ax'])
+
+ ser = tm.box_expected(ser, box)
+ expected = tm.box_expected(expected, box)
+
+ result = 'a' + ser
+ tm.assert_equal(result, expected)
+
+ @pytest.mark.parametrize('data', [
+ [1, 2, 3],
+ [1.1, 2.2, 3.3],
+ [Timestamp('2011-01-01'), Timestamp('2011-01-02'), pd.NaT],
+ ['x', 'y', 1]])
+ @pytest.mark.parametrize('dtype', [None, object])
+ def test_objarr_radd_str_invalid(self, dtype, data, box):
+ ser = Series(data, dtype=dtype)
+
+ ser = tm.box_expected(ser, box)
+ with pytest.raises(TypeError):
+ 'foo_' + ser
+
+ @pytest.mark.parametrize('op', [operator.add, ops.radd,
+ operator.sub, ops.rsub])
+ def test_objarr_add_invalid(self, op, box):
+ # invalid ops
+
+ obj_ser = tm.makeObjectSeries()
+ obj_ser.name = 'objects'
+
+ obj_ser = tm.box_expected(obj_ser, box)
+ with pytest.raises(Exception):
+ op(obj_ser, 1)
+ with pytest.raises(Exception):
+ op(obj_ser, np.array(1, dtype=np.int64))
+
+ # TODO: Moved from tests.series.test_operators; needs cleanup
+ def test_operators_na_handling(self):
+ ser = Series(['foo', 'bar', 'baz', np.nan])
+ result = 'prefix_' + ser
+ expected = pd.Series(['prefix_foo', 'prefix_bar',
+ 'prefix_baz', np.nan])
+ tm.assert_series_equal(result, expected)
+
+ result = ser + '_suffix'
+ expected = pd.Series(['foo_suffix', 'bar_suffix',
+ 'baz_suffix', np.nan])
+ tm.assert_series_equal(result, expected)
+
+ # TODO: parametrize over box
+ @pytest.mark.parametrize('dtype', [None, object])
+ def test_series_with_dtype_radd_timedelta(self, dtype):
+ # note this test is _not_ aimed at timedelta64-dtyped Series
+ ser = pd.Series([pd.Timedelta('1 days'), pd.Timedelta('2 days'),
+ pd.Timedelta('3 days')], dtype=dtype)
+ expected = pd.Series([pd.Timedelta('4 days'), pd.Timedelta('5 days'),
+ pd.Timedelta('6 days')])
+
+ result = pd.Timedelta('3 days') + ser
+ tm.assert_series_equal(result, expected)
+
+ result = ser + pd.Timedelta('3 days')
+ tm.assert_series_equal(result, expected)
+
+ # TODO: cleanup & parametrize over box
+ def test_mixed_timezone_series_ops_object(self):
+ # GH#13043
+ ser = pd.Series([pd.Timestamp('2015-01-01', tz='US/Eastern'),
+ pd.Timestamp('2015-01-01', tz='Asia/Tokyo')],
+ name='xxx')
+ assert ser.dtype == object
+
+ exp = pd.Series([pd.Timestamp('2015-01-02', tz='US/Eastern'),
+ pd.Timestamp('2015-01-02', tz='Asia/Tokyo')],
+ name='xxx')
+ tm.assert_series_equal(ser + pd.Timedelta('1 days'), exp)
+ tm.assert_series_equal(pd.Timedelta('1 days') + ser, exp)
+
+ # object series & object series
+ ser2 = pd.Series([pd.Timestamp('2015-01-03', tz='US/Eastern'),
+ pd.Timestamp('2015-01-05', tz='Asia/Tokyo')],
+ name='xxx')
+ assert ser2.dtype == object
+ exp = pd.Series([pd.Timedelta('2 days'), pd.Timedelta('4 days')],
+ name='xxx')
+ tm.assert_series_equal(ser2 - ser, exp)
+ tm.assert_series_equal(ser - ser2, -exp)
+
+ ser = pd.Series([pd.Timedelta('01:00:00'), pd.Timedelta('02:00:00')],
+ name='xxx', dtype=object)
+ assert ser.dtype == object
+
+ exp = pd.Series([pd.Timedelta('01:30:00'), pd.Timedelta('02:30:00')],
+ name='xxx')
+ tm.assert_series_equal(ser + pd.Timedelta('00:30:00'), exp)
+ tm.assert_series_equal(pd.Timedelta('00:30:00') + ser, exp)
+
+ # TODO: cleanup & parametrize over box
+ def test_iadd_preserves_name(self):
+ # GH#17067, GH#19723 __iadd__ and __isub__ should preserve index name
+ ser = pd.Series([1, 2, 3])
+ ser.index.name = 'foo'
+
+ ser.index += 1
+ assert ser.index.name == "foo"
+
+ ser.index -= 1
+ assert ser.index.name == "foo"
+
+ def test_add_string(self):
+ # from bug report
+ index = pd.Index(['a', 'b', 'c'])
+ index2 = index + 'foo'
+
+ assert 'a' not in index2
+ assert 'afoo' in index2
+
+ def test_iadd_string(self):
+ index = pd.Index(['a', 'b', 'c'])
+        # the test only catches the regression if the index is checked
+        # before `+=`
+ assert 'a' in index
+
+ index += '_x'
+ assert 'a_x' in index
+
+ def test_add(self):
+ index = tm.makeStringIndex(100)
+ expected = pd.Index(index.values * 2)
+ tm.assert_index_equal(index + index, expected)
+ tm.assert_index_equal(index + index.tolist(), expected)
+ tm.assert_index_equal(index.tolist() + index, expected)
+
+ # test add and radd
+ index = pd.Index(list('abc'))
+ expected = pd.Index(['a1', 'b1', 'c1'])
+ tm.assert_index_equal(index + '1', expected)
+ expected = pd.Index(['1a', '1b', '1c'])
+ tm.assert_index_equal('1' + index, expected)
+
+ def test_sub_fail(self):
+ index = tm.makeStringIndex(100)
+ with pytest.raises(TypeError):
+ index - 'a'
+ with pytest.raises(TypeError):
+ index - index
+ with pytest.raises(TypeError):
+ index - index.tolist()
+ with pytest.raises(TypeError):
+ index.tolist() - index
+
+ def test_sub_object(self):
+ # GH#19369
+ index = pd.Index([Decimal(1), Decimal(2)])
+ expected = pd.Index([Decimal(0), Decimal(1)])
+
+ result = index - Decimal(1)
+ tm.assert_index_equal(result, expected)
+
+ result = index - pd.Index([Decimal(1), Decimal(1)])
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ index - 'foo'
+
+ with pytest.raises(TypeError):
+ index - np.array([2, 'foo'])
+
+ def test_rsub_object(self):
+ # GH#19369
+ index = pd.Index([Decimal(1), Decimal(2)])
+ expected = pd.Index([Decimal(1), Decimal(0)])
+
+ result = Decimal(2) - index
+ tm.assert_index_equal(result, expected)
+
+ result = np.array([Decimal(2), Decimal(2)]) - index
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ 'foo' - index
+
+ with pytest.raises(TypeError):
+ np.array([True, pd.Timestamp.now()]) - index
diff --git a/contrib/python/pandas/py2/pandas/tests/arithmetic/test_period.py b/contrib/python/pandas/py2/pandas/tests/arithmetic/test_period.py
new file mode 100644
index 00000000000..92f209b94f0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arithmetic/test_period.py
@@ -0,0 +1,1213 @@
+# -*- coding: utf-8 -*-
+# Arithmetic tests for DataFrame/Series/Index/Array classes that should
+# behave identically.
+# Specifically for Period dtype
+import operator
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs.period import IncompatibleFrequency
+from pandas.errors import PerformanceWarning
+
+import pandas as pd
+from pandas import Period, PeriodIndex, Series, period_range
+from pandas.core import ops
+import pandas.util.testing as tm
+
+from pandas.tseries.frequencies import to_offset
+
+# ------------------------------------------------------------------
+# Comparisons
+
+
+class TestPeriodIndexComparisons(object):
+
+ @pytest.mark.parametrize("other", ["2017", 2017])
+ def test_eq(self, other):
+ idx = PeriodIndex(['2017', '2017', '2018'], freq="D")
+ expected = np.array([True, True, False])
+ result = idx == other
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_pi_cmp_period(self):
+ idx = period_range('2007-01', periods=20, freq='M')
+
+ result = idx < idx[10]
+ exp = idx.values < idx.values[10]
+ tm.assert_numpy_array_equal(result, exp)
+
+ # TODO: moved from test_datetime64; de-duplicate with version below
+ def test_parr_cmp_period_scalar2(self, box_with_array):
+ xbox = box_with_array if box_with_array is not pd.Index else np.ndarray
+
+ pi = pd.period_range('2000-01-01', periods=10, freq='D')
+
+ val = Period('2000-01-04', freq='D')
+ expected = [x > val for x in pi]
+
+ ser = tm.box_expected(pi, box_with_array)
+ expected = tm.box_expected(expected, xbox)
+ result = ser > val
+ tm.assert_equal(result, expected)
+
+ val = pi[5]
+ result = ser > val
+ expected = [x > val for x in pi]
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(result, expected)
+
+ @pytest.mark.parametrize('freq', ['M', '2M', '3M'])
+ def test_parr_cmp_period_scalar(self, freq, box_with_array):
+ # GH#13200
+ xbox = np.ndarray if box_with_array is pd.Index else box_with_array
+
+ base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'],
+ freq=freq)
+ base = tm.box_expected(base, box_with_array)
+ per = Period('2011-02', freq=freq)
+
+ exp = np.array([False, True, False, False])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base == per, exp)
+ tm.assert_equal(per == base, exp)
+
+ exp = np.array([True, False, True, True])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base != per, exp)
+ tm.assert_equal(per != base, exp)
+
+ exp = np.array([False, False, True, True])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base > per, exp)
+ tm.assert_equal(per < base, exp)
+
+ exp = np.array([True, False, False, False])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base < per, exp)
+ tm.assert_equal(per > base, exp)
+
+ exp = np.array([False, True, True, True])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base >= per, exp)
+ tm.assert_equal(per <= base, exp)
+
+ exp = np.array([True, True, False, False])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base <= per, exp)
+ tm.assert_equal(per >= base, exp)
+
+ @pytest.mark.parametrize('freq', ['M', '2M', '3M'])
+ def test_parr_cmp_pi(self, freq, box_with_array):
+ # GH#13200
+ xbox = np.ndarray if box_with_array is pd.Index else box_with_array
+
+ base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'],
+ freq=freq)
+ base = tm.box_expected(base, box_with_array)
+
+ # TODO: could also box idx?
+ idx = PeriodIndex(['2011-02', '2011-01', '2011-03', '2011-05'],
+ freq=freq)
+
+ exp = np.array([False, False, True, False])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base == idx, exp)
+
+ exp = np.array([True, True, False, True])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base != idx, exp)
+
+ exp = np.array([False, True, False, False])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base > idx, exp)
+
+ exp = np.array([True, False, False, True])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base < idx, exp)
+
+ exp = np.array([False, True, True, False])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base >= idx, exp)
+
+ exp = np.array([True, False, True, True])
+ exp = tm.box_expected(exp, xbox)
+ tm.assert_equal(base <= idx, exp)
+
+ @pytest.mark.parametrize('freq', ['M', '2M', '3M'])
+ def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array):
+ # GH#13200
+ # different base freq
+ base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'],
+ freq=freq)
+ base = tm.box_expected(base, box_with_array)
+
+ msg = "Input has different freq=A-DEC from "
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ base <= Period('2011', freq='A')
+
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ Period('2011', freq='A') >= base
+
+ # TODO: Could parametrize over boxes for idx?
+ idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A')
+ rev_msg = (r'Input has different freq=(M|2M|3M) from '
+ r'PeriodArray\(freq=A-DEC\)')
+ idx_msg = rev_msg if box_with_array is tm.to_array else msg
+ with pytest.raises(IncompatibleFrequency, match=idx_msg):
+ base <= idx
+
+ # Different frequency
+ msg = "Input has different freq=4M from "
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ base <= Period('2011', freq='4M')
+
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ Period('2011', freq='4M') >= base
+
+ idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='4M')
+ rev_msg = (r'Input has different freq=(M|2M|3M) from '
+ r'PeriodArray\(freq=4M\)')
+ idx_msg = rev_msg if box_with_array is tm.to_array else msg
+ with pytest.raises(IncompatibleFrequency, match=idx_msg):
+ base <= idx
+
+ @pytest.mark.parametrize('freq', ['M', '2M', '3M'])
+ def test_pi_cmp_nat(self, freq):
+ idx1 = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq)
+
+ result = idx1 > Period('2011-02', freq=freq)
+ exp = np.array([False, False, False, True])
+ tm.assert_numpy_array_equal(result, exp)
+ result = Period('2011-02', freq=freq) < idx1
+ tm.assert_numpy_array_equal(result, exp)
+
+ result = idx1 == Period('NaT', freq=freq)
+ exp = np.array([False, False, False, False])
+ tm.assert_numpy_array_equal(result, exp)
+ result = Period('NaT', freq=freq) == idx1
+ tm.assert_numpy_array_equal(result, exp)
+
+ result = idx1 != Period('NaT', freq=freq)
+ exp = np.array([True, True, True, True])
+ tm.assert_numpy_array_equal(result, exp)
+ result = Period('NaT', freq=freq) != idx1
+ tm.assert_numpy_array_equal(result, exp)
+
+ idx2 = PeriodIndex(['2011-02', '2011-01', '2011-04', 'NaT'], freq=freq)
+ result = idx1 < idx2
+ exp = np.array([True, False, False, False])
+ tm.assert_numpy_array_equal(result, exp)
+
+ result = idx1 == idx2
+ exp = np.array([False, False, False, False])
+ tm.assert_numpy_array_equal(result, exp)
+
+ result = idx1 != idx2
+ exp = np.array([True, True, True, True])
+ tm.assert_numpy_array_equal(result, exp)
+
+ result = idx1 == idx1
+ exp = np.array([True, True, False, True])
+ tm.assert_numpy_array_equal(result, exp)
+
+ result = idx1 != idx1
+ exp = np.array([False, False, True, False])
+ tm.assert_numpy_array_equal(result, exp)
+
+ @pytest.mark.parametrize('freq', ['M', '2M', '3M'])
+ def test_pi_cmp_nat_mismatched_freq_raises(self, freq):
+ idx1 = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq)
+
+ diff = PeriodIndex(['2011-02', '2011-01', '2011-04', 'NaT'], freq='4M')
+ msg = "Input has different freq=4M from Period(Array|Index)"
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ idx1 > diff
+
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ idx1 == diff
+
+ # TODO: De-duplicate with test_pi_cmp_nat
+ @pytest.mark.parametrize('dtype', [object, None])
+ def test_comp_nat(self, dtype):
+ left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT,
+ pd.Period('2011-01-03')])
+ right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')])
+
+ if dtype is not None:
+ left = left.astype(dtype)
+ right = right.astype(dtype)
+
+ result = left == right
+ expected = np.array([False, False, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = left != right
+ expected = np.array([True, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ expected = np.array([False, False, False])
+ tm.assert_numpy_array_equal(left == pd.NaT, expected)
+ tm.assert_numpy_array_equal(pd.NaT == right, expected)
+
+ expected = np.array([True, True, True])
+ tm.assert_numpy_array_equal(left != pd.NaT, expected)
+ tm.assert_numpy_array_equal(pd.NaT != left, expected)
+
+ expected = np.array([False, False, False])
+ tm.assert_numpy_array_equal(left < pd.NaT, expected)
+ tm.assert_numpy_array_equal(pd.NaT > left, expected)
+
+
+class TestPeriodSeriesComparisons(object):
+ def test_cmp_series_period_series_mixed_freq(self):
+ # GH#13200
+ base = Series([Period('2011', freq='A'),
+ Period('2011-02', freq='M'),
+ Period('2013', freq='A'),
+ Period('2011-04', freq='M')])
+
+ ser = Series([Period('2012', freq='A'),
+ Period('2011-01', freq='M'),
+ Period('2013', freq='A'),
+ Period('2011-05', freq='M')])
+
+ exp = Series([False, False, True, False])
+ tm.assert_series_equal(base == ser, exp)
+
+ exp = Series([True, True, False, True])
+ tm.assert_series_equal(base != ser, exp)
+
+ exp = Series([False, True, False, False])
+ tm.assert_series_equal(base > ser, exp)
+
+ exp = Series([True, False, False, True])
+ tm.assert_series_equal(base < ser, exp)
+
+ exp = Series([False, True, True, False])
+ tm.assert_series_equal(base >= ser, exp)
+
+ exp = Series([True, False, True, True])
+ tm.assert_series_equal(base <= ser, exp)
+
+
+class TestPeriodIndexSeriesComparisonConsistency(object):
+ """ Test PeriodIndex and Period Series Ops consistency """
+ # TODO: needs parametrization+de-duplication
+
+ def _check(self, values, func, expected):
+ # Test PeriodIndex and Period Series Ops consistency
+
+ idx = pd.PeriodIndex(values)
+ result = func(idx)
+
+ # check that we don't pass an unwanted type to tm.assert_equal
+ assert isinstance(expected, (pd.Index, np.ndarray))
+ tm.assert_equal(result, expected)
+
+ s = pd.Series(values)
+ result = func(s)
+
+ exp = pd.Series(expected, name=values.name)
+ tm.assert_series_equal(result, exp)
+
+ def test_pi_comp_period(self):
+ idx = PeriodIndex(['2011-01', '2011-02', '2011-03',
+ '2011-04'], freq='M', name='idx')
+
+ f = lambda x: x == pd.Period('2011-03', freq='M')
+ exp = np.array([False, False, True, False], dtype=np.bool)
+ self._check(idx, f, exp)
+ f = lambda x: pd.Period('2011-03', freq='M') == x
+ self._check(idx, f, exp)
+
+ f = lambda x: x != pd.Period('2011-03', freq='M')
+ exp = np.array([True, True, False, True], dtype=np.bool)
+ self._check(idx, f, exp)
+ f = lambda x: pd.Period('2011-03', freq='M') != x
+ self._check(idx, f, exp)
+
+ f = lambda x: pd.Period('2011-03', freq='M') >= x
+ exp = np.array([True, True, True, False], dtype=np.bool)
+ self._check(idx, f, exp)
+
+ f = lambda x: x > pd.Period('2011-03', freq='M')
+ exp = np.array([False, False, False, True], dtype=np.bool)
+ self._check(idx, f, exp)
+
+ def test_pi_comp_period_nat(self):
+ idx = PeriodIndex(['2011-01', 'NaT', '2011-03',
+ '2011-04'], freq='M', name='idx')
+
+ f = lambda x: x == pd.Period('2011-03', freq='M')
+ exp = np.array([False, False, True, False], dtype=np.bool)
+ self._check(idx, f, exp)
+ f = lambda x: pd.Period('2011-03', freq='M') == x
+ self._check(idx, f, exp)
+
+ f = lambda x: x == pd.NaT
+ exp = np.array([False, False, False, False], dtype=np.bool)
+ self._check(idx, f, exp)
+ f = lambda x: pd.NaT == x
+ self._check(idx, f, exp)
+
+ f = lambda x: x != pd.Period('2011-03', freq='M')
+ exp = np.array([True, True, False, True], dtype=np.bool)
+ self._check(idx, f, exp)
+ f = lambda x: pd.Period('2011-03', freq='M') != x
+ self._check(idx, f, exp)
+
+ f = lambda x: x != pd.NaT
+ exp = np.array([True, True, True, True], dtype=np.bool)
+ self._check(idx, f, exp)
+ f = lambda x: pd.NaT != x
+ self._check(idx, f, exp)
+
+ f = lambda x: pd.Period('2011-03', freq='M') >= x
+ exp = np.array([True, False, True, False], dtype=np.bool)
+ self._check(idx, f, exp)
+
+ f = lambda x: x < pd.Period('2011-03', freq='M')
+ exp = np.array([True, False, False, False], dtype=np.bool)
+ self._check(idx, f, exp)
+
+ f = lambda x: x > pd.NaT
+ exp = np.array([False, False, False, False], dtype=np.bool)
+ self._check(idx, f, exp)
+
+ f = lambda x: pd.NaT >= x
+ exp = np.array([False, False, False, False], dtype=np.bool)
+ self._check(idx, f, exp)
+
+
+# ------------------------------------------------------------------
+# Arithmetic
+
+class TestPeriodFrameArithmetic(object):
+
+ def test_ops_frame_period(self):
+ # GH#13043
+ df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'),
+ pd.Period('2015-02', freq='M')],
+ 'B': [pd.Period('2014-01', freq='M'),
+ pd.Period('2014-02', freq='M')]})
+ assert df['A'].dtype == 'Period[M]'
+ assert df['B'].dtype == 'Period[M]'
+
+ p = pd.Period('2015-03', freq='M')
+ off = p.freq
+        # the result dtype will be object because the operands are Period dtype
+ exp = pd.DataFrame({'A': np.array([2 * off, 1 * off], dtype=object),
+ 'B': np.array([14 * off, 13 * off], dtype=object)})
+ tm.assert_frame_equal(p - df, exp)
+ tm.assert_frame_equal(df - p, -1 * exp)
+
+ df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'),
+ pd.Period('2015-06', freq='M')],
+ 'B': [pd.Period('2015-05', freq='M'),
+ pd.Period('2015-06', freq='M')]})
+ assert df2['A'].dtype == 'Period[M]'
+ assert df2['B'].dtype == 'Period[M]'
+
+ exp = pd.DataFrame({'A': np.array([4 * off, 4 * off], dtype=object),
+ 'B': np.array([16 * off, 16 * off], dtype=object)})
+ tm.assert_frame_equal(df2 - df, exp)
+ tm.assert_frame_equal(df - df2, -1 * exp)
+
+
+class TestPeriodIndexArithmetic(object):
+ # ---------------------------------------------------------------
+ # __add__/__sub__ with PeriodIndex
+ # PeriodIndex + other is defined for integers and timedelta-like others
+ # PeriodIndex - other is defined for integers, timedelta-like others,
+ # and PeriodIndex (with matching freq)
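+    #
+    # A rough sketch of the intended behavior (illustrative):
+    #   >>> pi = pd.period_range('2000-01-01', periods=3, freq='D')
+    #   >>> pi + 1                       # ok: integer shifts by freq
+    #   >>> pi + pd.Timedelta('1 day')   # ok: timedelta-like, tick freq
+    #   >>> pi - pi                      # ok: same freq, yields offsets
+    #   >>> pi + pi                      # raises TypeError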
+
+ def test_parr_add_iadd_parr_raises(self, box_with_array):
+ rng = pd.period_range('1/1/2000', freq='D', periods=5)
+ other = pd.period_range('1/6/2000', freq='D', periods=5)
+ # TODO: parametrize over boxes for other?
+
+ rng = tm.box_expected(rng, box_with_array)
+ # An earlier implementation of PeriodIndex addition performed
+ # a set operation (union). This has since been changed to
+ # raise a TypeError. See GH#14164 and GH#13077 for historical
+ # reference.
+ with pytest.raises(TypeError):
+ rng + other
+
+ with pytest.raises(TypeError):
+ rng += other
+
+ def test_pi_sub_isub_pi(self):
+ # GH#20049
+ # For historical reference see GH#14164, GH#13077.
+ # PeriodIndex subtraction originally performed set difference,
+ # then changed to raise TypeError before being implemented in GH#20049
+ rng = pd.period_range('1/1/2000', freq='D', periods=5)
+ other = pd.period_range('1/6/2000', freq='D', periods=5)
+
+ off = rng.freq
+ expected = pd.Index([-5 * off] * 5)
+ result = rng - other
+ tm.assert_index_equal(result, expected)
+
+ rng -= other
+ tm.assert_index_equal(rng, expected)
+
+ def test_pi_sub_pi_with_nat(self):
+ rng = pd.period_range('1/1/2000', freq='D', periods=5)
+ other = rng[1:].insert(0, pd.NaT)
+ assert other[1:].equals(rng[1:])
+
+ result = rng - other
+ off = rng.freq
+ expected = pd.Index([pd.NaT, 0 * off, 0 * off, 0 * off, 0 * off])
+ tm.assert_index_equal(result, expected)
+
+ def test_parr_sub_pi_mismatched_freq(self, box_with_array):
+ rng = pd.period_range('1/1/2000', freq='D', periods=5)
+ other = pd.period_range('1/6/2000', freq='H', periods=5)
+ # TODO: parametrize over boxes for other?
+
+ rng = tm.box_expected(rng, box_with_array)
+ with pytest.raises(IncompatibleFrequency):
+ rng - other
+
+ @pytest.mark.parametrize('n', [1, 2, 3, 4])
+ def test_sub_n_gt_1_ticks(self, tick_classes, n):
+ # GH 23878
+ p1_d = '19910905'
+ p2_d = '19920406'
+ p1 = pd.PeriodIndex([p1_d], freq=tick_classes(n))
+ p2 = pd.PeriodIndex([p2_d], freq=tick_classes(n))
+
+ expected = (pd.PeriodIndex([p2_d], freq=p2.freq.base)
+ - pd.PeriodIndex([p1_d], freq=p1.freq.base))
+
+ tm.assert_index_equal((p2 - p1), expected)
+
+ @pytest.mark.parametrize('n', [1, 2, 3, 4])
+ @pytest.mark.parametrize('offset, kwd_name', [
+ (pd.offsets.YearEnd, 'month'),
+ (pd.offsets.QuarterEnd, 'startingMonth'),
+ (pd.offsets.MonthEnd, None),
+ (pd.offsets.Week, 'weekday')
+ ])
+ def test_sub_n_gt_1_offsets(self, offset, kwd_name, n):
+ # GH 23878
+ kwds = {kwd_name: 3} if kwd_name is not None else {}
+ p1_d = '19910905'
+ p2_d = '19920406'
+ freq = offset(n, normalize=False, **kwds)
+ p1 = pd.PeriodIndex([p1_d], freq=freq)
+ p2 = pd.PeriodIndex([p2_d], freq=freq)
+
+ result = p2 - p1
+ expected = (pd.PeriodIndex([p2_d], freq=freq.base)
+ - pd.PeriodIndex([p1_d], freq=freq.base))
+
+ tm.assert_index_equal(result, expected)
+
+ # -------------------------------------------------------------
+ # Invalid Operations
+
+ @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])])
+ @pytest.mark.parametrize('op', [operator.add, ops.radd,
+ operator.sub, ops.rsub])
+ def test_parr_add_sub_float_raises(self, op, other, box_with_array):
+ dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D')
+ pi = dti.to_period('D')
+ pi = tm.box_expected(pi, box_with_array)
+ with pytest.raises(TypeError):
+ op(pi, other)
+
+ @pytest.mark.parametrize('other', [pd.Timestamp.now(),
+ pd.Timestamp.now().to_pydatetime(),
+ pd.Timestamp.now().to_datetime64()])
+ def test_parr_add_sub_datetime_scalar(self, other, box_with_array):
+ # GH#23215
+ rng = pd.period_range('1/1/2000', freq='D', periods=3)
+ rng = tm.box_expected(rng, box_with_array)
+
+ with pytest.raises(TypeError):
+ rng + other
+ with pytest.raises(TypeError):
+ other + rng
+ with pytest.raises(TypeError):
+ rng - other
+ with pytest.raises(TypeError):
+ other - rng
+
+ # -----------------------------------------------------------------
+ # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64]
+
+ def test_parr_add_sub_dt64_array_raises(self, box_with_array):
+ rng = pd.period_range('1/1/2000', freq='D', periods=3)
+ dti = pd.date_range('2016-01-01', periods=3)
+ dtarr = dti.values
+
+ rng = tm.box_expected(rng, box_with_array)
+
+ with pytest.raises(TypeError):
+ rng + dtarr
+ with pytest.raises(TypeError):
+ dtarr + rng
+
+ with pytest.raises(TypeError):
+ rng - dtarr
+ with pytest.raises(TypeError):
+ dtarr - rng
+
+ def test_pi_add_sub_td64_array_non_tick_raises(self):
+ rng = pd.period_range('1/1/2000', freq='Q', periods=3)
+ tdi = pd.TimedeltaIndex(['-1 Day', '-1 Day', '-1 Day'])
+ tdarr = tdi.values
+
+ with pytest.raises(IncompatibleFrequency):
+ rng + tdarr
+ with pytest.raises(IncompatibleFrequency):
+ tdarr + rng
+
+ with pytest.raises(IncompatibleFrequency):
+ rng - tdarr
+ with pytest.raises(TypeError):
+ tdarr - rng
+
+ def test_pi_add_sub_td64_array_tick(self):
+ # PeriodIndex + Timedelta-like is allowed only with
+ # tick-like frequencies
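+        # (a tick freq such as '90D' corresponds to a fixed timedelta span,
+        # so the shift below is well-defined; non-tick freqs like 'Q' have
+        # no fixed span and raise, as tested above)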
+ rng = pd.period_range('1/1/2000', freq='90D', periods=3)
+ tdi = pd.TimedeltaIndex(['-1 Day', '-1 Day', '-1 Day'])
+ tdarr = tdi.values
+
+ expected = pd.period_range('12/31/1999', freq='90D', periods=3)
+ result = rng + tdi
+ tm.assert_index_equal(result, expected)
+ result = rng + tdarr
+ tm.assert_index_equal(result, expected)
+ result = tdi + rng
+ tm.assert_index_equal(result, expected)
+ result = tdarr + rng
+ tm.assert_index_equal(result, expected)
+
+ expected = pd.period_range('1/2/2000', freq='90D', periods=3)
+
+ result = rng - tdi
+ tm.assert_index_equal(result, expected)
+ result = rng - tdarr
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ tdarr - rng
+
+ with pytest.raises(TypeError):
+ tdi - rng
+
+ # -----------------------------------------------------------------
+ # operations with array/Index of DateOffset objects
+
+ @pytest.mark.parametrize('box', [np.array, pd.Index])
+ def test_pi_add_offset_array(self, box):
+ # GH#18849
+ pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('2016Q2')])
+ offs = box([pd.offsets.QuarterEnd(n=1, startingMonth=12),
+ pd.offsets.QuarterEnd(n=-2, startingMonth=12)])
+ expected = pd.PeriodIndex([pd.Period('2015Q2'), pd.Period('2015Q4')])
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res = pi + offs
+ tm.assert_index_equal(res, expected)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res2 = offs + pi
+ tm.assert_index_equal(res2, expected)
+
+ unanchored = np.array([pd.offsets.Hour(n=1),
+ pd.offsets.Minute(n=-2)])
+        # addition/subtraction ops with incompatible offsets should issue
+        # a PerformanceWarning and _then_ raise IncompatibleFrequency.
+ with pytest.raises(IncompatibleFrequency):
+ with tm.assert_produces_warning(PerformanceWarning):
+ pi + unanchored
+ with pytest.raises(IncompatibleFrequency):
+ with tm.assert_produces_warning(PerformanceWarning):
+ unanchored + pi
+
+ @pytest.mark.parametrize('box', [np.array, pd.Index])
+ def test_pi_sub_offset_array(self, box):
+ # GH#18824
+ pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('2016Q2')])
+ other = box([pd.offsets.QuarterEnd(n=1, startingMonth=12),
+ pd.offsets.QuarterEnd(n=-2, startingMonth=12)])
+
+ expected = PeriodIndex([pi[n] - other[n] for n in range(len(pi))])
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res = pi - other
+ tm.assert_index_equal(res, expected)
+
+ anchored = box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)])
+
+        # addition/subtraction ops with anchored offsets should issue
+        # a PerformanceWarning and _then_ raise IncompatibleFrequency.
+ with pytest.raises(IncompatibleFrequency):
+ with tm.assert_produces_warning(PerformanceWarning):
+ pi - anchored
+ with pytest.raises(IncompatibleFrequency):
+ with tm.assert_produces_warning(PerformanceWarning):
+ anchored - pi
+
+ def test_pi_add_iadd_int(self, one):
+        # the `one` fixture provides several representations of 1 (GH#19012)
+ rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10)
+ result = rng + one
+ expected = pd.period_range('2000-01-01 10:00', freq='H', periods=10)
+ tm.assert_index_equal(result, expected)
+ rng += one
+ tm.assert_index_equal(rng, expected)
+
+ def test_pi_sub_isub_int(self, one):
+ """
+ PeriodIndex.__sub__ and __isub__ with several representations of
+ the integer 1, e.g. int, long, np.int64, np.uint8, ...
+ """
+ rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10)
+ result = rng - one
+ expected = pd.period_range('2000-01-01 08:00', freq='H', periods=10)
+ tm.assert_index_equal(result, expected)
+ rng -= one
+ tm.assert_index_equal(rng, expected)
+
+ @pytest.mark.parametrize('five', [5, np.array(5, dtype=np.int64)])
+ def test_pi_sub_intlike(self, five):
+ rng = period_range('2007-01', periods=50)
+
+ result = rng - five
+ exp = rng + (-five)
+ tm.assert_index_equal(result, exp)
+
+ def test_pi_sub_isub_offset(self):
+ # offset
+ # DateOffset
+ rng = pd.period_range('2014', '2024', freq='A')
+ result = rng - pd.offsets.YearEnd(5)
+ expected = pd.period_range('2009', '2019', freq='A')
+ tm.assert_index_equal(result, expected)
+ rng -= pd.offsets.YearEnd(5)
+ tm.assert_index_equal(rng, expected)
+
+ rng = pd.period_range('2014-01', '2016-12', freq='M')
+ result = rng - pd.offsets.MonthEnd(5)
+ expected = pd.period_range('2013-08', '2016-07', freq='M')
+ tm.assert_index_equal(result, expected)
+
+ rng -= pd.offsets.MonthEnd(5)
+ tm.assert_index_equal(rng, expected)
+
+ def test_pi_add_offset_n_gt1(self, box_transpose_fail):
+ # GH#23215
+ # add offset to PeriodIndex with freq.n > 1
+ box, transpose = box_transpose_fail
+
+ per = pd.Period('2016-01', freq='2M')
+ pi = pd.PeriodIndex([per])
+
+ expected = pd.PeriodIndex(['2016-03'], freq='2M')
+
+ pi = tm.box_expected(pi, box, transpose=transpose)
+ expected = tm.box_expected(expected, box, transpose=transpose)
+
+ result = pi + per.freq
+ tm.assert_equal(result, expected)
+
+ result = per.freq + pi
+ tm.assert_equal(result, expected)
+
+ def test_pi_add_offset_n_gt1_not_divisible(self, box_with_array):
+ # GH#23215
+ # PeriodIndex with freq.n > 1 add offset with offset.n % freq.n != 0
+ pi = pd.PeriodIndex(['2016-01'], freq='2M')
+ expected = pd.PeriodIndex(['2016-04'], freq='2M')
+
+ # FIXME: with transposing these tests fail
+ pi = tm.box_expected(pi, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ result = pi + to_offset('3M')
+ tm.assert_equal(result, expected)
+
+ result = to_offset('3M') + pi
+ tm.assert_equal(result, expected)
+
+ # ---------------------------------------------------------------
+ # __add__/__sub__ with integer arrays
+
+ @pytest.mark.parametrize('int_holder', [np.array, pd.Index])
+ @pytest.mark.parametrize('op', [operator.add, ops.radd])
+ def test_pi_add_intarray(self, int_holder, op):
+ # GH#19959
+ pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('NaT')])
+ other = int_holder([4, -1])
+
+ result = op(pi, other)
+ expected = pd.PeriodIndex([pd.Period('2016Q1'), pd.Period('NaT')])
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('int_holder', [np.array, pd.Index])
+ def test_pi_sub_intarray(self, int_holder):
+ # GH#19959
+ pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('NaT')])
+ other = int_holder([4, -1])
+
+ result = pi - other
+ expected = pd.PeriodIndex([pd.Period('2014Q1'), pd.Period('NaT')])
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ other - pi
+
+ # ---------------------------------------------------------------
+ # Timedelta-like (timedelta, timedelta64, Timedelta, Tick)
+ # TODO: Some of these are misnomers because of non-Tick DateOffsets
+
+ def test_pi_add_timedeltalike_minute_gt1(self, three_days):
+        # GH#23031 adding a time-delta-like offset to a PeriodArray that has
+        # a two-day ('2D') frequency, i.e. n != 1. A more general case is
+        # tested below in test_pi_add_timedeltalike_tick_gt1, but here we
+        # write out the expected result more explicitly.
+ other = three_days
+ rng = pd.period_range('2014-05-01', periods=3, freq='2D')
+
+ expected = pd.PeriodIndex(['2014-05-04', '2014-05-06', '2014-05-08'],
+ freq='2D')
+
+ result = rng + other
+ tm.assert_index_equal(result, expected)
+
+ result = other + rng
+ tm.assert_index_equal(result, expected)
+
+ # subtraction
+ expected = pd.PeriodIndex(['2014-04-28', '2014-04-30', '2014-05-02'],
+ freq='2D')
+ result = rng - other
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ other - rng
+
+ @pytest.mark.parametrize('freqstr', ['5ns', '5us', '5ms',
+ '5s', '5T', '5h', '5d'])
+ def test_pi_add_timedeltalike_tick_gt1(self, three_days, freqstr):
+ # GH#23031 adding a time-delta-like offset to a PeriodArray that has
+ # tick-like frequency with n != 1
+ other = three_days
+ rng = pd.period_range('2014-05-01', periods=6, freq=freqstr)
+
+ expected = pd.period_range(rng[0] + other, periods=6, freq=freqstr)
+
+ result = rng + other
+ tm.assert_index_equal(result, expected)
+
+ result = other + rng
+ tm.assert_index_equal(result, expected)
+
+ # subtraction
+ expected = pd.period_range(rng[0] - other, periods=6, freq=freqstr)
+ result = rng - other
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ other - rng
+
+ def test_pi_add_iadd_timedeltalike_daily(self, three_days):
+ # Tick
+ other = three_days
+ rng = pd.period_range('2014-05-01', '2014-05-15', freq='D')
+ expected = pd.period_range('2014-05-04', '2014-05-18', freq='D')
+
+ result = rng + other
+ tm.assert_index_equal(result, expected)
+
+ rng += other
+ tm.assert_index_equal(rng, expected)
+
+ def test_pi_sub_isub_timedeltalike_daily(self, three_days):
+ # Tick-like 3 Days
+ other = three_days
+ rng = pd.period_range('2014-05-01', '2014-05-15', freq='D')
+ expected = pd.period_range('2014-04-28', '2014-05-12', freq='D')
+
+ result = rng - other
+ tm.assert_index_equal(result, expected)
+
+ rng -= other
+ tm.assert_index_equal(rng, expected)
+
+ def test_pi_add_sub_timedeltalike_freq_mismatch_daily(self, not_daily):
+ other = not_daily
+ rng = pd.period_range('2014-05-01', '2014-05-15', freq='D')
+ msg = 'Input has different freq(=.+)? from Period.*?\\(freq=D\\)'
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng + other
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng += other
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng - other
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng -= other
+
+ def test_pi_add_iadd_timedeltalike_hourly(self, two_hours):
+ other = two_hours
+ rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H')
+ expected = pd.period_range('2014-01-01 12:00', '2014-01-05 12:00',
+ freq='H')
+
+ result = rng + other
+ tm.assert_index_equal(result, expected)
+
+ rng += other
+ tm.assert_index_equal(rng, expected)
+
+ def test_pi_add_timedeltalike_mismatched_freq_hourly(self, not_hourly):
+ other = not_hourly
+ rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H')
+ msg = 'Input has different freq(=.+)? from Period.*?\\(freq=H\\)'
+
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng + other
+
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng += other
+
+ def test_pi_sub_isub_timedeltalike_hourly(self, two_hours):
+ other = two_hours
+ rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H')
+ expected = pd.period_range('2014-01-01 08:00', '2014-01-05 08:00',
+ freq='H')
+
+ result = rng - other
+ tm.assert_index_equal(result, expected)
+
+ rng -= other
+ tm.assert_index_equal(rng, expected)
+
+ def test_add_iadd_timedeltalike_annual(self):
+ # offset
+ # DateOffset
+ rng = pd.period_range('2014', '2024', freq='A')
+ result = rng + pd.offsets.YearEnd(5)
+ expected = pd.period_range('2019', '2029', freq='A')
+ tm.assert_index_equal(result, expected)
+ rng += pd.offsets.YearEnd(5)
+ tm.assert_index_equal(rng, expected)
+
+ def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self,
+ mismatched_freq):
+ other = mismatched_freq
+ rng = pd.period_range('2014', '2024', freq='A')
+ msg = ('Input has different freq(=.+)? '
+ 'from Period.*?\\(freq=A-DEC\\)')
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng + other
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng += other
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng - other
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng -= other
+
+ def test_pi_add_iadd_timedeltalike_M(self):
+ rng = pd.period_range('2014-01', '2016-12', freq='M')
+ expected = pd.period_range('2014-06', '2017-05', freq='M')
+
+ result = rng + pd.offsets.MonthEnd(5)
+ tm.assert_index_equal(result, expected)
+
+ rng += pd.offsets.MonthEnd(5)
+ tm.assert_index_equal(rng, expected)
+
+ def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self,
+ mismatched_freq):
+ other = mismatched_freq
+ rng = pd.period_range('2014-01', '2016-12', freq='M')
+ msg = 'Input has different freq(=.+)? from Period.*?\\(freq=M\\)'
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng + other
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng += other
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng - other
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ rng -= other
+
+ def test_parr_add_sub_td64_nat(self, box_transpose_fail):
+ # GH#23320 special handling for timedelta64("NaT")
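+        # (td64-NaT is special-cased: it propagates NaT elementwise instead
+        # of going through the usual frequency compatibility checks)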
+ box, transpose = box_transpose_fail
+
+ pi = pd.period_range("1994-04-01", periods=9, freq="19D")
+ other = np.timedelta64("NaT")
+ expected = pd.PeriodIndex(["NaT"] * 9, freq="19D")
+
+ obj = tm.box_expected(pi, box, transpose=transpose)
+ expected = tm.box_expected(expected, box, transpose=transpose)
+
+ result = obj + other
+ tm.assert_equal(result, expected)
+ result = other + obj
+ tm.assert_equal(result, expected)
+ result = obj - other
+ tm.assert_equal(result, expected)
+ with pytest.raises(TypeError):
+ other - obj
+
+
+class TestPeriodSeriesArithmetic(object):
+ def test_ops_series_timedelta(self):
+ # GH#13043
+ ser = pd.Series([pd.Period('2015-01-01', freq='D'),
+ pd.Period('2015-01-02', freq='D')], name='xxx')
+ assert ser.dtype == 'Period[D]'
+
+ expected = pd.Series([pd.Period('2015-01-02', freq='D'),
+ pd.Period('2015-01-03', freq='D')], name='xxx')
+
+ result = ser + pd.Timedelta('1 days')
+ tm.assert_series_equal(result, expected)
+
+ result = pd.Timedelta('1 days') + ser
+ tm.assert_series_equal(result, expected)
+
+ result = ser + pd.tseries.offsets.Day()
+ tm.assert_series_equal(result, expected)
+
+ result = pd.tseries.offsets.Day() + ser
+ tm.assert_series_equal(result, expected)
+
+ def test_ops_series_period(self):
+ # GH#13043
+ ser = pd.Series([pd.Period('2015-01-01', freq='D'),
+ pd.Period('2015-01-02', freq='D')], name='xxx')
+ assert ser.dtype == "Period[D]"
+
+ per = pd.Period('2015-01-10', freq='D')
+ off = per.freq
+        # the result dtype will be object because the operands are Period dtype
+ expected = pd.Series([9 * off, 8 * off], name='xxx', dtype=object)
+ tm.assert_series_equal(per - ser, expected)
+ tm.assert_series_equal(ser - per, -1 * expected)
+
+ s2 = pd.Series([pd.Period('2015-01-05', freq='D'),
+ pd.Period('2015-01-04', freq='D')], name='xxx')
+ assert s2.dtype == "Period[D]"
+
+ expected = pd.Series([4 * off, 2 * off], name='xxx', dtype=object)
+ tm.assert_series_equal(s2 - ser, expected)
+ tm.assert_series_equal(ser - s2, -1 * expected)
+
+
+class TestPeriodIndexSeriesMethods(object):
+ """ Test PeriodIndex and Period Series Ops consistency """
+
+ def _check(self, values, func, expected):
+ idx = pd.PeriodIndex(values)
+ result = func(idx)
+ tm.assert_equal(result, expected)
+
+ ser = pd.Series(values)
+ result = func(ser)
+
+ exp = pd.Series(expected, name=values.name)
+ tm.assert_series_equal(result, exp)
+
+ def test_pi_ops(self):
+ idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'],
+ freq='M', name='idx')
+
+ expected = PeriodIndex(['2011-03', '2011-04', '2011-05', '2011-06'],
+ freq='M', name='idx')
+
+ self._check(idx, lambda x: x + 2, expected)
+ self._check(idx, lambda x: 2 + x, expected)
+
+ self._check(idx + 2, lambda x: x - 2, idx)
+
+ result = idx - Period('2011-01', freq='M')
+ off = idx.freq
+ exp = pd.Index([0 * off, 1 * off, 2 * off, 3 * off], name='idx')
+ tm.assert_index_equal(result, exp)
+
+ result = Period('2011-01', freq='M') - idx
+ exp = pd.Index([0 * off, -1 * off, -2 * off, -3 * off], name='idx')
+ tm.assert_index_equal(result, exp)
+
+ @pytest.mark.parametrize('ng', ["str", 1.5])
+ def test_parr_ops_errors(self, ng, box_with_array):
+ idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'],
+ freq='M', name='idx')
+ obj = tm.box_expected(idx, box_with_array)
+
+ msg = r"unsupported operand type\(s\)"
+ with pytest.raises(TypeError, match=msg):
+ obj + ng
+
+ with pytest.raises(TypeError):
+ # error message differs between PY2 and 3
+ ng + obj
+
+ with pytest.raises(TypeError, match=msg):
+ obj - ng
+
+ with pytest.raises(TypeError):
+ np.add(obj, ng)
+
+ with pytest.raises(TypeError):
+ np.add(ng, obj)
+
+ with pytest.raises(TypeError):
+ np.subtract(obj, ng)
+
+ with pytest.raises(TypeError):
+ np.subtract(ng, obj)
+
+ def test_pi_ops_nat(self):
+ idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'],
+ freq='M', name='idx')
+ expected = PeriodIndex(['2011-03', '2011-04', 'NaT', '2011-06'],
+ freq='M', name='idx')
+
+ self._check(idx, lambda x: x + 2, expected)
+ self._check(idx, lambda x: 2 + x, expected)
+ self._check(idx, lambda x: np.add(x, 2), expected)
+
+ self._check(idx + 2, lambda x: x - 2, idx)
+ self._check(idx + 2, lambda x: np.subtract(x, 2), idx)
+
+ # freq with mult
+ idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'],
+ freq='2M', name='idx')
+ expected = PeriodIndex(['2011-07', '2011-08', 'NaT', '2011-10'],
+ freq='2M', name='idx')
+
+ self._check(idx, lambda x: x + 3, expected)
+ self._check(idx, lambda x: 3 + x, expected)
+ self._check(idx, lambda x: np.add(x, 3), expected)
+
+ self._check(idx + 3, lambda x: x - 3, idx)
+ self._check(idx + 3, lambda x: np.subtract(x, 3), idx)
+
+ def test_pi_ops_array_int(self):
+
+ idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'],
+ freq='M', name='idx')
+ f = lambda x: x + np.array([1, 2, 3, 4])
+ exp = PeriodIndex(['2011-02', '2011-04', 'NaT', '2011-08'],
+ freq='M', name='idx')
+ self._check(idx, f, exp)
+
+ f = lambda x: np.add(x, np.array([4, -1, 1, 2]))
+ exp = PeriodIndex(['2011-05', '2011-01', 'NaT', '2011-06'],
+ freq='M', name='idx')
+ self._check(idx, f, exp)
+
+ f = lambda x: x - np.array([1, 2, 3, 4])
+ exp = PeriodIndex(['2010-12', '2010-12', 'NaT', '2010-12'],
+ freq='M', name='idx')
+ self._check(idx, f, exp)
+
+ f = lambda x: np.subtract(x, np.array([3, 2, 3, -2]))
+ exp = PeriodIndex(['2010-10', '2010-12', 'NaT', '2011-06'],
+ freq='M', name='idx')
+ self._check(idx, f, exp)
+
+ def test_pi_ops_offset(self):
+ idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01',
+ '2011-04-01'], freq='D', name='idx')
+ f = lambda x: x + pd.offsets.Day()
+ exp = PeriodIndex(['2011-01-02', '2011-02-02', '2011-03-02',
+ '2011-04-02'], freq='D', name='idx')
+ self._check(idx, f, exp)
+
+ f = lambda x: x + pd.offsets.Day(2)
+ exp = PeriodIndex(['2011-01-03', '2011-02-03', '2011-03-03',
+ '2011-04-03'], freq='D', name='idx')
+ self._check(idx, f, exp)
+
+ f = lambda x: x - pd.offsets.Day(2)
+ exp = PeriodIndex(['2010-12-30', '2011-01-30', '2011-02-27',
+ '2011-03-30'], freq='D', name='idx')
+ self._check(idx, f, exp)
+
+ def test_pi_offset_errors(self):
+ idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01',
+ '2011-04-01'], freq='D', name='idx')
+ ser = pd.Series(idx)
+
+ # Series op is applied per Period instance, thus error is raised
+ # from Period
+ for obj in [idx, ser]:
+ msg = r"Input has different freq=2H from Period.*?\(freq=D\)"
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ obj + pd.offsets.Hour(2)
+
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ pd.offsets.Hour(2) + obj
+
+ msg = r"Input has different freq=-2H from Period.*?\(freq=D\)"
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ obj - pd.offsets.Hour(2)
+
+ def test_pi_sub_period(self):
+ # GH#13071
+ idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'],
+ freq='M', name='idx')
+
+ result = idx - pd.Period('2012-01', freq='M')
+ off = idx.freq
+ exp = pd.Index([-12 * off, -11 * off, -10 * off, -9 * off], name='idx')
+ tm.assert_index_equal(result, exp)
+
+ result = np.subtract(idx, pd.Period('2012-01', freq='M'))
+ tm.assert_index_equal(result, exp)
+
+ result = pd.Period('2012-01', freq='M') - idx
+ exp = pd.Index([12 * off, 11 * off, 10 * off, 9 * off], name='idx')
+ tm.assert_index_equal(result, exp)
+
+ result = np.subtract(pd.Period('2012-01', freq='M'), idx)
+ tm.assert_index_equal(result, exp)
+
+ exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx')
+ tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp)
+ tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp)
+
+ def test_pi_sub_pdnat(self):
+ # GH#13071
+ idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'],
+ freq='M', name='idx')
+ exp = pd.TimedeltaIndex([pd.NaT] * 4, name='idx')
+ tm.assert_index_equal(pd.NaT - idx, exp)
+ tm.assert_index_equal(idx - pd.NaT, exp)
+
+ def test_pi_sub_period_nat(self):
+ # GH#13071
+ idx = PeriodIndex(['2011-01', 'NaT', '2011-03', '2011-04'],
+ freq='M', name='idx')
+
+ result = idx - pd.Period('2012-01', freq='M')
+ off = idx.freq
+ exp = pd.Index([-12 * off, pd.NaT, -10 * off, -9 * off], name='idx')
+ tm.assert_index_equal(result, exp)
+
+ result = pd.Period('2012-01', freq='M') - idx
+ exp = pd.Index([12 * off, pd.NaT, 10 * off, 9 * off], name='idx')
+ tm.assert_index_equal(result, exp)
+
+ exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx')
+ tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp)
+ tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/arithmetic/test_timedelta64.py b/contrib/python/pandas/py2/pandas/tests/arithmetic/test_timedelta64.py
new file mode 100644
index 00000000000..c31d7acad31
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arithmetic/test_timedelta64.py
@@ -0,0 +1,1977 @@
+# -*- coding: utf-8 -*-
+# Arithmetic tests for DataFrame/Series/Index/Array classes that should
+# behave identically.
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+
+from pandas.errors import NullFrequencyError, PerformanceWarning
+
+import pandas as pd
+from pandas import (
+ DataFrame, DatetimeIndex, NaT, Series, Timedelta, TimedeltaIndex,
+ Timestamp, timedelta_range)
+import pandas.util.testing as tm
+
+
+def get_upcast_box(box, vector):
+ """
+    Given a box class and an example operand, return the box type that
+    takes priority in a binary operation.
+ """
+ if box is DataFrame or isinstance(vector, DataFrame):
+ return DataFrame
+ if box is Series or isinstance(vector, Series):
+ return Series
+ if box is pd.Index or isinstance(vector, pd.Index):
+ return pd.Index
+ return box
+
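+# For instance, mixing an Index box with a Series operand upcasts to
+# Series (a doctest-style illustration of the helper above, not executed):
+#   >>> get_upcast_box(pd.Index, Series([1.0]))
+#   <class 'pandas.core.series.Series'>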
+
+# ------------------------------------------------------------------
+# Timedelta64[ns] dtype Comparisons
+
+class TestTimedelta64ArrayComparisons(object):
+ # TODO: All of these need to be parametrized over box
+
+ def test_compare_timedelta_series(self):
+        # regression test for GH#5963
+ s = pd.Series([timedelta(days=1), timedelta(days=2)])
+ actual = s > timedelta(days=1)
+ expected = pd.Series([False, True])
+ tm.assert_series_equal(actual, expected)
+
+ def test_tdi_cmp_str_invalid(self, box_with_array):
+ # GH#13624
+ xbox = box_with_array if box_with_array is not pd.Index else np.ndarray
+ tdi = TimedeltaIndex(['1 day', '2 days'])
+ tdarr = tm.box_expected(tdi, box_with_array)
+
+ for left, right in [(tdarr, 'a'), ('a', tdarr)]:
+ with pytest.raises(TypeError):
+ left > right
+ with pytest.raises(TypeError):
+ left >= right
+ with pytest.raises(TypeError):
+ left < right
+ with pytest.raises(TypeError):
+ left <= right
+
+ result = left == right
+ expected = np.array([False, False], dtype=bool)
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(result, expected)
+
+ result = left != right
+ expected = np.array([True, True], dtype=bool)
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', [None, object])
+ def test_comp_nat(self, dtype):
+ left = pd.TimedeltaIndex([pd.Timedelta('1 days'), pd.NaT,
+ pd.Timedelta('3 days')])
+ right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')])
+
+ lhs, rhs = left, right
+ if dtype is object:
+ lhs, rhs = left.astype(object), right.astype(object)
+
+ result = rhs == lhs
+ expected = np.array([False, False, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = rhs != lhs
+ expected = np.array([True, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ expected = np.array([False, False, False])
+ tm.assert_numpy_array_equal(lhs == pd.NaT, expected)
+ tm.assert_numpy_array_equal(pd.NaT == rhs, expected)
+
+ expected = np.array([True, True, True])
+ tm.assert_numpy_array_equal(lhs != pd.NaT, expected)
+ tm.assert_numpy_array_equal(pd.NaT != lhs, expected)
+
+ expected = np.array([False, False, False])
+ tm.assert_numpy_array_equal(lhs < pd.NaT, expected)
+ tm.assert_numpy_array_equal(pd.NaT > lhs, expected)
+
+ def test_comparisons_nat(self):
+ tdidx1 = pd.TimedeltaIndex(['1 day', pd.NaT, '1 day 00:00:01', pd.NaT,
+ '1 day 00:00:01', '5 day 00:00:03'])
+ tdidx2 = pd.TimedeltaIndex(['2 day', '2 day', pd.NaT, pd.NaT,
+ '1 day 00:00:02', '5 days 00:00:03'])
+ tdarr = np.array([np.timedelta64(2, 'D'),
+ np.timedelta64(2, 'D'), np.timedelta64('nat'),
+ np.timedelta64('nat'),
+ np.timedelta64(1, 'D') + np.timedelta64(2, 's'),
+ np.timedelta64(5, 'D') + np.timedelta64(3, 's')])
+
+ cases = [(tdidx1, tdidx2), (tdidx1, tdarr)]
+
+        # Check pd.NaT is handled the same as np.nan
+ for idx1, idx2 in cases:
+
+ result = idx1 < idx2
+ expected = np.array([True, False, False, False, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx2 > idx1
+ expected = np.array([True, False, False, False, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 <= idx2
+ expected = np.array([True, False, False, False, True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx2 >= idx1
+ expected = np.array([True, False, False, False, True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 == idx2
+ expected = np.array([False, False, False, False, False, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = idx1 != idx2
+ expected = np.array([True, True, True, True, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ # TODO: better name
+ def test_comparisons_coverage(self):
+ rng = timedelta_range('1 days', periods=10)
+
+ result = rng < rng[3]
+ expected = np.array([True, True, True] + [False] * 7)
+ tm.assert_numpy_array_equal(result, expected)
+
+ # raise TypeError for now
+ with pytest.raises(TypeError):
+ rng < rng[3].value
+
+ result = rng == list(rng)
+ exp = rng == rng
+ tm.assert_numpy_array_equal(result, exp)
+
+
+# ------------------------------------------------------------------
+# Timedelta64[ns] dtype Arithmetic Operations
+
+class TestTimedelta64ArithmeticUnsorted(object):
+ # Tests moved from type-specific test files but not
+ # yet sorted/parametrized/de-duplicated
+
+ def test_ufunc_coercions(self):
+ # normal ops are also tested in tseries/test_timedeltas.py
+ idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'],
+ freq='2H', name='x')
+
+ for result in [idx * 2, np.multiply(idx, 2)]:
+ assert isinstance(result, TimedeltaIndex)
+ exp = TimedeltaIndex(['4H', '8H', '12H', '16H', '20H'],
+ freq='4H', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == '4H'
+
+ for result in [idx / 2, np.divide(idx, 2)]:
+ assert isinstance(result, TimedeltaIndex)
+ exp = TimedeltaIndex(['1H', '2H', '3H', '4H', '5H'],
+ freq='H', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == 'H'
+
+ idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'],
+ freq='2H', name='x')
+ for result in [-idx, np.negative(idx)]:
+ assert isinstance(result, TimedeltaIndex)
+ exp = TimedeltaIndex(['-2H', '-4H', '-6H', '-8H', '-10H'],
+ freq='-2H', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == '-2H'
+
+ idx = TimedeltaIndex(['-2H', '-1H', '0H', '1H', '2H'],
+ freq='H', name='x')
+ for result in [abs(idx), np.absolute(idx)]:
+ assert isinstance(result, TimedeltaIndex)
+ exp = TimedeltaIndex(['2H', '1H', '0H', '1H', '2H'],
+ freq=None, name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq is None
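+
+        # abs() of the signed range is not monotonic, so no freq can be
+        # inferred and the result's freq is None, unlike the cases above.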
+
+ def test_subtraction_ops(self):
+ # with datetimes/timedelta and tdi/dti
+ tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo')
+ dti = pd.date_range('20130101', periods=3, name='bar')
+ td = Timedelta('1 days')
+ dt = Timestamp('20130101')
+
+ pytest.raises(TypeError, lambda: tdi - dt)
+ pytest.raises(TypeError, lambda: tdi - dti)
+ pytest.raises(TypeError, lambda: td - dt)
+ pytest.raises(TypeError, lambda: td - dti)
+
+ result = dt - dti
+ expected = TimedeltaIndex(['0 days', '-1 days', '-2 days'], name='bar')
+ tm.assert_index_equal(result, expected)
+
+ result = dti - dt
+ expected = TimedeltaIndex(['0 days', '1 days', '2 days'], name='bar')
+ tm.assert_index_equal(result, expected)
+
+ result = tdi - td
+ expected = TimedeltaIndex(['0 days', pd.NaT, '1 days'], name='foo')
+ tm.assert_index_equal(result, expected, check_names=False)
+
+ result = td - tdi
+ expected = TimedeltaIndex(['0 days', pd.NaT, '-1 days'], name='foo')
+ tm.assert_index_equal(result, expected, check_names=False)
+
+ result = dti - td
+ expected = DatetimeIndex(
+ ['20121231', '20130101', '20130102'], name='bar')
+ tm.assert_index_equal(result, expected, check_names=False)
+
+ result = dt - tdi
+ expected = DatetimeIndex(['20121231', pd.NaT, '20121230'], name='foo')
+ tm.assert_index_equal(result, expected)
+
+ def test_subtraction_ops_with_tz(self):
+
+ # check that dt/dti subtraction ops with tz are validated
+ dti = pd.date_range('20130101', periods=3)
+ ts = Timestamp('20130101')
+ dt = ts.to_pydatetime()
+ dti_tz = pd.date_range('20130101', periods=3).tz_localize('US/Eastern')
+ ts_tz = Timestamp('20130101').tz_localize('US/Eastern')
+ ts_tz2 = Timestamp('20130101').tz_localize('CET')
+ dt_tz = ts_tz.to_pydatetime()
+ td = Timedelta('1 days')
+
+ def _check(result, expected):
+ assert result == expected
+ assert isinstance(result, Timedelta)
+
+ # scalars
+ result = ts - ts
+ expected = Timedelta('0 days')
+ _check(result, expected)
+
+ result = dt_tz - ts_tz
+ expected = Timedelta('0 days')
+ _check(result, expected)
+
+ result = ts_tz - dt_tz
+ expected = Timedelta('0 days')
+ _check(result, expected)
+
+ # tz mismatches
+ pytest.raises(TypeError, lambda: dt_tz - ts)
+ pytest.raises(TypeError, lambda: dt_tz - dt)
+ pytest.raises(TypeError, lambda: dt_tz - ts_tz2)
+ pytest.raises(TypeError, lambda: dt - dt_tz)
+ pytest.raises(TypeError, lambda: ts - dt_tz)
+ pytest.raises(TypeError, lambda: ts_tz2 - ts)
+ pytest.raises(TypeError, lambda: ts_tz2 - dt)
+ pytest.raises(TypeError, lambda: ts_tz - ts_tz2)
+
+ # with dti
+ pytest.raises(TypeError, lambda: dti - ts_tz)
+ pytest.raises(TypeError, lambda: dti_tz - ts)
+ pytest.raises(TypeError, lambda: dti_tz - ts_tz2)
+
+ result = dti_tz - dt_tz
+ expected = TimedeltaIndex(['0 days', '1 days', '2 days'])
+ tm.assert_index_equal(result, expected)
+
+ result = dt_tz - dti_tz
+ expected = TimedeltaIndex(['0 days', '-1 days', '-2 days'])
+ tm.assert_index_equal(result, expected)
+
+ result = dti_tz - ts_tz
+ expected = TimedeltaIndex(['0 days', '1 days', '2 days'])
+ tm.assert_index_equal(result, expected)
+
+ result = ts_tz - dti_tz
+ expected = TimedeltaIndex(['0 days', '-1 days', '-2 days'])
+ tm.assert_index_equal(result, expected)
+
+ result = td - td
+ expected = Timedelta('0 days')
+ _check(result, expected)
+
+ result = dti_tz - td
+ expected = DatetimeIndex(
+ ['20121231', '20130101', '20130102'], tz='US/Eastern')
+ tm.assert_index_equal(result, expected)
+
+ def test_dti_tdi_numeric_ops(self):
+ # These are normally union/diff set-like ops
+ tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo')
+ dti = pd.date_range('20130101', periods=3, name='bar')
+
+ # TODO(wesm): unused?
+ # td = Timedelta('1 days')
+ # dt = Timestamp('20130101')
+
+ result = tdi - tdi
+ expected = TimedeltaIndex(['0 days', pd.NaT, '0 days'], name='foo')
+ tm.assert_index_equal(result, expected)
+
+ result = tdi + tdi
+ expected = TimedeltaIndex(['2 days', pd.NaT, '4 days'], name='foo')
+ tm.assert_index_equal(result, expected)
+
+ result = dti - tdi # name will be reset
+ expected = DatetimeIndex(['20121231', pd.NaT, '20130101'])
+ tm.assert_index_equal(result, expected)
+
+ def test_addition_ops(self):
+ # with datetimes/timedelta and tdi/dti
+ tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo')
+ dti = pd.date_range('20130101', periods=3, name='bar')
+ td = Timedelta('1 days')
+ dt = Timestamp('20130101')
+
+ result = tdi + dt
+ expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo')
+ tm.assert_index_equal(result, expected)
+
+ result = dt + tdi
+ expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo')
+ tm.assert_index_equal(result, expected)
+
+ result = td + tdi
+ expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo')
+ tm.assert_index_equal(result, expected)
+
+ result = tdi + td
+ expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo')
+ tm.assert_index_equal(result, expected)
+
+ # unequal length
+ pytest.raises(ValueError, lambda: tdi + dti[0:1])
+ pytest.raises(ValueError, lambda: tdi[0:1] + dti)
+
+        # integer index: addition is treated as a shift, which requires a freq
+ with pytest.raises(NullFrequencyError):
+ tdi + pd.Int64Index([1, 2, 3])
+
+ # this is a union!
+ # pytest.raises(TypeError, lambda : Int64Index([1,2,3]) + tdi)
+
+ result = tdi + dti # name will be reset
+ expected = DatetimeIndex(['20130102', pd.NaT, '20130105'])
+ tm.assert_index_equal(result, expected)
+
+ result = dti + tdi # name will be reset
+ expected = DatetimeIndex(['20130102', pd.NaT, '20130105'])
+ tm.assert_index_equal(result, expected)
+
+ result = dt + td
+ expected = Timestamp('20130102')
+ assert result == expected
+
+ result = td + dt
+ expected = Timestamp('20130102')
+ assert result == expected
+
+ # TODO: Needs more informative name, probably split up into
+ # more targeted tests
+ @pytest.mark.parametrize('freq', ['D', 'B'])
+ def test_timedelta(self, freq):
+ index = pd.date_range('1/1/2000', periods=50, freq=freq)
+
+ shifted = index + timedelta(1)
+ back = shifted + timedelta(-1)
+ tm.assert_index_equal(index, back)
+
+ if freq == 'D':
+ expected = pd.tseries.offsets.Day(1)
+ assert index.freq == expected
+ assert shifted.freq == expected
+ assert back.freq == expected
+ else: # freq == 'B'
+ assert index.freq == pd.tseries.offsets.BusinessDay(1)
+ assert shifted.freq is None
+ assert back.freq == pd.tseries.offsets.BusinessDay(1)
+
+ result = index - timedelta(1)
+ expected = index + timedelta(-1)
+ tm.assert_index_equal(result, expected)
+
+ # GH#4134, buggy with timedeltas
+ rng = pd.date_range('2013', '2014')
+ s = Series(rng)
+ result1 = rng - pd.offsets.Hour(1)
+ result2 = DatetimeIndex(s - np.timedelta64(100000000))
+ result3 = rng - np.timedelta64(100000000)
+ result4 = DatetimeIndex(s - pd.offsets.Hour(1))
+ tm.assert_index_equal(result1, result4)
+ tm.assert_index_equal(result2, result3)
+
+
+class TestAddSubNaTMasking(object):
+ # TODO: parametrize over boxes
+
+ def test_tdi_add_timestamp_nat_masking(self):
+ # GH#17991 checking for overflow-masking with NaT
+ tdinat = pd.to_timedelta(['24658 days 11:15:00', 'NaT'])
+
+ tsneg = Timestamp('1950-01-01')
+ ts_neg_variants = [tsneg,
+ tsneg.to_pydatetime(),
+ tsneg.to_datetime64().astype('datetime64[ns]'),
+ tsneg.to_datetime64().astype('datetime64[D]')]
+
+ tspos = Timestamp('1980-01-01')
+ ts_pos_variants = [tspos,
+ tspos.to_pydatetime(),
+ tspos.to_datetime64().astype('datetime64[ns]'),
+ tspos.to_datetime64().astype('datetime64[D]')]
+
+ for variant in ts_neg_variants + ts_pos_variants:
+ res = tdinat + variant
+ assert res[1] is pd.NaT
+
+ def test_tdi_add_overflow(self):
+ # See GH#14068
+ msg = "too (big|large) to convert"
+ with pytest.raises(OverflowError, match=msg):
+ pd.to_timedelta(106580, 'D') + Timestamp('2000')
+ with pytest.raises(OverflowError, match=msg):
+ Timestamp('2000') + pd.to_timedelta(106580, 'D')
+
+ _NaT = int(pd.NaT) + 1
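+        # int(pd.NaT) is the int64 NaT sentinel (np.iinfo('int64').min);
+        # adding 1 yields a valid-but-extreme value that is NOT masked as
+        # NaT, so the ops below genuinely overflow instead of propagating NaT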
+ msg = "Overflow in int64 addition"
+ with pytest.raises(OverflowError, match=msg):
+ pd.to_timedelta([106580], 'D') + Timestamp('2000')
+ with pytest.raises(OverflowError, match=msg):
+ Timestamp('2000') + pd.to_timedelta([106580], 'D')
+ with pytest.raises(OverflowError, match=msg):
+ pd.to_timedelta([_NaT]) - Timedelta('1 days')
+ with pytest.raises(OverflowError, match=msg):
+ pd.to_timedelta(['5 days', _NaT]) - Timedelta('1 days')
+ with pytest.raises(OverflowError, match=msg):
+ (pd.to_timedelta([_NaT, '5 days', '1 hours']) -
+ pd.to_timedelta(['7 seconds', _NaT, '4 hours']))
+
+ # These should not overflow!
+ exp = TimedeltaIndex([pd.NaT])
+ result = pd.to_timedelta([pd.NaT]) - Timedelta('1 days')
+ tm.assert_index_equal(result, exp)
+
+ exp = TimedeltaIndex(['4 days', pd.NaT])
+ result = pd.to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days')
+ tm.assert_index_equal(result, exp)
+
+ exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours'])
+ result = (pd.to_timedelta([pd.NaT, '5 days', '1 hours']) +
+ pd.to_timedelta(['7 seconds', pd.NaT, '4 hours']))
+ tm.assert_index_equal(result, exp)
+
+
+class TestTimedeltaArraylikeAddSubOps(object):
+ # Tests for timedelta64[ns] __add__, __sub__, __radd__, __rsub__
+
+ # TODO: moved from frame tests; needs parametrization/de-duplication
+ def test_td64_df_add_int_frame(self):
+ # GH#22696 Check that we don't dispatch to numpy implementation,
+ # which treats int64 as m8[ns]
+ tdi = pd.timedelta_range('1', periods=3)
+ df = tdi.to_frame()
+ other = pd.DataFrame([1, 2, 3], index=tdi) # indexed like `df`
+ with pytest.raises(TypeError):
+ df + other
+ with pytest.raises(TypeError):
+ other + df
+ with pytest.raises(TypeError):
+ df - other
+ with pytest.raises(TypeError):
+ other - df
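+
+        # For contrast, a sketch of the raw numpy behavior being guarded
+        # against (illustrative, not executed): numpy would reinterpret the
+        # ints as nanosecond timedeltas instead of raising, e.g.
+        #   >>> np.array([1], dtype='m8[ns]') + np.array([1])
+        #   array([2], dtype='timedelta64[ns]')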
+
+ # TODO: moved from tests.indexes.timedeltas.test_arithmetic; needs
+ # parametrization+de-duplication
+ def test_timedelta_ops_with_missing_values(self):
+ # setup
+ s1 = pd.to_timedelta(Series(['00:00:01']))
+ s2 = pd.to_timedelta(Series(['00:00:02']))
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # Passing datetime64-dtype data to TimedeltaIndex is deprecated
+ sn = pd.to_timedelta(Series([pd.NaT]))
+
+ df1 = pd.DataFrame(['00:00:01']).apply(pd.to_timedelta)
+ df2 = pd.DataFrame(['00:00:02']).apply(pd.to_timedelta)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # Passing datetime64-dtype data to TimedeltaIndex is deprecated
+ dfn = pd.DataFrame([pd.NaT]).apply(pd.to_timedelta)
+
+ scalar1 = pd.to_timedelta('00:00:01')
+ scalar2 = pd.to_timedelta('00:00:02')
+ timedelta_NaT = pd.to_timedelta('NaT')
+
+ actual = scalar1 + scalar1
+ assert actual == scalar2
+ actual = scalar2 - scalar1
+ assert actual == scalar1
+
+ actual = s1 + s1
+ tm.assert_series_equal(actual, s2)
+ actual = s2 - s1
+ tm.assert_series_equal(actual, s1)
+
+ actual = s1 + scalar1
+ tm.assert_series_equal(actual, s2)
+ actual = scalar1 + s1
+ tm.assert_series_equal(actual, s2)
+ actual = s2 - scalar1
+ tm.assert_series_equal(actual, s1)
+ actual = -scalar1 + s2
+ tm.assert_series_equal(actual, s1)
+
+ actual = s1 + timedelta_NaT
+ tm.assert_series_equal(actual, sn)
+ actual = timedelta_NaT + s1
+ tm.assert_series_equal(actual, sn)
+ actual = s1 - timedelta_NaT
+ tm.assert_series_equal(actual, sn)
+ actual = -timedelta_NaT + s1
+ tm.assert_series_equal(actual, sn)
+
+ with pytest.raises(TypeError):
+ s1 + np.nan
+ with pytest.raises(TypeError):
+ np.nan + s1
+ with pytest.raises(TypeError):
+ s1 - np.nan
+ with pytest.raises(TypeError):
+ -np.nan + s1
+
+ actual = s1 + pd.NaT
+ tm.assert_series_equal(actual, sn)
+ actual = s2 - pd.NaT
+ tm.assert_series_equal(actual, sn)
+
+ actual = s1 + df1
+ tm.assert_frame_equal(actual, df2)
+ actual = s2 - df1
+ tm.assert_frame_equal(actual, df1)
+ actual = df1 + s1
+ tm.assert_frame_equal(actual, df2)
+ actual = df2 - s1
+ tm.assert_frame_equal(actual, df1)
+
+ actual = df1 + df1
+ tm.assert_frame_equal(actual, df2)
+ actual = df2 - df1
+ tm.assert_frame_equal(actual, df1)
+
+ actual = df1 + scalar1
+ tm.assert_frame_equal(actual, df2)
+ actual = df2 - scalar1
+ tm.assert_frame_equal(actual, df1)
+
+ actual = df1 + timedelta_NaT
+ tm.assert_frame_equal(actual, dfn)
+ actual = df1 - timedelta_NaT
+ tm.assert_frame_equal(actual, dfn)
+
+ with pytest.raises(TypeError):
+ df1 + np.nan
+ with pytest.raises(TypeError):
+ df1 - np.nan
+
+ actual = df1 + pd.NaT # NaT is datetime, not timedelta
+ tm.assert_frame_equal(actual, dfn)
+ actual = df1 - pd.NaT
+ tm.assert_frame_equal(actual, dfn)
+
+ # TODO: moved from tests.series.test_operators, needs splitting, cleanup,
+ # de-duplication, box-parametrization...
+ def test_operators_timedelta64(self):
+ # series ops
+ v1 = pd.date_range('2012-1-1', periods=3, freq='D')
+ v2 = pd.date_range('2012-1-2', periods=3, freq='D')
+ rs = Series(v2) - Series(v1)
+ xp = Series(1e9 * 3600 * 24,
+ rs.index).astype('int64').astype('timedelta64[ns]')
+ tm.assert_series_equal(rs, xp)
+ assert rs.dtype == 'timedelta64[ns]'
+
+ df = DataFrame(dict(A=v1))
+ td = Series([timedelta(days=i) for i in range(3)])
+ assert td.dtype == 'timedelta64[ns]'
+
+ # series on the rhs
+ result = df['A'] - df['A'].shift()
+ assert result.dtype == 'timedelta64[ns]'
+
+ result = df['A'] + td
+ assert result.dtype == 'M8[ns]'
+
+ # scalar Timestamp on rhs
+ maxa = df['A'].max()
+ assert isinstance(maxa, Timestamp)
+
+ resultb = df['A'] - df['A'].max()
+ assert resultb.dtype == 'timedelta64[ns]'
+
+ # timestamp on lhs
+ result = resultb + df['A']
+ values = [Timestamp('20111230'), Timestamp('20120101'),
+ Timestamp('20120103')]
+ expected = Series(values, name='A')
+ tm.assert_series_equal(result, expected)
+
+ # datetimes on rhs
+ result = df['A'] - datetime(2001, 1, 1)
+ expected = Series(
+ [timedelta(days=4017 + i) for i in range(3)], name='A')
+ tm.assert_series_equal(result, expected)
+ assert result.dtype == 'm8[ns]'
+
+ d = datetime(2001, 1, 1, 3, 4)
+ resulta = df['A'] - d
+ assert resulta.dtype == 'm8[ns]'
+
+ # roundtrip
+ resultb = resulta + d
+ tm.assert_series_equal(df['A'], resultb)
+
+ # timedeltas on rhs
+ td = timedelta(days=1)
+ resulta = df['A'] + td
+ resultb = resulta - td
+ tm.assert_series_equal(resultb, df['A'])
+ assert resultb.dtype == 'M8[ns]'
+
+ # roundtrip
+ td = timedelta(minutes=5, seconds=3)
+ resulta = df['A'] + td
+ resultb = resulta - td
+ tm.assert_series_equal(df['A'], resultb)
+ assert resultb.dtype == 'M8[ns]'
+
+ # inplace
+ value = rs[2] + np.timedelta64(timedelta(minutes=5, seconds=1))
+ rs[2] += np.timedelta64(timedelta(minutes=5, seconds=1))
+ assert rs[2] == value
+
+ def test_timedelta64_ops_nat(self):
+ # GH 11349
+ timedelta_series = Series([NaT, Timedelta('1s')])
+ nat_series_dtype_timedelta = Series([NaT, NaT],
+ dtype='timedelta64[ns]')
+ single_nat_dtype_timedelta = Series([NaT], dtype='timedelta64[ns]')
+
+ # subtraction
+ tm.assert_series_equal(timedelta_series - NaT,
+ nat_series_dtype_timedelta)
+ tm.assert_series_equal(-NaT + timedelta_series,
+ nat_series_dtype_timedelta)
+
+ tm.assert_series_equal(timedelta_series - single_nat_dtype_timedelta,
+ nat_series_dtype_timedelta)
+ tm.assert_series_equal(-single_nat_dtype_timedelta + timedelta_series,
+ nat_series_dtype_timedelta)
+
+ # addition
+ tm.assert_series_equal(nat_series_dtype_timedelta + NaT,
+ nat_series_dtype_timedelta)
+ tm.assert_series_equal(NaT + nat_series_dtype_timedelta,
+ nat_series_dtype_timedelta)
+
+ tm.assert_series_equal(nat_series_dtype_timedelta +
+ single_nat_dtype_timedelta,
+ nat_series_dtype_timedelta)
+ tm.assert_series_equal(single_nat_dtype_timedelta +
+ nat_series_dtype_timedelta,
+ nat_series_dtype_timedelta)
+
+ tm.assert_series_equal(timedelta_series + NaT,
+ nat_series_dtype_timedelta)
+ tm.assert_series_equal(NaT + timedelta_series,
+ nat_series_dtype_timedelta)
+
+ tm.assert_series_equal(timedelta_series + single_nat_dtype_timedelta,
+ nat_series_dtype_timedelta)
+ tm.assert_series_equal(single_nat_dtype_timedelta + timedelta_series,
+ nat_series_dtype_timedelta)
+
+ tm.assert_series_equal(nat_series_dtype_timedelta + NaT,
+ nat_series_dtype_timedelta)
+ tm.assert_series_equal(NaT + nat_series_dtype_timedelta,
+ nat_series_dtype_timedelta)
+
+ tm.assert_series_equal(nat_series_dtype_timedelta +
+ single_nat_dtype_timedelta,
+ nat_series_dtype_timedelta)
+ tm.assert_series_equal(single_nat_dtype_timedelta +
+ nat_series_dtype_timedelta,
+ nat_series_dtype_timedelta)
+
+ # multiplication
+ tm.assert_series_equal(nat_series_dtype_timedelta * 1.0,
+ nat_series_dtype_timedelta)
+ tm.assert_series_equal(1.0 * nat_series_dtype_timedelta,
+ nat_series_dtype_timedelta)
+
+ tm.assert_series_equal(timedelta_series * 1, timedelta_series)
+ tm.assert_series_equal(1 * timedelta_series, timedelta_series)
+
+ tm.assert_series_equal(timedelta_series * 1.5,
+ Series([NaT, Timedelta('1.5s')]))
+ tm.assert_series_equal(1.5 * timedelta_series,
+ Series([NaT, Timedelta('1.5s')]))
+
+ tm.assert_series_equal(timedelta_series * np.nan,
+ nat_series_dtype_timedelta)
+ tm.assert_series_equal(np.nan * timedelta_series,
+ nat_series_dtype_timedelta)
+
+ # division
+ tm.assert_series_equal(timedelta_series / 2,
+ Series([NaT, Timedelta('0.5s')]))
+ tm.assert_series_equal(timedelta_series / 2.0,
+ Series([NaT, Timedelta('0.5s')]))
+ tm.assert_series_equal(timedelta_series / np.nan,
+ nat_series_dtype_timedelta)
+
+ # -------------------------------------------------------------
+ # Invalid Operations
+
+ def test_td64arr_add_str_invalid(self, box_with_array):
+ # GH#13624
+ tdi = TimedeltaIndex(['1 day', '2 days'])
+ tdi = tm.box_expected(tdi, box_with_array)
+
+ with pytest.raises(TypeError):
+ tdi + 'a'
+ with pytest.raises(TypeError):
+ 'a' + tdi
+
+ @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])])
+ def test_td64arr_add_sub_float(self, box_with_array, other):
+ tdi = TimedeltaIndex(['-1 days', '-1 days'])
+ tdarr = tm.box_expected(tdi, box_with_array)
+
+ with pytest.raises(TypeError):
+ tdarr + other
+ with pytest.raises(TypeError):
+ other + tdarr
+ with pytest.raises(TypeError):
+ tdarr - other
+ with pytest.raises(TypeError):
+ other - tdarr
+
+ @pytest.mark.parametrize('freq', [None, 'H'])
+ def test_td64arr_sub_period(self, box_with_array, freq):
+ # GH#13078
+ # not supported, check TypeError
+ p = pd.Period('2011-01-01', freq='D')
+ idx = TimedeltaIndex(['1 hours', '2 hours'], freq=freq)
+ idx = tm.box_expected(idx, box_with_array)
+
+ with pytest.raises(TypeError):
+ idx - p
+
+ with pytest.raises(TypeError):
+ p - idx
+
+ @pytest.mark.parametrize('pi_freq', ['D', 'W', 'Q', 'H'])
+ @pytest.mark.parametrize('tdi_freq', [None, 'H'])
+ def test_td64arr_sub_pi(self, box_with_array, tdi_freq, pi_freq):
+ # GH#20049 subtracting PeriodIndex should raise TypeError
+ tdi = TimedeltaIndex(['1 hours', '2 hours'], freq=tdi_freq)
+ dti = Timestamp('2018-03-07 17:16:40') + tdi
+ pi = dti.to_period(pi_freq)
+
+ # TODO: parametrize over box for pi?
+ tdi = tm.box_expected(tdi, box_with_array)
+ with pytest.raises(TypeError):
+ tdi - pi
+
+ # -------------------------------------------------------------
+ # Binary operations td64 arraylike and datetime-like
+
+ def test_td64arr_sub_timestamp_raises(self, box_with_array):
+ idx = TimedeltaIndex(['1 day', '2 day'])
+ idx = tm.box_expected(idx, box_with_array)
+
+ msg = ("cannot subtract a datelike from|"
+ "Could not operate|"
+ "cannot perform operation")
+ with pytest.raises(TypeError, match=msg):
+ idx - Timestamp('2011-01-01')
+
+ def test_td64arr_add_timestamp(self, box_with_array, tz_naive_fixture):
+ # GH#23215
+
+ # TODO: parametrize over scalar datetime types?
+ tz = tz_naive_fixture
+ other = Timestamp('2011-01-01', tz=tz)
+
+ idx = TimedeltaIndex(['1 day', '2 day'])
+ expected = DatetimeIndex(['2011-01-02', '2011-01-03'], tz=tz)
+
+ # FIXME: fails with transpose=True because of tz-aware DataFrame
+ # transpose bug
+ idx = tm.box_expected(idx, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ result = idx + other
+ tm.assert_equal(result, expected)
+
+ result = other + idx
+ tm.assert_equal(result, expected)
+
+ def test_td64arr_add_sub_timestamp(self, box_with_array):
+ # GH#11925
+ ts = Timestamp('2012-01-01')
+ # TODO: parametrize over types of datetime scalar?
+
+ tdi = timedelta_range('1 day', periods=3)
+ expected = pd.date_range('2012-01-02', periods=3)
+
+ tdarr = tm.box_expected(tdi, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ tm.assert_equal(ts + tdarr, expected)
+ tm.assert_equal(tdarr + ts, expected)
+
+ expected2 = pd.date_range('2011-12-31', periods=3, freq='-1D')
+ expected2 = tm.box_expected(expected2, box_with_array)
+
+ tm.assert_equal(ts - tdarr, expected2)
+ tm.assert_equal(ts + (-tdarr), expected2)
+
+ with pytest.raises(TypeError):
+ tdarr - ts
+
+ def test_tdi_sub_dt64_array(self, box_with_array):
+ dti = pd.date_range('2016-01-01', periods=3)
+ tdi = dti - dti.shift(1)
+ dtarr = dti.values
+ expected = pd.DatetimeIndex(dtarr) - tdi
+
+ tdi = tm.box_expected(tdi, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ with pytest.raises(TypeError):
+ tdi - dtarr
+
+ # TimedeltaIndex.__rsub__
+ result = dtarr - tdi
+ tm.assert_equal(result, expected)
+
+ def test_tdi_add_dt64_array(self, box_with_array):
+ dti = pd.date_range('2016-01-01', periods=3)
+ tdi = dti - dti.shift(1)
+ dtarr = dti.values
+ expected = pd.DatetimeIndex(dtarr) + tdi
+
+ tdi = tm.box_expected(tdi, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = tdi + dtarr
+ tm.assert_equal(result, expected)
+ result = dtarr + tdi
+ tm.assert_equal(result, expected)
+
+ def test_td64arr_add_datetime64_nat(self, box_with_array):
+ # GH#23215
+ other = np.datetime64('NaT')
+
+ tdi = timedelta_range('1 day', periods=3)
+ expected = pd.DatetimeIndex(["NaT", "NaT", "NaT"])
+
+ tdser = tm.box_expected(tdi, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ tm.assert_equal(tdser + other, expected)
+ tm.assert_equal(other + tdser, expected)
+
+ # ------------------------------------------------------------------
+ # Operations with int-like others
+
+ def test_td64arr_add_int_series_invalid(self, box):
+ tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]')
+ tdser = tm.box_expected(tdser, box)
+ err = TypeError if box is not pd.Index else NullFrequencyError
+ int_ser = Series([2, 3, 4])
+
+ with pytest.raises(err):
+ tdser + int_ser
+ with pytest.raises(err):
+ int_ser + tdser
+ with pytest.raises(err):
+ tdser - int_ser
+ with pytest.raises(err):
+ int_ser - tdser
+
+ def test_td64arr_add_intlike(self, box_with_array):
+ # GH#19123
+ tdi = TimedeltaIndex(['59 days', '59 days', 'NaT'])
+ ser = tm.box_expected(tdi, box_with_array)
+
+ err = TypeError
+ if box_with_array in [pd.Index, tm.to_array]:
+ err = NullFrequencyError
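+            # for Index/array boxes, integer addition is treated as a
+            # frequency shift; with freq=None that raises NullFrequencyError
+            # rather than TypeError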
+
+ other = Series([20, 30, 40], dtype='uint8')
+
+ # TODO: separate/parametrize
+ with pytest.raises(err):
+ ser + 1
+ with pytest.raises(err):
+ ser - 1
+
+ with pytest.raises(err):
+ ser + other
+ with pytest.raises(err):
+ ser - other
+
+ with pytest.raises(err):
+ ser + np.array(other)
+ with pytest.raises(err):
+ ser - np.array(other)
+
+ with pytest.raises(err):
+ ser + pd.Index(other)
+ with pytest.raises(err):
+ ser - pd.Index(other)
+
+ @pytest.mark.parametrize('scalar', [1, 1.5, np.array(2)])
+ def test_td64arr_add_sub_numeric_scalar_invalid(self, box_with_array,
+ scalar):
+ box = box_with_array
+
+ tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]')
+ tdser = tm.box_expected(tdser, box)
+ err = TypeError
+ if box in [pd.Index, tm.to_array] and not isinstance(scalar, float):
+ err = NullFrequencyError
+
+ with pytest.raises(err):
+ tdser + scalar
+ with pytest.raises(err):
+ scalar + tdser
+ with pytest.raises(err):
+ tdser - scalar
+ with pytest.raises(err):
+ scalar - tdser
+
+ @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16',
+ 'uint64', 'uint32', 'uint16', 'uint8',
+ 'float64', 'float32', 'float16'])
+ @pytest.mark.parametrize('vec', [
+ np.array([1, 2, 3]),
+ pd.Index([1, 2, 3]),
+ Series([1, 2, 3])
+ # TODO: Add DataFrame in here?
+ ], ids=lambda x: type(x).__name__)
+ def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype):
+ tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]')
+ tdser = tm.box_expected(tdser, box)
+ err = TypeError
+ if box is pd.Index and not dtype.startswith('float'):
+ err = NullFrequencyError
+
+ vector = vec.astype(dtype)
+ with pytest.raises(err):
+ tdser + vector
+ with pytest.raises(err):
+ vector + tdser
+ with pytest.raises(err):
+ tdser - vector
+ with pytest.raises(err):
+ vector - tdser
+
+ # ------------------------------------------------------------------
+ # Operations with timedelta-like others
+
+ # TODO: this was taken from tests.series.test_ops; de-duplicate
+ @pytest.mark.parametrize('scalar_td', [timedelta(minutes=5, seconds=4),
+ Timedelta(minutes=5, seconds=4),
+ Timedelta('5m4s').to_timedelta64()])
+ def test_operators_timedelta64_with_timedelta(self, scalar_td):
+ # smoke tests
+ td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+ td1.iloc[2] = np.nan
+
+ td1 + scalar_td
+ scalar_td + td1
+ td1 - scalar_td
+ scalar_td - td1
+ td1 / scalar_td
+ scalar_td / td1
+
+ # TODO: this was taken from tests.series.test_ops; de-duplicate
+ def test_timedelta64_operations_with_timedeltas(self):
+ # td operate with td
+ td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+ td2 = timedelta(minutes=5, seconds=4)
+ result = td1 - td2
+ expected = (Series([timedelta(seconds=0)] * 3) -
+ Series([timedelta(seconds=1)] * 3))
+ assert result.dtype == 'm8[ns]'
+ tm.assert_series_equal(result, expected)
+
+ result2 = td2 - td1
+ expected = (Series([timedelta(seconds=1)] * 3) -
+ Series([timedelta(seconds=0)] * 3))
+ tm.assert_series_equal(result2, expected)
+
+ # roundtrip
+ tm.assert_series_equal(result + td2, td1)
+
+ # Now again, using pd.to_timedelta, which should build
+ # a Series or a scalar, depending on input.
+ td1 = Series(pd.to_timedelta(['00:05:03'] * 3))
+ td2 = pd.to_timedelta('00:05:04')
+ result = td1 - td2
+ expected = (Series([timedelta(seconds=0)] * 3) -
+ Series([timedelta(seconds=1)] * 3))
+ assert result.dtype == 'm8[ns]'
+ tm.assert_series_equal(result, expected)
+
+ result2 = td2 - td1
+ expected = (Series([timedelta(seconds=1)] * 3) -
+ Series([timedelta(seconds=0)] * 3))
+ tm.assert_series_equal(result2, expected)
+
+ # roundtrip
+ tm.assert_series_equal(result + td2, td1)
+
+ def test_td64arr_add_td64_array(self, box):
+ dti = pd.date_range('2016-01-01', periods=3)
+ tdi = dti - dti.shift(1)
+ tdarr = tdi.values
+
+ expected = 2 * tdi
+ tdi = tm.box_expected(tdi, box)
+ expected = tm.box_expected(expected, box)
+
+ result = tdi + tdarr
+ tm.assert_equal(result, expected)
+ result = tdarr + tdi
+ tm.assert_equal(result, expected)
+
+ def test_td64arr_sub_td64_array(self, box):
+ dti = pd.date_range('2016-01-01', periods=3)
+ tdi = dti - dti.shift(1)
+ tdarr = tdi.values
+
+ expected = 0 * tdi
+ tdi = tm.box_expected(tdi, box)
+ expected = tm.box_expected(expected, box)
+
+ result = tdi - tdarr
+ tm.assert_equal(result, expected)
+ result = tdarr - tdi
+ tm.assert_equal(result, expected)
+
+ # TODO: parametrize over [add, sub, radd, rsub]?
+ @pytest.mark.parametrize('names', [(None, None, None),
+ ('Egon', 'Venkman', None),
+ ('NCC1701D', 'NCC1701D', 'NCC1701D')])
+ def test_td64arr_add_sub_tdi(self, box, names):
+ # GH#17250 make sure result dtype is correct
+ # GH#19043 make sure names are propagated correctly
+ if box is pd.DataFrame and names[1] == 'Venkman':
+ pytest.skip("Name propagation for DataFrame does not behave like "
+ "it does for Index/Series")
+
+ tdi = TimedeltaIndex(['0 days', '1 day'], name=names[0])
+ ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1])
+ expected = Series([Timedelta(hours=3), Timedelta(days=1, hours=4)],
+ name=names[2])
+
+ ser = tm.box_expected(ser, box)
+ expected = tm.box_expected(expected, box)
+
+ result = tdi + ser
+ tm.assert_equal(result, expected)
+ if box is not pd.DataFrame:
+ assert result.dtype == 'timedelta64[ns]'
+ else:
+ assert result.dtypes[0] == 'timedelta64[ns]'
+
+ result = ser + tdi
+ tm.assert_equal(result, expected)
+ if box is not pd.DataFrame:
+ assert result.dtype == 'timedelta64[ns]'
+ else:
+ assert result.dtypes[0] == 'timedelta64[ns]'
+
+ expected = Series([Timedelta(hours=-3), Timedelta(days=1, hours=-4)],
+ name=names[2])
+ expected = tm.box_expected(expected, box)
+
+ result = tdi - ser
+ tm.assert_equal(result, expected)
+ if box is not pd.DataFrame:
+ assert result.dtype == 'timedelta64[ns]'
+ else:
+ assert result.dtypes[0] == 'timedelta64[ns]'
+
+ result = ser - tdi
+ tm.assert_equal(result, -expected)
+ if box is not pd.DataFrame:
+ assert result.dtype == 'timedelta64[ns]'
+ else:
+ assert result.dtypes[0] == 'timedelta64[ns]'
+
+ def test_td64arr_add_sub_td64_nat(self, box):
+ # GH#23320 special handling for timedelta64("NaT")
+ tdi = pd.TimedeltaIndex([NaT, Timedelta('1s')])
+ other = np.timedelta64("NaT")
+ expected = pd.TimedeltaIndex(["NaT"] * 2)
+
+ obj = tm.box_expected(tdi, box)
+ expected = tm.box_expected(expected, box)
+
+ result = obj + other
+ tm.assert_equal(result, expected)
+ result = other + obj
+ tm.assert_equal(result, expected)
+ result = obj - other
+ tm.assert_equal(result, expected)
+ result = other - obj
+ tm.assert_equal(result, expected)
+
+ def test_td64arr_sub_NaT(self, box):
+ # GH#18808
+ ser = Series([NaT, Timedelta('1s')])
+ expected = Series([NaT, NaT], dtype='timedelta64[ns]')
+
+ ser = tm.box_expected(ser, box)
+ expected = tm.box_expected(expected, box)
+
+ res = ser - pd.NaT
+ tm.assert_equal(res, expected)
+
+ def test_td64arr_add_timedeltalike(self, two_hours, box):
+        # only test adding/subtracting offsets; '+' with plain numerics is
+        # invalid and covered above
+ rng = timedelta_range('1 days', '10 days')
+ expected = timedelta_range('1 days 02:00:00', '10 days 02:00:00',
+ freq='D')
+ rng = tm.box_expected(rng, box)
+ expected = tm.box_expected(expected, box)
+
+ result = rng + two_hours
+ tm.assert_equal(result, expected)
+
+ def test_td64arr_sub_timedeltalike(self, two_hours, box):
+        # only test adding/subtracting offsets; '-' with plain numerics is
+        # invalid and covered above
+ rng = timedelta_range('1 days', '10 days')
+ expected = timedelta_range('0 days 22:00:00', '9 days 22:00:00')
+
+ rng = tm.box_expected(rng, box)
+ expected = tm.box_expected(expected, box)
+
+ result = rng - two_hours
+ tm.assert_equal(result, expected)
+
+ # ------------------------------------------------------------------
+ # __add__/__sub__ with DateOffsets and arrays of DateOffsets
+
+ # TODO: this was taken from tests.series.test_operators; de-duplicate
+ def test_timedelta64_operations_with_DateOffset(self):
+ # GH#10699
+ td = Series([timedelta(minutes=5, seconds=3)] * 3)
+ result = td + pd.offsets.Minute(1)
+ expected = Series([timedelta(minutes=6, seconds=3)] * 3)
+ tm.assert_series_equal(result, expected)
+
+ result = td - pd.offsets.Minute(1)
+ expected = Series([timedelta(minutes=4, seconds=3)] * 3)
+ tm.assert_series_equal(result, expected)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ result = td + Series([pd.offsets.Minute(1), pd.offsets.Second(3),
+ pd.offsets.Hour(2)])
+ expected = Series([timedelta(minutes=6, seconds=3),
+ timedelta(minutes=5, seconds=6),
+ timedelta(hours=2, minutes=5, seconds=3)])
+ tm.assert_series_equal(result, expected)
+
+ result = td + pd.offsets.Minute(1) + pd.offsets.Second(12)
+ expected = Series([timedelta(minutes=6, seconds=15)] * 3)
+ tm.assert_series_equal(result, expected)
+
+ # valid DateOffsets
+ for do in ['Hour', 'Minute', 'Second', 'Day', 'Micro', 'Milli',
+ 'Nano']:
+ op = getattr(pd.offsets, do)
+ td + op(5)
+ op(5) + td
+ td - op(5)
+ op(5) - td
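+
+        # Note: anchored offsets (e.g. pd.offsets.MonthEnd()) are deliberately
+        # omitted here; for td64 arrays they warn and then raise, see
+        # test_td64arr_addsub_anchored_offset_arraylike below.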
+
+ @pytest.mark.parametrize('names', [(None, None, None),
+ ('foo', 'bar', None),
+ ('foo', 'foo', 'foo')])
+ def test_td64arr_add_offset_index(self, names, box):
+ # GH#18849, GH#19744
+ if box is pd.DataFrame and names[1] == 'bar':
+ pytest.skip("Name propagation for DataFrame does not behave like "
+ "it does for Index/Series")
+
+ tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'],
+ name=names[0])
+ other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)],
+ name=names[1])
+
+ expected = TimedeltaIndex([tdi[n] + other[n] for n in range(len(tdi))],
+ freq='infer', name=names[2])
+ tdi = tm.box_expected(tdi, box)
+ expected = tm.box_expected(expected, box)
+
+ # The DataFrame operation is transposed and so operates as separate
+ # scalar operations, which do not issue a PerformanceWarning
+ warn = PerformanceWarning if box is not pd.DataFrame else None
+ with tm.assert_produces_warning(warn):
+ res = tdi + other
+ tm.assert_equal(res, expected)
+
+ with tm.assert_produces_warning(warn):
+ res2 = other + tdi
+ tm.assert_equal(res2, expected)
+
+ # TODO: combine with test_td64arr_add_offset_index by parametrizing
+ # over second box?
+ def test_td64arr_add_offset_array(self, box):
+ # GH#18849
+ tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'])
+ other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)])
+
+ expected = TimedeltaIndex([tdi[n] + other[n] for n in range(len(tdi))],
+ freq='infer')
+
+ tdi = tm.box_expected(tdi, box)
+ expected = tm.box_expected(expected, box)
+
+ # The DataFrame operation is transposed and so operates as separate
+ # scalar operations, which do not issue a PerformanceWarning
+ warn = PerformanceWarning if box is not pd.DataFrame else None
+ with tm.assert_produces_warning(warn):
+ res = tdi + other
+ tm.assert_equal(res, expected)
+
+ with tm.assert_produces_warning(warn):
+ res2 = other + tdi
+ tm.assert_equal(res2, expected)
+
+ @pytest.mark.parametrize('names', [(None, None, None),
+ ('foo', 'bar', None),
+ ('foo', 'foo', 'foo')])
+ def test_td64arr_sub_offset_index(self, names, box):
+ # GH#18824, GH#19744
+ if box is pd.DataFrame and names[1] == 'bar':
+ pytest.skip("Name propagation for DataFrame does not behave like "
+ "it does for Index/Series")
+
+ tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'],
+ name=names[0])
+ other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)],
+ name=names[1])
+
+ expected = TimedeltaIndex([tdi[n] - other[n] for n in range(len(tdi))],
+ freq='infer', name=names[2])
+
+ tdi = tm.box_expected(tdi, box)
+ expected = tm.box_expected(expected, box)
+
+ # The DataFrame operation is transposed and so operates as separate
+ # scalar operations, which do not issue a PerformanceWarning
+ warn = PerformanceWarning if box is not pd.DataFrame else None
+ with tm.assert_produces_warning(warn):
+ res = tdi - other
+ tm.assert_equal(res, expected)
+
+ def test_td64arr_sub_offset_array(self, box_with_array):
+ # GH#18824
+ tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'])
+ other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)])
+
+ expected = TimedeltaIndex([tdi[n] - other[n] for n in range(len(tdi))],
+ freq='infer')
+
+ tdi = tm.box_expected(tdi, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ # The DataFrame operation is transposed and so operates as separate
+ # scalar operations, which do not issue a PerformanceWarning
+ warn = None if box_with_array is pd.DataFrame else PerformanceWarning
+ with tm.assert_produces_warning(warn):
+ res = tdi - other
+ tm.assert_equal(res, expected)
+
+ @pytest.mark.parametrize('names', [(None, None, None),
+ ('foo', 'bar', None),
+ ('foo', 'foo', 'foo')])
+ def test_td64arr_with_offset_series(self, names, box_df_fail):
+ # GH#18849
+ box = box_df_fail
+ box2 = Series if box in [pd.Index, tm.to_array] else box
+
+ tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'],
+ name=names[0])
+ other = Series([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)],
+ name=names[1])
+
+ expected_add = Series([tdi[n] + other[n] for n in range(len(tdi))],
+ name=names[2])
+ tdi = tm.box_expected(tdi, box)
+ expected_add = tm.box_expected(expected_add, box2)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res = tdi + other
+ tm.assert_equal(res, expected_add)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res2 = other + tdi
+ tm.assert_equal(res2, expected_add)
+
+ # TODO: separate/parametrize add/sub test?
+ expected_sub = Series([tdi[n] - other[n] for n in range(len(tdi))],
+ name=names[2])
+ expected_sub = tm.box_expected(expected_sub, box2)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res3 = tdi - other
+ tm.assert_equal(res3, expected_sub)
+
+ @pytest.mark.parametrize('obox', [np.array, pd.Index, pd.Series])
+ def test_td64arr_addsub_anchored_offset_arraylike(self, obox,
+ box_with_array):
+ # GH#18824
+ tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'])
+ tdi = tm.box_expected(tdi, box_with_array)
+
+ anchored = obox([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)])
+
+ # addition/subtraction ops with anchored offsets should issue
+ # a PerformanceWarning and _then_ raise a TypeError.
+ with pytest.raises(TypeError):
+ with tm.assert_produces_warning(PerformanceWarning):
+ tdi + anchored
+ with pytest.raises(TypeError):
+ with tm.assert_produces_warning(PerformanceWarning):
+ anchored + tdi
+ with pytest.raises(TypeError):
+ with tm.assert_produces_warning(PerformanceWarning):
+ tdi - anchored
+ with pytest.raises(TypeError):
+ with tm.assert_produces_warning(PerformanceWarning):
+ anchored - tdi
+
+
+class TestTimedeltaArraylikeMulDivOps(object):
+ # Tests for timedelta64[ns]
+ # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__
+
+ # TODO: Moved from tests.series.test_operators; needs cleanup
+ @pytest.mark.parametrize("m", [1, 3, 10])
+ @pytest.mark.parametrize("unit", ['D', 'h', 'm', 's', 'ms', 'us', 'ns'])
+ def test_timedelta64_conversions(self, m, unit):
+ startdate = Series(pd.date_range('2013-01-01', '2013-01-03'))
+ enddate = Series(pd.date_range('2013-03-01', '2013-03-03'))
+
+ ser = enddate - startdate
+ ser[2] = np.nan
+
+ # op
+ expected = Series([x / np.timedelta64(m, unit) for x in ser])
+ result = ser / np.timedelta64(m, unit)
+ tm.assert_series_equal(result, expected)
+
+ # reverse op
+ expected = Series([Timedelta(np.timedelta64(m, unit)) / x
+ for x in ser])
+ result = np.timedelta64(m, unit) / ser
+ tm.assert_series_equal(result, expected)
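+
+        # e.g. with m=1, unit='D', an element of 59 days divides to 59.0 in
+        # one direction and 1/59 in the other; NaN propagates where ser is NaT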
+
+ # ------------------------------------------------------------------
+ # Multiplication
+ # organized with scalar others first, then array-like
+
+ def test_td64arr_mul_int(self, box_with_array):
+ idx = TimedeltaIndex(np.arange(5, dtype='int64'))
+ idx = tm.box_expected(idx, box_with_array)
+
+ result = idx * 1
+ tm.assert_equal(result, idx)
+
+ result = 1 * idx
+ tm.assert_equal(result, idx)
+
+ def test_td64arr_mul_tdlike_scalar_raises(self, two_hours, box_with_array):
+ rng = timedelta_range('1 days', '10 days', name='foo')
+ rng = tm.box_expected(rng, box_with_array)
+ with pytest.raises(TypeError):
+ rng * two_hours
+
+ def test_tdi_mul_int_array_zerodim(self, box_with_array):
+ rng5 = np.arange(5, dtype='int64')
+ idx = TimedeltaIndex(rng5)
+ expected = TimedeltaIndex(rng5 * 5)
+
+ idx = tm.box_expected(idx, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = idx * np.array(5, dtype='int64')
+ tm.assert_equal(result, expected)
+
+ def test_tdi_mul_int_array(self, box_with_array):
+ rng5 = np.arange(5, dtype='int64')
+ idx = TimedeltaIndex(rng5)
+ expected = TimedeltaIndex(rng5 ** 2)
+
+ idx = tm.box_expected(idx, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = idx * rng5
+ tm.assert_equal(result, expected)
+
+ def test_tdi_mul_int_series(self, box_with_array):
+ box = box_with_array
+ xbox = pd.Series if box in [pd.Index, tm.to_array] else box
+
+ idx = TimedeltaIndex(np.arange(5, dtype='int64'))
+ expected = TimedeltaIndex(np.arange(5, dtype='int64') ** 2)
+
+ idx = tm.box_expected(idx, box)
+ expected = tm.box_expected(expected, xbox)
+
+ result = idx * pd.Series(np.arange(5, dtype='int64'))
+ tm.assert_equal(result, expected)
+
+ def test_tdi_mul_float_series(self, box_with_array):
+ box = box_with_array
+ xbox = pd.Series if box in [pd.Index, tm.to_array] else box
+
+ idx = TimedeltaIndex(np.arange(5, dtype='int64'))
+ idx = tm.box_expected(idx, box)
+
+ rng5f = np.arange(5, dtype='float64')
+ expected = TimedeltaIndex(rng5f * (rng5f + 1.0))
+ expected = tm.box_expected(expected, xbox)
+
+ result = idx * Series(rng5f + 1.0)
+ tm.assert_equal(result, expected)
+
+ # TODO: Put Series/DataFrame in others?
+ @pytest.mark.parametrize('other', [
+ np.arange(1, 11),
+ pd.Int64Index(range(1, 11)),
+ pd.UInt64Index(range(1, 11)),
+ pd.Float64Index(range(1, 11)),
+ pd.RangeIndex(1, 11)
+ ], ids=lambda x: type(x).__name__)
+ def test_tdi_rmul_arraylike(self, other, box_with_array):
+ box = box_with_array
+ xbox = get_upcast_box(box, other)
+
+ tdi = TimedeltaIndex(['1 Day'] * 10)
+ expected = timedelta_range('1 days', '10 days')
+ expected._data.freq = None
+
+ tdi = tm.box_expected(tdi, box)
+ expected = tm.box_expected(expected, xbox)
+
+ result = other * tdi
+ tm.assert_equal(result, expected)
+ commute = tdi * other
+ tm.assert_equal(commute, expected)
+
+ # ------------------------------------------------------------------
+ # __div__, __rdiv__
+
+ def test_td64arr_div_nat_invalid(self, box_with_array):
+ # don't allow division by NaT (maybe could in the future)
+ rng = timedelta_range('1 days', '10 days', name='foo')
+ rng = tm.box_expected(rng, box_with_array)
+
+ with pytest.raises(TypeError, match='true_divide cannot use operands'):
+ rng / pd.NaT
+ with pytest.raises(TypeError, match='Cannot divide NaTType by'):
+ pd.NaT / rng
+
+ def test_td64arr_div_td64nat(self, box_with_array):
+ # GH#23829
+        rng = timedelta_range('1 days', '10 days')
+ rng = tm.box_expected(rng, box_with_array)
+
+ other = np.timedelta64('NaT')
+
+ expected = np.array([np.nan] * 10)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = rng / other
+ tm.assert_equal(result, expected)
+
+ result = other / rng
+ tm.assert_equal(result, expected)
+
+ def test_td64arr_div_int(self, box_with_array):
+ idx = TimedeltaIndex(np.arange(5, dtype='int64'))
+ idx = tm.box_expected(idx, box_with_array)
+
+ result = idx / 1
+ tm.assert_equal(result, idx)
+
+ with pytest.raises(TypeError, match='Cannot divide'):
+ # GH#23829
+ 1 / idx
+
+ def test_td64arr_div_tdlike_scalar(self, two_hours, box_with_array):
+ # GH#20088, GH#22163 ensure DataFrame returns correct dtype
+ rng = timedelta_range('1 days', '10 days', name='foo')
+ expected = pd.Float64Index((np.arange(10) + 1) * 12, name='foo')
+
+ rng = tm.box_expected(rng, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = rng / two_hours
+ tm.assert_equal(result, expected)
+
+ result = two_hours / rng
+ expected = 1 / expected
+ tm.assert_equal(result, expected)
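+
+        # arithmetic behind `expected`: 1 day / 2 hours == 12, so day n maps
+        # to 12 * n, i.e. (np.arange(10) + 1) * 12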
+
+ def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours,
+ box_with_array):
+ rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo')
+ expected = pd.Float64Index([12, np.nan, 24], name='foo')
+
+ rng = tm.box_expected(rng, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = rng / two_hours
+ tm.assert_equal(result, expected)
+
+ result = two_hours / rng
+ expected = 1 / expected
+ tm.assert_equal(result, expected)
+
+ def test_td64arr_div_td64_ndarray(self, box_with_array):
+ # GH#22631
+ rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'])
+ expected = pd.Float64Index([12, np.nan, 24])
+
+ rng = tm.box_expected(rng, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ other = np.array([2, 4, 2], dtype='m8[h]')
+ result = rng / other
+ tm.assert_equal(result, expected)
+
+ result = rng / tm.box_expected(other, box_with_array)
+ tm.assert_equal(result, expected)
+
+ result = rng / other.astype(object)
+ tm.assert_equal(result, expected)
+
+ result = rng / list(other)
+ tm.assert_equal(result, expected)
+
+ # reversed op
+ expected = 1 / expected
+ result = other / rng
+ tm.assert_equal(result, expected)
+
+ result = tm.box_expected(other, box_with_array) / rng
+ tm.assert_equal(result, expected)
+
+ result = other.astype(object) / rng
+ tm.assert_equal(result, expected)
+
+ result = list(other) / rng
+ tm.assert_equal(result, expected)
+
+ def test_tdarr_div_length_mismatch(self, box_with_array):
+ rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'])
+ mismatched = [1, 2, 3, 4]
+
+ rng = tm.box_expected(rng, box_with_array)
+ for obj in [mismatched, mismatched[:2]]:
+ # one shorter, one longer
+ for other in [obj, np.array(obj), pd.Index(obj)]:
+ with pytest.raises(ValueError):
+ rng / other
+ with pytest.raises(ValueError):
+ other / rng
+
+ # ------------------------------------------------------------------
+ # __floordiv__, __rfloordiv__
+
+ def test_td64arr_floordiv_tdscalar(self, box_with_array, scalar_td):
+ # GH#18831
+ td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+ td1.iloc[2] = np.nan
+
+ expected = Series([0, 0, np.nan])
+
+ td1 = tm.box_expected(td1, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ result = td1 // scalar_td
+ tm.assert_equal(result, expected)
+
+ def test_td64arr_rfloordiv_tdscalar(self, box_with_array, scalar_td):
+ # GH#18831
+ td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+ td1.iloc[2] = np.nan
+
+ expected = Series([1, 1, np.nan])
+
+ td1 = tm.box_expected(td1, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ result = scalar_td // td1
+ tm.assert_equal(result, expected)
+
+ def test_td64arr_rfloordiv_tdscalar_explicit(self, box_with_array,
+ scalar_td):
+ # GH#18831
+ td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+ td1.iloc[2] = np.nan
+
+ expected = Series([1, 1, np.nan])
+
+ td1 = tm.box_expected(td1, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ # We can test __rfloordiv__ using this syntax,
+ # see `test_timedelta_rfloordiv`
+ result = td1.__rfloordiv__(scalar_td)
+ tm.assert_equal(result, expected)
+
+ def test_td64arr_floordiv_int(self, box_with_array):
+ idx = TimedeltaIndex(np.arange(5, dtype='int64'))
+ idx = tm.box_expected(idx, box_with_array)
+ result = idx // 1
+ tm.assert_equal(result, idx)
+
+ pattern = ('floor_divide cannot use operands|'
+ 'Cannot divide int by Timedelta*')
+ with pytest.raises(TypeError, match=pattern):
+ 1 // idx
+
+ def test_td64arr_floordiv_tdlike_scalar(self, two_hours, box_with_array):
+ tdi = timedelta_range('1 days', '10 days', name='foo')
+ expected = pd.Int64Index((np.arange(10) + 1) * 12, name='foo')
+
+ tdi = tm.box_expected(tdi, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = tdi // two_hours
+ tm.assert_equal(result, expected)
+
+ # TODO: Is this redundant with test_td64arr_floordiv_tdlike_scalar?
+ @pytest.mark.parametrize('scalar_td', [
+ timedelta(minutes=10, seconds=7),
+ Timedelta('10m7s'),
+ Timedelta('10m7s').to_timedelta64()
+ ], ids=lambda x: type(x).__name__)
+ def test_td64arr_rfloordiv_tdlike_scalar(self, scalar_td, box_with_array):
+ # GH#19125
+ tdi = TimedeltaIndex(['00:05:03', '00:05:03', pd.NaT], freq=None)
+ expected = pd.Index([2.0, 2.0, np.nan])
+
+ tdi = tm.box_expected(tdi, box_with_array, transpose=False)
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+ res = tdi.__rfloordiv__(scalar_td)
+ tm.assert_equal(res, expected)
+
+ expected = pd.Index([0.0, 0.0, np.nan])
+ expected = tm.box_expected(expected, box_with_array, transpose=False)
+
+        res = tdi // scalar_td
+ tm.assert_equal(res, expected)
+
+ # ------------------------------------------------------------------
+ # mod, divmod
+ # TODO: operations with timedelta-like arrays, numeric arrays,
+ # reversed ops
+
+ def test_td64arr_mod_tdscalar(self, box_with_array, three_days):
+ tdi = timedelta_range('1 Day', '9 days')
+ tdarr = tm.box_expected(tdi, box_with_array)
+
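+        # 1..9 days modulo 3 days cycles through 1, 2, 0 days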
+ expected = TimedeltaIndex(['1 Day', '2 Days', '0 Days'] * 3)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = tdarr % three_days
+ tm.assert_equal(result, expected)
+
+ if box_with_array is pd.DataFrame:
+ pytest.xfail("DataFrame does not have __divmod__ or __rdivmod__")
+
+ result = divmod(tdarr, three_days)
+ tm.assert_equal(result[1], expected)
+ tm.assert_equal(result[0], tdarr // three_days)
+
+ def test_td64arr_mod_int(self, box_with_array):
+ tdi = timedelta_range('1 ns', '10 ns', periods=10)
+ tdarr = tm.box_expected(tdi, box_with_array)
+
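+        # odd nanosecond counts leave a 1 ns remainder, even counts 0 ns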
+ expected = TimedeltaIndex(['1 ns', '0 ns'] * 5)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = tdarr % 2
+ tm.assert_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ 2 % tdarr
+
+ if box_with_array is pd.DataFrame:
+ pytest.xfail("DataFrame does not have __divmod__ or __rdivmod__")
+
+ result = divmod(tdarr, 2)
+ tm.assert_equal(result[1], expected)
+ tm.assert_equal(result[0], tdarr // 2)
+
+ def test_td64arr_rmod_tdscalar(self, box_with_array, three_days):
+ tdi = timedelta_range('1 Day', '9 days')
+ tdarr = tm.box_expected(tdi, box_with_array)
+
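+        # 3 days mod 1/2/3 days gives 0/1/0; once the divisor exceeds
+        # 3 days, the result is 3 days itself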
+ expected = ['0 Days', '1 Day', '0 Days'] + ['3 Days'] * 6
+ expected = TimedeltaIndex(expected)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = three_days % tdarr
+ tm.assert_equal(result, expected)
+
+ if box_with_array is pd.DataFrame:
+ pytest.xfail("DataFrame does not have __divmod__ or __rdivmod__")
+
+ result = divmod(three_days, tdarr)
+ tm.assert_equal(result[1], expected)
+ tm.assert_equal(result[0], three_days // tdarr)
+
+ # ------------------------------------------------------------------
+ # Operations with invalid others
+
+ def test_td64arr_mul_tdscalar_invalid(self, box_with_array, scalar_td):
+ td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+ td1.iloc[2] = np.nan
+
+ td1 = tm.box_expected(td1, box_with_array)
+
+        # check that we get a TypeError with 'operate' (from core/ops.py)
+        # for the ops that are not defined
+ pattern = 'operate|unsupported|cannot|not supported'
+ with pytest.raises(TypeError, match=pattern):
+ td1 * scalar_td
+ with pytest.raises(TypeError, match=pattern):
+ scalar_td * td1
+
+ def test_td64arr_mul_too_short_raises(self, box_with_array):
+ idx = TimedeltaIndex(np.arange(5, dtype='int64'))
+ idx = tm.box_expected(idx, box_with_array)
+ with pytest.raises(TypeError):
+ idx * idx[:3]
+ with pytest.raises(ValueError):
+ idx * np.array([1, 2])
+
+ def test_td64arr_mul_td64arr_raises(self, box_with_array):
+ idx = TimedeltaIndex(np.arange(5, dtype='int64'))
+ idx = tm.box_expected(idx, box_with_array)
+ with pytest.raises(TypeError):
+ idx * idx
+
+ # ------------------------------------------------------------------
+ # Operations with numeric others
+
+ @pytest.mark.parametrize('one', [1, np.array(1), 1.0, np.array(1.0)])
+ def test_td64arr_mul_numeric_scalar(self, box_with_array, one):
+ # GH#4521
+ # divide/multiply by integers
+ tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]')
+ expected = Series(['-59 Days', '-59 Days', 'NaT'],
+ dtype='timedelta64[ns]')
+
+ tdser = tm.box_expected(tdser, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = tdser * (-one)
+ tm.assert_equal(result, expected)
+ result = (-one) * tdser
+ tm.assert_equal(result, expected)
+
+ expected = Series(['118 Days', '118 Days', 'NaT'],
+ dtype='timedelta64[ns]')
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = tdser * (2 * one)
+ tm.assert_equal(result, expected)
+ result = (2 * one) * tdser
+ tm.assert_equal(result, expected)
+
+ @pytest.mark.parametrize('two', [2, 2.0, np.array(2), np.array(2.0)])
+ def test_td64arr_div_numeric_scalar(self, box_with_array, two):
+ # GH#4521
+ # divide/multiply by integers
+ tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]')
+ expected = Series(['29.5D', '29.5D', 'NaT'], dtype='timedelta64[ns]')
+
+ tdser = tm.box_expected(tdser, box_with_array)
+ expected = tm.box_expected(expected, box_with_array)
+
+ result = tdser / two
+ tm.assert_equal(result, expected)
+
+ with pytest.raises(TypeError, match='Cannot divide'):
+ two / tdser
+
+ @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16',
+ 'uint64', 'uint32', 'uint16', 'uint8',
+ 'float64', 'float32', 'float16'])
+ @pytest.mark.parametrize('vector', [np.array([20, 30, 40]),
+ pd.Index([20, 30, 40]),
+ Series([20, 30, 40])],
+ ids=lambda x: type(x).__name__)
+ def test_td64arr_rmul_numeric_array(self, box_with_array, vector, dtype):
+ # GH#4521
+ # divide/multiply by integers
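+        # the result box may upcast, e.g. Index combined with Series
+        # returns a Series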
+ xbox = get_upcast_box(box_with_array, vector)
+
+ tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]')
+ vector = vector.astype(dtype)
+
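+        # 59 days * [20, 30, 40] -> [1180 days, 1770 days, NaT]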
+ expected = Series(['1180 Days', '1770 Days', 'NaT'],
+ dtype='timedelta64[ns]')
+
+ tdser = tm.box_expected(tdser, box_with_array)
+ expected = tm.box_expected(expected, xbox)
+
+ result = tdser * vector
+ tm.assert_equal(result, expected)
+
+ result = vector * tdser
+ tm.assert_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16',
+ 'uint64', 'uint32', 'uint16', 'uint8',
+ 'float64', 'float32', 'float16'])
+ @pytest.mark.parametrize('vector', [np.array([20, 30, 40]),
+ pd.Index([20, 30, 40]),
+ Series([20, 30, 40])],
+ ids=lambda x: type(x).__name__)
+ def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype):
+ # GH#4521
+ # divide/multiply by integers
+ xbox = get_upcast_box(box_with_array, vector)
+ tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]')
+ vector = vector.astype(dtype)
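+        # 59 days / [20, 30, 40] -> [2.95 days, 1 day 23:12:00, NaT]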
+ expected = Series(['2.95D', '1D 23H 12m', 'NaT'],
+ dtype='timedelta64[ns]')
+
+ tdser = tm.box_expected(tdser, box_with_array)
+ expected = tm.box_expected(expected, xbox)
+
+ result = tdser / vector
+ tm.assert_equal(result, expected)
+
+ pattern = ('true_divide cannot use operands|'
+ 'cannot perform __div__|'
+ 'cannot perform __truediv__|'
+ 'unsupported operand|'
+ 'Cannot divide')
+ with pytest.raises(TypeError, match=pattern):
+ vector / tdser
+
+ if not isinstance(vector, pd.Index):
+ # Index.__rdiv__ won't try to operate elementwise, just raises
+ result = tdser / vector.astype(object)
+ if box_with_array is pd.DataFrame:
+ expected = [tdser.iloc[0, n] / vector[n]
+ for n in range(len(vector))]
+ else:
+ expected = [tdser[n] / vector[n] for n in range(len(tdser))]
+ expected = tm.box_expected(expected, xbox)
+ tm.assert_equal(result, expected)
+
+ with pytest.raises(TypeError, match=pattern):
+ vector.astype(object) / tdser
+
+ @pytest.mark.parametrize('names', [(None, None, None),
+ ('Egon', 'Venkman', None),
+ ('NCC1701D', 'NCC1701D', 'NCC1701D')])
+ def test_td64arr_mul_int_series(self, box_df_fail, names):
+ # GH#19042 test for correct name attachment
+ box = box_df_fail # broadcasts along wrong axis, but doesn't raise
+ tdi = TimedeltaIndex(['0days', '1day', '2days', '3days', '4days'],
+ name=names[0])
+ # TODO: Should we be parametrizing over types for `ser` too?
+ ser = Series([0, 1, 2, 3, 4], dtype=np.int64, name=names[1])
+
+ expected = Series(['0days', '1day', '4days', '9days', '16days'],
+ dtype='timedelta64[ns]',
+ name=names[2])
+
+ tdi = tm.box_expected(tdi, box)
+ box = Series if (box is pd.Index and type(ser) is Series) else box
+ expected = tm.box_expected(expected, box)
+
+ result = ser * tdi
+ tm.assert_equal(result, expected)
+
+ # The direct operation tdi * ser still needs to be fixed.
+ result = ser.__rmul__(tdi)
+ tm.assert_equal(result, expected)
+
+ # TODO: Should we be parametrizing over types for `ser` too?
+ @pytest.mark.parametrize('names', [(None, None, None),
+ ('Egon', 'Venkman', None),
+ ('NCC1701D', 'NCC1701D', 'NCC1701D')])
+ def test_float_series_rdiv_td64arr(self, box_with_array, names):
+ # GH#19042 test for correct name attachment
+ # TODO: the direct operation TimedeltaIndex / Series still
+ # needs to be fixed.
+ box = box_with_array
+ tdi = TimedeltaIndex(['0days', '1day', '2days', '3days', '4days'],
+ name=names[0])
+ ser = Series([1.5, 3, 4.5, 6, 7.5], dtype=np.float64, name=names[1])
+
+ xname = names[2] if box is not tm.to_array else names[1]
+ expected = Series([tdi[n] / ser[n] for n in range(len(ser))],
+ dtype='timedelta64[ns]',
+ name=xname)
+
+ xbox = box
+ if box in [pd.Index, tm.to_array] and type(ser) is Series:
+ xbox = Series
+
+ tdi = tm.box_expected(tdi, box)
+ expected = tm.box_expected(expected, xbox)
+
+ result = ser.__rdiv__(tdi)
+ if box is pd.DataFrame:
+ # TODO: Should we skip this case sooner or test something else?
+ assert result is NotImplemented
+ else:
+ tm.assert_equal(result, expected)
+
+
+class TestTimedeltaArraylikeInvalidArithmeticOps(object):
+
+ def test_td64arr_pow_invalid(self, scalar_td, box_with_array):
+ td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+ td1.iloc[2] = np.nan
+
+ td1 = tm.box_expected(td1, box_with_array)
+
+        # check that we get a TypeError with 'operate' (from core/ops.py)
+        # for the ops that are not defined
+ pattern = 'operate|unsupported|cannot|not supported'
+ with pytest.raises(TypeError, match=pattern):
+ scalar_td ** td1
+
+ with pytest.raises(TypeError, match=pattern):
+ td1 ** scalar_td
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/__init__.py b/contrib/python/pandas/py2/pandas/tests/arrays/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/__init__.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/common.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/common.py
new file mode 100644
index 00000000000..9462482553e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/common.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+
+from pandas import Categorical
+
+
+class TestCategorical(object):
+
+ def setup_method(self, method):
+ self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
+ ordered=True)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/conftest.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/conftest.py
new file mode 100644
index 00000000000..274389d4849
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/conftest.py
@@ -0,0 +1,13 @@
+import pytest
+
+
[email protected](params=[True, False])
+def allow_fill(request):
+ """Boolean 'allow_fill' parameter for Categorical.take"""
+ return request.param
+
+
[email protected](params=[True, False])
+def ordered(request):
+ """Boolean 'ordered' parameter for Categorical."""
+ return request.param
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_algos.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_algos.py
new file mode 100644
index 00000000000..50f643756c5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_algos.py
@@ -0,0 +1,142 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
[email protected]('ordered', [True, False])
[email protected]('categories', [
+ ['b', 'a', 'c'],
+ ['a', 'b', 'c', 'd'],
+])
+def test_factorize(categories, ordered):
+ cat = pd.Categorical(['b', 'b', 'a', 'c', None],
+ categories=categories,
+ ordered=ordered)
+ labels, uniques = pd.factorize(cat)
+ expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp)
+ expected_uniques = pd.Categorical(['b', 'a', 'c'],
+ categories=categories,
+ ordered=ordered)
+
+ tm.assert_numpy_array_equal(labels, expected_labels)
+ tm.assert_categorical_equal(uniques, expected_uniques)
+
+
+def test_factorized_sort():
+ cat = pd.Categorical(['b', 'b', None, 'a'])
+ labels, uniques = pd.factorize(cat, sort=True)
+ expected_labels = np.array([1, 1, -1, 0], dtype=np.intp)
+ expected_uniques = pd.Categorical(['a', 'b'])
+
+ tm.assert_numpy_array_equal(labels, expected_labels)
+ tm.assert_categorical_equal(uniques, expected_uniques)
+
+
+def test_factorized_sort_ordered():
+ cat = pd.Categorical(['b', 'b', None, 'a'],
+ categories=['c', 'b', 'a'],
+ ordered=True)
+
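+    # sort=True sorts by the ordered categories (c < b < a),
+    # so 'b' comes before 'a' in the uniques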
+ labels, uniques = pd.factorize(cat, sort=True)
+ expected_labels = np.array([0, 0, -1, 1], dtype=np.intp)
+ expected_uniques = pd.Categorical(['b', 'a'],
+ categories=['c', 'b', 'a'],
+ ordered=True)
+
+ tm.assert_numpy_array_equal(labels, expected_labels)
+ tm.assert_categorical_equal(uniques, expected_uniques)
+
+
+def test_isin_cats():
+ # GH2003
+ cat = pd.Categorical(["a", "b", np.nan])
+
+ result = cat.isin(["a", np.nan])
+ expected = np.array([True, False, True], dtype=bool)
+ tm.assert_numpy_array_equal(expected, result)
+
+ result = cat.isin(["a", "c"])
+ expected = np.array([True, False, False], dtype=bool)
+ tm.assert_numpy_array_equal(expected, result)
+
+
[email protected]("empty", [[], pd.Series(), np.array([])])
+def test_isin_empty(empty):
+ s = pd.Categorical(["a", "b"])
+ expected = np.array([False, False], dtype=bool)
+
+ result = s.isin(empty)
+ tm.assert_numpy_array_equal(expected, result)
+
+
+class TestTake(object):
+ # https://github.com/pandas-dev/pandas/issues/20664
+
+ def test_take_warns(self):
+ cat = pd.Categorical(['a', 'b'])
+ with tm.assert_produces_warning(FutureWarning):
+ cat.take([0, -1])
+
+ def test_take_positive_no_warning(self):
+ cat = pd.Categorical(['a', 'b'])
+ with tm.assert_produces_warning(None):
+ cat.take([0, 0])
+
+ def test_take_bounds(self, allow_fill):
+ # https://github.com/pandas-dev/pandas/issues/20664
+ cat = pd.Categorical(['a', 'b', 'a'])
+ with pytest.raises(IndexError):
+ cat.take([4, 5], allow_fill=allow_fill)
+
+ def test_take_empty(self, allow_fill):
+ # https://github.com/pandas-dev/pandas/issues/20664
+ cat = pd.Categorical([], categories=['a', 'b'])
+ with pytest.raises(IndexError):
+ cat.take([0], allow_fill=allow_fill)
+
+ def test_positional_take(self, ordered):
+ cat = pd.Categorical(['a', 'a', 'b', 'b'], categories=['b', 'a'],
+ ordered=ordered)
+ result = cat.take([0, 1, 2], allow_fill=False)
+ expected = pd.Categorical(['a', 'a', 'b'], categories=cat.categories,
+ ordered=ordered)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_positional_take_unobserved(self, ordered):
+ cat = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'],
+ ordered=ordered)
+ result = cat.take([1, 0], allow_fill=False)
+ expected = pd.Categorical(['b', 'a'], categories=cat.categories,
+ ordered=ordered)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_take_allow_fill(self):
+ # https://github.com/pandas-dev/pandas/issues/23296
+ cat = pd.Categorical(['a', 'a', 'b'])
+ result = cat.take([0, -1, -1], allow_fill=True)
+ expected = pd.Categorical(['a', np.nan, np.nan],
+ categories=['a', 'b'])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_take_fill_with_negative_one(self):
+ # -1 was a category
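+        # with allow_fill=True, -1 entries in the indexer are filled with
+        # fill_value, which here is also a valid category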
+ cat = pd.Categorical([-1, 0, 1])
+ result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
+ expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_take_fill_value(self):
+ # https://github.com/pandas-dev/pandas/issues/23296
+ cat = pd.Categorical(['a', 'b', 'c'])
+ result = cat.take([0, 1, -1], fill_value='a', allow_fill=True)
+ expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_take_fill_value_new_raises(self):
+ # https://github.com/pandas-dev/pandas/issues/23296
+ cat = pd.Categorical(['a', 'b', 'c'])
+ xpr = r"'fill_value' \('d'\) is not in this Categorical's categories."
+ with pytest.raises(TypeError, match=xpr):
+ cat.take([0, 1, -1], fill_value='d', allow_fill=True)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_analytics.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_analytics.py
new file mode 100644
index 00000000000..5efcd527de8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_analytics.py
@@ -0,0 +1,303 @@
+# -*- coding: utf-8 -*-
+
+import sys
+
+import numpy as np
+import pytest
+
+from pandas.compat import PYPY
+
+from pandas import Categorical, Index, Series
+from pandas.api.types import is_scalar
+import pandas.util.testing as tm
+
+
+class TestCategoricalAnalytics(object):
+
+ def test_min_max(self):
+
+ # unordered cats have no min/max
+ cat = Categorical(["a", "b", "c", "d"], ordered=False)
+ pytest.raises(TypeError, lambda: cat.min())
+ pytest.raises(TypeError, lambda: cat.max())
+
+ cat = Categorical(["a", "b", "c", "d"], ordered=True)
+ _min = cat.min()
+ _max = cat.max()
+ assert _min == "a"
+ assert _max == "d"
+
+ cat = Categorical(["a", "b", "c", "d"],
+ categories=['d', 'c', 'b', 'a'], ordered=True)
+ _min = cat.min()
+ _max = cat.max()
+ assert _min == "d"
+ assert _max == "a"
+
+ cat = Categorical([np.nan, "b", "c", np.nan],
+ categories=['d', 'c', 'b', 'a'], ordered=True)
+ _min = cat.min()
+ _max = cat.max()
+ assert np.isnan(_min)
+ assert _max == "b"
+
+ _min = cat.min(numeric_only=True)
+ assert _min == "c"
+ _max = cat.max(numeric_only=True)
+ assert _max == "b"
+
+ cat = Categorical([np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1],
+ ordered=True)
+ _min = cat.min()
+ _max = cat.max()
+ assert np.isnan(_min)
+ assert _max == 1
+
+ _min = cat.min(numeric_only=True)
+ assert _min == 2
+ _max = cat.max(numeric_only=True)
+ assert _max == 1
+
+ @pytest.mark.parametrize("values,categories,exp_mode", [
+ ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]),
+ ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]),
+ ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]),
+ ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]),
+ ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
+ ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4])])
+ def test_mode(self, values, categories, exp_mode):
+ s = Categorical(values, categories=categories, ordered=True)
+ res = s.mode()
+ exp = Categorical(exp_mode, categories=categories, ordered=True)
+ tm.assert_categorical_equal(res, exp)
+
+ def test_searchsorted(self):
+ # https://github.com/pandas-dev/pandas/issues/8420
+ # https://github.com/pandas-dev/pandas/issues/14522
+
+ c1 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
+ categories=['cheese', 'milk', 'apple', 'bread'],
+ ordered=True)
+ s1 = Series(c1)
+ c2 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'],
+ categories=['cheese', 'milk', 'apple', 'bread'],
+ ordered=False)
+ s2 = Series(c2)
+
+        # Searching for a single scalar argument, side='left' (default)
+ res_cat = c1.searchsorted('apple')
+ assert res_cat == 2
+ assert is_scalar(res_cat)
+
+ res_ser = s1.searchsorted('apple')
+ assert res_ser == 2
+ assert is_scalar(res_ser)
+
+        # Searching for a single-item array, side='left' (default)
+ res_cat = c1.searchsorted(['bread'])
+ res_ser = s1.searchsorted(['bread'])
+ exp = np.array([3], dtype=np.intp)
+ tm.assert_numpy_array_equal(res_cat, exp)
+ tm.assert_numpy_array_equal(res_ser, exp)
+
+        # Searching for an array of several items, side='right'
+ res_cat = c1.searchsorted(['apple', 'bread'], side='right')
+ res_ser = s1.searchsorted(['apple', 'bread'], side='right')
+ exp = np.array([3, 5], dtype=np.intp)
+ tm.assert_numpy_array_equal(res_cat, exp)
+ tm.assert_numpy_array_equal(res_ser, exp)
+
+ # Searching for a single value that is not from the Categorical
+ pytest.raises(KeyError, lambda: c1.searchsorted('cucumber'))
+ pytest.raises(KeyError, lambda: s1.searchsorted('cucumber'))
+
+        # Searching for multiple values, one of which is not from the Categorical
+ pytest.raises(KeyError,
+ lambda: c1.searchsorted(['bread', 'cucumber']))
+ pytest.raises(KeyError,
+ lambda: s1.searchsorted(['bread', 'cucumber']))
+
+ # searchsorted call for unordered Categorical
+ pytest.raises(ValueError, lambda: c2.searchsorted('apple'))
+ pytest.raises(ValueError, lambda: s2.searchsorted('apple'))
+
+ def test_unique(self):
+ # categories are reordered based on value when ordered=False
+ cat = Categorical(["a", "b"])
+ exp = Index(["a", "b"])
+ res = cat.unique()
+ tm.assert_index_equal(res.categories, exp)
+ tm.assert_categorical_equal(res, cat)
+
+ cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
+ res = cat.unique()
+ tm.assert_index_equal(res.categories, exp)
+ tm.assert_categorical_equal(res, Categorical(exp))
+
+ cat = Categorical(["c", "a", "b", "a", "a"],
+ categories=["a", "b", "c"])
+ exp = Index(["c", "a", "b"])
+ res = cat.unique()
+ tm.assert_index_equal(res.categories, exp)
+ exp_cat = Categorical(exp, categories=['c', 'a', 'b'])
+ tm.assert_categorical_equal(res, exp_cat)
+
+ # nan must be removed
+ cat = Categorical(["b", np.nan, "b", np.nan, "a"],
+ categories=["a", "b", "c"])
+ res = cat.unique()
+ exp = Index(["b", "a"])
+ tm.assert_index_equal(res.categories, exp)
+ exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"])
+ tm.assert_categorical_equal(res, exp_cat)
+
+ def test_unique_ordered(self):
+ # keep categories order when ordered=True
+ cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True)
+ res = cat.unique()
+ exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True)
+ tm.assert_categorical_equal(res, exp_cat)
+
+ cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'],
+ ordered=True)
+ res = cat.unique()
+ exp_cat = Categorical(['c', 'b', 'a'], categories=['a', 'b', 'c'],
+ ordered=True)
+ tm.assert_categorical_equal(res, exp_cat)
+
+ cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'],
+ ordered=True)
+ res = cat.unique()
+ exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True)
+ tm.assert_categorical_equal(res, exp_cat)
+
+ cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'],
+ ordered=True)
+ res = cat.unique()
+ exp_cat = Categorical(['b', np.nan, 'a'], categories=['a', 'b'],
+ ordered=True)
+ tm.assert_categorical_equal(res, exp_cat)
+
+ def test_unique_index_series(self):
+ c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1])
+ # Categorical.unique sorts categories by appearance order
+ # if ordered=False
+ exp = Categorical([3, 1, 2], categories=[3, 1, 2])
+ tm.assert_categorical_equal(c.unique(), exp)
+
+ tm.assert_index_equal(Index(c).unique(), Index(exp))
+ tm.assert_categorical_equal(Series(c).unique(), exp)
+
+ c = Categorical([1, 1, 2, 2], categories=[3, 2, 1])
+ exp = Categorical([1, 2], categories=[1, 2])
+ tm.assert_categorical_equal(c.unique(), exp)
+ tm.assert_index_equal(Index(c).unique(), Index(exp))
+ tm.assert_categorical_equal(Series(c).unique(), exp)
+
+ c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True)
+ # Categorical.unique keeps categories order if ordered=True
+ exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True)
+ tm.assert_categorical_equal(c.unique(), exp)
+
+ tm.assert_index_equal(Index(c).unique(), Index(exp))
+ tm.assert_categorical_equal(Series(c).unique(), exp)
+
+ def test_shift(self):
+ # GH 9416
+ cat = Categorical(['a', 'b', 'c', 'd', 'a'])
+
+ # shift forward
+ sp1 = cat.shift(1)
+ xp1 = Categorical([np.nan, 'a', 'b', 'c', 'd'])
+ tm.assert_categorical_equal(sp1, xp1)
+ tm.assert_categorical_equal(cat[:-1], sp1[1:])
+
+ # shift back
+ sn2 = cat.shift(-2)
+ xp2 = Categorical(['c', 'd', 'a', np.nan, np.nan],
+ categories=['a', 'b', 'c', 'd'])
+ tm.assert_categorical_equal(sn2, xp2)
+ tm.assert_categorical_equal(cat[2:], sn2[:-2])
+
+ # shift by zero
+ tm.assert_categorical_equal(cat, cat.shift(0))
+
+ def test_nbytes(self):
+ cat = Categorical([1, 2, 3])
+ exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories
+ assert cat.nbytes == exp
+
+ def test_memory_usage(self):
+ cat = Categorical([1, 2, 3])
+
+ # .categories is an index, so we include the hashtable
+ assert 0 < cat.nbytes <= cat.memory_usage()
+ assert 0 < cat.nbytes <= cat.memory_usage(deep=True)
+
+ cat = Categorical(['foo', 'foo', 'bar'])
+ assert cat.memory_usage(deep=True) > cat.nbytes
+
+ if not PYPY:
+ # sys.getsizeof will call the .memory_usage with
+ # deep=True, and add on some GC overhead
+ diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
+ assert abs(diff) < 100
+
+ def test_map(self):
+ c = Categorical(list('ABABC'), categories=list('CBA'), ordered=True)
+ result = c.map(lambda x: x.lower())
+ exp = Categorical(list('ababc'), categories=list('cba'), ordered=True)
+ tm.assert_categorical_equal(result, exp)
+
+ c = Categorical(list('ABABC'), categories=list('ABC'), ordered=False)
+ result = c.map(lambda x: x.lower())
+ exp = Categorical(list('ababc'), categories=list('abc'), ordered=False)
+ tm.assert_categorical_equal(result, exp)
+
+ result = c.map(lambda x: 1)
+ # GH 12766: Return an index not an array
+ tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
+
+ def test_validate_inplace(self):
+ cat = Categorical(['A', 'B', 'B', 'C', 'A'])
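+        # 'inplace' must be a boolean; each non-bool value should raise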
+ invalid_values = [1, "True", [1, 2, 3], 5.0]
+
+ for value in invalid_values:
+ with pytest.raises(ValueError):
+ cat.set_ordered(value=True, inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.as_ordered(inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.as_unordered(inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.set_categories(['X', 'Y', 'Z'], rename=True, inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.rename_categories(['X', 'Y', 'Z'], inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.reorder_categories(
+ ['X', 'Y', 'Z'], ordered=True, inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.add_categories(
+ new_categories=['D', 'E', 'F'], inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.remove_categories(removals=['D', 'E', 'F'], inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.remove_unused_categories(inplace=value)
+
+ with pytest.raises(ValueError):
+ cat.sort_values(inplace=value)
+
+ def test_isna(self):
+ exp = np.array([False, False, True])
+ c = Categorical(["a", "b", np.nan])
+ res = c.isna()
+
+ tm.assert_numpy_array_equal(res, exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_api.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_api.py
new file mode 100644
index 00000000000..86dbc5ebf9f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_api.py
@@ -0,0 +1,508 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series
+from pandas.core.arrays.categorical import _recode_for_categories
+from pandas.tests.arrays.categorical.common import TestCategorical
+import pandas.util.testing as tm
+
+
+class TestCategoricalAPI(object):
+
+ def test_ordered_api(self):
+ # GH 9347
+ cat1 = Categorical(list('acb'), ordered=False)
+ tm.assert_index_equal(cat1.categories, Index(['a', 'b', 'c']))
+ assert not cat1.ordered
+
+ cat2 = Categorical(list('acb'), categories=list('bca'), ordered=False)
+ tm.assert_index_equal(cat2.categories, Index(['b', 'c', 'a']))
+ assert not cat2.ordered
+
+ cat3 = Categorical(list('acb'), ordered=True)
+ tm.assert_index_equal(cat3.categories, Index(['a', 'b', 'c']))
+ assert cat3.ordered
+
+ cat4 = Categorical(list('acb'), categories=list('bca'), ordered=True)
+ tm.assert_index_equal(cat4.categories, Index(['b', 'c', 'a']))
+ assert cat4.ordered
+
+ def test_set_ordered(self):
+
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ cat2 = cat.as_unordered()
+ assert not cat2.ordered
+ cat2 = cat.as_ordered()
+ assert cat2.ordered
+ cat2.as_unordered(inplace=True)
+ assert not cat2.ordered
+ cat2.as_ordered(inplace=True)
+ assert cat2.ordered
+
+ assert cat2.set_ordered(True).ordered
+ assert not cat2.set_ordered(False).ordered
+ cat2.set_ordered(True, inplace=True)
+ assert cat2.ordered
+ cat2.set_ordered(False, inplace=True)
+ assert not cat2.ordered
+
+ # removed in 0.19.0
+        msg = "can't set attribute"
+ with pytest.raises(AttributeError, match=msg):
+ cat.ordered = True
+ with pytest.raises(AttributeError, match=msg):
+ cat.ordered = False
+
+ def test_rename_categories(self):
+ cat = Categorical(["a", "b", "c", "a"])
+
+ # inplace=False: the old one must not be changed
+ res = cat.rename_categories([1, 2, 3])
+ tm.assert_numpy_array_equal(res.__array__(), np.array([1, 2, 3, 1],
+ dtype=np.int64))
+ tm.assert_index_equal(res.categories, Index([1, 2, 3]))
+
+ exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
+
+ exp_cat = Index(["a", "b", "c"])
+ tm.assert_index_equal(cat.categories, exp_cat)
+
+ # GH18862 (let rename_categories take callables)
+ result = cat.rename_categories(lambda x: x.upper())
+ expected = Categorical(["A", "B", "C", "A"])
+ tm.assert_categorical_equal(result, expected)
+
+ # and now inplace
+ res = cat.rename_categories([1, 2, 3], inplace=True)
+ assert res is None
+ tm.assert_numpy_array_equal(cat.__array__(), np.array([1, 2, 3, 1],
+ dtype=np.int64))
+ tm.assert_index_equal(cat.categories, Index([1, 2, 3]))
+
+ # Lengthen
+ with pytest.raises(ValueError):
+ cat.rename_categories([1, 2, 3, 4])
+
+ # Shorten
+ with pytest.raises(ValueError):
+ cat.rename_categories([1, 2])
+
+ def test_rename_categories_series(self):
+ # https://github.com/pandas-dev/pandas/issues/17981
+ c = Categorical(['a', 'b'])
+ xpr = "Treating Series 'new_categories' as a list-like "
+ with tm.assert_produces_warning(FutureWarning) as rec:
+ result = c.rename_categories(Series([0, 1]))
+
+ assert len(rec) == 1
+ assert xpr in str(rec[0].message)
+ expected = Categorical([0, 1])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_rename_categories_dict(self):
+ # GH 17336
+ cat = Categorical(['a', 'b', 'c', 'd'])
+ res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1})
+ expected = Index([4, 3, 2, 1])
+ tm.assert_index_equal(res.categories, expected)
+
+ # Test for inplace
+ res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1},
+ inplace=True)
+ assert res is None
+ tm.assert_index_equal(cat.categories, expected)
+
+ # Test for dicts of smaller length
+ cat = Categorical(['a', 'b', 'c', 'd'])
+ res = cat.rename_categories({'a': 1, 'c': 3})
+
+ expected = Index([1, 'b', 3, 'd'])
+ tm.assert_index_equal(res.categories, expected)
+
+ # Test for dicts with bigger length
+ cat = Categorical(['a', 'b', 'c', 'd'])
+ res = cat.rename_categories({'a': 1, 'b': 2, 'c': 3,
+ 'd': 4, 'e': 5, 'f': 6})
+ expected = Index([1, 2, 3, 4])
+ tm.assert_index_equal(res.categories, expected)
+
+ # Test for dicts with no items from old categories
+ cat = Categorical(['a', 'b', 'c', 'd'])
+ res = cat.rename_categories({'f': 1, 'g': 3})
+
+ expected = Index(['a', 'b', 'c', 'd'])
+ tm.assert_index_equal(res.categories, expected)
+
+ def test_reorder_categories(self):
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ old = cat.copy()
+ new = Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"],
+ ordered=True)
+
+ # first inplace == False
+ res = cat.reorder_categories(["c", "b", "a"])
+ # cat must be the same as before
+ tm.assert_categorical_equal(cat, old)
+ # only res is changed
+ tm.assert_categorical_equal(res, new)
+
+ # inplace == True
+ res = cat.reorder_categories(["c", "b", "a"], inplace=True)
+ assert res is None
+ tm.assert_categorical_equal(cat, new)
+
+ # not all "old" included in "new"
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+
+ with pytest.raises(ValueError):
+ cat.reorder_categories(["a"])
+
+ # still not all "old" in "new"
+ with pytest.raises(ValueError):
+ cat.reorder_categories(["a", "b", "d"])
+
+ # all "old" included in "new", but too long
+ with pytest.raises(ValueError):
+ cat.reorder_categories(["a", "b", "c", "d"])
+
+ def test_add_categories(self):
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ old = cat.copy()
+ new = Categorical(["a", "b", "c", "a"],
+ categories=["a", "b", "c", "d"], ordered=True)
+
+ # first inplace == False
+ res = cat.add_categories("d")
+ tm.assert_categorical_equal(cat, old)
+ tm.assert_categorical_equal(res, new)
+
+ res = cat.add_categories(["d"])
+ tm.assert_categorical_equal(cat, old)
+ tm.assert_categorical_equal(res, new)
+
+ # inplace == True
+ res = cat.add_categories("d", inplace=True)
+ tm.assert_categorical_equal(cat, new)
+ assert res is None
+
+ # new is in old categories
+ with pytest.raises(ValueError):
+ cat.add_categories(["d"])
+
+ # GH 9927
+ cat = Categorical(list("abc"), ordered=True)
+ expected = Categorical(
+ list("abc"), categories=list("abcde"), ordered=True)
+ # test with Series, np.array, index, list
+ res = cat.add_categories(Series(["d", "e"]))
+ tm.assert_categorical_equal(res, expected)
+ res = cat.add_categories(np.array(["d", "e"]))
+ tm.assert_categorical_equal(res, expected)
+ res = cat.add_categories(Index(["d", "e"]))
+ tm.assert_categorical_equal(res, expected)
+ res = cat.add_categories(["d", "e"])
+ tm.assert_categorical_equal(res, expected)
+
+ def test_set_categories(self):
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ exp_categories = Index(["c", "b", "a"])
+ exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
+
+ res = cat.set_categories(["c", "b", "a"], inplace=True)
+ tm.assert_index_equal(cat.categories, exp_categories)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_values)
+ assert res is None
+
+ res = cat.set_categories(["a", "b", "c"])
+ # cat must be the same as before
+ tm.assert_index_equal(cat.categories, exp_categories)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_values)
+ # only res is changed
+ exp_categories_back = Index(["a", "b", "c"])
+ tm.assert_index_equal(res.categories, exp_categories_back)
+ tm.assert_numpy_array_equal(res.__array__(), exp_values)
+
+ # not all "old" included in "new" -> all not included ones are now
+ # np.nan
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ res = cat.set_categories(["a"])
+ tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0],
+ dtype=np.int8))
+
+ # still not all "old" in "new"
+ res = cat.set_categories(["a", "b", "d"])
+ tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0],
+ dtype=np.int8))
+ tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))
+
+ # all "old" included in "new"
+ cat = cat.set_categories(["a", "b", "c", "d"])
+ exp_categories = Index(["a", "b", "c", "d"])
+ tm.assert_index_equal(cat.categories, exp_categories)
+
+ # internals...
+ c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
+ tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0],
+ dtype=np.int8))
+ tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))
+
+ exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(c.get_values(), exp)
+
+ # all "pointers" to '4' must be changed from 3 to 0,...
+ c = c.set_categories([4, 3, 2, 1])
+
+ # positions are changed
+ tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3],
+ dtype=np.int8))
+
+ # categories are now in new order
+ tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))
+
+ # output is the same
+ exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(c.get_values(), exp)
+ assert c.min() == 4
+ assert c.max() == 1
+
+ # set_categories should set the ordering if specified
+ c2 = c.set_categories([4, 3, 2, 1], ordered=False)
+ assert not c2.ordered
+
+ tm.assert_numpy_array_equal(c.get_values(), c2.get_values())
+
+ # set_categories should pass thru the ordering
+ c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
+ assert not c2.ordered
+
+ tm.assert_numpy_array_equal(c.get_values(), c2.get_values())
+
+ @pytest.mark.parametrize('values, categories, new_categories', [
+ # No NaNs, same cats, same order
+ (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
+ # No NaNs, same cats, different order
+ (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
+ # Same, unsorted
+ (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
+ # No NaNs, same cats, different order
+ (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
+ # NaNs
+ (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
+ (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
+        (['b', 'a', 'c'], ['a', 'b'], ['b', 'a']),
+ # Introduce NaNs
+ (['a', 'b', 'c'], ['a', 'b'], ['a']),
+ (['a', 'b', 'c'], ['a', 'b'], ['b']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a']),
+        (['b', 'a', 'c'], ['a', 'b'], ['b']),
+ # No overlap
+ (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
+ ])
+ @pytest.mark.parametrize('ordered', [True, False])
+ def test_set_categories_many(self, values, categories, new_categories,
+ ordered):
+ c = Categorical(values, categories)
+ expected = Categorical(values, new_categories, ordered)
+ result = c.set_categories(new_categories, ordered=ordered)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_set_categories_rename_less(self):
+ # GH 24675
+ cat = Categorical(['A', 'B'])
+ result = cat.set_categories(['A'], rename=True)
+ expected = Categorical(['A', np.nan])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_set_categories_private(self):
+ cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
+ cat._set_categories(['a', 'c', 'd', 'e'])
+ expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
+ tm.assert_categorical_equal(cat, expected)
+
+ # fastpath
+ cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
+ cat._set_categories(['a', 'c', 'd', 'e'], fastpath=True)
+ expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
+ tm.assert_categorical_equal(cat, expected)
+
+ def test_remove_categories(self):
+ cat = Categorical(["a", "b", "c", "a"], ordered=True)
+ old = cat.copy()
+ new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"],
+ ordered=True)
+
+ # first inplace == False
+ res = cat.remove_categories("c")
+ tm.assert_categorical_equal(cat, old)
+ tm.assert_categorical_equal(res, new)
+
+ res = cat.remove_categories(["c"])
+ tm.assert_categorical_equal(cat, old)
+ tm.assert_categorical_equal(res, new)
+
+ # inplace == True
+ res = cat.remove_categories("c", inplace=True)
+ tm.assert_categorical_equal(cat, new)
+ assert res is None
+
+ # removal is not in categories
+ with pytest.raises(ValueError):
+ cat.remove_categories(["c"])
+
+ def test_remove_unused_categories(self):
+ c = Categorical(["a", "b", "c", "d", "a"],
+ categories=["a", "b", "c", "d", "e"])
+ exp_categories_all = Index(["a", "b", "c", "d", "e"])
+ exp_categories_dropped = Index(["a", "b", "c", "d"])
+
+ tm.assert_index_equal(c.categories, exp_categories_all)
+
+ res = c.remove_unused_categories()
+ tm.assert_index_equal(res.categories, exp_categories_dropped)
+ tm.assert_index_equal(c.categories, exp_categories_all)
+
+ res = c.remove_unused_categories(inplace=True)
+ tm.assert_index_equal(c.categories, exp_categories_dropped)
+ assert res is None
+
+ # with NaN values (GH11599)
+ c = Categorical(["a", "b", "c", np.nan],
+ categories=["a", "b", "c", "d", "e"])
+ res = c.remove_unused_categories()
+ tm.assert_index_equal(res.categories,
+ Index(np.array(["a", "b", "c"])))
+ exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
+ tm.assert_numpy_array_equal(res.codes, exp_codes)
+ tm.assert_index_equal(c.categories, exp_categories_all)
+
+ val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan]
+ cat = Categorical(values=val, categories=list('ABCDEFG'))
+ out = cat.remove_unused_categories()
+ tm.assert_index_equal(out.categories, Index(['B', 'D', 'F']))
+ exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
+ tm.assert_numpy_array_equal(out.codes, exp_codes)
+ assert out.get_values().tolist() == val
+
+ alpha = list('abcdefghijklmnopqrstuvwxyz')
+ val = np.random.choice(alpha[::2], 10000).astype('object')
+ val[np.random.choice(len(val), 100)] = np.nan
+
+ cat = Categorical(values=val, categories=alpha)
+ out = cat.remove_unused_categories()
+ assert out.get_values().tolist() == val.tolist()
+
+
+class TestCategoricalAPIWithFactor(TestCategorical):
+
+ def test_describe(self):
+ # string type
+ desc = self.factor.describe()
+ assert self.factor.ordered
+ exp_index = CategoricalIndex(['a', 'b', 'c'], name='categories',
+ ordered=self.factor.ordered)
+ expected = DataFrame({'counts': [3, 2, 3],
+ 'freqs': [3 / 8., 2 / 8., 3 / 8.]},
+ index=exp_index)
+ tm.assert_frame_equal(desc, expected)
+
+ # check unused categories
+ cat = self.factor.copy()
+ cat.set_categories(["a", "b", "c", "d"], inplace=True)
+ desc = cat.describe()
+
+ exp_index = CategoricalIndex(
+ list('abcd'), ordered=self.factor.ordered, name='categories')
+ expected = DataFrame({'counts': [3, 2, 3, 0],
+ 'freqs': [3 / 8., 2 / 8., 3 / 8., 0]},
+ index=exp_index)
+ tm.assert_frame_equal(desc, expected)
+
+ # check an integer one
+ cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
+ desc = cat.describe()
+ exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered,
+ name='categories')
+ expected = DataFrame({'counts': [5, 3, 3],
+ 'freqs': [5 / 11., 3 / 11., 3 / 11.]},
+ index=exp_index)
+ tm.assert_frame_equal(desc, expected)
+
+ # https://github.com/pandas-dev/pandas/issues/3678
+ # describe should work with NaN
+ cat = Categorical([np.nan, 1, 2, 2])
+ desc = cat.describe()
+ expected = DataFrame({'counts': [1, 2, 1],
+ 'freqs': [1 / 4., 2 / 4., 1 / 4.]},
+ index=CategoricalIndex([1, 2, np.nan],
+ categories=[1, 2],
+ name='categories'))
+ tm.assert_frame_equal(desc, expected)
+
+ def test_set_categories_inplace(self):
+ cat = self.factor.copy()
+ cat.set_categories(['a', 'b', 'c', 'd'], inplace=True)
+ tm.assert_index_equal(cat.categories, Index(['a', 'b', 'c', 'd']))
+
+
+class TestPrivateCategoricalAPI(object):
+
+ def test_codes_immutable(self):
+
+ # Codes should be read only
+ c = Categorical(["a", "b", "c", "a", np.nan])
+ exp = np.array([0, 1, 2, 0, -1], dtype='int8')
+ tm.assert_numpy_array_equal(c.codes, exp)
+
+ # Assignments to codes should raise
+ with pytest.raises(ValueError):
+ c.codes = np.array([0, 1, 2, 0, 1], dtype='int8')
+
+ # changes in the codes array should raise
+ codes = c.codes
+
+ with pytest.raises(ValueError):
+ codes[4] = 1
+
+ # But even after getting the codes, the original array should still be
+ # writeable!
+ c[4] = "a"
+ exp = np.array([0, 1, 2, 0, 0], dtype='int8')
+ tm.assert_numpy_array_equal(c.codes, exp)
+ c._codes[4] = 2
+ exp = np.array([0, 1, 2, 0, 2], dtype='int8')
+ tm.assert_numpy_array_equal(c.codes, exp)
+
+ @pytest.mark.parametrize('codes, old, new, expected', [
+ ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]),
+ ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]),
+ ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]),
+ ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]),
+ ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]),
+ ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]),
+ ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]),
+ ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]),
+ ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]),
+ ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]),
+ ([-1, -1], [], ['a', 'b'], [-1, -1]),
+ ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]),
+ ])
+ def test_recode_to_categories(self, codes, old, new, expected):
+ codes = np.asanyarray(codes, dtype=np.int8)
+ expected = np.asanyarray(expected, dtype=np.int8)
+ old = Index(old)
+ new = Index(new)
+ result = _recode_for_categories(codes, old, new)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_recode_to_categories_large(self):
+ N = 1000
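+        # 1000 categories exceed the int8 code range (at most 127
+        # categories fit), so the recoded codes come back as int16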
+ codes = np.arange(N)
+ old = Index(codes)
+ expected = np.arange(N - 1, -1, -1, dtype=np.int16)
+ new = Index(expected)
+ result = _recode_for_categories(codes, old, new)
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_constructors.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_constructors.py
new file mode 100644
index 00000000000..f07e3aba53c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_constructors.py
@@ -0,0 +1,586 @@
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import (
+ Categorical, CategoricalIndex, DatetimeIndex, Index, Interval,
+ IntervalIndex, NaT, Series, Timestamp, date_range, period_range,
+ timedelta_range)
+import pandas.util.testing as tm
+
+
+class TestCategoricalConstructors(object):
+
+ def test_validate_ordered(self):
+ # see gh-14058
+ exp_msg = "'ordered' must either be 'True' or 'False'"
+ exp_err = TypeError
+
+ # This should be a boolean.
+ ordered = np.array([0, 1, 2])
+
+ with pytest.raises(exp_err, match=exp_msg):
+ Categorical([1, 2, 3], ordered=ordered)
+
+ with pytest.raises(exp_err, match=exp_msg):
+ Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'],
+ ordered=ordered)
+
+ def test_constructor_empty(self):
+ # GH 17248
+ c = Categorical([])
+ expected = Index([])
+ tm.assert_index_equal(c.categories, expected)
+
+ c = Categorical([], categories=[1, 2, 3])
+ expected = pd.Int64Index([1, 2, 3])
+ tm.assert_index_equal(c.categories, expected)
+
+ def test_constructor_empty_boolean(self):
+ # see gh-22702
+ cat = pd.Categorical([], categories=[True, False])
+ categories = sorted(cat.categories.tolist())
+ assert categories == [False, True]
+
+ def test_constructor_tuples(self):
+ values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
+ result = Categorical(values)
+ expected = Index([(1,), (1, 2)], tupleize_cols=False)
+ tm.assert_index_equal(result.categories, expected)
+ assert result.ordered is False
+
+ def test_constructor_tuples_datetimes(self):
+ # numpy will auto reshape when all of the tuples are the
+ # same len, so add an extra one with 2 items and slice it off
+ values = np.array([(Timestamp('2010-01-01'),),
+ (Timestamp('2010-01-02'),),
+ (Timestamp('2010-01-01'),),
+ (Timestamp('2010-01-02'),),
+ ('a', 'b')], dtype=object)[:-1]
+ result = Categorical(values)
+ expected = Index([(Timestamp('2010-01-01'),),
+ (Timestamp('2010-01-02'),)], tupleize_cols=False)
+ tm.assert_index_equal(result.categories, expected)
+
+ def test_constructor_unsortable(self):
+
+ # it works!
+ arr = np.array([1, 2, 3, datetime.now()], dtype='O')
+ factor = Categorical(arr, ordered=False)
+ assert not factor.ordered
+
+ # this however will raise as cannot be sorted
+ msg = ("'values' is not ordered, please explicitly specify the "
+ "categories order by passing in a categories argument.")
+ with pytest.raises(TypeError, match=msg):
+ Categorical(arr, ordered=True)
+
+ def test_constructor_interval(self):
+ result = Categorical([Interval(1, 2), Interval(2, 3), Interval(3, 6)],
+ ordered=True)
+ ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)])
+ exp = Categorical(ii, ordered=True)
+ tm.assert_categorical_equal(result, exp)
+ tm.assert_index_equal(result.categories, ii)
+
+ def test_constructor(self):
+
+ exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_)
+ c1 = Categorical(exp_arr)
+ tm.assert_numpy_array_equal(c1.__array__(), exp_arr)
+ c2 = Categorical(exp_arr, categories=["a", "b", "c"])
+ tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
+ c2 = Categorical(exp_arr, categories=["c", "b", "a"])
+ tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
+
+ # categories must be unique
+ msg = "Categorical categories must be unique"
+ with pytest.raises(ValueError, match=msg):
+ Categorical([1, 2], [1, 2, 2])
+
+ with pytest.raises(ValueError, match=msg):
+ Categorical(["a", "b"], ["a", "b", "b"])
+
+ # The default should be unordered
+ c1 = Categorical(["a", "b", "c", "a"])
+ assert not c1.ordered
+
+ # Categorical as input
+ c1 = Categorical(["a", "b", "c", "a"])
+ c2 = Categorical(c1)
+ tm.assert_categorical_equal(c1, c2)
+
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
+ c2 = Categorical(c1)
+ tm.assert_categorical_equal(c1, c2)
+
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
+ c2 = Categorical(c1)
+ tm.assert_categorical_equal(c1, c2)
+
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
+ c2 = Categorical(c1, categories=["a", "b", "c"])
+ tm.assert_numpy_array_equal(c1.__array__(), c2.__array__())
+ tm.assert_index_equal(c2.categories, Index(["a", "b", "c"]))
+
+ # Series of dtype category
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
+ c2 = Categorical(Series(c1))
+ tm.assert_categorical_equal(c1, c2)
+
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
+ c2 = Categorical(Series(c1))
+ tm.assert_categorical_equal(c1, c2)
+
+ # Series
+ c1 = Categorical(["a", "b", "c", "a"])
+ c2 = Categorical(Series(["a", "b", "c", "a"]))
+ tm.assert_categorical_equal(c1, c2)
+
+ c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
+ c2 = Categorical(Series(["a", "b", "c", "a"]),
+ categories=["a", "b", "c", "d"])
+ tm.assert_categorical_equal(c1, c2)
+
+ # This should result in integer categories, not float!
+ cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
+ assert is_integer_dtype(cat.categories)
+
+ # https://github.com/pandas-dev/pandas/issues/3678
+ cat = Categorical([np.nan, 1, 2, 3])
+ assert is_integer_dtype(cat.categories)
+
+ # this should result in floats
+ cat = Categorical([np.nan, 1, 2., 3])
+ assert is_float_dtype(cat.categories)
+
+ cat = Categorical([np.nan, 1., 2., 3.])
+ assert is_float_dtype(cat.categories)
+
+ # This doesn't work -> this would probably need some kind of "remember
+ # the original type" feature to try to cast the array interface result
+ # to...
+
+ # vals = np.asarray(cat[cat.notna()])
+ # assert is_integer_dtype(vals)
+
+ # corner cases
+ cat = Categorical([1])
+ assert len(cat.categories) == 1
+ assert cat.categories[0] == 1
+ assert len(cat.codes) == 1
+ assert cat.codes[0] == 0
+
+ cat = Categorical(["a"])
+ assert len(cat.categories) == 1
+ assert cat.categories[0] == "a"
+ assert len(cat.codes) == 1
+ assert cat.codes[0] == 0
+
+ # Scalars should be converted to lists
+ cat = Categorical(1)
+ assert len(cat.categories) == 1
+ assert cat.categories[0] == 1
+ assert len(cat.codes) == 1
+ assert cat.codes[0] == 0
+
+ # two arrays
+ # - when the first is an integer dtype and the second is not
+ # - when the resulting codes are all -1/NaN
+ with tm.assert_produces_warning(None):
+ c_old = Categorical([0, 1, 2, 0, 1, 2],
+ categories=["a", "b", "c"]) # noqa
+
+ with tm.assert_produces_warning(None):
+ c_old = Categorical([0, 1, 2, 0, 1, 2], # noqa
+ categories=[3, 4, 5])
+
+        # the next ones are from the old docs
+ with tm.assert_produces_warning(None):
+ c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) # noqa
+ cat = Categorical([1, 2], categories=[1, 2, 3])
+
+ # this is a legitimate constructor
+ with tm.assert_produces_warning(None):
+ c = Categorical(np.array([], dtype='int64'), # noqa
+ categories=[3, 2, 1], ordered=True)
+
+ def test_constructor_with_existing_categories(self):
+ # GH25318: constructing with pd.Series used to bogusly skip recoding
+ # categories
+ c0 = Categorical(["a", "b", "c", "a"])
+ c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
+
+ c2 = Categorical(c0, categories=c1.categories)
+ tm.assert_categorical_equal(c1, c2)
+
+ c3 = Categorical(Series(c0), categories=c1.categories)
+ tm.assert_categorical_equal(c1, c3)
+
+ def test_constructor_not_sequence(self):
+ # https://github.com/pandas-dev/pandas/issues/16022
+ msg = r"^Parameter 'categories' must be list-like, was"
+ with pytest.raises(TypeError, match=msg):
+ Categorical(['a', 'b'], categories='a')
+
+ def test_constructor_with_null(self):
+
+ # Cannot have NaN in categories
+ msg = "Categorial categories cannot be null"
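+        # note: the "Categorial" misspelling matches the message that
+        # pandas itself raises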
+ with pytest.raises(ValueError, match=msg):
+ Categorical([np.nan, "a", "b", "c"],
+ categories=[np.nan, "a", "b", "c"])
+
+ with pytest.raises(ValueError, match=msg):
+ Categorical([None, "a", "b", "c"],
+ categories=[None, "a", "b", "c"])
+
+ with pytest.raises(ValueError, match=msg):
+ Categorical(DatetimeIndex(['nat', '20160101']),
+ categories=[NaT, Timestamp('20160101')])
+
+ def test_constructor_with_index(self):
+ ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
+ tm.assert_categorical_equal(ci.values, Categorical(ci))
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
+ tm.assert_categorical_equal(ci.values,
+ Categorical(ci.astype(object),
+ categories=ci.categories))
+
+ def test_constructor_with_generator(self):
+ # This was raising an Error in isna(single_val).any() because isna
+ # returned a scalar for a generator
+ xrange = range
+
+ exp = Categorical([0, 1, 2])
+ cat = Categorical((x for x in [0, 1, 2]))
+ tm.assert_categorical_equal(cat, exp)
+ cat = Categorical(xrange(3))
+ tm.assert_categorical_equal(cat, exp)
+
+ # This uses xrange internally
+ from pandas.core.index import MultiIndex
+ MultiIndex.from_product([range(5), ['a', 'b', 'c']])
+
+ # check that categories accept generators and sequences
+ cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
+ tm.assert_categorical_equal(cat, exp)
+ cat = Categorical([0, 1, 2], categories=xrange(3))
+ tm.assert_categorical_equal(cat, exp)
+
+ @pytest.mark.parametrize("dtl", [
+ date_range("1995-01-01 00:00:00", periods=5, freq="s"),
+ date_range("1995-01-01 00:00:00", periods=5,
+ freq="s", tz="US/Eastern"),
+ timedelta_range("1 day", periods=5, freq="s")
+ ])
+ def test_constructor_with_datetimelike(self, dtl):
+ # see gh-12077
+ # constructor with a datetimelike and NaT
+
+ s = Series(dtl)
+ c = Categorical(s)
+
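+        # the categories keep the same values but drop the original freq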
+ expected = type(dtl)(s)
+ expected.freq = None
+
+ tm.assert_index_equal(c.categories, expected)
+ tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8"))
+
+ # with NaT
+ s2 = s.copy()
+ s2.iloc[-1] = NaT
+ c = Categorical(s2)
+
+ expected = type(dtl)(s2.dropna())
+ expected.freq = None
+
+ tm.assert_index_equal(c.categories, expected)
+
+ exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
+ tm.assert_numpy_array_equal(c.codes, exp)
+
+ result = repr(c)
+ assert "NaT" in result
+
+ def test_constructor_from_index_series_datetimetz(self):
+ idx = date_range('2015-01-01 10:00', freq='D', periods=3,
+ tz='US/Eastern')
+ result = Categorical(idx)
+ tm.assert_index_equal(result.categories, idx)
+
+ result = Categorical(Series(idx))
+ tm.assert_index_equal(result.categories, idx)
+
+ def test_constructor_from_index_series_timedelta(self):
+ idx = timedelta_range('1 days', freq='D', periods=3)
+ result = Categorical(idx)
+ tm.assert_index_equal(result.categories, idx)
+
+ result = Categorical(Series(idx))
+ tm.assert_index_equal(result.categories, idx)
+
+ def test_constructor_from_index_series_period(self):
+ idx = period_range('2015-01-01', freq='D', periods=3)
+ result = Categorical(idx)
+ tm.assert_index_equal(result.categories, idx)
+
+ result = Categorical(Series(idx))
+ tm.assert_index_equal(result.categories, idx)
+
+ def test_constructor_invariant(self):
+ # GH 14190
+ vals = [
+ np.array([1., 1.2, 1.8, np.nan]),
+ np.array([1, 2, 3], dtype='int64'),
+ ['a', 'b', 'c', np.nan],
+ [pd.Period('2014-01'), pd.Period('2014-02'), NaT],
+ [Timestamp('2014-01-01'), Timestamp('2014-01-02'), NaT],
+ [Timestamp('2014-01-01', tz='US/Eastern'),
+ Timestamp('2014-01-02', tz='US/Eastern'), NaT],
+ ]
+ for val in vals:
+ c = Categorical(val)
+ c2 = Categorical(c)
+ tm.assert_categorical_equal(c, c2)
+
+ @pytest.mark.parametrize('ordered', [True, False])
+ def test_constructor_with_dtype(self, ordered):
+ categories = ['b', 'a', 'c']
+ dtype = CategoricalDtype(categories, ordered=ordered)
+ result = Categorical(['a', 'b', 'a', 'c'], dtype=dtype)
+ expected = Categorical(['a', 'b', 'a', 'c'], categories=categories,
+ ordered=ordered)
+ tm.assert_categorical_equal(result, expected)
+ assert result.ordered is ordered
+
+ def test_constructor_dtype_and_others_raises(self):
+ dtype = CategoricalDtype(['a', 'b'], ordered=True)
+ msg = "Cannot specify `categories` or `ordered` together with `dtype`."
+ with pytest.raises(ValueError, match=msg):
+ Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype)
+
+ with pytest.raises(ValueError, match=msg):
+ Categorical(['a', 'b'], ordered=True, dtype=dtype)
+
+ with pytest.raises(ValueError, match=msg):
+ Categorical(['a', 'b'], ordered=False, dtype=dtype)
+
+ @pytest.mark.parametrize('categories', [
+ None, ['a', 'b'], ['a', 'c'],
+ ])
+ @pytest.mark.parametrize('ordered', [True, False])
+ def test_constructor_str_category(self, categories, ordered):
+ result = Categorical(['a', 'b'], categories=categories,
+ ordered=ordered, dtype='category')
+ expected = Categorical(['a', 'b'], categories=categories,
+ ordered=ordered)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_constructor_str_unknown(self):
+ with pytest.raises(ValueError, match="Unknown dtype"):
+ Categorical([1, 2], dtype="foo")
+
+ def test_constructor_from_categorical_with_dtype(self):
+ dtype = CategoricalDtype(['a', 'b', 'c'], ordered=True)
+ values = Categorical(['a', 'b', 'd'])
+ result = Categorical(values, dtype=dtype)
+ # We use dtype.categories, not values.categories
+ expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'],
+ ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_constructor_from_categorical_with_unknown_dtype(self):
+ dtype = CategoricalDtype(None, ordered=True)
+ values = Categorical(['a', 'b', 'd'])
+ result = Categorical(values, dtype=dtype)
+ # We use values.categories, not dtype.categories
+ expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'd'],
+ ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_constructor_from_categorical_string(self):
+ values = Categorical(['a', 'b', 'd'])
+ # use categories, ordered
+ result = Categorical(values, categories=['a', 'b', 'c'], ordered=True,
+ dtype='category')
+ expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'],
+ ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+        # the same, without the dtype='category' string
+ result = Categorical(values, categories=['a', 'b', 'c'], ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_constructor_with_categorical_categories(self):
+ # GH17884
+ expected = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+
+ result = Categorical(
+ ['a', 'b'], categories=Categorical(['a', 'b', 'c']))
+ tm.assert_categorical_equal(result, expected)
+
+ result = Categorical(
+ ['a', 'b'], categories=CategoricalIndex(['a', 'b', 'c']))
+ tm.assert_categorical_equal(result, expected)
+
+ def test_from_codes(self):
+
+ # too few categories
+ dtype = CategoricalDtype(categories=[1, 2])
+ msg = "codes need to be between "
+ with pytest.raises(ValueError, match=msg):
+ Categorical.from_codes([1, 2], categories=dtype.categories)
+ with pytest.raises(ValueError, match=msg):
+ Categorical.from_codes([1, 2], dtype=dtype)
+
+ # no int codes
+ msg = "codes need to be array-like integers"
+ with pytest.raises(ValueError, match=msg):
+ Categorical.from_codes(["a"], categories=dtype.categories)
+ with pytest.raises(ValueError, match=msg):
+ Categorical.from_codes(["a"], dtype=dtype)
+
+ # no unique categories
+ with pytest.raises(ValueError,
+ match="Categorical categories must be unique"):
+ Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])
+
+ # NaN categories included
+ with pytest.raises(ValueError,
+ match="Categorial categories cannot be null"):
+ Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])
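+        # ("Categorial" above is intentional: the match string mirrors
+        # the typo in the message this pandas version actually raises.)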
+
+ # too negative
+ dtype = CategoricalDtype(categories=["a", "b", "c"])
+ msg = r"codes need to be between -1 and len\(categories\)-1"
+ with pytest.raises(ValueError, match=msg):
+ Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
+ with pytest.raises(ValueError, match=msg):
+ Categorical.from_codes([-2, 1, 2], dtype=dtype)
+
+ exp = Categorical(["a", "b", "c"], ordered=False)
+ res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
+ tm.assert_categorical_equal(exp, res)
+
+ res = Categorical.from_codes([0, 1, 2], dtype=dtype)
+ tm.assert_categorical_equal(exp, res)
+
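+    def test_from_codes_minus_one_is_missing(self):
+        # Editor's sketch, not part of the upstream suite: -1 is the
+        # missing-value sentinel, which is why the valid range checked
+        # above runs from -1 through len(categories) - 1.
+        cat = Categorical.from_codes([0, -1, 1], categories=['a', 'b'])
+        assert list(cat.codes) == [0, -1, 1]
+        assert pd.isna(np.asarray(cat)[1])
+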
+ def test_from_codes_with_categorical_categories(self):
+ # GH17884
+ expected = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+
+ result = Categorical.from_codes(
+ [0, 1], categories=Categorical(['a', 'b', 'c']))
+ tm.assert_categorical_equal(result, expected)
+
+ result = Categorical.from_codes(
+ [0, 1], categories=CategoricalIndex(['a', 'b', 'c']))
+ tm.assert_categorical_equal(result, expected)
+
+ # non-unique Categorical still raises
+ with pytest.raises(ValueError,
+ match="Categorical categories must be unique"):
+ Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a']))
+
+ def test_from_codes_with_nan_code(self):
+ # GH21767
+ codes = [1, 2, np.nan]
+ dtype = CategoricalDtype(categories=['a', 'b', 'c'])
+ with pytest.raises(ValueError,
+ match="codes need to be array-like integers"):
+ Categorical.from_codes(codes, categories=dtype.categories)
+ with pytest.raises(ValueError,
+ match="codes need to be array-like integers"):
+ Categorical.from_codes(codes, dtype=dtype)
+
+ def test_from_codes_with_float(self):
+ # GH21767
+ codes = [1.0, 2.0, 0] # integer, but in float dtype
+ dtype = CategoricalDtype(categories=['a', 'b', 'c'])
+
+ with tm.assert_produces_warning(FutureWarning):
+ cat = Categorical.from_codes(codes, dtype.categories)
+ tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))
+
+ with tm.assert_produces_warning(FutureWarning):
+ cat = Categorical.from_codes(codes, dtype=dtype)
+ tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))
+
+ codes = [1.1, 2.0, 0] # non-integer
+ with pytest.raises(ValueError,
+ match="codes need to be array-like integers"):
+ Categorical.from_codes(codes, dtype.categories)
+ with pytest.raises(ValueError,
+ match="codes need to be array-like integers"):
+ Categorical.from_codes(codes, dtype=dtype)
+
+ def test_from_codes_with_dtype_raises(self):
+ msg = 'Cannot specify'
+ with pytest.raises(ValueError, match=msg):
+ Categorical.from_codes([0, 1], categories=['a', 'b'],
+ dtype=CategoricalDtype(['a', 'b']))
+
+ with pytest.raises(ValueError, match=msg):
+ Categorical.from_codes([0, 1], ordered=True,
+ dtype=CategoricalDtype(['a', 'b']))
+
+ def test_from_codes_neither(self):
+ msg = "Both were None"
+ with pytest.raises(ValueError, match=msg):
+ Categorical.from_codes([0, 1])
+
+ @pytest.mark.parametrize('dtype', [None, 'category'])
+ def test_from_inferred_categories(self, dtype):
+ cats = ['a', 'b']
+ codes = np.array([0, 0, 1, 1], dtype='i8')
+ result = Categorical._from_inferred_categories(cats, codes, dtype)
+ expected = Categorical.from_codes(codes, cats)
+ tm.assert_categorical_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', [None, 'category'])
+ def test_from_inferred_categories_sorts(self, dtype):
+ cats = ['b', 'a']
+ codes = np.array([0, 1, 1, 1], dtype='i8')
+ result = Categorical._from_inferred_categories(cats, codes, dtype)
+ expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b'])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_from_inferred_categories_dtype(self):
+ cats = ['a', 'b', 'd']
+ codes = np.array([0, 1, 0, 2], dtype='i8')
+ dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True)
+ result = Categorical._from_inferred_categories(cats, codes, dtype)
+ expected = Categorical(['a', 'b', 'a', 'd'],
+ categories=['c', 'b', 'a'],
+ ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_from_inferred_categories_coerces(self):
+ cats = ['1', '2', 'bad']
+ codes = np.array([0, 0, 1, 2], dtype='i8')
+ dtype = CategoricalDtype([1, 2])
+ result = Categorical._from_inferred_categories(cats, codes, dtype)
+ expected = Categorical([1, 1, 2, np.nan])
+ tm.assert_categorical_equal(result, expected)
+
+ @pytest.mark.parametrize('ordered', [None, True, False])
+ def test_construction_with_ordered(self, ordered):
+ # GH 9347, 9190
+ cat = Categorical([0, 1, 2], ordered=ordered)
+ assert cat.ordered == bool(ordered)
+
+ @pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
+ def test_constructor_imaginary(self):
+ values = [1, 2, 3 + 1j]
+ c1 = Categorical(values)
+ tm.assert_index_equal(c1.categories, Index(values))
+ tm.assert_numpy_array_equal(np.array(c1), np.array(values))
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_dtypes.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_dtypes.py
new file mode 100644
index 00000000000..66f08355e75
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_dtypes.py
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import pytest
+
+from pandas.compat import long
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+from pandas import Categorical, CategoricalIndex, Index, Series, Timestamp
+import pandas.util.testing as tm
+
+
+class TestCategoricalDtypes(object):
+
+ def test_is_equal_dtype(self):
+
+ # test dtype comparisons between cats
+
+ c1 = Categorical(list('aabca'), categories=list('abc'), ordered=False)
+ c2 = Categorical(list('aabca'), categories=list('cab'), ordered=False)
+ c3 = Categorical(list('aabca'), categories=list('cab'), ordered=True)
+ assert c1.is_dtype_equal(c1)
+ assert c2.is_dtype_equal(c2)
+ assert c3.is_dtype_equal(c3)
+ assert c1.is_dtype_equal(c2)
+ assert not c1.is_dtype_equal(c3)
+ assert not c1.is_dtype_equal(Index(list('aabca')))
+ assert not c1.is_dtype_equal(c1.astype(object))
+ assert c1.is_dtype_equal(CategoricalIndex(c1))
+ assert (c1.is_dtype_equal(
+ CategoricalIndex(c1, categories=list('cab'))))
+ assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True))
+
+ # GH 16659
+ s1 = Series(c1)
+ s2 = Series(c2)
+ s3 = Series(c3)
+ assert c1.is_dtype_equal(s1)
+ assert c2.is_dtype_equal(s2)
+ assert c3.is_dtype_equal(s3)
+ assert c1.is_dtype_equal(s2)
+ assert not c1.is_dtype_equal(s3)
+ assert not c1.is_dtype_equal(s1.astype(object))
+
+ def test_set_dtype_same(self):
+ c = Categorical(['a', 'b', 'c'])
+ result = c._set_dtype(CategoricalDtype(['a', 'b', 'c']))
+ tm.assert_categorical_equal(result, c)
+
+ def test_set_dtype_new_categories(self):
+ c = Categorical(['a', 'b', 'c'])
+ result = c._set_dtype(CategoricalDtype(list('abcd')))
+ tm.assert_numpy_array_equal(result.codes, c.codes)
+ tm.assert_index_equal(result.dtype.categories, Index(list('abcd')))
+
+ @pytest.mark.parametrize('values, categories, new_categories', [
+ # No NaNs, same cats, same order
+ (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
+ # No NaNs, same cats, different order
+ (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
+ # Same, unsorted
+ (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
+ # No NaNs, same cats, different order
+ (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
+ # NaNs
+ (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
+ (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
+        (['b', 'a', 'c'], ['a', 'b'], ['b', 'a']),
+ # Introduce NaNs
+ (['a', 'b', 'c'], ['a', 'b'], ['a']),
+ (['a', 'b', 'c'], ['a', 'b'], ['b']),
+ (['b', 'a', 'c'], ['a', 'b'], ['a']),
+        (['b', 'a', 'c'], ['a', 'b'], ['b']),
+ # No overlap
+ (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
+ ])
+ @pytest.mark.parametrize('ordered', [True, False])
+ def test_set_dtype_many(self, values, categories, new_categories,
+ ordered):
+ c = Categorical(values, categories)
+ expected = Categorical(values, new_categories, ordered)
+ result = c._set_dtype(expected.dtype)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_set_dtype_no_overlap(self):
+ c = Categorical(['a', 'b', 'c'], ['d', 'e'])
+ result = c._set_dtype(CategoricalDtype(['a', 'b']))
+ expected = Categorical([None, None, None], categories=['a', 'b'])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_codes_dtypes(self):
+
+ # GH 8453
+ result = Categorical(['foo', 'bar', 'baz'])
+ assert result.codes.dtype == 'int8'
+
+ result = Categorical(['foo%05d' % i for i in range(400)])
+ assert result.codes.dtype == 'int16'
+
+ result = Categorical(['foo%05d' % i for i in range(40000)])
+ assert result.codes.dtype == 'int32'
+
+ # adding cats
+ result = Categorical(['foo', 'bar', 'baz'])
+ assert result.codes.dtype == 'int8'
+ result = result.add_categories(['foo%05d' % i for i in range(400)])
+ assert result.codes.dtype == 'int16'
+
+ # removing cats
+ result = result.remove_categories(['foo%05d' % i for i in range(300)])
+ assert result.codes.dtype == 'int8'
+
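+    def test_codes_dtype_widens_with_category_count(self):
+        # Editor's sketch, not part of the upstream suite: the codes
+        # use the narrowest signed integer type that can index every
+        # category (with -1 reserved for missing), so crossing the
+        # int8 bound widens them.
+        small = Categorical(['a', 'b'])
+        assert small.codes.dtype == 'int8'
+        wide = small.add_categories(['cat%04d' % i for i in range(300)])
+        assert wide.codes.dtype == 'int16'
+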
+ @pytest.mark.parametrize('ordered', [True, False])
+ def test_astype(self, ordered):
+ # string
+ cat = Categorical(list('abbaaccc'), ordered=ordered)
+ result = cat.astype(object)
+ expected = np.array(cat)
+ tm.assert_numpy_array_equal(result, expected)
+
+ msg = 'could not convert string to float'
+ with pytest.raises(ValueError, match=msg):
+ cat.astype(float)
+
+ # numeric
+ cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered)
+ result = cat.astype(object)
+ expected = np.array(cat, dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = cat.astype(int)
+ expected = np.array(cat, dtype=np.int)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = cat.astype(float)
+ expected = np.array(cat, dtype=np.float)
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype_ordered', [True, False])
+ @pytest.mark.parametrize('cat_ordered', [True, False])
+ def test_astype_category(self, dtype_ordered, cat_ordered):
+ # GH 10696/18593
+ data = list('abcaacbab')
+ cat = Categorical(data, categories=list('bac'), ordered=cat_ordered)
+
+ # standard categories
+ dtype = CategoricalDtype(ordered=dtype_ordered)
+ result = cat.astype(dtype)
+ expected = Categorical(
+ data, categories=cat.categories, ordered=dtype_ordered)
+ tm.assert_categorical_equal(result, expected)
+
+ # non-standard categories
+ dtype = CategoricalDtype(list('adc'), dtype_ordered)
+ result = cat.astype(dtype)
+ expected = Categorical(data, dtype=dtype)
+ tm.assert_categorical_equal(result, expected)
+
+ if dtype_ordered is False:
+ # dtype='category' can't specify ordered, so only test once
+ result = cat.astype('category')
+ expected = cat
+ tm.assert_categorical_equal(result, expected)
+
+ def test_iter_python_types(self):
+ # GH-19909
+ # TODO(Py2): Remove long
+ cat = Categorical([1, 2])
+ assert isinstance(list(cat)[0], (int, long))
+ assert isinstance(cat.tolist()[0], (int, long))
+
+ def test_iter_python_types_datetime(self):
+ cat = Categorical([Timestamp('2017-01-01'),
+ Timestamp('2017-01-02')])
+ assert isinstance(list(cat)[0], Timestamp)
+ assert isinstance(cat.tolist()[0], Timestamp)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_indexing.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_indexing.py
new file mode 100644
index 00000000000..294344da7c9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_indexing.py
@@ -0,0 +1,264 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series
+import pandas.core.common as com
+from pandas.tests.arrays.categorical.common import TestCategorical
+import pandas.util.testing as tm
+
+
+class TestCategoricalIndexingWithFactor(TestCategorical):
+
+ def test_getitem(self):
+ assert self.factor[0] == 'a'
+ assert self.factor[-1] == 'c'
+
+ subf = self.factor[[0, 1, 2]]
+ tm.assert_numpy_array_equal(subf._codes,
+ np.array([0, 1, 1], dtype=np.int8))
+
+ subf = self.factor[np.asarray(self.factor) == 'c']
+ tm.assert_numpy_array_equal(subf._codes,
+ np.array([2, 2, 2], dtype=np.int8))
+
+ def test_setitem(self):
+
+ # int/positional
+ c = self.factor.copy()
+ c[0] = 'b'
+ assert c[0] == 'b'
+ c[-1] = 'a'
+ assert c[-1] == 'a'
+
+ # boolean
+ c = self.factor.copy()
+ indexer = np.zeros(len(c), dtype='bool')
+ indexer[0] = True
+ indexer[-1] = True
+ c[indexer] = 'c'
+ expected = Categorical(['c', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
+ ordered=True)
+
+ tm.assert_categorical_equal(c, expected)
+
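+    def test_setitem_new_category_raises(self):
+        # Editor's sketch, not part of the upstream suite: assigning a
+        # value that is not among the categories is rejected rather
+        # than silently growing the categories.
+        c = self.factor.copy()
+        with pytest.raises(ValueError):
+            c[0] = 'd'
+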
+ @pytest.mark.parametrize('other', [
+ pd.Categorical(['b', 'a']),
+ pd.Categorical(['b', 'a'], categories=['b', 'a']),
+ ])
+ def test_setitem_same_but_unordered(self, other):
+ # GH-24142
+ target = pd.Categorical(['a', 'b'], categories=['a', 'b'])
+ mask = np.array([True, False])
+ target[mask] = other[mask]
+ expected = pd.Categorical(['b', 'b'], categories=['a', 'b'])
+ tm.assert_categorical_equal(target, expected)
+
+ @pytest.mark.parametrize('other', [
+ pd.Categorical(['b', 'a'], categories=['b', 'a', 'c']),
+ pd.Categorical(['b', 'a'], categories=['a', 'b', 'c']),
+ pd.Categorical(['a', 'a'], categories=['a']),
+ pd.Categorical(['b', 'b'], categories=['b']),
+ ])
+ def test_setitem_different_unordered_raises(self, other):
+ # GH-24142
+ target = pd.Categorical(['a', 'b'], categories=['a', 'b'])
+ mask = np.array([True, False])
+ with pytest.raises(ValueError):
+ target[mask] = other[mask]
+
+ @pytest.mark.parametrize('other', [
+ pd.Categorical(['b', 'a']),
+ pd.Categorical(['b', 'a'], categories=['b', 'a'], ordered=True),
+ pd.Categorical(['b', 'a'], categories=['a', 'b', 'c'], ordered=True),
+ ])
+    def test_setitem_same_ordered_raises(self, other):
+        # GH-24142
+ target = pd.Categorical(['a', 'b'], categories=['a', 'b'],
+ ordered=True)
+ mask = np.array([True, False])
+
+ with pytest.raises(ValueError):
+ target[mask] = other[mask]
+
+
+class TestCategoricalIndexing(object):
+
+ def test_getitem_listlike(self):
+
+ # GH 9469
+ # properly coerce the input indexers
+ np.random.seed(1)
+ c = Categorical(np.random.randint(0, 5, size=150000).astype(np.int8))
+ result = c.codes[np.array([100000]).astype(np.int64)]
+ expected = c[np.array([100000]).astype(np.int64)].codes
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_periodindex(self):
+ idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
+ '2014-03', '2014-03'], freq='M')
+
+ cat1 = Categorical(idx1)
+ str(cat1)
+ exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8)
+ exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
+ tm.assert_numpy_array_equal(cat1._codes, exp_arr)
+ tm.assert_index_equal(cat1.categories, exp_idx)
+
+ idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
+ '2014-03', '2014-01'], freq='M')
+ cat2 = Categorical(idx2, ordered=True)
+ str(cat2)
+ exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8)
+ exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
+ tm.assert_numpy_array_equal(cat2._codes, exp_arr)
+ tm.assert_index_equal(cat2.categories, exp_idx2)
+
+ idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09',
+ '2013-08', '2013-07', '2013-05'], freq='M')
+ cat3 = Categorical(idx3, ordered=True)
+ exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8)
+ exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09',
+ '2013-10', '2013-11', '2013-12'], freq='M')
+ tm.assert_numpy_array_equal(cat3._codes, exp_arr)
+ tm.assert_index_equal(cat3.categories, exp_idx)
+
+    def test_categories_assignments(self):
+ s = Categorical(["a", "b", "c", "a"])
+ exp = np.array([1, 2, 3, 1], dtype=np.int64)
+ s.categories = [1, 2, 3]
+ tm.assert_numpy_array_equal(s.__array__(), exp)
+ tm.assert_index_equal(s.categories, Index([1, 2, 3]))
+
+ # lengthen
+ with pytest.raises(ValueError):
+ s.categories = [1, 2, 3, 4]
+
+ # shorten
+ with pytest.raises(ValueError):
+ s.categories = [1, 2]
+
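+    def test_categories_assignment_matches_rename(self):
+        # Editor's sketch, not part of the upstream suite: assigning to
+        # .categories is a positional rename, equivalent to
+        # rename_categories with a same-length list.
+        s = Categorical(["a", "b", "c", "a"])
+        result = s.rename_categories([1, 2, 3])
+        s.categories = [1, 2, 3]
+        tm.assert_categorical_equal(result, s)
+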
+ # Combinations of sorted/unique:
+ @pytest.mark.parametrize("idx_values", [[1, 2, 3, 4], [1, 3, 2, 4],
+ [1, 3, 3, 4], [1, 2, 2, 4]])
+ # Combinations of missing/unique
+ @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]])
+ @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex])
+ def test_get_indexer_non_unique(self, idx_values, key_values, key_class):
+ # GH 21448
+ key = key_class(key_values, categories=range(1, 5))
+ # Test for flat index and CategoricalIndex with same/different cats:
+ for dtype in None, 'category', key.dtype:
+ idx = Index(idx_values, dtype=dtype)
+ expected, exp_miss = idx.get_indexer_non_unique(key_values)
+ result, res_miss = idx.get_indexer_non_unique(key)
+
+ tm.assert_numpy_array_equal(expected, result)
+ tm.assert_numpy_array_equal(exp_miss, res_miss)
+
+ def test_where_unobserved_nan(self):
+ ser = pd.Series(pd.Categorical(['a', 'b']))
+ result = ser.where([True, False])
+ expected = pd.Series(pd.Categorical(['a', None],
+ categories=['a', 'b']))
+ tm.assert_series_equal(result, expected)
+
+ # all NA
+ ser = pd.Series(pd.Categorical(['a', 'b']))
+ result = ser.where([False, False])
+ expected = pd.Series(pd.Categorical([None, None],
+ categories=['a', 'b']))
+ tm.assert_series_equal(result, expected)
+
+ def test_where_unobserved_categories(self):
+ ser = pd.Series(
+ Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
+ )
+ result = ser.where([True, True, False], other='b')
+ expected = pd.Series(
+ Categorical(['a', 'b', 'b'], categories=ser.cat.categories)
+ )
+ tm.assert_series_equal(result, expected)
+
+ def test_where_other_categorical(self):
+ ser = pd.Series(
+ Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
+ )
+ other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'])
+ result = ser.where([True, False, True], other)
+ expected = pd.Series(Categorical(['a', 'c', 'c'], dtype=ser.dtype))
+ tm.assert_series_equal(result, expected)
+
+ def test_where_warns(self):
+ ser = pd.Series(Categorical(['a', 'b', 'c']))
+ with tm.assert_produces_warning(FutureWarning):
+ result = ser.where([True, False, True], 'd')
+
+ expected = pd.Series(np.array(['a', 'd', 'c'], dtype='object'))
+ tm.assert_series_equal(result, expected)
+
+    def test_where_ordered_differs_raises(self):
+ ser = pd.Series(
+ Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'],
+ ordered=True)
+ )
+ other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'],
+ ordered=True)
+ with tm.assert_produces_warning(FutureWarning):
+ result = ser.where([True, False, True], other)
+
+ expected = pd.Series(np.array(['a', 'c', 'c'], dtype=object))
+ tm.assert_series_equal(result, expected)
+
+
[email protected]("index", [True, False])
+def test_mask_with_boolean(index):
+ s = Series(range(3))
+ idx = Categorical([True, False, True])
+ if index:
+ idx = CategoricalIndex(idx)
+
+ assert com.is_bool_indexer(idx)
+ result = s[idx]
+ expected = s[idx.astype('object')]
+ tm.assert_series_equal(result, expected)
+
+
[email protected]("index", [True, False])
+def test_mask_with_boolean_raises(index):
+ s = Series(range(3))
+ idx = Categorical([True, False, None])
+ if index:
+ idx = CategoricalIndex(idx)
+
+ with pytest.raises(ValueError, match='NA / NaN'):
+ s[idx]
+
+
[email protected]
+def non_coercible_categorical(monkeypatch):
+ """
+ Monkeypatch Categorical.__array__ to ensure no implicit conversion.
+
+ Raises
+ ------
+ ValueError
+ When Categorical.__array__ is called.
+ """
+ # TODO(Categorical): identify other places where this may be
+ # useful and move to a conftest.py
+ def array(self, dtype=None):
+ raise ValueError("I cannot be converted.")
+
+ with monkeypatch.context() as m:
+ m.setattr(Categorical, "__array__", array)
+ yield
+
+
+def test_series_at(non_coercible_categorical):
+ arr = Categorical(['a', 'b', 'c'])
+ ser = Series(arr)
+ result = ser.at[0]
+ assert result == 'a'
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_missing.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_missing.py
new file mode 100644
index 00000000000..b4b361dabac
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_missing.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+import collections
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+from pandas import Categorical, Index, isna
+import pandas.util.testing as tm
+
+
+class TestCategoricalMissing(object):
+
+ def test_na_flags_int_categories(self):
+ # #1457
+
+ categories = lrange(10)
+ labels = np.random.randint(0, 10, 20)
+ labels[::5] = -1
+
+ cat = Categorical(labels, categories, fastpath=True)
+ repr(cat)
+
+ tm.assert_numpy_array_equal(isna(cat), labels == -1)
+
+ def test_nan_handling(self):
+
+ # Nans are represented as -1 in codes
+ c = Categorical(["a", "b", np.nan, "a"])
+ tm.assert_index_equal(c.categories, Index(["a", "b"]))
+ tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0],
+ dtype=np.int8))
+ c[1] = np.nan
+ tm.assert_index_equal(c.categories, Index(["a", "b"]))
+ tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0],
+ dtype=np.int8))
+
+        # np.nan is never added as a category; a missing value in the
+        # input stays encoded as the -1 sentinel
+ c = Categorical(["a", "b", np.nan, "a"])
+ tm.assert_index_equal(c.categories, Index(["a", "b"]))
+ tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0],
+ dtype=np.int8))
+
+ def test_set_dtype_nans(self):
+ c = Categorical(['a', 'b', np.nan])
+ result = c._set_dtype(CategoricalDtype(['a', 'c']))
+ tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1],
+ dtype='int8'))
+
+ def test_set_item_nan(self):
+ cat = Categorical([1, 2, 3])
+ cat[1] = np.nan
+
+ exp = Categorical([1, np.nan, 3], categories=[1, 2, 3])
+ tm.assert_categorical_equal(cat, exp)
+
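+    def test_fillna_with_existing_category(self):
+        # Editor's sketch, not part of the upstream suite: filling a
+        # missing value with an existing category rewrites the -1 code
+        # in place and leaves the categories untouched.
+        cat = Categorical(["a", None, "b"], categories=["a", "b"])
+        result = cat.fillna("a")
+        expected = Categorical(["a", "a", "b"], categories=["a", "b"])
+        tm.assert_categorical_equal(result, expected)
+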
+ @pytest.mark.parametrize('fillna_kwargs, msg', [
+ (dict(value=1, method='ffill'),
+ "Cannot specify both 'value' and 'method'."),
+ (dict(),
+ "Must specify a fill 'value' or 'method'."),
+ (dict(method='bad'),
+ "Invalid fill method. Expecting .* bad"),
+ ])
+ def test_fillna_raises(self, fillna_kwargs, msg):
+ # https://github.com/pandas-dev/pandas/issues/19682
+ cat = Categorical([1, 2, 3])
+
+ with pytest.raises(ValueError, match=msg):
+ cat.fillna(**fillna_kwargs)
+
+ @pytest.mark.parametrize("named", [True, False])
+ def test_fillna_iterable_category(self, named):
+ # https://github.com/pandas-dev/pandas/issues/21097
+ if named:
+ Point = collections.namedtuple("Point", "x y")
+ else:
+ Point = lambda *args: args # tuple
+ cat = Categorical([Point(0, 0), Point(0, 1), None])
+ result = cat.fillna(Point(0, 0))
+ expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)])
+
+ tm.assert_categorical_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_operators.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_operators.py
new file mode 100644
index 00000000000..b2965bbcc45
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_operators.py
@@ -0,0 +1,331 @@
+# -*- coding: utf-8 -*-
+import operator
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Categorical, DataFrame, Series, date_range
+from pandas.tests.arrays.categorical.common import TestCategorical
+import pandas.util.testing as tm
+
+
+class TestCategoricalOpsWithFactor(TestCategorical):
+
+ def test_categories_none_comparisons(self):
+ factor = Categorical(['a', 'b', 'b', 'a',
+ 'a', 'c', 'c', 'c'], ordered=True)
+ tm.assert_categorical_equal(factor, self.factor)
+
+ def test_comparisons(self):
+
+ result = self.factor[self.factor == 'a']
+ expected = self.factor[np.asarray(self.factor) == 'a']
+ tm.assert_categorical_equal(result, expected)
+
+ result = self.factor[self.factor != 'a']
+ expected = self.factor[np.asarray(self.factor) != 'a']
+ tm.assert_categorical_equal(result, expected)
+
+ result = self.factor[self.factor < 'c']
+ expected = self.factor[np.asarray(self.factor) < 'c']
+ tm.assert_categorical_equal(result, expected)
+
+ result = self.factor[self.factor > 'a']
+ expected = self.factor[np.asarray(self.factor) > 'a']
+ tm.assert_categorical_equal(result, expected)
+
+ result = self.factor[self.factor >= 'b']
+ expected = self.factor[np.asarray(self.factor) >= 'b']
+ tm.assert_categorical_equal(result, expected)
+
+ result = self.factor[self.factor <= 'b']
+ expected = self.factor[np.asarray(self.factor) <= 'b']
+ tm.assert_categorical_equal(result, expected)
+
+ n = len(self.factor)
+
+ other = self.factor[np.random.permutation(n)]
+ result = self.factor == other
+ expected = np.asarray(self.factor) == np.asarray(other)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = self.factor == 'd'
+ expected = np.repeat(False, len(self.factor))
+ tm.assert_numpy_array_equal(result, expected)
+
+ # comparisons with categoricals
+ cat_rev = Categorical(
+ ["a", "b", "c"], categories=["c", "b", "a"], ordered=True)
+ cat_rev_base = Categorical(
+ ["b", "b", "b"], categories=["c", "b", "a"], ordered=True)
+ cat = Categorical(["a", "b", "c"], ordered=True)
+ cat_base = Categorical(
+ ["b", "b", "b"], categories=cat.categories, ordered=True)
+
+ # comparisons need to take categories ordering into account
+ res_rev = cat_rev > cat_rev_base
+ exp_rev = np.array([True, False, False])
+ tm.assert_numpy_array_equal(res_rev, exp_rev)
+
+ res_rev = cat_rev < cat_rev_base
+ exp_rev = np.array([False, False, True])
+ tm.assert_numpy_array_equal(res_rev, exp_rev)
+
+ res = cat > cat_base
+ exp = np.array([False, False, True])
+ tm.assert_numpy_array_equal(res, exp)
+
+        # Only categoricals with the same categories can be compared
+ with pytest.raises(TypeError):
+ cat > cat_rev
+
+ cat_rev_base2 = Categorical(
+ ["b", "b", "b"], categories=["c", "b", "a", "d"])
+
+ with pytest.raises(TypeError):
+ cat_rev > cat_rev_base2
+
+        # Only categoricals with the same ordering information can be
+        # compared
+        cat_unordered = cat.set_ordered(False)
+        assert not (cat > cat).any()
+
+        with pytest.raises(TypeError):
+            cat > cat_unordered
+
+ # comparison (in both directions) with Series will raise
+ s = Series(["b", "b", "b"])
+ pytest.raises(TypeError, lambda: cat > s)
+ pytest.raises(TypeError, lambda: cat_rev > s)
+ pytest.raises(TypeError, lambda: s < cat)
+ pytest.raises(TypeError, lambda: s < cat_rev)
+
+        # comparison with a numpy.array will raise in both directions,
+        # but only on newer numpy versions
+ a = np.array(["b", "b", "b"])
+ pytest.raises(TypeError, lambda: cat > a)
+ pytest.raises(TypeError, lambda: cat_rev > a)
+
+        # Make sure that unequal comparisons take the categories' order
+        # into account
+ cat_rev = Categorical(
+ list("abc"), categories=list("cba"), ordered=True)
+ exp = np.array([True, False, False])
+ res = cat_rev > "b"
+ tm.assert_numpy_array_equal(res, exp)
+
+ # check that zero-dim array gets unboxed
+ res = cat_rev > np.array("b")
+ tm.assert_numpy_array_equal(res, exp)
+
+
+class TestCategoricalOps(object):
+
+ def test_compare_frame(self):
+ # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
+ data = ["a", "b", 2, "a"]
+ cat = Categorical(data)
+
+ df = DataFrame(cat)
+
+ for op in [operator.eq, operator.ne, operator.ge,
+ operator.gt, operator.le, operator.lt]:
+ with pytest.raises(ValueError):
+ # alignment raises unless we transpose
+ op(cat, df)
+
+ result = cat == df.T
+ expected = DataFrame([[True, True, True, True]])
+ tm.assert_frame_equal(result, expected)
+
+ result = cat[::-1] != df.T
+ expected = DataFrame([[False, True, True, False]])
+ tm.assert_frame_equal(result, expected)
+
+ def test_datetime_categorical_comparison(self):
+ dt_cat = Categorical(date_range('2014-01-01', periods=3), ordered=True)
+ tm.assert_numpy_array_equal(dt_cat > dt_cat[0],
+ np.array([False, True, True]))
+ tm.assert_numpy_array_equal(dt_cat[0] < dt_cat,
+ np.array([False, True, True]))
+
+ def test_reflected_comparison_with_scalars(self):
+ # GH8658
+ cat = Categorical([1, 2, 3], ordered=True)
+ tm.assert_numpy_array_equal(cat > cat[0],
+ np.array([False, True, True]))
+ tm.assert_numpy_array_equal(cat[0] < cat,
+ np.array([False, True, True]))
+
+ def test_comparison_with_unknown_scalars(self):
+        # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
+        # and following: comparisons with scalars not in categories should
+        # raise for ordering comparisons, but not for equality/inequality
+ cat = Categorical([1, 2, 3], ordered=True)
+
+ pytest.raises(TypeError, lambda: cat < 4)
+ pytest.raises(TypeError, lambda: cat > 4)
+ pytest.raises(TypeError, lambda: 4 < cat)
+ pytest.raises(TypeError, lambda: 4 > cat)
+
+ tm.assert_numpy_array_equal(cat == 4,
+ np.array([False, False, False]))
+ tm.assert_numpy_array_equal(cat != 4,
+ np.array([True, True, True]))
+
+ @pytest.mark.parametrize('data,reverse,base', [
+ (list("abc"), list("cba"), list("bbb")),
+ ([1, 2, 3], [3, 2, 1], [2, 2, 2])]
+ )
+ def test_comparisons(self, data, reverse, base):
+ cat_rev = Series(
+ Categorical(data, categories=reverse, ordered=True))
+ cat_rev_base = Series(
+ Categorical(base, categories=reverse, ordered=True))
+ cat = Series(Categorical(data, ordered=True))
+ cat_base = Series(
+ Categorical(base, categories=cat.cat.categories, ordered=True))
+ s = Series(base)
+ a = np.array(base)
+
+ # comparisons need to take categories ordering into account
+ res_rev = cat_rev > cat_rev_base
+ exp_rev = Series([True, False, False])
+ tm.assert_series_equal(res_rev, exp_rev)
+
+ res_rev = cat_rev < cat_rev_base
+ exp_rev = Series([False, False, True])
+ tm.assert_series_equal(res_rev, exp_rev)
+
+ res = cat > cat_base
+ exp = Series([False, False, True])
+ tm.assert_series_equal(res, exp)
+
+ scalar = base[1]
+ res = cat > scalar
+ exp = Series([False, False, True])
+ exp2 = cat.values > scalar
+ tm.assert_series_equal(res, exp)
+ tm.assert_numpy_array_equal(res.values, exp2)
+ res_rev = cat_rev > scalar
+ exp_rev = Series([True, False, False])
+ exp_rev2 = cat_rev.values > scalar
+ tm.assert_series_equal(res_rev, exp_rev)
+ tm.assert_numpy_array_equal(res_rev.values, exp_rev2)
+
+        # Only categoricals with the same categories can be compared
+ with pytest.raises(TypeError):
+ cat > cat_rev
+
+        # a categorical cannot be compared to a Series or a numpy array,
+        # in either direction
+ pytest.raises(TypeError, lambda: cat > s)
+ pytest.raises(TypeError, lambda: cat_rev > s)
+ pytest.raises(TypeError, lambda: cat > a)
+ pytest.raises(TypeError, lambda: cat_rev > a)
+
+ pytest.raises(TypeError, lambda: s < cat)
+ pytest.raises(TypeError, lambda: s < cat_rev)
+
+ pytest.raises(TypeError, lambda: a < cat)
+ pytest.raises(TypeError, lambda: a < cat_rev)
+
+ @pytest.mark.parametrize('ctor', [
+ lambda *args, **kwargs: Categorical(*args, **kwargs),
+ lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
+ ])
+ def test_unordered_different_order_equal(self, ctor):
+ # https://github.com/pandas-dev/pandas/issues/16014
+ c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
+ c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
+ assert (c1 == c2).all()
+
+ c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
+ c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False)
+ assert (c1 != c2).all()
+
+ c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
+ c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False)
+ assert (c1 != c2).all()
+
+ c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
+ c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
+ result = c1 == c2
+ tm.assert_numpy_array_equal(np.array(result), np.array([True, False]))
+
+ def test_unordered_different_categories_raises(self):
+ c1 = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
+ c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False)
+
+ with pytest.raises(TypeError, match=("Categoricals can "
+ "only be compared")):
+ c1 == c2
+
+ def test_compare_different_lengths(self):
+ c1 = Categorical([], categories=['a', 'b'])
+ c2 = Categorical([], categories=['a'])
+
+ msg = "Categories are different lengths"
+ with pytest.raises(TypeError, match=msg):
+ c1 == c2
+
+ def test_compare_unordered_different_order(self):
+        # https://github.com/pandas-dev/pandas/issues/16603#issuecomment-349290078  # noqa
+ a = pd.Categorical(['a'], categories=['a', 'b'])
+ b = pd.Categorical(['b'], categories=['b', 'a'])
+ assert not a.equals(b)
+
+ def test_numeric_like_ops(self):
+
+ df = DataFrame({'value': np.random.randint(0, 10000, 100)})
+ labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
+ cat_labels = Categorical(labels, labels)
+
+ df = df.sort_values(by=['value'], ascending=True)
+ df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
+ right=False, labels=cat_labels)
+
+ # numeric ops should not succeed
+ for op in ['__add__', '__sub__', '__mul__', '__truediv__']:
+ pytest.raises(TypeError,
+ lambda: getattr(df, op)(df))
+
+ # reduction ops should not succeed (unless specifically defined, e.g.
+ # min/max)
+ s = df['value_group']
+ for op in ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']:
+ pytest.raises(TypeError,
+ lambda: getattr(s, op)(numeric_only=False))
+
+        # mad technically works because it always operates on the numeric data
+
+ # numpy ops
+ s = Series(Categorical([1, 2, 3, 4]))
+ with pytest.raises(TypeError):
+ np.sum(s)
+
+ # numeric ops on a Series
+ for op in ['__add__', '__sub__', '__mul__', '__truediv__']:
+ pytest.raises(TypeError, lambda: getattr(s, op)(2))
+
+ # invalid ufunc
+ with pytest.raises(TypeError):
+ np.log(s)
+
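+    def test_min_max_ordered(self):
+        # Editor's sketch, not part of the upstream suite: unlike the
+        # reductions above, min/max are defined for *ordered*
+        # categoricals, since they need only the category ordering and
+        # no arithmetic.
+        cat = Categorical(list('abc'), ordered=True)
+        assert cat.min() == 'a'
+        assert cat.max() == 'c'
+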
+ def test_contains(self):
+ # GH21508
+ c = pd.Categorical(list('aabbca'), categories=list('cab'))
+
+ assert 'b' in c
+ assert 'z' not in c
+ assert np.nan not in c
+ with pytest.raises(TypeError):
+ assert [1] in c
+
+        # the integer codes themselves are NOT treated as values
+ assert 0 not in c
+ assert 1 not in c
+
+ c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab'))
+ assert np.nan in c
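+
+    def test_contains_checks_values_not_codes(self):
+        # Editor's sketch, not part of the upstream suite: membership
+        # is resolved through the categories and observed values, never
+        # the integer codes, which is what test_contains relies on.
+        c = pd.Categorical(list('ab'), categories=list('abc'))
+        assert 'c' in c.categories
+        assert 'c' not in c  # unused category, never observed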
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_repr.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_repr.py
new file mode 100644
index 00000000000..08b32a216ff
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_repr.py
@@ -0,0 +1,529 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+from pandas.compat import PY3, u
+
+from pandas import (
+ Categorical, CategoricalIndex, Series, date_range, period_range,
+ timedelta_range)
+from pandas.core.config import option_context
+from pandas.tests.arrays.categorical.common import TestCategorical
+
+
+class TestCategoricalReprWithFactor(TestCategorical):
+
+ def test_print(self):
+ expected = ["[a, b, b, a, a, c, c, c]",
+ "Categories (3, object): [a < b < c]"]
+ expected = "\n".join(expected)
+ actual = repr(self.factor)
+ assert actual == expected
+
+
+class TestCategoricalRepr(object):
+
+ def test_big_print(self):
+ factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ['a', 'b', 'c'],
+ fastpath=True)
+ expected = ["[a, b, c, a, b, ..., b, c, a, b, c]", "Length: 600",
+ "Categories (3, object): [a, b, c]"]
+ expected = "\n".join(expected)
+
+ actual = repr(factor)
+
+ assert actual == expected
+
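+    def test_print_short_no_length_line(self):
+        # Editor's sketch, not part of the upstream suite: the
+        # "Length: N" line only appears once the repr elides values, as
+        # in test_big_print above; short categoricals omit it.
+        assert 'Length' not in repr(Categorical(['a', 'b']))
+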
+ def test_empty_print(self):
+ factor = Categorical([], ["a", "b", "c"])
+ expected = ("[], Categories (3, object): [a, b, c]")
+ actual = repr(factor)
+ assert actual == expected
+
+ factor = Categorical([], ["a", "b", "c"], ordered=True)
+ expected = ("[], Categories (3, object): [a < b < c]")
+ actual = repr(factor)
+ assert expected == actual
+
+ factor = Categorical([], [])
+ expected = ("[], Categories (0, object): []")
+ assert expected == repr(factor)
+
+ def test_print_none_width(self):
+ # GH10087
+ a = Series(Categorical([1, 2, 3, 4]))
+ exp = u("0 1\n1 2\n2 3\n3 4\n" +
+ "dtype: category\nCategories (4, int64): [1, 2, 3, 4]")
+
+ with option_context("display.width", None):
+ assert exp == repr(a)
+
+ def test_unicode_print(self):
+ if PY3:
+ _rep = repr
+ else:
+ _rep = unicode # noqa
+
+ c = Categorical(['aaaaa', 'bb', 'cccc'] * 20)
+ expected = u"""\
+[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc]
+Length: 60
+Categories (3, object): [aaaaa, bb, cccc]"""
+
+ assert _rep(c) == expected
+
+ c = Categorical([u'ああああ', u'いいいいい', u'ううううううう'] * 20)
+ expected = u"""\
+[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
+Length: 60
+Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
+
+ assert _rep(c) == expected
+
+        # the unicode option should not affect Categorical, as it does
+        # not depend on the repr width
+ with option_context('display.unicode.east_asian_width', True):
+
+ c = Categorical([u'ああああ', u'いいいいい', u'ううううううう'] * 20)
+ expected = u"""[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
+Length: 60
+Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
+
+ assert _rep(c) == expected
+
+ def test_categorical_repr(self):
+ c = Categorical([1, 2, 3])
+ exp = """[1, 2, 3]
+Categories (3, int64): [1, 2, 3]"""
+
+ assert repr(c) == exp
+
+ c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
+ exp = """[1, 2, 3, 1, 2, 3]
+Categories (3, int64): [1, 2, 3]"""
+
+ assert repr(c) == exp
+
+ c = Categorical([1, 2, 3, 4, 5] * 10)
+ exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
+Length: 50
+Categories (5, int64): [1, 2, 3, 4, 5]"""
+
+ assert repr(c) == exp
+
+ c = Categorical(np.arange(20))
+ exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
+Length: 20
+Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]"""
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_ordered(self):
+ c = Categorical([1, 2, 3], ordered=True)
+ exp = """[1, 2, 3]
+Categories (3, int64): [1 < 2 < 3]"""
+
+ assert repr(c) == exp
+
+ c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
+ exp = """[1, 2, 3, 1, 2, 3]
+Categories (3, int64): [1 < 2 < 3]"""
+
+ assert repr(c) == exp
+
+ c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True)
+ exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
+Length: 50
+Categories (5, int64): [1 < 2 < 3 < 4 < 5]"""
+
+ assert repr(c) == exp
+
+ c = Categorical(np.arange(20), ordered=True)
+ exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
+Length: 20
+Categories (20, int64): [0 < 1 < 2 < 3 ... 16 < 17 < 18 < 19]"""
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_datetime(self):
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ c = Categorical(idx)
+
+ # TODO(wesm): exceeding 80 characters in the console is not good
+ # behavior
+ exp = (
+ "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
+ "2011-01-01 12:00:00, 2011-01-01 13:00:00]\n"
+ "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
+ "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
+ " 2011-01-01 12:00:00, "
+ "2011-01-01 13:00:00]""")
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = (
+ "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
+ "2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, "
+ "2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, "
+ "2011-01-01 13:00:00]\n"
+ "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
+ "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
+ " 2011-01-01 12:00:00, "
+ "2011-01-01 13:00:00]")
+
+ assert repr(c) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ c = Categorical(idx)
+ exp = (
+ "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
+ "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
+ "2011-01-01 13:00:00-05:00]\n"
+ "Categories (5, datetime64[ns, US/Eastern]): "
+ "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
+ " "
+ "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
+ " "
+ "2011-01-01 13:00:00-05:00]")
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = (
+ "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
+ "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
+ "2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, "
+ "2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, "
+ "2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]\n"
+ "Categories (5, datetime64[ns, US/Eastern]): "
+ "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
+ " "
+ "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
+ " "
+ "2011-01-01 13:00:00-05:00]")
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_datetime_ordered(self):
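+    def test_set_dtype_recodes(self):
+        # Editor's sketch, not part of the upstream suite: _set_dtype
+        # recodes against the new categories, so the same values can
+        # map to different integer codes after the change.
+        c = Categorical(['a', 'b'], categories=['a', 'b'])
+        result = c._set_dtype(CategoricalDtype(['b', 'a']))
+        tm.assert_numpy_array_equal(result.codes,
+                                    np.array([1, 0], dtype='int8'))
+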
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ c = Categorical(idx, ordered=True)
+ exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
+Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
+ 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
+Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
+ 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ c = Categorical(idx, ordered=True)
+ exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
+Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
+ 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
+ 2011-01-01 13:00:00-05:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
+Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
+ 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
+ 2011-01-01 13:00:00-05:00]""" # noqa
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_int_with_nan(self):
+ c = Categorical([1, 2, np.nan])
+ c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]"""
+ assert repr(c) == c_exp
+
+ s = Series([1, 2, np.nan], dtype="object").astype("category")
+ s_exp = """0 1\n1 2\n2 NaN
+dtype: category
+Categories (2, int64): [1, 2]"""
+ assert repr(s) == s_exp
+
+ def test_categorical_repr_period(self):
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ c = Categorical(idx)
+ exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
+Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
+Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(c) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ c = Categorical(idx)
+ exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
+Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
+Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" # noqa
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_period_ordered(self):
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ c = Categorical(idx, ordered=True)
+ exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
+Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
+Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(c) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ c = Categorical(idx, ordered=True)
+ exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
+Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
+Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" # noqa
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_timedelta(self):
+ idx = timedelta_range('1 days', periods=5)
+ c = Categorical(idx)
+ exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
+Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
+Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" # noqa
+
+ assert repr(c) == exp
+
+ idx = timedelta_range('1 hours', periods=20)
+ c = Categorical(idx)
+ exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
+Length: 20
+Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
+ 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
+ 18 days 01:00:00, 19 days 01:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx)
+ exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
+Length: 40
+Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
+ 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
+ 18 days 01:00:00, 19 days 01:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ def test_categorical_repr_timedelta_ordered(self):
+ idx = timedelta_range('1 days', periods=5)
+ c = Categorical(idx, ordered=True)
+ exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
+Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
+Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa
+
+ assert repr(c) == exp
+
+ idx = timedelta_range('1 hours', periods=20)
+ c = Categorical(idx, ordered=True)
+ exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
+Length: 20
+Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
+ 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
+ 18 days 01:00:00 < 19 days 01:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ c = Categorical(idx.append(idx), categories=idx, ordered=True)
+ exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
+Length: 40
+Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
+ 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
+ 18 days 01:00:00 < 19 days 01:00:00]""" # noqa
+
+ assert repr(c) == exp
+
+ def test_categorical_index_repr(self):
+ idx = CategoricalIndex(Categorical([1, 2, 3]))
+ exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa
+ assert repr(idx) == exp
+
+ i = CategoricalIndex(Categorical(np.arange(10)))
+ exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_ordered(self):
+ i = CategoricalIndex(Categorical([1, 2, 3], ordered=True))
+ exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ i = CategoricalIndex(Categorical(np.arange(10), ordered=True))
+ exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=True, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_datetime(self):
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
+ '2011-01-01 11:00:00', '2011-01-01 12:00:00',
+ '2011-01-01 13:00:00'],
+ categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
+ '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
+ '2011-01-01 13:00:00-05:00'],
+ categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_datetime_ordered(self):
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
+ '2011-01-01 11:00:00', '2011-01-01 12:00:00',
+ '2011-01-01 13:00:00'],
+ categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
+ '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
+ '2011-01-01 13:00:00-05:00'],
+ categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ i = CategoricalIndex(Categorical(idx.append(idx), ordered=True))
+ exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
+ '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
+ '2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00',
+ '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00',
+ '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'],
+ categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_period(self):
+ # test all length
+ idx = period_range('2011-01-01 09:00', freq='H', periods=1)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ idx = period_range('2011-01-01 09:00', freq='H', periods=2)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ idx = period_range('2011-01-01 09:00', freq='H', periods=3)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
+ '2011-01-01 12:00', '2011-01-01 13:00'],
+ categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ i = CategoricalIndex(Categorical(idx.append(idx)))
+ exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
+ '2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00',
+ '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00',
+ '2011-01-01 13:00'],
+ categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_period_ordered(self):
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
+ '2011-01-01 12:00', '2011-01-01 13:00'],
+ categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_timedelta(self):
+ idx = timedelta_range('1 days', periods=5)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=False, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ idx = timedelta_range('1 hours', periods=10)
+ i = CategoricalIndex(Categorical(idx))
+ exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
+ '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
+ '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
+ '9 days 01:00:00'],
+ categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=False, dtype='category')""" # noqa
+
+ assert repr(i) == exp
+
+ def test_categorical_index_repr_timedelta_ordered(self):
+ idx = timedelta_range('1 days', periods=5)
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=True, dtype='category')""" # noqa
+ assert repr(i) == exp
+
+ idx = timedelta_range('1 hours', periods=10)
+ i = CategoricalIndex(Categorical(idx, ordered=True))
+ exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
+ '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
+ '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
+ '9 days 01:00:00'],
+ categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" # noqa
+
+ assert repr(i) == exp
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_sorting.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_sorting.py
new file mode 100644
index 00000000000..3d55862cd2c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_sorting.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas import Categorical, Index
+import pandas.util.testing as tm
+
+
+class TestCategoricalSort(object):
+
+ def test_argsort(self):
+ c = Categorical([5, 3, 1, 4, 2], ordered=True)
+
+ expected = np.array([2, 4, 1, 3, 0])
+ tm.assert_numpy_array_equal(c.argsort(ascending=True), expected,
+ check_dtype=False)
+
+ expected = expected[::-1]
+ tm.assert_numpy_array_equal(c.argsort(ascending=False), expected,
+ check_dtype=False)
+
+ def test_numpy_argsort(self):
+ c = Categorical([5, 3, 1, 4, 2], ordered=True)
+
+ expected = np.array([2, 4, 1, 3, 0])
+ tm.assert_numpy_array_equal(np.argsort(c), expected,
+ check_dtype=False)
+
+ tm.assert_numpy_array_equal(np.argsort(c, kind='mergesort'), expected,
+ check_dtype=False)
+
+ msg = "the 'axis' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.argsort(c, axis=0)
+
+ msg = "the 'order' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.argsort(c, order='C')
+
+ def test_sort_values(self):
+
+ # unordered cats are sortable
+ cat = Categorical(["a", "b", "b", "a"], ordered=False)
+ cat.sort_values()
+
+ cat = Categorical(["a", "c", "b", "d"], ordered=True)
+
+ # sort_values
+ res = cat.sort_values()
+ exp = np.array(["a", "b", "c", "d"], dtype=object)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, cat.categories)
+
+ cat = Categorical(["a", "c", "b", "d"],
+ categories=["a", "b", "c", "d"], ordered=True)
+ res = cat.sort_values()
+ exp = np.array(["a", "b", "c", "d"], dtype=object)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, cat.categories)
+
+ res = cat.sort_values(ascending=False)
+ exp = np.array(["d", "c", "b", "a"], dtype=object)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, cat.categories)
+
+        # sort (in-place)
+        cat1 = cat.copy()
+        cat1.sort_values(inplace=True)
+        exp = np.array(["a", "b", "c", "d"], dtype=object)
+        tm.assert_numpy_array_equal(cat1.__array__(), exp)
+        tm.assert_index_equal(cat1.categories, cat.categories)
+
+ # reverse
+ cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
+ res = cat.sort_values(ascending=False)
+ exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
+ exp_categories = Index(["a", "b", "c", "d"])
+ tm.assert_numpy_array_equal(res.__array__(), exp_val)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ def test_sort_values_na_position(self):
+ # see gh-12882
+ cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
+ exp_categories = Index([2, 5])
+
+ exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
+ res = cat.sort_values() # default arguments
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
+ res = cat.sort_values(ascending=True, na_position='first')
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
+ res = cat.sort_values(ascending=False, na_position='first')
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
+ res = cat.sort_values(ascending=True, na_position='last')
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
+ res = cat.sort_values(ascending=False, na_position='last')
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
+ res = cat.sort_values(ascending=False, na_position='last')
+ exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
+ exp_categories = Index(["a", "b", "c", "d"])
+ tm.assert_numpy_array_equal(res.__array__(), exp_val)
+ tm.assert_index_equal(res.categories, exp_categories)
+
+ cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
+ res = cat.sort_values(ascending=False, na_position='first')
+ exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
+ exp_categories = Index(["a", "b", "c", "d"])
+ tm.assert_numpy_array_equal(res.__array__(), exp_val)
+ tm.assert_index_equal(res.categories, exp_categories)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_subclass.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_subclass.py
new file mode 100644
index 00000000000..7e90f8d51a3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_subclass.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+from pandas import Categorical
+import pandas.util.testing as tm
+
+
+class TestCategoricalSubclassing(object):
+
+ def test_constructor(self):
+ sc = tm.SubclassedCategorical(['a', 'b', 'c'])
+ assert isinstance(sc, tm.SubclassedCategorical)
+ tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c']))
+
+ def test_from_codes(self):
+ sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
+ assert isinstance(sc, tm.SubclassedCategorical)
+ exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
+ tm.assert_categorical_equal(sc, exp)
+
+ def test_map(self):
+ sc = tm.SubclassedCategorical(['a', 'b', 'c'])
+ res = sc.map(lambda x: x.upper())
+ assert isinstance(res, tm.SubclassedCategorical)
+ exp = Categorical(['A', 'B', 'C'])
+ tm.assert_categorical_equal(res, exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_warnings.py b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_warnings.py
new file mode 100644
index 00000000000..23d00585f95
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/categorical/test_warnings.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+class TestCategoricalWarnings(object):
+ def test_tab_complete_warning(self, ip):
+ # https://github.com/pandas-dev/pandas/issues/16409
+ pytest.importorskip('IPython', minversion="6.0.0")
+ from IPython.core.completer import provisionalcompleter
+
+ code = "import pandas as pd; c = Categorical([])"
+ ip.run_code(code)
+ with tm.assert_produces_warning(None):
+ with provisionalcompleter('ignore'):
+ list(ip.Completer.completions('c.', 1))
+
+    def test_CategoricalAccessor_categorical_deprecation(self):
+        with tm.assert_produces_warning(FutureWarning):
+            pd.Series(['a', 'b'], dtype='category').cat.categorical
+
+    def test_CategoricalAccessor_name_deprecation(self):
+        with tm.assert_produces_warning(FutureWarning):
+            pd.Series(['a', 'b'], dtype='category').cat.name
+
+    def test_CategoricalAccessor_index_deprecation(self):
+        with tm.assert_produces_warning(FutureWarning):
+            pd.Series(['a', 'b'], dtype='category').cat.index
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/interval/__init__.py b/contrib/python/pandas/py2/pandas/tests/arrays/interval/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/interval/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/interval/test_interval.py b/contrib/python/pandas/py2/pandas/tests/arrays/interval/test_interval.py
new file mode 100644
index 00000000000..e81e64d90ff
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/interval/test_interval.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Index, Interval, IntervalIndex, date_range, timedelta_range
+from pandas.core.arrays import IntervalArray
+import pandas.util.testing as tm
+
+
[email protected](params=[
+    (Index([0, 2, 4]), Index([1, 3, 5])),
+ (Index([0., 1., 2.]), Index([1., 2., 3.])),
+ (timedelta_range('0 days', periods=3),
+ timedelta_range('1 day', periods=3)),
+ (date_range('20170101', periods=3), date_range('20170102', periods=3)),
+ (date_range('20170101', periods=3, tz='US/Eastern'),
+ date_range('20170102', periods=3, tz='US/Eastern'))],
+ ids=lambda x: str(x[0].dtype))
+def left_right_dtypes(request):
+ """
+ Fixture for building an IntervalArray from various dtypes
+ """
+ return request.param
+
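+# Editorial sketch (not from upstream): the fixture above is consumed by
+# zipping the (left, right) pair into per-element interval endpoints, e.g.
+#
+#     left, right = left_right_dtypes
+#     arr = IntervalArray.from_arrays(left, right)  # one Interval per pair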
+
+class TestMethods(object):
+
+ @pytest.mark.parametrize('new_closed', [
+ 'left', 'right', 'both', 'neither'])
+ def test_set_closed(self, closed, new_closed):
+ # GH 21670
+ array = IntervalArray.from_breaks(range(10), closed=closed)
+ result = array.set_closed(new_closed)
+ expected = IntervalArray.from_breaks(range(10), closed=new_closed)
+ tm.assert_extension_array_equal(result, expected)
+
+ @pytest.mark.parametrize('other', [
+ Interval(0, 1, closed='right'),
+ IntervalArray.from_breaks([1, 2, 3, 4], closed='right'),
+ ])
+ def test_where_raises(self, other):
+ ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4],
+ closed='left'))
+ match = "'value.closed' is 'right', expected 'left'."
+ with pytest.raises(ValueError, match=match):
+ ser.where([True, False, True], other=other)
+
+
+class TestSetitem(object):
+
+ def test_set_na(self, left_right_dtypes):
+ left, right = left_right_dtypes
+ result = IntervalArray.from_arrays(left, right)
+ result[0] = np.nan
+
+ expected_left = Index([left._na_value] + list(left[1:]))
+ expected_right = Index([right._na_value] + list(right[1:]))
+ expected = IntervalArray.from_arrays(expected_left, expected_right)
+
+ tm.assert_extension_array_equal(result, expected)
+
+
+def test_repr_matches():
+ idx = IntervalIndex.from_breaks([1, 2, 3])
+ a = repr(idx)
+ b = repr(idx.values)
+ assert a.replace("Index", "Array") == b
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/interval/test_ops.py b/contrib/python/pandas/py2/pandas/tests/arrays/interval/test_ops.py
new file mode 100644
index 00000000000..bdbd145ed2a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/interval/test_ops.py
@@ -0,0 +1,82 @@
+"""Tests for Interval-Interval operations, such as overlaps, contains, etc."""
+import numpy as np
+import pytest
+
+from pandas import Interval, IntervalIndex, Timedelta, Timestamp
+from pandas.core.arrays import IntervalArray
+import pandas.util.testing as tm
+
+
[email protected](params=[IntervalArray, IntervalIndex])
+def constructor(request):
+ """
+ Fixture for testing both interval container classes.
+ """
+ return request.param
+
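+# Editorial note: both classes expose the same alternate constructors used
+# below, so a hypothetical call like
+#
+#     constructor.from_breaks(range(5))  # (0, 1], (1, 2], (2, 3], (3, 4]
+#
+# behaves identically for IntervalArray and IntervalIndex.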
+
[email protected](params=[
+    (Timedelta('0 days'), Timedelta('1 day')),
+ (Timestamp('2018-01-01'), Timedelta('1 day')),
+ (0, 1)], ids=lambda x: type(x[0]).__name__)
+def start_shift(request):
+ """
+ Fixture for generating intervals of different types from a start value
+ and a shift value that can be added to start to generate an endpoint.
+ """
+ return request.param
+
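+# Editorial sketch (hypothetical, mirroring the tests below): intervals are
+# derived from the fixture as
+#
+#     start, shift = start_shift
+#     iv = Interval(start, start + shift)  # endpoint is start plus one shift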
+
+class TestOverlaps(object):
+
+ def test_overlaps_interval(
+ self, constructor, start_shift, closed, other_closed):
+ start, shift = start_shift
+ interval = Interval(start, start + 3 * shift, other_closed)
+
+ # intervals: identical, nested, spanning, partial, adjacent, disjoint
+ tuples = [(start, start + 3 * shift),
+ (start + shift, start + 2 * shift),
+ (start - shift, start + 4 * shift),
+ (start + 2 * shift, start + 4 * shift),
+ (start + 3 * shift, start + 4 * shift),
+ (start + 4 * shift, start + 5 * shift)]
+ interval_container = constructor.from_tuples(tuples, closed)
+
+ adjacent = (interval.closed_right and interval_container.closed_left)
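+        # (editorial note) adjacency counts as an overlap only when both
+        # touching endpoints are closed: (0, 1] overlaps [1, 2) but not (1, 2)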
+ expected = np.array([True, True, True, True, adjacent, False])
+ result = interval_container.overlaps(interval)
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('other_constructor', [
+ IntervalArray, IntervalIndex])
+ def test_overlaps_interval_container(self, constructor, other_constructor):
+ # TODO: modify this test when implemented
+ interval_container = constructor.from_breaks(range(5))
+ other_container = other_constructor.from_breaks(range(5))
+ with pytest.raises(NotImplementedError):
+ interval_container.overlaps(other_container)
+
+ def test_overlaps_na(self, constructor, start_shift):
+ """NA values are marked as False"""
+ start, shift = start_shift
+ interval = Interval(start, start + shift)
+
+ tuples = [(start, start + shift),
+ np.nan,
+ (start + 2 * shift, start + 3 * shift)]
+ interval_container = constructor.from_tuples(tuples)
+
+ expected = np.array([True, False, False])
+ result = interval_container.overlaps(interval)
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('other', [
+ 10, True, 'foo', Timedelta('1 day'), Timestamp('2018-01-01')],
+ ids=lambda x: type(x).__name__)
+ def test_overlaps_invalid_type(self, constructor, other):
+ interval_container = constructor.from_breaks(range(5))
+ msg = '`other` must be Interval-like, got {other}'.format(
+ other=type(other).__name__)
+ with pytest.raises(TypeError, match=msg):
+ interval_container.overlaps(other)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/sparse/__init__.py b/contrib/python/pandas/py2/pandas/tests/arrays/sparse/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/sparse/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_arithmetics.py b/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_arithmetics.py
new file mode 100644
index 00000000000..42a29654b44
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_arithmetics.py
@@ -0,0 +1,538 @@
+import operator
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.core.sparse.api import SparseDtype
+import pandas.util.testing as tm
+
+
+class TestSparseArrayArithmetics(object):
+
+ _base = np.array
+ _klass = pd.SparseArray
+
+ def _assert(self, a, b):
+ tm.assert_numpy_array_equal(a, b)
+
+ def _check_numeric_ops(self, a, b, a_dense, b_dense):
+ with np.errstate(invalid='ignore', divide='ignore'):
+            # Unfortunately, wrapping the computation of each expected
+            # value with np.errstate() is too tedious.
+
+ # sparse & sparse
+ self._assert((a + b).to_dense(), a_dense + b_dense)
+ self._assert((b + a).to_dense(), b_dense + a_dense)
+
+ self._assert((a - b).to_dense(), a_dense - b_dense)
+ self._assert((b - a).to_dense(), b_dense - a_dense)
+
+ self._assert((a * b).to_dense(), a_dense * b_dense)
+ self._assert((b * a).to_dense(), b_dense * a_dense)
+
+ # pandas uses future division
+ self._assert((a / b).to_dense(), a_dense * 1.0 / b_dense)
+ self._assert((b / a).to_dense(), b_dense * 1.0 / a_dense)
+
+            # TODO: fix in GH 13843
+ if not (self._base == pd.Series and
+ a.dtype.subtype == np.dtype('int64')):
+ self._assert((a // b).to_dense(), a_dense // b_dense)
+ self._assert((b // a).to_dense(), b_dense // a_dense)
+
+ self._assert((a % b).to_dense(), a_dense % b_dense)
+ self._assert((b % a).to_dense(), b_dense % a_dense)
+
+ self._assert((a ** b).to_dense(), a_dense ** b_dense)
+ self._assert((b ** a).to_dense(), b_dense ** a_dense)
+
+ # sparse & dense
+ self._assert((a + b_dense).to_dense(), a_dense + b_dense)
+ self._assert((b_dense + a).to_dense(), b_dense + a_dense)
+
+ self._assert((a - b_dense).to_dense(), a_dense - b_dense)
+ self._assert((b_dense - a).to_dense(), b_dense - a_dense)
+
+ self._assert((a * b_dense).to_dense(), a_dense * b_dense)
+ self._assert((b_dense * a).to_dense(), b_dense * a_dense)
+
+ # pandas uses future division
+ self._assert((a / b_dense).to_dense(), a_dense * 1.0 / b_dense)
+ self._assert((b_dense / a).to_dense(), b_dense * 1.0 / a_dense)
+
+            # TODO: fix in GH 13843
+ if not (self._base == pd.Series and
+ a.dtype.subtype == np.dtype('int64')):
+ self._assert((a // b_dense).to_dense(), a_dense // b_dense)
+ self._assert((b_dense // a).to_dense(), b_dense // a_dense)
+
+ self._assert((a % b_dense).to_dense(), a_dense % b_dense)
+ self._assert((b_dense % a).to_dense(), b_dense % a_dense)
+
+ self._assert((a ** b_dense).to_dense(), a_dense ** b_dense)
+ self._assert((b_dense ** a).to_dense(), b_dense ** a_dense)
+
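+    # Editorial sketch (hypothetical values): the checks above rely on ops
+    # applying to the fill value as well, e.g.
+    #
+    #     a = pd.SparseArray([0, 1, 2], fill_value=0)
+    #     (a + 1).to_dense()  # array([1, 2, 3]); the new fill_value is 1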
+ def _check_bool_result(self, res):
+ assert isinstance(res, self._klass)
+ assert isinstance(res.dtype, SparseDtype)
+ assert res.dtype.subtype == np.bool
+ assert isinstance(res.fill_value, bool)
+
+ def _check_comparison_ops(self, a, b, a_dense, b_dense):
+ with np.errstate(invalid='ignore'):
+            # Unfortunately, wrapping the computation of each expected
+            # value with np.errstate() is too tedious.
+ #
+ # sparse & sparse
+ self._check_bool_result(a == b)
+ self._assert((a == b).to_dense(), a_dense == b_dense)
+
+ self._check_bool_result(a != b)
+ self._assert((a != b).to_dense(), a_dense != b_dense)
+
+ self._check_bool_result(a >= b)
+ self._assert((a >= b).to_dense(), a_dense >= b_dense)
+
+ self._check_bool_result(a <= b)
+ self._assert((a <= b).to_dense(), a_dense <= b_dense)
+
+ self._check_bool_result(a > b)
+ self._assert((a > b).to_dense(), a_dense > b_dense)
+
+ self._check_bool_result(a < b)
+ self._assert((a < b).to_dense(), a_dense < b_dense)
+
+ # sparse & dense
+ self._check_bool_result(a == b_dense)
+ self._assert((a == b_dense).to_dense(), a_dense == b_dense)
+
+ self._check_bool_result(a != b_dense)
+ self._assert((a != b_dense).to_dense(), a_dense != b_dense)
+
+ self._check_bool_result(a >= b_dense)
+ self._assert((a >= b_dense).to_dense(), a_dense >= b_dense)
+
+ self._check_bool_result(a <= b_dense)
+ self._assert((a <= b_dense).to_dense(), a_dense <= b_dense)
+
+ self._check_bool_result(a > b_dense)
+ self._assert((a > b_dense).to_dense(), a_dense > b_dense)
+
+ self._check_bool_result(a < b_dense)
+ self._assert((a < b_dense).to_dense(), a_dense < b_dense)
+
+ def _check_logical_ops(self, a, b, a_dense, b_dense):
+ # sparse & sparse
+ self._check_bool_result(a & b)
+ self._assert((a & b).to_dense(), a_dense & b_dense)
+
+ self._check_bool_result(a | b)
+ self._assert((a | b).to_dense(), a_dense | b_dense)
+ # sparse & dense
+ self._check_bool_result(a & b_dense)
+ self._assert((a & b_dense).to_dense(), a_dense & b_dense)
+
+ self._check_bool_result(a | b_dense)
+ self._assert((a | b_dense).to_dense(), a_dense | b_dense)
+
+ def test_float_scalar(self):
+ values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+
+ for kind in ['integer', 'block']:
+ a = self._klass(values, kind=kind)
+ self._check_numeric_ops(a, 1, values, 1)
+ self._check_numeric_ops(a, 0, values, 0)
+ self._check_numeric_ops(a, 3, values, 3)
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ self._check_numeric_ops(a, 1, values, 1)
+ self._check_numeric_ops(a, 0, values, 0)
+ self._check_numeric_ops(a, 3, values, 3)
+
+ a = self._klass(values, kind=kind, fill_value=2)
+ self._check_numeric_ops(a, 1, values, 1)
+ self._check_numeric_ops(a, 0, values, 0)
+ self._check_numeric_ops(a, 3, values, 3)
+
+ def test_float_scalar_comparison(self):
+ values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+
+ for kind in ['integer', 'block']:
+ a = self._klass(values, kind=kind)
+ self._check_comparison_ops(a, 1, values, 1)
+ self._check_comparison_ops(a, 0, values, 0)
+ self._check_comparison_ops(a, 3, values, 3)
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ self._check_comparison_ops(a, 1, values, 1)
+ self._check_comparison_ops(a, 0, values, 0)
+ self._check_comparison_ops(a, 3, values, 3)
+
+ a = self._klass(values, kind=kind, fill_value=2)
+ self._check_comparison_ops(a, 1, values, 1)
+ self._check_comparison_ops(a, 0, values, 0)
+ self._check_comparison_ops(a, 3, values, 3)
+
+ def test_float_same_index(self):
+ # when sp_index are the same
+ for kind in ['integer', 'block']:
+ values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+ rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
+
+ a = self._klass(values, kind=kind)
+ b = self._klass(rvalues, kind=kind)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ values = self._base([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.])
+ rvalues = self._base([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.])
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ b = self._klass(rvalues, kind=kind, fill_value=0)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ def test_float_same_index_comparison(self):
+ # when sp_index are the same
+ for kind in ['integer', 'block']:
+ values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+ rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
+
+ a = self._klass(values, kind=kind)
+ b = self._klass(rvalues, kind=kind)
+ self._check_comparison_ops(a, b, values, rvalues)
+
+ values = self._base([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.])
+ rvalues = self._base([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.])
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ b = self._klass(rvalues, kind=kind, fill_value=0)
+ self._check_comparison_ops(a, b, values, rvalues)
+
+ def test_float_array(self):
+ values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+ rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
+
+ for kind in ['integer', 'block']:
+ a = self._klass(values, kind=kind)
+ b = self._klass(rvalues, kind=kind)
+ self._check_numeric_ops(a, b, values, rvalues)
+ self._check_numeric_ops(a, b * 0, values, rvalues * 0)
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ b = self._klass(rvalues, kind=kind)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ b = self._klass(rvalues, kind=kind, fill_value=0)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ a = self._klass(values, kind=kind, fill_value=1)
+ b = self._klass(rvalues, kind=kind, fill_value=2)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ def test_float_array_different_kind(self):
+ values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+ rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
+
+ a = self._klass(values, kind='integer')
+ b = self._klass(rvalues, kind='block')
+ self._check_numeric_ops(a, b, values, rvalues)
+ self._check_numeric_ops(a, b * 0, values, rvalues * 0)
+
+ a = self._klass(values, kind='integer', fill_value=0)
+ b = self._klass(rvalues, kind='block')
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ a = self._klass(values, kind='integer', fill_value=0)
+ b = self._klass(rvalues, kind='block', fill_value=0)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ a = self._klass(values, kind='integer', fill_value=1)
+ b = self._klass(rvalues, kind='block', fill_value=2)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ def test_float_array_comparison(self):
+ values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+ rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
+
+ for kind in ['integer', 'block']:
+ a = self._klass(values, kind=kind)
+ b = self._klass(rvalues, kind=kind)
+ self._check_comparison_ops(a, b, values, rvalues)
+ self._check_comparison_ops(a, b * 0, values, rvalues * 0)
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ b = self._klass(rvalues, kind=kind)
+ self._check_comparison_ops(a, b, values, rvalues)
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ b = self._klass(rvalues, kind=kind, fill_value=0)
+ self._check_comparison_ops(a, b, values, rvalues)
+
+ a = self._klass(values, kind=kind, fill_value=1)
+ b = self._klass(rvalues, kind=kind, fill_value=2)
+ self._check_comparison_ops(a, b, values, rvalues)
+
+ def test_int_array(self):
+ # have to specify dtype explicitly until fixing GH 667
+ dtype = np.int64
+
+ values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
+ rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)
+
+ for kind in ['integer', 'block']:
+ a = self._klass(values, dtype=dtype, kind=kind)
+ assert a.dtype == SparseDtype(dtype)
+ b = self._klass(rvalues, dtype=dtype, kind=kind)
+ assert b.dtype == SparseDtype(dtype)
+
+ self._check_numeric_ops(a, b, values, rvalues)
+ self._check_numeric_ops(a, b * 0, values, rvalues * 0)
+
+ a = self._klass(values, fill_value=0, dtype=dtype, kind=kind)
+ assert a.dtype == SparseDtype(dtype)
+ b = self._klass(rvalues, dtype=dtype, kind=kind)
+ assert b.dtype == SparseDtype(dtype)
+
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ a = self._klass(values, fill_value=0, dtype=dtype, kind=kind)
+ assert a.dtype == SparseDtype(dtype)
+ b = self._klass(rvalues, fill_value=0, dtype=dtype, kind=kind)
+ assert b.dtype == SparseDtype(dtype)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ a = self._klass(values, fill_value=1, dtype=dtype, kind=kind)
+ assert a.dtype == SparseDtype(dtype, fill_value=1)
+ b = self._klass(rvalues, fill_value=2, dtype=dtype, kind=kind)
+ assert b.dtype == SparseDtype(dtype, fill_value=2)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ def test_int_array_comparison(self):
+
+        # int32 not implemented at the moment; test int64 only
+ for dtype in ['int64']:
+ values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
+ rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)
+
+ for kind in ['integer', 'block']:
+ a = self._klass(values, dtype=dtype, kind=kind)
+ b = self._klass(rvalues, dtype=dtype, kind=kind)
+ self._check_comparison_ops(a, b, values, rvalues)
+ self._check_comparison_ops(a, b * 0, values, rvalues * 0)
+
+ a = self._klass(values, dtype=dtype, kind=kind, fill_value=0)
+ b = self._klass(rvalues, dtype=dtype, kind=kind)
+ self._check_comparison_ops(a, b, values, rvalues)
+
+ a = self._klass(values, dtype=dtype, kind=kind, fill_value=0)
+ b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0)
+ self._check_comparison_ops(a, b, values, rvalues)
+
+ a = self._klass(values, dtype=dtype, kind=kind, fill_value=1)
+ b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2)
+ self._check_comparison_ops(a, b, values, rvalues)
+
+ def test_bool_same_index(self):
+ # GH 14000
+ # when sp_index are the same
+ for kind in ['integer', 'block']:
+ values = self._base([True, False, True, True], dtype=np.bool)
+ rvalues = self._base([True, False, True, True], dtype=np.bool)
+
+ for fill_value in [True, False, np.nan]:
+ a = self._klass(values, kind=kind, dtype=np.bool,
+ fill_value=fill_value)
+ b = self._klass(rvalues, kind=kind, dtype=np.bool,
+ fill_value=fill_value)
+ self._check_logical_ops(a, b, values, rvalues)
+
+ def test_bool_array_logical(self):
+ # GH 14000
+ # when sp_index are the same
+ for kind in ['integer', 'block']:
+ values = self._base([True, False, True, False, True, True],
+ dtype=np.bool)
+ rvalues = self._base([True, False, False, True, False, True],
+ dtype=np.bool)
+
+ for fill_value in [True, False, np.nan]:
+ a = self._klass(values, kind=kind, dtype=np.bool,
+ fill_value=fill_value)
+ b = self._klass(rvalues, kind=kind, dtype=np.bool,
+ fill_value=fill_value)
+ self._check_logical_ops(a, b, values, rvalues)
+
+ def test_mixed_array_float_int(self):
+
+ for rdtype in ['int64']:
+ values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+ rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)
+
+ for kind in ['integer', 'block']:
+ a = self._klass(values, kind=kind)
+ b = self._klass(rvalues, kind=kind)
+ assert b.dtype == SparseDtype(rdtype)
+
+ self._check_numeric_ops(a, b, values, rvalues)
+ self._check_numeric_ops(a, b * 0, values, rvalues * 0)
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ b = self._klass(rvalues, kind=kind)
+ assert b.dtype == SparseDtype(rdtype)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ b = self._klass(rvalues, kind=kind, fill_value=0)
+ assert b.dtype == SparseDtype(rdtype)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ a = self._klass(values, kind=kind, fill_value=1)
+ b = self._klass(rvalues, kind=kind, fill_value=2)
+ assert b.dtype == SparseDtype(rdtype, fill_value=2)
+ self._check_numeric_ops(a, b, values, rvalues)
+
+ def test_mixed_array_comparison(self):
+
+        # int32 not implemented at the moment; test int64 only
+ for rdtype in ['int64']:
+ values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+ rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)
+
+ for kind in ['integer', 'block']:
+ a = self._klass(values, kind=kind)
+ b = self._klass(rvalues, kind=kind)
+ assert b.dtype == SparseDtype(rdtype)
+
+ self._check_comparison_ops(a, b, values, rvalues)
+ self._check_comparison_ops(a, b * 0, values, rvalues * 0)
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ b = self._klass(rvalues, kind=kind)
+ assert b.dtype == SparseDtype(rdtype)
+ self._check_comparison_ops(a, b, values, rvalues)
+
+ a = self._klass(values, kind=kind, fill_value=0)
+ b = self._klass(rvalues, kind=kind, fill_value=0)
+ assert b.dtype == SparseDtype(rdtype)
+ self._check_comparison_ops(a, b, values, rvalues)
+
+ a = self._klass(values, kind=kind, fill_value=1)
+ b = self._klass(rvalues, kind=kind, fill_value=2)
+ assert b.dtype == SparseDtype(rdtype, fill_value=2)
+ self._check_comparison_ops(a, b, values, rvalues)
+
+
+class TestSparseSeriesArithmetic(TestSparseArrayArithmetics):
+
+ _base = pd.Series
+ _klass = pd.SparseSeries
+
+ def _assert(self, a, b):
+ tm.assert_series_equal(a, b)
+
+ def test_alignment(self):
+ da = pd.Series(np.arange(4))
+ db = pd.Series(np.arange(4), index=[1, 2, 3, 4])
+
+ sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0)
+ sb = pd.SparseSeries(np.arange(4), index=[1, 2, 3, 4],
+ dtype=np.int64, fill_value=0)
+ self._check_numeric_ops(sa, sb, da, db)
+
+ sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan)
+ sb = pd.SparseSeries(np.arange(4), index=[1, 2, 3, 4],
+ dtype=np.int64, fill_value=np.nan)
+ self._check_numeric_ops(sa, sb, da, db)
+
+ da = pd.Series(np.arange(4))
+ db = pd.Series(np.arange(4), index=[10, 11, 12, 13])
+
+ sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0)
+ sb = pd.SparseSeries(np.arange(4), index=[10, 11, 12, 13],
+ dtype=np.int64, fill_value=0)
+ self._check_numeric_ops(sa, sb, da, db)
+
+ sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan)
+ sb = pd.SparseSeries(np.arange(4), index=[10, 11, 12, 13],
+ dtype=np.int64, fill_value=np.nan)
+ self._check_numeric_ops(sa, sb, da, db)
+
+
[email protected]("op", [
+    operator.eq,
+ operator.add,
+])
+def test_with_list(op):
+ arr = pd.SparseArray([0, 1], fill_value=0)
+ result = op(arr, [0, 1])
+ expected = op(arr, pd.SparseArray([0, 1]))
+ tm.assert_sp_array_equal(result, expected)
+
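+# Editorial note: the assertion above implies that a plain list operand is
+# treated as if it were first wrapped in pd.SparseArray.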
+
[email protected]("ufunc", [
+    np.abs, np.exp,
+])
[email protected]("arr", [
+    pd.SparseArray([0, 0, -1, 1]),
+ pd.SparseArray([None, None, -1, 1]),
+])
+def test_ufuncs(ufunc, arr):
+ result = ufunc(arr)
+ fill_value = ufunc(arr.fill_value)
+ expected = pd.SparseArray(ufunc(np.asarray(arr)), fill_value=fill_value)
+ tm.assert_sp_array_equal(result, expected)
+
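+# Editorial note (hypothetical values): np.abs(pd.SparseArray([0, 0, -1, 1]))
+# stays sparse and carries fill_value np.abs(0) == 0, as asserted above.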
+
[email protected]("a, b", [
+    (pd.SparseArray([0, 0, 0]), np.array([0, 1, 2])),
+ (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
+ (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
+ (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
+ (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
+])
[email protected]("ufunc", [
+    np.add,
+ np.greater,
+])
+def test_binary_ufuncs(ufunc, a, b):
+ # can't say anything about fill value here.
+ result = ufunc(a, b)
+ expected = ufunc(np.asarray(a), np.asarray(b))
+ assert isinstance(result, pd.SparseArray)
+ tm.assert_numpy_array_equal(np.asarray(result), expected)
+
+
+def test_ndarray_inplace():
+ sparray = pd.SparseArray([0, 2, 0, 0])
+ ndarray = np.array([0, 1, 2, 3])
+ ndarray += sparray
+ expected = np.array([0, 3, 2, 3])
+ tm.assert_numpy_array_equal(ndarray, expected)
+
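+# Editorial note: in the in-place form above the ndarray keeps its type (the
+# sparse operand is densified), while the sparse in-place variant below keeps
+# the result sparse.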
+
+def test_sparray_inplace():
+ sparray = pd.SparseArray([0, 2, 0, 0])
+ ndarray = np.array([0, 1, 2, 3])
+ sparray += ndarray
+ expected = pd.SparseArray([0, 3, 2, 3], fill_value=0)
+ tm.assert_sp_array_equal(sparray, expected)
+
+
[email protected]("fill_value", [True, False])
+def test_invert(fill_value):
+ arr = np.array([True, False, False, True])
+ sparray = pd.SparseArray(arr, fill_value=fill_value)
+ result = ~sparray
+ expected = pd.SparseArray(~arr, fill_value=not fill_value)
+ tm.assert_sp_array_equal(result, expected)
+
+
[email protected]("fill_value", [0, np.nan])
[email protected]("op", [operator.pos, operator.neg])
+def test_unary_op(op, fill_value):
+ arr = np.array([0, 1, np.nan, 2])
+ sparray = pd.SparseArray(arr, fill_value=fill_value)
+ result = op(sparray)
+ expected = pd.SparseArray(op(arr), fill_value=op(fill_value))
+ tm.assert_sp_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_array.py b/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_array.py
new file mode 100644
index 00000000000..11b5bcf702e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_array.py
@@ -0,0 +1,1203 @@
+import operator
+import re
+import warnings
+
+import numpy as np
+import pytest
+
+from pandas._libs.sparse import IntIndex
+from pandas.compat import range
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import isna
+from pandas.core.sparse.api import SparseArray, SparseDtype, SparseSeries
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal
+
+
[email protected](params=["integer", "block"])
+def kind(request):
+ return request.param
+
+
+class TestSparseArray(object):
+
+ def setup_method(self, method):
+ self.arr_data = np.array([np.nan, np.nan, 1, 2, 3,
+ np.nan, 4, 5, np.nan, 6])
+ self.arr = SparseArray(self.arr_data)
+ self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0)
+
+ def test_constructor_dtype(self):
+ arr = SparseArray([np.nan, 1, 2, np.nan])
+ assert arr.dtype == SparseDtype(np.float64, np.nan)
+ assert arr.dtype.subtype == np.float64
+ assert np.isnan(arr.fill_value)
+
+ arr = SparseArray([np.nan, 1, 2, np.nan], fill_value=0)
+ assert arr.dtype == SparseDtype(np.float64, 0)
+ assert arr.fill_value == 0
+
+ arr = SparseArray([0, 1, 2, 4], dtype=np.float64)
+ assert arr.dtype == SparseDtype(np.float64, np.nan)
+ assert np.isnan(arr.fill_value)
+
+ arr = SparseArray([0, 1, 2, 4], dtype=np.int64)
+ assert arr.dtype == SparseDtype(np.int64, 0)
+ assert arr.fill_value == 0
+
+ arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64)
+ assert arr.dtype == SparseDtype(np.int64, 0)
+ assert arr.fill_value == 0
+
+ arr = SparseArray([0, 1, 2, 4], dtype=None)
+ assert arr.dtype == SparseDtype(np.int64, 0)
+ assert arr.fill_value == 0
+
+ arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None)
+ assert arr.dtype == SparseDtype(np.int64, 0)
+ assert arr.fill_value == 0
+
+ def test_constructor_dtype_str(self):
+ result = SparseArray([1, 2, 3], dtype='int')
+ expected = SparseArray([1, 2, 3], dtype=int)
+ tm.assert_sp_array_equal(result, expected)
+
+ def test_constructor_sparse_dtype(self):
+ result = SparseArray([1, 0, 0, 1], dtype=SparseDtype('int64', -1))
+ expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64)
+ tm.assert_sp_array_equal(result, expected)
+ assert result.sp_values.dtype == np.dtype('int64')
+
+ def test_constructor_sparse_dtype_str(self):
+ result = SparseArray([1, 0, 0, 1], dtype='Sparse[int32]')
+ expected = SparseArray([1, 0, 0, 1], dtype=np.int32)
+ tm.assert_sp_array_equal(result, expected)
+ assert result.sp_values.dtype == np.dtype('int32')
+
+ def test_constructor_object_dtype(self):
+ # GH 11856
+ arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object)
+ assert arr.dtype == SparseDtype(np.object)
+ assert np.isnan(arr.fill_value)
+
+ arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object,
+ fill_value='A')
+ assert arr.dtype == SparseDtype(np.object, 'A')
+ assert arr.fill_value == 'A'
+
+ # GH 17574
+ data = [False, 0, 100.0, 0.0]
+ arr = SparseArray(data, dtype=np.object, fill_value=False)
+ assert arr.dtype == SparseDtype(np.object, False)
+ assert arr.fill_value is False
+ arr_expected = np.array(data, dtype=np.object)
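+        # (editorial note) compare elementwise by type *and* value, since
+        # plain == would not distinguish False from 0 or 0.0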
+ it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected))
+ assert np.fromiter(it, dtype=np.bool).all()
+
+ @pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int])
+ def test_constructor_na_dtype(self, dtype):
+ with pytest.raises(ValueError, match="Cannot convert"):
+ SparseArray([0, 1, np.nan], dtype=dtype)
+
+ def test_constructor_spindex_dtype(self):
+ arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]))
+ # XXX: Behavior change: specifying SparseIndex no longer changes the
+ # fill_value
+ expected = SparseArray([0, 1, 2, 0], kind='integer')
+ tm.assert_sp_array_equal(arr, expected)
+ assert arr.dtype == SparseDtype(np.int64)
+ assert arr.fill_value == 0
+
+ arr = SparseArray(data=[1, 2, 3],
+ sparse_index=IntIndex(4, [1, 2, 3]),
+ dtype=np.int64, fill_value=0)
+ exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0)
+ tm.assert_sp_array_equal(arr, exp)
+ assert arr.dtype == SparseDtype(np.int64)
+ assert arr.fill_value == 0
+
+ arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]),
+ fill_value=0, dtype=np.int64)
+ exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64)
+ tm.assert_sp_array_equal(arr, exp)
+ assert arr.dtype == SparseDtype(np.int64)
+ assert arr.fill_value == 0
+
+ arr = SparseArray(data=[1, 2, 3],
+ sparse_index=IntIndex(4, [1, 2, 3]),
+ dtype=None, fill_value=0)
+ exp = SparseArray([0, 1, 2, 3], dtype=None)
+ tm.assert_sp_array_equal(arr, exp)
+ assert arr.dtype == SparseDtype(np.int64)
+ assert arr.fill_value == 0
+
+ @pytest.mark.parametrize("sparse_index", [
+ None, IntIndex(1, [0]),
+ ])
+ def test_constructor_spindex_dtype_scalar(self, sparse_index):
+ # scalar input
+ arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None)
+ exp = SparseArray([1], dtype=None)
+ tm.assert_sp_array_equal(arr, exp)
+ assert arr.dtype == SparseDtype(np.int64)
+ assert arr.fill_value == 0
+
+ arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None)
+ exp = SparseArray([1], dtype=None)
+ tm.assert_sp_array_equal(arr, exp)
+ assert arr.dtype == SparseDtype(np.int64)
+ assert arr.fill_value == 0
+
+ def test_constructor_spindex_dtype_scalar_broadcasts(self):
+ arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]),
+ fill_value=0, dtype=None)
+ exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None)
+ tm.assert_sp_array_equal(arr, exp)
+ assert arr.dtype == SparseDtype(np.int64)
+ assert arr.fill_value == 0
+
+ @pytest.mark.parametrize('data, fill_value', [
+ (np.array([1, 2]), 0),
+ (np.array([1.0, 2.0]), np.nan),
+ ([True, False], False),
+ ([pd.Timestamp('2017-01-01')], pd.NaT),
+ ])
+ def test_constructor_inferred_fill_value(self, data, fill_value):
+ result = SparseArray(data).fill_value
+
+ if pd.isna(fill_value):
+ assert pd.isna(result)
+ else:
+ assert result == fill_value
+
+ @pytest.mark.parametrize('scalar,dtype', [
+ (False, SparseDtype(bool, False)),
+ (0.0, SparseDtype('float64', 0)),
+ (1, SparseDtype('int64', 1)),
+ ('z', SparseDtype('object', 'z'))])
+ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
+ # GH 19163
+ arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar)
+ exp = SparseArray([scalar, scalar, scalar], fill_value=scalar)
+
+ tm.assert_sp_array_equal(arr, exp)
+
+ assert arr.dtype == dtype
+ assert exp.dtype == dtype
+
+ @pytest.mark.parametrize("fill", [1, np.nan, 0])
+ def test_sparse_series_round_trip(self, kind, fill):
+ # see gh-13999
+ arr = SparseArray([np.nan, 1, np.nan, 2, 3],
+ kind=kind, fill_value=fill)
+ res = SparseArray(SparseSeries(arr))
+ tm.assert_sp_array_equal(arr, res)
+
+ arr = SparseArray([0, 0, 0, 1, 1, 2], dtype=np.int64,
+ kind=kind, fill_value=fill)
+ res = SparseArray(SparseSeries(arr), dtype=np.int64)
+ tm.assert_sp_array_equal(arr, res)
+
+ res = SparseArray(SparseSeries(arr))
+ tm.assert_sp_array_equal(arr, res)
+
+ @pytest.mark.parametrize("fill", [True, False, np.nan])
+ def test_sparse_series_round_trip2(self, kind, fill):
+ # see gh-13999
+ arr = SparseArray([True, False, True, True], dtype=np.bool,
+ kind=kind, fill_value=fill)
+ res = SparseArray(SparseSeries(arr))
+ tm.assert_sp_array_equal(arr, res)
+
+ res = SparseArray(SparseSeries(arr))
+ tm.assert_sp_array_equal(arr, res)
+
+ def test_get_item(self):
+
+ assert np.isnan(self.arr[1])
+ assert self.arr[2] == 1
+ assert self.arr[7] == 5
+
+ assert self.zarr[0] == 0
+ assert self.zarr[2] == 1
+ assert self.zarr[7] == 5
+
+ errmsg = re.compile("bounds")
+
+ with pytest.raises(IndexError, match=errmsg):
+ self.arr[11]
+
+ with pytest.raises(IndexError, match=errmsg):
+ self.arr[-11]
+
+ assert self.arr[-1] == self.arr[len(self.arr) - 1]
+
+ def test_take_scalar_raises(self):
+ msg = "'indices' must be an array, not a scalar '2'."
+ with pytest.raises(ValueError, match=msg):
+ self.arr.take(2)
+
+ def test_take(self):
+ exp = SparseArray(np.take(self.arr_data, [2, 3]))
+ tm.assert_sp_array_equal(self.arr.take([2, 3]), exp)
+
+ exp = SparseArray(np.take(self.arr_data, [0, 1, 2]))
+ tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp)
+
+ def test_take_fill_value(self):
+ data = np.array([1, np.nan, 0, 3, 0])
+ sparse = SparseArray(data, fill_value=0)
+
+ exp = SparseArray(np.take(data, [0]), fill_value=0)
+ tm.assert_sp_array_equal(sparse.take([0]), exp)
+
+ exp = SparseArray(np.take(data, [1, 3, 4]), fill_value=0)
+ tm.assert_sp_array_equal(sparse.take([1, 3, 4]), exp)
+
+ def test_take_negative(self):
+ exp = SparseArray(np.take(self.arr_data, [-1]))
+ tm.assert_sp_array_equal(self.arr.take([-1]), exp)
+
+ exp = SparseArray(np.take(self.arr_data, [-4, -3, -2]))
+ tm.assert_sp_array_equal(self.arr.take([-4, -3, -2]), exp)
+
+ @pytest.mark.parametrize('fill_value', [0, None, np.nan])
+ def test_shift_fill_value(self, fill_value):
+ # GH #24128
+ sparse = SparseArray(np.array([1, 0, 0, 3, 0]),
+ fill_value=8.0)
+ res = sparse.shift(1, fill_value=fill_value)
+ if isna(fill_value):
+ fill_value = res.dtype.na_value
+ exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]),
+ fill_value=8.0)
+ tm.assert_sp_array_equal(res, exp)
+
+ def test_bad_take(self):
+ with pytest.raises(IndexError, match="bounds"):
+ self.arr.take([11])
+
+ def test_take_filling(self):
+ # similar tests as GH 12631
+ sparse = SparseArray([np.nan, np.nan, 1, np.nan, 4])
+ result = sparse.take(np.array([1, 0, -1]))
+ expected = SparseArray([np.nan, np.nan, 4])
+ tm.assert_sp_array_equal(result, expected)
+
+ # XXX: test change: fill_value=True -> allow_fill=True
+ result = sparse.take(np.array([1, 0, -1]), allow_fill=True)
+ expected = SparseArray([np.nan, np.nan, np.nan])
+ tm.assert_sp_array_equal(result, expected)
+
+ # allow_fill=False
+ result = sparse.take(np.array([1, 0, -1]),
+ allow_fill=False, fill_value=True)
+ expected = SparseArray([np.nan, np.nan, 4])
+ tm.assert_sp_array_equal(result, expected)
+
+ msg = "Invalid value in 'indices'"
+ with pytest.raises(ValueError, match=msg):
+ sparse.take(np.array([1, 0, -2]), allow_fill=True)
+
+ with pytest.raises(ValueError, match=msg):
+ sparse.take(np.array([1, 0, -5]), allow_fill=True)
+
+ with pytest.raises(IndexError):
+ sparse.take(np.array([1, -6]))
+ with pytest.raises(IndexError):
+ sparse.take(np.array([1, 5]))
+ with pytest.raises(IndexError):
+ sparse.take(np.array([1, 5]), allow_fill=True)
+
+ def test_take_filling_fill_value(self):
+ # same tests as GH 12631
+ sparse = SparseArray([np.nan, 0, 1, 0, 4], fill_value=0)
+ result = sparse.take(np.array([1, 0, -1]))
+ expected = SparseArray([0, np.nan, 4], fill_value=0)
+ tm.assert_sp_array_equal(result, expected)
+
+ # fill_value
+ result = sparse.take(np.array([1, 0, -1]), allow_fill=True)
+ # XXX: behavior change.
+ # the old way of filling self.fill_value doesn't follow EA rules.
+ # It's supposed to be self.dtype.na_value (nan in this case)
+ expected = SparseArray([0, np.nan, np.nan], fill_value=0)
+ tm.assert_sp_array_equal(result, expected)
+
+ # allow_fill=False
+ result = sparse.take(np.array([1, 0, -1]),
+ allow_fill=False, fill_value=True)
+ expected = SparseArray([0, np.nan, 4], fill_value=0)
+ tm.assert_sp_array_equal(result, expected)
+
+ msg = ("Invalid value in 'indices'.")
+ with pytest.raises(ValueError, match=msg):
+ sparse.take(np.array([1, 0, -2]), allow_fill=True)
+ with pytest.raises(ValueError, match=msg):
+ sparse.take(np.array([1, 0, -5]), allow_fill=True)
+
+ with pytest.raises(IndexError):
+ sparse.take(np.array([1, -6]))
+ with pytest.raises(IndexError):
+ sparse.take(np.array([1, 5]))
+ with pytest.raises(IndexError):
+ sparse.take(np.array([1, 5]), fill_value=True)
+
+ def test_take_filling_all_nan(self):
+ sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan])
+ # XXX: did the default kind from take change?
+ result = sparse.take(np.array([1, 0, -1]))
+ expected = SparseArray([np.nan, np.nan, np.nan], kind='block')
+ tm.assert_sp_array_equal(result, expected)
+
+ result = sparse.take(np.array([1, 0, -1]), fill_value=True)
+ expected = SparseArray([np.nan, np.nan, np.nan], kind='block')
+ tm.assert_sp_array_equal(result, expected)
+
+ with pytest.raises(IndexError):
+ sparse.take(np.array([1, -6]))
+ with pytest.raises(IndexError):
+ sparse.take(np.array([1, 5]))
+ with pytest.raises(IndexError):
+ sparse.take(np.array([1, 5]), fill_value=True)
+
+ def test_set_item(self):
+ def setitem():
+ self.arr[5] = 3
+
+ def setslice():
+ self.arr[1:5] = 2
+
+ with pytest.raises(TypeError, match="assignment via setitem"):
+ setitem()
+
+ with pytest.raises(TypeError, match="assignment via setitem"):
+ setslice()
+
+ def test_constructor_from_too_large_array(self):
+ with pytest.raises(TypeError, match="expected dimension <= 1 data"):
+ SparseArray(np.arange(10).reshape((2, 5)))
+
+ def test_constructor_from_sparse(self):
+ res = SparseArray(self.zarr)
+ assert res.fill_value == 0
+ assert_almost_equal(res.sp_values, self.zarr.sp_values)
+
+ def test_constructor_copy(self):
+ cp = SparseArray(self.arr, copy=True)
+ cp.sp_values[:3] = 0
+ assert not (self.arr.sp_values[:3] == 0).any()
+
+ not_copy = SparseArray(self.arr)
+ not_copy.sp_values[:3] = 0
+ assert (self.arr.sp_values[:3] == 0).all()
+
+ def test_constructor_bool(self):
+ # GH 10648
+ data = np.array([False, False, True, True, False, False])
+ arr = SparseArray(data, fill_value=False, dtype=bool)
+
+ assert arr.dtype == SparseDtype(bool)
+ tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True]))
+ # Behavior change: np.asarray densifies.
+ # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr))
+ tm.assert_numpy_array_equal(arr.sp_index.indices,
+ np.array([2, 3], np.int32))
+
+ for dense in [arr.to_dense(), arr.values]:
+ assert dense.dtype == bool
+ tm.assert_numpy_array_equal(dense, data)
+
+ def test_constructor_bool_fill_value(self):
+ arr = SparseArray([True, False, True], dtype=None)
+ assert arr.dtype == SparseDtype(np.bool)
+ assert not arr.fill_value
+
+ arr = SparseArray([True, False, True], dtype=np.bool)
+ assert arr.dtype == SparseDtype(np.bool)
+ assert not arr.fill_value
+
+ arr = SparseArray([True, False, True], dtype=np.bool, fill_value=True)
+ assert arr.dtype == SparseDtype(np.bool, True)
+ assert arr.fill_value
+
+ def test_constructor_float32(self):
+ # GH 10648
+ data = np.array([1., np.nan, 3], dtype=np.float32)
+ arr = SparseArray(data, dtype=np.float32)
+
+ assert arr.dtype == SparseDtype(np.float32)
+ tm.assert_numpy_array_equal(arr.sp_values,
+ np.array([1, 3], dtype=np.float32))
+ # Behavior change: np.asarray densifies.
+ # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr))
+ tm.assert_numpy_array_equal(arr.sp_index.indices,
+ np.array([0, 2], dtype=np.int32))
+
+ for dense in [arr.to_dense(), arr.values]:
+ assert dense.dtype == np.float32
+ tm.assert_numpy_array_equal(dense, data)
+
+ def test_astype(self):
+ # float -> float
+ arr = SparseArray([None, None, 0, 2])
+ result = arr.astype("Sparse[float32]")
+ expected = SparseArray([None, None, 0, 2], dtype=np.dtype('float32'))
+ tm.assert_sp_array_equal(result, expected)
+
+ dtype = SparseDtype("float64", fill_value=0)
+ result = arr.astype(dtype)
+ expected = SparseArray._simple_new(np.array([0., 2.],
+ dtype=dtype.subtype),
+ IntIndex(4, [2, 3]),
+ dtype)
+ tm.assert_sp_array_equal(result, expected)
+
+ dtype = SparseDtype("int64", 0)
+ result = arr.astype(dtype)
+ expected = SparseArray._simple_new(np.array([0, 2], dtype=np.int64),
+ IntIndex(4, [2, 3]),
+ dtype)
+ tm.assert_sp_array_equal(result, expected)
+
+ arr = SparseArray([0, np.nan, 0, 1], fill_value=0)
+ with pytest.raises(ValueError, match='NA'):
+ arr.astype('Sparse[i8]')
+
+ def test_astype_bool(self):
+ a = pd.SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
+ result = a.astype(bool)
+ expected = SparseArray([True, 0, 0, True],
+ dtype=SparseDtype(bool, 0))
+ tm.assert_sp_array_equal(result, expected)
+
+ # update fill value
+ result = a.astype(SparseDtype(bool, False))
+ expected = SparseArray([True, False, False, True],
+ dtype=SparseDtype(bool, False))
+ tm.assert_sp_array_equal(result, expected)
+
+ def test_astype_all(self, any_real_dtype):
+ vals = np.array([1, 2, 3])
+ arr = SparseArray(vals, fill_value=1)
+ typ = np.dtype(any_real_dtype)
+ res = arr.astype(typ)
+ assert res.dtype == SparseDtype(typ, 1)
+ assert res.sp_values.dtype == typ
+
+ tm.assert_numpy_array_equal(np.asarray(res.values),
+ vals.astype(typ))
+
+ @pytest.mark.parametrize('array, dtype, expected', [
+ (SparseArray([0, 1]), 'float',
+ SparseArray([0., 1.], dtype=SparseDtype(float, 0.0))),
+ (SparseArray([0, 1]), bool, SparseArray([False, True])),
+ (SparseArray([0, 1], fill_value=1), bool,
+ SparseArray([False, True], dtype=SparseDtype(bool, True))),
+ pytest.param(
+ SparseArray([0, 1]), 'datetime64[ns]',
+ SparseArray(np.array([0, 1], dtype='datetime64[ns]'),
+ dtype=SparseDtype('datetime64[ns]',
+ pd.Timestamp('1970'))),
+ marks=[pytest.mark.xfail(reason="NumPy-7619")],
+ ),
+ (SparseArray([0, 1, 10]), str,
+ SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))),
+ (SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])),
+ (SparseArray([0, 1, 0]), object,
+ SparseArray([0, 1, 0], dtype=SparseDtype(object, 0))),
+ ])
+ def test_astype_more(self, array, dtype, expected):
+ result = array.astype(dtype)
+ tm.assert_sp_array_equal(result, expected)
+
+ def test_astype_nan_raises(self):
+ arr = SparseArray([1.0, np.nan])
+ with pytest.raises(ValueError, match='Cannot convert non-finite'):
+ arr.astype(int)
+
+ def test_set_fill_value(self):
+ arr = SparseArray([1., np.nan, 2.], fill_value=np.nan)
+ arr.fill_value = 2
+ assert arr.fill_value == 2
+
+ arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64)
+ arr.fill_value = 2
+ assert arr.fill_value == 2
+
+ # XXX: this seems fine? You can construct an integer
+ # sparsearray with NaN fill value, why not update one?
+ # coerces to int
+ # msg = "unable to set fill_value 3\\.1 to int64 dtype"
+ # with pytest.raises(ValueError, match=msg):
+ arr.fill_value = 3.1
+ assert arr.fill_value == 3.1
+
+ # msg = "unable to set fill_value nan to int64 dtype"
+ # with pytest.raises(ValueError, match=msg):
+ arr.fill_value = np.nan
+ assert np.isnan(arr.fill_value)
+
+ arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool)
+ arr.fill_value = True
+ assert arr.fill_value
+
+ # coerces to bool
+ # msg = "unable to set fill_value 0 to bool dtype"
+ # with pytest.raises(ValueError, match=msg):
+ arr.fill_value = 0
+ assert arr.fill_value == 0
+
+ # msg = "unable to set fill_value nan to bool dtype"
+ # with pytest.raises(ValueError, match=msg):
+ arr.fill_value = np.nan
+ assert np.isnan(arr.fill_value)
+
+ @pytest.mark.parametrize("val", [[1, 2, 3], np.array([1, 2]), (1, 2, 3)])
+ def test_set_fill_invalid_non_scalar(self, val):
+ arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool)
+ msg = "fill_value must be a scalar"
+
+ with pytest.raises(ValueError, match=msg):
+ arr.fill_value = val
+
+ def test_copy_shallow(self):
+ arr2 = self.arr.copy(deep=False)
+ assert arr2.sp_values is self.arr.sp_values
+ assert arr2.sp_index is self.arr.sp_index
+
+ def test_values_asarray(self):
+ assert_almost_equal(self.arr.values, self.arr_data)
+ assert_almost_equal(self.arr.to_dense(), self.arr_data)
+
+ @pytest.mark.parametrize('data,shape,dtype', [
+ ([0, 0, 0, 0, 0], (5,), None),
+ ([], (0,), None),
+ ([0], (1,), None),
+ (['A', 'A', np.nan, 'B'], (4,), np.object)
+ ])
+ def test_shape(self, data, shape, dtype):
+ # GH 21126
+ out = SparseArray(data, dtype=dtype)
+ assert out.shape == shape
+
+ @pytest.mark.parametrize("vals", [
+ [np.nan, np.nan, np.nan, np.nan, np.nan],
+ [1, np.nan, np.nan, 3, np.nan],
+ [1, np.nan, 0, 3, 0],
+ ])
+ @pytest.mark.parametrize("method", ["to_dense", "get_values"])
+ @pytest.mark.parametrize("fill_value", [None, 0])
+ def test_dense_repr(self, vals, fill_value, method):
+ vals = np.array(vals)
+ arr = SparseArray(vals, fill_value=fill_value)
+ dense_func = getattr(arr, method)
+
+ res = dense_func()
+ tm.assert_numpy_array_equal(res, vals)
+
+ def test_getitem(self):
+ def _checkit(i):
+ assert_almost_equal(self.arr[i], self.arr.values[i])
+
+ for i in range(len(self.arr)):
+ _checkit(i)
+ _checkit(-i)
+
+ def test_getitem_arraylike_mask(self):
+ arr = SparseArray([0, 1, 2])
+ result = arr[[True, False, True]]
+ expected = SparseArray([0, 2])
+ tm.assert_sp_array_equal(result, expected)
+
+ def test_getslice(self):
+ result = self.arr[:-3]
+ exp = SparseArray(self.arr.values[:-3])
+ tm.assert_sp_array_equal(result, exp)
+
+ result = self.arr[-4:]
+ exp = SparseArray(self.arr.values[-4:])
+ tm.assert_sp_array_equal(result, exp)
+
+ # two corner cases from Series
+ result = self.arr[-12:]
+ exp = SparseArray(self.arr)
+ tm.assert_sp_array_equal(result, exp)
+
+ result = self.arr[:-12]
+ exp = SparseArray(self.arr.values[:0])
+ tm.assert_sp_array_equal(result, exp)
+
+ def test_getslice_tuple(self):
+ dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0])
+
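+ # indexing with a one-element tuple of slices should behave like numpy:
+ # a trailing comma is fine, but a second dimension raises IndexError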
+ sparse = SparseArray(dense)
+ res = sparse[4:, ]
+ exp = SparseArray(dense[4:, ])
+ tm.assert_sp_array_equal(res, exp)
+
+ sparse = SparseArray(dense, fill_value=0)
+ res = sparse[4:, ]
+ exp = SparseArray(dense[4:, ], fill_value=0)
+ tm.assert_sp_array_equal(res, exp)
+
+ with pytest.raises(IndexError):
+ sparse[4:, :]
+
+ with pytest.raises(IndexError):
+ # check numpy compat
+ dense[4:, :]
+
+ def test_boolean_slice_empty(self):
+ arr = pd.SparseArray([0, 1, 2])
+ res = arr[[False, False, False]]
+ assert res.dtype == arr.dtype
+
+ @pytest.mark.parametrize("op", ["add", "sub", "mul",
+ "truediv", "floordiv", "pow"])
+ def test_binary_operators(self, op):
+ op = getattr(operator, op)
+ data1 = np.random.randn(20)
+ data2 = np.random.randn(20)
+
+ data1[::2] = np.nan
+ data2[::3] = np.nan
+
+ arr1 = SparseArray(data1)
+ arr2 = SparseArray(data2)
+
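+ # rebuild the same data with 3 in place of NaN so the fill_value=3
+ # arrays below store exactly those positions as fills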
+ data1[::2] = 3
+ data2[::3] = 3
+ farr1 = SparseArray(data1, fill_value=3)
+ farr2 = SparseArray(data2, fill_value=3)
+
+ def _check_op(op, first, second):
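+ # the sparse result should match the op applied to the dense values,
+ # whether the second operand is sparse, dense, or a scalar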
+ res = op(first, second)
+ exp = SparseArray(op(first.values, second.values),
+ fill_value=first.fill_value)
+ assert isinstance(res, SparseArray)
+ assert_almost_equal(res.values, exp.values)
+
+ res2 = op(first, second.values)
+ assert isinstance(res2, SparseArray)
+ tm.assert_sp_array_equal(res, res2)
+
+ res3 = op(first.values, second)
+ assert isinstance(res3, SparseArray)
+ tm.assert_sp_array_equal(res, res3)
+
+ res4 = op(first, 4)
+ assert isinstance(res4, SparseArray)
+
+ # Ignore this if the actual op raises (e.g. pow).
+ try:
+ exp = op(first.values, 4)
+ exp_fv = op(first.fill_value, 4)
+ except ValueError:
+ pass
+ else:
+ assert_almost_equal(res4.fill_value, exp_fv)
+ assert_almost_equal(res4.values, exp)
+
+ with np.errstate(all="ignore"):
+ for first_arr, second_arr in [(arr1, arr2), (farr1, farr2)]:
+ _check_op(op, first_arr, second_arr)
+
+ def test_pickle(self):
+ def _check_roundtrip(obj):
+ unpickled = tm.round_trip_pickle(obj)
+ tm.assert_sp_array_equal(unpickled, obj)
+
+ _check_roundtrip(self.arr)
+ _check_roundtrip(self.zarr)
+
+ def test_generator_warnings(self):
+ sp_arr = SparseArray([1, 2, 3])
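+ # iterating a SparseArray must not emit (Pending)DeprecationWarning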
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings(action='always',
+ category=DeprecationWarning)
+ warnings.filterwarnings(action='always',
+ category=PendingDeprecationWarning)
+ for _ in sp_arr:
+ pass
+ assert len(w) == 0
+
+ def test_fillna(self):
+ s = SparseArray([1, np.nan, np.nan, 3, np.nan])
+ res = s.fillna(-1)
+ exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, dtype=np.float64)
+ tm.assert_sp_array_equal(res, exp)
+
+ s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0)
+ res = s.fillna(-1)
+ exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64)
+ tm.assert_sp_array_equal(res, exp)
+
+ s = SparseArray([1, np.nan, 0, 3, 0])
+ res = s.fillna(-1)
+ exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64)
+ tm.assert_sp_array_equal(res, exp)
+
+ s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0)
+ res = s.fillna(-1)
+ exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64)
+ tm.assert_sp_array_equal(res, exp)
+
+ s = SparseArray([np.nan, np.nan, np.nan, np.nan])
+ res = s.fillna(-1)
+ exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64)
+ tm.assert_sp_array_equal(res, exp)
+
+ s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0)
+ res = s.fillna(-1)
+ exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64)
+ tm.assert_sp_array_equal(res, exp)
+
+ # float dtype's fill_value is np.nan, replaced by -1
+ s = SparseArray([0., 0., 0., 0.])
+ res = s.fillna(-1)
+ exp = SparseArray([0., 0., 0., 0.], fill_value=-1)
+ tm.assert_sp_array_equal(res, exp)
+
+ # int dtype shouldn't have missing. No changes.
+ s = SparseArray([0, 0, 0, 0])
+ assert s.dtype == SparseDtype(np.int64)
+ assert s.fill_value == 0
+ res = s.fillna(-1)
+ tm.assert_sp_array_equal(res, s)
+
+ s = SparseArray([0, 0, 0, 0], fill_value=0)
+ assert s.dtype == SparseDtype(np.int64)
+ assert s.fill_value == 0
+ res = s.fillna(-1)
+ exp = SparseArray([0, 0, 0, 0], fill_value=0)
+ tm.assert_sp_array_equal(res, exp)
+
+ # fill_value can be NaN when the data itself has no missing values;
+ # fillna then only changes the fill_value
+ s = SparseArray([0, 0, 0, 0], fill_value=np.nan)
+ assert s.dtype == SparseDtype(np.int64, fill_value=np.nan)
+ assert np.isnan(s.fill_value)
+ res = s.fillna(-1)
+ exp = SparseArray([0, 0, 0, 0], fill_value=-1)
+ tm.assert_sp_array_equal(res, exp)
+
+ def test_fillna_overlap(self):
+ s = SparseArray([1, np.nan, np.nan, 3, np.nan])
+ # filling with existing value doesn't replace existing value with
+ # fill_value, i.e. existing 3 remains in sp_values
+ res = s.fillna(3)
+ exp = np.array([1, 3, 3, 3, 3], dtype=np.float64)
+ tm.assert_numpy_array_equal(res.to_dense(), exp)
+
+ s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0)
+ res = s.fillna(3)
+ exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64)
+ tm.assert_sp_array_equal(res, exp)
+
+ def test_nonzero(self):
+ # Tests regression #21172.
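+ # NaN entries do not count as nonzero: only positions 2, 5 and 9
+ # hold nonzero stored values in both the NaN and zero-fill cases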
+ sa = pd.SparseArray([
+ float('nan'),
+ float('nan'),
+ 1, 0, 0,
+ 2, 0, 0, 0,
+ 3, 0, 0
+ ])
+ expected = np.array([2, 5, 9], dtype=np.int32)
+ result, = sa.nonzero()
+ tm.assert_numpy_array_equal(expected, result)
+
+ sa = pd.SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0])
+ result, = sa.nonzero()
+ tm.assert_numpy_array_equal(expected, result)
+
+
+class TestSparseArrayAnalytics(object):
+
+ @pytest.mark.parametrize('data,pos,neg', [
+ ([True, True, True], True, False),
+ ([1, 2, 1], 1, 0),
+ ([1.0, 2.0, 1.0], 1.0, 0.0)
+ ])
+ def test_all(self, data, pos, neg):
+ # GH 17570
+ out = SparseArray(data).all()
+ assert out
+
+ out = SparseArray(data, fill_value=pos).all()
+ assert out
+
+ data[1] = neg
+ out = SparseArray(data).all()
+ assert not out
+
+ out = SparseArray(data, fill_value=pos).all()
+ assert not out
+
+ @pytest.mark.parametrize('data,pos,neg', [
+ ([True, True, True], True, False),
+ ([1, 2, 1], 1, 0),
+ ([1.0, 2.0, 1.0], 1.0, 0.0)
+ ])
+ @td.skip_if_np_lt_115 # NumPy < 1.15 didn't dispatch to SparseArray
+ def test_numpy_all(self, data, pos, neg):
+ # GH 17570
+ out = np.all(SparseArray(data))
+ assert out
+
+ out = np.all(SparseArray(data, fill_value=pos))
+ assert out
+
+ data[1] = neg
+ out = np.all(SparseArray(data))
+ assert not out
+
+ out = np.all(SparseArray(data, fill_value=pos))
+ assert not out
+
+ # raises with a different message on py2.
+ msg = "the \'out\' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.all(SparseArray(data), out=np.array([]))
+
+ @pytest.mark.parametrize('data,pos,neg', [
+ ([False, True, False], True, False),
+ ([0, 2, 0], 2, 0),
+ ([0.0, 2.0, 0.0], 2.0, 0.0)
+ ])
+ def test_any(self, data, pos, neg):
+ # GH 17570
+ out = SparseArray(data).any()
+ assert out
+
+ out = SparseArray(data, fill_value=pos).any()
+ assert out
+
+ data[1] = neg
+ out = SparseArray(data).any()
+ assert not out
+
+ out = SparseArray(data, fill_value=pos).any()
+ assert not out
+
+ @pytest.mark.parametrize('data,pos,neg', [
+ ([False, True, False], True, False),
+ ([0, 2, 0], 2, 0),
+ ([0.0, 2.0, 0.0], 2.0, 0.0)
+ ])
+ @td.skip_if_np_lt_115 # NumPy < 1.15 didn't dispatch to SparseArray
+ def test_numpy_any(self, data, pos, neg):
+ # GH 17570
+ out = np.any(SparseArray(data))
+ assert out
+
+ out = np.any(SparseArray(data, fill_value=pos))
+ assert out
+
+ data[1] = neg
+ out = np.any(SparseArray(data))
+ assert not out
+
+ out = np.any(SparseArray(data, fill_value=pos))
+ assert not out
+
+ msg = "the \'out\' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.any(SparseArray(data), out=out)
+
+ def test_sum(self):
+ data = np.arange(10).astype(float)
+ out = SparseArray(data).sum()
+ assert out == 45.0
+
+ data[5] = np.nan
+ out = SparseArray(data, fill_value=2).sum()
+ assert out == 40.0
+
+ out = SparseArray(data, fill_value=np.nan).sum()
+ assert out == 40.0
+
+ def test_numpy_sum(self):
+ data = np.arange(10).astype(float)
+ out = np.sum(SparseArray(data))
+ assert out == 45.0
+
+ data[5] = np.nan
+ out = np.sum(SparseArray(data, fill_value=2))
+ assert out == 40.0
+
+ out = np.sum(SparseArray(data, fill_value=np.nan))
+ assert out == 40.0
+
+ msg = "the 'dtype' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.sum(SparseArray(data), dtype=np.int64)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.sum(SparseArray(data), out=out)
+
+ @pytest.mark.parametrize("data,expected", [
+ (np.array([1, 2, 3, 4, 5], dtype=float), # non-null data
+ SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0]))),
+ (np.array([1, 2, np.nan, 4, 5], dtype=float), # null data
+ SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])))
+ ])
+ @pytest.mark.parametrize("numpy", [True, False])
+ def test_cumsum(self, data, expected, numpy):
+ cumsum = np.cumsum if numpy else lambda s: s.cumsum()
+
+ out = cumsum(SparseArray(data))
+ tm.assert_sp_array_equal(out, expected)
+
+ out = cumsum(SparseArray(data, fill_value=np.nan))
+ tm.assert_sp_array_equal(out, expected)
+
+ out = cumsum(SparseArray(data, fill_value=2))
+ tm.assert_sp_array_equal(out, expected)
+
+ if numpy: # numpy compatibility checks.
+ msg = "the 'dtype' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.cumsum(SparseArray(data), dtype=np.int64)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.cumsum(SparseArray(data), out=out)
+ else:
+ axis = 1 # SparseArray currently 1-D, so only axis = 0 is valid.
+ msg = "axis\\(={axis}\\) out of bounds".format(axis=axis)
+ with pytest.raises(ValueError, match=msg):
+ SparseArray(data).cumsum(axis=axis)
+
+ def test_mean(self):
+ data = np.arange(10).astype(float)
+ out = SparseArray(data).mean()
+ assert out == 4.5
+
+ data[5] = np.nan
+ out = SparseArray(data).mean()
+ assert out == 40.0 / 9
+
+ def test_numpy_mean(self):
+ data = np.arange(10).astype(float)
+ out = np.mean(SparseArray(data))
+ assert out == 4.5
+
+ data[5] = np.nan
+ out = np.mean(SparseArray(data))
+ assert out == 40.0 / 9
+
+ msg = "the 'dtype' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.mean(SparseArray(data), dtype=np.int64)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.mean(SparseArray(data), out=out)
+
+ def test_ufunc(self):
+ # GH 13853 make sure ufunc is applied to fill_value
+ sparse = SparseArray([1, np.nan, 2, np.nan, -2])
+ result = SparseArray([1, np.nan, 2, np.nan, 2])
+ tm.assert_sp_array_equal(abs(sparse), result)
+ tm.assert_sp_array_equal(np.abs(sparse), result)
+
+ sparse = SparseArray([1, -1, 2, -2], fill_value=1)
+ result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index,
+ fill_value=1)
+ tm.assert_sp_array_equal(abs(sparse), result)
+ tm.assert_sp_array_equal(np.abs(sparse), result)
+
+ sparse = SparseArray([1, -1, 2, -2], fill_value=-1)
+ result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index,
+ fill_value=1)
+ tm.assert_sp_array_equal(abs(sparse), result)
+ tm.assert_sp_array_equal(np.abs(sparse), result)
+
+ sparse = SparseArray([1, np.nan, 2, np.nan, -2])
+ result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2]))
+ tm.assert_sp_array_equal(np.sin(sparse), result)
+
+ sparse = SparseArray([1, -1, 2, -2], fill_value=1)
+ result = SparseArray(np.sin([1, -1, 2, -2]), fill_value=np.sin(1))
+ tm.assert_sp_array_equal(np.sin(sparse), result)
+
+ sparse = SparseArray([1, -1, 0, -2], fill_value=0)
+ result = SparseArray(np.sin([1, -1, 0, -2]), fill_value=np.sin(0))
+ tm.assert_sp_array_equal(np.sin(sparse), result)
+
+ def test_ufunc_args(self):
+ # GH 13853 make sure ufunc is applied to fill_value, including its arg
+ sparse = SparseArray([1, np.nan, 2, np.nan, -2])
+ result = SparseArray([2, np.nan, 3, np.nan, -1])
+ tm.assert_sp_array_equal(np.add(sparse, 1), result)
+
+ sparse = SparseArray([1, -1, 2, -2], fill_value=1)
+ result = SparseArray([2, 0, 3, -1], fill_value=2)
+ tm.assert_sp_array_equal(np.add(sparse, 1), result)
+
+ sparse = SparseArray([1, -1, 0, -2], fill_value=0)
+ result = SparseArray([2, 0, 1, -1], fill_value=1)
+ tm.assert_sp_array_equal(np.add(sparse, 1), result)
+
+ def test_nbytes_integer(self):
+ arr = SparseArray([1, 0, 0, 0, 2], kind='integer')
+ result = arr.nbytes
+ # (2 * 8) + 2 * 4
+ # sp_values (2 * int64) + indices (2 * int32)
+ assert result == 24
+
+ def test_nbytes_block(self):
+ arr = SparseArray([1, 2, 0, 0, 0], kind='block')
+ result = arr.nbytes
+ # (2 * 8) + 4 + 4
+ # sp_values (2 * int64), blocs (int32), blengths (int32)
+ assert result == 24
+
+ def test_asarray_datetime64(self):
+ s = pd.SparseArray(
+ pd.to_datetime(['2012', None, None, '2013'])
+ )
+ np.asarray(s)
+
+ def test_density(self):
+ arr = SparseArray([0, 1])
+ assert arr.density == 0.5
+
+ def test_npoints(self):
+ arr = SparseArray([0, 1])
+ assert arr.npoints == 1
+
+
+class TestAccessor(object):
+
+ @pytest.mark.parametrize('attr', [
+ 'npoints', 'density', 'fill_value', 'sp_values',
+ ])
+ def test_get_attributes(self, attr):
+ arr = SparseArray([0, 1])
+ ser = pd.Series(arr)
+
+ result = getattr(ser.sparse, attr)
+ expected = getattr(arr, attr)
+ assert result == expected
+
+ def test_from_coo(self):
+ sparse = pytest.importorskip("scipy.sparse")
+
+ row = [0, 3, 1, 0]
+ col = [0, 3, 1, 2]
+ data = [4, 5, 7, 9]
+ sp_array = sparse.coo_matrix((data, (row, col)))
+ result = pd.Series.sparse.from_coo(sp_array)
+
+ index = pd.MultiIndex.from_arrays([[0, 0, 1, 3], [0, 2, 1, 3]])
+ expected = pd.Series([4, 9, 7, 5], index=index, dtype='Sparse[int]')
+ tm.assert_series_equal(result, expected)
+
+ def test_to_coo(self):
+ sparse = pytest.importorskip("scipy.sparse")
+ ser = pd.Series([1, 2, 3],
+ index=pd.MultiIndex.from_product([[0], [1, 2, 3]],
+ names=['a', 'b']),
+ dtype='Sparse[int]')
+ A, _, _ = ser.sparse.to_coo()
+ assert isinstance(A, sparse.coo.coo_matrix)
+
+ def test_non_sparse_raises(self):
+ ser = pd.Series([1, 2, 3])
+ with pytest.raises(AttributeError, match='.sparse'):
+ ser.sparse.density
+
+
+def test_setting_fill_value_fillna_still_works():
+ # This is why letting users update fill_value / dtype is bad
+ # astype has the same problem.
+ arr = SparseArray([1., np.nan, 1.0], fill_value=0.0)
+ arr.fill_value = np.nan
+ result = arr.isna()
+ # Can't do direct comparison, since the sp_index will be different
+ # So let's convert to ndarray and check there.
+ result = np.asarray(result)
+
+ expected = np.array([False, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_setting_fill_value_updates():
+ arr = SparseArray([0.0, np.nan], fill_value=0)
+ arr.fill_value = np.nan
+ # use private constructor to get the index right
+ # otherwise both nans would be un-stored.
+ expected = SparseArray._simple_new(
+ sparse_array=np.array([np.nan]),
+ sparse_index=IntIndex(2, [1]),
+ dtype=SparseDtype(float, np.nan),
+ )
+ tm.assert_sp_array_equal(arr, expected)
+
+
[email protected]("arr, loc", [
+ ([None, 1, 2], 0),
+ ([0, None, 2], 1),
+ ([0, 1, None], 2),
+ ([0, 1, 1, None, None], 3),
+ ([1, 1, 1, 2], -1),
+ ([], -1),
+])
+def test_first_fill_value_loc(arr, loc):
+ result = SparseArray(arr)._first_fill_value_loc()
+ assert result == loc
+
+
[email protected]("arr", [
+ [1, 2, np.nan, np.nan],
+ [1, np.nan, 2, np.nan],
+ [1, 2, np.nan],
+])
[email protected]("fill_value", [
+ np.nan, 0, 1
+])
+def test_unique_na_fill(arr, fill_value):
+ a = pd.SparseArray(arr, fill_value=fill_value).unique()
+ b = pd.Series(arr).unique()
+ assert isinstance(a, SparseArray)
+ a = np.asarray(a)
+ tm.assert_numpy_array_equal(a, b)
+
+
+def test_unique_all_sparse():
+ # https://github.com/pandas-dev/pandas/issues/23168
+ arr = SparseArray([0, 0])
+ result = arr.unique()
+ expected = SparseArray([0])
+ tm.assert_sp_array_equal(result, expected)
+
+
+def test_map():
+ arr = SparseArray([0, 1, 2])
+ expected = SparseArray([10, 11, 12], fill_value=10)
+
+ # dict
+ result = arr.map({0: 10, 1: 11, 2: 12})
+ tm.assert_sp_array_equal(result, expected)
+
+ # series
+ result = arr.map(pd.Series({0: 10, 1: 11, 2: 12}))
+ tm.assert_sp_array_equal(result, expected)
+
+ # function
+ result = arr.map({0: 10, 1: 11, 2: 12}.get)
+ expected = SparseArray([10, 11, 12], fill_value=10)
+ tm.assert_sp_array_equal(result, expected)
+
+
+def test_map_missing():
+ arr = SparseArray([0, 1, 2])
+ expected = SparseArray([10, 11, None], fill_value=10)
+
+ result = arr.map({0: 10, 1: 11})
+ tm.assert_sp_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_dtype.py b/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_dtype.py
new file mode 100644
index 00000000000..2d386de0d31
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_dtype.py
@@ -0,0 +1,161 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.core.sparse.api import SparseDtype
+
+
[email protected]("dtype, fill_value", [
+ ('int', 0),
+ ('float', np.nan),
+ ('bool', False),
+ ('object', np.nan),
+ ('datetime64[ns]', pd.NaT),
+ ('timedelta64[ns]', pd.NaT),
+])
+def test_inferred_dtype(dtype, fill_value):
+ sparse_dtype = SparseDtype(dtype)
+ result = sparse_dtype.fill_value
+ if pd.isna(fill_value):
+ assert pd.isna(result) and type(result) == type(fill_value)
+ else:
+ assert result == fill_value
+
+
+def test_from_sparse_dtype():
+ dtype = SparseDtype('float', 0)
+ result = SparseDtype(dtype)
+ assert result.fill_value == 0
+
+
+def test_from_sparse_dtype_fill_value():
+ dtype = SparseDtype('int', 1)
+ result = SparseDtype(dtype, fill_value=2)
+ expected = SparseDtype('int', 2)
+ assert result == expected
+
+
[email protected]('dtype, fill_value', [
+ ('int', None),
+ ('float', None),
+ ('bool', None),
+ ('object', None),
+ ('datetime64[ns]', None),
+ ('timedelta64[ns]', None),
+ ('int', np.nan),
+ ('float', 0),
+])
+def test_equal(dtype, fill_value):
+ a = SparseDtype(dtype, fill_value)
+ b = SparseDtype(dtype, fill_value)
+ assert a == b
+ assert b == a
+
+
+def test_nans_equal():
+ a = SparseDtype(float, float('nan'))
+ b = SparseDtype(float, np.nan)
+ assert a == b
+ assert b == a
+
+
[email protected]('a, b', [
+ (SparseDtype('float64'), SparseDtype('float32')),
+ (SparseDtype('float64'), SparseDtype('float64', 0)),
+ (SparseDtype('float64'), SparseDtype('datetime64[ns]', np.nan)),
+ (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
+ (SparseDtype('float64'), np.dtype('float64')),
+])
+def test_not_equal(a, b):
+ assert a != b
+
+
+def test_construct_from_string_raises():
+ with pytest.raises(TypeError):
+ SparseDtype.construct_from_string('not a dtype')
+
+
[email protected]("dtype, expected", [
+ (SparseDtype(int), True),
+ (SparseDtype(float), True),
+ (SparseDtype(bool), True),
+ (SparseDtype(object), False),
+ (SparseDtype(str), False),
+])
+def test_is_numeric(dtype, expected):
+ assert dtype._is_numeric is expected
+
+
+def test_str_uses_object():
+ result = SparseDtype(str).subtype
+ assert result == np.dtype('object')
+
+
[email protected]("string, expected", [
+ ('Sparse[float64]', SparseDtype(np.dtype('float64'))),
+ ('Sparse[float32]', SparseDtype(np.dtype('float32'))),
+ ('Sparse[int]', SparseDtype(np.dtype('int'))),
+ ('Sparse[str]', SparseDtype(np.dtype('str'))),
+ ('Sparse[datetime64[ns]]', SparseDtype(np.dtype('datetime64[ns]'))),
+ ("Sparse", SparseDtype(np.dtype("float"), np.nan))
+])
+def test_construct_from_string(string, expected):
+ result = SparseDtype.construct_from_string(string)
+ assert result == expected
+
+
[email protected]("a, b, expected", [
+ (SparseDtype(float, 0.0), SparseDtype(np.dtype('float'), 0.0), True),
+ (SparseDtype(int, 0), SparseDtype(int, 0), True),
+ (SparseDtype(float, float('nan')), SparseDtype(float, np.nan), True),
+ (SparseDtype(float, 0), SparseDtype(float, np.nan), False),
+ (SparseDtype(int, 0.0), SparseDtype(float, 0.0), False),
+])
+def test_hash_equal(a, b, expected):
+ result = a == b
+ assert result is expected
+
+ result = hash(a) == hash(b)
+ assert result is expected
+
+
[email protected]('string, expected', [
+ ('Sparse[int]', 'int'),
+ ('Sparse[int, 0]', 'int'),
+ ('Sparse[int64]', 'int64'),
+ ('Sparse[int64, 0]', 'int64'),
+ ('Sparse[datetime64[ns], 0]', 'datetime64[ns]'),
+])
+def test_parse_subtype(string, expected):
+ subtype, _ = SparseDtype._parse_subtype(string)
+ assert subtype == expected
+
+
[email protected]("string", [
+ "Sparse[int, 1]",
+ "Sparse[float, 0.0]",
+ "Sparse[bool, True]",
+])
+def test_construct_from_string_fill_value_raises(string):
+ with pytest.raises(TypeError, match='fill_value in the string is not'):
+ SparseDtype.construct_from_string(string)
+
+
[email protected]('original, dtype, expected', [
+ (SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
+ (SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
+ (SparseDtype(int, 1), str, SparseDtype(object, '1')),
+ (SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
+])
+def test_update_dtype(original, dtype, expected):
+ result = original.update_dtype(dtype)
+ assert result == expected
+
+
[email protected]("original, dtype", [
+ (SparseDtype(float, np.nan), int),
+ (SparseDtype(str, 'abc'), int),
+])
+def test_update_dtype_raises(original, dtype):
+ with pytest.raises(ValueError):
+ original.update_dtype(dtype)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_libsparse.py b/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_libsparse.py
new file mode 100644
index 00000000000..6e9d790bf85
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/sparse/test_libsparse.py
@@ -0,0 +1,605 @@
+import operator
+
+import numpy as np
+import pytest
+
+import pandas._libs.sparse as splib
+import pandas.util._test_decorators as td
+
+from pandas import Series
+from pandas.core.arrays.sparse import BlockIndex, IntIndex, _make_index
+import pandas.util.testing as tm
+
+TEST_LENGTH = 20
+
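+# Each case encodes two block-sparse indexes over TEST_LENGTH points: x has
+# blocks starting at xloc with lengths xlen, y likewise, and intersect_loc /
+# intersect_len describe the blocks of their expected intersection.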
+plain_case = dict(xloc=[0, 7, 15], xlen=[3, 5, 5], yloc=[2, 9, 14],
+ ylen=[2, 3, 5], intersect_loc=[2, 9, 15],
+ intersect_len=[1, 3, 4])
+delete_blocks = dict(xloc=[0, 5], xlen=[4, 4], yloc=[1], ylen=[4],
+ intersect_loc=[1], intersect_len=[3])
+split_blocks = dict(xloc=[0], xlen=[10], yloc=[0, 5], ylen=[3, 7],
+ intersect_loc=[0, 5], intersect_len=[3, 5])
+skip_block = dict(xloc=[10], xlen=[5], yloc=[0, 12], ylen=[5, 3],
+ intersect_loc=[12], intersect_len=[3])
+
+no_intersect = dict(xloc=[0, 10], xlen=[4, 6], yloc=[5, 17], ylen=[4, 2],
+ intersect_loc=[], intersect_len=[])
+
+
+def check_cases(_check_case):
+ def _check_case_dict(case):
+ _check_case(case['xloc'], case['xlen'], case['yloc'], case['ylen'],
+ case['intersect_loc'], case['intersect_len'])
+
+ _check_case_dict(plain_case)
+ _check_case_dict(delete_blocks)
+ _check_case_dict(split_blocks)
+ _check_case_dict(skip_block)
+ _check_case_dict(no_intersect)
+
+ # one or both is empty
+ _check_case([0], [5], [], [], [], [])
+ _check_case([], [], [], [], [], [])
+
+
+class TestSparseIndexUnion(object):
+
+ def test_index_make_union(self):
+ def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
+ xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
+ yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
+ bresult = xindex.make_union(yindex)
+ assert (isinstance(bresult, BlockIndex))
+ tm.assert_numpy_array_equal(bresult.blocs,
+ np.array(eloc, dtype=np.int32))
+ tm.assert_numpy_array_equal(bresult.blengths,
+ np.array(elen, dtype=np.int32))
+
+ ixindex = xindex.to_int_index()
+ iyindex = yindex.to_int_index()
+ iresult = ixindex.make_union(iyindex)
+ assert (isinstance(iresult, IntIndex))
+ tm.assert_numpy_array_equal(iresult.indices,
+ bresult.to_int_index().indices)
+
+ """
+ x: ----
+ y: ----
+ r: --------
+ """
+ xloc = [0]
+ xlen = [5]
+ yloc = [5]
+ ylen = [4]
+ eloc = [0]
+ elen = [9]
+ _check_case(xloc, xlen, yloc, ylen, eloc, elen)
+ """
+ x: ----- -----
+ y: ----- --
+ """
+ xloc = [0, 10]
+ xlen = [5, 5]
+ yloc = [2, 17]
+ ylen = [5, 2]
+ eloc = [0, 10, 17]
+ elen = [7, 5, 2]
+ _check_case(xloc, xlen, yloc, ylen, eloc, elen)
+ """
+ x: ------
+ y: -------
+ r: ----------
+ """
+ xloc = [1]
+ xlen = [5]
+ yloc = [3]
+ ylen = [5]
+ eloc = [1]
+ elen = [7]
+ _check_case(xloc, xlen, yloc, ylen, eloc, elen)
+ """
+ x: ------ -----
+ y: -------
+ r: -------------
+ """
+ xloc = [2, 10]
+ xlen = [4, 4]
+ yloc = [4]
+ ylen = [8]
+ eloc = [2]
+ elen = [12]
+ _check_case(xloc, xlen, yloc, ylen, eloc, elen)
+ """
+ x: --- -----
+ y: -------
+ r: -------------
+ """
+ xloc = [0, 5]
+ xlen = [3, 5]
+ yloc = [0]
+ ylen = [7]
+ eloc = [0]
+ elen = [10]
+ _check_case(xloc, xlen, yloc, ylen, eloc, elen)
+ """
+ x: ------ -----
+ y: ------- ---
+ r: -------------
+ """
+ xloc = [2, 10]
+ xlen = [4, 4]
+ yloc = [4, 13]
+ ylen = [8, 4]
+ eloc = [2]
+ elen = [15]
+ _check_case(xloc, xlen, yloc, ylen, eloc, elen)
+ """
+ x: ----------------------
+ y: ---- ---- ---
+ r: ----------------------
+ """
+ xloc = [2]
+ xlen = [15]
+ yloc = [4, 9, 14]
+ ylen = [3, 2, 2]
+ eloc = [2]
+ elen = [15]
+ _check_case(xloc, xlen, yloc, ylen, eloc, elen)
+ """
+ x: ---- ---
+ y: --- ---
+ """
+ xloc = [0, 10]
+ xlen = [3, 3]
+ yloc = [5, 15]
+ ylen = [2, 2]
+ eloc = [0, 5, 10, 15]
+ elen = [3, 2, 3, 2]
+ _check_case(xloc, xlen, yloc, ylen, eloc, elen)
+
+ def test_intindex_make_union(self):
+ a = IntIndex(5, np.array([0, 3, 4], dtype=np.int32))
+ b = IntIndex(5, np.array([0, 2], dtype=np.int32))
+ res = a.make_union(b)
+ exp = IntIndex(5, np.array([0, 2, 3, 4], np.int32))
+ assert res.equals(exp)
+
+ a = IntIndex(5, np.array([], dtype=np.int32))
+ b = IntIndex(5, np.array([0, 2], dtype=np.int32))
+ res = a.make_union(b)
+ exp = IntIndex(5, np.array([0, 2], np.int32))
+ assert res.equals(exp)
+
+ a = IntIndex(5, np.array([], dtype=np.int32))
+ b = IntIndex(5, np.array([], dtype=np.int32))
+ res = a.make_union(b)
+ exp = IntIndex(5, np.array([], np.int32))
+ assert res.equals(exp)
+
+ a = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
+ b = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
+ res = a.make_union(b)
+ exp = IntIndex(5, np.array([0, 1, 2, 3, 4], np.int32))
+ assert res.equals(exp)
+
+ a = IntIndex(5, np.array([0, 1], dtype=np.int32))
+ b = IntIndex(4, np.array([0, 1], dtype=np.int32))
+ with pytest.raises(ValueError):
+ a.make_union(b)
+
+
+class TestSparseIndexIntersect(object):
+
+ @td.skip_if_windows
+ def test_intersect(self):
+ def _check_correct(a, b, expected):
+ result = a.intersect(b)
+ assert (result.equals(expected))
+
+ def _check_length_exc(a, longer):
+ pytest.raises(Exception, a.intersect, longer)
+
+ def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
+ xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
+ yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
+ expected = BlockIndex(TEST_LENGTH, eloc, elen)
+ longer_index = BlockIndex(TEST_LENGTH + 1, yloc, ylen)
+
+ _check_correct(xindex, yindex, expected)
+ _check_correct(xindex.to_int_index(), yindex.to_int_index(),
+ expected.to_int_index())
+
+ _check_length_exc(xindex, longer_index)
+ _check_length_exc(xindex.to_int_index(),
+ longer_index.to_int_index())
+
+ check_cases(_check_case)
+
+ def test_intersect_empty(self):
+ xindex = IntIndex(4, np.array([], dtype=np.int32))
+ yindex = IntIndex(4, np.array([2, 3], dtype=np.int32))
+ assert xindex.intersect(yindex).equals(xindex)
+ assert yindex.intersect(xindex).equals(xindex)
+
+ xindex = xindex.to_block_index()
+ yindex = yindex.to_block_index()
+ assert xindex.intersect(yindex).equals(xindex)
+ assert yindex.intersect(xindex).equals(xindex)
+
+ def test_intersect_identical(self):
+ cases = [IntIndex(5, np.array([1, 2], dtype=np.int32)),
+ IntIndex(5, np.array([0, 2, 4], dtype=np.int32)),
+ IntIndex(0, np.array([], dtype=np.int32)),
+ IntIndex(5, np.array([], dtype=np.int32))]
+
+ for case in cases:
+ assert case.intersect(case).equals(case)
+ case = case.to_block_index()
+ assert case.intersect(case).equals(case)
+
+
+class TestSparseIndexCommon(object):
+
+ def test_int_internal(self):
+ idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer')
+ assert isinstance(idx, IntIndex)
+ assert idx.npoints == 2
+ tm.assert_numpy_array_equal(idx.indices,
+ np.array([2, 3], dtype=np.int32))
+
+ idx = _make_index(4, np.array([], dtype=np.int32), kind='integer')
+ assert isinstance(idx, IntIndex)
+ assert idx.npoints == 0
+ tm.assert_numpy_array_equal(idx.indices,
+ np.array([], dtype=np.int32))
+
+ idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32),
+ kind='integer')
+ assert isinstance(idx, IntIndex)
+ assert idx.npoints == 4
+ tm.assert_numpy_array_equal(idx.indices,
+ np.array([0, 1, 2, 3], dtype=np.int32))
+
+ def test_block_internal(self):
+ idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='block')
+ assert isinstance(idx, BlockIndex)
+ assert idx.npoints == 2
+ tm.assert_numpy_array_equal(idx.blocs,
+ np.array([2], dtype=np.int32))
+ tm.assert_numpy_array_equal(idx.blengths,
+ np.array([2], dtype=np.int32))
+
+ idx = _make_index(4, np.array([], dtype=np.int32), kind='block')
+ assert isinstance(idx, BlockIndex)
+ assert idx.npoints == 0
+ tm.assert_numpy_array_equal(idx.blocs,
+ np.array([], dtype=np.int32))
+ tm.assert_numpy_array_equal(idx.blengths,
+ np.array([], dtype=np.int32))
+
+ idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32),
+ kind='block')
+ assert isinstance(idx, BlockIndex)
+ assert idx.npoints == 4
+ tm.assert_numpy_array_equal(idx.blocs,
+ np.array([0], dtype=np.int32))
+ tm.assert_numpy_array_equal(idx.blengths,
+ np.array([4], dtype=np.int32))
+
+ idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32),
+ kind='block')
+ assert isinstance(idx, BlockIndex)
+ assert idx.npoints == 3
+ tm.assert_numpy_array_equal(idx.blocs,
+ np.array([0, 2], dtype=np.int32))
+ tm.assert_numpy_array_equal(idx.blengths,
+ np.array([1, 2], dtype=np.int32))
+
+ def test_lookup(self):
+ for kind in ['integer', 'block']:
+ idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
+ assert idx.lookup(-1) == -1
+ assert idx.lookup(0) == -1
+ assert idx.lookup(1) == -1
+ assert idx.lookup(2) == 0
+ assert idx.lookup(3) == 1
+ assert idx.lookup(4) == -1
+
+ idx = _make_index(4, np.array([], dtype=np.int32), kind=kind)
+
+ for i in range(-1, 5):
+ assert idx.lookup(i) == -1
+
+ idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32),
+ kind=kind)
+ assert idx.lookup(-1) == -1
+ assert idx.lookup(0) == 0
+ assert idx.lookup(1) == 1
+ assert idx.lookup(2) == 2
+ assert idx.lookup(3) == 3
+ assert idx.lookup(4) == -1
+
+ idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32),
+ kind=kind)
+ assert idx.lookup(-1) == -1
+ assert idx.lookup(0) == 0
+ assert idx.lookup(1) == -1
+ assert idx.lookup(2) == 1
+ assert idx.lookup(3) == 2
+ assert idx.lookup(4) == -1
+
+ def test_lookup_array(self):
+ for kind in ['integer', 'block']:
+ idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
+
+ res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
+ exp = np.array([-1, -1, 0], dtype=np.int32)
+ tm.assert_numpy_array_equal(res, exp)
+
+ res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
+ exp = np.array([-1, 0, -1, 1], dtype=np.int32)
+ tm.assert_numpy_array_equal(res, exp)
+
+ idx = _make_index(4, np.array([], dtype=np.int32), kind=kind)
+ res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32))
+ exp = np.array([-1, -1, -1, -1], dtype=np.int32)
+ tm.assert_numpy_array_equal(res, exp)
+
+ idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32),
+ kind=kind)
+ res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
+ exp = np.array([-1, 0, 2], dtype=np.int32)
+ tm.assert_numpy_array_equal(res, exp)
+
+ res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
+ exp = np.array([-1, 2, 1, 3], dtype=np.int32)
+ tm.assert_numpy_array_equal(res, exp)
+
+ idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32),
+ kind=kind)
+ res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32))
+ exp = np.array([1, -1, 2, 0], dtype=np.int32)
+ tm.assert_numpy_array_equal(res, exp)
+
+ res = idx.lookup_array(np.array([1, 4, 2, 5], dtype=np.int32))
+ exp = np.array([-1, -1, 1, -1], dtype=np.int32)
+ tm.assert_numpy_array_equal(res, exp)
+
+ def test_lookup_basics(self):
+ def _check(index):
+ assert (index.lookup(0) == -1)
+ assert (index.lookup(5) == 0)
+ assert (index.lookup(7) == 2)
+ assert (index.lookup(8) == -1)
+ assert (index.lookup(9) == -1)
+ assert (index.lookup(10) == -1)
+ assert (index.lookup(11) == -1)
+ assert (index.lookup(12) == 3)
+ assert (index.lookup(17) == 8)
+ assert (index.lookup(18) == -1)
+
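+ # blocks cover positions [5, 8) and [12, 18); lookup maps a dense
+ # position to its offset within sp_values, or -1 when not stored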
+ bindex = BlockIndex(20, [5, 12], [3, 6])
+ iindex = bindex.to_int_index()
+
+ _check(bindex)
+ _check(iindex)
+
+ # corner cases
+
+
+class TestBlockIndex(object):
+
+ def test_block_internal(self):
+ idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='block')
+ assert isinstance(idx, BlockIndex)
+ assert idx.npoints == 2
+ tm.assert_numpy_array_equal(idx.blocs,
+ np.array([2], dtype=np.int32))
+ tm.assert_numpy_array_equal(idx.blengths,
+ np.array([2], dtype=np.int32))
+
+ idx = _make_index(4, np.array([], dtype=np.int32), kind='block')
+ assert isinstance(idx, BlockIndex)
+ assert idx.npoints == 0
+ tm.assert_numpy_array_equal(idx.blocs,
+ np.array([], dtype=np.int32))
+ tm.assert_numpy_array_equal(idx.blengths,
+ np.array([], dtype=np.int32))
+
+ idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32),
+ kind='block')
+ assert isinstance(idx, BlockIndex)
+ assert idx.npoints == 4
+ tm.assert_numpy_array_equal(idx.blocs,
+ np.array([0], dtype=np.int32))
+ tm.assert_numpy_array_equal(idx.blengths,
+ np.array([4], dtype=np.int32))
+
+ idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind='block')
+ assert isinstance(idx, BlockIndex)
+ assert idx.npoints == 3
+ tm.assert_numpy_array_equal(idx.blocs,
+ np.array([0, 2], dtype=np.int32))
+ tm.assert_numpy_array_equal(idx.blengths,
+ np.array([1, 2], dtype=np.int32))
+
+ def test_make_block_boundary(self):
+ for i in [5, 10, 100, 101]:
+ idx = _make_index(i, np.arange(0, i, 2, dtype=np.int32),
+ kind='block')
+
+ exp = np.arange(0, i, 2, dtype=np.int32)
+ tm.assert_numpy_array_equal(idx.blocs, exp)
+ tm.assert_numpy_array_equal(idx.blengths,
+ np.ones(len(exp), dtype=np.int32))
+
+ def test_equals(self):
+ index = BlockIndex(10, [0, 4], [2, 5])
+
+ assert index.equals(index)
+ assert not index.equals(BlockIndex(10, [0, 4], [2, 6]))
+
+ def test_check_integrity(self):
+ locs = []
+ lengths = []
+
+ # 0-length OK
+ # TODO: index variables are not used...is that right?
+ index = BlockIndex(0, locs, lengths) # noqa
+
+ # also OK even though empty
+ index = BlockIndex(1, locs, lengths) # noqa
+
+ # block extend beyond end
+ pytest.raises(Exception, BlockIndex, 10, [5], [10])
+
+ # block overlap
+ pytest.raises(Exception, BlockIndex, 10, [2, 5], [5, 3])
+
+ def test_to_int_index(self):
+ locs = [0, 10]
+ lengths = [4, 6]
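+ # blocks [0, 4) and [10, 16) expand to these dense positions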
+ exp_inds = [0, 1, 2, 3, 10, 11, 12, 13, 14, 15]
+
+ block = BlockIndex(20, locs, lengths)
+ dense = block.to_int_index()
+
+ tm.assert_numpy_array_equal(dense.indices,
+ np.array(exp_inds, dtype=np.int32))
+
+ def test_to_block_index(self):
+ index = BlockIndex(10, [0, 5], [4, 5])
+ assert index.to_block_index() is index
+
+
+class TestIntIndex(object):
+
+ def test_check_integrity(self):
+
+ # Too many indices for the specified length.
+ msg = "Too many indices"
+
+ with pytest.raises(ValueError, match=msg):
+ IntIndex(length=1, indices=[1, 2, 3])
+
+ # No index can be negative.
+ msg = "No index can be less than zero"
+
+ with pytest.raises(ValueError, match=msg):
+ IntIndex(length=5, indices=[1, -2, 3])
+
+ # All indices must be less than the length.
+ msg = "All indices must be less than the length"
+
+ with pytest.raises(ValueError, match=msg):
+ IntIndex(length=5, indices=[1, 2, 5])
+
+ with pytest.raises(ValueError, match=msg):
+ IntIndex(length=5, indices=[1, 2, 6])
+
+ # Indices must be strictly ascending.
+ msg = "Indices must be strictly increasing"
+
+ with pytest.raises(ValueError, match=msg):
+ IntIndex(length=5, indices=[1, 3, 2])
+
+ with pytest.raises(ValueError, match=msg):
+ IntIndex(length=5, indices=[1, 3, 3])
+
+ def test_int_internal(self):
+ idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer')
+ assert isinstance(idx, IntIndex)
+ assert idx.npoints == 2
+ tm.assert_numpy_array_equal(idx.indices,
+ np.array([2, 3], dtype=np.int32))
+
+ idx = _make_index(4, np.array([], dtype=np.int32), kind='integer')
+ assert isinstance(idx, IntIndex)
+ assert idx.npoints == 0
+ tm.assert_numpy_array_equal(idx.indices,
+ np.array([], dtype=np.int32))
+
+ idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32),
+ kind='integer')
+ assert isinstance(idx, IntIndex)
+ assert idx.npoints == 4
+ tm.assert_numpy_array_equal(idx.indices,
+ np.array([0, 1, 2, 3], dtype=np.int32))
+
+ def test_equals(self):
+ index = IntIndex(10, [0, 1, 2, 3, 4])
+ assert index.equals(index)
+ assert not index.equals(IntIndex(10, [0, 1, 2, 3]))
+
+ def test_to_block_index(self):
+
+ def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
+ xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
+ yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
+
+ # see if they survive the round trip
+ xbindex = xindex.to_int_index().to_block_index()
+ ybindex = yindex.to_int_index().to_block_index()
+ assert isinstance(xbindex, BlockIndex)
+ assert xbindex.equals(xindex)
+ assert ybindex.equals(yindex)
+
+ check_cases(_check_case)
+
+ def test_to_int_index(self):
+ index = IntIndex(10, [2, 3, 4, 5, 6])
+ assert index.to_int_index() is index
+
+
+class TestSparseOperators(object):
+
+ def _op_tests(self, sparse_op, python_op):
+ def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
+ xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
+ yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
+
+ xdindex = xindex.to_int_index()
+ ydindex = yindex.to_int_index()
+
+ x = np.arange(xindex.npoints) * 10. + 1
+ y = np.arange(yindex.npoints) * 100. + 1
+
+ xfill = 0
+ yfill = 2
+
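+ # each sparse_op returns (result values, result sparse index, result fill)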
+ result_block_vals, rb_index, bfill = sparse_op(x, xindex, xfill, y,
+ yindex, yfill)
+ result_int_vals, ri_index, ifill = sparse_op(x, xdindex, xfill, y,
+ ydindex, yfill)
+
+ assert rb_index.to_int_index().equals(ri_index)
+ tm.assert_numpy_array_equal(result_block_vals, result_int_vals)
+ assert bfill == ifill
+
+ # check versus Series...
+ xseries = Series(x, xdindex.indices)
+ xseries = xseries.reindex(np.arange(TEST_LENGTH)).fillna(xfill)
+
+ yseries = Series(y, ydindex.indices)
+ yseries = yseries.reindex(np.arange(TEST_LENGTH)).fillna(yfill)
+
+ series_result = python_op(xseries, yseries)
+ series_result = series_result.reindex(ri_index.indices)
+
+ tm.assert_numpy_array_equal(result_block_vals,
+ series_result.values)
+ tm.assert_numpy_array_equal(result_int_vals, series_result.values)
+
+ check_cases(_check_case)
+
+ @pytest.mark.parametrize('opname',
+ ['add', 'sub', 'mul', 'truediv', 'floordiv'])
+ def test_op(self, opname):
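+ # resolves e.g. splib.sparse_add_float64 and checks it against operator.add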
+ sparse_op = getattr(splib, 'sparse_%s_float64' % opname)
+ python_op = getattr(operator, opname)
+ self._op_tests(sparse_op, python_op)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/test_array.py b/contrib/python/pandas/py2/pandas/tests/arrays/test_array.py
new file mode 100644
index 00000000000..9fea1989e46
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/test_array.py
@@ -0,0 +1,256 @@
+import datetime
+import decimal
+
+import numpy as np
+import pytest
+import pytz
+
+from pandas.core.dtypes.dtypes import registry
+
+import pandas as pd
+from pandas.api.extensions import register_extension_dtype
+from pandas.core.arrays import PandasArray, integer_array, period_array
+from pandas.tests.extension.decimal import (
+ DecimalArray, DecimalDtype, to_decimal)
+import pandas.util.testing as tm
+
+
[email protected]("data, dtype, expected", [
+ # Basic NumPy defaults.
+ ([1, 2], None, PandasArray(np.array([1, 2]))),
+ ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))),
+ ([1, 2], np.dtype('float32'),
+ PandasArray(np.array([1., 2.0], dtype=np.dtype('float32')))),
+ (np.array([1, 2]), None, PandasArray(np.array([1, 2]))),
+
+ # String alias passes through to NumPy
+ ([1, 2], 'float32', PandasArray(np.array([1, 2], dtype='float32'))),
+
+ # Period alias
+ ([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]',
+ period_array(['2000', '2001'], freq='D')),
+
+ # Period dtype
+ ([pd.Period('2000', 'D')], pd.PeriodDtype('D'),
+ period_array(['2000'], freq='D')),
+
+ # Datetime (naive)
+ ([1, 2], np.dtype('datetime64[ns]'),
+ pd.arrays.DatetimeArray._from_sequence(
+ np.array([1, 2], dtype='datetime64[ns]'))),
+
+ (np.array([1, 2], dtype='datetime64[ns]'), None,
+ pd.arrays.DatetimeArray._from_sequence(
+ np.array([1, 2], dtype='datetime64[ns]'))),
+
+ (pd.DatetimeIndex(['2000', '2001']), np.dtype('datetime64[ns]'),
+ pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])),
+
+ (pd.DatetimeIndex(['2000', '2001']), None,
+ pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])),
+
+ (['2000', '2001'], np.dtype('datetime64[ns]'),
+ pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])),
+
+ # Datetime (tz-aware)
+ (['2000', '2001'], pd.DatetimeTZDtype(tz="CET"),
+ pd.arrays.DatetimeArray._from_sequence(
+ ['2000', '2001'], dtype=pd.DatetimeTZDtype(tz="CET"))),
+
+ # Timedelta
+ (['1H', '2H'], np.dtype('timedelta64[ns]'),
+ pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),
+
+ (pd.TimedeltaIndex(['1H', '2H']), np.dtype('timedelta64[ns]'),
+ pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),
+
+ (pd.TimedeltaIndex(['1H', '2H']), None,
+ pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),
+
+ # Category
+ (['a', 'b'], 'category', pd.Categorical(['a', 'b'])),
+ (['a', 'b'], pd.CategoricalDtype(None, ordered=True),
+ pd.Categorical(['a', 'b'], ordered=True)),
+
+ # Interval
+ ([pd.Interval(1, 2), pd.Interval(3, 4)], 'interval',
+ pd.arrays.IntervalArray.from_tuples([(1, 2), (3, 4)])),
+
+ # Sparse
+ ([0, 1], 'Sparse[int64]', pd.SparseArray([0, 1], dtype='int64')),
+
+ # IntegerNA
+ ([1, None], 'Int16', integer_array([1, None], dtype='Int16')),
+ (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
+
+ # Index
+ (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
+
+ # Series[EA] returns the EA
+ (pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])),
+ None,
+ pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])),
+
+ # "3rd party" EAs work
+ ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])),
+
+ # pass an ExtensionArray, but a different dtype
+ (period_array(['2000', '2001'], freq='D'),
+ 'category',
+ pd.Categorical([pd.Period('2000', 'D'), pd.Period('2001', 'D')])),
+])
+def test_array(data, dtype, expected):
+ result = pd.array(data, dtype=dtype)
+ tm.assert_equal(result, expected)
+
+
+def test_array_copy():
+ a = np.array([1, 2])
+ # default is to copy
+ b = pd.array(a)
+ assert np.shares_memory(a, b._ndarray) is False
+
+ # copy=True
+ b = pd.array(a, copy=True)
+ assert np.shares_memory(a, b._ndarray) is False
+
+ # copy=False
+ b = pd.array(a, copy=False)
+ assert np.shares_memory(a, b._ndarray) is True
+
+
+cet = pytz.timezone("CET")
+
+
[email protected]('data, expected', [
+ # period
+ ([pd.Period("2000", "D"), pd.Period("2001", "D")],
+ period_array(["2000", "2001"], freq="D")),
+
+ # interval
+ ([pd.Interval(0, 1), pd.Interval(1, 2)],
+ pd.arrays.IntervalArray.from_breaks([0, 1, 2])),
+
+ # datetime
+ ([pd.Timestamp('2000',), pd.Timestamp('2001')],
+ pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])),
+
+ ([datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
+ pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])),
+
+ (np.array([1, 2], dtype='M8[ns]'),
+ pd.arrays.DatetimeArray(np.array([1, 2], dtype='M8[ns]'))),
+
+ (np.array([1, 2], dtype='M8[us]'),
+ pd.arrays.DatetimeArray(np.array([1000, 2000], dtype='M8[ns]'))),
+
+ # datetimetz
+ ([pd.Timestamp('2000', tz='CET'), pd.Timestamp('2001', tz='CET')],
+ pd.arrays.DatetimeArray._from_sequence(
+ ['2000', '2001'], dtype=pd.DatetimeTZDtype(tz='CET'))),
+
+ ([datetime.datetime(2000, 1, 1, tzinfo=cet),
+ datetime.datetime(2001, 1, 1, tzinfo=cet)],
+ pd.arrays.DatetimeArray._from_sequence(['2000', '2001'],
+ tz=cet)),
+
+ # timedelta
+ ([pd.Timedelta('1H'), pd.Timedelta('2H')],
+ pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),
+
+ (np.array([1, 2], dtype='m8[ns]'),
+ pd.arrays.TimedeltaArray(np.array([1, 2], dtype='m8[ns]'))),
+
+ (np.array([1, 2], dtype='m8[us]'),
+ pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype='m8[ns]'))),
+
+])
+def test_array_inference(data, expected):
+ result = pd.array(data)
+ tm.assert_equal(result, expected)
+
+
[email protected]('data', [
+ # mix of frequencies
+ [pd.Period("2000", "D"), pd.Period("2001", "A")],
+ # mix of closed
+ [pd.Interval(0, 1, closed='left'), pd.Interval(1, 2, closed='right')],
+ # Mix of timezones
+ [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
+ # Mix of tz-aware and tz-naive
+ [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
+ np.array([pd.Timestamp('2000'), pd.Timestamp('2000', tz='CET')]),
+])
+def test_array_inference_fails(data):
+ result = pd.array(data)
+ expected = PandasArray(np.array(data, dtype=object))
+ tm.assert_extension_array_equal(result, expected)
+
+
[email protected]("data", [
+ np.array([[1, 2], [3, 4]]),
+ [[1, 2], [3, 4]],
+])
+def test_nd_raises(data):
+ with pytest.raises(ValueError, match='PandasArray must be 1-dimensional'):
+ pd.array(data)
+
+
+def test_scalar_raises():
+ with pytest.raises(ValueError,
+ match="Cannot pass scalar '1'"):
+ pd.array(1)
+
+# ---------------------------------------------------------------------------
+# A couple dummy classes to ensure that Series and Indexes are unboxed before
+# getting to the EA classes.
+
+
+@register_extension_dtype
+class DecimalDtype2(DecimalDtype):
+ name = 'decimal2'
+
+ @classmethod
+ def construct_array_type(cls):
+ return DecimalArray2
+
+
+class DecimalArray2(DecimalArray):
+ @classmethod
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ if isinstance(scalars, (pd.Series, pd.Index)):
+ raise TypeError
+
+ return super(DecimalArray2, cls)._from_sequence(
+ scalars, dtype=dtype, copy=copy
+ )
+
+
[email protected]("box", [pd.Series, pd.Index])
+def test_array_unboxes(box):
+ data = box([decimal.Decimal('1'), decimal.Decimal('2')])
+ # make sure it works
+ with pytest.raises(TypeError):
+ DecimalArray2._from_sequence(data)
+
+ result = pd.array(data, dtype='decimal2')
+ expected = DecimalArray2._from_sequence(data.values)
+ tm.assert_equal(result, expected)
+
+
[email protected]
+def registry_without_decimal():
+ idx = registry.dtypes.index(DecimalDtype)
+ registry.dtypes.pop(idx)
+ yield
+ registry.dtypes.append(DecimalDtype)
+
+
+def test_array_not_registered(registry_without_decimal):
+ # check we aren't on it
+ assert registry.find('decimal') is None
+ data = [decimal.Decimal('1'), decimal.Decimal('2')]
+
+ result = pd.array(data, dtype=DecimalDtype)
+ expected = DecimalArray._from_sequence(data)
+ tm.assert_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/test_datetimelike.py b/contrib/python/pandas/py2/pandas/tests/arrays/test_datetimelike.py
new file mode 100644
index 00000000000..f234e4fadec
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/test_datetimelike.py
@@ -0,0 +1,657 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import pytest
+
+import pandas.compat as compat
+
+import pandas as pd
+from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray
+import pandas.util.testing as tm
+
+
+# TODO: more freq variants
[email protected](params=['D', 'B', 'W', 'M', 'Q', 'Y'])
+def period_index(request):
+ """
+ A fixture to provide PeriodIndex objects with different frequencies.
+
+ Most PeriodArray behavior is already tested in PeriodIndex tests,
+ so here we just test that the PeriodArray behavior matches
+ the PeriodIndex behavior.
+ """
+ freqstr = request.param
+ # TODO: non-monotone indexes; NaTs, different start dates
+ pi = pd.period_range(start=pd.Timestamp('2000-01-01'),
+ periods=100,
+ freq=freqstr)
+ return pi
+
+
[email protected](params=['D', 'B', 'W', 'M', 'Q', 'Y'])
+def datetime_index(request):
+ """
+ A fixture to provide DatetimeIndex objects with different frequencies.
+
+ Most DatetimeArray behavior is already tested in DatetimeIndex tests,
+ so here we just test that the DatetimeArray behavior matches
+ the DatetimeIndex behavior.
+ """
+ freqstr = request.param
+ # TODO: non-monotone indexes; NaTs, different start dates, timezones
+ pi = pd.date_range(start=pd.Timestamp('2000-01-01'),
+ periods=100,
+ freq=freqstr)
+ return pi
+
+
[email protected]
+def timedelta_index(request):
+ """
+ A fixture to provide TimedeltaIndex objects with different frequencies.
+ Most TimedeltaArray behavior is already tested in TimedeltaIndex tests,
+ so here we just test that the TimedeltaArray behavior matches
+ the TimedeltaIndex behavior.
+ """
+ # TODO: flesh this out
+ return pd.TimedeltaIndex(['1 Day', '3 Hours', 'NaT'])
+
+
+class SharedTests(object):
+ index_cls = None
+
+ def test_compare_len1_raises(self):
+ # make sure we raise when comparing with different lengths, specific
+ # to the case where one has length-1, which numpy would broadcast
+ data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
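+ # int64 view of ten timestamps spaced one day apart, in nanoseconds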
+
+ idx = self.index_cls._simple_new(data, freq='D')
+ arr = self.array_cls(idx)
+
+ with pytest.raises(ValueError, match="Lengths must match"):
+ arr == arr[:1]
+
+ # test the index classes while we're at it, GH#23078
+ with pytest.raises(ValueError, match="Lengths must match"):
+ idx <= idx[[0]]
+
+ def test_take(self):
+ data = np.arange(100, dtype='i8') * 24 * 3600 * 10**9
+ np.random.shuffle(data)
+
+ idx = self.index_cls._simple_new(data, freq='D')
+ arr = self.array_cls(idx)
+
+ takers = [1, 4, 94]
+ result = arr.take(takers)
+ expected = idx.take(takers)
+
+ tm.assert_index_equal(self.index_cls(result), expected)
+
+ takers = np.array([1, 4, 94])
+ result = arr.take(takers)
+ expected = idx.take(takers)
+
+ tm.assert_index_equal(self.index_cls(result), expected)
+
+ def test_take_fill(self):
+ data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
+
+ idx = self.index_cls._simple_new(data, freq='D')
+ arr = self.array_cls(idx)
+
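+ # an indexer of -1 with allow_fill=True requests the fill value;
+ # None, NaN and NaT all normalize to NaT for datetime-like data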
+ result = arr.take([-1, 1], allow_fill=True, fill_value=None)
+ assert result[0] is pd.NaT
+
+ result = arr.take([-1, 1], allow_fill=True, fill_value=np.nan)
+ assert result[0] is pd.NaT
+
+ result = arr.take([-1, 1], allow_fill=True, fill_value=pd.NaT)
+ assert result[0] is pd.NaT
+
+ with pytest.raises(ValueError):
+ arr.take([0, 1], allow_fill=True, fill_value=2)
+
+ with pytest.raises(ValueError):
+ arr.take([0, 1], allow_fill=True, fill_value=2.0)
+
+ with pytest.raises(ValueError):
+ arr.take([0, 1], allow_fill=True,
+ fill_value=pd.Timestamp.now().time)
+
+ def test_concat_same_type(self):
+ data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
+
+ idx = self.index_cls._simple_new(data, freq='D').insert(0, pd.NaT)
+ arr = self.array_cls(idx)
+
+ result = arr._concat_same_type([arr[:-1], arr[1:], arr])
+ expected = idx._concat_same_dtype([idx[:-1], idx[1:], idx], None)
+
+ tm.assert_index_equal(self.index_cls(result), expected)
+
+ def test_unbox_scalar(self):
+ data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
+ arr = self.array_cls(data, freq='D')
+ result = arr._unbox_scalar(arr[0])
+ assert isinstance(result, (int, compat.long))
+
+ result = arr._unbox_scalar(pd.NaT)
+ assert isinstance(result, (int, compat.long))
+
+ with pytest.raises(ValueError):
+ arr._unbox_scalar('foo')
+
+ def test_check_compatible_with(self):
+ data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
+ arr = self.array_cls(data, freq='D')
+
+ arr._check_compatible_with(arr[0])
+ arr._check_compatible_with(arr[:1])
+ arr._check_compatible_with(pd.NaT)
+
+ def test_scalar_from_string(self):
+ data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
+ arr = self.array_cls(data, freq='D')
+ result = arr._scalar_from_string(str(arr[0]))
+ assert result == arr[0]
+
+ def test_reduce_invalid(self):
+ data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
+ arr = self.array_cls(data, freq='D')
+
+ with pytest.raises(TypeError, match='cannot perform'):
+ arr._reduce("not a method")
+
+ @pytest.mark.parametrize('method', ['pad', 'backfill'])
+ def test_fillna_method_doesnt_change_orig(self, method):
+ data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
+ arr = self.array_cls(data, freq='D')
+ arr[4] = pd.NaT
+
+ fill_value = arr[3] if method == 'pad' else arr[5]
+
+ result = arr.fillna(method=method)
+ assert result[4] == fill_value
+
+ # check that the original was not changed
+ assert arr[4] is pd.NaT
+
+ def test_searchsorted(self):
+ data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
+ arr = self.array_cls(data, freq='D')
+
+ # scalar
+ result = arr.searchsorted(arr[1])
+ assert result == 1
+
+ result = arr.searchsorted(arr[2], side="right")
+ assert result == 3
+
+ # own-type
+ result = arr.searchsorted(arr[1:3])
+ expected = np.array([1, 2], dtype=np.intp)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = arr.searchsorted(arr[1:3], side="right")
+ expected = np.array([2, 3], dtype=np.intp)
+ tm.assert_numpy_array_equal(result, expected)
+
+ # Following numpy convention, NaT goes at the beginning
+ # (unlike NaN which goes at the end)
+ result = arr.searchsorted(pd.NaT)
+ assert result == 0
+
+ def test_setitem(self):
+ data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
+ arr = self.array_cls(data, freq='D')
+
+ arr[0] = arr[1]
+ expected = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
+ expected[0] = expected[1]
+
+ tm.assert_numpy_array_equal(arr.asi8, expected)
+
+ arr[:2] = arr[-2:]
+ expected[:2] = expected[-2:]
+ tm.assert_numpy_array_equal(arr.asi8, expected)
+
+ def test_setitem_raises(self):
+ data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9
+ arr = self.array_cls(data, freq='D')
+ val = arr[0]
+
+ with pytest.raises(IndexError, match="index 12 is out of bounds"):
+ arr[12] = val
+
+ with pytest.raises(TypeError, match="'value' should be a.* 'object'"):
+ arr[0] = object()
+
+
+class TestDatetimeArray(SharedTests):
+ index_cls = pd.DatetimeIndex
+ array_cls = DatetimeArray
+
+ def test_round(self, tz_naive_fixture):
+ # GH#24064
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01 01:01:00', periods=3, freq='H', tz=tz)
+
+ result = dti.round(freq='2T')
+ expected = dti - pd.Timedelta(minutes=1)
+ tm.assert_index_equal(result, expected)
+
+ def test_array_interface(self, datetime_index):
+ arr = DatetimeArray(datetime_index)
+
+ # default asarray gives the same underlying data (for tz naive)
+ result = np.asarray(arr)
+ expected = arr._data
+ assert result is expected
+ tm.assert_numpy_array_equal(result, expected)
+ result = np.array(arr, copy=False)
+ assert result is expected
+ tm.assert_numpy_array_equal(result, expected)
+
+ # specifying M8[ns] gives the same result as default
+ result = np.asarray(arr, dtype='datetime64[ns]')
+ expected = arr._data
+ assert result is expected
+ tm.assert_numpy_array_equal(result, expected)
+ result = np.array(arr, dtype='datetime64[ns]', copy=False)
+ assert result is expected
+ tm.assert_numpy_array_equal(result, expected)
+ result = np.array(arr, dtype='datetime64[ns]')
+ assert result is not expected
+ tm.assert_numpy_array_equal(result, expected)
+
+ # to object dtype
+ result = np.asarray(arr, dtype=object)
+ expected = np.array(list(arr), dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ # to other dtype always copies
+ result = np.asarray(arr, dtype='int64')
+ assert result is not arr.asi8
+ assert not np.may_share_memory(arr, result)
+ expected = arr.asi8.copy()
+ tm.assert_numpy_array_equal(result, expected)
+
+ # other dtypes handled by numpy
+ for dtype in ['float64', str]:
+ result = np.asarray(arr, dtype=dtype)
+ expected = np.asarray(arr).astype(dtype)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_array_object_dtype(self, tz_naive_fixture):
+ # GH#23524
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=3, tz=tz)
+ arr = DatetimeArray(dti)
+
+ expected = np.array(list(dti))
+
+ result = np.array(arr, dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ # also test the DatetimeIndex method while we're at it
+ result = np.array(dti, dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_array_tz(self, tz_naive_fixture):
+ # GH#23524
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=3, tz=tz)
+ arr = DatetimeArray(dti)
+
+ expected = dti.asi8.view('M8[ns]')
+ result = np.array(arr, dtype='M8[ns]')
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = np.array(arr, dtype='datetime64[ns]')
+ tm.assert_numpy_array_equal(result, expected)
+
+ # check that we are not making copies when setting copy=False
+ result = np.array(arr, dtype='M8[ns]', copy=False)
+ assert result.base is expected.base
+ assert result.base is not None
+ result = np.array(arr, dtype='datetime64[ns]', copy=False)
+ assert result.base is expected.base
+ assert result.base is not None
+
+ def test_array_i8_dtype(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=3, tz=tz)
+ arr = DatetimeArray(dti)
+
+ expected = dti.asi8
+ result = np.array(arr, dtype='i8')
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = np.array(arr, dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+ # check that we are still making copies when setting copy=False
+ result = np.array(arr, dtype='i8', copy=False)
+ assert result.base is not expected.base
+ assert result.base is None
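+
+        # (illustrative) copy=False is only a request: numpy still copies
+        # whenever a dtype conversion is required, which is why the result
+        # has no base here.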
+
+ def test_from_array_keeps_base(self):
+ # Ensure that DatetimeArray._data.base isn't lost.
+ arr = np.array(['2000-01-01', '2000-01-02'], dtype='M8[ns]')
+ dta = DatetimeArray(arr)
+
+ assert dta._data is arr
+ dta = DatetimeArray(arr[:0])
+ assert dta._data.base is arr
+
+ def test_from_dti(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=3, tz=tz)
+ arr = DatetimeArray(dti)
+ assert list(dti) == list(arr)
+
+ # Check that Index.__new__ knows what to do with DatetimeArray
+ dti2 = pd.Index(arr)
+ assert isinstance(dti2, pd.DatetimeIndex)
+ assert list(dti2) == list(arr)
+
+ def test_astype_object(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ dti = pd.date_range('2016-01-01', periods=3, tz=tz)
+ arr = DatetimeArray(dti)
+ asobj = arr.astype('O')
+ assert isinstance(asobj, np.ndarray)
+ assert asobj.dtype == 'O'
+ assert list(asobj) == list(dti)
+
+ @pytest.mark.parametrize('freqstr', ['D', 'B', 'W', 'M', 'Q', 'Y'])
+ def test_to_perioddelta(self, datetime_index, freqstr):
+ # GH#23113
+ dti = datetime_index
+ arr = DatetimeArray(dti)
+
+ expected = dti.to_perioddelta(freq=freqstr)
+ result = arr.to_perioddelta(freq=freqstr)
+ assert isinstance(result, TimedeltaArray)
+
+ # placeholder until these become actual EA subclasses and we can use
+ # an EA-specific tm.assert_ function
+ tm.assert_index_equal(pd.Index(result), pd.Index(expected))
+
+ @pytest.mark.parametrize('freqstr', ['D', 'B', 'W', 'M', 'Q', 'Y'])
+ def test_to_period(self, datetime_index, freqstr):
+ dti = datetime_index
+ arr = DatetimeArray(dti)
+
+ expected = dti.to_period(freq=freqstr)
+ result = arr.to_period(freq=freqstr)
+ assert isinstance(result, PeriodArray)
+
+ # placeholder until these become actual EA subclasses and we can use
+ # an EA-specific tm.assert_ function
+ tm.assert_index_equal(pd.Index(result), pd.Index(expected))
+
+ @pytest.mark.parametrize('propname', pd.DatetimeIndex._bool_ops)
+ def test_bool_properties(self, datetime_index, propname):
+ # in this case _bool_ops is just `is_leap_year`
+ dti = datetime_index
+ arr = DatetimeArray(dti)
+ assert dti.freq == arr.freq
+
+ result = getattr(arr, propname)
+ expected = np.array(getattr(dti, propname), dtype=result.dtype)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('propname', pd.DatetimeIndex._field_ops)
+ def test_int_properties(self, datetime_index, propname):
+ dti = datetime_index
+ arr = DatetimeArray(dti)
+
+ result = getattr(arr, propname)
+ expected = np.array(getattr(dti, propname), dtype=result.dtype)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_take_fill_valid(self, datetime_index, tz_naive_fixture):
+ dti = datetime_index.tz_localize(tz_naive_fixture)
+ arr = DatetimeArray(dti)
+
+ now = pd.Timestamp.now().tz_localize(dti.tz)
+ result = arr.take([-1, 1], allow_fill=True, fill_value=now)
+ assert result[0] == now
+
+ with pytest.raises(ValueError):
+ # fill_value Timedelta invalid
+ arr.take([-1, 1], allow_fill=True, fill_value=now - now)
+
+ with pytest.raises(ValueError):
+ # fill_value Period invalid
+ arr.take([-1, 1], allow_fill=True, fill_value=pd.Period('2014Q1'))
+
+ tz = None if dti.tz is not None else 'US/Eastern'
+ now = pd.Timestamp.now().tz_localize(tz)
+ with pytest.raises(TypeError):
+ # Timestamp with mismatched tz-awareness
+ arr.take([-1, 1], allow_fill=True, fill_value=now)
+
+ with pytest.raises(ValueError):
+ # require NaT, not iNaT, as it could be confused with an integer
+ arr.take([-1, 1], allow_fill=True, fill_value=pd.NaT.value)
+
+ def test_concat_same_type_invalid(self, datetime_index):
+ # different timezones
+ dti = datetime_index
+ arr = DatetimeArray(dti)
+
+ if arr.tz is None:
+ other = arr.tz_localize('UTC')
+ else:
+ other = arr.tz_localize(None)
+
+ with pytest.raises(AssertionError):
+ arr._concat_same_type([arr, other])
+
+ def test_concat_same_type_different_freq(self):
+        # we *can* concatenate DTI with different freqs.
+ a = DatetimeArray(pd.date_range('2000', periods=2, freq='D',
+ tz='US/Central'))
+ b = DatetimeArray(pd.date_range('2000', periods=2, freq='H',
+ tz='US/Central'))
+ result = DatetimeArray._concat_same_type([a, b])
+ expected = DatetimeArray(pd.to_datetime([
+ '2000-01-01 00:00:00', '2000-01-02 00:00:00',
+ '2000-01-01 00:00:00', '2000-01-01 01:00:00',
+ ]).tz_localize("US/Central"))
+
+ tm.assert_datetime_array_equal(result, expected)
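+
+        # (illustrative) the concatenated result keeps neither input's freq,
+        # since the combined values are no longer regularly spaced.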
+
+
+class TestTimedeltaArray(SharedTests):
+ index_cls = pd.TimedeltaIndex
+ array_cls = TimedeltaArray
+
+ def test_from_tdi(self):
+ tdi = pd.TimedeltaIndex(['1 Day', '3 Hours'])
+ arr = TimedeltaArray(tdi)
+ assert list(arr) == list(tdi)
+
+ # Check that Index.__new__ knows what to do with TimedeltaArray
+ tdi2 = pd.Index(arr)
+ assert isinstance(tdi2, pd.TimedeltaIndex)
+ assert list(tdi2) == list(arr)
+
+ def test_astype_object(self):
+ tdi = pd.TimedeltaIndex(['1 Day', '3 Hours'])
+ arr = TimedeltaArray(tdi)
+ asobj = arr.astype('O')
+ assert isinstance(asobj, np.ndarray)
+ assert asobj.dtype == 'O'
+ assert list(asobj) == list(tdi)
+
+ def test_to_pytimedelta(self, timedelta_index):
+ tdi = timedelta_index
+ arr = TimedeltaArray(tdi)
+
+ expected = tdi.to_pytimedelta()
+ result = arr.to_pytimedelta()
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_total_seconds(self, timedelta_index):
+ tdi = timedelta_index
+ arr = TimedeltaArray(tdi)
+
+ expected = tdi.total_seconds()
+ result = arr.total_seconds()
+
+ tm.assert_numpy_array_equal(result, expected.values)
+
+ @pytest.mark.parametrize('propname', pd.TimedeltaIndex._field_ops)
+ def test_int_properties(self, timedelta_index, propname):
+ tdi = timedelta_index
+ arr = TimedeltaArray(tdi)
+
+ result = getattr(arr, propname)
+ expected = np.array(getattr(tdi, propname), dtype=result.dtype)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_array_interface(self, timedelta_index):
+ arr = TimedeltaArray(timedelta_index)
+
+ # default asarray gives the same underlying data
+ result = np.asarray(arr)
+ expected = arr._data
+ assert result is expected
+ tm.assert_numpy_array_equal(result, expected)
+ result = np.array(arr, copy=False)
+ assert result is expected
+ tm.assert_numpy_array_equal(result, expected)
+
+ # specifying m8[ns] gives the same result as default
+ result = np.asarray(arr, dtype='timedelta64[ns]')
+ expected = arr._data
+ assert result is expected
+ tm.assert_numpy_array_equal(result, expected)
+ result = np.array(arr, dtype='timedelta64[ns]', copy=False)
+ assert result is expected
+ tm.assert_numpy_array_equal(result, expected)
+ result = np.array(arr, dtype='timedelta64[ns]')
+ assert result is not expected
+ tm.assert_numpy_array_equal(result, expected)
+
+ # to object dtype
+ result = np.asarray(arr, dtype=object)
+ expected = np.array(list(arr), dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ # to other dtype always copies
+ result = np.asarray(arr, dtype='int64')
+ assert result is not arr.asi8
+ assert not np.may_share_memory(arr, result)
+ expected = arr.asi8.copy()
+ tm.assert_numpy_array_equal(result, expected)
+
+ # other dtypes handled by numpy
+ for dtype in ['float64', str]:
+ result = np.asarray(arr, dtype=dtype)
+ expected = np.asarray(arr).astype(dtype)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_take_fill_valid(self, timedelta_index):
+ tdi = timedelta_index
+ arr = TimedeltaArray(tdi)
+
+ td1 = pd.Timedelta(days=1)
+ result = arr.take([-1, 1], allow_fill=True, fill_value=td1)
+ assert result[0] == td1
+
+ now = pd.Timestamp.now()
+ with pytest.raises(ValueError):
+ # fill_value Timestamp invalid
+ arr.take([0, 1], allow_fill=True, fill_value=now)
+
+ with pytest.raises(ValueError):
+ # fill_value Period invalid
+ arr.take([0, 1], allow_fill=True, fill_value=now.to_period('D'))
+
+
+class TestPeriodArray(SharedTests):
+ index_cls = pd.PeriodIndex
+ array_cls = PeriodArray
+
+ def test_from_pi(self, period_index):
+ pi = period_index
+ arr = PeriodArray(pi)
+ assert list(arr) == list(pi)
+
+ # Check that Index.__new__ knows what to do with PeriodArray
+ pi2 = pd.Index(arr)
+ assert isinstance(pi2, pd.PeriodIndex)
+ assert list(pi2) == list(arr)
+
+ def test_astype_object(self, period_index):
+ pi = period_index
+ arr = PeriodArray(pi)
+ asobj = arr.astype('O')
+ assert isinstance(asobj, np.ndarray)
+ assert asobj.dtype == 'O'
+ assert list(asobj) == list(pi)
+
+ @pytest.mark.parametrize('how', ['S', 'E'])
+ def test_to_timestamp(self, how, period_index):
+ pi = period_index
+ arr = PeriodArray(pi)
+
+ expected = DatetimeArray(pi.to_timestamp(how=how))
+ result = arr.to_timestamp(how=how)
+ assert isinstance(result, DatetimeArray)
+
+ # placeholder until these become actual EA subclasses and we can use
+ # an EA-specific tm.assert_ function
+ tm.assert_index_equal(pd.Index(result), pd.Index(expected))
+
+ @pytest.mark.parametrize('propname', PeriodArray._bool_ops)
+ def test_bool_properties(self, period_index, propname):
+ # in this case _bool_ops is just `is_leap_year`
+ pi = period_index
+ arr = PeriodArray(pi)
+
+ result = getattr(arr, propname)
+ expected = np.array(getattr(pi, propname))
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('propname', PeriodArray._field_ops)
+ def test_int_properties(self, period_index, propname):
+ pi = period_index
+ arr = PeriodArray(pi)
+
+ result = getattr(arr, propname)
+ expected = np.array(getattr(pi, propname))
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_array_interface(self, period_index):
+ arr = PeriodArray(period_index)
+
+ # default asarray gives objects
+ result = np.asarray(arr)
+ expected = np.array(list(arr), dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ # to object dtype (same as default)
+ result = np.asarray(arr, dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ # to other dtypes
+ with pytest.raises(TypeError):
+ np.asarray(arr, dtype='int64')
+
+ with pytest.raises(TypeError):
+ np.asarray(arr, dtype='float64')
+
+ result = np.asarray(arr, dtype='S20')
+ expected = np.asarray(arr).astype('S20')
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/test_datetimes.py b/contrib/python/pandas/py2/pandas/tests/arrays/test_datetimes.py
new file mode 100644
index 00000000000..60caf61782b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/test_datetimes.py
@@ -0,0 +1,292 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for DatetimeArray
+"""
+import operator
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+
+import pandas as pd
+from pandas.core.arrays import DatetimeArray
+from pandas.core.arrays.datetimes import sequence_to_dt64ns
+import pandas.util.testing as tm
+
+
+class TestDatetimeArrayConstructor(object):
+ def test_freq_validation(self):
+ # GH#24623 check that invalid instances cannot be created with the
+ # public constructor
+ arr = np.arange(5, dtype=np.int64) * 3600 * 10**9
+
+ msg = ("Inferred frequency H from passed values does not "
+ "conform to passed frequency W-SUN")
+ with pytest.raises(ValueError, match=msg):
+ DatetimeArray(arr, freq="W")
+
+ @pytest.mark.parametrize('meth', [DatetimeArray._from_sequence,
+ sequence_to_dt64ns,
+ pd.to_datetime,
+ pd.DatetimeIndex])
+ def test_mixing_naive_tzaware_raises(self, meth):
+ # GH#24569
+ arr = np.array([pd.Timestamp('2000'), pd.Timestamp('2000', tz='CET')])
+
+ msg = ('Cannot mix tz-aware with tz-naive values|'
+ 'Tz-aware datetime.datetime cannot be converted '
+ 'to datetime64 unless utc=True')
+
+ for obj in [arr, arr[::-1]]:
+ # check that we raise regardless of whether naive is found
+ # before aware or vice-versa
+ with pytest.raises(ValueError, match=msg):
+ meth(obj)
+
+ def test_from_pandas_array(self):
+ arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9
+
+ result = DatetimeArray._from_sequence(arr, freq='infer')
+
+ expected = pd.date_range('1970-01-01', periods=5, freq='H')._data
+ tm.assert_datetime_array_equal(result, expected)
+
+ def test_mismatched_timezone_raises(self):
+ arr = DatetimeArray(np.array(['2000-01-01T06:00:00'], dtype='M8[ns]'),
+ dtype=DatetimeTZDtype(tz='US/Central'))
+ dtype = DatetimeTZDtype(tz='US/Eastern')
+ with pytest.raises(TypeError, match='Timezone of the array'):
+ DatetimeArray(arr, dtype=dtype)
+
+ def test_non_array_raises(self):
+ with pytest.raises(ValueError, match='list'):
+ DatetimeArray([1, 2, 3])
+
+ def test_other_type_raises(self):
+ with pytest.raises(ValueError,
+ match="The dtype of 'values' is incorrect.*bool"):
+ DatetimeArray(np.array([1, 2, 3], dtype='bool'))
+
+ def test_incorrect_dtype_raises(self):
+ with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
+ DatetimeArray(np.array([1, 2, 3], dtype='i8'), dtype='category')
+
+ def test_freq_infer_raises(self):
+ with pytest.raises(ValueError, match='Frequency inference'):
+ DatetimeArray(np.array([1, 2, 3], dtype='i8'), freq="infer")
+
+ def test_copy(self):
+ data = np.array([1, 2, 3], dtype='M8[ns]')
+ arr = DatetimeArray(data, copy=False)
+ assert arr._data is data
+
+ arr = DatetimeArray(data, copy=True)
+ assert arr._data is not data
+
+
+class TestDatetimeArrayComparisons(object):
+ # TODO: merge this into tests/arithmetic/test_datetime64 once it is
+ # sufficiently robust
+
+ def test_cmp_dt64_arraylike_tznaive(self, all_compare_operators):
+ # arbitrary tz-naive DatetimeIndex
+ opname = all_compare_operators.strip('_')
+ op = getattr(operator, opname)
+
+ dti = pd.date_range('2016-01-1', freq='MS', periods=9, tz=None)
+ arr = DatetimeArray(dti)
+ assert arr.freq == dti.freq
+ assert arr.tz == dti.tz
+
+ right = dti
+
+ expected = np.ones(len(arr), dtype=bool)
+ if opname in ['ne', 'gt', 'lt']:
+ # for these the comparisons should be all-False
+ expected = ~expected
+
+ result = op(arr, arr)
+ tm.assert_numpy_array_equal(result, expected)
+ for other in [right, np.array(right)]:
+ # TODO: add list and tuple, and object-dtype once those
+ # are fixed in the constructor
+ result = op(arr, other)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = op(other, arr)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+class TestDatetimeArray(object):
+ def test_astype_to_same(self):
+ arr = DatetimeArray._from_sequence(['2000'], tz='US/Central')
+ result = arr.astype(DatetimeTZDtype(tz="US/Central"), copy=False)
+ assert result is arr
+
+ @pytest.mark.parametrize("dtype", [
+ int, np.int32, np.int64, 'uint32', 'uint64',
+ ])
+ def test_astype_int(self, dtype):
+ arr = DatetimeArray._from_sequence([pd.Timestamp('2000'),
+ pd.Timestamp('2001')])
+ result = arr.astype(dtype)
+
+ if np.dtype(dtype).kind == 'u':
+ expected_dtype = np.dtype('uint64')
+ else:
+ expected_dtype = np.dtype('int64')
+ expected = arr.astype(expected_dtype)
+
+ assert result.dtype == expected_dtype
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_tz_setter_raises(self):
+ arr = DatetimeArray._from_sequence(['2000'], tz='US/Central')
+ with pytest.raises(AttributeError, match='tz_localize'):
+ arr.tz = 'UTC'
+
+ def test_setitem_different_tz_raises(self):
+ data = np.array([1, 2, 3], dtype='M8[ns]')
+ arr = DatetimeArray(data, copy=False,
+ dtype=DatetimeTZDtype(tz="US/Central"))
+ with pytest.raises(ValueError, match="None"):
+ arr[0] = pd.Timestamp('2000')
+
+ with pytest.raises(ValueError, match="US/Central"):
+ arr[0] = pd.Timestamp('2000', tz="US/Eastern")
+
+ def test_setitem_clears_freq(self):
+ a = DatetimeArray(pd.date_range('2000', periods=2, freq='D',
+ tz='US/Central'))
+ a[0] = pd.Timestamp("2000", tz="US/Central")
+ assert a.freq is None
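+
+        # (illustrative) any __setitem__ invalidates the cached freq, since
+        # the array can no longer guarantee regularly spaced values.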
+
+ def test_repeat_preserves_tz(self):
+ dti = pd.date_range('2000', periods=2, freq='D', tz='US/Central')
+ arr = DatetimeArray(dti)
+
+ repeated = arr.repeat([1, 1])
+
+ # preserves tz and values, but not freq
+ expected = DatetimeArray(arr.asi8, freq=None, dtype=arr.dtype)
+ tm.assert_equal(repeated, expected)
+
+ def test_value_counts_preserves_tz(self):
+ dti = pd.date_range('2000', periods=2, freq='D', tz='US/Central')
+ arr = DatetimeArray(dti).repeat([4, 3])
+
+ result = arr.value_counts()
+
+ # Note: not tm.assert_index_equal, since `freq`s do not match
+ assert result.index.equals(dti)
+
+ arr[-2] = pd.NaT
+ result = arr.value_counts()
+ expected = pd.Series([1, 4, 2],
+ index=[pd.NaT, dti[0], dti[1]])
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('method', ['pad', 'backfill'])
+ def test_fillna_preserves_tz(self, method):
+ dti = pd.date_range('2000-01-01', periods=5, freq='D', tz='US/Central')
+ arr = DatetimeArray(dti, copy=True)
+ arr[2] = pd.NaT
+
+ fill_val = dti[1] if method == 'pad' else dti[3]
+ expected = DatetimeArray._from_sequence(
+ [dti[0], dti[1], fill_val, dti[3], dti[4]],
+ freq=None, tz='US/Central'
+ )
+
+ result = arr.fillna(method=method)
+ tm.assert_extension_array_equal(result, expected)
+
+ # assert that arr and dti were not modified in-place
+ assert arr[2] is pd.NaT
+ assert dti[2] == pd.Timestamp('2000-01-03', tz='US/Central')
+
+ def test_array_interface_tz(self):
+ tz = "US/Central"
+ data = DatetimeArray(pd.date_range('2017', periods=2, tz=tz))
+ result = np.asarray(data)
+
+ expected = np.array([pd.Timestamp('2017-01-01T00:00:00', tz=tz),
+ pd.Timestamp('2017-01-02T00:00:00', tz=tz)],
+ dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = np.asarray(data, dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = np.asarray(data, dtype='M8[ns]')
+
+ expected = np.array(['2017-01-01T06:00:00',
+ '2017-01-02T06:00:00'], dtype="M8[ns]")
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_array_interface(self):
+ data = DatetimeArray(pd.date_range('2017', periods=2))
+ expected = np.array(['2017-01-01T00:00:00', '2017-01-02T00:00:00'],
+ dtype='datetime64[ns]')
+
+ result = np.asarray(data)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = np.asarray(data, dtype=object)
+ expected = np.array([pd.Timestamp('2017-01-01T00:00:00'),
+ pd.Timestamp('2017-01-02T00:00:00')],
+ dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+class TestSequenceToDT64NS(object):
+
+ def test_tz_dtype_mismatch_raises(self):
+ arr = DatetimeArray._from_sequence(['2000'], tz='US/Central')
+ with pytest.raises(TypeError, match='data is already tz-aware'):
+ sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="UTC"))
+
+ def test_tz_dtype_matches(self):
+ arr = DatetimeArray._from_sequence(['2000'], tz='US/Central')
+ result, _, _ = sequence_to_dt64ns(
+ arr, dtype=DatetimeTZDtype(tz="US/Central"))
+ tm.assert_numpy_array_equal(arr._data, result)
+
+
+class TestReductions(object):
+
+ @pytest.mark.parametrize("tz", [None, "US/Central"])
+ def test_min_max(self, tz):
+ arr = DatetimeArray._from_sequence([
+ '2000-01-03',
+ '2000-01-03',
+ 'NaT',
+ '2000-01-02',
+ '2000-01-05',
+ '2000-01-04',
+ ], tz=tz)
+
+ result = arr.min()
+ expected = pd.Timestamp('2000-01-02', tz=tz)
+ assert result == expected
+
+ result = arr.max()
+ expected = pd.Timestamp('2000-01-05', tz=tz)
+ assert result == expected
+
+ result = arr.min(skipna=False)
+ assert result is pd.NaT
+
+ result = arr.max(skipna=False)
+ assert result is pd.NaT
+
+ @pytest.mark.parametrize("tz", [None, "US/Central"])
+ @pytest.mark.parametrize('skipna', [True, False])
+ def test_min_max_empty(self, skipna, tz):
+ arr = DatetimeArray._from_sequence([], tz=tz)
+ result = arr.min(skipna=skipna)
+ assert result is pd.NaT
+
+ result = arr.max(skipna=skipna)
+ assert result is pd.NaT
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/test_integer.py b/contrib/python/pandas/py2/pandas/tests/arrays/test_integer.py
new file mode 100644
index 00000000000..09298bb5cd0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/test_integer.py
@@ -0,0 +1,713 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.generic import ABCIndexClass
+
+import pandas as pd
+from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar
+from pandas.core.arrays import IntegerArray, integer_array
+from pandas.core.arrays.integer import (
+ Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype,
+ UInt32Dtype, UInt64Dtype)
+from pandas.tests.extension.base import BaseOpsUtil
+import pandas.util.testing as tm
+
+
+def make_data():
+ return (list(range(8)) +
+ [np.nan] +
+ list(range(10, 98)) +
+ [np.nan] +
+ [99, 100])
+
+
[email protected](params=[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype,
+ UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype])
+def dtype(request):
+ return request.param()
+
+
[email protected]
+def data(dtype):
+ return integer_array(make_data(), dtype=dtype)
+
+
[email protected]
+def data_missing(dtype):
+ return integer_array([np.nan, 1], dtype=dtype)
+
+
[email protected](params=['data', 'data_missing'])
+def all_data(request, data, data_missing):
+ """Parametrized fixture giving 'data' and 'data_missing'"""
+ if request.param == 'data':
+ return data
+ elif request.param == 'data_missing':
+ return data_missing
+
+
+def test_dtypes(dtype):
+ # smoke tests on auto dtype construction
+
+ if dtype.is_signed_integer:
+ assert np.dtype(dtype.type).kind == 'i'
+ else:
+ assert np.dtype(dtype.type).kind == 'u'
+ assert dtype.name is not None
+
+
[email protected]('dtype, expected', [
+ (Int8Dtype(), 'Int8Dtype()'),
+ (Int16Dtype(), 'Int16Dtype()'),
+ (Int32Dtype(), 'Int32Dtype()'),
+ (Int64Dtype(), 'Int64Dtype()'),
+ (UInt8Dtype(), 'UInt8Dtype()'),
+ (UInt16Dtype(), 'UInt16Dtype()'),
+ (UInt32Dtype(), 'UInt32Dtype()'),
+ (UInt64Dtype(), 'UInt64Dtype()'),
+])
+def test_repr_dtype(dtype, expected):
+ assert repr(dtype) == expected
+
+
+def test_repr_array():
+ result = repr(integer_array([1, None, 3]))
+ expected = (
+ '<IntegerArray>\n'
+ '[1, NaN, 3]\n'
+ 'Length: 3, dtype: Int64'
+ )
+ assert result == expected
+
+
+def test_repr_array_long():
+ data = integer_array([1, 2, None] * 1000)
+ expected = (
+ "<IntegerArray>\n"
+ "[ 1, 2, NaN, 1, 2, NaN, 1, 2, NaN, 1,\n"
+ " ...\n"
+ " NaN, 1, 2, NaN, 1, 2, NaN, 1, 2, NaN]\n"
+ "Length: 3000, dtype: Int64"
+ )
+ result = repr(data)
+ assert result == expected
+
+
+class TestConstructors(object):
+
+ def test_from_dtype_from_float(self, data):
+ # construct from our dtype & string dtype
+ dtype = data.dtype
+
+ # from float
+ expected = pd.Series(data)
+ result = pd.Series(np.array(data).astype('float'), dtype=str(dtype))
+ tm.assert_series_equal(result, expected)
+
+ # from int / list
+ expected = pd.Series(data)
+ result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
+ tm.assert_series_equal(result, expected)
+
+ # from int / array
+ expected = pd.Series(data).dropna().reset_index(drop=True)
+ dropped = np.array(data.dropna()).astype(np.dtype((dtype.type)))
+ result = pd.Series(dropped, dtype=str(dtype))
+ tm.assert_series_equal(result, expected)
+
+
+class TestArithmeticOps(BaseOpsUtil):
+
+ def _check_divmod_op(self, s, op, other, exc=None):
+ super(TestArithmeticOps, self)._check_divmod_op(s, op, other, None)
+
+ def _check_op(self, s, op_name, other, exc=None):
+ op = self.get_op_from_name(op_name)
+ result = op(s, other)
+
+ # compute expected
+ mask = s.isna()
+
+ # if s is a DataFrame, squeeze to a Series
+ # for comparison
+ if isinstance(s, pd.DataFrame):
+ result = result.squeeze()
+ s = s.squeeze()
+ mask = mask.squeeze()
+
+ # other array is an Integer
+ if isinstance(other, IntegerArray):
+ omask = getattr(other, 'mask', None)
+ mask = getattr(other, 'data', other)
+ if omask is not None:
+ mask |= omask
+
+        # 1 ** na is 1 (not na), so those entries need to be unmasked
+ if op_name == '__pow__':
+ mask = np.where(s == 1, False, mask)
+
+ elif op_name == '__rpow__':
+ mask = np.where(other == 1, False, mask)
+
+ # float result type or float op
+ if ((is_float_dtype(other) or is_float(other) or
+ op_name in ['__rtruediv__', '__truediv__',
+ '__rdiv__', '__div__'])):
+ rs = s.astype('float')
+ expected = op(rs, other)
+ self._check_op_float(result, expected, mask, s, op_name, other)
+
+ # integer result type
+ else:
+ rs = pd.Series(s.values._data)
+ expected = op(rs, other)
+ self._check_op_integer(result, expected, mask, s, op_name, other)
+
+ def _check_op_float(self, result, expected, mask, s, op_name, other):
+        # check comparisons that result in float dtypes
+
+ expected[mask] = np.nan
+ tm.assert_series_equal(result, expected)
+
+ def _check_op_integer(self, result, expected, mask, s, op_name, other):
+        # check comparisons that result in integer dtypes
+
+        # To compare properly, we convert the expected to float, mask the
+        # NaN locations, and convert infs. If we have uints, we process as
+        # uints and then convert to float. Ultimately we want to create an
+        # IntegerArray for the comparison.
+
+ fill_value = 0
+
+        # mod/rmod turn a floating-point 0 into NaN, while the integer
+        # result works as expected (no nan)
+ if op_name in ['__mod__', '__rmod__']:
+ if is_scalar(other):
+ if other == 0:
+ expected[s.values == 0] = 0
+ else:
+ expected = expected.fillna(0)
+ else:
+ expected[(s.values == 0) &
+ ((expected == 0) | expected.isna())] = 0
+ try:
+ expected[(expected == np.inf) | (expected == -np.inf)] = fill_value
+ original = expected
+ expected = expected.astype(s.dtype)
+
+ except ValueError:
+
+ expected = expected.astype(float)
+ expected[(expected == np.inf) | (expected == -np.inf)] = fill_value
+ original = expected
+ expected = expected.astype(s.dtype)
+
+ expected[mask] = np.nan
+
+ # assert that the expected astype is ok
+ # (skip for unsigned as they have wrap around)
+ if not s.dtype.is_unsigned_integer:
+ original = pd.Series(original)
+
+ # we need to fill with 0's to emulate what an astype('int') does
+ # (truncation) for certain ops
+ if op_name in ['__rtruediv__', '__rdiv__']:
+ mask |= original.isna()
+ original = original.fillna(0).astype('int')
+
+ original = original.astype('float')
+ original[mask] = np.nan
+ tm.assert_series_equal(original, expected.astype('float'))
+
+ # assert our expected result
+ tm.assert_series_equal(result, expected)
+
+ def test_arith_integer_array(self, data, all_arithmetic_operators):
+ # we operate with a rhs of an integer array
+
+ op = all_arithmetic_operators
+
+ s = pd.Series(data)
+ rhs = pd.Series([1] * len(data), dtype=data.dtype)
+ rhs.iloc[-1] = np.nan
+
+ self._check_op(s, op, rhs)
+
+ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
+ # scalar
+ op = all_arithmetic_operators
+
+ s = pd.Series(data)
+ self._check_op(s, op, 1, exc=TypeError)
+
+ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
+ # frame & scalar
+ op = all_arithmetic_operators
+
+ df = pd.DataFrame({'A': data})
+ self._check_op(df, op, 1, exc=TypeError)
+
+ def test_arith_series_with_array(self, data, all_arithmetic_operators):
+ # ndarray & other series
+ op = all_arithmetic_operators
+
+ s = pd.Series(data)
+ other = np.ones(len(s), dtype=s.dtype.type)
+ self._check_op(s, op, other, exc=TypeError)
+
+ def test_arith_coerce_scalar(self, data, all_arithmetic_operators):
+
+ op = all_arithmetic_operators
+ s = pd.Series(data)
+
+ other = 0.01
+ self._check_op(s, op, other)
+
+ @pytest.mark.parametrize("other", [1., 1.0, np.array(1.), np.array([1.])])
+ def test_arithmetic_conversion(self, all_arithmetic_operators, other):
+        # if we have a float operand we should get a float result,
+        # even if the float is integer-valued
+ op = self.get_op_from_name(all_arithmetic_operators)
+
+ s = pd.Series([1, 2, 3], dtype='Int64')
+ result = op(s, other)
+ assert result.dtype is np.dtype('float')
+
+ @pytest.mark.parametrize("other", [0, 0.5])
+ def test_arith_zero_dim_ndarray(self, other):
+ arr = integer_array([1, None, 2])
+ result = arr + np.array(other)
+ expected = arr + other
+ tm.assert_equal(result, expected)
+
+ def test_error(self, data, all_arithmetic_operators):
+ # invalid ops
+
+ op = all_arithmetic_operators
+ s = pd.Series(data)
+ ops = getattr(s, op)
+ opa = getattr(data, op)
+
+ # invalid scalars
+ with pytest.raises(TypeError):
+ ops('foo')
+ with pytest.raises(TypeError):
+ ops(pd.Timestamp('20180101'))
+
+ # invalid array-likes
+ with pytest.raises(TypeError):
+ ops(pd.Series('foo', index=s.index))
+
+ if op != '__rpow__':
+ # TODO(extension)
+ # rpow with a datetimelike coerces the integer array incorrectly
+ with pytest.raises(TypeError):
+ ops(pd.Series(pd.date_range('20180101', periods=len(s))))
+
+ # 2d
+ with pytest.raises(NotImplementedError):
+ opa(pd.DataFrame({'A': s}))
+ with pytest.raises(NotImplementedError):
+ opa(np.arange(len(s)).reshape(-1, len(s)))
+
+ def test_pow(self):
+ # https://github.com/pandas-dev/pandas/issues/22022
+ a = integer_array([1, np.nan, np.nan, 1])
+ b = integer_array([1, np.nan, 1, np.nan])
+ result = a ** b
+ expected = pd.core.arrays.integer_array([1, np.nan, np.nan, 1])
+ tm.assert_extension_array_equal(result, expected)
+
+ def test_rpow_one_to_na(self):
+ # https://github.com/pandas-dev/pandas/issues/22022
+ arr = integer_array([np.nan, np.nan])
+ result = np.array([1.0, 2.0]) ** arr
+ expected = np.array([1.0, np.nan])
+ tm.assert_numpy_array_equal(result, expected)
+
+
+class TestComparisonOps(BaseOpsUtil):
+
+ def _compare_other(self, data, op_name, other):
+ op = self.get_op_from_name(op_name)
+
+ # array
+ result = pd.Series(op(data, other))
+ expected = pd.Series(op(data._data, other))
+
+ # fill the nan locations
+ expected[data._mask] = True if op_name == '__ne__' else False
+
+ tm.assert_series_equal(result, expected)
+
+ # series
+ s = pd.Series(data)
+ result = op(s, other)
+
+ expected = pd.Series(data._data)
+ expected = op(expected, other)
+
+ # fill the nan locations
+ expected[data._mask] = True if op_name == '__ne__' else False
+
+ tm.assert_series_equal(result, expected)
+
+ def test_compare_scalar(self, data, all_compare_operators):
+ op_name = all_compare_operators
+ self._compare_other(data, op_name, 0)
+
+ def test_compare_array(self, data, all_compare_operators):
+ op_name = all_compare_operators
+ other = pd.Series([0] * len(data))
+ self._compare_other(data, op_name, other)
+
+
+class TestCasting(object):
+
+ @pytest.mark.parametrize('dropna', [True, False])
+ def test_construct_index(self, all_data, dropna):
+ # ensure that we do not coerce to Float64Index, rather
+ # keep as Index
+
+ all_data = all_data[:10]
+ if dropna:
+ other = np.array(all_data[~all_data.isna()])
+ else:
+ other = all_data
+
+ result = pd.Index(integer_array(other, dtype=all_data.dtype))
+ expected = pd.Index(other, dtype=object)
+
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('dropna', [True, False])
+ def test_astype_index(self, all_data, dropna):
+        # astype an int/uint Index to our integer extension dtype
+
+ all_data = all_data[:10]
+ if dropna:
+ other = all_data[~all_data.isna()]
+ else:
+ other = all_data
+
+ dtype = all_data.dtype
+ idx = pd.Index(np.array(other))
+ assert isinstance(idx, ABCIndexClass)
+
+ result = idx.astype(dtype)
+ expected = idx.astype(object).astype(dtype)
+ tm.assert_index_equal(result, expected)
+
+ def test_astype(self, all_data):
+ all_data = all_data[:10]
+
+ ints = all_data[~all_data.isna()]
+ mixed = all_data
+ dtype = Int8Dtype()
+
+ # coerce to same type - ints
+ s = pd.Series(ints)
+ result = s.astype(all_data.dtype)
+ expected = pd.Series(ints)
+ tm.assert_series_equal(result, expected)
+
+ # coerce to same other - ints
+ s = pd.Series(ints)
+ result = s.astype(dtype)
+ expected = pd.Series(ints, dtype=dtype)
+ tm.assert_series_equal(result, expected)
+
+ # coerce to same numpy_dtype - ints
+ s = pd.Series(ints)
+ result = s.astype(all_data.dtype.numpy_dtype)
+ expected = pd.Series(ints._data.astype(
+ all_data.dtype.numpy_dtype))
+ tm.assert_series_equal(result, expected)
+
+ # coerce to same type - mixed
+ s = pd.Series(mixed)
+ result = s.astype(all_data.dtype)
+ expected = pd.Series(mixed)
+ tm.assert_series_equal(result, expected)
+
+ # coerce to same other - mixed
+ s = pd.Series(mixed)
+ result = s.astype(dtype)
+ expected = pd.Series(mixed, dtype=dtype)
+ tm.assert_series_equal(result, expected)
+
+ # coerce to same numpy_dtype - mixed
+ s = pd.Series(mixed)
+ with pytest.raises(ValueError):
+ s.astype(all_data.dtype.numpy_dtype)
+
+ # coerce to object
+ s = pd.Series(mixed)
+ result = s.astype('object')
+ expected = pd.Series(np.asarray(mixed))
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', [Int8Dtype(), 'Int8',
+ UInt32Dtype(), 'UInt32'])
+ def test_astype_specific_casting(self, dtype):
+ s = pd.Series([1, 2, 3], dtype='Int64')
+ result = s.astype(dtype)
+ expected = pd.Series([1, 2, 3], dtype=dtype)
+ tm.assert_series_equal(result, expected)
+
+ s = pd.Series([1, 2, 3, None], dtype='Int64')
+ result = s.astype(dtype)
+ expected = pd.Series([1, 2, 3, None], dtype=dtype)
+ tm.assert_series_equal(result, expected)
+
+ def test_construct_cast_invalid(self, dtype):
+
+ msg = "cannot safely"
+ arr = [1.2, 2.3, 3.7]
+ with pytest.raises(TypeError, match=msg):
+ integer_array(arr, dtype=dtype)
+
+ with pytest.raises(TypeError, match=msg):
+ pd.Series(arr).astype(dtype)
+
+ arr = [1.2, 2.3, 3.7, np.nan]
+ with pytest.raises(TypeError, match=msg):
+ integer_array(arr, dtype=dtype)
+
+ with pytest.raises(TypeError, match=msg):
+ pd.Series(arr).astype(dtype)
+
+
+def test_frame_repr(data_missing):
+
+ df = pd.DataFrame({'A': data_missing})
+ result = repr(df)
+ expected = ' A\n0 NaN\n1 1'
+ assert result == expected
+
+
+def test_conversions(data_missing):
+
+ # astype to object series
+ df = pd.DataFrame({'A': data_missing})
+ result = df['A'].astype('object')
+ expected = pd.Series(np.array([np.nan, 1], dtype=object), name='A')
+ tm.assert_series_equal(result, expected)
+
+ # convert to object ndarray
+ # we assert that we are exactly equal
+ # including type conversions of scalars
+ result = df['A'].astype('object').values
+ expected = np.array([np.nan, 1], dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ for r, e in zip(result, expected):
+ if pd.isnull(r):
+ assert pd.isnull(e)
+ elif is_integer(r):
+ # PY2 can be int or long
+ assert r == e
+ assert is_integer(e)
+ else:
+ assert r == e
+ assert type(r) == type(e)
+
+
+def test_integer_array_constructor():
+ values = np.array([1, 2, 3, 4], dtype='int64')
+ mask = np.array([False, False, False, True], dtype='bool')
+
+ result = IntegerArray(values, mask)
+ expected = integer_array([1, 2, 3, np.nan], dtype='int64')
+ tm.assert_extension_array_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ IntegerArray(values.tolist(), mask)
+
+ with pytest.raises(TypeError):
+ IntegerArray(values, mask.tolist())
+
+ with pytest.raises(TypeError):
+ IntegerArray(values.astype(float), mask)
+
+ with pytest.raises(TypeError):
+ IntegerArray(values)
+
+
[email protected]('a, b', [
+    ([1, None], [1, np.nan]),
+ ([None], [np.nan]),
+ ([None, np.nan], [np.nan, np.nan]),
+ ([np.nan, np.nan], [np.nan, np.nan]),
+])
+def test_integer_array_constructor_none_is_nan(a, b):
+ result = integer_array(a)
+ expected = integer_array(b)
+ tm.assert_extension_array_equal(result, expected)
+
+
+def test_integer_array_constructor_copy():
+ values = np.array([1, 2, 3, 4], dtype='int64')
+ mask = np.array([False, False, False, True], dtype='bool')
+
+ result = IntegerArray(values, mask)
+ assert result._data is values
+ assert result._mask is mask
+
+ result = IntegerArray(values, mask, copy=True)
+ assert result._data is not values
+ assert result._mask is not mask
+
+
[email protected](
+    'values',
+ [
+ ['foo', 'bar'],
+ ['1', '2'],
+ 'foo',
+ 1,
+ 1.0,
+ pd.date_range('20130101', periods=2),
+ np.array(['foo']),
+ [[1, 2], [3, 4]],
+ [np.nan, {'a': 1}]])
+def test_to_integer_array_error(values):
+ # error in converting existing arrays to IntegerArrays
+ with pytest.raises(TypeError):
+ integer_array(values)
+
+
+def test_to_integer_array_inferred_dtype():
+ # if values has dtype -> respect it
+ result = integer_array(np.array([1, 2], dtype='int8'))
+ assert result.dtype == Int8Dtype()
+ result = integer_array(np.array([1, 2], dtype='int32'))
+ assert result.dtype == Int32Dtype()
+
+ # if values have no dtype -> always int64
+ result = integer_array([1, 2])
+ assert result.dtype == Int64Dtype()
+
+
+def test_to_integer_array_dtype_keyword():
+ result = integer_array([1, 2], dtype='int8')
+ assert result.dtype == Int8Dtype()
+
+ # if values has dtype -> override it
+ result = integer_array(np.array([1, 2], dtype='int8'), dtype='int32')
+ assert result.dtype == Int32Dtype()
+
+
+def test_to_integer_array_float():
+ result = integer_array([1., 2.])
+ expected = integer_array([1, 2])
+ tm.assert_extension_array_equal(result, expected)
+
+ with pytest.raises(TypeError, match="cannot safely cast non-equivalent"):
+ integer_array([1.5, 2.])
+
+ # for float dtypes, the itemsize is not preserved
+ result = integer_array(np.array([1., 2.], dtype='float32'))
+ assert result.dtype == Int64Dtype()
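+
+    # (illustrative) the safe-cast check requires float values to be exactly
+    # representable as integers: 1.0 and 2.0 pass, 1.5 is rejected above.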
+
+
[email protected](
+    'values, to_dtype, result_dtype',
+ [
+ (np.array([1], dtype='int64'), None, Int64Dtype),
+ (np.array([1, np.nan]), None, Int64Dtype),
+ (np.array([1, np.nan]), 'int8', Int8Dtype)])
+def test_to_integer_array(values, to_dtype, result_dtype):
+ # convert existing arrays to IntegerArrays
+ result = integer_array(values, dtype=to_dtype)
+ assert result.dtype == result_dtype()
+ expected = integer_array(values, dtype=result_dtype())
+ tm.assert_extension_array_equal(result, expected)
+
+
+def test_cross_type_arithmetic():
+
+ df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
+ 'B': pd.Series([1, np.nan, 3], dtype='UInt8'),
+ 'C': [1, 2, 3]})
+
+ result = df.A + df.C
+ expected = pd.Series([2, 4, np.nan], dtype='Int64')
+ tm.assert_series_equal(result, expected)
+
+ result = (df.A + df.C) * 3 == 12
+ expected = pd.Series([False, True, False])
+ tm.assert_series_equal(result, expected)
+
+ result = df.A + df.B
+ expected = pd.Series([2, np.nan, np.nan], dtype='Int64')
+ tm.assert_series_equal(result, expected)
+
+
[email protected]('op', ['sum', 'min', 'max', 'prod'])
+def test_preserve_dtypes(op):
+    # TODO(#22346): preserve Int64 dtype for ops where that is possible
+    # (mean would actually work here, but it generally returns a float)
+ df = pd.DataFrame({
+ "A": ['a', 'b', 'b'],
+ "B": [1, None, 3],
+ "C": integer_array([1, None, 3], dtype='Int64'),
+ })
+
+ # op
+ result = getattr(df.C, op)()
+ assert isinstance(result, int)
+
+ # groupby
+ result = getattr(df.groupby("A"), op)()
+
+ expected = pd.DataFrame({
+ "B": np.array([1.0, 3.0]),
+ "C": integer_array([1, 3], dtype="Int64")
+ }, index=pd.Index(['a', 'b'], name='A'))
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]('op', ['mean'])
+def test_reduce_to_float(op):
+ # some reduce ops always return float, even if the result
+ # is a rounded number
+ df = pd.DataFrame({
+ "A": ['a', 'b', 'b'],
+ "B": [1, None, 3],
+ "C": integer_array([1, None, 3], dtype='Int64'),
+ })
+
+ # op
+ result = getattr(df.C, op)()
+ assert isinstance(result, float)
+
+ # groupby
+ result = getattr(df.groupby("A"), op)()
+
+ expected = pd.DataFrame({
+ "B": np.array([1.0, 3.0]),
+ "C": integer_array([1, 3], dtype="Int64")
+ }, index=pd.Index(['a', 'b'], name='A'))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_astype_nansafe():
+ # see gh-22343
+ arr = integer_array([np.nan, 1, 2], dtype="Int8")
+ msg = "cannot convert float NaN to integer"
+
+ with pytest.raises(ValueError, match=msg):
+ arr.astype('uint32')
+
+
+# TODO(jreback) - these need testing / are broken
+
+# shift
+
+# set_index (destroys type)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/test_numpy.py b/contrib/python/pandas/py2/pandas/tests/arrays/test_numpy.py
new file mode 100644
index 00000000000..a77f1f8a7b3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/test_numpy.py
@@ -0,0 +1,206 @@
+"""
+Additional tests for PandasArray that aren't covered by
+the interface tests.
+"""
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import compat
+from pandas.arrays import PandasArray
+from pandas.core.arrays.numpy_ import PandasDtype
+import pandas.util.testing as tm
+
+
[email protected](params=[
+    np.array(['a', 'b'], dtype=object),
+ np.array([0, 1], dtype=float),
+ np.array([0, 1], dtype=int),
+ np.array([0, 1 + 2j], dtype=complex),
+ np.array([True, False], dtype=bool),
+ np.array([0, 1], dtype='datetime64[ns]'),
+ np.array([0, 1], dtype='timedelta64[ns]'),
+])
+def any_numpy_array(request):
+ """
+ Parametrized fixture for NumPy arrays with different dtypes.
+
+ This excludes string and bytes.
+ """
+ return request.param
+
+
+# ----------------------------------------------------------------------------
+# PandasDtype
+
[email protected]('dtype, expected', [
+ ('bool', True),
+ ('int', True),
+ ('uint', True),
+ ('float', True),
+ ('complex', True),
+ ('str', False),
+ pytest.param('bytes', False,
+ marks=pytest.mark.skipif(compat.PY2, reason="PY2")),
+ ('datetime64[ns]', False),
+ ('object', False),
+ ('void', False),
+])
+def test_is_numeric(dtype, expected):
+ dtype = PandasDtype(dtype)
+ assert dtype._is_numeric is expected
+
+
[email protected]('dtype, expected', [
+ ('bool', True),
+ ('int', False),
+ ('uint', False),
+ ('float', False),
+ ('complex', False),
+ ('str', False),
+ pytest.param('bytes', False,
+ marks=pytest.mark.skipif(compat.PY2, reason="PY2")),
+ ('datetime64[ns]', False),
+ ('object', False),
+ ('void', False)
+])
+def test_is_boolean(dtype, expected):
+ dtype = PandasDtype(dtype)
+ assert dtype._is_boolean is expected
+
+
+def test_repr():
+ dtype = PandasDtype(np.dtype("int64"))
+ assert repr(dtype) == "PandasDtype('int64')"
+
+
+def test_constructor_from_string():
+ result = PandasDtype.construct_from_string("int64")
+ expected = PandasDtype(np.dtype("int64"))
+ assert result == expected
+
+
+# ----------------------------------------------------------------------------
+# Construction
+
+def test_constructor_no_coercion():
+ with pytest.raises(ValueError, match='NumPy array'):
+ PandasArray([1, 2, 3])
+
+
+def test_series_constructor_with_copy():
+ ndarray = np.array([1, 2, 3])
+ ser = pd.Series(PandasArray(ndarray), copy=True)
+
+ assert ser.values is not ndarray
+
+
+def test_series_constructor_with_astype():
+ ndarray = np.array([1, 2, 3])
+ result = pd.Series(PandasArray(ndarray), dtype="float64")
+ expected = pd.Series([1.0, 2.0, 3.0], dtype="float64")
+ tm.assert_series_equal(result, expected)
+
+
+def test_from_sequence_dtype():
+ arr = np.array([1, 2, 3], dtype='int64')
+ result = PandasArray._from_sequence(arr, dtype='uint64')
+ expected = PandasArray(np.array([1, 2, 3], dtype='uint64'))
+ tm.assert_extension_array_equal(result, expected)
+
+
+def test_constructor_copy():
+ arr = np.array([0, 1])
+ result = PandasArray(arr, copy=True)
+
+ assert np.shares_memory(result._ndarray, arr) is False
+
+
+def test_constructor_with_data(any_numpy_array):
+ nparr = any_numpy_array
+ arr = PandasArray(nparr)
+ assert arr.dtype.numpy_dtype == nparr.dtype
+
+
+# ----------------------------------------------------------------------------
+# Conversion
+
+def test_to_numpy():
+ arr = PandasArray(np.array([1, 2, 3]))
+ result = arr.to_numpy()
+ assert result is arr._ndarray
+
+ result = arr.to_numpy(copy=True)
+ assert result is not arr._ndarray
+
+ result = arr.to_numpy(dtype='f8')
+ expected = np.array([1, 2, 3], dtype='f8')
+ tm.assert_numpy_array_equal(result, expected)
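+
+    # (illustrative) to_numpy returns the backing ndarray only when no copy
+    # or dtype conversion is requested; asking for 'f8' converts and
+    # therefore always allocates.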
+
+
+# ----------------------------------------------------------------------------
+# Setitem
+
+def test_setitem_series():
+ ser = pd.Series([1, 2, 3])
+ ser.array[0] = 10
+ expected = pd.Series([10, 2, 3])
+ tm.assert_series_equal(ser, expected)
+
+
+def test_setitem(any_numpy_array):
+ nparr = any_numpy_array
+ arr = PandasArray(nparr, copy=True)
+
+ arr[0] = arr[1]
+ nparr[0] = nparr[1]
+
+ tm.assert_numpy_array_equal(arr.to_numpy(), nparr)
+
+
+# ----------------------------------------------------------------------------
+# Reductions
+
+def test_bad_reduce_raises():
+ arr = np.array([1, 2, 3], dtype='int64')
+ arr = PandasArray(arr)
+ msg = "cannot perform not_a_method with type int"
+ with pytest.raises(TypeError, match=msg):
+ arr._reduce(msg)
+
+
+def test_validate_reduction_keyword_args():
+ arr = PandasArray(np.array([1, 2, 3]))
+ msg = "the 'keepdims' parameter is not supported .*all"
+ with pytest.raises(ValueError, match=msg):
+ arr.all(keepdims=True)
+
+
+# ----------------------------------------------------------------------------
+# Ops
+
[email protected]_if_no("numpy", min_version="1.13.0")
+def test_ufunc():
+ arr = PandasArray(np.array([-1.0, 0.0, 1.0]))
+ result = np.abs(arr)
+ expected = PandasArray(np.abs(arr._ndarray))
+ tm.assert_extension_array_equal(result, expected)
+
+ r1, r2 = np.divmod(arr, np.add(arr, 2))
+ e1, e2 = np.divmod(arr._ndarray, np.add(arr._ndarray, 2))
+ e1 = PandasArray(e1)
+ e2 = PandasArray(e2)
+ tm.assert_extension_array_equal(r1, e1)
+ tm.assert_extension_array_equal(r2, e2)
+
+
[email protected]_if_no("numpy", min_version="1.13.0")
+def test_basic_binop():
+ # Just a basic smoke test. The EA interface tests exercise this
+ # more thoroughly.
+ x = PandasArray(np.array([1, 2, 3]))
+ result = x + x
+ expected = PandasArray(np.array([2, 4, 6]))
+ tm.assert_extension_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/test_period.py b/contrib/python/pandas/py2/pandas/tests/arrays/test_period.py
new file mode 100644
index 00000000000..99255d819d2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/test_period.py
@@ -0,0 +1,317 @@
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import iNaT
+from pandas._libs.tslibs.period import IncompatibleFrequency
+
+from pandas.core.dtypes.dtypes import PeriodDtype, registry
+
+import pandas as pd
+from pandas.core.arrays import PeriodArray, period_array
+import pandas.util.testing as tm
+
+# ----------------------------------------------------------------------------
+# Dtype
+
+
+def test_registered():
+ assert PeriodDtype in registry.dtypes
+ result = registry.find("Period[D]")
+ expected = PeriodDtype("D")
+ assert result == expected
+
+# ----------------------------------------------------------------------------
+# period_array
+
+
[email protected]("data, freq, expected", [
+ ([pd.Period("2017", "D")], None, [17167]),
+ ([pd.Period("2017", "D")], "D", [17167]),
+ ([2017], "D", [17167]),
+ (["2017"], "D", [17167]),
+ ([pd.Period("2017", "D")], pd.tseries.offsets.Day(), [17167]),
+ ([pd.Period("2017", "D"), None], None, [17167, iNaT]),
+ (pd.Series(pd.date_range("2017", periods=3)), None,
+ [17167, 17168, 17169]),
+ (pd.date_range("2017", periods=3), None, [17167, 17168, 17169]),
+])
+def test_period_array_ok(data, freq, expected):
+ result = period_array(data, freq=freq).asi8
+ expected = np.asarray(expected, dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_period_array_readonly_object():
+ # https://github.com/pandas-dev/pandas/issues/25403
+ pa = period_array([pd.Period('2019-01-01')])
+ arr = np.asarray(pa, dtype='object')
+ arr.setflags(write=False)
+
+ result = period_array(arr)
+ tm.assert_period_array_equal(result, pa)
+
+ result = pd.Series(arr)
+ tm.assert_series_equal(result, pd.Series(pa))
+
+ result = pd.DataFrame({"A": arr})
+ tm.assert_frame_equal(result, pd.DataFrame({"A": pa}))
+
+
+def test_from_datetime64_freq_changes():
+ # https://github.com/pandas-dev/pandas/issues/23438
+ arr = pd.date_range("2017", periods=3, freq="D")
+ result = PeriodArray._from_datetime64(arr, freq="M")
+ expected = period_array(['2017-01-01', '2017-01-01', '2017-01-01'],
+ freq="M")
+ tm.assert_period_array_equal(result, expected)
+
+
[email protected]("data, freq, msg", [
+ ([pd.Period('2017', 'D'),
+ pd.Period('2017', 'A')],
+ None,
+ "Input has different freq"),
+ ([pd.Period('2017', 'D')],
+ "A",
+ "Input has different freq"),
+])
+def test_period_array_raises(data, freq, msg):
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ period_array(data, freq)
+
+
+def test_period_array_non_period_series_raies():
+ ser = pd.Series([1, 2, 3])
+ with pytest.raises(TypeError, match='dtype'):
+ PeriodArray(ser, freq='D')
+
+
+def test_period_array_freq_mismatch():
+ arr = period_array(['2000', '2001'], freq='D')
+ with pytest.raises(IncompatibleFrequency, match='freq'):
+ PeriodArray(arr, freq='M')
+
+ with pytest.raises(IncompatibleFrequency, match='freq'):
+ PeriodArray(arr, freq=pd.tseries.offsets.MonthEnd())
+
+
+def test_asi8():
+ result = period_array(['2000', '2001', None], freq='D').asi8
+ expected = np.array([10957, 11323, iNaT])
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_take_raises():
+ arr = period_array(['2000', '2001'], freq='D')
+ with pytest.raises(IncompatibleFrequency, match='freq'):
+ arr.take([0, -1], allow_fill=True,
+ fill_value=pd.Period('2000', freq='W'))
+
+ with pytest.raises(ValueError, match='foo'):
+ arr.take([0, -1], allow_fill=True, fill_value='foo')
+
+
[email protected]('dtype', [
+    int, np.int32, np.int64, 'uint32', 'uint64',
+])
+def test_astype(dtype):
+ # We choose to ignore the sign and size of integers for
+ # Period/Datetime/Timedelta astype
+ arr = period_array(['2000', '2001', None], freq='D')
+ result = arr.astype(dtype)
+
+ if np.dtype(dtype).kind == 'u':
+ expected_dtype = np.dtype('uint64')
+ else:
+ expected_dtype = np.dtype('int64')
+ expected = arr.astype(expected_dtype)
+
+ assert result.dtype == expected_dtype
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_astype_copies():
+ arr = period_array(['2000', '2001', None], freq='D')
+ result = arr.astype(np.int64, copy=False)
+    # Check `.base` here, since we now use `.asi8`, which returns a view.
+ # We could maybe override it in PeriodArray to return ._data directly.
+ assert result.base is arr._data
+
+ result = arr.astype(np.int64, copy=True)
+ assert result is not arr._data
+ tm.assert_numpy_array_equal(result, arr._data.view('i8'))
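+
+    # (illustrative) PeriodArray stores its ordinals as int64, so the
+    # copy=False astype can hand back a view of the underlying buffer,
+    # while copy=True must allocate a fresh array.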
+
+
+def test_astype_categorical():
+ arr = period_array(['2000', '2001', '2001', None], freq='D')
+ result = arr.astype('category')
+ categories = pd.PeriodIndex(['2000', '2001'], freq='D')
+ expected = pd.Categorical.from_codes([0, 1, 1, -1], categories=categories)
+ tm.assert_categorical_equal(result, expected)
+
+
+def test_astype_period():
+ arr = period_array(['2000', '2001', None], freq='D')
+ result = arr.astype(PeriodDtype("M"))
+ expected = period_array(['2000', '2001', None], freq='M')
+ tm.assert_period_array_equal(result, expected)
+
+
[email protected]('other', [
+    'datetime64[ns]', 'timedelta64[ns]',
+])
+def test_astype_datetime(other):
+ arr = period_array(['2000', '2001', None], freq='D')
+ # slice off the [ns] so that the regex matches.
+ with pytest.raises(TypeError, match=other[:-4]):
+ arr.astype(other)
+
+
+def test_fillna_raises():
+ arr = period_array(['2000', '2001', '2002'], freq='D')
+ with pytest.raises(ValueError, match='Length'):
+ arr.fillna(arr[:2])
+
+
+def test_fillna_copies():
+ arr = period_array(['2000', '2001', '2002'], freq='D')
+ result = arr.fillna(pd.Period("2000", "D"))
+ assert result is not arr
+
+
+# ----------------------------------------------------------------------------
+# setitem
+
[email protected]('key, value, expected', [
+ ([0], pd.Period("2000", "D"), [10957, 1, 2]),
+ ([0], None, [iNaT, 1, 2]),
+ ([0], np.nan, [iNaT, 1, 2]),
+ ([0, 1, 2], pd.Period("2000", "D"), [10957] * 3),
+ ([0, 1, 2], [pd.Period("2000", "D"),
+ pd.Period("2001", "D"),
+ pd.Period("2002", "D")],
+ [10957, 11323, 11688]),
+])
+def test_setitem(key, value, expected):
+ arr = PeriodArray(np.arange(3), freq="D")
+ expected = PeriodArray(expected, freq="D")
+ arr[key] = value
+ tm.assert_period_array_equal(arr, expected)
+
+
+def test_setitem_raises_incompatible_freq():
+ arr = PeriodArray(np.arange(3), freq="D")
+ with pytest.raises(IncompatibleFrequency, match="freq"):
+ arr[0] = pd.Period("2000", freq="A")
+
+ other = period_array(['2000', '2001'], freq='A')
+ with pytest.raises(IncompatibleFrequency, match="freq"):
+ arr[[0, 1]] = other
+
+
+def test_setitem_raises_length():
+ arr = PeriodArray(np.arange(3), freq="D")
+ with pytest.raises(ValueError, match="length"):
+ arr[[0, 1]] = [pd.Period("2000", freq="D")]
+
+
+def test_setitem_raises_type():
+ arr = PeriodArray(np.arange(3), freq="D")
+ with pytest.raises(TypeError, match="int"):
+ arr[0] = 1
+
+
+# ----------------------------------------------------------------------------
+# Ops
+
+def test_sub_period():
+ arr = period_array(['2000', '2001'], freq='D')
+ other = pd.Period("2000", freq="M")
+ with pytest.raises(IncompatibleFrequency, match="freq"):
+ arr - other
+
+
+# ----------------------------------------------------------------------------
+# Methods
+
[email protected]('other', [
+    pd.Period('2000', freq='H'),
+ period_array(['2000', '2001', '2000'], freq='H')
+])
+def test_where_different_freq_raises(other):
+ ser = pd.Series(period_array(['2000', '2001', '2002'], freq='D'))
+ cond = np.array([True, False, True])
+ with pytest.raises(IncompatibleFrequency, match="freq"):
+ ser.where(cond, other)
+
+
+# ----------------------------------------------------------------------------
+# Printing
+
+def test_repr_small():
+ arr = period_array(['2000', '2001'], freq='D')
+ result = str(arr)
+ expected = (
+ "<PeriodArray>\n"
+ "['2000-01-01', '2001-01-01']\n"
+ "Length: 2, dtype: period[D]"
+ )
+ assert result == expected
+
+
+def test_repr_large():
+ arr = period_array(['2000', '2001'] * 500, freq='D')
+ result = str(arr)
+ expected = (
+ "<PeriodArray>\n"
+ "['2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', "
+ "'2000-01-01',\n"
+ " '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01', "
+ "'2001-01-01',\n"
+ " ...\n"
+ " '2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', "
+ "'2000-01-01',\n"
+ " '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01', "
+ "'2001-01-01']\n"
+ "Length: 1000, dtype: period[D]"
+ )
+ assert result == expected
+
+
+# ----------------------------------------------------------------------------
+# Reductions
+
+class TestReductions(object):
+
+ def test_min_max(self):
+ arr = period_array([
+ '2000-01-03',
+ '2000-01-03',
+ 'NaT',
+ '2000-01-02',
+ '2000-01-05',
+ '2000-01-04',
+ ], freq='D')
+
+ result = arr.min()
+ expected = pd.Period('2000-01-02', freq='D')
+ assert result == expected
+
+ result = arr.max()
+ expected = pd.Period('2000-01-05', freq='D')
+ assert result == expected
+
+ result = arr.min(skipna=False)
+ assert result is pd.NaT
+
+ result = arr.max(skipna=False)
+ assert result is pd.NaT
+
+ @pytest.mark.parametrize('skipna', [True, False])
+ def test_min_max_empty(self, skipna):
+ arr = period_array([], freq='D')
+ result = arr.min(skipna=skipna)
+ assert result is pd.NaT
+
+ result = arr.max(skipna=skipna)
+ assert result is pd.NaT
diff --git a/contrib/python/pandas/py2/pandas/tests/arrays/test_timedeltas.py b/contrib/python/pandas/py2/pandas/tests/arrays/test_timedeltas.py
new file mode 100644
index 00000000000..1fec533a14a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/arrays/test_timedeltas.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.core.arrays import TimedeltaArray
+import pandas.util.testing as tm
+
+
+class TestTimedeltaArrayConstructor(object):
+ def test_only_1dim_accepted(self):
+ # GH#25282
+ arr = np.array([0, 1, 2, 3], dtype='m8[h]').astype('m8[ns]')
+
+ with pytest.raises(ValueError, match="Only 1-dimensional"):
+ # 2-dim
+ TimedeltaArray(arr.reshape(2, 2))
+
+ with pytest.raises(ValueError, match="Only 1-dimensional"):
+ # 0-dim
+ TimedeltaArray(arr[[0]].squeeze())
+
+ def test_freq_validation(self):
+ # ensure that the public constructor cannot create an invalid instance
+ arr = np.array([0, 0, 1], dtype=np.int64) * 3600 * 10**9
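+        # the repeated first element makes the spacing irregular, so no
+        # frequency can be inferred and the passed freq="D" must be rejected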
+
+ msg = ("Inferred frequency None from passed values does not "
+ "conform to passed frequency D")
+ with pytest.raises(ValueError, match=msg):
+ TimedeltaArray(arr.view('timedelta64[ns]'), freq="D")
+
+ def test_non_array_raises(self):
+ with pytest.raises(ValueError, match='list'):
+ TimedeltaArray([1, 2, 3])
+
+ def test_other_type_raises(self):
+ with pytest.raises(ValueError,
+ match="dtype bool cannot be converted"):
+ TimedeltaArray(np.array([1, 2, 3], dtype='bool'))
+
+ def test_incorrect_dtype_raises(self):
+ # TODO: why TypeError for 'category' but ValueError for i8?
+ with pytest.raises(ValueError,
+ match=r'category cannot be converted '
+ r'to timedelta64\[ns\]'):
+ TimedeltaArray(np.array([1, 2, 3], dtype='i8'), dtype='category')
+
+ with pytest.raises(ValueError,
+ match=r"dtype int64 cannot be converted "
+ r"to timedelta64\[ns\]"):
+ TimedeltaArray(np.array([1, 2, 3], dtype='i8'),
+ dtype=np.dtype("int64"))
+
+ def test_copy(self):
+ data = np.array([1, 2, 3], dtype='m8[ns]')
+ arr = TimedeltaArray(data, copy=False)
+ assert arr._data is data
+
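+        # with copy=True the result must own fresh memory, not a view on data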
+ arr = TimedeltaArray(data, copy=True)
+ assert arr._data is not data
+ assert arr._data.base is not data
+
+
+class TestTimedeltaArray(object):
+ def test_np_sum(self):
+ # GH#25282
+ vals = np.arange(5, dtype=np.int64).view('m8[h]').astype('m8[ns]')
+ arr = TimedeltaArray(vals)
+ result = np.sum(arr)
+ assert result == vals.sum()
+
+ result = np.sum(pd.TimedeltaIndex(arr))
+ assert result == vals.sum()
+
+ def test_from_sequence_dtype(self):
+ msg = "dtype .*object.* cannot be converted to timedelta64"
+ with pytest.raises(ValueError, match=msg):
+ TimedeltaArray._from_sequence([], dtype=object)
+
+ def test_abs(self):
+ vals = np.array([-3600 * 10**9, 'NaT', 7200 * 10**9], dtype='m8[ns]')
+ arr = TimedeltaArray(vals)
+
+ evals = np.array([3600 * 10**9, 'NaT', 7200 * 10**9], dtype='m8[ns]')
+ expected = TimedeltaArray(evals)
+
+ result = abs(arr)
+ tm.assert_timedelta_array_equal(result, expected)
+
+ def test_neg(self):
+ vals = np.array([-3600 * 10**9, 'NaT', 7200 * 10**9], dtype='m8[ns]')
+ arr = TimedeltaArray(vals)
+
+ evals = np.array([3600 * 10**9, 'NaT', -7200 * 10**9], dtype='m8[ns]')
+ expected = TimedeltaArray(evals)
+
+ result = -arr
+ tm.assert_timedelta_array_equal(result, expected)
+
+ def test_neg_freq(self):
+ tdi = pd.timedelta_range('2 Days', periods=4, freq='H')
+ arr = TimedeltaArray(tdi, freq=tdi.freq)
+
+ expected = TimedeltaArray(-tdi._data, freq=-tdi.freq)
+
+ result = -arr
+ tm.assert_timedelta_array_equal(result, expected)
+
+ @pytest.mark.parametrize("dtype", [
+ int, np.int32, np.int64, 'uint32', 'uint64',
+ ])
+ def test_astype_int(self, dtype):
+ arr = TimedeltaArray._from_sequence([pd.Timedelta('1H'),
+ pd.Timedelta('2H')])
+ result = arr.astype(dtype)
+
+ if np.dtype(dtype).kind == 'u':
+ expected_dtype = np.dtype('uint64')
+ else:
+ expected_dtype = np.dtype('int64')
+ expected = arr.astype(expected_dtype)
+
+ assert result.dtype == expected_dtype
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_setitem_clears_freq(self):
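+        # in-place assignment may break the regular spacing, so the inferred
+        # freq is discarded rather than left stale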
+ a = TimedeltaArray(pd.timedelta_range('1H', periods=2, freq='H'))
+ a[0] = pd.Timedelta("1H")
+ assert a.freq is None
+
+
+class TestReductions(object):
+
+ def test_min_max(self):
+ arr = TimedeltaArray._from_sequence([
+ '3H', '3H', 'NaT', '2H', '5H', '4H',
+ ])
+
+ result = arr.min()
+ expected = pd.Timedelta('2H')
+ assert result == expected
+
+ result = arr.max()
+ expected = pd.Timedelta('5H')
+ assert result == expected
+
+ result = arr.min(skipna=False)
+ assert result is pd.NaT
+
+ result = arr.max(skipna=False)
+ assert result is pd.NaT
+
+ @pytest.mark.parametrize('skipna', [True, False])
+ def test_min_max_empty(self, skipna):
+ arr = TimedeltaArray._from_sequence([])
+ result = arr.min(skipna=skipna)
+ assert result is pd.NaT
+
+ result = arr.max(skipna=skipna)
+ assert result is pd.NaT
diff --git a/contrib/python/pandas/py2/pandas/tests/computation/__init__.py b/contrib/python/pandas/py2/pandas/tests/computation/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/computation/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/computation/test_compat.py b/contrib/python/pandas/py2/pandas/tests/computation/test_compat.py
new file mode 100644
index 00000000000..7cc373d06cf
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/computation/test_compat.py
@@ -0,0 +1,47 @@
+from distutils.version import LooseVersion
+
+import pytest
+
+import pandas as pd
+from pandas.core.computation.check import _MIN_NUMEXPR_VERSION
+from pandas.core.computation.engines import _engines
+import pandas.core.computation.expr as expr
+
+
+def test_compat():
+    # test we have compat with our version of numexpr
+
+ from pandas.core.computation.check import _NUMEXPR_INSTALLED
+ try:
+ import numexpr as ne
+ ver = ne.__version__
+ if LooseVersion(ver) < LooseVersion(_MIN_NUMEXPR_VERSION):
+ assert not _NUMEXPR_INSTALLED
+ else:
+ assert _NUMEXPR_INSTALLED
+ except ImportError:
+ pytest.skip("not testing numexpr version compat")
+
+
[email protected]('engine', _engines)
[email protected]('parser', expr._parsers)
+def test_invalid_numexpr_version(engine, parser):
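+    # pd.eval should raise ImportError for a too-old numexpr and evaluate
+    # normally otherwise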
+ def testit():
+ a, b = 1, 2 # noqa
+ res = pd.eval('a + b', engine=engine, parser=parser)
+ assert res == 3
+
+ if engine == 'numexpr':
+ try:
+ import numexpr as ne
+ except ImportError:
+ pytest.skip("no numexpr")
+ else:
+ if (LooseVersion(ne.__version__) <
+ LooseVersion(_MIN_NUMEXPR_VERSION)):
+ with pytest.raises(ImportError):
+ testit()
+ else:
+ testit()
+ else:
+ testit()
diff --git a/contrib/python/pandas/py2/pandas/tests/computation/test_eval.py b/contrib/python/pandas/py2/pandas/tests/computation/test_eval.py
new file mode 100644
index 00000000000..c1ba15f428e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/computation/test_eval.py
@@ -0,0 +1,1924 @@
+from distutils.version import LooseVersion
+from itertools import product
+import operator
+import warnings
+
+import numpy as np
+from numpy.random import rand, randint, randn
+import pytest
+
+from pandas.compat import PY3, reduce
+from pandas.errors import PerformanceWarning
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.common import is_bool, is_list_like, is_scalar
+
+import pandas as pd
+from pandas import DataFrame, Panel, Series, date_range
+from pandas.core.computation import pytables
+from pandas.core.computation.check import _NUMEXPR_VERSION
+from pandas.core.computation.engines import NumExprClobberingError, _engines
+import pandas.core.computation.expr as expr
+from pandas.core.computation.expr import PandasExprVisitor, PythonExprVisitor
+from pandas.core.computation.expressions import (
+ _NUMEXPR_INSTALLED, _USE_NUMEXPR)
+from pandas.core.computation.ops import (
+ _arith_ops_syms, _binary_math_ops, _binary_ops_dict, _bool_ops_syms,
+ _special_case_arith_ops_syms, _unary_math_ops)
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_frame_equal, assert_numpy_array_equal, assert_produces_warning,
+ assert_series_equal, makeCustomDataframe as mkdf, randbool)
+
+_series_frame_incompatible = _bool_ops_syms
+_scalar_skip = 'in', 'not in'
+
+
[email protected](params=(
+    pytest.param(engine,
+ marks=pytest.mark.skipif(
+ engine == 'numexpr' and not _USE_NUMEXPR,
+ reason='numexpr enabled->{enabled}, '
+ 'installed->{installed}'.format(
+ enabled=_USE_NUMEXPR,
+ installed=_NUMEXPR_INSTALLED)))
+ for engine in _engines)) # noqa
+def engine(request):
+ return request.param
+
+
[email protected](params=expr._parsers)
+def parser(request):
+ return request.param
+
+
[email protected]
+def ne_lt_2_6_9():
+ if _NUMEXPR_INSTALLED and _NUMEXPR_VERSION >= LooseVersion('2.6.9'):
+ pytest.skip("numexpr is >= 2.6.9")
+ return 'numexpr'
+
+
[email protected]
+def unary_fns_for_ne():
+ if _NUMEXPR_INSTALLED:
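+        # numexpr added floor/ceil support in 2.6.9; exclude them when
+        # testing against older versions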
+ if _NUMEXPR_VERSION >= LooseVersion('2.6.9'):
+ return _unary_math_ops
+ else:
+ return tuple(x for x in _unary_math_ops
+ if x not in ("floor", "ceil"))
+ else:
+ pytest.skip("numexpr is not present")
+
+
+def engine_has_neg_frac(engine):
+ return _engines[engine].has_neg_frac
+
+
+def _eval_single_bin(lhs, cmp1, rhs, engine):
+ c = _binary_ops_dict[cmp1]
+ if engine_has_neg_frac(engine):
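+        # such engines return NaN where plain Python raises ValueError for a
+        # negative base raised to a fractional power, so mirror that here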
+ try:
+ return c(lhs, rhs)
+ except ValueError as e:
+ if str(e).startswith('negative number cannot be '
+ 'raised to a fractional power'):
+ return np.nan
+ raise
+ return c(lhs, rhs)
+
+
+def _series_and_2d_ndarray(lhs, rhs):
+ return ((isinstance(lhs, Series) and
+ isinstance(rhs, np.ndarray) and rhs.ndim > 1) or
+ (isinstance(rhs, Series) and
+ isinstance(lhs, np.ndarray) and lhs.ndim > 1))
+
+
+def _series_and_frame(lhs, rhs):
+ return ((isinstance(lhs, Series) and isinstance(rhs, DataFrame)) or
+ (isinstance(rhs, Series) and isinstance(lhs, DataFrame)))
+
+
+def _bool_and_frame(lhs, rhs):
+ return isinstance(lhs, bool) and isinstance(rhs, pd.core.generic.NDFrame)
+
+
+def _is_py3_complex_incompat(result, expected):
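+    # True when the Python 3 expectation is complex but the engine produced
+    # NaN, a known mismatch exercised by the power tests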
+ return (PY3 and isinstance(expected, (complex, np.complexfloating)) and
+ np.isnan(result))
+
+
+_good_arith_ops = set(_arith_ops_syms).difference(_special_case_arith_ops_syms)
+
+
+class TestEvalNumexprPandas(object):
+
+ @classmethod
+ def setup_class(cls):
+ import numexpr as ne
+ cls.ne = ne
+ cls.engine = 'numexpr'
+ cls.parser = 'pandas'
+
+ @classmethod
+ def teardown_class(cls):
+ del cls.engine, cls.parser
+ if hasattr(cls, 'ne'):
+ del cls.ne
+
+ def setup_data(self):
+ nan_df1 = DataFrame(rand(10, 5))
+ nan_df1[nan_df1 > 0.5] = np.nan
+ nan_df2 = DataFrame(rand(10, 5))
+ nan_df2[nan_df2 > 0.5] = np.nan
+
+ self.pandas_lhses = (DataFrame(randn(10, 5)), Series(randn(5)),
+ Series([1, 2, np.nan, np.nan, 5]), nan_df1)
+ self.pandas_rhses = (DataFrame(randn(10, 5)), Series(randn(5)),
+ Series([1, 2, np.nan, np.nan, 5]), nan_df2)
+ self.scalar_lhses = randn(),
+ self.scalar_rhses = randn(),
+
+ self.lhses = self.pandas_lhses + self.scalar_lhses
+ self.rhses = self.pandas_rhses + self.scalar_rhses
+
+ def setup_ops(self):
+ self.cmp_ops = expr._cmp_ops_syms
+ self.cmp2_ops = self.cmp_ops[::-1]
+ self.bin_ops = expr._bool_ops_syms
+ self.special_case_ops = _special_case_arith_ops_syms
+ self.arith_ops = _good_arith_ops
+ self.unary_ops = '-', '~', 'not '
+
+ def setup_method(self, method):
+ self.setup_ops()
+ self.setup_data()
+ self.current_engines = filter(lambda x: x != self.engine, _engines)
+
+ def teardown_method(self, method):
+ del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses
+ del self.pandas_rhses, self.pandas_lhses, self.current_engines
+
+ @pytest.mark.slow
+ def test_complex_cmp_ops(self):
+ cmp_ops = ('!=', '==', '<=', '>=', '<', '>')
+ cmp2_ops = ('>', '<')
+ for lhs, cmp1, rhs, binop, cmp2 in product(self.lhses, cmp_ops,
+ self.rhses, self.bin_ops,
+ cmp2_ops):
+ self.check_complex_cmp_op(lhs, cmp1, rhs, binop, cmp2)
+
+ def test_simple_cmp_ops(self):
+ bool_lhses = (DataFrame(randbool(size=(10, 5))),
+ Series(randbool((5,))), randbool())
+ bool_rhses = (DataFrame(randbool(size=(10, 5))),
+ Series(randbool((5,))), randbool())
+ for lhs, rhs, cmp_op in product(bool_lhses, bool_rhses, self.cmp_ops):
+ self.check_simple_cmp_op(lhs, cmp_op, rhs)
+
+ @pytest.mark.slow
+ def test_binary_arith_ops(self):
+ for lhs, op, rhs in product(self.lhses, self.arith_ops, self.rhses):
+ self.check_binary_arith_op(lhs, op, rhs)
+
+ def test_modulus(self):
+ for lhs, rhs in product(self.lhses, self.rhses):
+ self.check_modulus(lhs, '%', rhs)
+
+ def test_floor_division(self):
+ for lhs, rhs in product(self.lhses, self.rhses):
+ self.check_floor_division(lhs, '//', rhs)
+
+ @td.skip_if_windows
+ def test_pow(self):
+ # odd failure on win32 platform, so skip
+ for lhs, rhs in product(self.lhses, self.rhses):
+ self.check_pow(lhs, '**', rhs)
+
+ @pytest.mark.slow
+ def test_single_invert_op(self):
+ for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses):
+ self.check_single_invert_op(lhs, op, rhs)
+
+ @pytest.mark.slow
+ def test_compound_invert_op(self):
+ for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses):
+ self.check_compound_invert_op(lhs, op, rhs)
+
+ @pytest.mark.slow
+ def test_chained_cmp_op(self):
+ mids = self.lhses
+ cmp_ops = '<', '>'
+ for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, cmp_ops,
+ mids, cmp_ops, self.rhses):
+ self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs)
+
+ def check_equal(self, result, expected):
+ if isinstance(result, DataFrame):
+ tm.assert_frame_equal(result, expected)
+ elif isinstance(result, Series):
+ tm.assert_series_equal(result, expected)
+ elif isinstance(result, np.ndarray):
+ tm.assert_numpy_array_equal(result, expected)
+ else:
+ assert result == expected
+
+ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2):
+ skip_these = _scalar_skip
+ ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1,
+ binop=binop,
+ cmp2=cmp2)
+ scalar_with_in_notin = (is_scalar(rhs) and (cmp1 in skip_these or
+ cmp2 in skip_these))
+ if scalar_with_in_notin:
+ with pytest.raises(TypeError):
+ pd.eval(ex, engine=self.engine, parser=self.parser)
+ with pytest.raises(TypeError):
+ pd.eval(ex, engine=self.engine, parser=self.parser,
+ local_dict={'lhs': lhs, 'rhs': rhs})
+ else:
+ lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine)
+ rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine)
+ if (isinstance(lhs_new, Series) and
+ isinstance(rhs_new, DataFrame) and
+ binop in _series_frame_incompatible):
+ pass
+ # TODO: the code below should be added back when left and right
+ # hand side bool ops are fixed.
+ #
+ # try:
+ # pytest.raises(Exception, pd.eval, ex,
+ # local_dict={'lhs': lhs, 'rhs': rhs},
+ # engine=self.engine, parser=self.parser)
+ # except AssertionError:
+ # import ipdb
+ #
+ # ipdb.set_trace()
+ # raise
+ else:
+ expected = _eval_single_bin(
+ lhs_new, binop, rhs_new, self.engine)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ self.check_equal(result, expected)
+
+ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs):
+
+ def check_operands(left, right, cmp_op):
+ return _eval_single_bin(left, cmp_op, right, self.engine)
+
+ lhs_new = check_operands(lhs, mid, cmp1)
+ rhs_new = check_operands(mid, rhs, cmp2)
+
+ if lhs_new is not None and rhs_new is not None:
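+            # the chained form, its 'and' expansion, and its '&' expansion
+            # must all evaluate to the same result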
+ ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2)
+ ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2)
+ ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2)
+ expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine)
+
+ for ex in (ex1, ex2, ex3):
+ result = pd.eval(ex, engine=self.engine,
+ parser=self.parser)
+
+ tm.assert_almost_equal(result, expected)
+
+ def check_simple_cmp_op(self, lhs, cmp1, rhs):
+ ex = 'lhs {0} rhs'.format(cmp1)
+ if cmp1 in ('in', 'not in') and not is_list_like(rhs):
+ pytest.raises(TypeError, pd.eval, ex, engine=self.engine,
+ parser=self.parser, local_dict={'lhs': lhs,
+ 'rhs': rhs})
+ else:
+ expected = _eval_single_bin(lhs, cmp1, rhs, self.engine)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ self.check_equal(result, expected)
+
+ def check_binary_arith_op(self, lhs, arith1, rhs):
+ ex = 'lhs {0} rhs'.format(arith1)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ expected = _eval_single_bin(lhs, arith1, rhs, self.engine)
+
+ tm.assert_almost_equal(result, expected)
+ ex = 'lhs {0} rhs {0} rhs'.format(arith1)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ nlhs = _eval_single_bin(lhs, arith1, rhs,
+ self.engine)
+ self.check_alignment(result, nlhs, rhs, arith1)
+
+ def check_alignment(self, result, nlhs, ghs, op):
+ try:
+ nlhs, ghs = nlhs.align(ghs)
+ except (ValueError, TypeError, AttributeError):
+ # ValueError: series frame or frame series align
+ # TypeError, AttributeError: series or frame with scalar align
+ pass
+ else:
+
+ # direct numpy comparison
+ expected = self.ne.evaluate('nlhs {0} ghs'.format(op))
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ # modulus, pow, and floor division require special casing
+
+ def check_modulus(self, lhs, arith1, rhs):
+ ex = 'lhs {0} rhs'.format(arith1)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ expected = lhs % rhs
+
+ tm.assert_almost_equal(result, expected)
+ expected = self.ne.evaluate('expected {0} rhs'.format(arith1))
+ if isinstance(result, (DataFrame, Series)):
+ tm.assert_almost_equal(result.values, expected)
+ else:
+ tm.assert_almost_equal(result, expected.item())
+
+ def check_floor_division(self, lhs, arith1, rhs):
+ ex = 'lhs {0} rhs'.format(arith1)
+
+ if self.engine == 'python':
+ res = pd.eval(ex, engine=self.engine, parser=self.parser)
+ expected = lhs // rhs
+ self.check_equal(res, expected)
+ else:
+ pytest.raises(TypeError, pd.eval, ex,
+ local_dict={'lhs': lhs, 'rhs': rhs},
+ engine=self.engine, parser=self.parser)
+
+ def get_expected_pow_result(self, lhs, rhs):
+ try:
+ expected = _eval_single_bin(lhs, '**', rhs, self.engine)
+ except ValueError as e:
+ if str(e).startswith('negative number cannot be '
+ 'raised to a fractional power'):
+ if self.engine == 'python':
+ pytest.skip(str(e))
+ else:
+ expected = np.nan
+ else:
+ raise
+ return expected
+
+ def check_pow(self, lhs, arith1, rhs):
+ ex = 'lhs {0} rhs'.format(arith1)
+ expected = self.get_expected_pow_result(lhs, rhs)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+
+ if (is_scalar(lhs) and is_scalar(rhs) and
+ _is_py3_complex_incompat(result, expected)):
+ pytest.raises(AssertionError, tm.assert_numpy_array_equal,
+ result, expected)
+ else:
+ tm.assert_almost_equal(result, expected)
+
+ ex = '(lhs {0} rhs) {0} rhs'.format(arith1)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ expected = self.get_expected_pow_result(
+ self.get_expected_pow_result(lhs, rhs), rhs)
+ tm.assert_almost_equal(result, expected)
+
+ def check_single_invert_op(self, lhs, cmp1, rhs):
+ # simple
+ for el in (lhs, rhs):
+ try:
+ elb = el.astype(bool)
+ except AttributeError:
+ elb = np.array([bool(el)])
+ expected = ~elb
+ result = pd.eval('~elb', engine=self.engine, parser=self.parser)
+ tm.assert_almost_equal(expected, result)
+
+ for engine in self.current_engines:
+ tm.assert_almost_equal(result, pd.eval('~elb', engine=engine,
+ parser=self.parser))
+
+ def check_compound_invert_op(self, lhs, cmp1, rhs):
+ skip_these = 'in', 'not in'
+ ex = '~(lhs {0} rhs)'.format(cmp1)
+
+ if is_scalar(rhs) and cmp1 in skip_these:
+ pytest.raises(TypeError, pd.eval, ex, engine=self.engine,
+ parser=self.parser, local_dict={'lhs': lhs,
+ 'rhs': rhs})
+ else:
+ # compound
+ if is_scalar(lhs) and is_scalar(rhs):
+ lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs))
+ expected = _eval_single_bin(lhs, cmp1, rhs, self.engine)
+ if is_scalar(expected):
+ expected = not expected
+ else:
+ expected = ~expected
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+ tm.assert_almost_equal(expected, result)
+
+ # make sure the other engines work the same as this one
+ for engine in self.current_engines:
+ ev = pd.eval(ex, engine=self.engine, parser=self.parser)
+ tm.assert_almost_equal(ev, result)
+
+ def ex(self, op, var_name='lhs'):
+ return '{0}{1}'.format(op, var_name)
+
+ def test_frame_invert(self):
+ expr = self.ex('~')
+
+ # ~ ##
+ # frame
+ # float always raises
+ lhs = DataFrame(randn(5, 2))
+ if self.engine == 'numexpr':
+ with pytest.raises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ with pytest.raises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+
+ # int raises on numexpr
+ lhs = DataFrame(randint(5, size=(5, 2)))
+ if self.engine == 'numexpr':
+ with pytest.raises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = ~lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ # bool always works
+ lhs = DataFrame(rand(5, 2) > 0.5)
+ expect = ~lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ # object raises
+ lhs = DataFrame({'b': ['a', 1, 2.0], 'c': rand(3) > 0.5})
+ if self.engine == 'numexpr':
+ with pytest.raises(ValueError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ with pytest.raises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+
+ def test_series_invert(self):
+ # ~ ####
+ expr = self.ex('~')
+
+ # series
+ # float raises
+ lhs = Series(randn(5))
+ if self.engine == 'numexpr':
+ with pytest.raises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ with pytest.raises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+
+ # int raises on numexpr
+ lhs = Series(randint(5, size=5))
+ if self.engine == 'numexpr':
+ with pytest.raises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = ~lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ # bool
+ lhs = Series(rand(5) > 0.5)
+ expect = ~lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ # float
+ # int
+ # bool
+
+ # object
+ lhs = Series(['a', 1, 2.0])
+ if self.engine == 'numexpr':
+ with pytest.raises(ValueError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ with pytest.raises(TypeError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+
+ def test_frame_negate(self):
+ expr = self.ex('-')
+
+ # float
+ lhs = DataFrame(randn(5, 2))
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ # int
+ lhs = DataFrame(randint(5, size=(5, 2)))
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ # bool doesn't work with numexpr but works elsewhere
+ lhs = DataFrame(rand(5, 2) > 0.5)
+ if self.engine == 'numexpr':
+ with pytest.raises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ def test_series_negate(self):
+ expr = self.ex('-')
+
+ # float
+ lhs = Series(randn(5))
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ # int
+ lhs = Series(randint(5, size=5))
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ # bool doesn't work with numexpr but works elsewhere
+ lhs = Series(rand(5) > 0.5)
+ if self.engine == 'numexpr':
+ with pytest.raises(NotImplementedError):
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ else:
+ expect = -lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ def test_frame_pos(self):
+ expr = self.ex('+')
+
+ # float
+ lhs = DataFrame(randn(5, 2))
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ # int
+ lhs = DataFrame(randint(5, size=(5, 2)))
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ # bool doesn't work with numexpr but works elsewhere
+ lhs = DataFrame(rand(5, 2) > 0.5)
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_frame_equal(expect, result)
+
+ def test_series_pos(self):
+ expr = self.ex('+')
+
+ # float
+ lhs = Series(randn(5))
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ # int
+ lhs = Series(randint(5, size=5))
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ # bool doesn't work with numexpr but works elsewhere
+ lhs = Series(rand(5) > 0.5)
+ expect = lhs
+ result = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert_series_equal(expect, result)
+
+ def test_scalar_unary(self):
+ with pytest.raises(TypeError):
+ pd.eval('~1.0', engine=self.engine, parser=self.parser)
+
+ assert pd.eval('-1.0', parser=self.parser,
+ engine=self.engine) == -1.0
+ assert pd.eval('+1.0', parser=self.parser,
+ engine=self.engine) == +1.0
+ assert pd.eval('~1', parser=self.parser,
+ engine=self.engine) == ~1
+ assert pd.eval('-1', parser=self.parser,
+ engine=self.engine) == -1
+ assert pd.eval('+1', parser=self.parser,
+ engine=self.engine) == +1
+ assert pd.eval('~True', parser=self.parser,
+ engine=self.engine) == ~True
+ assert pd.eval('~False', parser=self.parser,
+ engine=self.engine) == ~False
+ assert pd.eval('-True', parser=self.parser,
+ engine=self.engine) == -True
+ assert pd.eval('-False', parser=self.parser,
+ engine=self.engine) == -False
+ assert pd.eval('+True', parser=self.parser,
+ engine=self.engine) == +True
+ assert pd.eval('+False', parser=self.parser,
+ engine=self.engine) == +False
+
+ def test_unary_in_array(self):
+ # GH 11235
+ assert_numpy_array_equal(
+ pd.eval('[-True, True, ~True, +True,'
+ '-False, False, ~False, +False,'
+ '-37, 37, ~37, +37]'),
+ np.array([-True, True, ~True, +True,
+ -False, False, ~False, +False,
+ -37, 37, ~37, +37], dtype=np.object_))
+
+ def test_disallow_scalar_bool_ops(self):
+ exprs = '1 or 2', '1 and 2'
+ exprs += 'a and b', 'a or b'
+ exprs += '1 or 2 and (3 + 2) > 3',
+ exprs += '2 * x > 2 or 1 and 2',
+ exprs += '2 * df > 3 and 1 or a',
+
+ x, a, b, df = np.random.randn(3), 1, 2, DataFrame(randn(3, 2)) # noqa
+ for ex in exprs:
+ with pytest.raises(NotImplementedError):
+ pd.eval(ex, engine=self.engine, parser=self.parser)
+
+ def test_identical(self):
+ # see gh-10546
+ x = 1
+ result = pd.eval('x', engine=self.engine, parser=self.parser)
+ assert result == 1
+ assert is_scalar(result)
+
+ x = 1.5
+ result = pd.eval('x', engine=self.engine, parser=self.parser)
+ assert result == 1.5
+ assert is_scalar(result)
+
+ x = False
+ result = pd.eval('x', engine=self.engine, parser=self.parser)
+ assert not result
+ assert is_bool(result)
+ assert is_scalar(result)
+
+ x = np.array([1])
+ result = pd.eval('x', engine=self.engine, parser=self.parser)
+ tm.assert_numpy_array_equal(result, np.array([1]))
+ assert result.shape == (1, )
+
+ x = np.array([1.5])
+ result = pd.eval('x', engine=self.engine, parser=self.parser)
+ tm.assert_numpy_array_equal(result, np.array([1.5]))
+ assert result.shape == (1, )
+
+ x = np.array([False]) # noqa
+ result = pd.eval('x', engine=self.engine, parser=self.parser)
+ tm.assert_numpy_array_equal(result, np.array([False]))
+ assert result.shape == (1, )
+
+ def test_line_continuation(self):
+ # GH 11149
+ exp = """1 + 2 * \
+ 5 - 1 + 2 """
+ result = pd.eval(exp, engine=self.engine, parser=self.parser)
+ assert result == 12
+
+ def test_float_truncation(self):
+ # GH 14241
+ exp = '1000000000.006'
+ result = pd.eval(exp, engine=self.engine, parser=self.parser)
+ expected = np.float64(exp)
+ assert result == expected
+
+ df = pd.DataFrame({'A': [1000000000.0009,
+ 1000000000.0011,
+ 1000000000.0015]})
+ cutoff = 1000000000.0006
+ result = df.query("A < %.4f" % cutoff)
+ assert result.empty
+
+ cutoff = 1000000000.0010
+ result = df.query("A > %.4f" % cutoff)
+ expected = df.loc[[1, 2], :]
+ tm.assert_frame_equal(expected, result)
+
+ exact = 1000000000.0011
+ result = df.query('A == %.4f' % exact)
+ expected = df.loc[[1], :]
+ tm.assert_frame_equal(expected, result)
+
+ def test_disallow_python_keywords(self):
+ # GH 18221
+ df = pd.DataFrame([[0, 0, 0]], columns=['foo', 'bar', 'class'])
+ msg = "Python keyword not valid identifier in numexpr query"
+ with pytest.raises(SyntaxError, match=msg):
+ df.query('class == 0')
+
+ df = pd.DataFrame()
+ df.index.name = 'lambda'
+ with pytest.raises(SyntaxError, match=msg):
+ df.query('lambda == 0')
+
+
+class TestEvalNumexprPython(TestEvalNumexprPandas):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestEvalNumexprPython, cls).setup_class()
+ import numexpr as ne
+ cls.ne = ne
+ cls.engine = 'numexpr'
+ cls.parser = 'python'
+
+ def setup_ops(self):
+ self.cmp_ops = list(filter(lambda x: x not in ('in', 'not in'),
+ expr._cmp_ops_syms))
+ self.cmp2_ops = self.cmp_ops[::-1]
+ self.bin_ops = [s for s in expr._bool_ops_syms
+ if s not in ('and', 'or')]
+ self.special_case_ops = _special_case_arith_ops_syms
+ self.arith_ops = _good_arith_ops
+ self.unary_ops = '+', '-', '~'
+
+ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs):
+ ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2)
+ with pytest.raises(NotImplementedError):
+ pd.eval(ex1, engine=self.engine, parser=self.parser)
+
+
+class TestEvalPythonPython(TestEvalNumexprPython):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestEvalPythonPython, cls).setup_class()
+ cls.engine = 'python'
+ cls.parser = 'python'
+
+ def check_modulus(self, lhs, arith1, rhs):
+ ex = 'lhs {0} rhs'.format(arith1)
+ result = pd.eval(ex, engine=self.engine, parser=self.parser)
+
+ expected = lhs % rhs
+ tm.assert_almost_equal(result, expected)
+
+ expected = _eval_single_bin(expected, arith1, rhs, self.engine)
+ tm.assert_almost_equal(result, expected)
+
+ def check_alignment(self, result, nlhs, ghs, op):
+ try:
+ nlhs, ghs = nlhs.align(ghs)
+ except (ValueError, TypeError, AttributeError):
+ # ValueError: series frame or frame series align
+ # TypeError, AttributeError: series or frame with scalar align
+ pass
+ else:
+ expected = eval('nlhs {0} ghs'.format(op))
+ tm.assert_almost_equal(result, expected)
+
+
+class TestEvalPythonPandas(TestEvalPythonPython):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestEvalPythonPandas, cls).setup_class()
+ cls.engine = 'python'
+ cls.parser = 'pandas'
+
+ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs):
+ TestEvalNumexprPandas.check_chained_cmp_op(self, lhs, cmp1, mid, cmp2,
+ rhs)
+
+
+f = lambda *args, **kwargs: np.random.randn()
+
+
+# -------------------------------------
+# gh-12388: Typecasting rules consistency with python
+
+
+class TestTypeCasting(object):
+ @pytest.mark.parametrize('op', ['+', '-', '*', '**', '/'])
+ # maybe someday... numexpr has too many upcasting rules now
+ # chain(*(np.sctypes[x] for x in ['uint', 'int', 'float']))
+ @pytest.mark.parametrize('dt', [np.float32, np.float64])
+ def test_binop_typecasting(self, engine, parser, op, dt):
+ df = mkdf(5, 3, data_gen_f=f, dtype=dt)
+ s = 'df {} 3'.format(op)
+ res = pd.eval(s, engine=engine, parser=parser)
+ assert df.values.dtype == dt
+ assert res.values.dtype == dt
+ assert_frame_equal(res, eval(s))
+
+ s = '3 {} df'.format(op)
+ res = pd.eval(s, engine=engine, parser=parser)
+ assert df.values.dtype == dt
+ assert res.values.dtype == dt
+ assert_frame_equal(res, eval(s))
+
+
+# -------------------------------------
+# Basic and complex alignment
+
+def _is_datetime(x):
+ return issubclass(x.dtype.type, np.datetime64)
+
+
+def should_warn(*args):
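+    # a RuntimeWarning is expected only when no operand index is monotonic
+    # and exactly one of the operands is datetime-like (hence the xor)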
+ not_mono = not any(map(operator.attrgetter('is_monotonic'), args))
+ only_one_dt = reduce(operator.xor, map(_is_datetime, args))
+ return not_mono and only_one_dt
+
+
+class TestAlignment(object):
+
+ index_types = 'i', 'u', 'dt'
+ lhs_index_types = index_types + ('s',) # 'p'
+
+ def test_align_nested_unary_op(self, engine, parser):
+ s = 'df * ~2'
+ df = mkdf(5, 3, data_gen_f=f)
+ res = pd.eval(s, engine=engine, parser=parser)
+ assert_frame_equal(res, df * ~2)
+
+ def test_basic_frame_alignment(self, engine, parser):
+ args = product(self.lhs_index_types, self.index_types,
+ self.index_types)
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter('always', RuntimeWarning)
+ for lr_idx_type, rr_idx_type, c_idx_type in args:
+ df = mkdf(10, 10, data_gen_f=f, r_idx_type=lr_idx_type,
+ c_idx_type=c_idx_type)
+ df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=rr_idx_type,
+ c_idx_type=c_idx_type)
+ # only warns if not monotonic and not sortable
+ if should_warn(df.index, df2.index):
+ with tm.assert_produces_warning(RuntimeWarning):
+ res = pd.eval('df + df2', engine=engine, parser=parser)
+ else:
+ res = pd.eval('df + df2', engine=engine, parser=parser)
+ assert_frame_equal(res, df + df2)
+
+ def test_frame_comparison(self, engine, parser):
+ args = product(self.lhs_index_types, repeat=2)
+ for r_idx_type, c_idx_type in args:
+ df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type,
+ c_idx_type=c_idx_type)
+ res = pd.eval('df < 2', engine=engine, parser=parser)
+ assert_frame_equal(res, df < 2)
+
+ df3 = DataFrame(randn(*df.shape), index=df.index,
+ columns=df.columns)
+ res = pd.eval('df < df3', engine=engine, parser=parser)
+ assert_frame_equal(res, df < df3)
+
+ @pytest.mark.slow
+ def test_medium_complex_frame_alignment(self, engine, parser):
+ args = product(self.lhs_index_types, self.index_types,
+ self.index_types, self.index_types)
+
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter('always', RuntimeWarning)
+
+ for r1, c1, r2, c2 in args:
+ df = mkdf(3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1)
+ df2 = mkdf(4, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
+ df3 = mkdf(5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
+ if should_warn(df.index, df2.index, df3.index):
+ with tm.assert_produces_warning(RuntimeWarning):
+ res = pd.eval('df + df2 + df3', engine=engine,
+ parser=parser)
+ else:
+ res = pd.eval('df + df2 + df3',
+ engine=engine, parser=parser)
+ assert_frame_equal(res, df + df2 + df3)
+
+ def test_basic_frame_series_alignment(self, engine, parser):
+ def testit(r_idx_type, c_idx_type, index_name):
+ df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type,
+ c_idx_type=c_idx_type)
+ index = getattr(df, index_name)
+ s = Series(np.random.randn(5), index[:5])
+
+ if should_warn(df.index, s.index):
+ with tm.assert_produces_warning(RuntimeWarning):
+ res = pd.eval('df + s', engine=engine, parser=parser)
+ else:
+ res = pd.eval('df + s', engine=engine, parser=parser)
+
+ if r_idx_type == 'dt' or c_idx_type == 'dt':
+ expected = df.add(s) if engine == 'numexpr' else df + s
+ else:
+ expected = df + s
+ assert_frame_equal(res, expected)
+
+ args = product(self.lhs_index_types, self.index_types,
+ ('index', 'columns'))
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter('always', RuntimeWarning)
+ for r_idx_type, c_idx_type, index_name in args:
+ testit(r_idx_type, c_idx_type, index_name)
+
+ def test_basic_series_frame_alignment(self, engine, parser):
+ def testit(r_idx_type, c_idx_type, index_name):
+ df = mkdf(10, 7, data_gen_f=f, r_idx_type=r_idx_type,
+ c_idx_type=c_idx_type)
+ index = getattr(df, index_name)
+ s = Series(np.random.randn(5), index[:5])
+ if should_warn(s.index, df.index):
+ with tm.assert_produces_warning(RuntimeWarning):
+ res = pd.eval('s + df', engine=engine, parser=parser)
+ else:
+ res = pd.eval('s + df', engine=engine, parser=parser)
+
+ if r_idx_type == 'dt' or c_idx_type == 'dt':
+ expected = df.add(s) if engine == 'numexpr' else s + df
+ else:
+ expected = s + df
+ assert_frame_equal(res, expected)
+
+ # only test dt with dt, otherwise weird joins result
+ args = product(['i', 'u', 's'], ['i', 'u', 's'], ('index', 'columns'))
+ with warnings.catch_warnings(record=True):
+ # avoid warning about comparing strings and ints
+ warnings.simplefilter("ignore", RuntimeWarning)
+
+ for r_idx_type, c_idx_type, index_name in args:
+ testit(r_idx_type, c_idx_type, index_name)
+
+ # dt with dt
+ args = product(['dt'], ['dt'], ('index', 'columns'))
+ with warnings.catch_warnings(record=True):
+ # avoid warning about comparing strings and ints
+ warnings.simplefilter("ignore", RuntimeWarning)
+
+ for r_idx_type, c_idx_type, index_name in args:
+ testit(r_idx_type, c_idx_type, index_name)
+
+ def test_series_frame_commutativity(self, engine, parser):
+ args = product(self.lhs_index_types, self.index_types, ('+', '*'),
+ ('index', 'columns'))
+
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter('always', RuntimeWarning)
+ for r_idx_type, c_idx_type, op, index_name in args:
+ df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type,
+ c_idx_type=c_idx_type)
+ index = getattr(df, index_name)
+ s = Series(np.random.randn(5), index[:5])
+
+ lhs = 's {0} df'.format(op)
+ rhs = 'df {0} s'.format(op)
+ if should_warn(df.index, s.index):
+ with tm.assert_produces_warning(RuntimeWarning):
+ a = pd.eval(lhs, engine=engine, parser=parser)
+ with tm.assert_produces_warning(RuntimeWarning):
+ b = pd.eval(rhs, engine=engine, parser=parser)
+ else:
+ a = pd.eval(lhs, engine=engine, parser=parser)
+ b = pd.eval(rhs, engine=engine, parser=parser)
+
+ if r_idx_type != 'dt' and c_idx_type != 'dt':
+ if engine == 'numexpr':
+ assert_frame_equal(a, b)
+
+ @pytest.mark.slow
+ def test_complex_series_frame_alignment(self, engine, parser):
+ import random
+ args = product(self.lhs_index_types, self.index_types,
+ self.index_types, self.index_types)
+ n = 3
+ m1 = 5
+ m2 = 2 * m1
+
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter('always', RuntimeWarning)
+ for r1, r2, c1, c2 in args:
+ index_name = random.choice(['index', 'columns'])
+ obj_name = random.choice(['df', 'df2'])
+
+ df = mkdf(m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1)
+ df2 = mkdf(m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2)
+ index = getattr(locals().get(obj_name), index_name)
+ s = Series(np.random.randn(n), index[:n])
+
+ if r2 == 'dt' or c2 == 'dt':
+ if engine == 'numexpr':
+ expected2 = df2.add(s)
+ else:
+ expected2 = df2 + s
+ else:
+ expected2 = df2 + s
+
+ if r1 == 'dt' or c1 == 'dt':
+ if engine == 'numexpr':
+ expected = expected2.add(df)
+ else:
+ expected = expected2 + df
+ else:
+ expected = expected2 + df
+
+ if should_warn(df2.index, s.index, df.index):
+ with tm.assert_produces_warning(RuntimeWarning):
+ res = pd.eval('df2 + s + df', engine=engine,
+ parser=parser)
+ else:
+ res = pd.eval('df2 + s + df', engine=engine, parser=parser)
+ assert res.shape == expected.shape
+ assert_frame_equal(res, expected)
+
+ def test_performance_warning_for_poor_alignment(self, engine, parser):
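+        # only numexpr warns when a large operand must be realigned against a
+        # much smaller axis; the python engine stays silent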
+ df = DataFrame(randn(1000, 10))
+ s = Series(randn(10000))
+ if engine == 'numexpr':
+ seen = PerformanceWarning
+ else:
+ seen = False
+
+ with assert_produces_warning(seen):
+ pd.eval('df + s', engine=engine, parser=parser)
+
+ s = Series(randn(1000))
+ with assert_produces_warning(False):
+ pd.eval('df + s', engine=engine, parser=parser)
+
+ df = DataFrame(randn(10, 10000))
+ s = Series(randn(10000))
+ with assert_produces_warning(False):
+ pd.eval('df + s', engine=engine, parser=parser)
+
+ df = DataFrame(randn(10, 10))
+ s = Series(randn(10000))
+
+ is_python_engine = engine == 'python'
+
+ if not is_python_engine:
+ wrn = PerformanceWarning
+ else:
+ wrn = False
+
+ with assert_produces_warning(wrn) as w:
+ pd.eval('df + s', engine=engine, parser=parser)
+
+ if not is_python_engine:
+ assert len(w) == 1
+ msg = str(w[0].message)
+ expected = ("Alignment difference on axis {0} is larger"
+ " than an order of magnitude on term {1!r}, "
+ "by more than {2:.4g}; performance may suffer"
+ "".format(1, 'df', np.log10(s.size - df.shape[1])))
+ assert msg == expected
+
+
+# ------------------------------------
+# Slightly more complex ops
+
+class TestOperationsNumExprPandas(object):
+
+ @classmethod
+ def setup_class(cls):
+ cls.engine = 'numexpr'
+ cls.parser = 'pandas'
+ cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
+
+ @classmethod
+ def teardown_class(cls):
+ del cls.engine, cls.parser
+
+ def eval(self, *args, **kwargs):
+ kwargs['engine'] = self.engine
+ kwargs['parser'] = self.parser
+ kwargs['level'] = kwargs.pop('level', 0) + 1
+ return pd.eval(*args, **kwargs)
+
+ def test_simple_arith_ops(self):
+ ops = self.arith_ops
+
+ for op in filter(lambda x: x != '//', ops):
+ ex = '1 {0} 1'.format(op)
+ ex2 = 'x {0} 1'.format(op)
+ ex3 = '1 {0} (x + 1)'.format(op)
+
+ if op in ('in', 'not in'):
+ pytest.raises(TypeError, pd.eval, ex,
+ engine=self.engine, parser=self.parser)
+ else:
+ expec = _eval_single_bin(1, op, 1, self.engine)
+ x = self.eval(ex, engine=self.engine, parser=self.parser)
+ assert x == expec
+
+ expec = _eval_single_bin(x, op, 1, self.engine)
+ y = self.eval(ex2, local_dict={'x': x}, engine=self.engine,
+ parser=self.parser)
+ assert y == expec
+
+ expec = _eval_single_bin(1, op, x + 1, self.engine)
+ y = self.eval(ex3, local_dict={'x': x},
+ engine=self.engine, parser=self.parser)
+ assert y == expec
+
+ def test_simple_bool_ops(self):
+ for op, lhs, rhs in product(expr._bool_ops_syms, (True, False),
+ (True, False)):
+ ex = '{0} {1} {2}'.format(lhs, op, rhs)
+ res = self.eval(ex)
+ exp = eval(ex)
+ assert res == exp
+
+ def test_bool_ops_with_constants(self):
+ for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'),
+ ('True', 'False')):
+ ex = '{0} {1} {2}'.format(lhs, op, rhs)
+ res = self.eval(ex)
+ exp = eval(ex)
+ assert res == exp
+
+ @pytest.mark.filterwarnings("ignore::FutureWarning")
+ def test_panel_fails(self):
+ x = Panel(randn(3, 4, 5))
+ y = Series(randn(10))
+ with pytest.raises(NotImplementedError):
+ self.eval('x + y',
+ local_dict={'x': x, 'y': y})
+
+ def test_4d_ndarray_fails(self):
+ x = randn(3, 4, 5, 6)
+ y = Series(randn(10))
+ with pytest.raises(NotImplementedError):
+ self.eval('x + y',
+ local_dict={'x': x, 'y': y})
+
+ def test_constant(self):
+ x = self.eval('1')
+ assert x == 1
+
+ def test_single_variable(self):
+ df = DataFrame(randn(10, 2))
+ df2 = self.eval('df', local_dict={'df': df})
+ assert_frame_equal(df, df2)
+
+ def test_truediv(self):
+ s = np.array([1])
+ ex = 's / 1'
+ d = {'s': s} # noqa
+
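+        # on Python 3, / is always true division so the truediv flag has no
+        # effect; on Python 2 it controls whether ints divide to floats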
+ if PY3:
+ res = self.eval(ex, truediv=False)
+ tm.assert_numpy_array_equal(res, np.array([1.0]))
+
+ res = self.eval(ex, truediv=True)
+ tm.assert_numpy_array_equal(res, np.array([1.0]))
+
+ res = self.eval('1 / 2', truediv=True)
+ expec = 0.5
+ assert res == expec
+
+ res = self.eval('1 / 2', truediv=False)
+ expec = 0.5
+ assert res == expec
+
+ res = self.eval('s / 2', truediv=False)
+ expec = 0.5
+ assert res == expec
+
+ res = self.eval('s / 2', truediv=True)
+ expec = 0.5
+ assert res == expec
+ else:
+ res = self.eval(ex, truediv=False)
+ tm.assert_numpy_array_equal(res, np.array([1]))
+
+ res = self.eval(ex, truediv=True)
+ tm.assert_numpy_array_equal(res, np.array([1.0]))
+
+ res = self.eval('1 / 2', truediv=True)
+ expec = 0.5
+ assert res == expec
+
+ res = self.eval('1 / 2', truediv=False)
+ expec = 0
+ assert res == expec
+
+ res = self.eval('s / 2', truediv=False)
+ expec = 0
+ assert res == expec
+
+ res = self.eval('s / 2', truediv=True)
+ expec = 0.5
+ assert res == expec
+
+ def test_failing_subscript_with_name_error(self):
+ df = DataFrame(np.random.randn(5, 3)) # noqa
+ with pytest.raises(NameError):
+ self.eval('df[x > 2] > 2')
+
+ def test_lhs_expression_subscript(self):
+ df = DataFrame(np.random.randn(5, 3))
+ result = self.eval('(df + 1)[df > 2]', local_dict={'df': df})
+ expected = (df + 1)[df > 2]
+ assert_frame_equal(result, expected)
+
+ def test_attr_expression(self):
+ df = DataFrame(np.random.randn(5, 3), columns=list('abc'))
+ expr1 = 'df.a < df.b'
+ expec1 = df.a < df.b
+ expr2 = 'df.a + df.b + df.c'
+ expec2 = df.a + df.b + df.c
+ expr3 = 'df.a + df.b + df.c[df.b < 0]'
+ expec3 = df.a + df.b + df.c[df.b < 0]
+ exprs = expr1, expr2, expr3
+ expecs = expec1, expec2, expec3
+ for e, expec in zip(exprs, expecs):
+ assert_series_equal(expec, self.eval(e, local_dict={'df': df}))
+
+ def test_assignment_fails(self):
+ df = DataFrame(np.random.randn(5, 3), columns=list('abc'))
+ df2 = DataFrame(np.random.randn(5, 3))
+ expr1 = 'df = df2'
+ pytest.raises(ValueError, self.eval, expr1,
+ local_dict={'df': df, 'df2': df2})
+
+ def test_assignment_column(self):
+ df = DataFrame(np.random.randn(5, 2), columns=list('ab'))
+ orig_df = df.copy()
+
+ # multiple assignees
+ pytest.raises(SyntaxError, df.eval, 'd c = a + b')
+
+ # invalid assignees
+ pytest.raises(SyntaxError, df.eval, 'd,c = a + b')
+ pytest.raises(SyntaxError, df.eval, 'Timestamp("20131001") = a + b')
+
+ # single assignment - existing variable
+ expected = orig_df.copy()
+ expected['a'] = expected['a'] + expected['b']
+ df = orig_df.copy()
+ df.eval('a = a + b', inplace=True)
+ assert_frame_equal(df, expected)
+
+ # single assignment - new variable
+ expected = orig_df.copy()
+ expected['c'] = expected['a'] + expected['b']
+ df = orig_df.copy()
+ df.eval('c = a + b', inplace=True)
+ assert_frame_equal(df, expected)
+
+ # with a local name overlap
+ def f():
+ df = orig_df.copy()
+ a = 1 # noqa
+ df.eval('a = 1 + b', inplace=True)
+ return df
+
+ df = f()
+ expected = orig_df.copy()
+ expected['a'] = 1 + expected['b']
+ assert_frame_equal(df, expected)
+
+ df = orig_df.copy()
+
+ def f():
+ a = 1 # noqa
+ old_a = df.a.copy()
+ df.eval('a = a + b', inplace=True)
+ result = old_a + df.b
+ assert_series_equal(result, df.a, check_names=False)
+ assert result.name is None
+
+ f()
+
+ # multiple assignment
+ df = orig_df.copy()
+ df.eval('c = a + b', inplace=True)
+ pytest.raises(SyntaxError, df.eval, 'c = a = b')
+
+ # explicit targets
+ df = orig_df.copy()
+ self.eval('c = df.a + df.b', local_dict={'df': df},
+ target=df, inplace=True)
+ expected = orig_df.copy()
+ expected['c'] = expected['a'] + expected['b']
+ assert_frame_equal(df, expected)
+
+ def test_column_in(self):
+ # GH 11235
+ df = DataFrame({'a': [11], 'b': [-32]})
+ result = df.eval('a in [11, -32]')
+ expected = Series([True])
+ assert_series_equal(result, expected)
+
+    def test_assignment_not_inplace(self):
+        # see gh-9297
+        df = DataFrame(np.random.randn(5, 2), columns=list('ab'))
+
+        actual = df.eval('c = a + b', inplace=False)
+        assert actual is not None
+
+        expected = df.copy()
+        expected['c'] = expected['a'] + expected['b']
+        tm.assert_frame_equal(actual, expected)
+
+ def test_multi_line_expression(self):
+ # GH 11149
+ df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
+ expected = df.copy()
+
+ expected['c'] = expected['a'] + expected['b']
+ expected['d'] = expected['c'] + expected['b']
+ ans = df.eval("""
+ c = a + b
+ d = c + b""", inplace=True)
+ assert_frame_equal(expected, df)
+ assert ans is None
+
+ expected['a'] = expected['a'] - 1
+ expected['e'] = expected['a'] + 2
+ ans = df.eval("""
+ a = a - 1
+ e = a + 2""", inplace=True)
+ assert_frame_equal(expected, df)
+ assert ans is None
+
+ # multi-line not valid if not all assignments
+ with pytest.raises(ValueError):
+ df.eval("""
+ a = b + 2
+ b - 2""", inplace=False)
+
+ def test_multi_line_expression_not_inplace(self):
+ # GH 11149
+ df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
+ expected = df.copy()
+
+ expected['c'] = expected['a'] + expected['b']
+ expected['d'] = expected['c'] + expected['b']
+ df = df.eval("""
+ c = a + b
+ d = c + b""", inplace=False)
+ assert_frame_equal(expected, df)
+
+ expected['a'] = expected['a'] - 1
+ expected['e'] = expected['a'] + 2
+ df = df.eval("""
+ a = a - 1
+ e = a + 2""", inplace=False)
+ assert_frame_equal(expected, df)
+
+ def test_multi_line_expression_local_variable(self):
+ # GH 15342
+ df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
+ expected = df.copy()
+
+ local_var = 7
+ expected['c'] = expected['a'] * local_var
+ expected['d'] = expected['c'] + local_var
+ ans = df.eval("""
+ c = a * @local_var
+ d = c + @local_var
+ """, inplace=True)
+ assert_frame_equal(expected, df)
+ assert ans is None
+
+ def test_assignment_in_query(self):
+ # GH 8664
+ df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
+ df_orig = df.copy()
+ with pytest.raises(ValueError):
+ df.query('a = 1')
+ assert_frame_equal(df, df_orig)
+
+ def test_query_inplace(self):
+ # see gh-11149
+ df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
+ expected = df.copy()
+ expected = expected[expected['a'] == 2]
+ df.query('a == 2', inplace=True)
+ assert_frame_equal(expected, df)
+
+ df = {}
+ expected = {"a": 3}
+
+ self.eval("a = 1 + 2", target=df, inplace=True)
+ tm.assert_dict_equal(df, expected)
+
+ @pytest.mark.parametrize("invalid_target", [1, "cat", [1, 2],
+ np.array([]), (1, 3)])
+ @pytest.mark.filterwarnings("ignore::FutureWarning")
+ def test_cannot_item_assign(self, invalid_target):
+ msg = "Cannot assign expression output to target"
+ expression = "a = 1 + 2"
+
+ with pytest.raises(ValueError, match=msg):
+ self.eval(expression, target=invalid_target, inplace=True)
+
+ if hasattr(invalid_target, "copy"):
+ with pytest.raises(ValueError, match=msg):
+ self.eval(expression, target=invalid_target, inplace=False)
+
+ @pytest.mark.parametrize("invalid_target", [1, "cat", (1, 3)])
+ def test_cannot_copy_item(self, invalid_target):
+ msg = "Cannot return a copy of the target"
+ expression = "a = 1 + 2"
+
+ with pytest.raises(ValueError, match=msg):
+ self.eval(expression, target=invalid_target, inplace=False)
+
+ @pytest.mark.parametrize("target", [1, "cat", [1, 2],
+ np.array([]), (1, 3), {1: 2}])
+ def test_inplace_no_assignment(self, target):
+ expression = "1 + 2"
+
+ assert self.eval(expression, target=target, inplace=False) == 3
+
+ msg = "Cannot operate inplace if there is no assignment"
+ with pytest.raises(ValueError, match=msg):
+ self.eval(expression, target=target, inplace=True)
+
+ def test_basic_period_index_boolean_expression(self):
+ df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i')
+
+ e = df < 2
+ r = self.eval('df < 2', local_dict={'df': df})
+ x = df < 2
+
+ assert_frame_equal(r, e)
+ assert_frame_equal(x, e)
+
+ def test_basic_period_index_subscript_expression(self):
+ df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i')
+ r = self.eval('df[df < 2 + 3]', local_dict={'df': df})
+ e = df[df < 2 + 3]
+ assert_frame_equal(r, e)
+
+ def test_nested_period_index_subscript_expression(self):
+ df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i')
+ r = self.eval('df[df[df < 2] < 2] + df * 2', local_dict={'df': df})
+ e = df[df[df < 2] < 2] + df * 2
+ assert_frame_equal(r, e)
+
+ def test_date_boolean(self):
+ df = DataFrame(randn(5, 3))
+ df['dates1'] = date_range('1/1/2012', periods=5)
+ res = self.eval('df.dates1 < 20130101', local_dict={'df': df},
+ engine=self.engine, parser=self.parser)
+ expec = df.dates1 < '20130101'
+ assert_series_equal(res, expec, check_names=False)
+
+ def test_simple_in_ops(self):
+ if self.parser != 'python':
+ res = pd.eval('1 in [1, 2]', engine=self.engine,
+ parser=self.parser)
+ assert res
+
+ res = pd.eval('2 in (1, 2)', engine=self.engine,
+ parser=self.parser)
+ assert res
+
+ res = pd.eval('3 in (1, 2)', engine=self.engine,
+ parser=self.parser)
+ assert not res
+
+ res = pd.eval('3 not in (1, 2)', engine=self.engine,
+ parser=self.parser)
+ assert res
+
+ res = pd.eval('[3] not in (1, 2)', engine=self.engine,
+ parser=self.parser)
+ assert res
+
+ res = pd.eval('[3] in ([3], 2)', engine=self.engine,
+ parser=self.parser)
+ assert res
+
+ res = pd.eval('[[3]] in [[[3]], 2]', engine=self.engine,
+ parser=self.parser)
+ assert res
+
+ res = pd.eval('(3,) in [(3,), 2]', engine=self.engine,
+ parser=self.parser)
+ assert res
+
+ res = pd.eval('(3,) not in [(3,), 2]', engine=self.engine,
+ parser=self.parser)
+ assert not res
+
+ res = pd.eval('[(3,)] in [[(3,)], 2]', engine=self.engine,
+ parser=self.parser)
+ assert res
+ else:
+ with pytest.raises(NotImplementedError):
+ pd.eval('1 in [1, 2]', engine=self.engine, parser=self.parser)
+ with pytest.raises(NotImplementedError):
+ pd.eval('2 in (1, 2)', engine=self.engine, parser=self.parser)
+ with pytest.raises(NotImplementedError):
+ pd.eval('3 in (1, 2)', engine=self.engine, parser=self.parser)
+ with pytest.raises(NotImplementedError):
+ pd.eval('3 not in (1, 2)', engine=self.engine,
+ parser=self.parser)
+ with pytest.raises(NotImplementedError):
+ pd.eval('[(3,)] in (1, 2, [(3,)])', engine=self.engine,
+ parser=self.parser)
+ with pytest.raises(NotImplementedError):
+ pd.eval('[3] not in (1, 2, [[3]])', engine=self.engine,
+ parser=self.parser)
+
+
+class TestOperationsNumExprPython(TestOperationsNumExprPandas):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestOperationsNumExprPython, cls).setup_class()
+ cls.engine = 'numexpr'
+ cls.parser = 'python'
+ cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
+ cls.arith_ops = filter(lambda x: x not in ('in', 'not in'),
+ cls.arith_ops)
+
+ def test_check_many_exprs(self):
+ a = 1 # noqa
+ expr = ' * '.join('a' * 33)
+ expected = 1
+ res = pd.eval(expr, engine=self.engine, parser=self.parser)
+ assert res == expected
+
+ def test_fails_and(self):
+ df = DataFrame(np.random.randn(5, 3))
+ pytest.raises(NotImplementedError, pd.eval, 'df > 2 and df > 3',
+ local_dict={'df': df}, parser=self.parser,
+ engine=self.engine)
+
+ def test_fails_or(self):
+ df = DataFrame(np.random.randn(5, 3))
+ pytest.raises(NotImplementedError, pd.eval, 'df > 2 or df > 3',
+ local_dict={'df': df}, parser=self.parser,
+ engine=self.engine)
+
+ def test_fails_not(self):
+ df = DataFrame(np.random.randn(5, 3))
+ pytest.raises(NotImplementedError, pd.eval, 'not df > 2',
+ local_dict={'df': df}, parser=self.parser,
+ engine=self.engine)
+
+ def test_fails_ampersand(self):
+ df = DataFrame(np.random.randn(5, 3)) # noqa
+ ex = '(df + 2)[df > 1] > 0 & (df > 0)'
+ with pytest.raises(NotImplementedError):
+ pd.eval(ex, parser=self.parser, engine=self.engine)
+
+ def test_fails_pipe(self):
+ df = DataFrame(np.random.randn(5, 3)) # noqa
+ ex = '(df + 2)[df > 1] > 0 | (df > 0)'
+ with pytest.raises(NotImplementedError):
+ pd.eval(ex, parser=self.parser, engine=self.engine)
+
+ def test_bool_ops_with_constants(self):
+ for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'),
+ ('True', 'False')):
+ ex = '{0} {1} {2}'.format(lhs, op, rhs)
+ if op in ('and', 'or'):
+ with pytest.raises(NotImplementedError):
+ self.eval(ex)
+ else:
+ res = self.eval(ex)
+ exp = eval(ex)
+ assert res == exp
+
+ def test_simple_bool_ops(self):
+ for op, lhs, rhs in product(expr._bool_ops_syms, (True, False),
+ (True, False)):
+ ex = 'lhs {0} rhs'.format(op)
+ if op in ('and', 'or'):
+ with pytest.raises(NotImplementedError):
+ pd.eval(ex, engine=self.engine, parser=self.parser)
+ else:
+ res = pd.eval(ex, engine=self.engine, parser=self.parser)
+ exp = eval(ex)
+ assert res == exp
+
+
+class TestOperationsPythonPython(TestOperationsNumExprPython):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestOperationsPythonPython, cls).setup_class()
+ cls.engine = cls.parser = 'python'
+ cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
+ cls.arith_ops = filter(lambda x: x not in ('in', 'not in'),
+ cls.arith_ops)
+
+
+class TestOperationsPythonPandas(TestOperationsNumExprPandas):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestOperationsPythonPandas, cls).setup_class()
+ cls.engine = 'python'
+ cls.parser = 'pandas'
+ cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
+
+
+class TestMathPythonPython(object):
+
+ @classmethod
+ def setup_class(cls):
+ cls.engine = 'python'
+ cls.parser = 'pandas'
+ cls.unary_fns = _unary_math_ops
+ cls.binary_fns = _binary_math_ops
+
+ @classmethod
+ def teardown_class(cls):
+ del cls.engine, cls.parser
+
+ def eval(self, *args, **kwargs):
+ kwargs['engine'] = self.engine
+ kwargs['parser'] = self.parser
+ kwargs['level'] = kwargs.pop('level', 0) + 1
+ return pd.eval(*args, **kwargs)
+
+ def test_unary_functions(self, unary_fns_for_ne):
+ df = DataFrame({'a': np.random.randn(10)})
+ a = df.a
+
+ for fn in unary_fns_for_ne:
+ expr = "{0}(a)".format(fn)
+ got = self.eval(expr)
+ with np.errstate(all='ignore'):
+ expect = getattr(np, fn)(a)
+ tm.assert_series_equal(got, expect, check_names=False)
+
+ def test_floor_and_ceil_functions_raise_error(self,
+ ne_lt_2_6_9,
+ unary_fns_for_ne):
+ for fn in ('floor', 'ceil'):
+ msg = "\"{0}\" is not a supported function".format(fn)
+ with pytest.raises(ValueError, match=msg):
+ expr = "{0}(100)".format(fn)
+ self.eval(expr)
+
+ def test_binary_functions(self):
+ df = DataFrame({'a': np.random.randn(10),
+ 'b': np.random.randn(10)})
+ a = df.a
+ b = df.b
+ for fn in self.binary_fns:
+ expr = "{0}(a, b)".format(fn)
+ got = self.eval(expr)
+ with np.errstate(all='ignore'):
+ expect = getattr(np, fn)(a, b)
+ tm.assert_almost_equal(got, expect, check_names=False)
+
+ def test_df_use_case(self):
+ df = DataFrame({'a': np.random.randn(10),
+ 'b': np.random.randn(10)})
+ df.eval("e = arctan2(sin(a), b)",
+ engine=self.engine,
+ parser=self.parser, inplace=True)
+ got = df.e
+ expect = np.arctan2(np.sin(df.a), df.b)
+ tm.assert_series_equal(got, expect, check_names=False)
+
+ def test_df_arithmetic_subexpression(self):
+ df = DataFrame({'a': np.random.randn(10),
+ 'b': np.random.randn(10)})
+ df.eval("e = sin(a + b)",
+ engine=self.engine,
+ parser=self.parser, inplace=True)
+ got = df.e
+ expect = np.sin(df.a + df.b)
+ tm.assert_series_equal(got, expect, check_names=False)
+
+ def check_result_type(self, dtype, expect_dtype):
+ df = DataFrame({'a': np.random.randn(10).astype(dtype)})
+ assert df.a.dtype == dtype
+ df.eval("b = sin(a)",
+ engine=self.engine,
+ parser=self.parser, inplace=True)
+ got = df.b
+ expect = np.sin(df.a)
+ assert expect.dtype == got.dtype
+ assert expect_dtype == got.dtype
+ tm.assert_series_equal(got, expect, check_names=False)
+
+ def test_result_types(self):
+ self.check_result_type(np.int32, np.float64)
+ self.check_result_type(np.int64, np.float64)
+ self.check_result_type(np.float32, np.float32)
+ self.check_result_type(np.float64, np.float64)
+
+ def test_result_types2(self):
+ # xref https://github.com/pandas-dev/pandas/issues/12293
+ pytest.skip("unreliable tests on complex128")
+
+ # Did not test complex64 because DataFrame is converting it to
+ # complex128. Due to https://github.com/pandas-dev/pandas/issues/10952
+ self.check_result_type(np.complex128, np.complex128)
+
+ def test_undefined_func(self):
+ df = DataFrame({'a': np.random.randn(10)})
+ msg = "\"mysin\" is not a supported function"
+
+ with pytest.raises(ValueError, match=msg):
+ df.eval("mysin(a)",
+ engine=self.engine,
+ parser=self.parser)
+
+ def test_keyword_arg(self):
+ df = DataFrame({'a': np.random.randn(10)})
+ msg = "Function \"sin\" does not support keyword arguments"
+
+ with pytest.raises(TypeError, match=msg):
+ df.eval("sin(x=a)",
+ engine=self.engine,
+ parser=self.parser)
+
+
+class TestMathPythonPandas(TestMathPythonPython):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestMathPythonPandas, cls).setup_class()
+ cls.engine = 'python'
+ cls.parser = 'pandas'
+
+
+class TestMathNumExprPandas(TestMathPythonPython):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestMathNumExprPandas, cls).setup_class()
+ cls.engine = 'numexpr'
+ cls.parser = 'pandas'
+
+
+class TestMathNumExprPython(TestMathPythonPython):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestMathNumExprPython, cls).setup_class()
+ cls.engine = 'numexpr'
+ cls.parser = 'python'
+
+
+_var_s = randn(10)
+
+
+class TestScope(object):
+
+ def test_global_scope(self, engine, parser):
+ e = '_var_s * 2'
+ tm.assert_numpy_array_equal(_var_s * 2, pd.eval(e, engine=engine,
+ parser=parser))
+
+ def test_no_new_locals(self, engine, parser):
+ x = 1 # noqa
+ lcls = locals().copy()
+ pd.eval('x + 1', local_dict=lcls, engine=engine, parser=parser)
+ lcls2 = locals().copy()
+ lcls2.pop('lcls')
+ assert lcls == lcls2
+
+ def test_no_new_globals(self, engine, parser):
+ x = 1 # noqa
+ gbls = globals().copy()
+ pd.eval('x + 1', engine=engine, parser=parser)
+ gbls2 = globals().copy()
+ assert gbls == gbls2
+
+
+def test_invalid_engine():
+ msg = 'Invalid engine \'asdf\' passed'
+ with pytest.raises(KeyError, match=msg):
+ pd.eval('x + y', local_dict={'x': 1, 'y': 2}, engine='asdf')
+
+
+def test_invalid_parser():
+ msg = 'Invalid parser \'asdf\' passed'
+ with pytest.raises(KeyError, match=msg):
+ pd.eval('x + y', local_dict={'x': 1, 'y': 2}, parser='asdf')
+
+
+_parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor,
+ 'pandas': PandasExprVisitor}
+
+
[email protected]('engine', _engines)
[email protected]('parser', _parsers)
+def test_disallowed_nodes(engine, parser):
+ VisitorClass = _parsers[parser]
+ uns_ops = VisitorClass.unsupported_nodes
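+    # unsupported_nodes lists visitor method names that are expected to raise
+    # NotImplementedError when invoked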
+ inst = VisitorClass('x + 1', engine, parser)
+
+ for ops in uns_ops:
+ with pytest.raises(NotImplementedError):
+ getattr(inst, ops)()
+
+
+def test_syntax_error_exprs(engine, parser):
+ e = 's +'
+ with pytest.raises(SyntaxError):
+ pd.eval(e, engine=engine, parser=parser)
+
+
+def test_name_error_exprs(engine, parser):
+ e = 's + t'
+ with pytest.raises(NameError):
+ pd.eval(e, engine=engine, parser=parser)
+
+
+def test_invalid_local_variable_reference(engine, parser):
+ a, b = 1, 2 # noqa
+ exprs = 'a + @b', '@a + b', '@a + @b'
+
+ for _expr in exprs:
+ if parser != 'pandas':
+ with pytest.raises(SyntaxError, match="The '@' prefix is only"):
+ pd.eval(_expr, engine=engine, parser=parser)
+ else:
+ with pytest.raises(SyntaxError, match="The '@' prefix is not"):
+ pd.eval(_expr, engine=engine, parser=parser)
+
+
+def test_numexpr_builtin_raises(engine, parser):
+ sin, dotted_line = 1, 2
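+    # 'sin' collides with a numexpr built-in, so the numexpr engine should
+    # refuse to evaluate rather than silently clobber the local variable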
+ if engine == 'numexpr':
+ msg = 'Variables in expression .+'
+ with pytest.raises(NumExprClobberingError, match=msg):
+ pd.eval('sin + dotted_line', engine=engine, parser=parser)
+ else:
+ res = pd.eval('sin + dotted_line', engine=engine, parser=parser)
+ assert res == sin + dotted_line
+
+
+def test_bad_resolver_raises(engine, parser):
+ cannot_resolve = 42, 3.0
+ with pytest.raises(TypeError, match='Resolver of type .+'):
+ pd.eval('1 + 2', resolvers=cannot_resolve, engine=engine,
+ parser=parser)
+
+
+def test_empty_string_raises(engine, parser):
+ # GH 13139
+ with pytest.raises(ValueError, match="expr cannot be an empty string"):
+ pd.eval('', engine=engine, parser=parser)
+
+
+def test_more_than_one_expression_raises(engine, parser):
+ with pytest.raises(SyntaxError, match=("only a single expression "
+ "is allowed")):
+ pd.eval('1 + 1; 2 + 2', engine=engine, parser=parser)
+
+
[email protected]('cmp', ('and', 'or'))
[email protected]('lhs', (int, float))
[email protected]('rhs', (int, float))
+def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser):
+ gen = {int: lambda: np.random.randint(10), float: np.random.randn}
+
+ mid = gen[lhs]() # noqa
+ lhs = gen[lhs]() # noqa
+ rhs = gen[rhs]() # noqa
+
+ ex1 = 'lhs {0} mid {1} rhs'.format(cmp, cmp)
+ ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp, cmp)
+ ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp, cmp)
+ for ex in (ex1, ex2, ex3):
+ with pytest.raises(NotImplementedError):
+ pd.eval(ex, engine=engine, parser=parser)
+
+
+def test_inf(engine, parser):
+ s = 'inf + 1'
+ expected = np.inf
+ result = pd.eval(s, engine=engine, parser=parser)
+ assert result == expected
+
+
+def test_negate_lt_eq_le(engine, parser):
+ df = pd.DataFrame([[0, 10], [1, 20]], columns=['cat', 'count'])
+ expected = df[~(df.cat > 0)]
+
+ result = df.query('~(cat > 0)', engine=engine, parser=parser)
+ tm.assert_frame_equal(result, expected)
+
+ if parser == 'python':
+ with pytest.raises(NotImplementedError):
+ df.query('not (cat > 0)', engine=engine, parser=parser)
+ else:
+ result = df.query('not (cat > 0)', engine=engine, parser=parser)
+ tm.assert_frame_equal(result, expected)
+
+
+class TestValidate(object):
+
+ def test_validate_bool_args(self):
+ invalid_values = [1, "True", [1, 2, 3], 5.0]
+
+ for value in invalid_values:
+ with pytest.raises(ValueError):
+ pd.eval("2+2", inplace=value)
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/__init__.py b/contrib/python/pandas/py2/pandas/tests/dtypes/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/cast/__init__.py b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_from_scalar.py b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_from_scalar.py
new file mode 100644
index 00000000000..d0f58c811e3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_from_scalar.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+
+from pandas.core.dtypes.cast import construct_1d_arraylike_from_scalar
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+from pandas import Categorical
+from pandas.util import testing as tm
+
+
+def test_cast_1d_array_like_from_scalar_categorical():
+ # see gh-19565
+ #
+ # Categorical result from scalar did not maintain
+ # categories and ordering of the passed dtype.
+ cats = ["a", "b", "c"]
+ cat_type = CategoricalDtype(categories=cats, ordered=False)
+ expected = Categorical(["a", "a"], categories=cats)
+
+ result = construct_1d_arraylike_from_scalar("a", len(expected), cat_type)
+ tm.assert_categorical_equal(result, expected,
+ check_category_order=True,
+ check_dtype=True)
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_ndarray.py b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_ndarray.py
new file mode 100644
index 00000000000..aa2cb25e62d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_ndarray.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.cast import construct_1d_ndarray_preserving_na
+
+from pandas.util import testing as tm
+
+
[email protected]('values, dtype, expected', [
+ ([1, 2, 3], None, np.array([1, 2, 3])),
+ (np.array([1, 2, 3]), None, np.array([1, 2, 3])),
+ (['1', '2', None], None, np.array(['1', '2', None])),
+ (['1', '2', None], np.dtype('str'), np.array(['1', '2', None])),
+ ([1, 2, None], np.dtype('str'), np.array(['1', '2', None])),
+])
+def test_construct_1d_ndarray_preserving_na(values, dtype, expected):
+ result = construct_1d_ndarray_preserving_na(values, dtype=dtype)
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_object_arr.py b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_object_arr.py
new file mode 100644
index 00000000000..61fc17880ed
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_construct_object_arr.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
+
+
[email protected]("datum1", [1, 2., "3", (4, 5), [6, 7], None])
[email protected]("datum2", [8, 9., "10", (11, 12), [13, 14], None])
+def test_cast_1d_array(datum1, datum2):
+ data = [datum1, datum2]
+ result = construct_1d_object_array_from_listlike(data)
+
+ # Direct comparison fails: https://github.com/numpy/numpy/issues/10218
+ assert result.dtype == "object"
+ assert list(result) == data
+
+
[email protected]("val", [1, 2., None])
+def test_cast_1d_array_invalid_scalar(val):
+ with pytest.raises(TypeError, match="has no len()"):
+ construct_1d_object_array_from_listlike(val)
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_convert_objects.py b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_convert_objects.py
new file mode 100644
index 00000000000..58ba4161e96
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_convert_objects.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.cast import maybe_convert_objects
+
+
[email protected]("data", [[1, 2], ["apply", "banana"]])
[email protected]("copy", [True, False])
+def test_maybe_convert_objects_copy(data, copy):
+ arr = np.array(data)
+ out = maybe_convert_objects(arr, copy=copy)
+
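+    # with copy=False the input array may be returned unchanged; with
+    # copy=True a new array is expected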
+ assert (arr is out) is (not copy)
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_downcast.py b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_downcast.py
new file mode 100644
index 00000000000..41607c948b9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_downcast.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.cast import maybe_downcast_to_dtype
+
+from pandas import DatetimeIndex, Series, Timestamp
+from pandas.util import testing as tm
+
+
[email protected]("arr,dtype,expected", [
+ (np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]), "infer",
+ np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995])),
+
+ (np.array([8., 8., 8., 8., 8.9999999999995]), "infer",
+ np.array([8, 8, 8, 8, 9], dtype=np.int64)),
+
+ (np.array([8., 8., 8., 8., 9.0000000000005]), "infer",
+ np.array([8, 8, 8, 8, 9], dtype=np.int64)),
+])
+def test_downcast(arr, expected, dtype):
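+    # "infer" should downcast floats to int64 only when every value is
+    # (approximately) integral, as in the second and third cases above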
+ result = maybe_downcast_to_dtype(arr, dtype)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_downcast_booleans():
+ # see gh-16875: coercing of booleans.
+ ser = Series([True, True, False])
+ result = maybe_downcast_to_dtype(ser, np.dtype(np.float64))
+
+ expected = ser
+ tm.assert_series_equal(result, expected)
+
+
+def test_downcast_conversion_no_nan(any_real_dtype):
+ dtype = any_real_dtype
+ expected = np.array([1, 2])
+ arr = np.array([1.0, 2.0], dtype=dtype)
+
+ result = maybe_downcast_to_dtype(arr, "infer")
+ tm.assert_almost_equal(result, expected, check_dtype=False)
+
+
+def test_downcast_conversion_nan(float_dtype):
+ dtype = float_dtype
+ data = [1.0, 2.0, np.nan]
+
+ expected = np.array(data, dtype=dtype)
+ arr = np.array(data, dtype=dtype)
+
+ result = maybe_downcast_to_dtype(arr, "infer")
+ tm.assert_almost_equal(result, expected)
+
+
+def test_downcast_conversion_empty(any_real_dtype):
+ dtype = any_real_dtype
+ arr = np.array([], dtype=dtype)
+ result = maybe_downcast_to_dtype(arr, "int64")
+ tm.assert_numpy_array_equal(result, np.array([], dtype=np.int64))
+
+
[email protected]("klass", [np.datetime64, np.timedelta64])
+def test_datetime_likes_nan(klass):
+ dtype = klass.__name__ + "[ns]"
+ arr = np.array([1, 2, np.nan])
+
+ exp = np.array([1, 2, klass("NaT")], dtype)
+ res = maybe_downcast_to_dtype(arr, dtype)
+ tm.assert_numpy_array_equal(res, exp)
+
+
[email protected]("as_asi", [True, False])
+def test_datetime_with_timezone(as_asi):
+ # see gh-15426
+ ts = Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
+ exp = DatetimeIndex([ts, ts])
+
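+    # asi8 exposes the underlying int64 nanosecond values; both forms should
+    # round-trip back to the tz-aware index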
+ obj = exp.asi8 if as_asi else exp
+ res = maybe_downcast_to_dtype(obj, exp.dtype)
+
+ tm.assert_index_equal(res, exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_find_common_type.py b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_find_common_type.py
new file mode 100644
index 00000000000..d83c8d03e9e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_find_common_type.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.cast import find_common_type
+from pandas.core.dtypes.dtypes import (
+ CategoricalDtype, DatetimeTZDtype, PeriodDtype)
+
+
[email protected]("source_dtypes,expected_common_dtype", [
+ ((np.int64,), np.int64),
+ ((np.uint64,), np.uint64),
+ ((np.float32,), np.float32),
+ ((np.object,), np.object),
+
+ # Into ints.
+ ((np.int16, np.int64), np.int64),
+ ((np.int32, np.uint32), np.int64),
+ ((np.uint16, np.uint64), np.uint64),
+
+ # Into floats.
+ ((np.float16, np.float32), np.float32),
+ ((np.float16, np.int16), np.float32),
+ ((np.float32, np.int16), np.float32),
+ ((np.uint64, np.int64), np.float64),
+ ((np.int16, np.float64), np.float64),
+ ((np.float16, np.int64), np.float64),
+
+ # Into others.
+ ((np.complex128, np.int32), np.complex128),
+ ((np.object, np.float32), np.object),
+ ((np.object, np.int16), np.object),
+
+ # Bool with int.
+ ((np.dtype("bool"), np.int64), np.object),
+ ((np.dtype("bool"), np.int32), np.object),
+ ((np.dtype("bool"), np.int16), np.object),
+ ((np.dtype("bool"), np.int8), np.object),
+ ((np.dtype("bool"), np.uint64), np.object),
+ ((np.dtype("bool"), np.uint32), np.object),
+ ((np.dtype("bool"), np.uint16), np.object),
+ ((np.dtype("bool"), np.uint8), np.object),
+
+ # Bool with float.
+ ((np.dtype("bool"), np.float64), np.object),
+ ((np.dtype("bool"), np.float32), np.object),
+
+ ((np.dtype("datetime64[ns]"), np.dtype("datetime64[ns]")),
+ np.dtype("datetime64[ns]")),
+ ((np.dtype("timedelta64[ns]"), np.dtype("timedelta64[ns]")),
+ np.dtype("timedelta64[ns]")),
+
+ ((np.dtype("datetime64[ns]"), np.dtype("datetime64[ms]")),
+ np.dtype("datetime64[ns]")),
+ ((np.dtype("timedelta64[ms]"), np.dtype("timedelta64[ns]")),
+ np.dtype("timedelta64[ns]")),
+
+ ((np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")), np.object),
+ ((np.dtype("datetime64[ns]"), np.int64), np.object)
+])
+def test_numpy_dtypes(source_dtypes, expected_common_dtype):
+ assert find_common_type(source_dtypes) == expected_common_dtype
+
+
+def test_raises_empty_input():
+ with pytest.raises(ValueError, match="no types given"):
+ find_common_type([])
+
+
[email protected]("dtypes,exp_type", [
+ ([CategoricalDtype()], "category"),
+ ([np.object, CategoricalDtype()], np.object),
+ ([CategoricalDtype(), CategoricalDtype()], "category"),
+])
+def test_categorical_dtype(dtypes, exp_type):
+ assert find_common_type(dtypes) == exp_type
+
+
+def test_datetimetz_dtype_match():
+ dtype = DatetimeTZDtype(unit="ns", tz="US/Eastern")
+ assert find_common_type([dtype, dtype]) == "datetime64[ns, US/Eastern]"
+
+
[email protected]("dtype2", [
+ DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"),
+ np.dtype("datetime64[ns]"), np.object, np.int64
+])
+def test_datetimetz_dtype_mismatch(dtype2):
+ dtype = DatetimeTZDtype(unit="ns", tz="US/Eastern")
+ assert find_common_type([dtype, dtype2]) == np.object
+ assert find_common_type([dtype2, dtype]) == np.object
+
+
+def test_period_dtype_match():
+ dtype = PeriodDtype(freq="D")
+ assert find_common_type([dtype, dtype]) == "period[D]"
+
+
[email protected]("dtype2", [
+ DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"),
+ PeriodDtype(freq="2D"), PeriodDtype(freq="H"),
+ np.dtype("datetime64[ns]"), np.object, np.int64
+])
+def test_period_dtype_mismatch(dtype2):
+ dtype = PeriodDtype(freq="D")
+ assert find_common_type([dtype, dtype2]) == np.object
+ assert find_common_type([dtype2, dtype]) == np.object
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_infer_datetimelike.py b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_infer_datetimelike.py
new file mode 100644
index 00000000000..b2d63a6bfbd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_infer_datetimelike.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, NaT, Series, Timestamp
+
+
[email protected]("data,exp_size", [
+ # see gh-16362.
+ ([[NaT, "a", "b", 0], [NaT, "b", "c", 1]], 8),
+ ([[NaT, "a", 0], [NaT, "b", 1]], 6)
+])
+def test_maybe_infer_to_datetimelike_df_construct(data, exp_size):
+ result = DataFrame(np.array(data))
+ assert result.size == exp_size
+
+
+def test_maybe_infer_to_datetimelike_ser_construct():
+ # see gh-19671.
+ result = Series(["M1701", Timestamp("20130101")])
+ assert result.dtype.kind == "O"
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_infer_dtype.py b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_infer_dtype.py
new file mode 100644
index 00000000000..c7842ac591e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/cast/test_infer_dtype.py
@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+
+from datetime import date, datetime, timedelta
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.cast import (
+ cast_scalar_to_array, infer_dtype_from_array, infer_dtype_from_scalar)
+from pandas.core.dtypes.common import is_dtype_equal
+
+from pandas import (
+ Categorical, Period, Series, Timedelta, Timestamp, date_range)
+from pandas.util import testing as tm
+
+
[email protected](params=[True, False])
+def pandas_dtype(request):
+ return request.param
+
+
+def test_infer_dtype_from_int_scalar(any_int_dtype):
+ # Test that infer_dtype_from_scalar is
+ # returning correct dtype for int and float.
+ data = np.dtype(any_int_dtype).type(12)
+ dtype, val = infer_dtype_from_scalar(data)
+ assert dtype == type(data)
+
+
+def test_infer_dtype_from_float_scalar(float_dtype):
+ float_dtype = np.dtype(float_dtype).type
+ data = float_dtype(12)
+
+ dtype, val = infer_dtype_from_scalar(data)
+ assert dtype == float_dtype
+
+
[email protected]("data,exp_dtype", [
+ (12, np.int64), (np.float(12), np.float64)
+])
+def test_infer_dtype_from_python_scalar(data, exp_dtype):
+ dtype, val = infer_dtype_from_scalar(data)
+ assert dtype == exp_dtype
+
+
[email protected]("bool_val", [True, False])
+def test_infer_dtype_from_boolean(bool_val):
+ dtype, val = infer_dtype_from_scalar(bool_val)
+ assert dtype == np.bool_
+
+
+def test_infer_dtype_from_complex(complex_dtype):
+ data = np.dtype(complex_dtype).type(1)
+ dtype, val = infer_dtype_from_scalar(data)
+ assert dtype == np.complex_
+
+
[email protected]("data", [np.datetime64(1, "ns"), Timestamp(1),
+ datetime(2000, 1, 1, 0, 0)])
+def test_infer_dtype_from_datetime(data):
+ dtype, val = infer_dtype_from_scalar(data)
+ assert dtype == "M8[ns]"
+
+
[email protected]("data", [np.timedelta64(1, "ns"), Timedelta(1),
+ timedelta(1)])
+def test_infer_dtype_from_timedelta(data):
+ dtype, val = infer_dtype_from_scalar(data)
+ assert dtype == "m8[ns]"
+
+
[email protected]("freq", ["M", "D"])
+def test_infer_dtype_from_period(freq, pandas_dtype):
+ p = Period("2011-01-01", freq=freq)
+ dtype, val = infer_dtype_from_scalar(p, pandas_dtype=pandas_dtype)
+
+ if pandas_dtype:
+ exp_dtype = "period[{0}]".format(freq)
+ exp_val = p.ordinal
+ else:
+ exp_dtype = np.object_
+ exp_val = p
+
+ assert dtype == exp_dtype
+ assert val == exp_val
+
+
[email protected]("data", [date(2000, 1, 1), "foo",
+ Timestamp(1, tz="US/Eastern")])
+def test_infer_dtype_misc(data):
+ dtype, val = infer_dtype_from_scalar(data)
+ assert dtype == np.object_
+
+
[email protected]("tz", ["UTC", "US/Eastern", "Asia/Tokyo"])
+def test_infer_from_scalar_tz(tz, pandas_dtype):
+ dt = Timestamp(1, tz=tz)
+ dtype, val = infer_dtype_from_scalar(dt, pandas_dtype=pandas_dtype)
+
+ if pandas_dtype:
+ exp_dtype = "datetime64[ns, {0}]".format(tz)
+ exp_val = dt.value
+ else:
+ exp_dtype = np.object_
+ exp_val = dt
+
+ assert dtype == exp_dtype
+ assert val == exp_val
+
+
+def test_infer_dtype_from_scalar_errors():
+ msg = "invalid ndarray passed to infer_dtype_from_scalar"
+
+ with pytest.raises(ValueError, match=msg):
+ infer_dtype_from_scalar(np.array([1]))
+
+
+ "arr, expected, pandas_dtype",
+ [("foo", np.object_, False),
+ (b"foo", np.object_, False),
+ (1, np.int_, False),
+ (1.5, np.float_, False),
+ ([1], np.int_, False),
+ (np.array([1], dtype=np.int64), np.int64, False),
+ ([np.nan, 1, ""], np.object_, False),
+ (np.array([[1.0, 2.0]]), np.float_, False),
+ (Categorical(list("aabc")), np.object_, False),
+ (Categorical([1, 2, 3]), np.int64, False),
+ (Categorical(list("aabc")), "category", True),
+ (Categorical([1, 2, 3]), "category", True),
+ (Timestamp("20160101"), np.object_, False),
+ (np.datetime64("2016-01-01"), np.dtype("=M8[D]"), False),
+ (date_range("20160101", periods=3),
+ np.dtype("=M8[ns]"), False),
+ (date_range("20160101", periods=3, tz="US/Eastern"),
+ "datetime64[ns, US/Eastern]", True),
+ (Series([1., 2, 3]), np.float64, False),
+ (Series(list("abc")), np.object_, False),
+ (Series(date_range("20160101", periods=3, tz="US/Eastern")),
+ "datetime64[ns, US/Eastern]", True)])
+def test_infer_dtype_from_array(arr, expected, pandas_dtype):
+ dtype, _ = infer_dtype_from_array(arr, pandas_dtype=pandas_dtype)
+ assert is_dtype_equal(dtype, expected)
+
+
[email protected]("obj,dtype", [
+ (1, np.int64), (1.1, np.float64),
+ (Timestamp("2011-01-01"), "datetime64[ns]"),
+ (Timestamp("2011-01-01", tz="US/Eastern"), np.object),
+ (Period("2011-01-01", freq="D"), np.object)
+])
+def test_cast_scalar_to_array(obj, dtype):
+ shape = (3, 2)
+
+ exp = np.empty(shape, dtype=dtype)
+ exp.fill(obj)
+
+ arr = cast_scalar_to_array(shape, obj, dtype=dtype)
+ tm.assert_numpy_array_equal(arr, exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/test_common.py b/contrib/python/pandas/py2/pandas/tests/dtypes/test_common.py
new file mode 100644
index 00000000000..62e96fd39a7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/test_common.py
@@ -0,0 +1,653 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas.core.dtypes.common as com
+from pandas.core.dtypes.dtypes import (
+ CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, IntervalDtype,
+ PeriodDtype)
+
+import pandas as pd
+from pandas.conftest import (
+ ALL_EA_INT_DTYPES, ALL_INT_DTYPES, SIGNED_EA_INT_DTYPES, SIGNED_INT_DTYPES,
+ UNSIGNED_EA_INT_DTYPES, UNSIGNED_INT_DTYPES)
+from pandas.core.sparse.api import SparseDtype
+import pandas.util.testing as tm
+
+
+# EA & Actual Dtypes
+def to_ea_dtypes(dtypes):
+ """ convert list of string dtypes to EA dtype """
+ return [getattr(pd, dt + 'Dtype') for dt in dtypes]
+
+
+def to_numpy_dtypes(dtypes):
+ """ convert list of string dtypes to numpy dtype """
+ return [getattr(np, dt) for dt in dtypes if isinstance(dt, str)]
+
+
+class TestPandasDtype(object):
+
+ # Passing invalid dtype, both as a string or object, must raise TypeError
+ # Per issue GH15520
+ @pytest.mark.parametrize('box', [pd.Timestamp, 'pd.Timestamp', list])
+ def test_invalid_dtype_error(self, box):
+ with pytest.raises(TypeError, match='not understood'):
+ com.pandas_dtype(box)
+
+ @pytest.mark.parametrize('dtype', [
+ object, 'float64', np.object_, np.dtype('object'), 'O',
+ np.float64, float, np.dtype('float64')])
+ def test_pandas_dtype_valid(self, dtype):
+ assert com.pandas_dtype(dtype) == dtype
+
+ @pytest.mark.parametrize('dtype', [
+ 'M8[ns]', 'm8[ns]', 'object', 'float64', 'int64'])
+ def test_numpy_dtype(self, dtype):
+ assert com.pandas_dtype(dtype) == np.dtype(dtype)
+
+ def test_numpy_string_dtype(self):
+ # do not parse freq-like string as period dtype
+ assert com.pandas_dtype('U') == np.dtype('U')
+ assert com.pandas_dtype('S') == np.dtype('S')
+
+ @pytest.mark.parametrize('dtype', [
+ 'datetime64[ns, US/Eastern]',
+ 'datetime64[ns, Asia/Tokyo]',
+ 'datetime64[ns, UTC]'])
+ def test_datetimetz_dtype(self, dtype):
+ assert (com.pandas_dtype(dtype) ==
+ DatetimeTZDtype.construct_from_string(dtype))
+ assert com.pandas_dtype(dtype) == dtype
+
+ def test_categorical_dtype(self):
+ assert com.pandas_dtype('category') == CategoricalDtype()
+
+ @pytest.mark.parametrize('dtype', [
+ 'period[D]', 'period[3M]', 'period[U]',
+ 'Period[D]', 'Period[3M]', 'Period[U]'])
+ def test_period_dtype(self, dtype):
+ assert com.pandas_dtype(dtype) is PeriodDtype(dtype)
+ assert com.pandas_dtype(dtype) == PeriodDtype(dtype)
+ assert com.pandas_dtype(dtype) == dtype
+
+
+dtypes = dict(datetime_tz=com.pandas_dtype('datetime64[ns, US/Eastern]'),
+ datetime=com.pandas_dtype('datetime64[ns]'),
+ timedelta=com.pandas_dtype('timedelta64[ns]'),
+ period=PeriodDtype('D'),
+ integer=np.dtype(np.int64),
+ float=np.dtype(np.float64),
+ object=np.dtype(np.object),
+ category=com.pandas_dtype('category'))
+
+
[email protected]('name1,dtype1',
+ list(dtypes.items()),
+ ids=lambda x: str(x))
[email protected]('name2,dtype2',
+ list(dtypes.items()),
+ ids=lambda x: str(x))
+def test_dtype_equal(name1, dtype1, name2, dtype2):
+
+ # match equal to self, but not equal to other
+ assert com.is_dtype_equal(dtype1, dtype1)
+ if name1 != name2:
+ assert not com.is_dtype_equal(dtype1, dtype2)
+
+
[email protected]("dtype1,dtype2", [
+ (np.int8, np.int64),
+ (np.int16, np.int64),
+ (np.int32, np.int64),
+ (np.float32, np.float64),
+ (PeriodDtype("D"), PeriodDtype("2D")), # PeriodType
+ (com.pandas_dtype("datetime64[ns, US/Eastern]"),
+ com.pandas_dtype("datetime64[ns, CET]")), # Datetime
+ (None, None) # gh-15941: no exception should be raised.
+])
+def test_dtype_equal_strict(dtype1, dtype2):
+ assert not com.is_dtype_equal(dtype1, dtype2)
+
+
+def get_is_dtype_funcs():
+ """
+ Get all functions in pandas.core.dtypes.common that
+ begin with 'is_' and end with 'dtype'
+
+ """
+
+ fnames = [f for f in dir(com) if (f.startswith('is_') and
+ f.endswith('dtype'))]
+ return [getattr(com, fname) for fname in fnames]
+
+
[email protected](
+    'func',
+    get_is_dtype_funcs(),
+    ids=lambda x: x.__name__)
+def test_get_dtype_error_catch(func):
+ # see gh-15941
+ #
+ # No exception should be raised.
+
+ assert not func(None)
+
+
+def test_is_object():
+ assert com.is_object_dtype(object)
+ assert com.is_object_dtype(np.array([], dtype=object))
+
+ assert not com.is_object_dtype(int)
+ assert not com.is_object_dtype(np.array([], dtype=int))
+ assert not com.is_object_dtype([1, 2, 3])
+
+
[email protected]("check_scipy", [
+ False, pytest.param(True, marks=td.skip_if_no_scipy)
+])
+def test_is_sparse(check_scipy):
+ assert com.is_sparse(pd.SparseArray([1, 2, 3]))
+ assert com.is_sparse(pd.SparseSeries([1, 2, 3]))
+
+ assert not com.is_sparse(np.array([1, 2, 3]))
+
+ if check_scipy:
+ import scipy.sparse
+ assert not com.is_sparse(scipy.sparse.bsr_matrix([1, 2, 3]))
+
+
[email protected]_if_no_scipy
+def test_is_scipy_sparse():
+ from scipy.sparse import bsr_matrix
+ assert com.is_scipy_sparse(bsr_matrix([1, 2, 3]))
+
+ assert not com.is_scipy_sparse(pd.SparseArray([1, 2, 3]))
+ assert not com.is_scipy_sparse(pd.SparseSeries([1, 2, 3]))
+
+
+def test_is_categorical():
+ cat = pd.Categorical([1, 2, 3])
+ assert com.is_categorical(cat)
+ assert com.is_categorical(pd.Series(cat))
+ assert com.is_categorical(pd.CategoricalIndex([1, 2, 3]))
+
+ assert not com.is_categorical([1, 2, 3])
+
+
+def test_is_datetimetz():
+ with tm.assert_produces_warning(FutureWarning):
+ assert not com.is_datetimetz([1, 2, 3])
+ assert not com.is_datetimetz(pd.DatetimeIndex([1, 2, 3]))
+
+ assert com.is_datetimetz(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))
+
+ dtype = DatetimeTZDtype("ns", tz="US/Eastern")
+ s = pd.Series([], dtype=dtype)
+ assert com.is_datetimetz(s)
+
+
+def test_is_period_deprecated():
+ with tm.assert_produces_warning(FutureWarning):
+ assert not com.is_period([1, 2, 3])
+ assert not com.is_period(pd.Index([1, 2, 3]))
+ assert com.is_period(pd.PeriodIndex(["2017-01-01"], freq="D"))
+
+
+def test_is_datetime64_dtype():
+ assert not com.is_datetime64_dtype(object)
+ assert not com.is_datetime64_dtype([1, 2, 3])
+ assert not com.is_datetime64_dtype(np.array([], dtype=int))
+
+ assert com.is_datetime64_dtype(np.datetime64)
+ assert com.is_datetime64_dtype(np.array([], dtype=np.datetime64))
+
+
+def test_is_datetime64tz_dtype():
+ assert not com.is_datetime64tz_dtype(object)
+ assert not com.is_datetime64tz_dtype([1, 2, 3])
+ assert not com.is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3]))
+ assert com.is_datetime64tz_dtype(pd.DatetimeIndex(['2000'],
+ tz="US/Eastern"))
+
+
+def test_is_timedelta64_dtype():
+ assert not com.is_timedelta64_dtype(object)
+ assert not com.is_timedelta64_dtype(None)
+ assert not com.is_timedelta64_dtype([1, 2, 3])
+ assert not com.is_timedelta64_dtype(np.array([], dtype=np.datetime64))
+ assert not com.is_timedelta64_dtype('0 days')
+ assert not com.is_timedelta64_dtype("0 days 00:00:00")
+ assert not com.is_timedelta64_dtype(["0 days 00:00:00"])
+ assert not com.is_timedelta64_dtype("NO DATE")
+
+ assert com.is_timedelta64_dtype(np.timedelta64)
+ assert com.is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]"))
+ assert com.is_timedelta64_dtype(pd.to_timedelta(['0 days', '1 days']))
+
+
+def test_is_period_dtype():
+ assert not com.is_period_dtype(object)
+ assert not com.is_period_dtype([1, 2, 3])
+ assert not com.is_period_dtype(pd.Period("2017-01-01"))
+
+ assert com.is_period_dtype(PeriodDtype(freq="D"))
+ assert com.is_period_dtype(pd.PeriodIndex([], freq="A"))
+
+
+def test_is_interval_dtype():
+ assert not com.is_interval_dtype(object)
+ assert not com.is_interval_dtype([1, 2, 3])
+
+ assert com.is_interval_dtype(IntervalDtype())
+
+ interval = pd.Interval(1, 2, closed="right")
+ assert not com.is_interval_dtype(interval)
+ assert com.is_interval_dtype(pd.IntervalIndex([interval]))
+
+
+def test_is_categorical_dtype():
+ assert not com.is_categorical_dtype(object)
+ assert not com.is_categorical_dtype([1, 2, 3])
+
+ assert com.is_categorical_dtype(CategoricalDtype())
+ assert com.is_categorical_dtype(pd.Categorical([1, 2, 3]))
+ assert com.is_categorical_dtype(pd.CategoricalIndex([1, 2, 3]))
+
+
+def test_is_string_dtype():
+ assert not com.is_string_dtype(int)
+ assert not com.is_string_dtype(pd.Series([1, 2]))
+
+ assert com.is_string_dtype(str)
+ assert com.is_string_dtype(object)
+ assert com.is_string_dtype(np.array(['a', 'b']))
+
+
+def test_is_period_arraylike():
+ assert not com.is_period_arraylike([1, 2, 3])
+ assert not com.is_period_arraylike(pd.Index([1, 2, 3]))
+ assert com.is_period_arraylike(pd.PeriodIndex(["2017-01-01"], freq="D"))
+
+
+def test_is_datetime_arraylike():
+ assert not com.is_datetime_arraylike([1, 2, 3])
+ assert not com.is_datetime_arraylike(pd.Index([1, 2, 3]))
+ assert com.is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3]))
+
+
+def test_is_datetimelike():
+ assert not com.is_datetimelike([1, 2, 3])
+ assert not com.is_datetimelike(pd.Index([1, 2, 3]))
+
+ assert com.is_datetimelike(pd.DatetimeIndex([1, 2, 3]))
+ assert com.is_datetimelike(pd.PeriodIndex([], freq="A"))
+ assert com.is_datetimelike(np.array([], dtype=np.datetime64))
+ assert com.is_datetimelike(pd.Series([], dtype="timedelta64[ns]"))
+ assert com.is_datetimelike(pd.DatetimeIndex(["2000"], tz="US/Eastern"))
+
+ dtype = DatetimeTZDtype("ns", tz="US/Eastern")
+ s = pd.Series([], dtype=dtype)
+ assert com.is_datetimelike(s)
+
+
[email protected](
+    'dtype', [
+ pd.Series([1, 2])] +
+ ALL_INT_DTYPES + to_numpy_dtypes(ALL_INT_DTYPES) +
+ ALL_EA_INT_DTYPES + to_ea_dtypes(ALL_EA_INT_DTYPES))
+def test_is_integer_dtype(dtype):
+ assert com.is_integer_dtype(dtype)
+
+
[email protected](
+    'dtype', [str, float, np.datetime64, np.timedelta64,
+ pd.Index([1, 2.]), np.array(['a', 'b']),
+ np.array([], dtype=np.timedelta64)])
+def test_is_not_integer_dtype(dtype):
+ assert not com.is_integer_dtype(dtype)
+
+
[email protected](
+    'dtype', [
+ pd.Series([1, 2])] +
+ SIGNED_INT_DTYPES + to_numpy_dtypes(SIGNED_INT_DTYPES) +
+ SIGNED_EA_INT_DTYPES + to_ea_dtypes(SIGNED_EA_INT_DTYPES))
+def test_is_signed_integer_dtype(dtype):
+    assert com.is_signed_integer_dtype(dtype)
+
+
[email protected](
+    'dtype',
+ [
+ str, float, np.datetime64, np.timedelta64,
+ pd.Index([1, 2.]), np.array(['a', 'b']),
+ np.array([], dtype=np.timedelta64)] +
+ UNSIGNED_INT_DTYPES + to_numpy_dtypes(UNSIGNED_INT_DTYPES) +
+ UNSIGNED_EA_INT_DTYPES + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES))
+def test_is_not_signed_integer_dtype(dtype):
+ assert not com.is_signed_integer_dtype(dtype)
+
+
[email protected](
+    'dtype',
+ [pd.Series([1, 2], dtype=np.uint32)] +
+ UNSIGNED_INT_DTYPES + to_numpy_dtypes(UNSIGNED_INT_DTYPES) +
+ UNSIGNED_EA_INT_DTYPES + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES))
+def test_is_unsigned_integer_dtype(dtype):
+ assert com.is_unsigned_integer_dtype(dtype)
+
+
[email protected](
+    'dtype',
+ [
+ str, float, np.datetime64, np.timedelta64,
+ pd.Index([1, 2.]), np.array(['a', 'b']),
+ np.array([], dtype=np.timedelta64)] +
+ SIGNED_INT_DTYPES + to_numpy_dtypes(SIGNED_INT_DTYPES) +
+ SIGNED_EA_INT_DTYPES + to_ea_dtypes(SIGNED_EA_INT_DTYPES))
+def test_is_not_unsigned_integer_dtype(dtype):
+ assert not com.is_unsigned_integer_dtype(dtype)
+
+
[email protected](
+    'dtype',
+ [np.int64, np.array([1, 2], dtype=np.int64), 'Int64', pd.Int64Dtype])
+def test_is_int64_dtype(dtype):
+ assert com.is_int64_dtype(dtype)
+
+
[email protected](
+    'dtype',
+ [
+ str, float, np.int32, np.uint64, pd.Index([1, 2.]),
+ np.array(['a', 'b']), np.array([1, 2], dtype=np.uint32),
+ 'int8', 'Int8', pd.Int8Dtype])
+def test_is_not_int64_dtype(dtype):
+ assert not com.is_int64_dtype(dtype)
+
+
+def test_is_datetime64_any_dtype():
+ assert not com.is_datetime64_any_dtype(int)
+ assert not com.is_datetime64_any_dtype(str)
+ assert not com.is_datetime64_any_dtype(np.array([1, 2]))
+ assert not com.is_datetime64_any_dtype(np.array(['a', 'b']))
+
+ assert com.is_datetime64_any_dtype(np.datetime64)
+ assert com.is_datetime64_any_dtype(np.array([], dtype=np.datetime64))
+ assert com.is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern"))
+ assert com.is_datetime64_any_dtype(
+ pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]"))
+
+
+def test_is_datetime64_ns_dtype():
+ assert not com.is_datetime64_ns_dtype(int)
+ assert not com.is_datetime64_ns_dtype(str)
+ assert not com.is_datetime64_ns_dtype(np.datetime64)
+ assert not com.is_datetime64_ns_dtype(np.array([1, 2]))
+ assert not com.is_datetime64_ns_dtype(np.array(['a', 'b']))
+ assert not com.is_datetime64_ns_dtype(np.array([], dtype=np.datetime64))
+
+ # This datetime array has the wrong unit (ps instead of ns)
+ assert not com.is_datetime64_ns_dtype(np.array([], dtype="datetime64[ps]"))
+
+ assert com.is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern"))
+ assert com.is_datetime64_ns_dtype(
+ pd.DatetimeIndex([1, 2, 3], dtype=np.dtype('datetime64[ns]')))
+
+
+def test_is_timedelta64_ns_dtype():
+ assert not com.is_timedelta64_ns_dtype(np.dtype('m8[ps]'))
+ assert not com.is_timedelta64_ns_dtype(
+ np.array([1, 2], dtype=np.timedelta64))
+
+ assert com.is_timedelta64_ns_dtype(np.dtype('m8[ns]'))
+ assert com.is_timedelta64_ns_dtype(np.array([1, 2], dtype='m8[ns]'))
+
+
+def test_is_datetime_or_timedelta_dtype():
+ assert not com.is_datetime_or_timedelta_dtype(int)
+ assert not com.is_datetime_or_timedelta_dtype(str)
+ assert not com.is_datetime_or_timedelta_dtype(pd.Series([1, 2]))
+ assert not com.is_datetime_or_timedelta_dtype(np.array(['a', 'b']))
+
+    # TODO(jreback), this is slightly suspect
+ assert not com.is_datetime_or_timedelta_dtype(
+ DatetimeTZDtype("ns", "US/Eastern"))
+
+ assert com.is_datetime_or_timedelta_dtype(np.datetime64)
+ assert com.is_datetime_or_timedelta_dtype(np.timedelta64)
+ assert com.is_datetime_or_timedelta_dtype(
+ np.array([], dtype=np.timedelta64))
+ assert com.is_datetime_or_timedelta_dtype(
+ np.array([], dtype=np.datetime64))
+
+
+def test_is_numeric_v_string_like():
+ assert not com.is_numeric_v_string_like(1, 1)
+ assert not com.is_numeric_v_string_like(1, "foo")
+ assert not com.is_numeric_v_string_like("foo", "foo")
+ assert not com.is_numeric_v_string_like(np.array([1]), np.array([2]))
+ assert not com.is_numeric_v_string_like(
+ np.array(["foo"]), np.array(["foo"]))
+
+ assert com.is_numeric_v_string_like(np.array([1]), "foo")
+ assert com.is_numeric_v_string_like("foo", np.array([1]))
+ assert com.is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"]))
+ assert com.is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2]))
+
+
+def test_is_datetimelike_v_numeric():
+ dt = np.datetime64(pd.datetime(2017, 1, 1))
+
+ assert not com.is_datetimelike_v_numeric(1, 1)
+ assert not com.is_datetimelike_v_numeric(dt, dt)
+ assert not com.is_datetimelike_v_numeric(np.array([1]), np.array([2]))
+ assert not com.is_datetimelike_v_numeric(np.array([dt]), np.array([dt]))
+
+ assert com.is_datetimelike_v_numeric(1, dt)
+    assert com.is_datetimelike_v_numeric(dt, 1)
+ assert com.is_datetimelike_v_numeric(np.array([dt]), 1)
+ assert com.is_datetimelike_v_numeric(np.array([1]), dt)
+ assert com.is_datetimelike_v_numeric(np.array([dt]), np.array([1]))
+
+
+def test_is_datetimelike_v_object():
+ obj = object()
+ dt = np.datetime64(pd.datetime(2017, 1, 1))
+
+ assert not com.is_datetimelike_v_object(dt, dt)
+ assert not com.is_datetimelike_v_object(obj, obj)
+ assert not com.is_datetimelike_v_object(np.array([dt]), np.array([1]))
+ assert not com.is_datetimelike_v_object(np.array([dt]), np.array([dt]))
+ assert not com.is_datetimelike_v_object(np.array([obj]), np.array([obj]))
+
+ assert com.is_datetimelike_v_object(dt, obj)
+ assert com.is_datetimelike_v_object(obj, dt)
+ assert com.is_datetimelike_v_object(np.array([dt]), obj)
+ assert com.is_datetimelike_v_object(np.array([obj]), dt)
+ assert com.is_datetimelike_v_object(np.array([dt]), np.array([obj]))
+
+
+def test_needs_i8_conversion():
+ assert not com.needs_i8_conversion(str)
+ assert not com.needs_i8_conversion(np.int64)
+ assert not com.needs_i8_conversion(pd.Series([1, 2]))
+ assert not com.needs_i8_conversion(np.array(['a', 'b']))
+
+ assert com.needs_i8_conversion(np.datetime64)
+ assert com.needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]"))
+ assert com.needs_i8_conversion(pd.DatetimeIndex(
+ ["2000"], tz="US/Eastern"))
+
+
+def test_is_numeric_dtype():
+ assert not com.is_numeric_dtype(str)
+ assert not com.is_numeric_dtype(np.datetime64)
+ assert not com.is_numeric_dtype(np.timedelta64)
+ assert not com.is_numeric_dtype(np.array(['a', 'b']))
+ assert not com.is_numeric_dtype(np.array([], dtype=np.timedelta64))
+
+ assert com.is_numeric_dtype(int)
+ assert com.is_numeric_dtype(float)
+ assert com.is_numeric_dtype(np.uint64)
+ assert com.is_numeric_dtype(pd.Series([1, 2]))
+ assert com.is_numeric_dtype(pd.Index([1, 2.]))
+
+
+def test_is_string_like_dtype():
+ assert not com.is_string_like_dtype(object)
+ assert not com.is_string_like_dtype(pd.Series([1, 2]))
+
+ assert com.is_string_like_dtype(str)
+ assert com.is_string_like_dtype(np.array(['a', 'b']))
+
+
+def test_is_float_dtype():
+ assert not com.is_float_dtype(str)
+ assert not com.is_float_dtype(int)
+ assert not com.is_float_dtype(pd.Series([1, 2]))
+ assert not com.is_float_dtype(np.array(['a', 'b']))
+
+ assert com.is_float_dtype(float)
+ assert com.is_float_dtype(pd.Index([1, 2.]))
+
+
+def test_is_bool_dtype():
+ assert not com.is_bool_dtype(int)
+ assert not com.is_bool_dtype(str)
+ assert not com.is_bool_dtype(pd.Series([1, 2]))
+ assert not com.is_bool_dtype(np.array(['a', 'b']))
+ assert not com.is_bool_dtype(pd.Index(['a', 'b']))
+
+ assert com.is_bool_dtype(bool)
+ assert com.is_bool_dtype(np.bool)
+ assert com.is_bool_dtype(np.array([True, False]))
+ assert com.is_bool_dtype(pd.Index([True, False]))
+
+
[email protected]("check_scipy", [
+ False, pytest.param(True, marks=td.skip_if_no_scipy)
+])
+def test_is_extension_type(check_scipy):
+ assert not com.is_extension_type([1, 2, 3])
+ assert not com.is_extension_type(np.array([1, 2, 3]))
+ assert not com.is_extension_type(pd.DatetimeIndex([1, 2, 3]))
+
+ cat = pd.Categorical([1, 2, 3])
+ assert com.is_extension_type(cat)
+ assert com.is_extension_type(pd.Series(cat))
+ assert com.is_extension_type(pd.SparseArray([1, 2, 3]))
+ assert com.is_extension_type(pd.SparseSeries([1, 2, 3]))
+ assert com.is_extension_type(pd.DatetimeIndex(['2000'], tz="US/Eastern"))
+
+ dtype = DatetimeTZDtype("ns", tz="US/Eastern")
+ s = pd.Series([], dtype=dtype)
+ assert com.is_extension_type(s)
+
+ if check_scipy:
+ import scipy.sparse
+ assert not com.is_extension_type(scipy.sparse.bsr_matrix([1, 2, 3]))
+
+
+def test_is_complex_dtype():
+ assert not com.is_complex_dtype(int)
+ assert not com.is_complex_dtype(str)
+ assert not com.is_complex_dtype(pd.Series([1, 2]))
+ assert not com.is_complex_dtype(np.array(['a', 'b']))
+
+ assert com.is_complex_dtype(np.complex)
+ assert com.is_complex_dtype(np.array([1 + 1j, 5]))
+
+
+def test_is_offsetlike():
+ assert com.is_offsetlike(np.array([pd.DateOffset(month=3),
+ pd.offsets.Nano()]))
+ assert com.is_offsetlike(pd.offsets.MonthEnd())
+ assert com.is_offsetlike(pd.Index([pd.DateOffset(second=1)]))
+
+ assert not com.is_offsetlike(pd.Timedelta(1))
+ assert not com.is_offsetlike(np.array([1 + 1j, 5]))
+
+ # mixed case
+ assert not com.is_offsetlike(np.array([pd.DateOffset(), pd.Timestamp(0)]))
+
+
[email protected]('input_param,result', [
+ (int, np.dtype(int)),
+ ('int32', np.dtype('int32')),
+ (float, np.dtype(float)),
+ ('float64', np.dtype('float64')),
+ (np.dtype('float64'), np.dtype('float64')),
+ (str, np.dtype(str)),
+ (pd.Series([1, 2], dtype=np.dtype('int16')), np.dtype('int16')),
+ (pd.Series(['a', 'b']), np.dtype(object)),
+ (pd.Index([1, 2]), np.dtype('int64')),
+ (pd.Index(['a', 'b']), np.dtype(object)),
+ ('category', 'category'),
+ (pd.Categorical(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])),
+ (pd.Categorical(['a', 'b']), CategoricalDtype(['a', 'b'])),
+ (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])),
+ (pd.CategoricalIndex(['a', 'b']), CategoricalDtype(['a', 'b'])),
+ (CategoricalDtype(), CategoricalDtype()),
+ (CategoricalDtype(['a', 'b']), CategoricalDtype()),
+ (pd.DatetimeIndex([1, 2]), np.dtype('=M8[ns]')),
+ (pd.DatetimeIndex([1, 2]).dtype, np.dtype('=M8[ns]')),
+ ('<M8[ns]', np.dtype('<M8[ns]')),
+ ('datetime64[ns, Europe/London]', DatetimeTZDtype('ns', 'Europe/London')),
+ (pd.SparseSeries([1, 2], dtype='int32'), SparseDtype('int32')),
+ (pd.SparseSeries([1, 2], dtype='int32').dtype, SparseDtype('int32')),
+ (PeriodDtype(freq='D'), PeriodDtype(freq='D')),
+ ('period[D]', PeriodDtype(freq='D')),
+ (IntervalDtype(), IntervalDtype()),
+])
+def test__get_dtype(input_param, result):
+ assert com._get_dtype(input_param) == result
+
+
[email protected]('input_param', [None,
+ 1, 1.2,
+ 'random string',
+ pd.DataFrame([1, 2])])
+def test__get_dtype_fails(input_param):
+ # python objects
+ pytest.raises(TypeError, com._get_dtype, input_param)
+
+
[email protected]('input_param,result', [
+ (int, np.dtype(int).type),
+ ('int32', np.int32),
+ (float, np.dtype(float).type),
+ ('float64', np.float64),
+ (np.dtype('float64'), np.float64),
+ (str, np.dtype(str).type),
+ (pd.Series([1, 2], dtype=np.dtype('int16')), np.int16),
+ (pd.Series(['a', 'b']), np.object_),
+ (pd.Index([1, 2], dtype='int64'), np.int64),
+ (pd.Index(['a', 'b']), np.object_),
+ ('category', CategoricalDtypeType),
+ (pd.Categorical(['a', 'b']).dtype, CategoricalDtypeType),
+ (pd.Categorical(['a', 'b']), CategoricalDtypeType),
+ (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtypeType),
+ (pd.CategoricalIndex(['a', 'b']), CategoricalDtypeType),
+ (pd.DatetimeIndex([1, 2]), np.datetime64),
+ (pd.DatetimeIndex([1, 2]).dtype, np.datetime64),
+ ('<M8[ns]', np.datetime64),
+ (pd.DatetimeIndex(['2000'], tz='Europe/London'), pd.Timestamp),
+ (pd.DatetimeIndex(['2000'], tz='Europe/London').dtype,
+ pd.Timestamp),
+ ('datetime64[ns, Europe/London]', pd.Timestamp),
+ (pd.SparseSeries([1, 2], dtype='int32'), np.int32),
+ (pd.SparseSeries([1, 2], dtype='int32').dtype, np.int32),
+ (PeriodDtype(freq='D'), pd.Period),
+ ('period[D]', pd.Period),
+ (IntervalDtype(), pd.Interval),
+ (None, type(None)),
+ (1, type(None)),
+ (1.2, type(None)),
+ (pd.DataFrame([1, 2]), type(None)), # composite dtype
+])
+def test__is_dtype_type(input_param, result):
+ assert com._is_dtype_type(input_param, lambda tipo: tipo == result)
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/test_concat.py b/contrib/python/pandas/py2/pandas/tests/dtypes/test_concat.py
new file mode 100644
index 00000000000..d58f8ee3b74
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/test_concat.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import pandas.core.dtypes.concat as _concat
+
+from pandas import (
+ DatetimeIndex, Index, Period, PeriodIndex, Series, TimedeltaIndex)
+
+
[email protected]('to_concat, expected', [
+ # int/float/str
+ ([['a'], [1, 2]], ['i', 'object']),
+ ([[3, 4], [1, 2]], ['i']),
+ ([[3, 4], [1, 2.1]], ['i', 'f']),
+
+ # datetimelike
+ ([DatetimeIndex(['2011-01-01']), DatetimeIndex(['2011-01-02'])],
+ ['datetime']),
+ ([TimedeltaIndex(['1 days']), TimedeltaIndex(['2 days'])],
+ ['timedelta']),
+
+ # datetimelike object
+ ([DatetimeIndex(['2011-01-01']),
+ DatetimeIndex(['2011-01-02'], tz='US/Eastern')],
+ ['datetime', 'datetime64[ns, US/Eastern]']),
+ ([DatetimeIndex(['2011-01-01'], tz='Asia/Tokyo'),
+ DatetimeIndex(['2011-01-02'], tz='US/Eastern')],
+ ['datetime64[ns, Asia/Tokyo]', 'datetime64[ns, US/Eastern]']),
+ ([TimedeltaIndex(['1 days']), TimedeltaIndex(['2 hours'])],
+ ['timedelta']),
+ ([DatetimeIndex(['2011-01-01'], tz='Asia/Tokyo'),
+ TimedeltaIndex(['1 days'])],
+ ['datetime64[ns, Asia/Tokyo]', 'timedelta'])])
[email protected]('klass', [Index, Series])
+def test_get_dtype_kinds(klass, to_concat, expected):
+ to_concat_klass = [klass(c) for c in to_concat]
+ result = _concat.get_dtype_kinds(to_concat_klass)
+ assert result == set(expected)
+
+
[email protected]('to_concat, expected', [
+ ([PeriodIndex(['2011-01'], freq='M'),
+ PeriodIndex(['2011-01'], freq='M')], ['period[M]']),
+ ([Series([Period('2011-01', freq='M')]),
+ Series([Period('2011-02', freq='M')])], ['period[M]']),
+ ([PeriodIndex(['2011-01'], freq='M'),
+ PeriodIndex(['2011-01'], freq='D')], ['period[M]', 'period[D]']),
+ ([Series([Period('2011-01', freq='M')]),
+ Series([Period('2011-02', freq='D')])], ['period[M]', 'period[D]'])])
+def test_get_dtype_kinds_period(to_concat, expected):
+ result = _concat.get_dtype_kinds(to_concat)
+ assert result == set(expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/test_dtypes.py b/contrib/python/pandas/py2/pandas/tests/dtypes/test_dtypes.py
new file mode 100644
index 00000000000..71eaf504bdc
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/test_dtypes.py
@@ -0,0 +1,890 @@
+# -*- coding: utf-8 -*-
+import re
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.common import (
+ is_bool_dtype, is_categorical, is_categorical_dtype,
+ is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype,
+ is_datetime64tz_dtype, is_datetimetz, is_dtype_equal, is_interval_dtype,
+ is_period, is_period_dtype, is_string_dtype)
+from pandas.core.dtypes.dtypes import (
+ CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, registry)
+
+import pandas as pd
+from pandas import (
+ Categorical, CategoricalIndex, IntervalIndex, Series, date_range)
+from pandas.core.sparse.api import SparseDtype
+import pandas.util.testing as tm
+
+
[email protected](params=[True, False, None])
+def ordered(request):
+ return request.param
+
+
+class Base(object):
+
+ def setup_method(self, method):
+ self.dtype = self.create()
+
+ def test_hash(self):
+ hash(self.dtype)
+
+ def test_equality_invalid(self):
+ assert not self.dtype == 'foo'
+ assert not is_dtype_equal(self.dtype, np.int64)
+
+ def test_numpy_informed(self):
+ pytest.raises(TypeError, np.dtype, self.dtype)
+
+ assert not self.dtype == np.str_
+ assert not np.str_ == self.dtype
+
+ def test_pickle(self):
+ # make sure our cache is NOT pickled
+
+ # clear the cache
+ type(self.dtype).reset_cache()
+ assert not len(self.dtype._cache)
+
+ # force back to the cache
+ result = tm.round_trip_pickle(self.dtype)
+ assert not len(self.dtype._cache)
+ assert result == self.dtype
+
+
+class TestCategoricalDtype(Base):
+
+ def create(self):
+ return CategoricalDtype()
+
+ def test_pickle(self):
+ # make sure our cache is NOT pickled
+
+ # clear the cache
+ type(self.dtype).reset_cache()
+ assert not len(self.dtype._cache)
+
+ # force back to the cache
+ result = tm.round_trip_pickle(self.dtype)
+ assert result == self.dtype
+
+ def test_hash_vs_equality(self):
+ dtype = self.dtype
+ dtype2 = CategoricalDtype()
+ assert dtype == dtype2
+ assert dtype2 == dtype
+ assert hash(dtype) == hash(dtype2)
+
+ def test_equality(self):
+ assert is_dtype_equal(self.dtype, 'category')
+ assert is_dtype_equal(self.dtype, CategoricalDtype())
+ assert not is_dtype_equal(self.dtype, 'foo')
+
+ def test_construction_from_string(self):
+ result = CategoricalDtype.construct_from_string('category')
+ assert is_dtype_equal(self.dtype, result)
+ pytest.raises(
+ TypeError, lambda: CategoricalDtype.construct_from_string('foo'))
+
+ def test_constructor_invalid(self):
+ msg = "Parameter 'categories' must be list-like"
+ with pytest.raises(TypeError, match=msg):
+ CategoricalDtype("category")
+
+ dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
+ dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
+ c = Categorical([0, 1], dtype=dtype1, fastpath=True)
+
+ @pytest.mark.parametrize('values, categories, ordered, dtype, expected',
+ [
+ [None, None, None, None,
+ CategoricalDtype()],
+ [None, ['a', 'b'], True, None, dtype1],
+ [c, None, None, dtype2, dtype2],
+ [c, ['x', 'y'], False, None, dtype2],
+ ])
+ def test_from_values_or_dtype(
+ self, values, categories, ordered, dtype, expected):
+ result = CategoricalDtype._from_values_or_dtype(values, categories,
+ ordered, dtype)
+ assert result == expected
+
+ @pytest.mark.parametrize('values, categories, ordered, dtype', [
+ [None, ['a', 'b'], True, dtype2],
+ [None, ['a', 'b'], None, dtype2],
+ [None, None, True, dtype2],
+ ])
+ def test_from_values_or_dtype_raises(self, values, categories,
+ ordered, dtype):
+ msg = "Cannot specify `categories` or `ordered` together with `dtype`."
+ with pytest.raises(ValueError, match=msg):
+ CategoricalDtype._from_values_or_dtype(values, categories,
+ ordered, dtype)
+
+ def test_is_dtype(self):
+ assert CategoricalDtype.is_dtype(self.dtype)
+ assert CategoricalDtype.is_dtype('category')
+ assert CategoricalDtype.is_dtype(CategoricalDtype())
+ assert not CategoricalDtype.is_dtype('foo')
+ assert not CategoricalDtype.is_dtype(np.float64)
+
+ def test_basic(self):
+
+ assert is_categorical_dtype(self.dtype)
+
+ factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
+
+ s = Series(factor, name='A')
+
+ # dtypes
+ assert is_categorical_dtype(s.dtype)
+ assert is_categorical_dtype(s)
+ assert not is_categorical_dtype(np.dtype('float64'))
+
+ assert is_categorical(s.dtype)
+ assert is_categorical(s)
+ assert not is_categorical(np.dtype('float64'))
+ assert not is_categorical(1.0)
+
+ def test_tuple_categories(self):
+ categories = [(1, 'a'), (2, 'b'), (3, 'c')]
+ result = CategoricalDtype(categories)
+ assert all(result.categories == categories)
+
+ @pytest.mark.parametrize("categories, expected", [
+ ([True, False], True),
+ ([True, False, None], True),
+ ([True, False, "a", "b'"], False),
+ ([0, 1], False),
+ ])
+ def test_is_boolean(self, categories, expected):
+ cat = Categorical(categories)
+ assert cat.dtype._is_boolean is expected
+ assert is_bool_dtype(cat) is expected
+ assert is_bool_dtype(cat.dtype) is expected
+
+
+class TestDatetimeTZDtype(Base):
+
+ def create(self):
+ return DatetimeTZDtype('ns', 'US/Eastern')
+
+ def test_alias_to_unit_raises(self):
+ # 23990
+ with tm.assert_produces_warning(FutureWarning):
+ DatetimeTZDtype('datetime64[ns, US/Central]')
+
+ def test_alias_to_unit_bad_alias_raises(self):
+ # 23990
+ with pytest.raises(TypeError, match=''):
+ DatetimeTZDtype('this is a bad string')
+
+ with pytest.raises(TypeError, match=''):
+ DatetimeTZDtype('datetime64[ns, US/NotATZ]')
+
+ def test_hash_vs_equality(self):
+ # make sure that we satisfy is semantics
+ dtype = self.dtype
+ dtype2 = DatetimeTZDtype('ns', 'US/Eastern')
+ dtype3 = DatetimeTZDtype(dtype2)
+ assert dtype == dtype2
+ assert dtype2 == dtype
+ assert dtype3 == dtype
+ assert hash(dtype) == hash(dtype2)
+ assert hash(dtype) == hash(dtype3)
+
+ dtype4 = DatetimeTZDtype("ns", "US/Central")
+ assert dtype2 != dtype4
+ assert hash(dtype2) != hash(dtype4)
+
+ def test_construction(self):
+ pytest.raises(ValueError,
+ lambda: DatetimeTZDtype('ms', 'US/Eastern'))
+
+ def test_subclass(self):
+ a = DatetimeTZDtype.construct_from_string('datetime64[ns, US/Eastern]')
+ b = DatetimeTZDtype.construct_from_string('datetime64[ns, CET]')
+
+ assert issubclass(type(a), type(a))
+ assert issubclass(type(a), type(b))
+
+ def test_compat(self):
+ assert is_datetime64tz_dtype(self.dtype)
+ assert is_datetime64tz_dtype('datetime64[ns, US/Eastern]')
+ assert is_datetime64_any_dtype(self.dtype)
+ assert is_datetime64_any_dtype('datetime64[ns, US/Eastern]')
+ assert is_datetime64_ns_dtype(self.dtype)
+ assert is_datetime64_ns_dtype('datetime64[ns, US/Eastern]')
+ assert not is_datetime64_dtype(self.dtype)
+ assert not is_datetime64_dtype('datetime64[ns, US/Eastern]')
+
+ def test_construction_from_string(self):
+ result = DatetimeTZDtype.construct_from_string(
+ 'datetime64[ns, US/Eastern]')
+ assert is_dtype_equal(self.dtype, result)
+ pytest.raises(TypeError,
+ lambda: DatetimeTZDtype.construct_from_string('foo'))
+
+ def test_construct_from_string_raises(self):
+ with pytest.raises(TypeError, match="notatz"):
+ DatetimeTZDtype.construct_from_string('datetime64[ns, notatz]')
+
+ with pytest.raises(TypeError,
+ match="^Could not construct DatetimeTZDtype$"):
+ DatetimeTZDtype.construct_from_string(['datetime64[ns, notatz]'])
+
+ def test_is_dtype(self):
+ assert not DatetimeTZDtype.is_dtype(None)
+ assert DatetimeTZDtype.is_dtype(self.dtype)
+ assert DatetimeTZDtype.is_dtype('datetime64[ns, US/Eastern]')
+ assert not DatetimeTZDtype.is_dtype('foo')
+ assert DatetimeTZDtype.is_dtype(DatetimeTZDtype('ns', 'US/Pacific'))
+ assert not DatetimeTZDtype.is_dtype(np.float64)
+
+ def test_equality(self):
+ assert is_dtype_equal(self.dtype, 'datetime64[ns, US/Eastern]')
+ assert is_dtype_equal(self.dtype, DatetimeTZDtype('ns', 'US/Eastern'))
+ assert not is_dtype_equal(self.dtype, 'foo')
+ assert not is_dtype_equal(self.dtype, DatetimeTZDtype('ns', 'CET'))
+ assert not is_dtype_equal(DatetimeTZDtype('ns', 'US/Eastern'),
+ DatetimeTZDtype('ns', 'US/Pacific'))
+
+ # numpy compat
+ assert is_dtype_equal(np.dtype("M8[ns]"), "datetime64[ns]")
+
+ def test_basic(self):
+
+ assert is_datetime64tz_dtype(self.dtype)
+
+ dr = date_range('20130101', periods=3, tz='US/Eastern')
+ s = Series(dr, name='A')
+
+ # dtypes
+ assert is_datetime64tz_dtype(s.dtype)
+ assert is_datetime64tz_dtype(s)
+ assert not is_datetime64tz_dtype(np.dtype('float64'))
+ assert not is_datetime64tz_dtype(1.0)
+
+ with tm.assert_produces_warning(FutureWarning):
+ assert is_datetimetz(s)
+ assert is_datetimetz(s.dtype)
+ assert not is_datetimetz(np.dtype('float64'))
+ assert not is_datetimetz(1.0)
+
+ def test_dst(self):
+
+ dr1 = date_range('2013-01-01', periods=3, tz='US/Eastern')
+ s1 = Series(dr1, name='A')
+ assert is_datetime64tz_dtype(s1)
+ with tm.assert_produces_warning(FutureWarning):
+ assert is_datetimetz(s1)
+
+ dr2 = date_range('2013-08-01', periods=3, tz='US/Eastern')
+ s2 = Series(dr2, name='A')
+ assert is_datetime64tz_dtype(s2)
+ with tm.assert_produces_warning(FutureWarning):
+ assert is_datetimetz(s2)
+ assert s1.dtype == s2.dtype
+
+ @pytest.mark.parametrize('tz', ['UTC', 'US/Eastern'])
+ @pytest.mark.parametrize('constructor', ['M8', 'datetime64'])
+ def test_parser(self, tz, constructor):
+ # pr #11245
+ dtz_str = '{con}[ns, {tz}]'.format(con=constructor, tz=tz)
+ result = DatetimeTZDtype.construct_from_string(dtz_str)
+ expected = DatetimeTZDtype('ns', tz)
+ assert result == expected
+
+ def test_empty(self):
+ with pytest.raises(TypeError, match="A 'tz' is required."):
+ DatetimeTZDtype()
+
+
+class TestPeriodDtype(Base):
+
+ def create(self):
+ return PeriodDtype('D')
+
+ def test_hash_vs_equality(self):
+ # make sure we satisfy `is` semantics: equal dtypes are the same cached instance
+ dtype = self.dtype
+ dtype2 = PeriodDtype('D')
+ dtype3 = PeriodDtype(dtype2)
+ assert dtype == dtype2
+ assert dtype2 == dtype
+ assert dtype3 == dtype
+ assert dtype is dtype2
+ assert dtype2 is dtype
+ assert dtype3 is dtype
+ assert hash(dtype) == hash(dtype2)
+ assert hash(dtype) == hash(dtype3)
+
+ def test_construction(self):
+ with pytest.raises(ValueError):
+ PeriodDtype('xx')
+
+ for s in ['period[D]', 'Period[D]', 'D']:
+ dt = PeriodDtype(s)
+ assert dt.freq == pd.tseries.offsets.Day()
+ assert is_period_dtype(dt)
+
+ for s in ['period[3D]', 'Period[3D]', '3D']:
+ dt = PeriodDtype(s)
+ assert dt.freq == pd.tseries.offsets.Day(3)
+ assert is_period_dtype(dt)
+
+ for s in ['period[26H]', 'Period[26H]', '26H',
+ 'period[1D2H]', 'Period[1D2H]', '1D2H']:
+ dt = PeriodDtype(s)
+ assert dt.freq == pd.tseries.offsets.Hour(26)
+ assert is_period_dtype(dt)
+
+ def test_subclass(self):
+ a = PeriodDtype('period[D]')
+ b = PeriodDtype('period[3D]')
+
+ assert issubclass(type(a), type(a))
+ assert issubclass(type(a), type(b))
+
+ def test_identity(self):
+ assert PeriodDtype('period[D]') == PeriodDtype('period[D]')
+ assert PeriodDtype('period[D]') is PeriodDtype('period[D]')
+
+ assert PeriodDtype('period[3D]') == PeriodDtype('period[3D]')
+ assert PeriodDtype('period[3D]') is PeriodDtype('period[3D]')
+
+ assert PeriodDtype('period[1S1U]') == PeriodDtype('period[1000001U]')
+ assert PeriodDtype('period[1S1U]') is PeriodDtype('period[1000001U]')
+
+ def test_compat(self):
+ assert not is_datetime64_ns_dtype(self.dtype)
+ assert not is_datetime64_ns_dtype('period[D]')
+ assert not is_datetime64_dtype(self.dtype)
+ assert not is_datetime64_dtype('period[D]')
+
+ def test_construction_from_string(self):
+ result = PeriodDtype('period[D]')
+ assert is_dtype_equal(self.dtype, result)
+ result = PeriodDtype.construct_from_string('period[D]')
+ assert is_dtype_equal(self.dtype, result)
+ with pytest.raises(TypeError):
+ PeriodDtype.construct_from_string('foo')
+ with pytest.raises(TypeError):
+ PeriodDtype.construct_from_string('period[foo]')
+ with pytest.raises(TypeError):
+ PeriodDtype.construct_from_string('foo[D]')
+
+ with pytest.raises(TypeError):
+ PeriodDtype.construct_from_string('datetime64[ns]')
+ with pytest.raises(TypeError):
+ PeriodDtype.construct_from_string('datetime64[ns, US/Eastern]')
+
+ def test_is_dtype(self):
+ assert PeriodDtype.is_dtype(self.dtype)
+ assert PeriodDtype.is_dtype('period[D]')
+ assert PeriodDtype.is_dtype('period[3D]')
+ assert PeriodDtype.is_dtype(PeriodDtype('3D'))
+ assert PeriodDtype.is_dtype('period[U]')
+ assert PeriodDtype.is_dtype('period[S]')
+ assert PeriodDtype.is_dtype(PeriodDtype('U'))
+ assert PeriodDtype.is_dtype(PeriodDtype('S'))
+
+ assert not PeriodDtype.is_dtype('D')
+ assert not PeriodDtype.is_dtype('3D')
+ assert not PeriodDtype.is_dtype('U')
+ assert not PeriodDtype.is_dtype('S')
+ assert not PeriodDtype.is_dtype('foo')
+ assert not PeriodDtype.is_dtype(np.object_)
+ assert not PeriodDtype.is_dtype(np.int64)
+ assert not PeriodDtype.is_dtype(np.float64)
+
+ def test_equality(self):
+ assert is_dtype_equal(self.dtype, 'period[D]')
+ assert is_dtype_equal(self.dtype, PeriodDtype('D'))
+ assert is_dtype_equal(self.dtype, PeriodDtype('D'))
+ assert is_dtype_equal(PeriodDtype('D'), PeriodDtype('D'))
+
+ assert not is_dtype_equal(self.dtype, 'D')
+ assert not is_dtype_equal(PeriodDtype('D'), PeriodDtype('2D'))
+
+ def test_basic(self):
+ assert is_period_dtype(self.dtype)
+
+ pidx = pd.period_range('2013-01-01 09:00', periods=5, freq='H')
+
+ assert is_period_dtype(pidx.dtype)
+ assert is_period_dtype(pidx)
+ with tm.assert_produces_warning(FutureWarning):
+ assert is_period(pidx)
+
+ s = Series(pidx, name='A')
+
+ assert is_period_dtype(s.dtype)
+ assert is_period_dtype(s)
+ with tm.assert_produces_warning(FutureWarning):
+ assert is_period(s)
+
+ assert not is_period_dtype(np.dtype('float64'))
+ assert not is_period_dtype(1.0)
+ with tm.assert_produces_warning(FutureWarning):
+ assert not is_period(np.dtype('float64'))
+ with tm.assert_produces_warning(FutureWarning):
+ assert not is_period(1.0)
+
+ def test_empty(self):
+ dt = PeriodDtype()
+ with pytest.raises(AttributeError):
+ str(dt)
+
+ def test_not_string(self):
+ # though PeriodDtype has object kind, it is not a string dtype
+ assert not is_string_dtype(PeriodDtype('D'))
+
+
+class TestIntervalDtype(Base):
+
+ def create(self):
+ return IntervalDtype('int64')
+
+ def test_hash_vs_equality(self):
+ # make sure we satisfy `is` semantics: equal dtypes are the same cached instance
+ dtype = self.dtype
+ dtype2 = IntervalDtype('int64')
+ dtype3 = IntervalDtype(dtype2)
+ assert dtype == dtype2
+ assert dtype2 == dtype
+ assert dtype3 == dtype
+ assert dtype is dtype2
+ assert dtype2 is dtype3
+ assert dtype3 is dtype
+ assert hash(dtype) == hash(dtype2)
+ assert hash(dtype) == hash(dtype3)
+
+ dtype1 = IntervalDtype('interval')
+ dtype2 = IntervalDtype(dtype1)
+ dtype3 = IntervalDtype('interval')
+ assert dtype2 == dtype1
+ assert dtype2 == dtype2
+ assert dtype2 == dtype3
+ assert dtype2 is dtype1
+ assert dtype2 is dtype2
+ assert dtype2 is dtype3
+ assert hash(dtype2) == hash(dtype1)
+ assert hash(dtype2) == hash(dtype2)
+ assert hash(dtype2) == hash(dtype3)
+
+ @pytest.mark.parametrize('subtype', [
+ 'interval[int64]', 'Interval[int64]', 'int64', np.dtype('int64')])
+ def test_construction(self, subtype):
+ i = IntervalDtype(subtype)
+ assert i.subtype == np.dtype('int64')
+ assert is_interval_dtype(i)
+
+ @pytest.mark.parametrize('subtype', [None, 'interval', 'Interval'])
+ def test_construction_generic(self, subtype):
+ # generic
+ i = IntervalDtype(subtype)
+ assert i.subtype is None
+ assert is_interval_dtype(i)
+
+ @pytest.mark.parametrize('subtype', [
+ CategoricalDtype(list('abc'), False),
+ CategoricalDtype(list('wxyz'), True),
+ object, str, '<U10', 'interval[category]', 'interval[object]'])
+ def test_construction_not_supported(self, subtype):
+ # GH 19016
+ msg = ('category, object, and string subtypes are not supported '
+ 'for IntervalDtype')
+ with pytest.raises(TypeError, match=msg):
+ IntervalDtype(subtype)
+
+ @pytest.mark.parametrize('subtype', ['xx', 'IntervalA', 'Interval[foo]'])
+ def test_construction_errors(self, subtype):
+ msg = 'could not construct IntervalDtype'
+ with pytest.raises(TypeError, match=msg):
+ IntervalDtype(subtype)
+
+ def test_construction_from_string(self):
+ result = IntervalDtype('interval[int64]')
+ assert is_dtype_equal(self.dtype, result)
+ result = IntervalDtype.construct_from_string('interval[int64]')
+ assert is_dtype_equal(self.dtype, result)
+
+ @pytest.mark.parametrize('string', [
+ 0, 3.14, ('a', 'b'), None])
+ def test_construction_from_string_errors(self, string):
+ # these are invalid entirely
+ msg = 'a string needs to be passed, got type'
+
+ with pytest.raises(TypeError, match=msg):
+ IntervalDtype.construct_from_string(string)
+
+ @pytest.mark.parametrize('string', [
+ 'foo', 'foo[int64]', 'IntervalA'])
+ def test_construction_from_string_error_subtype(self, string):
+ # this is an invalid subtype
+ msg = ("Incorrectly formatted string passed to constructor. "
+ r"Valid formats include Interval or Interval\[dtype\] "
+ "where dtype is numeric, datetime, or timedelta")
+
+ with pytest.raises(TypeError, match=msg):
+ IntervalDtype.construct_from_string(string)
+
+ def test_subclass(self):
+ a = IntervalDtype('interval[int64]')
+ b = IntervalDtype('interval[int64]')
+
+ assert issubclass(type(a), type(a))
+ assert issubclass(type(a), type(b))
+
+ def test_is_dtype(self):
+ assert IntervalDtype.is_dtype(self.dtype)
+ assert IntervalDtype.is_dtype('interval')
+ assert IntervalDtype.is_dtype(IntervalDtype('float64'))
+ assert IntervalDtype.is_dtype(IntervalDtype('int64'))
+ assert IntervalDtype.is_dtype(IntervalDtype(np.int64))
+
+ assert not IntervalDtype.is_dtype('D')
+ assert not IntervalDtype.is_dtype('3D')
+ assert not IntervalDtype.is_dtype('U')
+ assert not IntervalDtype.is_dtype('S')
+ assert not IntervalDtype.is_dtype('foo')
+ assert not IntervalDtype.is_dtype('IntervalA')
+ assert not IntervalDtype.is_dtype(np.object_)
+ assert not IntervalDtype.is_dtype(np.int64)
+ assert not IntervalDtype.is_dtype(np.float64)
+
+ def test_equality(self):
+ assert is_dtype_equal(self.dtype, 'interval[int64]')
+ assert is_dtype_equal(self.dtype, IntervalDtype('int64'))
+ assert is_dtype_equal(IntervalDtype('int64'), IntervalDtype('int64'))
+
+ assert not is_dtype_equal(self.dtype, 'int64')
+ assert not is_dtype_equal(IntervalDtype('int64'),
+ IntervalDtype('float64'))
+
+ # invalid subtype comparisons do not raise when directly compared
+ dtype1 = IntervalDtype('float64')
+ dtype2 = IntervalDtype('datetime64[ns, US/Eastern]')
+ assert dtype1 != dtype2
+ assert dtype2 != dtype1
+
+ @pytest.mark.parametrize('subtype', [
+ None, 'interval', 'Interval', 'int64', 'uint64', 'float64',
+ 'complex128', 'datetime64', 'timedelta64', PeriodDtype('Q')])
+ def test_equality_generic(self, subtype):
+ # GH 18980
+ dtype = IntervalDtype(subtype)
+ assert is_dtype_equal(dtype, 'interval')
+ assert is_dtype_equal(dtype, IntervalDtype())
+
+ @pytest.mark.parametrize('subtype', [
+ 'int64', 'uint64', 'float64', 'complex128', 'datetime64',
+ 'timedelta64', PeriodDtype('Q')])
+ def test_name_repr(self, subtype):
+ # GH 18980
+ dtype = IntervalDtype(subtype)
+ expected = 'interval[{subtype}]'.format(subtype=subtype)
+ assert str(dtype) == expected
+ assert dtype.name == 'interval'
+
+ @pytest.mark.parametrize('subtype', [None, 'interval', 'Interval'])
+ def test_name_repr_generic(self, subtype):
+ # GH 18980
+ dtype = IntervalDtype(subtype)
+ assert str(dtype) == 'interval'
+ assert dtype.name == 'interval'
+
+ def test_basic(self):
+ assert is_interval_dtype(self.dtype)
+
+ ii = IntervalIndex.from_breaks(range(3))
+
+ assert is_interval_dtype(ii.dtype)
+ assert is_interval_dtype(ii)
+
+ s = Series(ii, name='A')
+
+ assert is_interval_dtype(s.dtype)
+ assert is_interval_dtype(s)
+
+ def test_basic_dtype(self):
+ assert is_interval_dtype('interval[int64]')
+ assert is_interval_dtype(IntervalIndex.from_tuples([(0, 1)]))
+ assert is_interval_dtype(IntervalIndex.from_breaks(np.arange(4)))
+ assert is_interval_dtype(IntervalIndex.from_breaks(
+ date_range('20130101', periods=3)))
+ assert not is_interval_dtype('U')
+ assert not is_interval_dtype('S')
+ assert not is_interval_dtype('foo')
+ assert not is_interval_dtype(np.object_)
+ assert not is_interval_dtype(np.int64)
+ assert not is_interval_dtype(np.float64)
+
+ def test_caching(self):
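+ # IntervalDtype keeps one cached instance per distinct subtype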
+ IntervalDtype.reset_cache()
+ dtype = IntervalDtype("int64")
+ assert len(IntervalDtype._cache) == 1
+
+ IntervalDtype("interval")
+ assert len(IntervalDtype._cache) == 2
+
+ IntervalDtype.reset_cache()
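+ # a pickle round-trip must not repopulate the cache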
+ tm.round_trip_pickle(dtype)
+ assert len(IntervalDtype._cache) == 0
+
+
+class TestCategoricalDtypeParametrized(object):
+
+ @pytest.mark.parametrize('categories', [
+ list('abcd'),
+ np.arange(1000),
+ ['a', 'b', 10, 2, 1.3, True],
+ [True, False],
+ pd.date_range('2017', periods=4)])
+ def test_basic(self, categories, ordered):
+ c1 = CategoricalDtype(categories, ordered=ordered)
+ tm.assert_index_equal(c1.categories, pd.Index(categories))
+ assert c1.ordered is ordered
+
+ def test_order_matters(self):
+ categories = ['a', 'b']
+ c1 = CategoricalDtype(categories, ordered=True)
+ c2 = CategoricalDtype(categories, ordered=False)
+ c3 = CategoricalDtype(categories, ordered=None)
+ assert c1 is not c2
+ assert c1 is not c3
+
+ @pytest.mark.parametrize('ordered', [False, None])
+ def test_unordered_same(self, ordered):
+ c1 = CategoricalDtype(['a', 'b'], ordered=ordered)
+ c2 = CategoricalDtype(['b', 'a'], ordered=ordered)
+ assert hash(c1) == hash(c2)
+
+ def test_categories(self):
+ result = CategoricalDtype(['a', 'b', 'c'])
+ tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c']))
+ assert result.ordered is None
+
+ def test_equal_but_different(self, ordered):
+ c1 = CategoricalDtype([1, 2, 3])
+ c2 = CategoricalDtype([1., 2., 3.])
+ assert c1 is not c2
+ assert c1 != c2
+
+ @pytest.mark.parametrize('v1, v2', [
+ ([1, 2, 3], [1, 2, 3]),
+ ([1, 2, 3], [3, 2, 1]),
+ ])
+ def test_order_hashes_different(self, v1, v2):
+ c1 = CategoricalDtype(v1, ordered=False)
+ c2 = CategoricalDtype(v2, ordered=True)
+ c3 = CategoricalDtype(v1, ordered=None)
+ assert c1 is not c2
+ assert c1 is not c3
+
+ def test_nan_invalid(self):
+ with pytest.raises(ValueError):
+ CategoricalDtype([1, 2, np.nan])
+
+ def test_non_unique_invalid(self):
+ with pytest.raises(ValueError):
+ CategoricalDtype([1, 2, 1])
+
+ def test_same_categories_different_order(self):
+ c1 = CategoricalDtype(['a', 'b'], ordered=True)
+ c2 = CategoricalDtype(['b', 'a'], ordered=True)
+ assert c1 is not c2
+
+ @pytest.mark.parametrize('ordered1', [True, False, None])
+ @pytest.mark.parametrize('ordered2', [True, False, None])
+ def test_categorical_equality(self, ordered1, ordered2):
+ # same categories, same order
+ # any combination of None/False are equal
+ # True/True is the only combination with True that are equal
+ c1 = CategoricalDtype(list('abc'), ordered1)
+ c2 = CategoricalDtype(list('abc'), ordered2)
+ result = c1 == c2
+ expected = bool(ordered1) is bool(ordered2)
+ assert result is expected
+
+ # same categories, different order
+ # any combination of None/False are equal (order doesn't matter)
+ # any combination with True are not equal (different order of cats)
+ c1 = CategoricalDtype(list('abc'), ordered1)
+ c2 = CategoricalDtype(list('cab'), ordered2)
+ result = c1 == c2
+ expected = (bool(ordered1) is False) and (bool(ordered2) is False)
+ assert result is expected
+
+ # different categories
+ c2 = CategoricalDtype([1, 2, 3], ordered2)
+ assert c1 != c2
+
+ # None categories
+ c1 = CategoricalDtype(list('abc'), ordered1)
+ c2 = CategoricalDtype(None, ordered2)
+ c3 = CategoricalDtype(None, ordered1)
+ assert c1 == c2
+ assert c2 == c1
+ assert c2 == c3
+
+ @pytest.mark.parametrize('categories', [list('abc'), None])
+ @pytest.mark.parametrize('other', ['category', 'not a category'])
+ def test_categorical_equality_strings(self, categories, ordered, other):
+ c1 = CategoricalDtype(categories, ordered)
+ result = c1 == other
+ expected = other == 'category'
+ assert result is expected
+
+ def test_invalid_raises(self):
+ with pytest.raises(TypeError, match='ordered'):
+ CategoricalDtype(['a', 'b'], ordered='foo')
+
+ with pytest.raises(TypeError, match="'categories' must be list-like"):
+ CategoricalDtype('category')
+
+ def test_mixed(self):
+ a = CategoricalDtype(['a', 'b', 1, 2])
+ b = CategoricalDtype(['a', 'b', '1', '2'])
+ assert hash(a) != hash(b)
+
+ def test_from_categorical_dtype_identity(self):
+ c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+ # Identity test for no changes
+ c2 = CategoricalDtype._from_categorical_dtype(c1)
+ assert c2 is c1
+
+ def test_from_categorical_dtype_categories(self):
+ c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+ # override categories
+ result = CategoricalDtype._from_categorical_dtype(
+ c1, categories=[2, 3])
+ assert result == CategoricalDtype([2, 3], ordered=True)
+
+ def test_from_categorical_dtype_ordered(self):
+ c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+ # override ordered
+ result = CategoricalDtype._from_categorical_dtype(
+ c1, ordered=False)
+ assert result == CategoricalDtype([1, 2, 3], ordered=False)
+
+ def test_from_categorical_dtype_both(self):
+ c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+ # override ordered
+ result = CategoricalDtype._from_categorical_dtype(
+ c1, categories=[1, 2], ordered=False)
+ assert result == CategoricalDtype([1, 2], ordered=False)
+
+ def test_str_vs_repr(self, ordered):
+ c1 = CategoricalDtype(['a', 'b'], ordered=ordered)
+ assert str(c1) == 'category'
+ # Py2 will have unicode prefixes
+ pat = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)"
+ assert re.match(pat.format(ordered=ordered), repr(c1))
+
+ def test_categorical_categories(self):
+ # GH17884
+ c1 = CategoricalDtype(Categorical(['a', 'b']))
+ tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
+ c1 = CategoricalDtype(CategoricalIndex(['a', 'b']))
+ tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
+
+ @pytest.mark.parametrize('new_categories', [
+ list('abc'), list('cba'), list('wxyz'), None])
+ @pytest.mark.parametrize('new_ordered', [True, False, None])
+ def test_update_dtype(self, ordered, new_categories, new_ordered):
+ dtype = CategoricalDtype(list('abc'), ordered)
+ new_dtype = CategoricalDtype(new_categories, new_ordered)
+
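+ # None in the new dtype means "keep the value from the original dtype"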
+ expected_categories = new_dtype.categories
+ if expected_categories is None:
+ expected_categories = dtype.categories
+
+ expected_ordered = new_dtype.ordered
+ if expected_ordered is None:
+ expected_ordered = dtype.ordered
+
+ result = dtype.update_dtype(new_dtype)
+ tm.assert_index_equal(result.categories, expected_categories)
+ assert result.ordered is expected_ordered
+
+ def test_update_dtype_string(self, ordered):
+ dtype = CategoricalDtype(list('abc'), ordered)
+ expected_categories = dtype.categories
+ expected_ordered = dtype.ordered
+ result = dtype.update_dtype('category')
+ tm.assert_index_equal(result.categories, expected_categories)
+ assert result.ordered is expected_ordered
+
+ @pytest.mark.parametrize('bad_dtype', [
+ 'foo', object, np.int64, PeriodDtype('Q')])
+ def test_update_dtype_errors(self, bad_dtype):
+ dtype = CategoricalDtype(list('abc'), False)
+ msg = 'a CategoricalDtype must be passed to perform an update, '
+ with pytest.raises(ValueError, match=msg):
+ dtype.update_dtype(bad_dtype)
+
+
[email protected]('dtype', [
+ CategoricalDtype,
+ IntervalDtype,
+ DatetimeTZDtype,
+ PeriodDtype,
+])
+def test_registry(dtype):
+ assert dtype in registry.dtypes
+
+
[email protected]('dtype, expected', [
+ ('int64', None),
+ ('interval', IntervalDtype()),
+ ('interval[int64]', IntervalDtype()),
+ ('interval[datetime64[ns]]', IntervalDtype('datetime64[ns]')),
+ ('period[D]', PeriodDtype('D')),
+ ('category', CategoricalDtype()),
+ ('datetime64[ns, US/Eastern]', DatetimeTZDtype('ns', 'US/Eastern')),
+])
+def test_registry_find(dtype, expected):
+ assert registry.find(dtype) == expected
+
+
[email protected]('dtype, expected', [
+ (str, False),
+ (int, False),
+ (bool, True),
+ (np.bool, True),
+ (np.array(['a', 'b']), False),
+ (pd.Series([1, 2]), False),
+ (np.array([True, False]), True),
+ (pd.Series([True, False]), True),
+ (pd.SparseSeries([True, False]), True),
+ (pd.SparseArray([True, False]), True),
+ (SparseDtype(bool), True)
+])
+def test_is_bool_dtype(dtype, expected):
+ result = is_bool_dtype(dtype)
+ assert result is expected
+
+
[email protected]('check', [
+ is_categorical_dtype,
+ is_datetime64tz_dtype,
+ is_period_dtype,
+ is_datetime64_ns_dtype,
+ is_datetime64_dtype,
+ is_interval_dtype,
+ is_datetime64_any_dtype,
+ is_string_dtype,
+ is_bool_dtype,
+])
+def test_is_dtype_no_warning(check):
+ data = pd.DataFrame({"A": [1, 2]})
+ with tm.assert_produces_warning(None):
+ check(data)
+
+ with tm.assert_produces_warning(None):
+ check(data["A"])
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/test_generic.py b/contrib/python/pandas/py2/pandas/tests/dtypes/test_generic.py
new file mode 100644
index 00000000000..1622088d05f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/test_generic.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+
+from pandas.core.dtypes import generic as gt
+
+import pandas as pd
+from pandas.util import testing as tm
+
+
+class TestABCClasses(object):
+ tuples = [[1, 2, 2], ['red', 'blue', 'red']]
+ multi_index = pd.MultiIndex.from_arrays(tuples, names=('number', 'color'))
+ datetime_index = pd.to_datetime(['2000/1/1', '2010/1/1'])
+ timedelta_index = pd.to_timedelta(np.arange(5), unit='s')
+ period_index = pd.period_range('2000/1/1', '2010/1/1/', freq='M')
+ categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1])
+ categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical)
+ df = pd.DataFrame({'names': ['a', 'b', 'c']}, index=multi_index)
+ sparse_series = pd.Series([1, 2, 3]).to_sparse()
+ sparse_array = pd.SparseArray(np.random.randn(10))
+ sparse_frame = pd.SparseDataFrame({'a': [1, -1, None]})
+ datetime_array = pd.core.arrays.DatetimeArray(datetime_index)
+ timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index)
+
+ def test_abc_types(self):
+ assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex)
+ assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index)
+ assert isinstance(pd.UInt64Index([1, 2, 3]), gt.ABCUInt64Index)
+ assert isinstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index)
+ assert isinstance(self.multi_index, gt.ABCMultiIndex)
+ assert isinstance(self.datetime_index, gt.ABCDatetimeIndex)
+ assert isinstance(self.timedelta_index, gt.ABCTimedeltaIndex)
+ assert isinstance(self.period_index, gt.ABCPeriodIndex)
+ assert isinstance(self.categorical_df.index, gt.ABCCategoricalIndex)
+ assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndexClass)
+ assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCIndexClass)
+ assert isinstance(pd.Series([1, 2, 3]), gt.ABCSeries)
+ assert isinstance(self.df, gt.ABCDataFrame)
+ with catch_warnings(record=True):
+ simplefilter('ignore', FutureWarning)
+ assert isinstance(self.df.to_panel(), gt.ABCPanel)
+ assert isinstance(self.sparse_series, gt.ABCSparseSeries)
+ assert isinstance(self.sparse_array, gt.ABCSparseArray)
+ assert isinstance(self.sparse_frame, gt.ABCSparseDataFrame)
+ assert isinstance(self.categorical, gt.ABCCategorical)
+ assert isinstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod)
+
+ assert isinstance(pd.DateOffset(), gt.ABCDateOffset)
+ assert isinstance(pd.Period('2012', freq='A-DEC').freq,
+ gt.ABCDateOffset)
+ assert not isinstance(pd.Period('2012', freq='A-DEC'),
+ gt.ABCDateOffset)
+ assert isinstance(pd.Interval(0, 1.5), gt.ABCInterval)
+ assert not isinstance(pd.Period('2012', freq='A-DEC'), gt.ABCInterval)
+
+ assert isinstance(self.datetime_array, gt.ABCDatetimeArray)
+ assert not isinstance(self.datetime_index, gt.ABCDatetimeArray)
+
+ assert isinstance(self.timedelta_array, gt.ABCTimedeltaArray)
+ assert not isinstance(self.timedelta_index, gt.ABCTimedeltaArray)
+
+
+def test_setattr_warnings():
+ # GH7175 - GOTCHA: You can't use dot notation to add a column...
+ d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
+ 'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
+ df = pd.DataFrame(d)
+
+ with catch_warnings(record=True) as w:
+ # successfully add new column
+ # this should not raise a warning
+ df['three'] = df.two + 1
+ assert len(w) == 0
+ assert df.three.sum() > df.two.sum()
+
+ with catch_warnings(record=True) as w:
+ # successfully modify column in place
+ # this should not raise a warning
+ df.one += 1
+ assert len(w) == 0
+ assert df.one.iloc[0] == 2
+
+ with catch_warnings(record=True) as w:
+ # successfully add an attribute to a series
+ # this should not raise a warning
+ df.two.not_an_index = [1, 2]
+ assert len(w) == 0
+
+ with tm.assert_produces_warning(UserWarning):
+ # warn when setting column to nonexistent name
+ df.four = df.two + 2
+ assert df.four.sum() > df.two.sum()
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/test_inference.py b/contrib/python/pandas/py2/pandas/tests/dtypes/test_inference.py
new file mode 100644
index 00000000000..49a66efaffc
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/test_inference.py
@@ -0,0 +1,1333 @@
+# -*- coding: utf-8 -*-
+
+"""
+These test the public routines exposed in types/common.py
+related to inference and not otherwise tested in types/test_common.py
+
+"""
+import collections
+from datetime import date, datetime, time, timedelta
+from decimal import Decimal
+from fractions import Fraction
+from numbers import Number
+import re
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs import iNaT, lib, missing as libmissing
+from pandas.compat import PY2, StringIO, lrange, u
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes import inference
+from pandas.core.dtypes.common import (
+ ensure_categorical, ensure_int32, is_bool, is_datetime64_any_dtype,
+ is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype,
+ is_float, is_integer, is_number, is_scalar, is_scipy_sparse,
+ is_timedelta64_dtype, is_timedelta64_ns_dtype)
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, DateOffset, DatetimeIndex, Index, Interval, Panel,
+ Period, Series, Timedelta, TimedeltaIndex, Timestamp, compat, isna)
+from pandas.util import testing as tm
+
+
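+# boolean fixture: runs each test requesting 'coerce' with both True and False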
[email protected](params=[True, False], ids=str)
+def coerce(request):
+ return request.param
+
+
+# collect all objects to be tested for list-like-ness; use tuples of objects,
+# whether they are list-like or not (special casing for sets), and their ID
+ll_params = [
+ ([1], True, 'list'), # noqa: E241
+ ([], True, 'list-empty'), # noqa: E241
+ ((1, ), True, 'tuple'), # noqa: E241
+ (tuple(), True, 'tuple-empty'), # noqa: E241
+ ({'a': 1}, True, 'dict'), # noqa: E241
+ (dict(), True, 'dict-empty'), # noqa: E241
+ ({'a', 1}, 'set', 'set'), # noqa: E241
+ (set(), 'set', 'set-empty'), # noqa: E241
+ (frozenset({'a', 1}), 'set', 'frozenset'), # noqa: E241
+ (frozenset(), 'set', 'frozenset-empty'), # noqa: E241
+ (iter([1, 2]), True, 'iterator'), # noqa: E241
+ (iter([]), True, 'iterator-empty'), # noqa: E241
+ ((x for x in [1, 2]), True, 'generator'), # noqa: E241
+ ((x for x in []), True, 'generator-empty'), # noqa: E241
+ (Series([1]), True, 'Series'), # noqa: E241
+ (Series([]), True, 'Series-empty'), # noqa: E241
+ (Series(['a']).str, True, 'StringMethods'), # noqa: E241
+ (Series([], dtype='O').str, True, 'StringMethods-empty'), # noqa: E241
+ (Index([1]), True, 'Index'), # noqa: E241
+ (Index([]), True, 'Index-empty'), # noqa: E241
+ (DataFrame([[1]]), True, 'DataFrame'), # noqa: E241
+ (DataFrame(), True, 'DataFrame-empty'), # noqa: E241
+ (np.ndarray((2,) * 1), True, 'ndarray-1d'), # noqa: E241
+ (np.array([]), True, 'ndarray-1d-empty'), # noqa: E241
+ (np.ndarray((2,) * 2), True, 'ndarray-2d'), # noqa: E241
+ (np.array([[]]), True, 'ndarray-2d-empty'), # noqa: E241
+ (np.ndarray((2,) * 3), True, 'ndarray-3d'), # noqa: E241
+ (np.array([[[]]]), True, 'ndarray-3d-empty'), # noqa: E241
+ (np.ndarray((2,) * 4), True, 'ndarray-4d'), # noqa: E241
+ (np.array([[[[]]]]), True, 'ndarray-4d-empty'), # noqa: E241
+ (np.array(2), False, 'ndarray-0d'), # noqa: E241
+ (1, False, 'int'), # noqa: E241
+ (b'123', False, 'bytes'), # noqa: E241
+ (b'', False, 'bytes-empty'), # noqa: E241
+ ('123', False, 'string'), # noqa: E241
+ ('', False, 'string-empty'), # noqa: E241
+ (str, False, 'string-type'), # noqa: E241
+ (object(), False, 'object'), # noqa: E241
+ (np.nan, False, 'NaN'), # noqa: E241
+ (None, False, 'None') # noqa: E241
+]
+objs, expected, ids = zip(*ll_params)
+
+
[email protected](params=zip(objs, expected), ids=ids)
+def maybe_list_like(request):
+ return request.param
+
+
+def test_is_list_like(maybe_list_like):
+ obj, expected = maybe_list_like
+ expected = True if expected == 'set' else expected
+ assert inference.is_list_like(obj) == expected
+
+
+def test_is_list_like_disallow_sets(maybe_list_like):
+ obj, expected = maybe_list_like
+ expected = False if expected == 'set' else expected
+ assert inference.is_list_like(obj, allow_sets=False) == expected
+
+
+def test_is_sequence():
+ is_seq = inference.is_sequence
+ assert (is_seq((1, 2)))
+ assert (is_seq([1, 2]))
+ assert (not is_seq("abcd"))
+ assert (not is_seq(u("abcd")))
+ assert (not is_seq(np.int64))
+
+ class A(object):
+
+ def __getitem__(self):
+ return 1
+
+ assert (not is_seq(A()))
+
+
+def test_is_array_like():
+ assert inference.is_array_like(Series([]))
+ assert inference.is_array_like(Series([1, 2]))
+ assert inference.is_array_like(np.array(["a", "b"]))
+ assert inference.is_array_like(Index(["2016-01-01"]))
+
+ class DtypeList(list):
+ dtype = "special"
+
+ assert inference.is_array_like(DtypeList())
+
+ assert not inference.is_array_like([1, 2, 3])
+ assert not inference.is_array_like(tuple())
+ assert not inference.is_array_like("foo")
+ assert not inference.is_array_like(123)
+
+
[email protected]("inner", [
+ [], [1], (1, ), (1, 2), {'a': 1}, {1, 'a'}, Series([1]),
+ Series([]), Series(['a']).str, (x for x in range(5))
+])
[email protected]("outer", [
+ list, Series, np.array, tuple
+])
+def test_is_nested_list_like_passes(inner, outer):
+ result = outer([inner for _ in range(5)])
+ assert inference.is_list_like(result)
+
+
[email protected]("obj", [
+ 'abc', [], [1], (1,), ['a'], 'a', {'a'},
+ [1, 2, 3], Series([1]), DataFrame({"A": [1]}),
+ ([1, 2] for _ in range(5)),
+])
+def test_is_nested_list_like_fails(obj):
+ assert not inference.is_nested_list_like(obj)
+
+
+ "ll", [{}, {'A': 1}, Series([1]), collections.defaultdict()])
+def test_is_dict_like_passes(ll):
+ assert inference.is_dict_like(ll)
+
+
[email protected]("ll", [
+ '1', 1, [1, 2], (1, 2), range(2), Index([1]),
+ dict, collections.defaultdict, Series
+])
+def test_is_dict_like_fails(ll):
+ assert not inference.is_dict_like(ll)
+
+
[email protected]("has_keys", [True, False])
[email protected]("has_getitem", [True, False])
[email protected]("has_contains", [True, False])
+def test_is_dict_like_duck_type(has_keys, has_getitem, has_contains):
+ class DictLike(object):
+ def __init__(self, d):
+ self.d = d
+
+ if has_keys:
+ def keys(self):
+ return self.d.keys()
+
+ if has_getitem:
+ def __getitem__(self, key):
+ return self.d.__getitem__(key)
+
+ if has_contains:
+ def __contains__(self, key):
+ return self.d.__contains__(key)
+
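+ # dict-like requires all three of keys, __getitem__ and __contains__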
+ d = DictLike({1: 2})
+ result = inference.is_dict_like(d)
+ expected = has_keys and has_getitem and has_contains
+
+ assert result is expected
+
+
+def test_is_file_like():
+ class MockFile(object):
+ pass
+
+ is_file = inference.is_file_like
+
+ data = StringIO("data")
+ assert is_file(data)
+
+ # No read / write attributes
+ # No iterator attributes
+ m = MockFile()
+ assert not is_file(m)
+
+ MockFile.write = lambda self: 0
+
+ # Write attribute but not an iterator
+ m = MockFile()
+ assert not is_file(m)
+
+ # gh-16530: Valid iterator just means we have the
+ # __iter__ attribute for our purposes.
+ MockFile.__iter__ = lambda self: self
+
+ # Valid write-only file
+ m = MockFile()
+ assert is_file(m)
+
+ del MockFile.write
+ MockFile.read = lambda self: 0
+
+ # Valid read-only file
+ m = MockFile()
+ assert is_file(m)
+
+ # Iterator but no read / write attributes
+ data = [1, 2, 3]
+ assert not is_file(data)
+
+
+ "ll", [collections.namedtuple('Test', list('abc'))(1, 2, 3)])
+def test_is_names_tuple_passes(ll):
+ assert inference.is_named_tuple(ll)
+
+
+ "ll", [(1, 2, 3), 'a', Series({'pi': 3.14})])
+def test_is_names_tuple_fails(ll):
+ assert not inference.is_named_tuple(ll)
+
+
+def test_is_hashable():
+
+ # all new-style classes are hashable by default
+ class HashableClass(object):
+ pass
+
+ class UnhashableClass1(object):
+ __hash__ = None
+
+ class UnhashableClass2(object):
+
+ def __hash__(self):
+ raise TypeError("Not hashable")
+
+ hashable = (1,
+ 3.14,
+ np.float64(3.14),
+ 'a',
+ tuple(),
+ (1, ),
+ HashableClass(), )
+ not_hashable = ([], UnhashableClass1(), )
+ abc_hashable_not_really_hashable = (([], ), UnhashableClass2(), )
+
+ for i in hashable:
+ assert inference.is_hashable(i)
+ for i in not_hashable:
+ assert not inference.is_hashable(i)
+ for i in abc_hashable_not_really_hashable:
+ assert not inference.is_hashable(i)
+
+ # numpy.array is no longer collections.Hashable as of
+ # https://github.com/numpy/numpy/pull/5326, just test
+ # is_hashable()
+ assert not inference.is_hashable(np.array([]))
+
+ # old-style classes in Python 2 don't appear hashable to
+ # collections.Hashable but also seem to support hash() by default
+ if PY2:
+
+ class OldStyleClass():
+ pass
+
+ c = OldStyleClass()
+ assert not isinstance(c, compat.Hashable)
+ assert inference.is_hashable(c)
+ hash(c) # this will not raise
+
+
+ "ll", [re.compile('ad')])
+def test_is_re_passes(ll):
+ assert inference.is_re(ll)
+
+
+ "ll", ['x', 2, 3, object()])
+def test_is_re_fails(ll):
+ assert not inference.is_re(ll)
+
+
+ "ll", [r'a', u('x'),
+ r'asdf',
+ re.compile('adsf'),
+ u(r'\u2233\s*'),
+ re.compile(r'')])
+def test_is_recompilable_passes(ll):
+ assert inference.is_re_compilable(ll)
+
+
+ "ll", [1, [], object()])
+def test_is_recompilable_fails(ll):
+ assert not inference.is_re_compilable(ll)
+
+
+class TestInference(object):
+
+ def test_infer_dtype_bytes(self):
+ compare = 'string' if PY2 else 'bytes'
+
+ # string array of bytes
+ arr = np.array(list('abc'), dtype='S1')
+ assert lib.infer_dtype(arr, skipna=True) == compare
+
+ # object array of bytes
+ arr = arr.astype(object)
+ assert lib.infer_dtype(arr, skipna=True) == compare
+
+ # object array of bytes with missing values
+ assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare
+
+ def test_isinf_scalar(self):
+ # GH 11352
+ assert libmissing.isposinf_scalar(float('inf'))
+ assert libmissing.isposinf_scalar(np.inf)
+ assert not libmissing.isposinf_scalar(-np.inf)
+ assert not libmissing.isposinf_scalar(1)
+ assert not libmissing.isposinf_scalar('a')
+
+ assert libmissing.isneginf_scalar(float('-inf'))
+ assert libmissing.isneginf_scalar(-np.inf)
+ assert not libmissing.isneginf_scalar(np.inf)
+ assert not libmissing.isneginf_scalar(1)
+ assert not libmissing.isneginf_scalar('a')
+
+ def test_maybe_convert_numeric_infinities(self):
+ # see gh-13274
+ infinities = ['inf', 'inF', 'iNf', 'Inf',
+ 'iNF', 'InF', 'INf', 'INF']
+ na_values = {'', 'NULL', 'nan'}
+
+ pos = np.array(['inf'], dtype=np.float64)
+ neg = np.array(['-inf'], dtype=np.float64)
+
+ msg = "Unable to parse string"
+
+ for infinity in infinities:
+ for maybe_int in (True, False):
+ out = lib.maybe_convert_numeric(
+ np.array([infinity], dtype=object),
+ na_values, maybe_int)
+ tm.assert_numpy_array_equal(out, pos)
+
+ out = lib.maybe_convert_numeric(
+ np.array(['-' + infinity], dtype=object),
+ na_values, maybe_int)
+ tm.assert_numpy_array_equal(out, neg)
+
+ out = lib.maybe_convert_numeric(
+ np.array([u(infinity)], dtype=object),
+ na_values, maybe_int)
+ tm.assert_numpy_array_equal(out, pos)
+
+ out = lib.maybe_convert_numeric(
+ np.array(['+' + infinity], dtype=object),
+ na_values, maybe_int)
+ tm.assert_numpy_array_equal(out, pos)
+
+ # too many characters
+ with pytest.raises(ValueError, match=msg):
+ lib.maybe_convert_numeric(
+ np.array(['foo_' + infinity], dtype=object),
+ na_values, maybe_int)
+
+ def test_maybe_convert_numeric_post_floatify_nan(self, coerce):
+ # see gh-13314
+ data = np.array(['1.200', '-999.000', '4.500'], dtype=object)
+ expected = np.array([1.2, np.nan, 4.5], dtype=np.float64)
+ nan_values = {-999, -999.0}
+
+ out = lib.maybe_convert_numeric(data, nan_values, coerce)
+ tm.assert_numpy_array_equal(out, expected)
+
+ def test_convert_infs(self):
+ arr = np.array(['inf', 'inf', 'inf'], dtype='O')
+ result = lib.maybe_convert_numeric(arr, set(), False)
+ assert result.dtype == np.float64
+
+ arr = np.array(['-inf', '-inf', '-inf'], dtype='O')
+ result = lib.maybe_convert_numeric(arr, set(), False)
+ assert result.dtype == np.float64
+
+ def test_scientific_no_exponent(self):
+ # See PR 12215
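+ # an 'E'/'e' with no exponent digits is not a valid number -> NaN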
+ arr = np.array(['42E', '2E', '99e', '6e'], dtype='O')
+ result = lib.maybe_convert_numeric(arr, set(), False, True)
+ assert np.all(np.isnan(result))
+
+ def test_convert_non_hashable(self):
+ # GH13324
+ # make sure that we are handling non-hashables
+ arr = np.array([[10.0, 2], 1.0, 'apple'])
+ result = lib.maybe_convert_numeric(arr, set(), False, True)
+ tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan]))
+
+ def test_convert_numeric_uint64(self):
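+ # 2**63 does not fit in int64, so conversion should produce uint64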
+ arr = np.array([2**63], dtype=object)
+ exp = np.array([2**63], dtype=np.uint64)
+ tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
+
+ arr = np.array([str(2**63)], dtype=object)
+ exp = np.array([2**63], dtype=np.uint64)
+ tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
+
+ arr = np.array([np.uint64(2**63)], dtype=object)
+ exp = np.array([2**63], dtype=np.uint64)
+ tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
+
+ @pytest.mark.parametrize("arr", [
+ np.array([2**63, np.nan], dtype=object),
+ np.array([str(2**63), np.nan], dtype=object),
+ np.array([np.nan, 2**63], dtype=object),
+ np.array([np.nan, str(2**63)], dtype=object)])
+ def test_convert_numeric_uint64_nan(self, coerce, arr):
+ expected = arr.astype(float) if coerce else arr.copy()
+ result = lib.maybe_convert_numeric(arr, set(),
+ coerce_numeric=coerce)
+ tm.assert_almost_equal(result, expected)
+
+ def test_convert_numeric_uint64_nan_values(self, coerce):
+ arr = np.array([2**63, 2**63 + 1], dtype=object)
+ na_values = {2**63}
+
+ expected = (np.array([np.nan, 2**63 + 1], dtype=float)
+ if coerce else arr.copy())
+ result = lib.maybe_convert_numeric(arr, na_values,
+ coerce_numeric=coerce)
+ tm.assert_almost_equal(result, expected)
+
+ @pytest.mark.parametrize("case", [
+ np.array([2**63, -1], dtype=object),
+ np.array([str(2**63), -1], dtype=object),
+ np.array([str(2**63), str(-1)], dtype=object),
+ np.array([-1, 2**63], dtype=object),
+ np.array([-1, str(2**63)], dtype=object),
+ np.array([str(-1), str(2**63)], dtype=object)])
+ def test_convert_numeric_int64_uint64(self, case, coerce):
+ expected = case.astype(float) if coerce else case.copy()
+ result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce)
+ tm.assert_almost_equal(result, expected)
+
+ @pytest.mark.parametrize("value", [-2**63 - 1, 2**64])
+ def test_convert_int_overflow(self, value):
+ # see gh-18584
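+ # values outside the int64/uint64 range are left as object, unchanged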
+ arr = np.array([value], dtype=object)
+ result = lib.maybe_convert_objects(arr)
+ tm.assert_numpy_array_equal(arr, result)
+
+ def test_maybe_convert_objects_uint64(self):
+ # see gh-4471
+ arr = np.array([2**63], dtype=object)
+ exp = np.array([2**63], dtype=np.uint64)
+ tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
+
+ # NumPy bug: can't compare uint64 to int64, as that
+ # results in both casting to float64, so we should
+ # make sure that this function is robust against it
+ arr = np.array([np.uint64(2**63)], dtype=object)
+ exp = np.array([2**63], dtype=np.uint64)
+ tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
+
+ arr = np.array([2, -1], dtype=object)
+ exp = np.array([2, -1], dtype=np.int64)
+ tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
+
+ arr = np.array([2**63, -1], dtype=object)
+ exp = np.array([2**63, -1], dtype=object)
+ tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
+
+ def test_mixed_dtypes_remain_object_array(self):
+ # GH14956
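+ # a tz-aware datetime mixed with an int cannot be converted; stays object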
+ array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1],
+ dtype=object)
+ result = lib.maybe_convert_objects(array, convert_datetime=1)
+ tm.assert_numpy_array_equal(result, array)
+
+
+class TestTypeInference(object):
+
+ # Dummy class used for testing with Python objects
+ class Dummy():
+ pass
+
+ def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype):
+ # see pandas/conftest.py
+ inferred_dtype, values = any_skipna_inferred_dtype
+
+ # make sure the inferred dtype of the fixture is as requested
+ assert inferred_dtype == lib.infer_dtype(values, skipna=True)
+
+ @pytest.mark.parametrize('skipna', [True, False])
+ def test_length_zero(self, skipna):
+ result = lib.infer_dtype(np.array([], dtype='i4'), skipna=skipna)
+ assert result == 'integer'
+
+ result = lib.infer_dtype([], skipna=skipna)
+ assert result == 'empty'
+
+ # GH 18004
+ arr = np.array([np.array([], dtype=object),
+ np.array([], dtype=object)])
+ result = lib.infer_dtype(arr, skipna=skipna)
+ assert result == 'empty'
+
+ def test_integers(self):
+ arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'integer'
+
+ arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'mixed-integer'
+
+ arr = np.array([1, 2, 3, 4, 5], dtype='i4')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'integer'
+
+ def test_deprecation(self):
+ # GH 24050
+ arr = np.array([1, 2, 3], dtype=object)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = lib.infer_dtype(arr) # default: skipna=None -> warn
+ assert result == 'integer'
+
+ def test_bools(self):
+ arr = np.array([True, False, True, True, True], dtype='O')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'boolean'
+
+ arr = np.array([np.bool_(True), np.bool_(False)], dtype='O')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'boolean'
+
+ arr = np.array([True, False, True, 'foo'], dtype='O')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'mixed'
+
+ arr = np.array([True, False, True], dtype=bool)
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'boolean'
+
+ arr = np.array([True, np.nan, False], dtype='O')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'boolean'
+
+ result = lib.infer_dtype(arr, skipna=False)
+ assert result == 'mixed'
+
+ def test_floats(self):
+ arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'floating'
+
+ arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'],
+ dtype='O')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'mixed-integer'
+
+ arr = np.array([1, 2, 3, 4, 5], dtype='f4')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'floating'
+
+ arr = np.array([1, 2, 3, 4, 5], dtype='f8')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'floating'
+
+ def test_decimals(self):
+ # GH15690
+ arr = np.array([Decimal(1), Decimal(2), Decimal(3)])
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'decimal'
+
+ arr = np.array([1.0, 2.0, Decimal(3)])
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'mixed'
+
+ arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)])
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'decimal'
+
+ arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O')
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'decimal'
+
+ def test_string(self):
+ pass
+
+ def test_unicode(self):
+ arr = [u'a', np.nan, u'c']
+ result = lib.infer_dtype(arr, skipna=False)
+ assert result == 'mixed'
+
+ arr = [u'a', np.nan, u'c']
+ result = lib.infer_dtype(arr, skipna=True)
+ expected = 'unicode' if PY2 else 'string'
+ assert result == expected
+
+ @pytest.mark.parametrize('dtype, missing, skipna, expected', [
+ (float, np.nan, False, 'floating'),
+ (float, np.nan, True, 'floating'),
+ (object, np.nan, False, 'floating'),
+ (object, np.nan, True, 'empty'),
+ (object, None, False, 'mixed'),
+ (object, None, True, 'empty')
+ ])
+ @pytest.mark.parametrize('box', [pd.Series, np.array])
+ def test_object_empty(self, box, missing, dtype, skipna, expected):
+ # GH 23421
+ arr = box([missing, missing], dtype=dtype)
+
+ result = lib.infer_dtype(arr, skipna=skipna)
+ assert result == expected
+
+ def test_datetime(self):
+
+ dates = [datetime(2012, 1, x) for x in range(1, 20)]
+ index = Index(dates)
+ assert index.inferred_type == 'datetime64'
+
+ def test_infer_dtype_datetime(self):
+
+ arr = np.array([Timestamp('2011-01-01'),
+ Timestamp('2011-01-02')])
+ assert lib.infer_dtype(arr, skipna=True) == 'datetime'
+
+ arr = np.array([np.datetime64('2011-01-01'),
+ np.datetime64('2011-01-01')], dtype=object)
+ assert lib.infer_dtype(arr, skipna=True) == 'datetime64'
+
+ arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)])
+ assert lib.infer_dtype(arr, skipna=True) == 'datetime'
+
+ # starts with nan
+ for n in [pd.NaT, np.nan]:
+ arr = np.array([n, pd.Timestamp('2011-01-02')])
+ assert lib.infer_dtype(arr, skipna=True) == 'datetime'
+
+ arr = np.array([n, np.datetime64('2011-01-02')])
+ assert lib.infer_dtype(arr, skipna=True) == 'datetime64'
+
+ arr = np.array([n, datetime(2011, 1, 1)])
+ assert lib.infer_dtype(arr, skipna=True) == 'datetime'
+
+ arr = np.array([n, pd.Timestamp('2011-01-02'), n])
+ assert lib.infer_dtype(arr, skipna=True) == 'datetime'
+
+ arr = np.array([n, np.datetime64('2011-01-02'), n])
+ assert lib.infer_dtype(arr, skipna=True) == 'datetime64'
+
+ arr = np.array([n, datetime(2011, 1, 1), n])
+ assert lib.infer_dtype(arr, skipna=True) == 'datetime'
+
+ # different type of nat
+ arr = np.array([np.timedelta64('nat'),
+ np.datetime64('2011-01-02')], dtype=object)
+ assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+
+ arr = np.array([np.datetime64('2011-01-02'),
+ np.timedelta64('nat')], dtype=object)
+ assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+
+ # mixed datetime
+ arr = np.array([datetime(2011, 1, 1),
+ pd.Timestamp('2011-01-02')])
+ assert lib.infer_dtype(arr, skipna=True) == 'datetime'
+
+ # should be datetime?
+ arr = np.array([np.datetime64('2011-01-01'),
+ pd.Timestamp('2011-01-02')])
+ assert lib.infer_dtype(arr, skipna=True) == 'mixed'
+
+ arr = np.array([pd.Timestamp('2011-01-02'),
+ np.datetime64('2011-01-01')])
+ assert lib.infer_dtype(arr, skipna=True) == 'mixed'
+
+ arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1])
+ assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer'
+
+ arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1])
+ assert lib.infer_dtype(arr, skipna=True) == 'mixed'
+
+ arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')])
+ assert lib.infer_dtype(arr, skipna=True) == 'mixed'
+
+ def test_infer_dtype_timedelta(self):
+
+ arr = np.array([pd.Timedelta('1 days'),
+ pd.Timedelta('2 days')])
+ assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
+
+ arr = np.array([np.timedelta64(1, 'D'),
+ np.timedelta64(2, 'D')], dtype=object)
+ assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
+
+ arr = np.array([timedelta(1), timedelta(2)])
+ assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
+
+ # starts with nan
+ for n in [pd.NaT, np.nan]:
+ arr = np.array([n, Timedelta('1 days')])
+ assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
+
+ arr = np.array([n, np.timedelta64(1, 'D')])
+ assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
+
+ arr = np.array([n, timedelta(1)])
+ assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
+
+ arr = np.array([n, pd.Timedelta('1 days'), n])
+ assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
+
+ arr = np.array([n, np.timedelta64(1, 'D'), n])
+ assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
+
+ arr = np.array([n, timedelta(1), n])
+ assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
+
+ # different type of nat
+ arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')],
+ dtype=object)
+ assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+
+ arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')],
+ dtype=object)
+ assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+
+ def test_infer_dtype_period(self):
+ # GH 13664
+ arr = np.array([pd.Period('2011-01', freq='D'),
+ pd.Period('2011-02', freq='D')])
+ assert lib.infer_dtype(arr, skipna=True) == 'period'
+
+ arr = np.array([pd.Period('2011-01', freq='D'),
+ pd.Period('2011-02', freq='M')])
+ assert lib.infer_dtype(arr, skipna=True) == 'period'
+
+ # starts with nan
+ for n in [pd.NaT, np.nan]:
+ arr = np.array([n, pd.Period('2011-01', freq='D')])
+ assert lib.infer_dtype(arr, skipna=True) == 'period'
+
+ arr = np.array([n, pd.Period('2011-01', freq='D'), n])
+ assert lib.infer_dtype(arr, skipna=True) == 'period'
+
+ # different type of nat
+ arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')],
+ dtype=object)
+ assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+
+ arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')],
+ dtype=object)
+ assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+
+ @pytest.mark.parametrize(
+ "data",
+ [
+ [datetime(2017, 6, 12, 19, 30), datetime(2017, 3, 11, 1, 15)],
+ [Timestamp("20170612"), Timestamp("20170311")],
+ [Timestamp("20170612", tz='US/Eastern'),
+ Timestamp("20170311", tz='US/Eastern')],
+ [date(2017, 6, 12),
+ Timestamp("20170311", tz='US/Eastern')],
+ [np.datetime64("2017-06-12"), np.datetime64("2017-03-11")],
+ [np.datetime64("2017-06-12"), datetime(2017, 3, 11, 1, 15)]
+ ]
+ )
+ def test_infer_datetimelike_array_datetime(self, data):
+ assert lib.infer_datetimelike_array(data) == "datetime"
+
+ @pytest.mark.parametrize(
+ "data",
+ [
+ [timedelta(2017, 6, 12), timedelta(2017, 3, 11)],
+ [timedelta(2017, 6, 12), date(2017, 3, 11)],
+ [np.timedelta64(2017, "D"), np.timedelta64(6, "s")],
+ [np.timedelta64(2017, "D"), timedelta(2017, 3, 11)]
+ ]
+ )
+ def test_infer_datetimelike_array_timedelta(self, data):
+ assert lib.infer_datetimelike_array(data) == "timedelta"
+
+ def test_infer_datetimelike_array_date(self):
+ arr = [date(2017, 6, 12), date(2017, 3, 11)]
+ assert lib.infer_datetimelike_array(arr) == "date"
+
+ @pytest.mark.parametrize(
+ "data",
+ [
+ ["2017-06-12", "2017-03-11"],
+ [20170612, 20170311],
+ [20170612.5, 20170311.8],
+ [Dummy(), Dummy()],
+ [Timestamp("20170612"), Timestamp("20170311", tz='US/Eastern')],
+ [Timestamp("20170612"), 20170311],
+ [timedelta(2017, 6, 12), Timestamp("20170311", tz='US/Eastern')]
+ ]
+ )
+ def test_infer_datetimelike_array_mixed(self, data):
+ assert lib.infer_datetimelike_array(data) == "mixed"
+
+ @pytest.mark.parametrize(
+ "first, expected",
+ [
+ [[None], "mixed"],
+ [[np.nan], "mixed"],
+ [[pd.NaT], "nat"],
+ [[datetime(2017, 6, 12, 19, 30), pd.NaT], "datetime"],
+ [[np.datetime64("2017-06-12"), pd.NaT], "datetime"],
+ [[date(2017, 6, 12), pd.NaT], "date"],
+ [[timedelta(2017, 6, 12), pd.NaT], "timedelta"],
+ [[np.timedelta64(2017, "D"), pd.NaT], "timedelta"]
+ ]
+ )
+ @pytest.mark.parametrize("second", [None, np.nan])
+ def test_infer_datetimelike_array_nan_nat_like(self, first, second,
+ expected):
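+ # appending an extra None/np.nan must not change the inferred type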
+ first.append(second)
+ assert lib.infer_datetimelike_array(first) == expected
+
+ def test_infer_dtype_all_nan_nat_like(self):
+ arr = np.array([np.nan, np.nan])
+ assert lib.infer_dtype(arr, skipna=True) == 'floating'
+
+ # a mix of nan and None results in 'mixed' unless skipna drops them
+ arr = np.array([np.nan, np.nan, None])
+ assert lib.infer_dtype(arr, skipna=True) == 'empty'
+ assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+
+ arr = np.array([None, np.nan, np.nan])
+ assert lib.infer_dtype(arr, skipna=True) == 'empty'
+ assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+
+ # pd.NaT
+ arr = np.array([pd.NaT])
+ assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+
+ arr = np.array([pd.NaT, np.nan])
+ assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+
+ arr = np.array([np.nan, pd.NaT])
+ assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+
+ arr = np.array([np.nan, pd.NaT, np.nan])
+ assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+
+ arr = np.array([None, pd.NaT, None])
+ assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+
+ # np.datetime64(nat)
+ arr = np.array([np.datetime64('nat')])
+ assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
+
+ for n in [np.nan, pd.NaT, None]:
+ arr = np.array([n, np.datetime64('nat'), n])
+ assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
+
+ arr = np.array([pd.NaT, n, np.datetime64('nat'), n])
+ assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
+
+ arr = np.array([np.timedelta64('nat')], dtype=object)
+ assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+
+ for n in [np.nan, pd.NaT, None]:
+ arr = np.array([n, np.timedelta64('nat'), n])
+ assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+
+ arr = np.array([pd.NaT, n, np.timedelta64('nat'), n])
+ assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+
+ # datetime / timedelta mixed
+ arr = np.array([pd.NaT, np.datetime64('nat'),
+ np.timedelta64('nat'), np.nan])
+ assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+
+ arr = np.array([np.timedelta64('nat'), np.datetime64('nat')],
+ dtype=object)
+ assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+
+ def test_is_datetimelike_array_all_nan_nat_like(self):
+ arr = np.array([np.nan, pd.NaT, np.datetime64('nat')])
+ assert lib.is_datetime_array(arr)
+ assert lib.is_datetime64_array(arr)
+ assert not lib.is_timedelta_or_timedelta64_array(arr)
+
+ arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')])
+ assert not lib.is_datetime_array(arr)
+ assert not lib.is_datetime64_array(arr)
+ assert lib.is_timedelta_or_timedelta64_array(arr)
+
+ arr = np.array([np.nan, pd.NaT, np.datetime64('nat'),
+ np.timedelta64('nat')])
+ assert not lib.is_datetime_array(arr)
+ assert not lib.is_datetime64_array(arr)
+ assert not lib.is_timedelta_or_timedelta64_array(arr)
+
+ arr = np.array([np.nan, pd.NaT])
+ assert lib.is_datetime_array(arr)
+ assert lib.is_datetime64_array(arr)
+ assert lib.is_timedelta_or_timedelta64_array(arr)
+
+ arr = np.array([np.nan, np.nan], dtype=object)
+ assert not lib.is_datetime_array(arr)
+ assert not lib.is_datetime64_array(arr)
+ assert not lib.is_timedelta_or_timedelta64_array(arr)
+
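+ # single-tz check: all Timestamps must share the same timezone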
+ assert lib.is_datetime_with_singletz_array(
+ np.array([pd.Timestamp('20130101', tz='US/Eastern'),
+ pd.Timestamp('20130102', tz='US/Eastern')],
+ dtype=object))
+ assert not lib.is_datetime_with_singletz_array(
+ np.array([pd.Timestamp('20130101', tz='US/Eastern'),
+ pd.Timestamp('20130102', tz='CET')],
+ dtype=object))
+
+ @pytest.mark.parametrize(
+ "func",
+ [
+ 'is_datetime_array',
+ 'is_datetime64_array',
+ 'is_bool_array',
+ 'is_timedelta_or_timedelta64_array',
+ 'is_date_array',
+ 'is_time_array',
+ 'is_interval_array',
+ 'is_period_array'])
+ def test_other_dtypes_for_array(self, func):
+ func = getattr(lib, func)
+ arr = np.array(['foo', 'bar'])
+ assert not func(arr)
+
+ arr = np.array([1, 2])
+ assert not func(arr)
+
+ def test_date(self):
+
+ dates = [date(2012, 1, day) for day in range(1, 20)]
+ index = Index(dates)
+ assert index.inferred_type == 'date'
+
+ dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan]
+ result = lib.infer_dtype(dates, skipna=False)
+ assert result == 'mixed'
+
+ result = lib.infer_dtype(dates, skipna=True)
+ assert result == 'date'
+
+ def test_is_numeric_array(self):
+
+ assert lib.is_float_array(np.array([1, 2.0]))
+ assert lib.is_float_array(np.array([1, 2.0, np.nan]))
+ assert not lib.is_float_array(np.array([1, 2]))
+
+ assert lib.is_integer_array(np.array([1, 2]))
+ assert not lib.is_integer_array(np.array([1, 2.0]))
+
+ def test_is_string_array(self):
+
+ assert lib.is_string_array(np.array(['foo', 'bar']))
+ assert not lib.is_string_array(
+ np.array(['foo', 'bar', np.nan], dtype=object), skipna=False)
+ assert lib.is_string_array(
+ np.array(['foo', 'bar', np.nan], dtype=object), skipna=True)
+ assert not lib.is_string_array(np.array([1, 2]))
+
+ def test_to_object_array_tuples(self):
+ r = (5, 6)
+ values = [r]
+ result = lib.to_object_array_tuples(values)
+
+ try:
+ # make sure record array works
+ from collections import namedtuple
+ record = namedtuple('record', 'x y')
+ r = record(5, 6)
+ values = [r]
+ result = lib.to_object_array_tuples(values) # noqa
+ except ImportError:
+ pass
+
+ def test_object(self):
+
+ # GH 7431
+ # cannot infer more than this, as there is only a single element
+ arr = np.array([None], dtype='O')
+ result = lib.infer_dtype(arr, skipna=False)
+ assert result == 'mixed'
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'empty'
+
+ def test_to_object_array_width(self):
+ # see gh-13320
+ rows = [[1, 2, 3], [4, 5, 6]]
+
+ expected = np.array(rows, dtype=object)
+ out = lib.to_object_array(rows)
+ tm.assert_numpy_array_equal(out, expected)
+
+ expected = np.array(rows, dtype=object)
+ out = lib.to_object_array(rows, min_width=1)
+ tm.assert_numpy_array_equal(out, expected)
+
+ expected = np.array([[1, 2, 3, None, None],
+ [4, 5, 6, None, None]], dtype=object)
+ out = lib.to_object_array(rows, min_width=5)
+ tm.assert_numpy_array_equal(out, expected)
+
+ def test_is_period(self):
+ assert lib.is_period(pd.Period('2011-01', freq='M'))
+ assert not lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))
+ assert not lib.is_period(pd.Timestamp('2011-01'))
+ assert not lib.is_period(1)
+ assert not lib.is_period(np.nan)
+
+ def test_categorical(self):
+
+ # GH 8974
+ from pandas import Categorical, Series
+ arr = Categorical(list('abc'))
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'categorical'
+
+ result = lib.infer_dtype(Series(arr), skipna=True)
+ assert result == 'categorical'
+
+ arr = Categorical(list('abc'), categories=['cegfab'], ordered=True)
+ result = lib.infer_dtype(arr, skipna=True)
+ assert result == 'categorical'
+
+ result = lib.infer_dtype(Series(arr), skipna=True)
+ assert result == 'categorical'
+
+
+class TestNumberScalar(object):
+
+ def test_is_number(self):
+
+ assert is_number(True)
+ assert is_number(1)
+ assert is_number(1.1)
+ assert is_number(1 + 3j)
+ assert is_number(np.bool(False))
+ assert is_number(np.int64(1))
+ assert is_number(np.float64(1.1))
+ assert is_number(np.complex128(1 + 3j))
+ assert is_number(np.nan)
+
+ assert not is_number(None)
+ assert not is_number('x')
+ assert not is_number(datetime(2011, 1, 1))
+ assert not is_number(np.datetime64('2011-01-01'))
+ assert not is_number(Timestamp('2011-01-01'))
+ assert not is_number(Timestamp('2011-01-01', tz='US/Eastern'))
+ assert not is_number(timedelta(1000))
+ assert not is_number(Timedelta('1 days'))
+
+ # questionable
+ assert not is_number(np.bool_(False))
+ assert is_number(np.timedelta64(1, 'D'))
+
+ def test_is_bool(self):
+ assert is_bool(True)
+ assert is_bool(np.bool(False))
+ assert is_bool(np.bool_(False))
+
+ assert not is_bool(1)
+ assert not is_bool(1.1)
+ assert not is_bool(1 + 3j)
+ assert not is_bool(np.int64(1))
+ assert not is_bool(np.float64(1.1))
+ assert not is_bool(np.complex128(1 + 3j))
+ assert not is_bool(np.nan)
+ assert not is_bool(None)
+ assert not is_bool('x')
+ assert not is_bool(datetime(2011, 1, 1))
+ assert not is_bool(np.datetime64('2011-01-01'))
+ assert not is_bool(Timestamp('2011-01-01'))
+ assert not is_bool(Timestamp('2011-01-01', tz='US/Eastern'))
+ assert not is_bool(timedelta(1000))
+ assert not is_bool(np.timedelta64(1, 'D'))
+ assert not is_bool(Timedelta('1 days'))
+
+ def test_is_integer(self):
+ assert is_integer(1)
+ assert is_integer(np.int64(1))
+
+ assert not is_integer(True)
+ assert not is_integer(1.1)
+ assert not is_integer(1 + 3j)
+ assert not is_integer(np.bool(False))
+ assert not is_integer(np.bool_(False))
+ assert not is_integer(np.float64(1.1))
+ assert not is_integer(np.complex128(1 + 3j))
+ assert not is_integer(np.nan)
+ assert not is_integer(None)
+ assert not is_integer('x')
+ assert not is_integer(datetime(2011, 1, 1))
+ assert not is_integer(np.datetime64('2011-01-01'))
+ assert not is_integer(Timestamp('2011-01-01'))
+ assert not is_integer(Timestamp('2011-01-01', tz='US/Eastern'))
+ assert not is_integer(timedelta(1000))
+ assert not is_integer(Timedelta('1 days'))
+
+ # questionable
+ assert is_integer(np.timedelta64(1, 'D'))
+
+ def test_is_float(self):
+ assert is_float(1.1)
+ assert is_float(np.float64(1.1))
+ assert is_float(np.nan)
+
+ assert not is_float(True)
+ assert not is_float(1)
+ assert not is_float(1 + 3j)
+ assert not is_float(np.bool(False))
+ assert not is_float(np.bool_(False))
+ assert not is_float(np.int64(1))
+ assert not is_float(np.complex128(1 + 3j))
+ assert not is_float(None)
+ assert not is_float('x')
+ assert not is_float(datetime(2011, 1, 1))
+ assert not is_float(np.datetime64('2011-01-01'))
+ assert not is_float(Timestamp('2011-01-01'))
+ assert not is_float(Timestamp('2011-01-01', tz='US/Eastern'))
+ assert not is_float(timedelta(1000))
+ assert not is_float(np.timedelta64(1, 'D'))
+ assert not is_float(Timedelta('1 days'))
+
+ def test_is_datetime_dtypes(self):
+
+ ts = pd.date_range('20130101', periods=3)
+ tsa = pd.date_range('20130101', periods=3, tz='US/Eastern')
+
+ assert is_datetime64_dtype('datetime64')
+ assert is_datetime64_dtype('datetime64[ns]')
+ assert is_datetime64_dtype(ts)
+ assert not is_datetime64_dtype(tsa)
+
+ assert not is_datetime64_ns_dtype('datetime64')
+ assert is_datetime64_ns_dtype('datetime64[ns]')
+ assert is_datetime64_ns_dtype(ts)
+ assert is_datetime64_ns_dtype(tsa)
+
+ assert is_datetime64_any_dtype('datetime64')
+ assert is_datetime64_any_dtype('datetime64[ns]')
+ assert is_datetime64_any_dtype(ts)
+ assert is_datetime64_any_dtype(tsa)
+
+ assert not is_datetime64tz_dtype('datetime64')
+ assert not is_datetime64tz_dtype('datetime64[ns]')
+ assert not is_datetime64tz_dtype(ts)
+ assert is_datetime64tz_dtype(tsa)
+
+ for tz in ['US/Eastern', 'UTC']:
+ dtype = 'datetime64[ns, {}]'.format(tz)
+ assert not is_datetime64_dtype(dtype)
+ assert is_datetime64tz_dtype(dtype)
+ assert is_datetime64_ns_dtype(dtype)
+ assert is_datetime64_any_dtype(dtype)
+
+ def test_is_timedelta(self):
+ assert is_timedelta64_dtype('timedelta64')
+ assert is_timedelta64_dtype('timedelta64[ns]')
+ assert not is_timedelta64_ns_dtype('timedelta64')
+ assert is_timedelta64_ns_dtype('timedelta64[ns]')
+
+ tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64[ns]')
+ assert is_timedelta64_dtype(tdi)
+ assert is_timedelta64_ns_dtype(tdi)
+ assert is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]'))
+
+ # Conversion to Int64Index:
+ assert not is_timedelta64_ns_dtype(tdi.astype('timedelta64'))
+ assert not is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]'))
+
+
+class TestIsScalar(object):
+
+ def test_is_scalar_builtin_scalars(self):
+ assert is_scalar(None)
+ assert is_scalar(True)
+ assert is_scalar(False)
+ assert is_scalar(Number())
+ assert is_scalar(Fraction())
+ assert is_scalar(0.)
+ assert is_scalar(np.nan)
+ assert is_scalar('foobar')
+ assert is_scalar(b'foobar')
+        assert is_scalar(u('foobar'))
+ assert is_scalar(datetime(2014, 1, 1))
+ assert is_scalar(date(2014, 1, 1))
+ assert is_scalar(time(12, 0))
+ assert is_scalar(timedelta(hours=1))
+ assert is_scalar(pd.NaT)
+
+ def test_is_scalar_builtin_nonscalars(self):
+ assert not is_scalar({})
+ assert not is_scalar([])
+ assert not is_scalar([1])
+ assert not is_scalar(())
+ assert not is_scalar((1, ))
+ assert not is_scalar(slice(None))
+ assert not is_scalar(Ellipsis)
+
+ def test_is_scalar_numpy_array_scalars(self):
+ assert is_scalar(np.int64(1))
+ assert is_scalar(np.float64(1.))
+ assert is_scalar(np.int32(1))
+ assert is_scalar(np.object_('foobar'))
+ assert is_scalar(np.str_('foobar'))
+ assert is_scalar(np.unicode_(u('foobar')))
+ assert is_scalar(np.bytes_(b'foobar'))
+ assert is_scalar(np.datetime64('2014-01-01'))
+ assert is_scalar(np.timedelta64(1, 'h'))
+
+ def test_is_scalar_numpy_zerodim_arrays(self):
+ for zerodim in [np.array(1), np.array('foobar'),
+ np.array(np.datetime64('2014-01-01')),
+ np.array(np.timedelta64(1, 'h')),
+ np.array(np.datetime64('NaT'))]:
+ assert not is_scalar(zerodim)
+ assert is_scalar(lib.item_from_zerodim(zerodim))
+
+ @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning")
+ def test_is_scalar_numpy_arrays(self):
+ assert not is_scalar(np.array([]))
+ assert not is_scalar(np.array([[]]))
+ assert not is_scalar(np.matrix('1; 2'))
+
+ def test_is_scalar_pandas_scalars(self):
+ assert is_scalar(Timestamp('2014-01-01'))
+ assert is_scalar(Timedelta(hours=1))
+ assert is_scalar(Period('2014-01-01'))
+ assert is_scalar(Interval(left=0, right=1))
+ assert is_scalar(DateOffset(days=1))
+
+ def test_is_scalar_pandas_containers(self):
+ assert not is_scalar(Series())
+ assert not is_scalar(Series([1]))
+ assert not is_scalar(DataFrame())
+ assert not is_scalar(DataFrame([[1]]))
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ assert not is_scalar(Panel())
+ assert not is_scalar(Panel([[[1]]]))
+ assert not is_scalar(Index([]))
+ assert not is_scalar(Index([1]))
+
+
+def test_datetimeindex_from_empty_datetime64_array():
+ for unit in ['ms', 'us', 'ns']:
+ idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit))
+ assert (len(idx) == 0)
+
+
+def test_nan_to_nat_conversions():
+
+ df = DataFrame(dict({
+ 'A': np.asarray(
+ lrange(10), dtype='float64'),
+ 'B': Timestamp('20010101')
+ }))
+ df.iloc[3:6, :] = np.nan
+ result = df.loc[4, 'B'].value
+ assert (result == iNaT)
+
+ s = df['B'].copy()
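+    # write NaN through the block manager; the datetime64 block coerces it
+    # to NaT (stored internally as iNaT)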
+ s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan)
+ assert (isna(s[8]))
+
+ assert (s[8].value == np.datetime64('NaT').astype(np.int64))
+
+
[email protected]_if_no_scipy
[email protected]("ignore::PendingDeprecationWarning")
+def test_is_scipy_sparse(spmatrix): # noqa: F811
+ assert is_scipy_sparse(spmatrix([[0, 1]]))
+ assert not is_scipy_sparse(np.array([1]))
+
+
+def test_ensure_int32():
+ values = np.arange(10, dtype=np.int32)
+ result = ensure_int32(values)
+ assert (result.dtype == np.int32)
+
+ values = np.arange(10, dtype=np.int64)
+ result = ensure_int32(values)
+ assert (result.dtype == np.int32)
+
+
+def test_ensure_categorical():
+ values = np.arange(10, dtype=np.int32)
+ result = ensure_categorical(values)
+ assert (result.dtype == 'category')
+
+ values = Categorical(values)
+ result = ensure_categorical(values)
+ tm.assert_categorical_equal(result, values)
diff --git a/contrib/python/pandas/py2/pandas/tests/dtypes/test_missing.py b/contrib/python/pandas/py2/pandas/tests/dtypes/test_missing.py
new file mode 100644
index 00000000000..d913d2ad299
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/dtypes/test_missing.py
@@ -0,0 +1,498 @@
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+from decimal import Decimal
+from warnings import catch_warnings, filterwarnings, simplefilter
+
+import numpy as np
+import pytest
+
+from pandas._libs import missing as libmissing
+from pandas._libs.tslibs import iNaT, is_null_datetimelike
+from pandas.compat import u
+
+from pandas.core.dtypes.common import is_scalar
+from pandas.core.dtypes.dtypes import (
+ DatetimeTZDtype, IntervalDtype, PeriodDtype)
+from pandas.core.dtypes.missing import (
+ array_equivalent, isna, isnull, na_value_for_dtype, notna, notnull)
+
+import pandas as pd
+from pandas import (
+ DatetimeIndex, Float64Index, NaT, Series, TimedeltaIndex, date_range)
+from pandas.core import config as cf
+from pandas.util import testing as tm
+
+
[email protected]('notna_f', [notna, notnull])
+def test_notna_notnull(notna_f):
+ assert notna_f(1.)
+ assert not notna_f(None)
+ assert not notna_f(np.NaN)
+
+ with cf.option_context("mode.use_inf_as_na", False):
+ assert notna_f(np.inf)
+ assert notna_f(-np.inf)
+
+ arr = np.array([1.5, np.inf, 3.5, -np.inf])
+ result = notna_f(arr)
+ assert result.all()
+
+ with cf.option_context("mode.use_inf_as_na", True):
+ assert not notna_f(np.inf)
+ assert not notna_f(-np.inf)
+
+ arr = np.array([1.5, np.inf, 3.5, -np.inf])
+ result = notna_f(arr)
+ assert result.sum() == 2
+
+ with cf.option_context("mode.use_inf_as_na", False):
+ for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
+ tm.makeObjectSeries(), tm.makeTimeSeries(),
+ tm.makePeriodSeries()]:
+ assert (isinstance(notna_f(s), Series))
+
+
+class TestIsNA(object):
+
+ def test_0d_array(self):
+ assert isna(np.array(np.nan))
+ assert not isna(np.array(0.0))
+ assert not isna(np.array(0))
+ # test object dtype
+ assert isna(np.array(np.nan, dtype=object))
+ assert not isna(np.array(0.0, dtype=object))
+ assert not isna(np.array(0, dtype=object))
+
+ def test_empty_object(self):
+
+ for shape in [(4, 0), (4,)]:
+ arr = np.empty(shape=shape, dtype=object)
+ result = isna(arr)
+ expected = np.ones(shape=shape, dtype=bool)
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('isna_f', [isna, isnull])
+ def test_isna_isnull(self, isna_f):
+ assert not isna_f(1.)
+ assert isna_f(None)
+ assert isna_f(np.NaN)
+        assert isna_f(float('nan'))
+ assert not isna_f(np.inf)
+ assert not isna_f(-np.inf)
+
+ # series
+ for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
+ tm.makeObjectSeries(), tm.makeTimeSeries(),
+ tm.makePeriodSeries()]:
+ assert isinstance(isna_f(s), Series)
+
+ # frame
+ for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
+ tm.makeMixedDataFrame()]:
+ result = isna_f(df)
+ expected = df.apply(isna_f)
+ tm.assert_frame_equal(result, expected)
+
+ # panel
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ for p in [tm.makePanel(), tm.makePeriodPanel(),
+ tm.add_nans(tm.makePanel())]:
+ result = isna_f(p)
+ expected = p.apply(isna_f)
+ tm.assert_panel_equal(result, expected)
+
+ def test_isna_lists(self):
+ result = isna([[False]])
+ exp = np.array([[False]])
+ tm.assert_numpy_array_equal(result, exp)
+
+ result = isna([[1], [2]])
+ exp = np.array([[False], [False]])
+ tm.assert_numpy_array_equal(result, exp)
+
+ # list of strings / unicode
+ result = isna(['foo', 'bar'])
+ exp = np.array([False, False])
+ tm.assert_numpy_array_equal(result, exp)
+
+ result = isna([u('foo'), u('bar')])
+ exp = np.array([False, False])
+ tm.assert_numpy_array_equal(result, exp)
+
+ # GH20675
+ result = isna([np.NaN, 'world'])
+ exp = np.array([True, False])
+ tm.assert_numpy_array_equal(result, exp)
+
+ def test_isna_nat(self):
+ result = isna([NaT])
+ exp = np.array([True])
+ tm.assert_numpy_array_equal(result, exp)
+
+ result = isna(np.array([NaT], dtype=object))
+ exp = np.array([True])
+ tm.assert_numpy_array_equal(result, exp)
+
+ def test_isna_numpy_nat(self):
+ arr = np.array([NaT, np.datetime64('NaT'), np.timedelta64('NaT'),
+ np.datetime64('NaT', 's')])
+ result = isna(arr)
+ expected = np.array([True] * 4)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_isna_datetime(self):
+ assert not isna(datetime.now())
+ assert notna(datetime.now())
+
+ idx = date_range('1/1/1990', periods=20)
+ exp = np.ones(len(idx), dtype=bool)
+ tm.assert_numpy_array_equal(notna(idx), exp)
+
+ idx = np.asarray(idx)
+ idx[0] = iNaT
+ idx = DatetimeIndex(idx)
+ mask = isna(idx)
+ assert mask[0]
+ exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool)
+ tm.assert_numpy_array_equal(mask, exp)
+
+ # GH 9129
+ pidx = idx.to_period(freq='M')
+ mask = isna(pidx)
+ assert mask[0]
+ exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool)
+ tm.assert_numpy_array_equal(mask, exp)
+
+ mask = isna(pidx[1:])
+ exp = np.zeros(len(mask), dtype=bool)
+ tm.assert_numpy_array_equal(mask, exp)
+
+ @pytest.mark.parametrize(
+ "value, expected",
+ [(np.complex128(np.nan), True),
+ (np.float64(1), False),
+ (np.array([1, 1 + 0j, np.nan, 3]),
+ np.array([False, False, True, False])),
+ (np.array([1, 1 + 0j, np.nan, 3], dtype=object),
+ np.array([False, False, True, False])),
+ (np.array([1, 1 + 0j, np.nan, 3]).astype(object),
+ np.array([False, False, True, False]))])
+ def test_complex(self, value, expected):
+ result = isna(value)
+ if is_scalar(result):
+ assert result is expected
+ else:
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_datetime_other_units(self):
+ idx = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-02'])
+ exp = np.array([False, True, False])
+ tm.assert_numpy_array_equal(isna(idx), exp)
+ tm.assert_numpy_array_equal(notna(idx), ~exp)
+ tm.assert_numpy_array_equal(isna(idx.values), exp)
+ tm.assert_numpy_array_equal(notna(idx.values), ~exp)
+
+ for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]',
+ 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]',
+ 'datetime64[ns]']:
+ values = idx.values.astype(dtype)
+
+ exp = np.array([False, True, False])
+ tm.assert_numpy_array_equal(isna(values), exp)
+ tm.assert_numpy_array_equal(notna(values), ~exp)
+
+ exp = pd.Series([False, True, False])
+ s = pd.Series(values)
+ tm.assert_series_equal(isna(s), exp)
+ tm.assert_series_equal(notna(s), ~exp)
+ s = pd.Series(values, dtype=object)
+ tm.assert_series_equal(isna(s), exp)
+ tm.assert_series_equal(notna(s), ~exp)
+
+ def test_timedelta_other_units(self):
+ idx = pd.TimedeltaIndex(['1 days', 'NaT', '2 days'])
+ exp = np.array([False, True, False])
+ tm.assert_numpy_array_equal(isna(idx), exp)
+ tm.assert_numpy_array_equal(notna(idx), ~exp)
+ tm.assert_numpy_array_equal(isna(idx.values), exp)
+ tm.assert_numpy_array_equal(notna(idx.values), ~exp)
+
+ for dtype in ['timedelta64[D]', 'timedelta64[h]', 'timedelta64[m]',
+ 'timedelta64[s]', 'timedelta64[ms]', 'timedelta64[us]',
+ 'timedelta64[ns]']:
+ values = idx.values.astype(dtype)
+
+ exp = np.array([False, True, False])
+ tm.assert_numpy_array_equal(isna(values), exp)
+ tm.assert_numpy_array_equal(notna(values), ~exp)
+
+ exp = pd.Series([False, True, False])
+ s = pd.Series(values)
+ tm.assert_series_equal(isna(s), exp)
+ tm.assert_series_equal(notna(s), ~exp)
+ s = pd.Series(values, dtype=object)
+ tm.assert_series_equal(isna(s), exp)
+ tm.assert_series_equal(notna(s), ~exp)
+
+ def test_period(self):
+ idx = pd.PeriodIndex(['2011-01', 'NaT', '2012-01'], freq='M')
+ exp = np.array([False, True, False])
+ tm.assert_numpy_array_equal(isna(idx), exp)
+ tm.assert_numpy_array_equal(notna(idx), ~exp)
+
+ exp = pd.Series([False, True, False])
+ s = pd.Series(idx)
+ tm.assert_series_equal(isna(s), exp)
+ tm.assert_series_equal(notna(s), ~exp)
+ s = pd.Series(idx, dtype=object)
+ tm.assert_series_equal(isna(s), exp)
+ tm.assert_series_equal(notna(s), ~exp)
+
+
+def test_array_equivalent():
+ assert array_equivalent(np.array([np.nan, np.nan]),
+ np.array([np.nan, np.nan]))
+ assert array_equivalent(np.array([np.nan, 1, np.nan]),
+ np.array([np.nan, 1, np.nan]))
+ assert array_equivalent(np.array([np.nan, None], dtype='object'),
+ np.array([np.nan, None], dtype='object'))
+ assert array_equivalent(np.array([np.nan, 1 + 1j], dtype='complex'),
+ np.array([np.nan, 1 + 1j], dtype='complex'))
+ assert not array_equivalent(
+ np.array([np.nan, 1 + 1j], dtype='complex'), np.array(
+ [np.nan, 1 + 2j], dtype='complex'))
+ assert not array_equivalent(
+ np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan]))
+ assert not array_equivalent(
+ np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e']))
+ assert array_equivalent(Float64Index([0, np.nan]),
+ Float64Index([0, np.nan]))
+ assert not array_equivalent(
+ Float64Index([0, np.nan]), Float64Index([1, np.nan]))
+ assert array_equivalent(DatetimeIndex([0, np.nan]),
+ DatetimeIndex([0, np.nan]))
+ assert not array_equivalent(
+ DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]))
+ assert array_equivalent(TimedeltaIndex([0, np.nan]),
+ TimedeltaIndex([0, np.nan]))
+ assert not array_equivalent(
+ TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan]))
+ with catch_warnings():
+ filterwarnings("ignore", "Converting timezone", FutureWarning)
+ assert array_equivalent(DatetimeIndex([0, np.nan], tz='US/Eastern'),
+ DatetimeIndex([0, np.nan], tz='US/Eastern'))
+ assert not array_equivalent(
+ DatetimeIndex([0, np.nan], tz='US/Eastern'), DatetimeIndex(
+ [1, np.nan], tz='US/Eastern'))
+ assert not array_equivalent(
+ DatetimeIndex([0, np.nan]), DatetimeIndex(
+ [0, np.nan], tz='US/Eastern'))
+ assert not array_equivalent(
+ DatetimeIndex([0, np.nan], tz='CET'), DatetimeIndex(
+ [0, np.nan], tz='US/Eastern'))
+
+ assert not array_equivalent(
+ DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan]))
+
+
+def test_array_equivalent_compat():
+ # see gh-13388
+ m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)])
+ n = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)])
+ assert (array_equivalent(m, n, strict_nan=True))
+ assert (array_equivalent(m, n, strict_nan=False))
+
+ m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)])
+ n = np.array([(1, 2), (4, 3)], dtype=[('a', int), ('b', float)])
+ assert (not array_equivalent(m, n, strict_nan=True))
+ assert (not array_equivalent(m, n, strict_nan=False))
+
+ m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)])
+ n = np.array([(1, 2), (3, 4)], dtype=[('b', int), ('a', float)])
+ assert (not array_equivalent(m, n, strict_nan=True))
+ assert (not array_equivalent(m, n, strict_nan=False))
+
+
+def test_array_equivalent_str():
+ for dtype in ['O', 'S', 'U']:
+ assert array_equivalent(np.array(['A', 'B'], dtype=dtype),
+ np.array(['A', 'B'], dtype=dtype))
+ assert not array_equivalent(np.array(['A', 'B'], dtype=dtype),
+ np.array(['A', 'X'], dtype=dtype))
+
+
[email protected]('dtype, na_value', [
+ # Datetime-like
+ (np.dtype("M8[ns]"), NaT),
+ (np.dtype("m8[ns]"), NaT),
+ (DatetimeTZDtype.construct_from_string('datetime64[ns, US/Eastern]'), NaT),
+ (PeriodDtype("M"), NaT),
+ # Integer
+ ('u1', 0), ('u2', 0), ('u4', 0), ('u8', 0),
+ ('i1', 0), ('i2', 0), ('i4', 0), ('i8', 0),
+ # Bool
+ ('bool', False),
+ # Float
+ ('f2', np.nan), ('f4', np.nan), ('f8', np.nan),
+ # Object
+ ('O', np.nan),
+ # Interval
+ (IntervalDtype(), np.nan),
+])
+def test_na_value_for_dtype(dtype, na_value):
+ result = na_value_for_dtype(dtype)
+ assert result is na_value
+
+
+class TestNAObj(object):
+
+ _1d_methods = ['isnaobj', 'isnaobj_old']
+ _2d_methods = ['isnaobj2d', 'isnaobj2d_old']
+
+ def _check_behavior(self, arr, expected):
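+        # validate against both the 1-D and 2-D checkers in libmissing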
+ for method in TestNAObj._1d_methods:
+ result = getattr(libmissing, method)(arr)
+ tm.assert_numpy_array_equal(result, expected)
+
+ arr = np.atleast_2d(arr)
+ expected = np.atleast_2d(expected)
+
+ for method in TestNAObj._2d_methods:
+ result = getattr(libmissing, method)(arr)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_basic(self):
+ arr = np.array([1, None, 'foo', -5.1, pd.NaT, np.nan])
+ expected = np.array([False, True, False, False, True, True])
+
+ self._check_behavior(arr, expected)
+
+ def test_non_obj_dtype(self):
+ arr = np.array([1, 3, np.nan, 5], dtype=float)
+ expected = np.array([False, False, True, False])
+
+ self._check_behavior(arr, expected)
+
+ def test_empty_arr(self):
+ arr = np.array([])
+ expected = np.array([], dtype=bool)
+
+ self._check_behavior(arr, expected)
+
+ def test_empty_str_inp(self):
+ arr = np.array([""]) # empty but not na
+ expected = np.array([False])
+
+ self._check_behavior(arr, expected)
+
+ def test_empty_like(self):
+ # see gh-13717: no segfaults!
+ arr = np.empty_like([None])
+ expected = np.array([True])
+
+ self._check_behavior(arr, expected)
+
+
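+# numpy datetime64/timedelta64 units used to build the NaT values below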
+m8_units = ['as', 'ps', 'ns', 'us', 'ms', 's',
+ 'm', 'h', 'D', 'W', 'M', 'Y']
+
+na_vals = [
+ None,
+ NaT,
+ float('NaN'),
+ complex('NaN'),
+ np.nan,
+ np.float64('NaN'),
+ np.float32('NaN'),
+ np.complex64(np.nan),
+ np.complex128(np.nan),
+ np.datetime64('NaT'),
+ np.timedelta64('NaT'),
+] + [
+ np.datetime64('NaT', unit) for unit in m8_units
+] + [
+ np.timedelta64('NaT', unit) for unit in m8_units
+]
+
+inf_vals = [
+ float('inf'),
+ float('-inf'),
+ complex('inf'),
+ complex('-inf'),
+ np.inf,
+ np.NINF,
+]
+
+int_na_vals = [
+ # Values that match iNaT, which we treat as null in specific cases
+ np.int64(NaT.value),
+ int(NaT.value),
+]
+
+sometimes_na_vals = [
+ Decimal('NaN'),
+]
+
+never_na_vals = [
+ # float/complex values that when viewed as int64 match iNaT
+ -0.0,
+ np.float64('-0.0'),
+ -0j,
+ np.complex64(-0j),
+]
+
+
+class TestLibMissing(object):
+ def test_checknull(self):
+ for value in na_vals:
+ assert libmissing.checknull(value)
+
+ for value in inf_vals:
+ assert not libmissing.checknull(value)
+
+ for value in int_na_vals:
+ assert not libmissing.checknull(value)
+
+ for value in sometimes_na_vals:
+ assert not libmissing.checknull(value)
+
+ for value in never_na_vals:
+ assert not libmissing.checknull(value)
+
+    def test_checknull_old(self):
+ for value in na_vals:
+ assert libmissing.checknull_old(value)
+
+ for value in inf_vals:
+ assert libmissing.checknull_old(value)
+
+ for value in int_na_vals:
+ assert not libmissing.checknull_old(value)
+
+ for value in sometimes_na_vals:
+ assert not libmissing.checknull_old(value)
+
+ for value in never_na_vals:
+ assert not libmissing.checknull_old(value)
+
+ def test_is_null_datetimelike(self):
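+        # the second positional argument toggles whether integers equal to
+        # iNaT are treated as null (cf. the ``int_na_vals`` cases below)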
+ for value in na_vals:
+ assert is_null_datetimelike(value)
+ assert is_null_datetimelike(value, False)
+
+ for value in inf_vals:
+ assert not is_null_datetimelike(value)
+ assert not is_null_datetimelike(value, False)
+
+ for value in int_na_vals:
+ assert is_null_datetimelike(value)
+ assert not is_null_datetimelike(value, False)
+
+ for value in sometimes_na_vals:
+ assert not is_null_datetimelike(value)
+ assert not is_null_datetimelike(value, False)
+
+ for value in never_na_vals:
+ assert not is_null_datetimelike(value)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/__init__.py b/contrib/python/pandas/py2/pandas/tests/extension/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/arrow/__init__.py b/contrib/python/pandas/py2/pandas/tests/extension/arrow/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/arrow/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/arrow/bool.py b/contrib/python/pandas/py2/pandas/tests/extension/arrow/bool.py
new file mode 100644
index 00000000000..025c4cacd8f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/arrow/bool.py
@@ -0,0 +1,144 @@
+"""Rudimentary Apache Arrow-backed ExtensionArray.
+
+At the moment, just a boolean array / type is implemented.
+Eventually, we'll want to parametrize the type and support
+multiple dtypes. Not all methods are implemented yet, and the
+current implementation is not efficient.
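+
+A minimal usage sketch (illustrative only, assuming pyarrow is available):
+
+    >>> import pandas as pd
+    >>> arr = ArrowBoolArray.from_scalars([True, False, None])
+    >>> pd.Series(arr).dtype.name
+    'arrow_bool'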
+"""
+import copy
+import itertools
+
+import numpy as np
+import pyarrow as pa
+
+import pandas as pd
+from pandas.api.extensions import (
+ ExtensionArray, ExtensionDtype, register_extension_dtype, take)
+
+
+@register_extension_dtype
+class ArrowBoolDtype(ExtensionDtype):
+
+ type = np.bool_
+ kind = 'b'
+ name = 'arrow_bool'
+ na_value = pa.NULL
+
+ @classmethod
+ def construct_from_string(cls, string):
+ if string == cls.name:
+ return cls()
+ else:
+ raise TypeError("Cannot construct a '{}' from "
+ "'{}'".format(cls, string))
+
+ @classmethod
+ def construct_array_type(cls):
+ return ArrowBoolArray
+
+ def _is_boolean(self):
+ return True
+
+
+class ArrowBoolArray(ExtensionArray):
+ def __init__(self, values):
+ if not isinstance(values, pa.ChunkedArray):
+ raise ValueError
+
+ assert values.type == pa.bool_()
+ self._data = values
+ self._dtype = ArrowBoolDtype()
+
+ def __repr__(self):
+ return "ArrowBoolArray({})".format(repr(self._data))
+
+ @classmethod
+ def from_scalars(cls, values):
+ arr = pa.chunked_array([pa.array(np.asarray(values))])
+ return cls(arr)
+
+ @classmethod
+ def from_array(cls, arr):
+ assert isinstance(arr, pa.Array)
+ return cls(pa.chunked_array([arr]))
+
+ @classmethod
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ return cls.from_scalars(scalars)
+
+ def __getitem__(self, item):
+ if pd.api.types.is_scalar(item):
+ return self._data.to_pandas()[item]
+ else:
+ vals = self._data.to_pandas()[item]
+ return type(self).from_scalars(vals)
+
+ def __len__(self):
+ return len(self._data)
+
+ def astype(self, dtype, copy=True):
+ # needed to fix this astype for the Series constructor.
+ if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
+ if copy:
+ return self.copy()
+ return self
+ return super(ArrowBoolArray, self).astype(dtype, copy)
+
+ @property
+ def dtype(self):
+ return self._dtype
+
+ @property
+ def nbytes(self):
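+        # sum the sizes of every Arrow buffer backing the chunked array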
+ return sum(x.size for chunk in self._data.chunks
+ for x in chunk.buffers()
+ if x is not None)
+
+ def isna(self):
+ nas = pd.isna(self._data.to_pandas())
+ return type(self).from_scalars(nas)
+
+ def take(self, indices, allow_fill=False, fill_value=None):
+ data = self._data.to_pandas()
+
+ if allow_fill and fill_value is None:
+ fill_value = self.dtype.na_value
+
+ result = take(data, indices, fill_value=fill_value,
+ allow_fill=allow_fill)
+ return self._from_sequence(result, dtype=self.dtype)
+
+ def copy(self, deep=False):
+ if deep:
+ return type(self)(copy.deepcopy(self._data))
+ else:
+ return type(self)(copy.copy(self._data))
+
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+ chunks = list(itertools.chain.from_iterable(x._data.chunks
+ for x in to_concat))
+ arr = pa.chunked_array(chunks)
+ return cls(arr)
+
+ def __invert__(self):
+ return type(self).from_scalars(
+ ~self._data.to_pandas()
+ )
+
+ def _reduce(self, method, skipna=True, **kwargs):
+ if skipna:
+ arr = self[~self.isna()]
+ else:
+ arr = self
+
+ try:
+ op = getattr(arr, method)
+ except AttributeError:
+ raise TypeError
+ return op(**kwargs)
+
+ def any(self, axis=0, out=None):
+ return self._data.to_pandas().any()
+
+ def all(self, axis=0, out=None):
+ return self._data.to_pandas().all()
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/arrow/test_bool.py b/contrib/python/pandas/py2/pandas/tests/extension/arrow/test_bool.py
new file mode 100644
index 00000000000..15ceb6adff5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/arrow/test_bool.py
@@ -0,0 +1,68 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.tests.extension import base
+import pandas.util.testing as tm
+
+pytest.importorskip('pyarrow', minversion="0.10.0")
+
+from .bool import ArrowBoolArray, ArrowBoolDtype # isort:skip
+
+
[email protected]
+def dtype():
+ return ArrowBoolDtype()
+
+
[email protected]
+def data():
+ return ArrowBoolArray.from_scalars(np.random.randint(0, 2, size=100,
+ dtype=bool))
+
+
[email protected]
+def data_missing():
+ return ArrowBoolArray.from_scalars([None, True])
+
+
+class BaseArrowTests(object):
+ pass
+
+
+class TestDtype(BaseArrowTests, base.BaseDtypeTests):
+ def test_array_type_with_arg(self, data, dtype):
+ pytest.skip("GH-22666")
+
+
+class TestInterface(BaseArrowTests, base.BaseInterfaceTests):
+ def test_repr(self, data):
+ raise pytest.skip("TODO")
+
+
+class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
+ def test_from_dtype(self, data):
+ pytest.skip("GH-22666")
+
+    # seems like a bug: isna on an empty BoolArray returns floats.
+ @pytest.mark.xfail(reason='bad is-na for empty data')
+ def test_from_sequence_from_cls(self, data):
+ super(TestConstructors, self).test_from_sequence_from_cls(data)
+
+
+class TestReduce(base.BaseNoReduceTests):
+ def test_reduce_series_boolean(self):
+ pass
+
+
+class TestReduceBoolean(base.BaseBooleanReduceTests):
+ pass
+
+
+def test_is_bool_dtype(data):
+ assert pd.api.types.is_bool_dtype(data)
+ assert pd.core.common.is_bool_indexer(data)
+ s = pd.Series(range(len(data)))
+ result = s[data]
+ expected = s[np.asarray(data)]
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/__init__.py b/contrib/python/pandas/py2/pandas/tests/extension/base/__init__.py
new file mode 100644
index 00000000000..1f7ee2ae17e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/__init__.py
@@ -0,0 +1,56 @@
+"""Base test suite for extension arrays.
+
+These tests are intended for third-party libraries to subclass to validate
+that their extension arrays and dtypes satisfy the interface. Moving or
+renaming the tests should not be done lightly.
+
+Libraries are expected to implement a few pytest fixtures to provide data
+for the tests. The fixtures may be located in either
+
+* The same module as your test class.
+* A ``conftest.py`` in the same directory as your test class.
+
+The full list of fixtures may be found in the ``conftest.py`` next to this
+file.
+
+.. code-block:: python
+
+ import pytest
+ from pandas.tests.extension.base import BaseDtypeTests
+
+
+ @pytest.fixture
+ def dtype():
+ return MyDtype()
+
+
+ class TestMyDtype(BaseDtypeTests):
+ pass
+
+
+Your class ``TestMyDtype`` will inherit all the tests defined on
+``BaseDtypeTests``. pytest's fixture discovery will supply your ``dtype``
+wherever a test requires it. You're free to implement additional tests.
+
+All the tests in these modules use ``self.assert_frame_equal`` or
+``self.assert_series_equal`` for dataframe or series comparisons. By default,
+they use the usual ``pandas.testing.assert_frame_equal`` and
+``pandas.testing.assert_series_equal``. You can override the checks used
+by defining the staticmethods ``assert_frame_equal`` and
+``assert_series_equal`` on your base test class.
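+
+For example, to route Series comparisons through your own checker (a sketch;
+``my_assert_series_equal`` is a hypothetical helper you would provide):
+
+.. code-block:: python
+
+    class TestMyDtype(BaseDtypeTests):
+        assert_series_equal = staticmethod(my_assert_series_equal)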
+
+"""
+from .casting import BaseCastingTests # noqa
+from .constructors import BaseConstructorsTests # noqa
+from .dtype import BaseDtypeTests # noqa
+from .getitem import BaseGetitemTests # noqa
+from .groupby import BaseGroupbyTests # noqa
+from .interface import BaseInterfaceTests # noqa
+from .methods import BaseMethodsTests # noqa
+from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa
+from .printing import BasePrintingTests # noqa
+from .reduce import BaseNoReduceTests, BaseNumericReduceTests, BaseBooleanReduceTests # noqa
+from .missing import BaseMissingTests # noqa
+from .reshaping import BaseReshapingTests # noqa
+from .setitem import BaseSetitemTests # noqa
+from .io import BaseParsingTests # noqa
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/base.py b/contrib/python/pandas/py2/pandas/tests/extension/base/base.py
new file mode 100644
index 00000000000..2a4a1b9c466
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/base.py
@@ -0,0 +1,10 @@
+import pandas.util.testing as tm
+
+
+class BaseExtensionTests(object):
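+    # Comparison helpers shared by the whole base test suite; subclasses may
+    # override these staticmethods to customize equality checks.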
+ assert_equal = staticmethod(tm.assert_equal)
+ assert_series_equal = staticmethod(tm.assert_series_equal)
+ assert_frame_equal = staticmethod(tm.assert_frame_equal)
+ assert_extension_array_equal = staticmethod(
+ tm.assert_extension_array_equal
+ )
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/casting.py b/contrib/python/pandas/py2/pandas/tests/extension/base/casting.py
new file mode 100644
index 00000000000..7146443bf8d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/casting.py
@@ -0,0 +1,23 @@
+import pandas as pd
+from pandas.core.internals import ObjectBlock
+
+from .base import BaseExtensionTests
+
+
+class BaseCastingTests(BaseExtensionTests):
+ """Casting to and from ExtensionDtypes"""
+
+ def test_astype_object_series(self, all_data):
+ ser = pd.Series({"A": all_data})
+ result = ser.astype(object)
+ assert isinstance(result._data.blocks[0], ObjectBlock)
+
+ def test_tolist(self, data):
+ result = pd.Series(data).tolist()
+ expected = list(data)
+ assert result == expected
+
+ def test_astype_str(self, data):
+ result = pd.Series(data[:5]).astype(str)
+ expected = pd.Series(data[:5].astype(str))
+ self.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/constructors.py b/contrib/python/pandas/py2/pandas/tests/extension/base/constructors.py
new file mode 100644
index 00000000000..231a1f648f8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/constructors.py
@@ -0,0 +1,77 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.core.internals import ExtensionBlock
+
+from .base import BaseExtensionTests
+
+
+class BaseConstructorsTests(BaseExtensionTests):
+
+ def test_from_sequence_from_cls(self, data):
+ result = type(data)._from_sequence(data, dtype=data.dtype)
+ self.assert_extension_array_equal(result, data)
+
+ data = data[:0]
+ result = type(data)._from_sequence(data, dtype=data.dtype)
+ self.assert_extension_array_equal(result, data)
+
+ def test_array_from_scalars(self, data):
+ scalars = [data[0], data[1], data[2]]
+ result = data._from_sequence(scalars)
+ assert isinstance(result, type(data))
+
+ def test_series_constructor(self, data):
+ result = pd.Series(data)
+ assert result.dtype == data.dtype
+ assert len(result) == len(data)
+ assert isinstance(result._data.blocks[0], ExtensionBlock)
+ assert result._data.blocks[0].values is data
+
+ # Series[EA] is unboxed / boxed correctly
+ result2 = pd.Series(result)
+ assert result2.dtype == data.dtype
+ assert isinstance(result2._data.blocks[0], ExtensionBlock)
+
+ @pytest.mark.parametrize("from_series", [True, False])
+ def test_dataframe_constructor_from_dict(self, data, from_series):
+ if from_series:
+ data = pd.Series(data)
+ result = pd.DataFrame({"A": data})
+ assert result.dtypes['A'] == data.dtype
+ assert result.shape == (len(data), 1)
+ assert isinstance(result._data.blocks[0], ExtensionBlock)
+
+ def test_dataframe_from_series(self, data):
+ result = pd.DataFrame(pd.Series(data))
+ assert result.dtypes[0] == data.dtype
+ assert result.shape == (len(data), 1)
+ assert isinstance(result._data.blocks[0], ExtensionBlock)
+
+ def test_series_given_mismatched_index_raises(self, data):
+ msg = 'Length of passed values is 3, index implies 5'
+ with pytest.raises(ValueError, match=msg):
+ pd.Series(data[:3], index=[0, 1, 2, 3, 4])
+
+ def test_from_dtype(self, data):
+ # construct from our dtype & string dtype
+ dtype = data.dtype
+
+ expected = pd.Series(data)
+ result = pd.Series(list(data), dtype=dtype)
+ self.assert_series_equal(result, expected)
+
+ result = pd.Series(list(data), dtype=str(dtype))
+ self.assert_series_equal(result, expected)
+
+ def test_pandas_array(self, data):
+ # pd.array(extension_array) should be idempotent...
+ result = pd.array(data)
+ self.assert_extension_array_equal(result, data)
+
+ def test_pandas_array_dtype(self, data):
+ # ... but specifying dtype will override idempotency
+ result = pd.array(data, dtype=np.dtype(object))
+ expected = pd.arrays.PandasArray(np.asarray(data, dtype=object))
+ self.assert_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/dtype.py b/contrib/python/pandas/py2/pandas/tests/extension/base/dtype.py
new file mode 100644
index 00000000000..e9d1f183812
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/dtype.py
@@ -0,0 +1,91 @@
+import warnings
+
+import numpy as np
+
+import pandas as pd
+
+from .base import BaseExtensionTests
+
+
+class BaseDtypeTests(BaseExtensionTests):
+ """Base class for ExtensionDtype classes"""
+
+ def test_name(self, dtype):
+ assert isinstance(dtype.name, str)
+
+ def test_kind(self, dtype):
+ valid = set('biufcmMOSUV')
+ if dtype.kind is not None:
+ assert dtype.kind in valid
+
+ def test_construct_from_string_own_name(self, dtype):
+ result = dtype.construct_from_string(dtype.name)
+ assert type(result) is type(dtype)
+
+ # check OK as classmethod
+ result = type(dtype).construct_from_string(dtype.name)
+ assert type(result) is type(dtype)
+
+ def test_is_dtype_from_name(self, dtype):
+ result = type(dtype).is_dtype(dtype.name)
+ assert result is True
+
+ def test_is_dtype_unboxes_dtype(self, data, dtype):
+ assert dtype.is_dtype(data) is True
+
+ def test_is_dtype_from_self(self, dtype):
+ result = type(dtype).is_dtype(dtype)
+ assert result is True
+
+ def test_is_not_string_type(self, dtype):
+        assert not pd.api.types.is_string_dtype(dtype)
+
+ def test_is_not_object_type(self, dtype):
+        assert not pd.api.types.is_object_dtype(dtype)
+
+ def test_eq_with_str(self, dtype):
+ assert dtype == dtype.name
+ assert dtype != dtype.name + '-suffix'
+
+ def test_eq_with_numpy_object(self, dtype):
+ assert dtype != np.dtype('object')
+
+ def test_eq_with_self(self, dtype):
+ assert dtype == dtype
+ assert dtype != object()
+
+ def test_array_type(self, data, dtype):
+ assert dtype.construct_array_type() is type(data)
+
+ def test_check_dtype(self, data):
+ dtype = data.dtype
+
+ # check equivalency for using .dtypes
+ df = pd.DataFrame({'A': pd.Series(data, dtype=dtype),
+ 'B': data,
+ 'C': 'foo', 'D': 1})
+
+ # np.dtype('int64') == 'Int64' == 'int64'
+ # so can't distinguish
+ if dtype.name == 'Int64':
+ expected = pd.Series([True, True, False, True],
+ index=list('ABCD'))
+ else:
+ expected = pd.Series([True, True, False, False],
+ index=list('ABCD'))
+
+ # XXX: This should probably be *fixed* not ignored.
+ # See libops.scalar_compare
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore", DeprecationWarning)
+ result = df.dtypes == str(dtype)
+
+ self.assert_series_equal(result, expected)
+
+ expected = pd.Series([True, True, False, False],
+ index=list('ABCD'))
+ result = df.dtypes.apply(str) == str(dtype)
+ self.assert_series_equal(result, expected)
+
+ def test_hashable(self, dtype):
+ hash(dtype) # no error
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/getitem.py b/contrib/python/pandas/py2/pandas/tests/extension/base/getitem.py
new file mode 100644
index 00000000000..dfc82c6041e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/getitem.py
@@ -0,0 +1,248 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+
+from .base import BaseExtensionTests
+
+
+class BaseGetitemTests(BaseExtensionTests):
+ """Tests for ExtensionArray.__getitem__."""
+
+ def test_iloc_series(self, data):
+ ser = pd.Series(data)
+ result = ser.iloc[:4]
+ expected = pd.Series(data[:4])
+ self.assert_series_equal(result, expected)
+
+ result = ser.iloc[[0, 1, 2, 3]]
+ self.assert_series_equal(result, expected)
+
+ def test_iloc_frame(self, data):
+ df = pd.DataFrame({"A": data, 'B':
+ np.arange(len(data), dtype='int64')})
+ expected = pd.DataFrame({"A": data[:4]})
+
+ # slice -> frame
+ result = df.iloc[:4, [0]]
+ self.assert_frame_equal(result, expected)
+
+ # sequence -> frame
+ result = df.iloc[[0, 1, 2, 3], [0]]
+ self.assert_frame_equal(result, expected)
+
+ expected = pd.Series(data[:4], name='A')
+
+ # slice -> series
+ result = df.iloc[:4, 0]
+ self.assert_series_equal(result, expected)
+
+ # sequence -> series
+ result = df.iloc[:4, 0]
+ self.assert_series_equal(result, expected)
+
+ def test_loc_series(self, data):
+ ser = pd.Series(data)
+ result = ser.loc[:3]
+ expected = pd.Series(data[:4])
+ self.assert_series_equal(result, expected)
+
+ result = ser.loc[[0, 1, 2, 3]]
+ self.assert_series_equal(result, expected)
+
+ def test_loc_frame(self, data):
+ df = pd.DataFrame({"A": data,
+ 'B': np.arange(len(data), dtype='int64')})
+ expected = pd.DataFrame({"A": data[:4]})
+
+ # slice -> frame
+ result = df.loc[:3, ['A']]
+ self.assert_frame_equal(result, expected)
+
+ # sequence -> frame
+ result = df.loc[[0, 1, 2, 3], ['A']]
+ self.assert_frame_equal(result, expected)
+
+ expected = pd.Series(data[:4], name='A')
+
+ # slice -> series
+ result = df.loc[:3, 'A']
+ self.assert_series_equal(result, expected)
+
+ # sequence -> series
+ result = df.loc[:3, 'A']
+ self.assert_series_equal(result, expected)
+
+ def test_getitem_scalar(self, data):
+ result = data[0]
+ assert isinstance(result, data.dtype.type)
+
+ result = pd.Series(data)[0]
+ assert isinstance(result, data.dtype.type)
+
+ def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
+ result = data_missing[0]
+ assert na_cmp(result, na_value)
+
+ def test_getitem_mask(self, data):
+ # Empty mask, raw array
+ mask = np.zeros(len(data), dtype=bool)
+ result = data[mask]
+ assert len(result) == 0
+ assert isinstance(result, type(data))
+
+ # Empty mask, in series
+ mask = np.zeros(len(data), dtype=bool)
+ result = pd.Series(data)[mask]
+ assert len(result) == 0
+ assert result.dtype == data.dtype
+
+ # non-empty mask, raw array
+ mask[0] = True
+ result = data[mask]
+ assert len(result) == 1
+ assert isinstance(result, type(data))
+
+ # non-empty mask, in series
+ result = pd.Series(data)[mask]
+ assert len(result) == 1
+ assert result.dtype == data.dtype
+
+ def test_getitem_slice(self, data):
+ # getitem[slice] should return an array
+ result = data[slice(0)] # empty
+ assert isinstance(result, type(data))
+
+        result = data[slice(1)]  # single element, still array-like
+ assert isinstance(result, type(data))
+
+ def test_get(self, data):
+ # GH 20882
+ s = pd.Series(data, index=[2 * i for i in range(len(data))])
+ assert s.get(4) == s.iloc[2]
+
+ result = s.get([4, 6])
+ expected = s.iloc[[2, 3]]
+ self.assert_series_equal(result, expected)
+
+ result = s.get(slice(2))
+ expected = s.iloc[[0, 1]]
+ self.assert_series_equal(result, expected)
+
+ assert s.get(-1) is None
+ assert s.get(s.index.max() + 1) is None
+
+ s = pd.Series(data[:6], index=list('abcdef'))
+ assert s.get('c') == s.iloc[2]
+
+ result = s.get(slice('b', 'd'))
+ expected = s.iloc[[1, 2, 3]]
+ self.assert_series_equal(result, expected)
+
+ result = s.get('Z')
+ assert result is None
+
+ assert s.get(4) == s.iloc[4]
+ assert s.get(-1) == s.iloc[-1]
+ assert s.get(len(s)) is None
+
+ # GH 21257
+ s = pd.Series(data)
+ s2 = s[::2]
+ assert s2.get(1) is None
+
+ def test_take_sequence(self, data):
+ result = pd.Series(data)[[0, 1, 3]]
+ assert result.iloc[0] == data[0]
+ assert result.iloc[1] == data[1]
+ assert result.iloc[2] == data[3]
+
+ def test_take(self, data, na_value, na_cmp):
+ result = data.take([0, -1])
+ assert result.dtype == data.dtype
+ assert result[0] == data[0]
+ assert result[1] == data[-1]
+
+ result = data.take([0, -1], allow_fill=True, fill_value=na_value)
+ assert result[0] == data[0]
+ assert na_cmp(result[1], na_value)
+
+ with pytest.raises(IndexError, match="out of bounds"):
+ data.take([len(data) + 1])
+
+ def test_take_empty(self, data, na_value, na_cmp):
+ empty = data[:0]
+
+ result = empty.take([-1], allow_fill=True)
+ assert na_cmp(result[0], na_value)
+
+ with pytest.raises(IndexError):
+ empty.take([-1])
+
+ with pytest.raises(IndexError, match="cannot do a non-empty take"):
+ empty.take([0, 1])
+
+ def test_take_negative(self, data):
+ # https://github.com/pandas-dev/pandas/issues/20640
+ n = len(data)
+ result = data.take([0, -n, n - 1, -1])
+ expected = data.take([0, 0, n - 1, n - 1])
+ self.assert_extension_array_equal(result, expected)
+
+ def test_take_non_na_fill_value(self, data_missing):
+ fill_value = data_missing[1] # valid
+ na = data_missing[0]
+
+ array = data_missing._from_sequence([na, fill_value, na])
+ result = array.take([-1, 1], fill_value=fill_value, allow_fill=True)
+ expected = array.take([1, 1])
+ self.assert_extension_array_equal(result, expected)
+
+ def test_take_pandas_style_negative_raises(self, data, na_value):
+ with pytest.raises(ValueError):
+ data.take([0, -2], fill_value=na_value, allow_fill=True)
+
+ @pytest.mark.parametrize('allow_fill', [True, False])
+ def test_take_out_of_bounds_raises(self, data, allow_fill):
+ arr = data[:3]
+ with pytest.raises(IndexError):
+ arr.take(np.asarray([0, 3]), allow_fill=allow_fill)
+
+ def test_take_series(self, data):
+ s = pd.Series(data)
+ result = s.take([0, -1])
+ expected = pd.Series(
+ data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
+ index=[0, len(data) - 1])
+ self.assert_series_equal(result, expected)
+
+ def test_reindex(self, data, na_value):
+ s = pd.Series(data)
+ result = s.reindex([0, 1, 3])
+ expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3])
+ self.assert_series_equal(result, expected)
+
+ n = len(data)
+ result = s.reindex([-1, 0, n])
+ expected = pd.Series(
+ data._from_sequence([na_value, data[0], na_value],
+ dtype=s.dtype),
+ index=[-1, 0, n])
+ self.assert_series_equal(result, expected)
+
+ result = s.reindex([n, n + 1])
+ expected = pd.Series(data._from_sequence([na_value, na_value],
+ dtype=s.dtype),
+ index=[n, n + 1])
+ self.assert_series_equal(result, expected)
+
+ def test_reindex_non_na_fill_value(self, data_missing):
+ valid = data_missing[1]
+ na = data_missing[0]
+
+ array = data_missing._from_sequence([na, valid])
+ ser = pd.Series(array)
+ result = ser.reindex([0, 1, 2], fill_value=valid)
+ expected = pd.Series(data_missing._from_sequence([na, valid, valid]))
+
+ self.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/groupby.py b/contrib/python/pandas/py2/pandas/tests/extension/base/groupby.py
new file mode 100644
index 00000000000..dd406ca0cd5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/groupby.py
@@ -0,0 +1,83 @@
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+from .base import BaseExtensionTests
+
+
+class BaseGroupbyTests(BaseExtensionTests):
+ """Groupby-specific tests."""
+
+ def test_grouping_grouper(self, data_for_grouping):
+ df = pd.DataFrame({
+ "A": ["B", "B", None, None, "A", "A", "B", "C"],
+ "B": data_for_grouping
+ })
+ gr1 = df.groupby("A").grouper.groupings[0]
+ gr2 = df.groupby("B").grouper.groupings[0]
+
+ tm.assert_numpy_array_equal(gr1.grouper, df.A.values)
+ tm.assert_extension_array_equal(gr2.grouper, data_for_grouping)
+
+ @pytest.mark.parametrize('as_index', [True, False])
+ def test_groupby_extension_agg(self, as_index, data_for_grouping):
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
+ "B": data_for_grouping})
+ result = df.groupby("B", as_index=as_index).A.mean()
+ _, index = pd.factorize(data_for_grouping, sort=True)
+
+ index = pd.Index(index, name="B")
+ expected = pd.Series([3, 1, 4], index=index, name="A")
+ if as_index:
+ self.assert_series_equal(result, expected)
+ else:
+ expected = expected.reset_index()
+ self.assert_frame_equal(result, expected)
+
+ def test_groupby_extension_no_sort(self, data_for_grouping):
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
+ "B": data_for_grouping})
+ result = df.groupby("B", sort=False).A.mean()
+ _, index = pd.factorize(data_for_grouping, sort=False)
+
+ index = pd.Index(index, name="B")
+ expected = pd.Series([1, 3, 4], index=index, name="A")
+ self.assert_series_equal(result, expected)
+
+ def test_groupby_extension_transform(self, data_for_grouping):
+ valid = data_for_grouping[~data_for_grouping.isna()]
+ df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4],
+ "B": valid})
+
+ result = df.groupby("B").A.transform(len)
+ expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")
+
+ self.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('op', [
+ lambda x: 1,
+ lambda x: [1] * len(x),
+ lambda x: pd.Series([1] * len(x)),
+ lambda x: x,
+ ], ids=['scalar', 'list', 'series', 'object'])
+ def test_groupby_extension_apply(self, data_for_grouping, op):
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
+ "B": data_for_grouping})
+ df.groupby("B").apply(op)
+ df.groupby("B").A.apply(op)
+ df.groupby("A").apply(op)
+ df.groupby("A").B.apply(op)
+
+ def test_in_numeric_groupby(self, data_for_grouping):
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
+ "B": data_for_grouping,
+ "C": [1, 1, 1, 1, 1, 1, 1, 1]})
+ result = df.groupby("A").sum().columns
+
+ if data_for_grouping.dtype._is_numeric:
+ expected = pd.Index(['B', 'C'])
+ else:
+ expected = pd.Index(['C'])
+
+ tm.assert_index_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/interface.py b/contrib/python/pandas/py2/pandas/tests/extension/base/interface.py
new file mode 100644
index 00000000000..6388902e456
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/interface.py
@@ -0,0 +1,68 @@
+import numpy as np
+
+from pandas.core.dtypes.common import is_extension_array_dtype
+from pandas.core.dtypes.dtypes import ExtensionDtype
+
+import pandas as pd
+import pandas.util.testing as tm
+
+from .base import BaseExtensionTests
+
+
+class BaseInterfaceTests(BaseExtensionTests):
+ """Tests that the basic interface is satisfied."""
+ # ------------------------------------------------------------------------
+ # Interface
+ # ------------------------------------------------------------------------
+
+ def test_len(self, data):
+ assert len(data) == 100
+
+ def test_ndim(self, data):
+ assert data.ndim == 1
+
+ def test_can_hold_na_valid(self, data):
+ # GH-20761
+ assert data._can_hold_na is True
+
+ def test_memory_usage(self, data):
+ s = pd.Series(data)
+ result = s.memory_usage(index=False)
+ assert result == s.nbytes
+
+ def test_array_interface(self, data):
+ result = np.array(data)
+ assert result[0] == data[0]
+
+ result = np.array(data, dtype=object)
+ expected = np.array(list(data), dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_is_extension_array_dtype(self, data):
+ assert is_extension_array_dtype(data)
+ assert is_extension_array_dtype(data.dtype)
+ assert is_extension_array_dtype(pd.Series(data))
+ assert isinstance(data.dtype, ExtensionDtype)
+
+ def test_no_values_attribute(self, data):
+        # GH-20735: EAs with a .values attribute cause problems for internal
+        # code, so disallow it for now until solved
+ assert not hasattr(data, 'values')
+ assert not hasattr(data, '_values')
+
+ def test_is_numeric_honored(self, data):
+ result = pd.Series(data)
+ assert result._data.blocks[0].is_numeric is data.dtype._is_numeric
+
+ def test_isna_extension_array(self, data_missing):
+        # If your `isna` returns an ExtensionArray, you must also implement
+        # _reduce -- at the very least the ``any`` and ``all`` reductions.
+ na = data_missing.isna()
+ if is_extension_array_dtype(na):
+ assert na._reduce('any')
+ assert na.any()
+
+ assert not na._reduce('all')
+ assert not na.all()
+
+ assert na.dtype._is_boolean
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/io.py b/contrib/python/pandas/py2/pandas/tests/extension/base/io.py
new file mode 100644
index 00000000000..7ea62e4e9d6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/io.py
@@ -0,0 +1,23 @@
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO
+
+import pandas as pd
+
+from .base import BaseExtensionTests
+
+
+class BaseParsingTests(BaseExtensionTests):
+
+ @pytest.mark.parametrize('engine', ['c', 'python'])
+ def test_EA_types(self, engine, data):
+ df = pd.DataFrame({
+ 'with_dtype': pd.Series(data, dtype=str(data.dtype))
+ })
+ csv_output = df.to_csv(index=False, na_rep=np.nan)
+ result = pd.read_csv(StringIO(csv_output), dtype={
+ 'with_dtype': str(data.dtype)
+ }, engine=engine)
+ expected = df
+ self.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/methods.py b/contrib/python/pandas/py2/pandas/tests/extension/base/methods.py
new file mode 100644
index 00000000000..f64df7a84b7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/methods.py
@@ -0,0 +1,341 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+from .base import BaseExtensionTests
+
+
+class BaseMethodsTests(BaseExtensionTests):
+ """Various Series and DataFrame methods."""
+
+ @pytest.mark.parametrize('dropna', [True, False])
+ def test_value_counts(self, all_data, dropna):
+ all_data = all_data[:10]
+ if dropna:
+ other = np.array(all_data[~all_data.isna()])
+ else:
+ other = all_data
+
+ result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
+ expected = pd.Series(other).value_counts(
+ dropna=dropna).sort_index()
+
+ self.assert_series_equal(result, expected)
+
+ def test_count(self, data_missing):
+ df = pd.DataFrame({"A": data_missing})
+ result = df.count(axis='columns')
+ expected = pd.Series([0, 1])
+ self.assert_series_equal(result, expected)
+
+ def test_apply_simple_series(self, data):
+ result = pd.Series(data).apply(id)
+ assert isinstance(result, pd.Series)
+
+ def test_argsort(self, data_for_sorting):
+ result = pd.Series(data_for_sorting).argsort()
+ expected = pd.Series(np.array([2, 0, 1], dtype=np.int64))
+ self.assert_series_equal(result, expected)
+
+ def test_argsort_missing(self, data_missing_for_sorting):
+ result = pd.Series(data_missing_for_sorting).argsort()
+ expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))
+ self.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('ascending', [True, False])
+ def test_sort_values(self, data_for_sorting, ascending):
+ ser = pd.Series(data_for_sorting)
+ result = ser.sort_values(ascending=ascending)
+ expected = ser.iloc[[2, 0, 1]]
+ if not ascending:
+ expected = expected[::-1]
+
+ self.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('ascending', [True, False])
+ def test_sort_values_missing(self, data_missing_for_sorting, ascending):
+ ser = pd.Series(data_missing_for_sorting)
+ result = ser.sort_values(ascending=ascending)
+ if ascending:
+ expected = ser.iloc[[2, 0, 1]]
+ else:
+ expected = ser.iloc[[0, 2, 1]]
+ self.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('ascending', [True, False])
+ def test_sort_values_frame(self, data_for_sorting, ascending):
+ df = pd.DataFrame({"A": [1, 2, 1],
+ "B": data_for_sorting})
+ result = df.sort_values(['A', 'B'])
+ expected = pd.DataFrame({"A": [1, 1, 2],
+ 'B': data_for_sorting.take([2, 0, 1])},
+ index=[2, 0, 1])
+ self.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('box', [pd.Series, lambda x: x])
+ @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique])
+ def test_unique(self, data, box, method):
+ duplicated = box(data._from_sequence([data[0], data[0]]))
+
+ result = method(duplicated)
+
+ assert len(result) == 1
+ assert isinstance(result, type(data))
+ assert result[0] == duplicated[0]
+
+ @pytest.mark.parametrize('na_sentinel', [-1, -2])
+ def test_factorize(self, data_for_grouping, na_sentinel):
+ labels, uniques = pd.factorize(data_for_grouping,
+ na_sentinel=na_sentinel)
+ expected_labels = np.array([0, 0, na_sentinel,
+ na_sentinel, 1, 1, 0, 2],
+ dtype=np.intp)
+ expected_uniques = data_for_grouping.take([0, 4, 7])
+
+ tm.assert_numpy_array_equal(labels, expected_labels)
+ self.assert_extension_array_equal(uniques, expected_uniques)
+
+ @pytest.mark.parametrize('na_sentinel', [-1, -2])
+ def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
+ l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
+ l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel)
+
+ tm.assert_numpy_array_equal(l1, l2)
+ self.assert_extension_array_equal(u1, u2)
+
+ def test_factorize_empty(self, data):
+ labels, uniques = pd.factorize(data[:0])
+ expected_labels = np.array([], dtype=np.intp)
+ expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)
+
+ tm.assert_numpy_array_equal(labels, expected_labels)
+ self.assert_extension_array_equal(uniques, expected_uniques)
+
+ def test_fillna_copy_frame(self, data_missing):
+ arr = data_missing.take([1, 1])
+ df = pd.DataFrame({"A": arr})
+
+ filled_val = df.iloc[0, 0]
+ result = df.fillna(filled_val)
+
+ assert df.A.values is not result.A.values
+
+ def test_fillna_copy_series(self, data_missing):
+ arr = data_missing.take([1, 1])
+ ser = pd.Series(arr)
+
+ filled_val = ser[0]
+ result = ser.fillna(filled_val)
+
+ assert ser._values is not result._values
+ assert ser._values is arr
+
+ def test_fillna_length_mismatch(self, data_missing):
+ msg = "Length of 'value' does not match."
+ with pytest.raises(ValueError, match=msg):
+ data_missing.fillna(data_missing.take([1]))
+
+ def test_combine_le(self, data_repeated):
+ # GH 20825
+ # Test that combine works when doing a <= (le) comparison
+ orig_data1, orig_data2 = data_repeated(2)
+ s1 = pd.Series(orig_data1)
+ s2 = pd.Series(orig_data2)
+ result = s1.combine(s2, lambda x1, x2: x1 <= x2)
+ expected = pd.Series([a <= b for (a, b) in
+ zip(list(orig_data1), list(orig_data2))])
+ self.assert_series_equal(result, expected)
+
+ val = s1.iloc[0]
+ result = s1.combine(val, lambda x1, x2: x1 <= x2)
+ expected = pd.Series([a <= val for a in list(orig_data1)])
+ self.assert_series_equal(result, expected)
+
+ def test_combine_add(self, data_repeated):
+ # GH 20825
+ orig_data1, orig_data2 = data_repeated(2)
+ s1 = pd.Series(orig_data1)
+ s2 = pd.Series(orig_data2)
+ result = s1.combine(s2, lambda x1, x2: x1 + x2)
+ with np.errstate(over='ignore'):
+ expected = pd.Series(
+ orig_data1._from_sequence([a + b for (a, b) in
+ zip(list(orig_data1),
+ list(orig_data2))]))
+ self.assert_series_equal(result, expected)
+
+ val = s1.iloc[0]
+ result = s1.combine(val, lambda x1, x2: x1 + x2)
+ expected = pd.Series(
+ orig_data1._from_sequence([a + val for a in list(orig_data1)]))
+ self.assert_series_equal(result, expected)
+
+ def test_combine_first(self, data):
+ # https://github.com/pandas-dev/pandas/issues/24147
+ a = pd.Series(data[:3])
+ b = pd.Series(data[2:5], index=[2, 3, 4])
+ result = a.combine_first(b)
+ expected = pd.Series(data[:5])
+ self.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('frame', [True, False])
+ @pytest.mark.parametrize('periods, indices', [
+ (-2, [2, 3, 4, -1, -1]),
+ (0, [0, 1, 2, 3, 4]),
+ (2, [-1, -1, 0, 1, 2]),
+ ])
+ def test_container_shift(self, data, frame, periods, indices):
+ # https://github.com/pandas-dev/pandas/issues/22386
+ subset = data[:5]
+ data = pd.Series(subset, name='A')
+ expected = pd.Series(subset.take(indices, allow_fill=True), name='A')
+
+ if frame:
+ result = data.to_frame(name='A').assign(B=1).shift(periods)
+ expected = pd.concat([
+ expected,
+ pd.Series([1] * 5, name='B').shift(periods)
+ ], axis=1)
+ compare = self.assert_frame_equal
+ else:
+ result = data.shift(periods)
+ compare = self.assert_series_equal
+
+ compare(result, expected)
+
+ @pytest.mark.parametrize('periods, indices', [
+ [-4, [-1, -1]],
+ [-1, [1, -1]],
+ [0, [0, 1]],
+ [1, [-1, 0]],
+ [4, [-1, -1]]
+ ])
+ def test_shift_non_empty_array(self, data, periods, indices):
+ # https://github.com/pandas-dev/pandas/issues/23911
+ subset = data[:2]
+ result = subset.shift(periods)
+ expected = subset.take(indices, allow_fill=True)
+ self.assert_extension_array_equal(result, expected)
+
+ @pytest.mark.parametrize('periods', [
+ -4, -1, 0, 1, 4
+ ])
+ def test_shift_empty_array(self, data, periods):
+ # https://github.com/pandas-dev/pandas/issues/23911
+ empty = data[:0]
+ result = empty.shift(periods)
+ expected = empty
+ self.assert_extension_array_equal(result, expected)
+
+ def test_shift_fill_value(self, data):
+ arr = data[:4]
+ fill_value = data[0]
+ result = arr.shift(1, fill_value=fill_value)
+ expected = data.take([0, 0, 1, 2])
+ self.assert_extension_array_equal(result, expected)
+
+ result = arr.shift(-2, fill_value=fill_value)
+ expected = data.take([2, 3, 0, 0])
+ self.assert_extension_array_equal(result, expected)
+
+ @pytest.mark.parametrize("as_frame", [True, False])
+ def test_hash_pandas_object_works(self, data, as_frame):
+ # https://github.com/pandas-dev/pandas/issues/23066
+ data = pd.Series(data)
+ if as_frame:
+ data = data.to_frame()
+ a = pd.util.hash_pandas_object(data)
+ b = pd.util.hash_pandas_object(data)
+ self.assert_equal(a, b)
+
+ @pytest.mark.parametrize("as_series", [True, False])
+ def test_searchsorted(self, data_for_sorting, as_series):
+ b, c, a = data_for_sorting
+ arr = type(data_for_sorting)._from_sequence([a, b, c])
+
+ if as_series:
+ arr = pd.Series(arr)
+ assert arr.searchsorted(a) == 0
+ assert arr.searchsorted(a, side="right") == 1
+
+ assert arr.searchsorted(b) == 1
+ assert arr.searchsorted(b, side="right") == 2
+
+ assert arr.searchsorted(c) == 2
+ assert arr.searchsorted(c, side="right") == 3
+
+ result = arr.searchsorted(arr.take([0, 2]))
+ expected = np.array([0, 2], dtype=np.intp)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ # sorter
+ sorter = np.array([1, 2, 0])
+ assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
+
+ @pytest.mark.parametrize("as_frame", [True, False])
+ def test_where_series(self, data, na_value, as_frame):
+ assert data[0] != data[1]
+ cls = type(data)
+ a, b = data[:2]
+
+ ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
+ cond = np.array([True, True, False, False])
+
+ if as_frame:
+ ser = ser.to_frame(name='a')
+ cond = cond.reshape(-1, 1)
+
+ result = ser.where(cond)
+ expected = pd.Series(cls._from_sequence([a, a, na_value, na_value],
+ dtype=data.dtype))
+
+ if as_frame:
+ expected = expected.to_frame(name='a')
+ self.assert_equal(result, expected)
+
+ # array other
+ cond = np.array([True, False, True, True])
+ other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
+ if as_frame:
+ other = pd.DataFrame({"a": other})
+ cond = pd.DataFrame({"a": cond})
+ result = ser.where(cond, other)
+ expected = pd.Series(cls._from_sequence([a, b, b, b],
+ dtype=data.dtype))
+ if as_frame:
+ expected = expected.to_frame(name='a')
+ self.assert_equal(result, expected)
+
+ @pytest.mark.parametrize("use_numpy", [True, False])
+ @pytest.mark.parametrize("as_series", [True, False])
+ @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
+ def test_repeat(self, data, repeats, as_series, use_numpy):
+ arr = type(data)._from_sequence(data[:3], dtype=data.dtype)
+ if as_series:
+ arr = pd.Series(arr)
+
+ result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats)
+
+ repeats = [repeats] * 3 if isinstance(repeats, int) else repeats
+ expected = [x for x, n in zip(arr, repeats) for _ in range(n)]
+ expected = type(data)._from_sequence(expected, dtype=data.dtype)
+ if as_series:
+ expected = pd.Series(expected, index=arr.index.repeat(repeats))
+
+ self.assert_equal(result, expected)
+
+ @pytest.mark.parametrize("use_numpy", [True, False])
+ @pytest.mark.parametrize('repeats, kwargs, error, msg', [
+ (2, dict(axis=1), ValueError, "'axis"),
+ (-1, dict(), ValueError, "negative"),
+ ([1, 2], dict(), ValueError, "shape"),
+ (2, dict(foo='bar'), TypeError, "'foo'")])
+ def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
+ with pytest.raises(error, match=msg):
+ if use_numpy:
+ np.repeat(data, repeats, **kwargs)
+ else:
+ data.repeat(repeats, **kwargs)
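A note on the shift tests above: they all reduce shifting to a single
primitive, take with allow_fill=True, where an index of -1 means "insert the
dtype's NA value". A short sketch using the decimal test array defined later
in this diff:

    from pandas.tests.extension.decimal import to_decimal

    arr = to_decimal([1, 2, 3, 4, 5])
    # shift(2) vacates the first two slots; take fills the -1 positions
    # with the NA value (Decimal('NaN') for this array).
    expected = arr.take([-1, -1, 0, 1, 2], allow_fill=True)
    result = arr.shift(2)  # element-wise equal to `expected`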
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/missing.py b/contrib/python/pandas/py2/pandas/tests/extension/base/missing.py
new file mode 100644
index 00000000000..2fe547e50a3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/missing.py
@@ -0,0 +1,132 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+from .base import BaseExtensionTests
+
+
+class BaseMissingTests(BaseExtensionTests):
+ def test_isna(self, data_missing):
+ expected = np.array([True, False])
+
+ result = pd.isna(data_missing)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = pd.Series(data_missing).isna()
+ expected = pd.Series(expected)
+ self.assert_series_equal(result, expected)
+
+ # GH 21189
+ result = pd.Series(data_missing).drop([0, 1]).isna()
+ expected = pd.Series([], dtype=bool)
+ self.assert_series_equal(result, expected)
+
+ def test_dropna_array(self, data_missing):
+ result = data_missing.dropna()
+ expected = data_missing[[1]]
+ self.assert_extension_array_equal(result, expected)
+
+ def test_dropna_series(self, data_missing):
+ ser = pd.Series(data_missing)
+ result = ser.dropna()
+ expected = ser.iloc[[1]]
+ self.assert_series_equal(result, expected)
+
+ def test_dropna_frame(self, data_missing):
+ df = pd.DataFrame({"A": data_missing})
+
+ # defaults
+ result = df.dropna()
+ expected = df.iloc[[1]]
+ self.assert_frame_equal(result, expected)
+
+ # axis = 1
+ result = df.dropna(axis='columns')
+ expected = pd.DataFrame(index=[0, 1])
+ self.assert_frame_equal(result, expected)
+
+ # multiple
+ df = pd.DataFrame({"A": data_missing,
+ "B": [1, np.nan]})
+ result = df.dropna()
+ expected = df.iloc[:0]
+ self.assert_frame_equal(result, expected)
+
+ def test_fillna_scalar(self, data_missing):
+ valid = data_missing[1]
+ result = data_missing.fillna(valid)
+ expected = data_missing.fillna(valid)
+ self.assert_extension_array_equal(result, expected)
+
+ def test_fillna_limit_pad(self, data_missing):
+ arr = data_missing.take([1, 0, 0, 0, 1])
+ result = pd.Series(arr).fillna(method='ffill', limit=2)
+ expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
+ self.assert_series_equal(result, expected)
+
+ def test_fillna_limit_backfill(self, data_missing):
+ arr = data_missing.take([1, 0, 0, 0, 1])
+ result = pd.Series(arr).fillna(method='backfill', limit=2)
+ expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
+ self.assert_series_equal(result, expected)
+
+ def test_fillna_series(self, data_missing):
+ fill_value = data_missing[1]
+ ser = pd.Series(data_missing)
+
+ result = ser.fillna(fill_value)
+ expected = pd.Series(data_missing._from_sequence(
+ [fill_value, fill_value], dtype=data_missing.dtype))
+ self.assert_series_equal(result, expected)
+
+ # Fill with a series
+ result = ser.fillna(expected)
+ self.assert_series_equal(result, expected)
+
+ # Fill with a series not affecting the missing values
+ result = ser.fillna(ser)
+ self.assert_series_equal(result, ser)
+
+ @pytest.mark.parametrize('method', ['ffill', 'bfill'])
+ def test_fillna_series_method(self, data_missing, method):
+ fill_value = data_missing[1]
+
+ if method == 'ffill':
+ data_missing = data_missing[::-1]
+
+ result = pd.Series(data_missing).fillna(method=method)
+ expected = pd.Series(data_missing._from_sequence(
+ [fill_value, fill_value], dtype=data_missing.dtype))
+
+ self.assert_series_equal(result, expected)
+
+ def test_fillna_frame(self, data_missing):
+ fill_value = data_missing[1]
+
+ result = pd.DataFrame({
+ "A": data_missing,
+ "B": [1, 2]
+ }).fillna(fill_value)
+
+ expected = pd.DataFrame({
+ "A": data_missing._from_sequence([fill_value, fill_value],
+ dtype=data_missing.dtype),
+ "B": [1, 2],
+ })
+
+ self.assert_frame_equal(result, expected)
+
+ def test_fillna_fill_other(self, data):
+ result = pd.DataFrame({
+ "A": data,
+ "B": [np.nan] * len(data)
+ }).fillna({"B": 0.0})
+
+ expected = pd.DataFrame({
+ "A": data,
+ "B": [0.0] * len(result),
+ })
+
+ self.assert_frame_equal(result, expected)
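The limit tests above read more easily once the take spelling is unpacked:
data_missing is the two-element [NA, valid] fixture, so take([1, 0, 0, 0, 1])
builds [valid, NA, NA, NA, valid], and a forward fill with limit=2 may repair
only the first two NAs. A sketch, assuming `data_missing` is bound to such a
fixture value:

    import pandas as pd

    arr = data_missing.take([1, 0, 0, 0, 1])  # [valid, NA, NA, NA, valid]
    filled = pd.Series(arr).fillna(method='ffill', limit=2)
    # Only the first two NAs are filled, leaving the third in place:
    # equivalent to data_missing.take([1, 1, 1, 0, 1])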
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/ops.py b/contrib/python/pandas/py2/pandas/tests/extension/base/ops.py
new file mode 100644
index 00000000000..cd5e55d9871
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/ops.py
@@ -0,0 +1,166 @@
+import operator
+
+import pytest
+
+import pandas as pd
+from pandas.core import ops
+
+from .base import BaseExtensionTests
+
+
+class BaseOpsUtil(BaseExtensionTests):
+
+ def get_op_from_name(self, op_name):
+ short_opname = op_name.strip('_')
+ try:
+ op = getattr(operator, short_opname)
+ except AttributeError:
+ # Assume it is the reverse operator
+ rop = getattr(operator, short_opname[1:])
+ op = lambda x, y: rop(y, x)
+
+ return op
+
+ def check_opname(self, s, op_name, other, exc=Exception):
+ op = self.get_op_from_name(op_name)
+
+ self._check_op(s, op, other, op_name, exc)
+
+ def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
+ if exc is None:
+ result = op(s, other)
+ expected = s.combine(other, op)
+ self.assert_series_equal(result, expected)
+ else:
+ with pytest.raises(exc):
+ op(s, other)
+
+ def _check_divmod_op(self, s, op, other, exc=Exception):
+        # divmod has multiple return values, so check separately
+ if exc is None:
+ result_div, result_mod = op(s, other)
+ if op is divmod:
+ expected_div, expected_mod = s // other, s % other
+ else:
+ expected_div, expected_mod = other // s, other % s
+ self.assert_series_equal(result_div, expected_div)
+ self.assert_series_equal(result_mod, expected_mod)
+ else:
+ with pytest.raises(exc):
+ divmod(s, other)
+
+
+class BaseArithmeticOpsTests(BaseOpsUtil):
+ """Various Series and DataFrame arithmetic ops methods.
+
+ Subclasses supporting various ops should set the class variables
+ to indicate that they support ops of that kind
+
+ * series_scalar_exc = TypeError
+ * frame_scalar_exc = TypeError
+ * series_array_exc = TypeError
+ * divmod_exc = TypeError
+ """
+ series_scalar_exc = TypeError
+ frame_scalar_exc = TypeError
+ series_array_exc = TypeError
+ divmod_exc = TypeError
+
+ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
+ # series & scalar
+ op_name = all_arithmetic_operators
+ s = pd.Series(data)
+ self.check_opname(s, op_name, s.iloc[0], exc=self.series_scalar_exc)
+
+ @pytest.mark.xfail(run=False, reason="_reduce needs implementation")
+ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
+ # frame & scalar
+ op_name = all_arithmetic_operators
+ df = pd.DataFrame({'A': data})
+ self.check_opname(df, op_name, data[0], exc=self.frame_scalar_exc)
+
+ def test_arith_series_with_array(self, data, all_arithmetic_operators):
+ # ndarray & other series
+ op_name = all_arithmetic_operators
+ s = pd.Series(data)
+ self.check_opname(s, op_name, pd.Series([s.iloc[0]] * len(s)),
+ exc=self.series_array_exc)
+
+ def test_divmod(self, data):
+ s = pd.Series(data)
+ self._check_divmod_op(s, divmod, 1, exc=self.divmod_exc)
+ self._check_divmod_op(1, ops.rdivmod, s, exc=self.divmod_exc)
+
+ def test_divmod_series_array(self, data):
+ s = pd.Series(data)
+ self._check_divmod_op(s, divmod, data)
+
+ def test_add_series_with_extension_array(self, data):
+ s = pd.Series(data)
+ result = s + data
+ expected = pd.Series(data + data)
+ self.assert_series_equal(result, expected)
+
+ def test_error(self, data, all_arithmetic_operators):
+ # invalid ops
+ op_name = all_arithmetic_operators
+ with pytest.raises(AttributeError):
+ getattr(data, op_name)
+
+ def test_direct_arith_with_series_returns_not_implemented(self, data):
+ # EAs should return NotImplemented for ops with Series.
+ # Pandas takes care of unboxing the series and calling the EA's op.
+ other = pd.Series(data)
+ if hasattr(data, '__add__'):
+ result = data.__add__(other)
+ assert result is NotImplemented
+ else:
+ raise pytest.skip(
+ "{} does not implement add".format(data.__class__.__name__)
+ )
+
+
+class BaseComparisonOpsTests(BaseOpsUtil):
+ """Various Series and DataFrame comparison ops methods."""
+
+ def _compare_other(self, s, data, op_name, other):
+ op = self.get_op_from_name(op_name)
+ if op_name == '__eq__':
+ assert getattr(data, op_name)(other) is NotImplemented
+ assert not op(s, other).all()
+ elif op_name == '__ne__':
+ assert getattr(data, op_name)(other) is NotImplemented
+ assert op(s, other).all()
+
+ else:
+
+ # array
+ assert getattr(data, op_name)(other) is NotImplemented
+
+ # series
+ s = pd.Series(data)
+ with pytest.raises(TypeError):
+ op(s, other)
+
+ def test_compare_scalar(self, data, all_compare_operators):
+ op_name = all_compare_operators
+ s = pd.Series(data)
+ self._compare_other(s, data, op_name, 0)
+
+ def test_compare_array(self, data, all_compare_operators):
+ op_name = all_compare_operators
+ s = pd.Series(data)
+ other = pd.Series([data[0]] * len(data))
+ self._compare_other(s, data, op_name, other)
+
+ def test_direct_arith_with_series_returns_not_implemented(self, data):
+ # EAs should return NotImplemented for ops with Series.
+ # Pandas takes care of unboxing the series and calling the EA's op.
+ other = pd.Series(data)
+ if hasattr(data, '__eq__'):
+ result = data.__eq__(other)
+ assert result is NotImplemented
+ else:
+ raise pytest.skip(
+ "{} does not implement __eq__".format(data.__class__.__name__)
+ )
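The reverse-operator handling in get_op_from_name above is worth spelling
out: after stripping the dunder underscores, a reflected name like 'radd' is
not an attribute of the operator module, so the leading 'r' is dropped and
the operands are swapped:

    import operator

    op_name = '__radd__'
    short_opname = op_name.strip('_')          # 'radd'
    # operator has no 'radd'; fall back to 'add' with swapped operands.
    rop = getattr(operator, short_opname[1:])  # operator.add
    op = lambda x, y: rop(y, x)                # x.__radd__(y) <=> y + x
    assert op(1, 2) == 3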
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/printing.py b/contrib/python/pandas/py2/pandas/tests/extension/base/printing.py
new file mode 100644
index 00000000000..b2ba1d95cf3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/printing.py
@@ -0,0 +1,44 @@
+import io
+
+import pytest
+
+import pandas as pd
+from pandas import compat
+
+from .base import BaseExtensionTests
+
+
+class BasePrintingTests(BaseExtensionTests):
+ """Tests checking the formatting of your EA when printed."""
+
+ @pytest.mark.parametrize("size", ["big", "small"])
+ def test_array_repr(self, data, size):
+ if size == "small":
+ data = data[:5]
+ else:
+ data = type(data)._concat_same_type([data] * 5)
+
+ result = repr(data)
+ assert data.__class__.__name__ in result
+ assert 'Length: {}'.format(len(data)) in result
+ assert str(data.dtype) in result
+ if size == 'big':
+ assert '...' in result
+
+ def test_array_repr_unicode(self, data):
+ result = compat.text_type(data)
+ assert isinstance(result, compat.text_type)
+
+ def test_series_repr(self, data):
+ ser = pd.Series(data)
+ assert data.dtype.name in repr(ser)
+
+ def test_dataframe_repr(self, data):
+ df = pd.DataFrame({"A": data})
+ repr(df)
+
+ def test_dtype_name_in_info(self, data):
+ buf = io.StringIO()
+ pd.DataFrame({"A": data}).info(buf=buf)
+ result = buf.getvalue()
+ assert data.dtype.name in result
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/reduce.py b/contrib/python/pandas/py2/pandas/tests/extension/base/reduce.py
new file mode 100644
index 00000000000..c4b70f20132
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/reduce.py
@@ -0,0 +1,61 @@
+import warnings
+
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+from .base import BaseExtensionTests
+
+
+class BaseReduceTests(BaseExtensionTests):
+ """
+ Reduction specific tests. Generally these only
+ make sense for numeric/boolean operations.
+ """
+ def check_reduce(self, s, op_name, skipna):
+ result = getattr(s, op_name)(skipna=skipna)
+ expected = getattr(s.astype('float64'), op_name)(skipna=skipna)
+ tm.assert_almost_equal(result, expected)
+
+
+class BaseNoReduceTests(BaseReduceTests):
+    """We don't define any reductions."""
+
+ @pytest.mark.parametrize('skipna', [True, False])
+ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
+ op_name = all_numeric_reductions
+ s = pd.Series(data)
+
+ with pytest.raises(TypeError):
+ getattr(s, op_name)(skipna=skipna)
+
+ @pytest.mark.parametrize('skipna', [True, False])
+ def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
+ op_name = all_boolean_reductions
+ s = pd.Series(data)
+
+ with pytest.raises(TypeError):
+ getattr(s, op_name)(skipna=skipna)
+
+
+class BaseNumericReduceTests(BaseReduceTests):
+
+ @pytest.mark.parametrize('skipna', [True, False])
+ def test_reduce_series(self, data, all_numeric_reductions, skipna):
+ op_name = all_numeric_reductions
+ s = pd.Series(data)
+
+ # min/max with empty produce numpy warnings
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore", RuntimeWarning)
+ self.check_reduce(s, op_name, skipna)
+
+
+class BaseBooleanReduceTests(BaseReduceTests):
+
+ @pytest.mark.parametrize('skipna', [True, False])
+ def test_reduce_series(self, data, all_boolean_reductions, skipna):
+ op_name = all_boolean_reductions
+ s = pd.Series(data)
+ self.check_reduce(s, op_name, skipna)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/reshaping.py b/contrib/python/pandas/py2/pandas/tests/extension/base/reshaping.py
new file mode 100644
index 00000000000..ee22ffb3ccf
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/reshaping.py
@@ -0,0 +1,271 @@
+import itertools
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.core.internals import ExtensionBlock
+
+from .base import BaseExtensionTests
+
+
+class BaseReshapingTests(BaseExtensionTests):
+ """Tests for reshaping and concatenation."""
+ @pytest.mark.parametrize('in_frame', [True, False])
+ def test_concat(self, data, in_frame):
+ wrapped = pd.Series(data)
+ if in_frame:
+ wrapped = pd.DataFrame(wrapped)
+ result = pd.concat([wrapped, wrapped], ignore_index=True)
+
+ assert len(result) == len(data) * 2
+
+ if in_frame:
+ dtype = result.dtypes[0]
+ else:
+ dtype = result.dtype
+
+ assert dtype == data.dtype
+ assert isinstance(result._data.blocks[0], ExtensionBlock)
+
+ @pytest.mark.parametrize('in_frame', [True, False])
+ def test_concat_all_na_block(self, data_missing, in_frame):
+ valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1])
+ na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3])
+ if in_frame:
+ valid_block = pd.DataFrame({"a": valid_block})
+ na_block = pd.DataFrame({"a": na_block})
+ result = pd.concat([valid_block, na_block])
+ if in_frame:
+ expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])})
+ self.assert_frame_equal(result, expected)
+ else:
+ expected = pd.Series(data_missing.take([1, 1, 0, 0]))
+ self.assert_series_equal(result, expected)
+
+ def test_concat_mixed_dtypes(self, data):
+ # https://github.com/pandas-dev/pandas/issues/20762
+ df1 = pd.DataFrame({'A': data[:3]})
+ df2 = pd.DataFrame({"A": [1, 2, 3]})
+ df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category')
+ dfs = [df1, df2, df3]
+
+ # dataframes
+ result = pd.concat(dfs)
+ expected = pd.concat([x.astype(object) for x in dfs])
+ self.assert_frame_equal(result, expected)
+
+ # series
+ result = pd.concat([x['A'] for x in dfs])
+ expected = pd.concat([x['A'].astype(object) for x in dfs])
+ self.assert_series_equal(result, expected)
+
+ # simple test for just EA and one other
+ result = pd.concat([df1, df2])
+ expected = pd.concat([df1.astype('object'), df2.astype('object')])
+ self.assert_frame_equal(result, expected)
+
+ result = pd.concat([df1['A'], df2['A']])
+ expected = pd.concat([df1['A'].astype('object'),
+ df2['A'].astype('object')])
+ self.assert_series_equal(result, expected)
+
+ def test_concat_columns(self, data, na_value):
+ df1 = pd.DataFrame({'A': data[:3]})
+ df2 = pd.DataFrame({'B': [1, 2, 3]})
+
+ expected = pd.DataFrame({'A': data[:3], 'B': [1, 2, 3]})
+ result = pd.concat([df1, df2], axis=1)
+ self.assert_frame_equal(result, expected)
+ result = pd.concat([df1['A'], df2['B']], axis=1)
+ self.assert_frame_equal(result, expected)
+
+ # non-aligned
+ df2 = pd.DataFrame({'B': [1, 2, 3]}, index=[1, 2, 3])
+ expected = pd.DataFrame({
+ 'A': data._from_sequence(list(data[:3]) + [na_value],
+ dtype=data.dtype),
+ 'B': [np.nan, 1, 2, 3]})
+
+ result = pd.concat([df1, df2], axis=1)
+ self.assert_frame_equal(result, expected)
+ result = pd.concat([df1['A'], df2['B']], axis=1)
+ self.assert_frame_equal(result, expected)
+
+ def test_align(self, data, na_value):
+ a = data[:3]
+ b = data[2:5]
+ r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3]))
+
+ # Assumes that the ctor can take a list of scalars of the type
+ e1 = pd.Series(data._from_sequence(list(a) + [na_value],
+ dtype=data.dtype))
+ e2 = pd.Series(data._from_sequence([na_value] + list(b),
+ dtype=data.dtype))
+ self.assert_series_equal(r1, e1)
+ self.assert_series_equal(r2, e2)
+
+ def test_align_frame(self, data, na_value):
+ a = data[:3]
+ b = data[2:5]
+ r1, r2 = pd.DataFrame({'A': a}).align(
+ pd.DataFrame({'A': b}, index=[1, 2, 3])
+ )
+
+ # Assumes that the ctor can take a list of scalars of the type
+ e1 = pd.DataFrame({'A': data._from_sequence(list(a) + [na_value],
+ dtype=data.dtype)})
+ e2 = pd.DataFrame({'A': data._from_sequence([na_value] + list(b),
+ dtype=data.dtype)})
+ self.assert_frame_equal(r1, e1)
+ self.assert_frame_equal(r2, e2)
+
+ def test_align_series_frame(self, data, na_value):
+ # https://github.com/pandas-dev/pandas/issues/20576
+ ser = pd.Series(data, name='a')
+ df = pd.DataFrame({"col": np.arange(len(ser) + 1)})
+ r1, r2 = ser.align(df)
+
+ e1 = pd.Series(data._from_sequence(list(data) + [na_value],
+ dtype=data.dtype),
+ name=ser.name)
+
+ self.assert_series_equal(r1, e1)
+ self.assert_frame_equal(r2, df)
+
+ def test_set_frame_expand_regular_with_extension(self, data):
+ df = pd.DataFrame({"A": [1] * len(data)})
+ df['B'] = data
+ expected = pd.DataFrame({"A": [1] * len(data), "B": data})
+ self.assert_frame_equal(df, expected)
+
+ def test_set_frame_expand_extension_with_regular(self, data):
+ df = pd.DataFrame({'A': data})
+ df['B'] = [1] * len(data)
+ expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
+ self.assert_frame_equal(df, expected)
+
+ def test_set_frame_overwrite_object(self, data):
+ # https://github.com/pandas-dev/pandas/issues/20555
+ df = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
+ df['A'] = data
+ assert df.dtypes['A'] == data.dtype
+
+ def test_merge(self, data, na_value):
+ # GH-20743
+ df1 = pd.DataFrame({'ext': data[:3], 'int1': [1, 2, 3],
+ 'key': [0, 1, 2]})
+ df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]})
+
+ res = pd.merge(df1, df2)
+ exp = pd.DataFrame(
+ {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1],
+ 'ext': data._from_sequence([data[0], data[0], data[1]],
+ dtype=data.dtype)})
+ self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
+
+ res = pd.merge(df1, df2, how='outer')
+ exp = pd.DataFrame(
+ {'int1': [1, 1, 2, 3, np.nan], 'int2': [1, 2, 3, np.nan, 4],
+ 'key': [0, 0, 1, 2, 3],
+ 'ext': data._from_sequence(
+ [data[0], data[0], data[1], data[2], na_value],
+ dtype=data.dtype)})
+ self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
+
+ def test_merge_on_extension_array(self, data):
+ # GH 23020
+ a, b = data[:2]
+ key = type(data)._from_sequence([a, b], dtype=data.dtype)
+
+ df = pd.DataFrame({"key": key, "val": [1, 2]})
+ result = pd.merge(df, df, on='key')
+ expected = pd.DataFrame({"key": key,
+ "val_x": [1, 2],
+ "val_y": [1, 2]})
+ self.assert_frame_equal(result, expected)
+
+ # order
+ result = pd.merge(df.iloc[[1, 0]], df, on='key')
+ expected = expected.iloc[[1, 0]].reset_index(drop=True)
+ self.assert_frame_equal(result, expected)
+
+ def test_merge_on_extension_array_duplicates(self, data):
+ # GH 23020
+ a, b = data[:2]
+ key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
+ df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
+ df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
+
+ result = pd.merge(df1, df2, on='key')
+ expected = pd.DataFrame({
+ "key": key.take([0, 0, 0, 0, 1]),
+ "val_x": [1, 1, 3, 3, 2],
+ "val_y": [1, 3, 1, 3, 2],
+ })
+ self.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("columns", [
+ ["A", "B"],
+ pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')],
+ names=['outer', 'inner']),
+ ])
+ def test_stack(self, data, columns):
+ df = pd.DataFrame({"A": data[:5], "B": data[:5]})
+ df.columns = columns
+ result = df.stack()
+ expected = df.astype(object).stack()
+ # we need a second astype(object), in case the constructor inferred
+ # object -> specialized, as is done for period.
+ expected = expected.astype(object)
+
+ if isinstance(expected, pd.Series):
+ assert result.dtype == df.iloc[:, 0].dtype
+ else:
+ assert all(result.dtypes == df.iloc[:, 0].dtype)
+
+ result = result.astype(object)
+ self.assert_equal(result, expected)
+
+ @pytest.mark.parametrize("index", [
+ # Two levels, uniform.
+ pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]),
+ names=['a', 'b']),
+
+ # non-uniform
+ pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('B', 'b')]),
+
+ # three levels, non-uniform
+ pd.MultiIndex.from_product([('A', 'B'), ('a', 'b', 'c'), (0, 1, 2)]),
+ pd.MultiIndex.from_tuples([
+ ('A', 'a', 1),
+ ('A', 'b', 0),
+ ('A', 'a', 0),
+ ('B', 'a', 0),
+ ('B', 'c', 1),
+ ]),
+ ])
+ @pytest.mark.parametrize("obj", ["series", "frame"])
+ def test_unstack(self, data, index, obj):
+ data = data[:len(index)]
+ if obj == "series":
+ ser = pd.Series(data, index=index)
+ else:
+ ser = pd.DataFrame({"A": data, "B": data}, index=index)
+
+ n = index.nlevels
+ levels = list(range(n))
+ # [0, 1, 2]
+ # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
+ combinations = itertools.chain.from_iterable(
+ itertools.permutations(levels, i) for i in range(1, n)
+ )
+
+ for level in combinations:
+ result = ser.unstack(level=level)
+ assert all(isinstance(result[col].array, type(data))
+ for col in result.columns)
+ expected = ser.astype(object).unstack(level=level)
+ result = result.astype(object)
+
+ self.assert_frame_equal(result, expected)
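The `combinations` generator in test_unstack enumerates every ordered,
non-empty, proper subset of the index levels, which is what the two inline
comments list. For n = 3 it expands to:

    import itertools

    n = 3
    levels = list(range(n))  # [0, 1, 2]
    combinations = list(itertools.chain.from_iterable(
        itertools.permutations(levels, i) for i in range(1, n)))
    # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]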
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/base/setitem.py b/contrib/python/pandas/py2/pandas/tests/extension/base/setitem.py
new file mode 100644
index 00000000000..42fda982f73
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/base/setitem.py
@@ -0,0 +1,189 @@
+import operator
+
+import numpy as np
+import pytest
+
+import pandas as pd
+
+from .base import BaseExtensionTests
+
+
+class BaseSetitemTests(BaseExtensionTests):
+ def test_setitem_scalar_series(self, data, box_in_series):
+ if box_in_series:
+ data = pd.Series(data)
+ data[0] = data[1]
+ assert data[0] == data[1]
+
+ def test_setitem_sequence(self, data, box_in_series):
+ if box_in_series:
+ data = pd.Series(data)
+ original = data.copy()
+
+ data[[0, 1]] = [data[1], data[0]]
+ assert data[0] == original[1]
+ assert data[1] == original[0]
+
+ @pytest.mark.parametrize('as_array', [True, False])
+ def test_setitem_sequence_mismatched_length_raises(self, data, as_array):
+ ser = pd.Series(data)
+ original = ser.copy()
+ value = [data[0]]
+ if as_array:
+ value = data._from_sequence(value)
+
+ xpr = 'cannot set using a {} indexer with a different length'
+ with pytest.raises(ValueError, match=xpr.format('list-like')):
+ ser[[0, 1]] = value
+ # Ensure no modifications made before the exception
+ self.assert_series_equal(ser, original)
+
+ with pytest.raises(ValueError, match=xpr.format('slice')):
+ ser[slice(3)] = value
+ self.assert_series_equal(ser, original)
+
+ def test_setitem_empty_indxer(self, data, box_in_series):
+ if box_in_series:
+ data = pd.Series(data)
+ original = data.copy()
+ data[np.array([], dtype=int)] = []
+ self.assert_equal(data, original)
+
+ def test_setitem_sequence_broadcasts(self, data, box_in_series):
+ if box_in_series:
+ data = pd.Series(data)
+ data[[0, 1]] = data[2]
+ assert data[0] == data[2]
+ assert data[1] == data[2]
+
+ @pytest.mark.parametrize('setter', ['loc', 'iloc'])
+ def test_setitem_scalar(self, data, setter):
+ arr = pd.Series(data)
+ setter = getattr(arr, setter)
+ operator.setitem(setter, 0, data[1])
+ assert arr[0] == data[1]
+
+ def test_setitem_loc_scalar_mixed(self, data):
+ df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
+ df.loc[0, 'B'] = data[1]
+ assert df.loc[0, 'B'] == data[1]
+
+ def test_setitem_loc_scalar_single(self, data):
+ df = pd.DataFrame({"B": data})
+ df.loc[10, 'B'] = data[1]
+ assert df.loc[10, 'B'] == data[1]
+
+ def test_setitem_loc_scalar_multiple_homogoneous(self, data):
+ df = pd.DataFrame({"A": data, "B": data})
+ df.loc[10, 'B'] = data[1]
+ assert df.loc[10, 'B'] == data[1]
+
+ def test_setitem_iloc_scalar_mixed(self, data):
+ df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
+ df.iloc[0, 1] = data[1]
+ assert df.loc[0, 'B'] == data[1]
+
+ def test_setitem_iloc_scalar_single(self, data):
+ df = pd.DataFrame({"B": data})
+ df.iloc[10, 0] = data[1]
+ assert df.loc[10, 'B'] == data[1]
+
+ def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
+ df = pd.DataFrame({"A": data, "B": data})
+ df.iloc[10, 1] = data[1]
+ assert df.loc[10, 'B'] == data[1]
+
+ @pytest.mark.parametrize('as_callable', [True, False])
+ @pytest.mark.parametrize('setter', ['loc', None])
+ def test_setitem_mask_aligned(self, data, as_callable, setter):
+ ser = pd.Series(data)
+ mask = np.zeros(len(data), dtype=bool)
+ mask[:2] = True
+
+ if as_callable:
+ mask2 = lambda x: mask
+ else:
+ mask2 = mask
+
+ if setter:
+ # loc
+ target = getattr(ser, setter)
+ else:
+ # Series.__setitem__
+ target = ser
+
+ operator.setitem(target, mask2, data[5:7])
+
+ ser[mask2] = data[5:7]
+ assert ser[0] == data[5]
+ assert ser[1] == data[6]
+
+ @pytest.mark.parametrize('setter', ['loc', None])
+ def test_setitem_mask_broadcast(self, data, setter):
+ ser = pd.Series(data)
+ mask = np.zeros(len(data), dtype=bool)
+ mask[:2] = True
+
+ if setter: # loc
+ target = getattr(ser, setter)
+ else: # __setitem__
+ target = ser
+
+ operator.setitem(target, mask, data[10])
+ assert ser[0] == data[10]
+ assert ser[1] == data[10]
+
+ def test_setitem_expand_columns(self, data):
+ df = pd.DataFrame({"A": data})
+ result = df.copy()
+ result['B'] = 1
+ expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
+ self.assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.loc[:, 'B'] = 1
+ self.assert_frame_equal(result, expected)
+
+ # overwrite with new type
+ result['B'] = data
+ expected = pd.DataFrame({"A": data, "B": data})
+ self.assert_frame_equal(result, expected)
+
+ def test_setitem_expand_with_extension(self, data):
+ df = pd.DataFrame({"A": [1] * len(data)})
+ result = df.copy()
+ result['B'] = data
+ expected = pd.DataFrame({"A": [1] * len(data), "B": data})
+ self.assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.loc[:, 'B'] = data
+ self.assert_frame_equal(result, expected)
+
+ def test_setitem_frame_invalid_length(self, data):
+ df = pd.DataFrame({"A": [1] * len(data)})
+ xpr = "Length of values does not match length of index"
+ with pytest.raises(ValueError, match=xpr):
+ df['B'] = data[:5]
+
+ @pytest.mark.xfail(reason="GH#20441: setitem on extension types.")
+ def test_setitem_tuple_index(self, data):
+ s = pd.Series(data[:2], index=[(0, 0), (0, 1)])
+ expected = pd.Series(data.take([1, 1]), index=s.index)
+ s[(0, 1)] = data[1]
+ self.assert_series_equal(s, expected)
+
+ def test_setitem_slice_mismatch_length_raises(self, data):
+ arr = data[:5]
+ with pytest.raises(ValueError):
+ arr[:1] = arr[:2]
+
+ def test_setitem_slice_array(self, data):
+ arr = data[:5].copy()
+ arr[:5] = data[-5:]
+ self.assert_extension_array_equal(arr, data[-5:])
+
+ def test_setitem_scalar_key_sequence_raise(self, data):
+ arr = data[:5].copy()
+ with pytest.raises(ValueError):
+ arr[0] = arr[[0, 1]]
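Several tests above call operator.setitem(target, key, value) rather than
using an assignment statement; since that call is exactly
`target[key] = value`, one test body can exercise both Series.__setitem__ and
the .loc indexer by choosing `target` up front:

    import operator

    import pandas as pd

    ser = pd.Series([1, 2, 3])
    for target in (ser, ser.loc):
        operator.setitem(target, 0, 10)  # same as target[0] = 10
    assert ser[0] == 10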
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/conftest.py b/contrib/python/pandas/py2/pandas/tests/extension/conftest.py
new file mode 100644
index 00000000000..5349dd919f2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/conftest.py
@@ -0,0 +1,110 @@
+import operator
+
+import pytest
+
+
+@pytest.fixture
+def dtype():
+ """A fixture providing the ExtensionDtype to validate."""
+ raise NotImplementedError
+
+
+@pytest.fixture
+def data():
+ """Length-100 array for this type.
+
+    * data[0] and data[1] should both be non-missing
+    * data[0] and data[1] should not be equal
+ """
+ raise NotImplementedError
+
+
+@pytest.fixture
+def data_missing():
+ """Length-2 array with [NA, Valid]"""
+ raise NotImplementedError
+
+
+@pytest.fixture(params=['data', 'data_missing'])
+def all_data(request, data, data_missing):
+ """Parametrized fixture giving 'data' and 'data_missing'"""
+ if request.param == 'data':
+ return data
+ elif request.param == 'data_missing':
+ return data_missing
+
+
+@pytest.fixture
+def data_repeated(data):
+ """
+ Generate many datasets.
+
+ Parameters
+ ----------
+ data : fixture implementing `data`
+
+ Returns
+ -------
+ Callable[[int], Generator]:
+ A callable that takes a `count` argument and
+ returns a generator yielding `count` datasets.
+ """
+ def gen(count):
+ for _ in range(count):
+ yield data
+ return gen
+
+
+@pytest.fixture
+def data_for_sorting():
+ """Length-3 array with a known sort order.
+
+ This should be three items [B, C, A] with
+ A < B < C
+ """
+ raise NotImplementedError
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+ """Length-3 array with a known sort order.
+
+ This should be three items [B, NA, A] with
+ A < B and NA missing.
+ """
+ raise NotImplementedError
+
+
+@pytest.fixture
+def na_cmp():
+ """Binary operator for comparing NA values.
+
+ Should return a function of two arguments that returns
+ True if both arguments are (scalar) NA for your type.
+
+ By default, uses ``operator.is_``
+ """
+ return operator.is_
+
+
+@pytest.fixture
+def na_value():
+ """The scalar missing value for this type. Default 'None'"""
+ return None
+
+
+@pytest.fixture
+def data_for_grouping():
+ """Data for factorization, grouping, and unique tests.
+
+ Expected to be like [B, B, NA, NA, A, A, B, C]
+
+ Where A < B < C and NA is missing
+ """
+ raise NotImplementedError
+
+
+@pytest.fixture(params=[True, False])
+def box_in_series(request):
+ """Whether to box the data in a Series"""
+ return request.param
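These module-level fixtures are deliberately abstract stubs; each concrete
suite overrides them with real implementations, as the decimal and json
modules below do. A minimal sketch for a hypothetical third-party extension
array (`MyArray` is a stand-in, not part of this diff):

    import pytest

    from my_extension import MyArray  # hypothetical

    @pytest.fixture
    def data():
        # Length-100; data[0] != data[1], both non-missing.
        return MyArray._from_sequence(list(range(100)))

    @pytest.fixture
    def data_missing():
        # Length-2, [NA, valid].
        return MyArray._from_sequence([None, 1])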
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/decimal/__init__.py b/contrib/python/pandas/py2/pandas/tests/extension/decimal/__init__.py
new file mode 100644
index 00000000000..c37aad0af84
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/decimal/__init__.py
@@ -0,0 +1,4 @@
+from .array import DecimalArray, DecimalDtype, to_decimal, make_data
+
+
+__all__ = ['DecimalArray', 'DecimalDtype', 'to_decimal', 'make_data']
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/decimal/array.py b/contrib/python/pandas/py2/pandas/tests/extension/decimal/array.py
new file mode 100644
index 00000000000..1823eeb4d7f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/decimal/array.py
@@ -0,0 +1,166 @@
+import decimal
+import numbers
+import random
+import sys
+
+import numpy as np
+
+from pandas.core.dtypes.base import ExtensionDtype
+
+import pandas as pd
+from pandas.api.extensions import register_extension_dtype
+from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin
+
+
+@register_extension_dtype
+class DecimalDtype(ExtensionDtype):
+ type = decimal.Decimal
+ name = 'decimal'
+ na_value = decimal.Decimal('NaN')
+ _metadata = ('context',)
+
+ def __init__(self, context=None):
+ self.context = context or decimal.getcontext()
+
+ def __repr__(self):
+ return 'DecimalDtype(context={})'.format(self.context)
+
+ @classmethod
+ def construct_array_type(cls):
+ """Return the array type associated with this dtype
+
+ Returns
+ -------
+ type
+ """
+ return DecimalArray
+
+ @classmethod
+ def construct_from_string(cls, string):
+ if string == cls.name:
+ return cls()
+ else:
+ raise TypeError("Cannot construct a '{}' from "
+ "'{}'".format(cls, string))
+
+ @property
+ def _is_numeric(self):
+ return True
+
+
+class DecimalArray(ExtensionArray, ExtensionScalarOpsMixin):
+ __array_priority__ = 1000
+
+ def __init__(self, values, dtype=None, copy=False, context=None):
+ for val in values:
+ if not isinstance(val, decimal.Decimal):
+ raise TypeError("All values must be of type " +
+ str(decimal.Decimal))
+ values = np.asarray(values, dtype=object)
+
+ self._data = values
+ # Some aliases for common attribute names to ensure pandas supports
+ # these
+ self._items = self.data = self._data
+ # those aliases are currently not working due to assumptions
+ # in internal code (GH-20735)
+ # self._values = self.values = self.data
+ self._dtype = DecimalDtype(context)
+
+ @property
+ def dtype(self):
+ return self._dtype
+
+ @classmethod
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ return cls(scalars)
+
+ @classmethod
+ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+ return cls._from_sequence([decimal.Decimal(x) for x in strings],
+ dtype, copy)
+
+ @classmethod
+ def _from_factorized(cls, values, original):
+ return cls(values)
+
+ def __getitem__(self, item):
+ if isinstance(item, numbers.Integral):
+ return self._data[item]
+ else:
+ return type(self)(self._data[item])
+
+ def take(self, indexer, allow_fill=False, fill_value=None):
+ from pandas.api.extensions import take
+
+ data = self._data
+ if allow_fill and fill_value is None:
+ fill_value = self.dtype.na_value
+
+ result = take(data, indexer, fill_value=fill_value,
+ allow_fill=allow_fill)
+ return self._from_sequence(result)
+
+ def copy(self, deep=False):
+ if deep:
+ return type(self)(self._data.copy())
+ return type(self)(self)
+
+ def astype(self, dtype, copy=True):
+ if isinstance(dtype, type(self.dtype)):
+ return type(self)(self._data, context=dtype.context)
+ return np.asarray(self, dtype=dtype)
+
+ def __setitem__(self, key, value):
+ if pd.api.types.is_list_like(value):
+ if pd.api.types.is_scalar(key):
+ raise ValueError("setting an array element with a sequence.")
+ value = [decimal.Decimal(v) for v in value]
+ else:
+ value = decimal.Decimal(value)
+ self._data[key] = value
+
+ def __len__(self):
+ return len(self._data)
+
+ @property
+ def nbytes(self):
+ n = len(self)
+ if n:
+ return n * sys.getsizeof(self[0])
+ return 0
+
+ def isna(self):
+ return np.array([x.is_nan() for x in self._data], dtype=bool)
+
+ @property
+ def _na_value(self):
+ return decimal.Decimal('NaN')
+
+ @classmethod
+ def _concat_same_type(cls, to_concat):
+ return cls(np.concatenate([x._data for x in to_concat]))
+
+ def _reduce(self, name, skipna=True, **kwargs):
+
+ if skipna:
+ raise NotImplementedError("decimal does not support skipna=True")
+
+ try:
+ op = getattr(self.data, name)
+ except AttributeError:
+ raise NotImplementedError("decimal does not support "
+ "the {} operation".format(name))
+ return op(axis=0)
+
+
+def to_decimal(values, context=None):
+ return DecimalArray([decimal.Decimal(x) for x in values], context=context)
+
+
+def make_data():
+ return [decimal.Decimal(random.random()) for _ in range(100)]
+
+
+DecimalArray._add_arithmetic_ops()
+DecimalArray._add_comparison_ops()
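DecimalArray._reduce above supports only skipna=False and otherwise defers to
the same-named method on the underlying object ndarray, so `sum` is just
ndarray.sum(axis=0) over Decimal scalars. A sketch:

    import decimal

    from pandas.tests.extension.decimal import to_decimal

    arr = to_decimal([1, 2, 3])
    assert arr._reduce('sum', skipna=False) == decimal.Decimal(6)
    # skipna=True is rejected up front with NotImplementedError, which the
    # decimal reduce tests assert on.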
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/decimal/test_decimal.py b/contrib/python/pandas/py2/pandas/tests/extension/decimal/test_decimal.py
new file mode 100644
index 00000000000..6281c5360cd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/decimal/test_decimal.py
@@ -0,0 +1,401 @@
+import decimal
+import math
+import operator
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import compat
+from pandas.tests.extension import base
+import pandas.util.testing as tm
+
+from .array import DecimalArray, DecimalDtype, make_data, to_decimal
+
+
+@pytest.fixture
+def dtype():
+ return DecimalDtype()
+
+
+@pytest.fixture
+def data():
+ return DecimalArray(make_data())
+
+
+@pytest.fixture
+def data_missing():
+ return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)])
+
+
+@pytest.fixture
+def data_for_sorting():
+ return DecimalArray([decimal.Decimal('1'),
+ decimal.Decimal('2'),
+ decimal.Decimal('0')])
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+ return DecimalArray([decimal.Decimal('1'),
+ decimal.Decimal('NaN'),
+ decimal.Decimal('0')])
+
+
+@pytest.fixture
+def na_cmp():
+ return lambda x, y: x.is_nan() and y.is_nan()
+
+
+@pytest.fixture
+def na_value():
+ return decimal.Decimal("NaN")
+
+
+@pytest.fixture
+def data_for_grouping():
+ b = decimal.Decimal('1.0')
+ a = decimal.Decimal('0.0')
+ c = decimal.Decimal('2.0')
+ na = decimal.Decimal('NaN')
+ return DecimalArray([b, b, na, na, a, a, b, c])
+
+
+class BaseDecimal(object):
+
+ def assert_series_equal(self, left, right, *args, **kwargs):
+ def convert(x):
+ # need to convert array([Decimal(NaN)], dtype='object') to np.NaN
+ # because Series[object].isnan doesn't recognize decimal(NaN) as
+ # NA.
+ try:
+ return math.isnan(x)
+ except TypeError:
+ return False
+
+ if left.dtype == 'object':
+ left_na = left.apply(convert)
+ else:
+ left_na = left.isna()
+ if right.dtype == 'object':
+ right_na = right.apply(convert)
+ else:
+ right_na = right.isna()
+
+ tm.assert_series_equal(left_na, right_na)
+ return tm.assert_series_equal(left[~left_na],
+ right[~right_na],
+ *args, **kwargs)
+
+ def assert_frame_equal(self, left, right, *args, **kwargs):
+ # TODO(EA): select_dtypes
+ tm.assert_index_equal(
+ left.columns, right.columns,
+ exact=kwargs.get('check_column_type', 'equiv'),
+ check_names=kwargs.get('check_names', True),
+ check_exact=kwargs.get('check_exact', False),
+ check_categorical=kwargs.get('check_categorical', True),
+ obj='{obj}.columns'.format(obj=kwargs.get('obj', 'DataFrame')))
+
+ decimals = (left.dtypes == 'decimal').index
+
+ for col in decimals:
+ self.assert_series_equal(left[col], right[col],
+ *args, **kwargs)
+
+ left = left.drop(columns=decimals)
+ right = right.drop(columns=decimals)
+ tm.assert_frame_equal(left, right, *args, **kwargs)
+
+
+class TestDtype(BaseDecimal, base.BaseDtypeTests):
+ @pytest.mark.skipif(compat.PY2, reason="Context not hashable.")
+ def test_hashable(self, dtype):
+ pass
+
+
+class TestInterface(BaseDecimal, base.BaseInterfaceTests):
+
+ pytestmark = pytest.mark.skipif(compat.PY2,
+                                    reason="Unhashable dtype in Py2.")
+
+
+class TestConstructors(BaseDecimal, base.BaseConstructorsTests):
+
+ @pytest.mark.skip(reason="not implemented constructor from dtype")
+ def test_from_dtype(self, data):
+ # construct from our dtype & string dtype
+ pass
+
+
+class TestReshaping(BaseDecimal, base.BaseReshapingTests):
+ pytestmark = pytest.mark.skipif(compat.PY2,
+                                    reason="Unhashable dtype in Py2.")
+
+
+class TestGetitem(BaseDecimal, base.BaseGetitemTests):
+
+ def test_take_na_value_other_decimal(self):
+ arr = DecimalArray([decimal.Decimal('1.0'),
+ decimal.Decimal('2.0')])
+ result = arr.take([0, -1], allow_fill=True,
+ fill_value=decimal.Decimal('-1.0'))
+ expected = DecimalArray([decimal.Decimal('1.0'),
+ decimal.Decimal('-1.0')])
+ self.assert_extension_array_equal(result, expected)
+
+
+class TestMissing(BaseDecimal, base.BaseMissingTests):
+ pass
+
+
+class Reduce(object):
+
+ def check_reduce(self, s, op_name, skipna):
+
+ if skipna or op_name in ['median', 'skew', 'kurt']:
+ with pytest.raises(NotImplementedError):
+ getattr(s, op_name)(skipna=skipna)
+
+ else:
+ result = getattr(s, op_name)(skipna=skipna)
+ expected = getattr(np.asarray(s), op_name)()
+ tm.assert_almost_equal(result, expected)
+
+
+class TestNumericReduce(Reduce, base.BaseNumericReduceTests):
+ pass
+
+
+class TestBooleanReduce(Reduce, base.BaseBooleanReduceTests):
+ pass
+
+
+class TestMethods(BaseDecimal, base.BaseMethodsTests):
+ @pytest.mark.parametrize('dropna', [True, False])
+ @pytest.mark.xfail(reason="value_counts not implemented yet.")
+ def test_value_counts(self, all_data, dropna):
+ all_data = all_data[:10]
+ if dropna:
+ other = np.array(all_data[~all_data.isna()])
+ else:
+ other = all_data
+
+ result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
+ expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
+
+ tm.assert_series_equal(result, expected)
+
+
+class TestCasting(BaseDecimal, base.BaseCastingTests):
+ pytestmark = pytest.mark.skipif(compat.PY2,
+                                    reason="Unhashable dtype in Py2.")
+
+
+class TestGroupby(BaseDecimal, base.BaseGroupbyTests):
+ pytestmark = pytest.mark.skipif(compat.PY2,
+                                    reason="Unhashable dtype in Py2.")
+
+
+class TestSetitem(BaseDecimal, base.BaseSetitemTests):
+ pass
+
+
+class TestPrinting(BaseDecimal, base.BasePrintingTests):
+ pytestmark = pytest.mark.skipif(compat.PY2,
+                                    reason="Unhashable dtype in Py2.")
+
+
+# TODO(extension)
+@pytest.mark.xfail(reason=(
+    "raising AssertionError as this is not implemented, "
+    "though easy enough to do"))
+def test_series_constructor_coerce_data_to_extension_dtype_raises():
+ xpr = ("Cannot cast data to extension dtype 'decimal'. Pass the "
+ "extension array directly.")
+ with pytest.raises(ValueError, match=xpr):
+ pd.Series([0, 1, 2], dtype=DecimalDtype())
+
+
+def test_series_constructor_with_dtype():
+ arr = DecimalArray([decimal.Decimal('10.0')])
+ result = pd.Series(arr, dtype=DecimalDtype())
+ expected = pd.Series(arr)
+ tm.assert_series_equal(result, expected)
+
+ result = pd.Series(arr, dtype='int64')
+ expected = pd.Series([10])
+ tm.assert_series_equal(result, expected)
+
+
+def test_dataframe_constructor_with_dtype():
+ arr = DecimalArray([decimal.Decimal('10.0')])
+
+ result = pd.DataFrame({"A": arr}, dtype=DecimalDtype())
+ expected = pd.DataFrame({"A": arr})
+ tm.assert_frame_equal(result, expected)
+
+ arr = DecimalArray([decimal.Decimal('10.0')])
+ result = pd.DataFrame({"A": arr}, dtype='int64')
+ expected = pd.DataFrame({"A": [10]})
+ tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("frame", [True, False])
+def test_astype_dispatches(frame):
+ # This is a dtype-specific test that ensures Series[decimal].astype
+ # gets all the way through to ExtensionArray.astype
+ # Designing a reliable smoke test that works for arbitrary data types
+ # is difficult.
+ data = pd.Series(DecimalArray([decimal.Decimal(2)]), name='a')
+ ctx = decimal.Context()
+ ctx.prec = 5
+
+ if frame:
+ data = data.to_frame()
+
+ result = data.astype(DecimalDtype(ctx))
+
+ if frame:
+ result = result['a']
+
+ assert result.dtype.context.prec == ctx.prec
+
+
+class TestArithmeticOps(BaseDecimal, base.BaseArithmeticOpsTests):
+
+ def check_opname(self, s, op_name, other, exc=None):
+ super(TestArithmeticOps, self).check_opname(s, op_name,
+ other, exc=None)
+
+ def test_arith_series_with_array(self, data, all_arithmetic_operators):
+ op_name = all_arithmetic_operators
+ s = pd.Series(data)
+
+ context = decimal.getcontext()
+ divbyzerotrap = context.traps[decimal.DivisionByZero]
+ invalidoptrap = context.traps[decimal.InvalidOperation]
+ context.traps[decimal.DivisionByZero] = 0
+ context.traps[decimal.InvalidOperation] = 0
+
+ # Decimal supports ops with int, but not float
+ other = pd.Series([int(d * 100) for d in data])
+ self.check_opname(s, op_name, other)
+
+ if "mod" not in op_name:
+ self.check_opname(s, op_name, s * 2)
+
+ self.check_opname(s, op_name, 0)
+ self.check_opname(s, op_name, 5)
+ context.traps[decimal.DivisionByZero] = divbyzerotrap
+ context.traps[decimal.InvalidOperation] = invalidoptrap
+
+ def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
+ # We implement divmod
+ super(TestArithmeticOps, self)._check_divmod_op(
+ s, op, other, exc=None
+ )
+
+ def test_error(self):
+ pass
+
+
+class TestComparisonOps(BaseDecimal, base.BaseComparisonOpsTests):
+
+ def check_opname(self, s, op_name, other, exc=None):
+ super(TestComparisonOps, self).check_opname(s, op_name,
+ other, exc=None)
+
+ def _compare_other(self, s, data, op_name, other):
+ self.check_opname(s, op_name, other)
+
+ def test_compare_scalar(self, data, all_compare_operators):
+ op_name = all_compare_operators
+ s = pd.Series(data)
+ self._compare_other(s, data, op_name, 0.5)
+
+ def test_compare_array(self, data, all_compare_operators):
+ op_name = all_compare_operators
+ s = pd.Series(data)
+
+ alter = np.random.choice([-1, 0, 1], len(data))
+ # Randomly double, halve or keep same value
+ other = pd.Series(data) * [decimal.Decimal(pow(2.0, i))
+ for i in alter]
+ self._compare_other(s, data, op_name, other)
+
+
+class DecimalArrayWithoutFromSequence(DecimalArray):
+ """Helper class for testing error handling in _from_sequence."""
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ raise KeyError("For the test")
+
+
+class DecimalArrayWithoutCoercion(DecimalArrayWithoutFromSequence):
+ @classmethod
+ def _create_arithmetic_method(cls, op):
+ return cls._create_method(op, coerce_to_dtype=False)
+
+
+DecimalArrayWithoutCoercion._add_arithmetic_ops()
+
+
+def test_combine_from_sequence_raises():
+ # https://github.com/pandas-dev/pandas/issues/22850
+ ser = pd.Series(DecimalArrayWithoutFromSequence([
+ decimal.Decimal("1.0"),
+ decimal.Decimal("2.0")
+ ]))
+ result = ser.combine(ser, operator.add)
+
+ # note: object dtype
+ expected = pd.Series([decimal.Decimal("2.0"),
+ decimal.Decimal("4.0")], dtype="object")
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("class_", [DecimalArrayWithoutFromSequence,
+ DecimalArrayWithoutCoercion])
+def test_scalar_ops_from_sequence_raises(class_):
+ # op(EA, EA) should return an EA, or an ndarray if it's not possible
+ # to return an EA with the return values.
+ arr = class_([
+ decimal.Decimal("1.0"),
+ decimal.Decimal("2.0")
+ ])
+ result = arr + arr
+ expected = np.array([decimal.Decimal("2.0"), decimal.Decimal("4.0")],
+ dtype="object")
+ tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("reverse, expected_div, expected_mod", [
+ (False, [0, 1, 1, 2], [1, 0, 1, 0]),
+ (True, [2, 1, 0, 0], [0, 0, 2, 2]),
+])
+def test_divmod_array(reverse, expected_div, expected_mod):
+ # https://github.com/pandas-dev/pandas/issues/22930
+ arr = to_decimal([1, 2, 3, 4])
+ if reverse:
+ div, mod = divmod(2, arr)
+ else:
+ div, mod = divmod(arr, 2)
+ expected_div = to_decimal(expected_div)
+ expected_mod = to_decimal(expected_mod)
+
+ tm.assert_extension_array_equal(div, expected_div)
+ tm.assert_extension_array_equal(mod, expected_mod)
+
+
+def test_formatting_values_deprecated():
+ class DecimalArray2(DecimalArray):
+ def _formatting_values(self):
+ return np.array(self)
+
+ ser = pd.Series(DecimalArray2([decimal.Decimal('1.0')]))
+ # different levels for 2 vs. 3
+ check_stacklevel = compat.PY3
+
+ with tm.assert_produces_warning(DeprecationWarning,
+ check_stacklevel=check_stacklevel):
+ repr(ser)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/json/__init__.py b/contrib/python/pandas/py2/pandas/tests/extension/json/__init__.py
new file mode 100644
index 00000000000..f2679d087c8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/json/__init__.py
@@ -0,0 +1,3 @@
+from .array import JSONArray, JSONDtype, make_data
+
+__all__ = ['JSONArray', 'JSONDtype', 'make_data']
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/json/array.py b/contrib/python/pandas/py2/pandas/tests/extension/json/array.py
new file mode 100644
index 00000000000..10fd21f89c5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/json/array.py
@@ -0,0 +1,199 @@
+"""Test extension array for storing nested data in a pandas container.
+
+The JSONArray stores lists of dictionaries. The storage mechanism is a list,
+not an ndarray.
+
+Note:
+
+We currently store lists of UserDicts (Py3 only). Pandas has a few places
+internally that specifically check for dicts, and does non-scalar things
+in that case. We *want* the dictionaries to be treated as scalars, so we
+hack around pandas by using UserDicts.
+"""
+import collections
+import itertools
+import numbers
+import random
+import string
+import sys
+
+import numpy as np
+
+from pandas.core.dtypes.base import ExtensionDtype
+
+from pandas import compat
+from pandas.core.arrays import ExtensionArray
+
+
+class JSONDtype(ExtensionDtype):
+ type = compat.Mapping
+ name = 'json'
+
+ try:
+ na_value = collections.UserDict()
+ except AttributeError:
+ # source compatibility with Py2.
+ na_value = {}
+
+ @classmethod
+ def construct_array_type(cls):
+ """Return the array type associated with this dtype
+
+ Returns
+ -------
+ type
+ """
+ return JSONArray
+
+ @classmethod
+ def construct_from_string(cls, string):
+ if string == cls.name:
+ return cls()
+ else:
+ raise TypeError("Cannot construct a '{}' from "
+ "'{}'".format(cls, string))
+
+
+class JSONArray(ExtensionArray):
+ dtype = JSONDtype()
+ __array_priority__ = 1000
+
+ def __init__(self, values, dtype=None, copy=False):
+ for val in values:
+ if not isinstance(val, self.dtype.type):
+ raise TypeError("All values must be of type " +
+ str(self.dtype.type))
+ self.data = values
+
+ # Some aliases for common attribute names to ensure pandas supports
+ # these
+ self._items = self._data = self.data
+ # those aliases are currently not working due to assumptions
+ # in internal code (GH-20735)
+ # self._values = self.values = self.data
+
+ @classmethod
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ return cls(scalars)
+
+ @classmethod
+ def _from_factorized(cls, values, original):
+ return cls([collections.UserDict(x) for x in values if x != ()])
+
+ def __getitem__(self, item):
+ if isinstance(item, numbers.Integral):
+ return self.data[item]
+ elif isinstance(item, np.ndarray) and item.dtype == 'bool':
+ return self._from_sequence([x for x, m in zip(self, item) if m])
+ elif isinstance(item, compat.Iterable):
+ # fancy indexing
+ return type(self)([self.data[i] for i in item])
+ else:
+ # slice
+ return type(self)(self.data[item])
+
+ def __setitem__(self, key, value):
+ if isinstance(key, numbers.Integral):
+ self.data[key] = value
+ else:
+ if not isinstance(value, (type(self),
+ compat.Sequence)):
+ # broadcast value
+ value = itertools.cycle([value])
+
+ if isinstance(key, np.ndarray) and key.dtype == 'bool':
+ # masking
+ for i, (k, v) in enumerate(zip(key, value)):
+ if k:
+ assert isinstance(v, self.dtype.type)
+ self.data[i] = v
+ else:
+ for k, v in zip(key, value):
+ assert isinstance(v, self.dtype.type)
+ self.data[k] = v
+
+ def __len__(self):
+ return len(self.data)
+
+ @property
+ def nbytes(self):
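+        # Note: sys.getsizeof on a list is shallow -- it measures the list
+        # object itself, not the dicts it holds. That approximation is
+        # fine for these tests.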
+ return sys.getsizeof(self.data)
+
+ def isna(self):
+ return np.array([x == self.dtype.na_value for x in self.data],
+ dtype=bool)
+
+ def take(self, indexer, allow_fill=False, fill_value=None):
+ # re-implement here, since NumPy has trouble setting
+ # sized objects like UserDicts into scalar slots of
+        # an ndarray.
+ indexer = np.asarray(indexer)
+ msg = ("Index is out of bounds or cannot do a "
+ "non-empty take from an empty array.")
+
+ if allow_fill:
+ if fill_value is None:
+ fill_value = self.dtype.na_value
+ # bounds check
+ if (indexer < -1).any():
+ raise ValueError
+ try:
+ output = [self.data[loc] if loc != -1 else fill_value
+ for loc in indexer]
+ except IndexError:
+ raise IndexError(msg)
+ else:
+ try:
+ output = [self.data[loc] for loc in indexer]
+ except IndexError:
+ raise IndexError(msg)
+
+ return self._from_sequence(output)
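+
+    # Sketch of the take contract exercised above (the standard
+    # ExtensionArray semantics, assumed rather than defined here):
+    #   arr.take([0, -1], allow_fill=True)   # -> [arr[0], dtype.na_value]
+    #   arr.take([0, -1], allow_fill=False)  # -> [arr[0], arr[-1]]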
+
+ def copy(self, deep=False):
+ return type(self)(self.data[:])
+
+ def astype(self, dtype, copy=True):
+ # NumPy has issues when all the dicts are the same length.
+ # np.array([UserDict(...), UserDict(...)]) fails,
+ # but np.array([{...}, {...}]) works, so cast.
+
+ # needed to add this check for the Series constructor
+ if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
+ if copy:
+ return self.copy()
+ return self
+ return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
+
+ def unique(self):
+ # Parent method doesn't work since np.array will try to infer
+ # a 2-dim object.
+ return type(self)([
+ dict(x) for x in list({tuple(d.items()) for d in self.data})
+ ])
+
+ @classmethod
+ def _concat_same_type(cls, to_concat):
+ data = list(itertools.chain.from_iterable([x.data for x in to_concat]))
+ return cls(data)
+
+ def _values_for_factorize(self):
+ frozen = self._values_for_argsort()
+ if len(frozen) == 0:
+ # _factorize_array expects 1-d array, this is a len-0 2-d array.
+ frozen = frozen.ravel()
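+        # The second element is the na_value sentinel handed to pandas'
+        # factorize machinery; an empty tuple is exactly the frozen form
+        # of our NA (tuple({}.items()) == ()), matching _from_factorized.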
+ return frozen, ()
+
+ def _values_for_argsort(self):
+ # Disable NumPy's shape inference by including an empty tuple...
+        # If all the elements of self are the same size P, NumPy will
+ # cast them to an (N, P) array, instead of an (N,) array of tuples.
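+        # Illustrative: np.array([(1,), (2,)]) infers shape (2, 1), while
+        # np.array([(), (1,), (2,)], dtype=object)[1:] stays a 1-D object
+        # array of tuples.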
+ frozen = [()] + [tuple(x.items()) for x in self]
+ return np.array(frozen, dtype=object)[1:]
+
+
+def make_data():
+ # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
+ return [collections.UserDict([
+ (random.choice(string.ascii_letters), random.randint(0, 100))
+ for _ in range(random.randint(0, 10))]) for _ in range(100)]
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/json/test_json.py b/contrib/python/pandas/py2/pandas/tests/extension/json/test_json.py
new file mode 100644
index 00000000000..9ee131950f1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/json/test_json.py
@@ -0,0 +1,304 @@
+import collections
+import operator
+
+import pytest
+
+from pandas.compat import PY2, PY36
+
+import pandas as pd
+from pandas.tests.extension import base
+import pandas.util.testing as tm
+
+from .array import JSONArray, JSONDtype, make_data
+
+pytestmark = pytest.mark.skipif(PY2, reason="Py2 doesn't have a UserDict")
+
+
[email protected]
+def dtype():
+ return JSONDtype()
+
+
[email protected]
+def data():
+ """Length-100 PeriodArray for semantics test."""
+ data = make_data()
+
+    # Why the while loop? NumPy is unable to construct an ndarray from
+    # equal-length UserDicts (see BaseJSON below). Many of our operations
+    # involve coercing the EA to an ndarray of objects. To avoid random
+    # test failures, we ensure that our data is coercible to an ndarray.
+    # Several tests deal with only the first two elements, so that's what
+    # we'll check.
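+    # (Illustrative: with equal-length UserDicts NumPy tries to build a
+    # 2-D array and integer-indexes each one, raising KeyError; with
+    # unequal lengths it falls back to a 1-D object array.)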
+
+ while len(data[0]) == len(data[1]):
+ data = make_data()
+
+ return JSONArray(data)
+
+
[email protected]
+def data_missing():
+ """Length 2 array with [NA, Valid]"""
+ return JSONArray([{}, {'a': 10}])
+
+
[email protected]
+def data_for_sorting():
+ return JSONArray([{'b': 1}, {'c': 4}, {'a': 2, 'c': 3}])
+
+
[email protected]
+def data_missing_for_sorting():
+ return JSONArray([{'b': 1}, {}, {'a': 4}])
+
+
[email protected]
+def na_value(dtype):
+ return dtype.na_value
+
+
[email protected]
+def na_cmp():
+ return operator.eq
+
+
[email protected]
+def data_for_grouping():
+ return JSONArray([
+ {'b': 1}, {'b': 1},
+ {}, {},
+ {'a': 0, 'c': 2}, {'a': 0, 'c': 2},
+ {'b': 1},
+ {'c': 2},
+ ])
+
+
+class BaseJSON(object):
+ # NumPy doesn't handle an array of equal-length UserDicts.
+ # The default assert_series_equal eventually does a
+ # Series.values, which raises. We work around it by
+ # converting the UserDicts to dicts.
+ def assert_series_equal(self, left, right, **kwargs):
+ if left.dtype.name == 'json':
+ assert left.dtype == right.dtype
+ left = pd.Series(JSONArray(left.values.astype(object)),
+ index=left.index, name=left.name)
+ right = pd.Series(JSONArray(right.values.astype(object)),
+ index=right.index, name=right.name)
+ tm.assert_series_equal(left, right, **kwargs)
+
+ def assert_frame_equal(self, left, right, *args, **kwargs):
+ tm.assert_index_equal(
+ left.columns, right.columns,
+ exact=kwargs.get('check_column_type', 'equiv'),
+ check_names=kwargs.get('check_names', True),
+ check_exact=kwargs.get('check_exact', False),
+ check_categorical=kwargs.get('check_categorical', True),
+ obj='{obj}.columns'.format(obj=kwargs.get('obj', 'DataFrame')))
+
+        # select only the json-dtype columns for the custom comparison
+        jsons = left.dtypes.index[left.dtypes == 'json']
+
+ for col in jsons:
+ self.assert_series_equal(left[col], right[col],
+ *args, **kwargs)
+
+ left = left.drop(columns=jsons)
+ right = right.drop(columns=jsons)
+ tm.assert_frame_equal(left, right, *args, **kwargs)
+
+
+class TestDtype(BaseJSON, base.BaseDtypeTests):
+ pass
+
+
+class TestInterface(BaseJSON, base.BaseInterfaceTests):
+ def test_custom_asserts(self):
+ # This would always trigger the KeyError from trying to put
+ # an array of equal-length UserDicts inside an ndarray.
+ data = JSONArray([collections.UserDict({'a': 1}),
+ collections.UserDict({'b': 2}),
+ collections.UserDict({'c': 3})])
+ a = pd.Series(data)
+ self.assert_series_equal(a, a)
+ self.assert_frame_equal(a.to_frame(), a.to_frame())
+
+ b = pd.Series(data.take([0, 0, 1]))
+ with pytest.raises(AssertionError):
+ self.assert_series_equal(a, b)
+
+ with pytest.raises(AssertionError):
+ self.assert_frame_equal(a.to_frame(), b.to_frame())
+
+
+class TestConstructors(BaseJSON, base.BaseConstructorsTests):
+
+ @pytest.mark.skip(reason="not implemented constructor from dtype")
+ def test_from_dtype(self, data):
+ # construct from our dtype & string dtype
+ pass
+
+
+class TestReshaping(BaseJSON, base.BaseReshapingTests):
+
+ @pytest.mark.skip(reason="Different definitions of NA")
+ def test_stack(self):
+ """
+ The test does .astype(object).stack(). If we happen to have
+ any missing values in `data`, then we'll end up with different
+ rows since we consider `{}` NA, but `.astype(object)` doesn't.
+ """
+
+ @pytest.mark.xfail(reason="dict for NA")
+ def test_unstack(self, data, index):
+ # The base test has NaN for the expected NA value.
+ # this matches otherwise
+ return super().test_unstack(data, index)
+
+
+class TestGetitem(BaseJSON, base.BaseGetitemTests):
+ pass
+
+
+class TestMissing(BaseJSON, base.BaseMissingTests):
+ @pytest.mark.skip(reason="Setting a dict as a scalar")
+ def test_fillna_series(self):
+ """We treat dictionaries as a mapping in fillna, not a scalar."""
+
+ @pytest.mark.skip(reason="Setting a dict as a scalar")
+ def test_fillna_frame(self):
+ """We treat dictionaries as a mapping in fillna, not a scalar."""
+
+
+unhashable = pytest.mark.skip(reason="Unhashable")
+unstable = pytest.mark.skipif(not PY36, # 3.6 or higher
+ reason="Dictionary order unstable")
+
+
+class TestReduce(base.BaseNoReduceTests):
+ pass
+
+
+class TestMethods(BaseJSON, base.BaseMethodsTests):
+ @unhashable
+ def test_value_counts(self, all_data, dropna):
+ pass
+
+ @unhashable
+ def test_sort_values_frame(self):
+ # TODO (EA.factorize): see if _values_for_factorize allows this.
+ pass
+
+ @unstable
+ def test_argsort(self, data_for_sorting):
+ super(TestMethods, self).test_argsort(data_for_sorting)
+
+ @unstable
+ def test_argsort_missing(self, data_missing_for_sorting):
+ super(TestMethods, self).test_argsort_missing(
+ data_missing_for_sorting)
+
+ @unstable
+ @pytest.mark.parametrize('ascending', [True, False])
+ def test_sort_values(self, data_for_sorting, ascending):
+ super(TestMethods, self).test_sort_values(
+ data_for_sorting, ascending)
+
+ @unstable
+ @pytest.mark.parametrize('ascending', [True, False])
+ def test_sort_values_missing(self, data_missing_for_sorting, ascending):
+ super(TestMethods, self).test_sort_values_missing(
+ data_missing_for_sorting, ascending)
+
+ @pytest.mark.skip(reason="combine for JSONArray not supported")
+ def test_combine_le(self, data_repeated):
+ pass
+
+ @pytest.mark.skip(reason="combine for JSONArray not supported")
+ def test_combine_add(self, data_repeated):
+ pass
+
+ @pytest.mark.skip(reason="combine for JSONArray not supported")
+ def test_combine_first(self, data):
+ pass
+
+ @unhashable
+ def test_hash_pandas_object_works(self, data, kind):
+ super().test_hash_pandas_object_works(data, kind)
+
+ @pytest.mark.skip(reason="broadcasting error")
+ def test_where_series(self, data, na_value):
+ # Fails with
+ # *** ValueError: operands could not be broadcast together
+ # with shapes (4,) (4,) (0,)
+ super().test_where_series(data, na_value)
+
+ @pytest.mark.skip(reason="Can't compare dicts.")
+ def test_searchsorted(self, data_for_sorting):
+ super(TestMethods, self).test_searchsorted(data_for_sorting)
+
+
+class TestCasting(BaseJSON, base.BaseCastingTests):
+ @pytest.mark.skip(reason="failing on np.array(self, dtype=str)")
+ def test_astype_str(self):
+ """This currently fails in NumPy on np.array(self, dtype=str) with
+
+ *** ValueError: setting an array element with a sequence
+ """
+
+
+# We intentionally don't run base.BaseSetitemTests because pandas'
+# internals has trouble setting sequences of values into scalar positions.
+
+
+class TestGroupby(BaseJSON, base.BaseGroupbyTests):
+
+ @unhashable
+ def test_groupby_extension_transform(self):
+ """
+ This currently fails in Series.name.setter, since the
+ name must be hashable, but the value is a dictionary.
+ I think this is what we want, i.e. `.name` should be the original
+ values, and not the values for factorization.
+ """
+
+ @unhashable
+ def test_groupby_extension_apply(self):
+ """
+ This fails in Index._do_unique_check with
+
+ > hash(val)
+ E TypeError: unhashable type: 'UserDict' with
+
+ I suspect that once we support Index[ExtensionArray],
+ we'll be able to dispatch unique.
+ """
+
+ @unstable
+ @pytest.mark.parametrize('as_index', [True, False])
+ def test_groupby_extension_agg(self, as_index, data_for_grouping):
+ super(TestGroupby, self).test_groupby_extension_agg(
+ as_index, data_for_grouping
+ )
+
+
+class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests):
+ def test_error(self, data, all_arithmetic_operators):
+ pass
+
+ def test_add_series_with_extension_array(self, data):
+ ser = pd.Series(data)
+ with pytest.raises(TypeError, match="unsupported"):
+ ser + data
+
+ def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
+ return super(TestArithmeticOps, self)._check_divmod_op(
+ s, op, other, exc=TypeError
+ )
+
+
+class TestComparisonOps(BaseJSON, base.BaseComparisonOpsTests):
+ pass
+
+
+class TestPrinting(BaseJSON, base.BasePrintingTests):
+ pass
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/numpy_/__init__.py b/contrib/python/pandas/py2/pandas/tests/extension/numpy_/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/numpy_/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/numpy_/conftest.py b/contrib/python/pandas/py2/pandas/tests/extension/numpy_/conftest.py
new file mode 100644
index 00000000000..daa93571c29
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/numpy_/conftest.py
@@ -0,0 +1,38 @@
+import numpy as np
+import pytest
+
+from pandas.core.arrays.numpy_ import PandasArray
+
+
[email protected]
+def allow_in_pandas(monkeypatch):
+ """
+ A monkeypatch to tell pandas to let us in.
+
+ By default, passing a PandasArray to an index / series / frame
+ constructor will unbox that PandasArray to an ndarray, and treat
+ it as a non-EA column. We don't want people using EAs without
+ reason.
+
+ The mechanism for this is a check against ABCPandasArray
+ in each constructor.
+
+ But, for testing, we need to allow them in pandas. So we patch
+ the _typ of PandasArray, so that we evade the ABCPandasArray
+ check.
+ """
+ with monkeypatch.context() as m:
+ m.setattr(PandasArray, '_typ', 'extension')
+ yield
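+
+# Usage sketch (standard pytest fixture injection, assumed): a test that
+# accepts `allow_in_pandas` as an argument runs with the patched _typ, so
+# constructors keep the PandasArray instead of unboxing it to an ndarray.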
+
+
[email protected]
+def na_value():
+ return np.nan
+
+
[email protected]
+def na_cmp():
+ def cmp(a, b):
+ return np.isnan(a) and np.isnan(b)
+ return cmp
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/numpy_/test_numpy.py b/contrib/python/pandas/py2/pandas/tests/extension/numpy_/test_numpy.py
new file mode 100644
index 00000000000..4c93d5ee0b9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/numpy_/test_numpy.py
@@ -0,0 +1,182 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import compat
+from pandas.core.arrays.numpy_ import PandasArray, PandasDtype
+import pandas.util.testing as tm
+
+from .. import base
+
+
[email protected]
+def dtype():
+ return PandasDtype(np.dtype('float'))
+
+
[email protected]
+def data(allow_in_pandas, dtype):
+ return PandasArray(np.arange(1, 101, dtype=dtype._dtype))
+
+
[email protected]
+def data_missing(allow_in_pandas):
+ return PandasArray(np.array([np.nan, 1.0]))
+
+
[email protected]
+def data_for_sorting(allow_in_pandas):
+ """Length-3 array with a known sort order.
+
+ This should be three items [B, C, A] with
+ A < B < C
+ """
+ return PandasArray(
+ np.array([1, 2, 0])
+ )
+
+
[email protected]
+def data_missing_for_sorting(allow_in_pandas):
+ """Length-3 array with a known sort order.
+
+ This should be three items [B, NA, A] with
+ A < B and NA missing.
+ """
+ return PandasArray(
+ np.array([1, np.nan, 0])
+ )
+
+
[email protected]
+def data_for_grouping(allow_in_pandas):
+ """Data for factorization, grouping, and unique tests.
+
+ Expected to be like [B, B, NA, NA, A, A, B, C]
+
+ Where A < B < C and NA is missing
+ """
+ a, b, c = np.arange(3)
+ return PandasArray(np.array(
+ [b, b, np.nan, np.nan, a, a, b, c]
+ ))
+
+
+class BaseNumPyTests(object):
+ pass
+
+
+class TestCasting(BaseNumPyTests, base.BaseCastingTests):
+ pass
+
+
+class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests):
+ @pytest.mark.skip(reason="We don't register our dtype")
+ # We don't want to register. This test should probably be split in two.
+ def test_from_dtype(self, data):
+ pass
+
+
+class TestDtype(BaseNumPyTests, base.BaseDtypeTests):
+
+ @pytest.mark.skip(reason="Incorrect expected.")
+ # we unsurprisingly clash with a NumPy name.
+ def test_check_dtype(self, data):
+ pass
+
+
+class TestGetitem(BaseNumPyTests, base.BaseGetitemTests):
+ pass
+
+
+class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests):
+ pass
+
+
+class TestInterface(BaseNumPyTests, base.BaseInterfaceTests):
+ pass
+
+
+class TestMethods(BaseNumPyTests, base.BaseMethodsTests):
+
+ @pytest.mark.skip(reason="TODO: remove?")
+ def test_value_counts(self, all_data, dropna):
+ pass
+
+ @pytest.mark.skip(reason="Incorrect expected")
+ # We have a bool dtype, so the result is an ExtensionArray
+ # but expected is not
+ def test_combine_le(self, data_repeated):
+ super(TestMethods, self).test_combine_le(data_repeated)
+
+
+class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests):
+ divmod_exc = None
+ series_scalar_exc = None
+ frame_scalar_exc = None
+ series_array_exc = None
+
+ def test_divmod_series_array(self, data):
+ s = pd.Series(data)
+ self._check_divmod_op(s, divmod, data, exc=None)
+
+ @pytest.mark.skip("We implement ops")
+ def test_error(self, data, all_arithmetic_operators):
+ pass
+
+ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
+ if (compat.PY2 and
+ all_arithmetic_operators in {'__div__', '__rdiv__'}):
+ raise pytest.skip(
+ "Matching NumPy int / int -> float behavior."
+ )
+ super(TestArithmetics, self).test_arith_series_with_scalar(
+ data, all_arithmetic_operators
+ )
+
+ def test_arith_series_with_array(self, data, all_arithmetic_operators):
+ if (compat.PY2 and
+ all_arithmetic_operators in {'__div__', '__rdiv__'}):
+ raise pytest.skip(
+ "Matching NumPy int / int -> float behavior."
+ )
+ super(TestArithmetics, self).test_arith_series_with_array(
+ data, all_arithmetic_operators
+ )
+
+
+class TestPrinting(BaseNumPyTests, base.BasePrintingTests):
+ pass
+
+
+class TestNumericReduce(BaseNumPyTests, base.BaseNumericReduceTests):
+
+ def check_reduce(self, s, op_name, skipna):
+ result = getattr(s, op_name)(skipna=skipna)
+ # avoid coercing int -> float. Just cast to the actual numpy type.
+ expected = getattr(s.astype(s.dtype._dtype), op_name)(skipna=skipna)
+ tm.assert_almost_equal(result, expected)
+
+
+class TestBooleanReduce(BaseNumPyTests, base.BaseBooleanReduceTests):
+ pass
+
+
+class TestMissing(BaseNumPyTests, base.BaseMissingTests):
+ pass
+
+
+class TestReshaping(BaseNumPyTests, base.BaseReshapingTests):
+
+ @pytest.mark.skip("Incorrect parent test")
+ # not actually a mixed concat, since we concat int and int.
+ def test_concat_mixed_dtypes(self, data):
+ super(TestReshaping, self).test_concat_mixed_dtypes(data)
+
+
+class TestSetitem(BaseNumPyTests, base.BaseSetitemTests):
+ pass
+
+
+class TestParsing(BaseNumPyTests, base.BaseParsingTests):
+ pass
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/numpy_/test_numpy_nested.py b/contrib/python/pandas/py2/pandas/tests/extension/numpy_/test_numpy_nested.py
new file mode 100644
index 00000000000..cf9b34dd087
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/numpy_/test_numpy_nested.py
@@ -0,0 +1,286 @@
+"""
+Tests for PandasArray with nested data. Users typically won't create
+these objects via `pd.array`, but they can show up through `.array`
+on a Series with nested data.
+
+We partition these tests into their own file, as many of the base
+tests fail, as they aren't appropriate for nested data. It is easier
+to have a separate file with its own data-generating fixtures than
+to skip tests based upon the value of a fixture.
+"""
+import pytest
+
+import pandas as pd
+from pandas.core.arrays.numpy_ import PandasArray, PandasDtype
+
+from .. import base
+
+# For NumPy <1.16, np.array([np.nan, (1,)]) raises
+# ValueError: setting an array element with a sequence.
+np = pytest.importorskip('numpy', minversion='1.16.0')
+
+
[email protected]
+def dtype():
+ return PandasDtype(np.dtype('object'))
+
+
[email protected]
+def data(allow_in_pandas, dtype):
+ return pd.Series([(i,) for i in range(100)]).array
+
+
[email protected]
+def data_missing(allow_in_pandas):
+ return PandasArray(np.array([np.nan, (1,)]))
+
+
[email protected]
+def data_for_sorting(allow_in_pandas):
+ """Length-3 array with a known sort order.
+
+ This should be three items [B, C, A] with
+ A < B < C
+ """
+ # Use an empty tuple for first element, then remove,
+ # to disable np.array's shape inference.
+ return PandasArray(
+ np.array([(), (2,), (3,), (1,)])[1:]
+ )
+
+
[email protected]
+def data_missing_for_sorting(allow_in_pandas):
+ """Length-3 array with a known sort order.
+
+ This should be three items [B, NA, A] with
+ A < B and NA missing.
+ """
+ return PandasArray(
+ np.array([(1,), np.nan, (0,)])
+ )
+
+
[email protected]
+def data_for_grouping(allow_in_pandas):
+ """Data for factorization, grouping, and unique tests.
+
+ Expected to be like [B, B, NA, NA, A, A, B, C]
+
+ Where A < B < C and NA is missing
+ """
+ a, b, c = (1,), (2,), (3,)
+ return PandasArray(np.array(
+ [b, b, np.nan, np.nan, a, a, b, c]
+ ))
+
+
+skip_nested = pytest.mark.skip(reason="Skipping for nested PandasArray")
+
+
+class BaseNumPyTests(object):
+ pass
+
+
+class TestCasting(BaseNumPyTests, base.BaseCastingTests):
+
+ @skip_nested
+ def test_astype_str(self, data):
+ pass
+
+
+class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests):
+ @pytest.mark.skip(reason="We don't register our dtype")
+ # We don't want to register. This test should probably be split in two.
+ def test_from_dtype(self, data):
+ pass
+
+ @skip_nested
+ def test_array_from_scalars(self, data):
+ pass
+
+
+class TestDtype(BaseNumPyTests, base.BaseDtypeTests):
+
+ @pytest.mark.skip(reason="Incorrect expected.")
+ # we unsurprisingly clash with a NumPy name.
+ def test_check_dtype(self, data):
+ pass
+
+
+class TestGetitem(BaseNumPyTests, base.BaseGetitemTests):
+
+ @skip_nested
+ def test_getitem_scalar(self, data):
+ pass
+
+ @skip_nested
+ def test_take_series(self, data):
+ pass
+
+
+class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests):
+ @skip_nested
+ def test_groupby_extension_apply(self, data_for_grouping, op):
+ pass
+
+
+class TestInterface(BaseNumPyTests, base.BaseInterfaceTests):
+ @skip_nested
+ def test_array_interface(self, data):
+ # NumPy array shape inference
+ pass
+
+
+class TestMethods(BaseNumPyTests, base.BaseMethodsTests):
+
+ @pytest.mark.skip(reason="TODO: remove?")
+ def test_value_counts(self, all_data, dropna):
+ pass
+
+ @pytest.mark.skip(reason="Incorrect expected")
+ # We have a bool dtype, so the result is an ExtensionArray
+ # but expected is not
+ def test_combine_le(self, data_repeated):
+ super(TestMethods, self).test_combine_le(data_repeated)
+
+ @skip_nested
+ def test_combine_add(self, data_repeated):
+ # Not numeric
+ pass
+
+ @skip_nested
+ def test_shift_fill_value(self, data):
+ # np.array shape inference. Shift implementation fails.
+ super().test_shift_fill_value(data)
+
+ @skip_nested
+ def test_unique(self, data, box, method):
+ # Fails creating expected
+ pass
+
+ @skip_nested
+ def test_fillna_copy_frame(self, data_missing):
+ # The "scalar" for this array isn't a scalar.
+ pass
+
+ @skip_nested
+ def test_fillna_copy_series(self, data_missing):
+ # The "scalar" for this array isn't a scalar.
+ pass
+
+ @skip_nested
+ def test_hash_pandas_object_works(self, data, as_frame):
+ # ndarray of tuples not hashable
+ pass
+
+ @skip_nested
+ def test_searchsorted(self, data_for_sorting, as_series):
+ # Test setup fails.
+ pass
+
+ @skip_nested
+ def test_where_series(self, data, na_value, as_frame):
+ # Test setup fails.
+ pass
+
+ @skip_nested
+ def test_repeat(self, data, repeats, as_series, use_numpy):
+ # Fails creating expected
+ pass
+
+
+class TestPrinting(BaseNumPyTests, base.BasePrintingTests):
+ pass
+
+
+class TestMissing(BaseNumPyTests, base.BaseMissingTests):
+
+ @skip_nested
+ def test_fillna_scalar(self, data_missing):
+ # Non-scalar "scalar" values.
+ pass
+
+ @skip_nested
+ def test_fillna_series_method(self, data_missing, method):
+ # Non-scalar "scalar" values.
+ pass
+
+ @skip_nested
+ def test_fillna_series(self, data_missing):
+ # Non-scalar "scalar" values.
+ pass
+
+ @skip_nested
+ def test_fillna_frame(self, data_missing):
+ # Non-scalar "scalar" values.
+ pass
+
+
+class TestReshaping(BaseNumPyTests, base.BaseReshapingTests):
+
+ @pytest.mark.skip("Incorrect parent test")
+ # not actually a mixed concat, since we concat int and int.
+ def test_concat_mixed_dtypes(self, data):
+ super(TestReshaping, self).test_concat_mixed_dtypes(data)
+
+ @skip_nested
+ def test_merge(self, data, na_value):
+ # Fails creating expected
+ pass
+
+ @skip_nested
+ def test_merge_on_extension_array(self, data):
+ # Fails creating expected
+ pass
+
+ @skip_nested
+ def test_merge_on_extension_array_duplicates(self, data):
+ # Fails creating expected
+ pass
+
+
+class TestSetitem(BaseNumPyTests, base.BaseSetitemTests):
+
+ @skip_nested
+ def test_setitem_scalar_series(self, data, box_in_series):
+ pass
+
+ @skip_nested
+ def test_setitem_sequence(self, data, box_in_series):
+ pass
+
+ @skip_nested
+ def test_setitem_sequence_mismatched_length_raises(self, data, as_array):
+ pass
+
+ @skip_nested
+ def test_setitem_sequence_broadcasts(self, data, box_in_series):
+ pass
+
+ @skip_nested
+ def test_setitem_loc_scalar_mixed(self, data):
+ pass
+
+ @skip_nested
+ def test_setitem_loc_scalar_multiple_homogoneous(self, data):
+ pass
+
+ @skip_nested
+ def test_setitem_iloc_scalar_mixed(self, data):
+ pass
+
+ @skip_nested
+ def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
+ pass
+
+ @skip_nested
+ def test_setitem_mask_broadcast(self, data, setter):
+ pass
+
+ @skip_nested
+ def test_setitem_scalar_key_sequence_raise(self, data):
+ pass
+
+
+# Skip Arithmetics, NumericReduce, BooleanReduce, Parsing
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/test_categorical.py b/contrib/python/pandas/py2/pandas/tests/extension/test_categorical.py
new file mode 100644
index 00000000000..ac52d8f15b8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/test_categorical.py
@@ -0,0 +1,243 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+
+"""
+import string
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Categorical
+from pandas.api.types import CategoricalDtype
+from pandas.tests.extension import base
+
+
+def make_data():
+ while True:
+ values = np.random.choice(list(string.ascii_letters), size=100)
+ # ensure we meet the requirements
+ # 1. first two not null
+ # 2. first and second are different
+ if values[0] != values[1]:
+ break
+ return values
+
+
[email protected]
+def dtype():
+ return CategoricalDtype()
+
+
[email protected]
+def data():
+ """Length-100 array for this type.
+
+    * data[0] and data[1] should both be non-missing
+    * data[0] and data[1] should not be equal
+ """
+ return Categorical(make_data())
+
+
[email protected]
+def data_missing():
+ """Length 2 array with [NA, Valid]"""
+ return Categorical([np.nan, 'A'])
+
+
[email protected]
+def data_for_sorting():
+ return Categorical(['A', 'B', 'C'], categories=['C', 'A', 'B'],
+ ordered=True)
+
+
[email protected]
+def data_missing_for_sorting():
+ return Categorical(['A', None, 'B'], categories=['B', 'A'],
+ ordered=True)
+
+
[email protected]
+def na_value():
+ return np.nan
+
+
[email protected]
+def data_for_grouping():
+ return Categorical(['a', 'a', None, None, 'b', 'b', 'a', 'c'])
+
+
+class TestDtype(base.BaseDtypeTests):
+ pass
+
+
+class TestInterface(base.BaseInterfaceTests):
+ @pytest.mark.skip(reason="Memory usage doesn't match")
+ def test_memory_usage(self, data):
+ # Is this deliberate?
+ super(TestInterface, self).test_memory_usage(data)
+
+
+class TestConstructors(base.BaseConstructorsTests):
+ pass
+
+
+class TestReshaping(base.BaseReshapingTests):
+ pass
+
+
+class TestGetitem(base.BaseGetitemTests):
+ skip_take = pytest.mark.skip(reason="GH-20664.")
+
+ @pytest.mark.skip(reason="Backwards compatibility")
+ def test_getitem_scalar(self, data):
+ # CategoricalDtype.type isn't "correct" since it should
+ # be a parent of the elements (object). But don't want
+ # to break things by changing.
+ super(TestGetitem, self).test_getitem_scalar(data)
+
+ @skip_take
+ def test_take(self, data, na_value, na_cmp):
+ # TODO remove this once Categorical.take is fixed
+ super(TestGetitem, self).test_take(data, na_value, na_cmp)
+
+ @skip_take
+ def test_take_negative(self, data):
+ super().test_take_negative(data)
+
+ @skip_take
+ def test_take_pandas_style_negative_raises(self, data, na_value):
+ super().test_take_pandas_style_negative_raises(data, na_value)
+
+ @skip_take
+ def test_take_non_na_fill_value(self, data_missing):
+ super().test_take_non_na_fill_value(data_missing)
+
+ @skip_take
+ def test_take_out_of_bounds_raises(self, data, allow_fill):
+ return super().test_take_out_of_bounds_raises(data, allow_fill)
+
+ @pytest.mark.skip(reason="GH-20747. Unobserved categories.")
+ def test_take_series(self, data):
+ super().test_take_series(data)
+
+ @skip_take
+ def test_reindex_non_na_fill_value(self, data_missing):
+ super().test_reindex_non_na_fill_value(data_missing)
+
+ @pytest.mark.skip(reason="Categorical.take buggy")
+ def test_take_empty(self, data, na_value, na_cmp):
+ super().test_take_empty(data, na_value, na_cmp)
+
+ @pytest.mark.skip(reason="test not written correctly for categorical")
+ def test_reindex(self, data, na_value):
+ super().test_reindex(data, na_value)
+
+
+class TestSetitem(base.BaseSetitemTests):
+ pass
+
+
+class TestMissing(base.BaseMissingTests):
+
+ @pytest.mark.skip(reason="Not implemented")
+ def test_fillna_limit_pad(self, data_missing):
+ super().test_fillna_limit_pad(data_missing)
+
+ @pytest.mark.skip(reason="Not implemented")
+ def test_fillna_limit_backfill(self, data_missing):
+ super().test_fillna_limit_backfill(data_missing)
+
+
+class TestReduce(base.BaseNoReduceTests):
+ pass
+
+
+class TestMethods(base.BaseMethodsTests):
+ @pytest.mark.skip(reason="Unobserved categories included")
+ def test_value_counts(self, all_data, dropna):
+ return super().test_value_counts(all_data, dropna)
+
+ def test_combine_add(self, data_repeated):
+ # GH 20825
+ # When adding categoricals in combine, result is a string
+ orig_data1, orig_data2 = data_repeated(2)
+ s1 = pd.Series(orig_data1)
+ s2 = pd.Series(orig_data2)
+ result = s1.combine(s2, lambda x1, x2: x1 + x2)
+ expected = pd.Series(([a + b for (a, b) in
+ zip(list(orig_data1), list(orig_data2))]))
+ self.assert_series_equal(result, expected)
+
+ val = s1.iloc[0]
+ result = s1.combine(val, lambda x1, x2: x1 + x2)
+ expected = pd.Series([a + val for a in list(orig_data1)])
+ self.assert_series_equal(result, expected)
+
+ @pytest.mark.skip(reason="Not Applicable")
+ def test_fillna_length_mismatch(self, data_missing):
+ super().test_fillna_length_mismatch(data_missing)
+
+ def test_searchsorted(self, data_for_sorting):
+ if not data_for_sorting.ordered:
+ raise pytest.skip(reason="searchsorted requires ordered data.")
+
+
+class TestCasting(base.BaseCastingTests):
+ pass
+
+
+class TestArithmeticOps(base.BaseArithmeticOpsTests):
+
+ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
+
+ op_name = all_arithmetic_operators
+ if op_name != '__rmod__':
+ super(TestArithmeticOps, self).test_arith_series_with_scalar(
+ data, op_name)
+ else:
+ pytest.skip('rmod never called when string is first argument')
+
+ def test_add_series_with_extension_array(self, data):
+ ser = pd.Series(data)
+ with pytest.raises(TypeError, match="cannot perform"):
+ ser + data
+
+ def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
+ return super(TestArithmeticOps, self)._check_divmod_op(
+ s, op, other, exc=TypeError
+ )
+
+
+class TestComparisonOps(base.BaseComparisonOpsTests):
+
+ def _compare_other(self, s, data, op_name, other):
+ op = self.get_op_from_name(op_name)
+ if op_name == '__eq__':
+ result = op(s, other)
+ expected = s.combine(other, lambda x, y: x == y)
+ assert (result == expected).all()
+
+ elif op_name == '__ne__':
+ result = op(s, other)
+ expected = s.combine(other, lambda x, y: x != y)
+ assert (result == expected).all()
+
+ else:
+ with pytest.raises(TypeError):
+ op(data, other)
+
+
+class TestParsing(base.BaseParsingTests):
+ pass
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/test_common.py b/contrib/python/pandas/py2/pandas/tests/extension/test_common.py
new file mode 100644
index 00000000000..db3f3b80bca
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/test_common.py
@@ -0,0 +1,86 @@
+import numpy as np
+import pytest
+
+from pandas.core.dtypes import dtypes
+from pandas.core.dtypes.common import is_extension_array_dtype
+
+import pandas as pd
+from pandas.core.arrays import ExtensionArray
+import pandas.util.testing as tm
+
+
+class DummyDtype(dtypes.ExtensionDtype):
+ pass
+
+
+class DummyArray(ExtensionArray):
+
+ def __init__(self, data):
+ self.data = data
+
+ def __array__(self, dtype):
+ return self.data
+
+ @property
+ def dtype(self):
+ return DummyDtype()
+
+ def astype(self, dtype, copy=True):
+ # we don't support anything but a single dtype
+ if isinstance(dtype, DummyDtype):
+ if copy:
+ return type(self)(self.data)
+ return self
+
+ return np.array(self, dtype=dtype, copy=copy)
+
+
+class TestExtensionArrayDtype(object):
+
+ @pytest.mark.parametrize('values', [
+ pd.Categorical([]),
+ pd.Categorical([]).dtype,
+ pd.Series(pd.Categorical([])),
+ DummyDtype(),
+ DummyArray(np.array([1, 2])),
+ ])
+ def test_is_extension_array_dtype(self, values):
+ assert is_extension_array_dtype(values)
+
+ @pytest.mark.parametrize('values', [
+ np.array([]),
+ pd.Series(np.array([])),
+ ])
+ def test_is_not_extension_array_dtype(self, values):
+ assert not is_extension_array_dtype(values)
+
+
+def test_astype():
+
+ arr = DummyArray(np.array([1, 2, 3]))
+ expected = np.array([1, 2, 3], dtype=object)
+
+ result = arr.astype(object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = arr.astype('object')
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_astype_no_copy():
+ arr = DummyArray(np.array([1, 2, 3], dtype=np.int64))
+ result = arr.astype(arr.dtype, copy=False)
+
+ assert arr is result
+
+ result = arr.astype(arr.dtype)
+ assert arr is not result
+
+
[email protected]('dtype', [
+    dtypes.CategoricalDtype(),
+ dtypes.IntervalDtype(),
+])
+def test_is_extension_array_dtype(dtype):
+ assert isinstance(dtype, dtypes.ExtensionDtype)
+ assert is_extension_array_dtype(dtype)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/test_datetime.py b/contrib/python/pandas/py2/pandas/tests/extension/test_datetime.py
new file mode 100644
index 00000000000..00ad35bf6a9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/test_datetime.py
@@ -0,0 +1,237 @@
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+
+import pandas as pd
+from pandas.core.arrays import DatetimeArray
+from pandas.tests.extension import base
+
+
[email protected](params=["US/Central"])
+def dtype(request):
+ return DatetimeTZDtype(unit="ns", tz=request.param)
+
+
[email protected]
+def data(dtype):
+ data = DatetimeArray(pd.date_range("2000", periods=100, tz=dtype.tz),
+ dtype=dtype)
+ return data
+
+
[email protected]
+def data_missing(dtype):
+ return DatetimeArray(
+ np.array(['NaT', '2000-01-01'], dtype='datetime64[ns]'),
+ dtype=dtype
+ )
+
+
[email protected]
+def data_for_sorting(dtype):
+ a = pd.Timestamp('2000-01-01')
+ b = pd.Timestamp('2000-01-02')
+ c = pd.Timestamp('2000-01-03')
+ return DatetimeArray(np.array([b, c, a], dtype='datetime64[ns]'),
+ dtype=dtype)
+
+
[email protected]
+def data_missing_for_sorting(dtype):
+ a = pd.Timestamp('2000-01-01')
+ b = pd.Timestamp('2000-01-02')
+ return DatetimeArray(np.array([b, 'NaT', a], dtype='datetime64[ns]'),
+ dtype=dtype)
+
+
[email protected]
+def data_for_grouping(dtype):
+ """
+ Expected to be like [B, B, NA, NA, A, A, B, C]
+
+ Where A < B < C and NA is missing
+ """
+ a = pd.Timestamp('2000-01-01')
+ b = pd.Timestamp('2000-01-02')
+ c = pd.Timestamp('2000-01-03')
+ na = 'NaT'
+ return DatetimeArray(np.array([b, b, na, na, a, a, b, c],
+ dtype='datetime64[ns]'),
+ dtype=dtype)
+
+
[email protected]
+def na_cmp():
+ def cmp(a, b):
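+        # pd.NaT is a singleton, so an identity comparison is sufficient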
+ return a is pd.NaT and a is b
+ return cmp
+
+
[email protected]
+def na_value():
+ return pd.NaT
+
+
+# ----------------------------------------------------------------------------
+class BaseDatetimeTests(object):
+ pass
+
+
+# ----------------------------------------------------------------------------
+# Tests
+class TestDatetimeDtype(BaseDatetimeTests, base.BaseDtypeTests):
+ pass
+
+
+class TestConstructors(BaseDatetimeTests, base.BaseConstructorsTests):
+ pass
+
+
+class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests):
+ pass
+
+
+class TestMethods(BaseDatetimeTests, base.BaseMethodsTests):
+ @pytest.mark.skip(reason="Incorrect expected")
+ def test_value_counts(self, all_data, dropna):
+ pass
+
+ def test_combine_add(self, data_repeated):
+ # Timestamp.__add__(Timestamp) not defined
+ pass
+
+
+class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests):
+
+ def test_array_interface(self, data):
+ if data.tz:
+ # np.asarray(DTA) is currently always tz-naive.
+ pytest.skip("GH-23569")
+ else:
+ super(TestInterface, self).test_array_interface(data)
+
+
+class TestArithmeticOps(BaseDatetimeTests, base.BaseArithmeticOpsTests):
+ implements = {'__sub__', '__rsub__'}
+
+ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
+ if all_arithmetic_operators in self.implements:
+ s = pd.Series(data)
+ self.check_opname(s, all_arithmetic_operators, s.iloc[0],
+ exc=None)
+ else:
+ # ... but not the rest.
+ super(TestArithmeticOps, self).test_arith_series_with_scalar(
+ data, all_arithmetic_operators
+ )
+
+ def test_add_series_with_extension_array(self, data):
+ # Datetime + Datetime not implemented
+ s = pd.Series(data)
+ msg = 'cannot add DatetimeArray and DatetimeArray'
+ with pytest.raises(TypeError, match=msg):
+ s + data
+
+ def test_arith_series_with_array(self, data, all_arithmetic_operators):
+ if all_arithmetic_operators in self.implements:
+ s = pd.Series(data)
+ self.check_opname(s, all_arithmetic_operators, s.iloc[0],
+ exc=None)
+ else:
+ # ... but not the rest.
+            super(TestArithmeticOps, self).test_arith_series_with_array(
+ data, all_arithmetic_operators
+ )
+
+ def test_error(self, data, all_arithmetic_operators):
+ pass
+
+ @pytest.mark.xfail(reason="different implementation", strict=False)
+ def test_direct_arith_with_series_returns_not_implemented(self, data):
+ # Right now, we have trouble with this. Returning NotImplemented
+ # fails other tests like
+ # tests/arithmetic/test_datetime64::TestTimestampSeriesArithmetic::
+ # test_dt64_seris_add_intlike
+ return super(
+ TestArithmeticOps,
+ self
+ ).test_direct_arith_with_series_returns_not_implemented(data)
+
+
+class TestCasting(BaseDatetimeTests, base.BaseCastingTests):
+ pass
+
+
+class TestComparisonOps(BaseDatetimeTests, base.BaseComparisonOpsTests):
+
+ def _compare_other(self, s, data, op_name, other):
+ # the base test is not appropriate for us. We raise on comparison
+ # with (some) integers, depending on the value.
+ pass
+
+ @pytest.mark.xfail(reason="different implementation", strict=False)
+ def test_direct_arith_with_series_returns_not_implemented(self, data):
+ return super(
+ TestComparisonOps,
+ self
+ ).test_direct_arith_with_series_returns_not_implemented(data)
+
+
+class TestMissing(BaseDatetimeTests, base.BaseMissingTests):
+ pass
+
+
+class TestReshaping(BaseDatetimeTests, base.BaseReshapingTests):
+
+ @pytest.mark.skip(reason="We have DatetimeTZBlock")
+ def test_concat(self, data, in_frame):
+ pass
+
+ def test_concat_mixed_dtypes(self, data):
+ # concat(Series[datetimetz], Series[category]) uses a
+ # plain np.array(values) on the DatetimeArray, which
+ # drops the tz.
+ super(TestReshaping, self).test_concat_mixed_dtypes(data)
+
+ @pytest.mark.parametrize("obj", ["series", "frame"])
+ def test_unstack(self, obj):
+ # GH-13287: can't use base test, since building the expected fails.
+ data = DatetimeArray._from_sequence(['2000', '2001', '2002', '2003'],
+ tz='US/Central')
+ index = pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]),
+ names=['a', 'b'])
+
+ if obj == "series":
+ ser = pd.Series(data, index=index)
+ expected = pd.DataFrame({
+ "A": data.take([0, 1]),
+ "B": data.take([2, 3])
+ }, index=pd.Index(['a', 'b'], name='b'))
+ expected.columns.name = 'a'
+
+ else:
+ ser = pd.DataFrame({"A": data, "B": data}, index=index)
+ expected = pd.DataFrame(
+ {("A", "A"): data.take([0, 1]),
+ ("A", "B"): data.take([2, 3]),
+ ("B", "A"): data.take([0, 1]),
+ ("B", "B"): data.take([2, 3])},
+ index=pd.Index(['a', 'b'], name='b')
+ )
+ expected.columns.names = [None, 'a']
+
+ result = ser.unstack(0)
+ self.assert_equal(result, expected)
+
+
+class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests):
+ pass
+
+
+class TestGroupby(BaseDatetimeTests, base.BaseGroupbyTests):
+ pass
+
+
+class TestPrinting(BaseDatetimeTests, base.BasePrintingTests):
+ pass
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/test_external_block.py b/contrib/python/pandas/py2/pandas/tests/extension/test_external_block.py
new file mode 100644
index 00000000000..1b3f285e640
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/test_external_block.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=W0102
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.core.internals import BlockManager, SingleBlockManager
+from pandas.core.internals.blocks import Block, NonConsolidatableMixIn
+
+
+class CustomBlock(NonConsolidatableMixIn, Block):
+
+ _holder = np.ndarray
+
+ def formatting_values(self):
+ return np.array(["Val: {}".format(i) for i in self.values])
+
+ def concat_same_type(self, to_concat, placement=None):
+ """
+ Always concatenate disregarding self.ndim as the values are
+ always 1D in this custom Block
+ """
+ values = np.concatenate([blk.values for blk in to_concat])
+ return self.make_block_same_class(
+ values, placement=placement or slice(0, len(values), 1))
+
+
[email protected]
+def df():
+ df1 = pd.DataFrame({'a': [1, 2, 3]})
+ blocks = df1._data.blocks
+ values = np.arange(3, dtype='int64')
+ custom_block = CustomBlock(values, placement=slice(1, 2))
+ blocks = blocks + (custom_block,)
+ block_manager = BlockManager(blocks, [pd.Index(['a', 'b']), df1.index])
+ return pd.DataFrame(block_manager)
+
+
+def test_custom_repr():
+ values = np.arange(3, dtype='int64')
+
+ # series
+ block = CustomBlock(values, placement=slice(0, 3))
+
+ s = pd.Series(SingleBlockManager(block, pd.RangeIndex(3)))
+ assert repr(s) == '0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64'
+
+ # dataframe
+ block = CustomBlock(values, placement=slice(0, 1))
+ blk_mgr = BlockManager([block], [['col'], range(3)])
+ df = pd.DataFrame(blk_mgr)
+ assert repr(df) == ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2'
+
+
+def test_concat_series():
+ # GH17728
+ values = np.arange(3, dtype='int64')
+ block = CustomBlock(values, placement=slice(0, 3))
+ s = pd.Series(block, pd.RangeIndex(3), fastpath=True)
+
+ res = pd.concat([s, s])
+ assert isinstance(res._data.blocks[0], CustomBlock)
+
+
+def test_concat_dataframe(df):
+ # GH17728
+ res = pd.concat([df, df])
+ assert isinstance(res._data.blocks[1], CustomBlock)
+
+
+def test_concat_axis1(df):
+ # GH17954
+ df2 = pd.DataFrame({'c': [.1, .2, .3]})
+ res = pd.concat([df, df2], axis=1)
+ assert isinstance(res._data.blocks[1], CustomBlock)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/test_integer.py b/contrib/python/pandas/py2/pandas/tests/extension/test_integer.py
new file mode 100644
index 00000000000..aadf9f2f12b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/test_integer.py
@@ -0,0 +1,224 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+
+"""
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.common import is_extension_array_dtype
+
+import pandas as pd
+from pandas.core.arrays import integer_array
+from pandas.core.arrays.integer import (
+ Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype,
+ UInt32Dtype, UInt64Dtype)
+from pandas.tests.extension import base
+
+
+def make_data():
+ return (list(range(1, 9)) + [np.nan] + list(range(10, 98))
+ + [np.nan] + [99, 100])
+
+
[email protected](params=[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype,
+ UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype])
+def dtype(request):
+ return request.param()
+
+
[email protected]
+def data(dtype):
+ return integer_array(make_data(), dtype=dtype)
+
+
[email protected]
+def data_missing(dtype):
+ return integer_array([np.nan, 1], dtype=dtype)
+
+
[email protected]
+def data_for_sorting(dtype):
+ return integer_array([1, 2, 0], dtype=dtype)
+
+
[email protected]
+def data_missing_for_sorting(dtype):
+ return integer_array([1, np.nan, 0], dtype=dtype)
+
+
[email protected]
+def na_cmp():
+ # we are np.nan
+ return lambda x, y: np.isnan(x) and np.isnan(y)
+
+
[email protected]
+def na_value():
+ return np.nan
+
+
[email protected]
+def data_for_grouping(dtype):
+ b = 1
+ a = 0
+ c = 2
+ na = np.nan
+ return integer_array([b, b, na, na, a, a, b, c], dtype=dtype)
+
+
+class TestDtype(base.BaseDtypeTests):
+
+ @pytest.mark.skip(reason="using multiple dtypes")
+ def test_is_dtype_unboxes_dtype(self):
+ # we have multiple dtypes, so skip
+ pass
+
+
+class TestArithmeticOps(base.BaseArithmeticOpsTests):
+
+ def check_opname(self, s, op_name, other, exc=None):
+ # overwriting to indicate ops don't raise an error
+ super(TestArithmeticOps, self).check_opname(s, op_name,
+ other, exc=None)
+
+ def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
+ if exc is None:
+ if s.dtype.is_unsigned_integer and (op_name == '__rsub__'):
+ # TODO see https://github.com/pandas-dev/pandas/issues/22023
+ pytest.skip("unsigned subtraction gives negative values")
+
+ if (hasattr(other, 'dtype')
+ and not is_extension_array_dtype(other.dtype)
+ and pd.api.types.is_integer_dtype(other.dtype)):
+ # other is np.int64 and would therefore always result in
+ # upcasting, so keeping other as same numpy_dtype
+ other = other.astype(s.dtype.numpy_dtype)
+
+ result = op(s, other)
+ expected = s.combine(other, op)
+
+ if op_name == '__rdiv__':
+ # combine is not giving the correct result for this case
+ pytest.skip("skipping reverse div in python 2")
+ elif op_name in ('__rtruediv__', '__truediv__', '__div__'):
+ expected = expected.astype(float)
+ if op_name == '__rtruediv__':
+ # TODO reverse operators result in object dtype
+ result = result.astype(float)
+ elif op_name.startswith('__r'):
+ # TODO reverse operators result in object dtype
+ # see https://github.com/pandas-dev/pandas/issues/22024
+ expected = expected.astype(s.dtype)
+ result = result.astype(s.dtype)
+ else:
+                # the combine method results in the 'biggest' (int64) dtype
+                expected = expected.astype(s.dtype)
+ if (op_name == '__rpow__') and isinstance(other, pd.Series):
+ # TODO pow on Int arrays gives different result with NA
+ # see https://github.com/pandas-dev/pandas/issues/22022
+ result = result.fillna(1)
+
+ self.assert_series_equal(result, expected)
+ else:
+ with pytest.raises(exc):
+ op(s, other)
+
+ def _check_divmod_op(self, s, op, other, exc=None):
+ super(TestArithmeticOps, self)._check_divmod_op(s, op, other, None)
+
+ @pytest.mark.skip(reason="intNA does not error on ops")
+ def test_error(self, data, all_arithmetic_operators):
+ # other specific errors tested in the integer array specific tests
+ pass
+
+
+class TestComparisonOps(base.BaseComparisonOpsTests):
+
+ def check_opname(self, s, op_name, other, exc=None):
+ super(TestComparisonOps, self).check_opname(s, op_name,
+ other, exc=None)
+
+ def _compare_other(self, s, data, op_name, other):
+ self.check_opname(s, op_name, other)
+
+
+class TestInterface(base.BaseInterfaceTests):
+ pass
+
+
+class TestConstructors(base.BaseConstructorsTests):
+ pass
+
+
+class TestReshaping(base.BaseReshapingTests):
+ pass
+
+ # for test_concat_mixed_dtypes test
+ # concat of an Integer and Int coerces to object dtype
+ # TODO(jreback) once integrated this would
+
+
+class TestGetitem(base.BaseGetitemTests):
+ pass
+
+
+class TestSetitem(base.BaseSetitemTests):
+ pass
+
+
+class TestMissing(base.BaseMissingTests):
+ pass
+
+
+class TestMethods(base.BaseMethodsTests):
+
+ @pytest.mark.parametrize('dropna', [True, False])
+ def test_value_counts(self, all_data, dropna):
+ all_data = all_data[:10]
+ if dropna:
+ other = np.array(all_data[~all_data.isna()])
+ else:
+ other = all_data
+
+ result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
+ expected = pd.Series(other).value_counts(
+ dropna=dropna).sort_index()
+ expected.index = expected.index.astype(all_data.dtype)
+
+ self.assert_series_equal(result, expected)
+
+
+class TestCasting(base.BaseCastingTests):
+ pass
+
+
+class TestGroupby(base.BaseGroupbyTests):
+ pass
+
+
+class TestNumericReduce(base.BaseNumericReduceTests):
+ pass
+
+
+class TestBooleanReduce(base.BaseBooleanReduceTests):
+ pass
+
+
+class TestPrinting(base.BasePrintingTests):
+ pass
+
+
+class TestParsing(base.BaseParsingTests):
+ pass
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/test_interval.py b/contrib/python/pandas/py2/pandas/tests/extension/test_interval.py
new file mode 100644
index 00000000000..6eedbfb4aba
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/test_interval.py
@@ -0,0 +1,162 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+
+"""
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import IntervalDtype
+
+from pandas import Interval
+from pandas.core.arrays import IntervalArray
+from pandas.tests.extension import base
+
+
+def make_data():
+ N = 100
+ left = np.random.uniform(size=N).cumsum()
+ right = left + np.random.uniform(size=N)
+ return [Interval(l, r) for l, r in zip(left, right)]
+
+
[email protected]
+def dtype():
+ return IntervalDtype()
+
+
[email protected]
+def data():
+ """Length-100 PeriodArray for semantics test."""
+ return IntervalArray(make_data())
+
+
[email protected]
+def data_missing():
+ """Length 2 array with [NA, Valid]"""
+ return IntervalArray.from_tuples([None, (0, 1)])
+
+
[email protected]
+def data_for_sorting():
+ return IntervalArray.from_tuples([(1, 2), (2, 3), (0, 1)])
+
+
[email protected]
+def data_missing_for_sorting():
+ return IntervalArray.from_tuples([(1, 2), None, (0, 1)])
+
+
[email protected]
+def na_value():
+ return np.nan
+
+
[email protected]
+def data_for_grouping():
+ a = (0, 1)
+ b = (1, 2)
+ c = (2, 3)
+ return IntervalArray.from_tuples([b, b, None, None, a, a, b, c])
+
+
+class BaseInterval(object):
+ pass
+
+
+class TestDtype(BaseInterval, base.BaseDtypeTests):
+ pass
+
+
+class TestCasting(BaseInterval, base.BaseCastingTests):
+ pass
+
+
+class TestConstructors(BaseInterval, base.BaseConstructorsTests):
+ pass
+
+
+class TestGetitem(BaseInterval, base.BaseGetitemTests):
+ pass
+
+
+class TestGrouping(BaseInterval, base.BaseGroupbyTests):
+ pass
+
+
+class TestInterface(BaseInterval, base.BaseInterfaceTests):
+ pass
+
+
+class TestReduce(base.BaseNoReduceTests):
+ pass
+
+
+class TestMethods(BaseInterval, base.BaseMethodsTests):
+
+ @pytest.mark.skip(reason='addition is not defined for intervals')
+ def test_combine_add(self, data_repeated):
+ pass
+
+ @pytest.mark.skip(reason="Not Applicable")
+ def test_fillna_length_mismatch(self, data_missing):
+ pass
+
+
+class TestMissing(BaseInterval, base.BaseMissingTests):
+ # Index.fillna only accepts scalar `value`, so we have to skip all
+ # non-scalar fill tests.
+ unsupported_fill = pytest.mark.skip("Unsupported fillna option.")
+
+ @unsupported_fill
+ def test_fillna_limit_pad(self):
+ pass
+
+ @unsupported_fill
+ def test_fillna_series_method(self):
+ pass
+
+ @unsupported_fill
+ def test_fillna_limit_backfill(self):
+ pass
+
+ @unsupported_fill
+ def test_fillna_series(self):
+ pass
+
+ def test_non_scalar_raises(self, data_missing):
+ msg = "Got a 'list' instead."
+ with pytest.raises(TypeError, match=msg):
+ data_missing.fillna([1, 1])
+
+
+class TestReshaping(BaseInterval, base.BaseReshapingTests):
+ pass
+
+
+class TestSetitem(BaseInterval, base.BaseSetitemTests):
+ pass
+
+
+class TestPrinting(BaseInterval, base.BasePrintingTests):
+ @pytest.mark.skip(reason="custom repr")
+ def test_array_repr(self, data, size):
+ pass
+
+
+class TestParsing(BaseInterval, base.BaseParsingTests):
+ @pytest.mark.parametrize('engine', ['c', 'python'])
+ def test_EA_types(self, engine, data):
+ expected_msg = r'.*must implement _from_sequence_of_strings.*'
+ with pytest.raises(NotImplementedError, match=expected_msg):
+ super(TestParsing, self).test_EA_types(engine, data)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/test_period.py b/contrib/python/pandas/py2/pandas/tests/extension/test_period.py
new file mode 100644
index 00000000000..813efcb5678
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/test_period.py
@@ -0,0 +1,166 @@
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import iNaT
+
+from pandas.core.dtypes.dtypes import PeriodDtype
+
+import pandas as pd
+from pandas.core.arrays import PeriodArray
+from pandas.tests.extension import base
+
+
[email protected]
+def dtype():
+ return PeriodDtype(freq='D')
+
+
[email protected]
+def data(dtype):
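+    # The integers here are period ordinals (for freq='D', ordinal 0 is
+    # 1970-01-01), so this is a run of 100 consecutive daily periods.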
+ return PeriodArray(np.arange(1970, 2070), freq=dtype.freq)
+
+
[email protected]
+def data_for_sorting(dtype):
+ return PeriodArray([2018, 2019, 2017], freq=dtype.freq)
+
+
[email protected]
+def data_missing(dtype):
+ return PeriodArray([iNaT, 2017], freq=dtype.freq)
+
+
[email protected]
+def data_missing_for_sorting(dtype):
+ return PeriodArray([2018, iNaT, 2017], freq=dtype.freq)
+
+
[email protected]
+def data_for_grouping(dtype):
+ B = 2018
+ NA = iNaT
+ A = 2017
+ C = 2019
+ return PeriodArray([B, B, NA, NA, A, A, B, C], freq=dtype.freq)
+
+
[email protected]
+def na_value():
+ return pd.NaT
+
+
+class BasePeriodTests(object):
+ pass
+
+
+class TestPeriodDtype(BasePeriodTests, base.BaseDtypeTests):
+ pass
+
+
+class TestConstructors(BasePeriodTests, base.BaseConstructorsTests):
+ pass
+
+
+class TestGetitem(BasePeriodTests, base.BaseGetitemTests):
+ pass
+
+
+class TestMethods(BasePeriodTests, base.BaseMethodsTests):
+
+ def test_combine_add(self, data_repeated):
+ # Period + Period is not defined.
+ pass
+
+
+class TestInterface(BasePeriodTests, base.BaseInterfaceTests):
+
+ pass
+
+
+class TestArithmeticOps(BasePeriodTests, base.BaseArithmeticOpsTests):
+ implements = {'__sub__', '__rsub__'}
+
+ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
+        # we implement subtraction...
+ if all_arithmetic_operators in self.implements:
+ s = pd.Series(data)
+ self.check_opname(s, all_arithmetic_operators, s.iloc[0],
+ exc=None)
+ else:
+ # ... but not the rest.
+ super(TestArithmeticOps, self).test_arith_series_with_scalar(
+ data, all_arithmetic_operators
+ )
+
+ def test_arith_series_with_array(self, data, all_arithmetic_operators):
+ if all_arithmetic_operators in self.implements:
+ s = pd.Series(data)
+ self.check_opname(s, all_arithmetic_operators, s.iloc[0],
+ exc=None)
+ else:
+ # ... but not the rest.
+            super(TestArithmeticOps, self).test_arith_series_with_array(
+                data, all_arithmetic_operators
+            )
+
+ def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
+ super(TestArithmeticOps, self)._check_divmod_op(
+ s, op, other, exc=TypeError
+ )
+
+ def test_add_series_with_extension_array(self, data):
+ # we don't implement + for Period
+ s = pd.Series(data)
+ msg = (r"unsupported operand type\(s\) for \+: "
+ r"\'PeriodArray\' and \'PeriodArray\'")
+ with pytest.raises(TypeError, match=msg):
+ s + data
+
+ def test_error(self):
+ pass
+
+ def test_direct_arith_with_series_returns_not_implemented(self, data):
+ # Override to use __sub__ instead of __add__
+ other = pd.Series(data)
+ result = data.__sub__(other)
+ assert result is NotImplemented
+
+
+class TestCasting(BasePeriodTests, base.BaseCastingTests):
+ pass
+
+
+class TestComparisonOps(BasePeriodTests, base.BaseComparisonOpsTests):
+
+ def _compare_other(self, s, data, op_name, other):
+ # the base test is not appropriate for us. We raise on comparison
+ # with (some) integers, depending on the value.
+ pass
+
+
+class TestMissing(BasePeriodTests, base.BaseMissingTests):
+ pass
+
+
+class TestReshaping(BasePeriodTests, base.BaseReshapingTests):
+ pass
+
+
+class TestSetitem(BasePeriodTests, base.BaseSetitemTests):
+ pass
+
+
+class TestGroupby(BasePeriodTests, base.BaseGroupbyTests):
+ pass
+
+
+class TestPrinting(BasePeriodTests, base.BasePrintingTests):
+ pass
+
+
+class TestParsing(BasePeriodTests, base.BaseParsingTests):
+ @pytest.mark.parametrize('engine', ['c', 'python'])
+ def test_EA_types(self, engine, data):
+ expected_msg = r'.*must implement _from_sequence_of_strings.*'
+ with pytest.raises(NotImplementedError, match=expected_msg):
+ super(TestParsing, self).test_EA_types(engine, data)
diff --git a/contrib/python/pandas/py2/pandas/tests/extension/test_sparse.py b/contrib/python/pandas/py2/pandas/tests/extension/test_sparse.py
new file mode 100644
index 00000000000..21dbf952496
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/extension/test_sparse.py
@@ -0,0 +1,370 @@
+import numpy as np
+import pytest
+
+from pandas.errors import PerformanceWarning
+
+import pandas as pd
+from pandas import SparseArray, SparseDtype
+from pandas.tests.extension import base
+import pandas.util.testing as tm
+
+
+def make_data(fill_value):
+ if np.isnan(fill_value):
+ data = np.random.uniform(size=100)
+ else:
+ data = np.random.randint(1, 100, size=100)
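+        # make sure the first two entries differ; some tests (e.g.
+        # test_where_series) rely on data[0] != data[1]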
+ if data[0] == data[1]:
+ data[0] += 1
+
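+    # every third entry becomes the fill value, giving a predictable mix of
+    # "sparse" (fill) and dense entries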
+ data[2::3] = fill_value
+ return data
+
+
[email protected]
+def dtype():
+ return SparseDtype()
+
+
[email protected](params=[0, np.nan])
+def data(request):
+ """Length-100 PeriodArray for semantics test."""
+ res = SparseArray(make_data(request.param),
+ fill_value=request.param)
+ return res
+
+
[email protected](params=[0, np.nan])
+def data_missing(request):
+ """Length 2 array with [NA, Valid]"""
+ return SparseArray([np.nan, 1], fill_value=request.param)
+
+
[email protected](params=[0, np.nan])
+def data_repeated(request):
+ """Return different versions of data for count times"""
+ def gen(count):
+ for _ in range(count):
+ yield SparseArray(make_data(request.param),
+ fill_value=request.param)
+ yield gen
+
+
[email protected](params=[0, np.nan])
+def data_for_sorting(request):
+ return SparseArray([2, 3, 1], fill_value=request.param)
+
+
[email protected](params=[0, np.nan])
+def data_missing_for_sorting(request):
+ return SparseArray([2, np.nan, 1], fill_value=request.param)
+
+
[email protected]
+def na_value():
+ return np.nan
+
+
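+# NaN does not compare equal to itself, so NA-vs-NA comparisons are routed
+# through pd.isna on both operands.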
[email protected]
+def na_cmp():
+ return lambda left, right: pd.isna(left) and pd.isna(right)
+
+
[email protected](params=[0, np.nan])
+def data_for_grouping(request):
+ return SparseArray([1, 1, np.nan, np.nan, 2, 2, 1, 3],
+ fill_value=request.param)
+
+
+class BaseSparseTests(object):
+
+ def _check_unsupported(self, data):
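+        # an int-backed SparseArray (fill_value 0) cannot hold NaN, so tests
+        # that need missing values are skipped for that dtype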
+ if data.dtype == SparseDtype(int, 0):
+ pytest.skip("Can't store nan in int array.")
+
+
+class TestDtype(BaseSparseTests, base.BaseDtypeTests):
+
+ def test_array_type_with_arg(self, data, dtype):
+ assert dtype.construct_array_type() is SparseArray
+
+
+class TestInterface(BaseSparseTests, base.BaseInterfaceTests):
+ def test_no_values_attribute(self, data):
+ pytest.skip("We have values")
+
+
+class TestConstructors(BaseSparseTests, base.BaseConstructorsTests):
+ pass
+
+
+class TestReshaping(BaseSparseTests, base.BaseReshapingTests):
+
+ def test_concat_mixed_dtypes(self, data):
+ # https://github.com/pandas-dev/pandas/issues/20762
+ # This should be the same, aside from concat([sparse, float])
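+        # concatenating sparse with dense/categorical frames falls back to
+        # object dtype, which is why expected densifies each column below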
+ df1 = pd.DataFrame({'A': data[:3]})
+ df2 = pd.DataFrame({"A": [1, 2, 3]})
+ df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category')
+ dfs = [df1, df2, df3]
+
+ # dataframes
+ result = pd.concat(dfs)
+ expected = pd.concat([x.apply(lambda s: np.asarray(s).astype(object))
+ for x in dfs])
+ self.assert_frame_equal(result, expected)
+
+ def test_concat_columns(self, data, na_value):
+ self._check_unsupported(data)
+ super(TestReshaping, self).test_concat_columns(data, na_value)
+
+ def test_align(self, data, na_value):
+ self._check_unsupported(data)
+ super(TestReshaping, self).test_align(data, na_value)
+
+ def test_align_frame(self, data, na_value):
+ self._check_unsupported(data)
+ super(TestReshaping, self).test_align_frame(data, na_value)
+
+ def test_align_series_frame(self, data, na_value):
+ self._check_unsupported(data)
+ super(TestReshaping, self).test_align_series_frame(data, na_value)
+
+ def test_merge(self, data, na_value):
+ self._check_unsupported(data)
+ super(TestReshaping, self).test_merge(data, na_value)
+
+
+class TestGetitem(BaseSparseTests, base.BaseGetitemTests):
+
+ def test_get(self, data):
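+        # labels are the even numbers, so s.get(4) (label-based) and
+        # s.iloc[2] (positional) address the same element; NaN fill values
+        # must be compared via isnan since NaN != NaN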
+ s = pd.Series(data, index=[2 * i for i in range(len(data))])
+ if np.isnan(s.values.fill_value):
+ assert np.isnan(s.get(4)) and np.isnan(s.iloc[2])
+ else:
+ assert s.get(4) == s.iloc[2]
+ assert s.get(2) == s.iloc[1]
+
+ def test_reindex(self, data, na_value):
+ self._check_unsupported(data)
+ super(TestGetitem, self).test_reindex(data, na_value)
+
+
+# Skipping TestSetitem, since we don't implement it.
+
+class TestMissing(BaseSparseTests, base.BaseMissingTests):
+
+ def test_isna(self, data_missing):
+ expected_dtype = SparseDtype(bool,
+ pd.isna(data_missing.dtype.fill_value))
+ expected = SparseArray([True, False], dtype=expected_dtype)
+
+ result = pd.isna(data_missing)
+ self.assert_equal(result, expected)
+
+ result = pd.Series(data_missing).isna()
+ expected = pd.Series(expected)
+ self.assert_series_equal(result, expected)
+
+ # GH 21189
+ result = pd.Series(data_missing).drop([0, 1]).isna()
+ expected = pd.Series([], dtype=expected_dtype)
+ self.assert_series_equal(result, expected)
+
+ def test_fillna_limit_pad(self, data_missing):
+ with tm.assert_produces_warning(PerformanceWarning):
+ super(TestMissing, self).test_fillna_limit_pad(data_missing)
+
+ def test_fillna_limit_backfill(self, data_missing):
+ with tm.assert_produces_warning(PerformanceWarning):
+ super(TestMissing, self).test_fillna_limit_backfill(data_missing)
+
+    def test_fillna_series_method(self, data_missing):
+        with tm.assert_produces_warning(PerformanceWarning):
+            super(TestMissing, self).test_fillna_series_method(data_missing)
+
+ @pytest.mark.skip(reason="Unsupported")
+ def test_fillna_series(self):
+ # this one looks doable.
+ pass
+
+ def test_fillna_frame(self, data_missing):
+ # Have to override to specify that fill_value will change.
+ fill_value = data_missing[1]
+
+ result = pd.DataFrame({
+ "A": data_missing,
+ "B": [1, 2]
+ }).fillna(fill_value)
+
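+        # filling replaces the dtype's fill_value only when the original
+        # fill_value was NA, so the expected dtype changes in that case alone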
+ if pd.isna(data_missing.fill_value):
+ dtype = SparseDtype(data_missing.dtype, fill_value)
+ else:
+ dtype = data_missing.dtype
+
+ expected = pd.DataFrame({
+ "A": data_missing._from_sequence([fill_value, fill_value],
+ dtype=dtype),
+ "B": [1, 2],
+ })
+
+ self.assert_frame_equal(result, expected)
+
+
+class TestMethods(BaseSparseTests, base.BaseMethodsTests):
+
+ def test_combine_le(self, data_repeated):
+        # Series[SparseArray].__le__ returns a Series[Sparse[bool]]
+        # rather than Series[bool], so the expected result is wrapped
+        # in a SparseArray with fill_value=False.
+ orig_data1, orig_data2 = data_repeated(2)
+ s1 = pd.Series(orig_data1)
+ s2 = pd.Series(orig_data2)
+ result = s1.combine(s2, lambda x1, x2: x1 <= x2)
+ expected = pd.Series(pd.SparseArray([
+ a <= b for (a, b) in
+ zip(list(orig_data1), list(orig_data2))
+ ], fill_value=False))
+ self.assert_series_equal(result, expected)
+
+ val = s1.iloc[0]
+ result = s1.combine(val, lambda x1, x2: x1 <= x2)
+ expected = pd.Series(pd.SparseArray([
+ a <= val for a in list(orig_data1)
+ ], fill_value=False))
+ self.assert_series_equal(result, expected)
+
+ def test_fillna_copy_frame(self, data_missing):
+ arr = data_missing.take([1, 1])
+ df = pd.DataFrame({"A": arr})
+
+ filled_val = df.iloc[0, 0]
+ result = df.fillna(filled_val)
+
+ assert df.values.base is not result.values.base
+ assert df.A._values.to_dense() is arr.to_dense()
+
+ def test_fillna_copy_series(self, data_missing):
+ arr = data_missing.take([1, 1])
+ ser = pd.Series(arr)
+
+ filled_val = ser[0]
+ result = ser.fillna(filled_val)
+
+ assert ser._values is not result._values
+ assert ser._values.to_dense() is arr.to_dense()
+
+ @pytest.mark.skip(reason="Not Applicable")
+ def test_fillna_length_mismatch(self, data_missing):
+ pass
+
+ def test_where_series(self, data, na_value):
+ assert data[0] != data[1]
+ cls = type(data)
+ a, b = data[:2]
+
+ ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
+
+ cond = np.array([True, True, False, False])
+ result = ser.where(cond)
+
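+        # where() fills unselected entries with NaN; for these inputs the
+        # result comes back as Sparse[float64, 0.0], hence new_dtype below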
+ new_dtype = SparseDtype('float', 0.0)
+ expected = pd.Series(cls._from_sequence([a, a, na_value, na_value],
+ dtype=new_dtype))
+ self.assert_series_equal(result, expected)
+
+ other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
+ cond = np.array([True, False, True, True])
+ result = ser.where(cond, other)
+ expected = pd.Series(cls._from_sequence([a, b, b, b],
+ dtype=data.dtype))
+ self.assert_series_equal(result, expected)
+
+ def test_combine_first(self, data):
+ if data.dtype.subtype == 'int':
+ # Right now this is upcasted to float, just like combine_first
+ # for Series[int]
+ pytest.skip("TODO(SparseArray.__setitem__ will preserve dtype.")
+ super(TestMethods, self).test_combine_first(data)
+
+ @pytest.mark.parametrize("as_series", [True, False])
+ def test_searchsorted(self, data_for_sorting, as_series):
+ with tm.assert_produces_warning(PerformanceWarning):
+ super(TestMethods, self).test_searchsorted(data_for_sorting,
+ as_series=as_series)
+
+
+class TestCasting(BaseSparseTests, base.BaseCastingTests):
+ pass
+
+
+class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests):
+ series_scalar_exc = None
+ frame_scalar_exc = None
+ divmod_exc = None
+ series_array_exc = None
+
+ def _skip_if_different_combine(self, data):
+        if data.fill_value == 0:
+            # arith ops operate on dtype.fill_value so that sparsity is
+            # maintained; Series.combine applies the op elementwise, so the
+            # expected result can't be constructed the same way here.
+            # That behaviour is tested elsewhere.
+            pytest.skip("Incorrect expected from Series.combine")
+
+ def test_error(self, data, all_arithmetic_operators):
+ pass
+
+ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
+ self._skip_if_different_combine(data)
+ super(TestArithmeticOps, self).test_arith_series_with_scalar(
+ data,
+ all_arithmetic_operators
+ )
+
+ def test_arith_series_with_array(self, data, all_arithmetic_operators):
+ self._skip_if_different_combine(data)
+ super(TestArithmeticOps, self).test_arith_series_with_array(
+ data,
+ all_arithmetic_operators
+ )
+
+
+class TestComparisonOps(BaseSparseTests, base.BaseComparisonOpsTests):
+
+ def _compare_other(self, s, data, op_name, other):
+ op = self.get_op_from_name(op_name)
+
+ # array
+ result = pd.Series(op(data, other))
+ # hard to test the fill value, since we don't know what expected
+ # is in general.
+ # Rely on tests in `tests/sparse` to validate that.
+ assert isinstance(result.dtype, SparseDtype)
+ assert result.dtype.subtype == np.dtype('bool')
+
+ with np.errstate(all='ignore'):
+ expected = pd.Series(
+ pd.SparseArray(op(np.asarray(data), np.asarray(other)),
+ fill_value=result.values.fill_value)
+ )
+
+ tm.assert_series_equal(result, expected)
+
+ # series
+ s = pd.Series(data)
+ result = op(s, other)
+ tm.assert_series_equal(result, expected)
+
+
+class TestPrinting(BaseSparseTests, base.BasePrintingTests):
+ @pytest.mark.xfail(reason='Different repr', strict=True)
+ def test_array_repr(self, data, size):
+ super(TestPrinting, self).test_array_repr(data, size)
+
+
+class TestParsing(BaseSparseTests, base.BaseParsingTests):
+ @pytest.mark.parametrize('engine', ['c', 'python'])
+ def test_EA_types(self, engine, data):
+ expected_msg = r'.*must implement _from_sequence_of_strings.*'
+ with pytest.raises(NotImplementedError, match=expected_msg):
+ super(TestParsing, self).test_EA_types(engine, data)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/__init__.py b/contrib/python/pandas/py2/pandas/tests/frame/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/common.py b/contrib/python/pandas/py2/pandas/tests/frame/common.py
new file mode 100644
index 00000000000..2ea087c0510
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/common.py
@@ -0,0 +1,141 @@
+import numpy as np
+
+from pandas.util._decorators import cache_readonly
+
+import pandas as pd
+from pandas import compat
+import pandas.util.testing as tm
+
+_seriesd = tm.getSeriesData()
+_tsd = tm.getTimeSeriesData()
+
+_frame = pd.DataFrame(_seriesd)
+_frame2 = pd.DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])
+_intframe = pd.DataFrame({k: v.astype(int)
+ for k, v in compat.iteritems(_seriesd)})
+
+_tsframe = pd.DataFrame(_tsd)
+
+_mixed_frame = _frame.copy()
+_mixed_frame['foo'] = 'bar'
+
+
+class TestData(object):
+
+ @cache_readonly
+ def frame(self):
+ return _frame.copy()
+
+ @cache_readonly
+ def frame2(self):
+ return _frame2.copy()
+
+ @cache_readonly
+ def intframe(self):
+ # force these all to int64 to avoid platform testing issues
+ return pd.DataFrame({c: s for c, s in compat.iteritems(_intframe)},
+ dtype=np.int64)
+
+ @cache_readonly
+ def tsframe(self):
+ return _tsframe.copy()
+
+ @cache_readonly
+ def mixed_frame(self):
+ return _mixed_frame.copy()
+
+ @cache_readonly
+ def mixed_float(self):
+ return pd.DataFrame({'A': _frame['A'].copy().astype('float32'),
+ 'B': _frame['B'].copy().astype('float32'),
+ 'C': _frame['C'].copy().astype('float16'),
+ 'D': _frame['D'].copy().astype('float64')})
+
+ @cache_readonly
+ def mixed_float2(self):
+ return pd.DataFrame({'A': _frame2['A'].copy().astype('float32'),
+ 'B': _frame2['B'].copy().astype('float32'),
+ 'C': _frame2['C'].copy().astype('float16'),
+ 'D': _frame2['D'].copy().astype('float64')})
+
+ @cache_readonly
+ def mixed_int(self):
+ return pd.DataFrame({'A': _intframe['A'].copy().astype('int32'),
+ 'B': np.ones(len(_intframe['B']), dtype='uint64'),
+ 'C': _intframe['C'].copy().astype('uint8'),
+ 'D': _intframe['D'].copy().astype('int64')})
+
+ @cache_readonly
+ def all_mixed(self):
+ return pd.DataFrame({'a': 1., 'b': 2, 'c': 'foo',
+ 'float32': np.array([1.] * 10, dtype='float32'),
+ 'int32': np.array([1] * 10, dtype='int32')},
+ index=np.arange(10))
+
+ @cache_readonly
+ def tzframe(self):
+ result = pd.DataFrame({'A': pd.date_range('20130101', periods=3),
+ 'B': pd.date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'C': pd.date_range('20130101', periods=3,
+ tz='CET')})
+ result.iloc[1, 1] = pd.NaT
+ result.iloc[1, 2] = pd.NaT
+ return result
+
+ @cache_readonly
+ def empty(self):
+ return pd.DataFrame({})
+
+ @cache_readonly
+ def ts1(self):
+ return tm.makeTimeSeries(nper=30)
+
+ @cache_readonly
+ def ts2(self):
+ return tm.makeTimeSeries(nper=30)[5:]
+
+ @cache_readonly
+ def simple(self):
+ arr = np.array([[1., 2., 3.],
+ [4., 5., 6.],
+ [7., 8., 9.]])
+
+ return pd.DataFrame(arr, columns=['one', 'two', 'three'],
+ index=['a', 'b', 'c'])
+
+
+def _check_mixed_float(df, dtype=None):
+    # float16 is most likely to be upcast to float32
+    dtypes = dict(A='float32', B='float32', C='float16', D='float64')
+    if isinstance(dtype, compat.string_types):
+        dtypes = {k: dtype for k in dtypes}
+    elif isinstance(dtype, dict):
+        dtypes.update(dtype)
+    for col in ['A', 'B', 'C', 'D']:
+        if dtypes.get(col):
+            assert df.dtypes[col] == dtypes[col]
+
+
+def _check_mixed_int(df, dtype=None):
+    dtypes = dict(A='int32', B='uint64', C='uint8', D='int64')
+    if isinstance(dtype, compat.string_types):
+        dtypes = {k: dtype for k in dtypes}
+    elif isinstance(dtype, dict):
+        dtypes.update(dtype)
+    for col in ['A', 'B', 'C', 'D']:
+        if dtypes.get(col):
+            assert df.dtypes[col] == dtypes[col]
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/conftest.py b/contrib/python/pandas/py2/pandas/tests/frame/conftest.py
new file mode 100644
index 00000000000..377e737a531
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/conftest.py
@@ -0,0 +1,221 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, NaT, compat, date_range
+import pandas.util.testing as tm
+
+
[email protected]
+def float_frame():
+ """
+ Fixture for DataFrame of floats with index of unique strings
+
+ Columns are ['A', 'B', 'C', 'D'].
+ """
+ return DataFrame(tm.getSeriesData())
+
+
[email protected]
+def float_frame_with_na():
+ """
+ Fixture for DataFrame of floats with index of unique strings
+
+ Columns are ['A', 'B', 'C', 'D']; some entries are missing
+ """
+ df = DataFrame(tm.getSeriesData())
+ # set some NAs
+ df.loc[5:10] = np.nan
+ df.loc[15:20, -2:] = np.nan
+ return df
+
+
[email protected]
+def float_frame2():
+ """
+ Fixture for DataFrame of floats with index of unique strings
+
+ Columns are ['D', 'C', 'B', 'A']
+ """
+ return DataFrame(tm.getSeriesData(), columns=['D', 'C', 'B', 'A'])
+
+
[email protected]
+def bool_frame_with_na():
+ """
+ Fixture for DataFrame of booleans with index of unique strings
+
+ Columns are ['A', 'B', 'C', 'D']; some entries are missing
+ """
+ df = DataFrame(tm.getSeriesData()) > 0
+ df = df.astype(object)
+ # set some NAs
+ df.loc[5:10] = np.nan
+ df.loc[15:20, -2:] = np.nan
+ return df
+
+
[email protected]
+def int_frame():
+ """
+ Fixture for DataFrame of ints with index of unique strings
+
+ Columns are ['A', 'B', 'C', 'D']
+ """
+ df = DataFrame({k: v.astype(int)
+ for k, v in compat.iteritems(tm.getSeriesData())})
+ # force these all to int64 to avoid platform testing issues
+ return DataFrame({c: s for c, s in compat.iteritems(df)}, dtype=np.int64)
+
+
[email protected]
+def datetime_frame():
+ """
+ Fixture for DataFrame of floats with DatetimeIndex
+
+ Columns are ['A', 'B', 'C', 'D']
+ """
+ return DataFrame(tm.getTimeSeriesData())
+
+
[email protected]
+def float_string_frame():
+ """
+ Fixture for DataFrame of floats and strings with index of unique strings
+
+ Columns are ['A', 'B', 'C', 'D', 'foo'].
+ """
+ df = DataFrame(tm.getSeriesData())
+ df['foo'] = 'bar'
+ return df
+
+
[email protected]
+def mixed_float_frame():
+ """
+ Fixture for DataFrame of different float types with index of unique strings
+
+ Columns are ['A', 'B', 'C', 'D'].
+ """
+ df = DataFrame(tm.getSeriesData())
+ df.A = df.A.astype('float32')
+ df.B = df.B.astype('float32')
+ df.C = df.C.astype('float16')
+ df.D = df.D.astype('float64')
+ return df
+
+
[email protected]
+def mixed_float_frame2():
+    """
+    Fixture for DataFrame of different float types with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D'].
+    """
+    df = DataFrame(tm.getSeriesData())
+    df.A = df.A.astype('float32')
+    df.B = df.B.astype('float32')
+    df.C = df.C.astype('float16')
+    df.D = df.D.astype('float64')
+    return df
+
+
[email protected]
+def mixed_int_frame():
+    """
+    Fixture for DataFrame of different int types with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D'].
+    """
+    df = DataFrame({k: v.astype(int)
+                    for k, v in compat.iteritems(tm.getSeriesData())})
+    df.A = df.A.astype('int32')
+    df.B = np.ones(len(df.B), dtype='uint64')
+    df.C = df.C.astype('uint8')
+    df.D = df.D.astype('int64')
+    return df
+
+
[email protected]
+def mixed_type_frame():
+ """
+ Fixture for DataFrame of float/int/string columns with RangeIndex
+
+ Columns are ['a', 'b', 'c', 'float32', 'int32'].
+ """
+ return DataFrame({'a': 1., 'b': 2, 'c': 'foo',
+ 'float32': np.array([1.] * 10, dtype='float32'),
+ 'int32': np.array([1] * 10, dtype='int32')},
+ index=np.arange(10))
+
+
[email protected]
+def timezone_frame():
+ """
+ Fixture for DataFrame of date_range Series with different time zones
+
+ Columns are ['A', 'B', 'C']; some entries are missing
+ """
+ df = DataFrame({'A': date_range('20130101', periods=3),
+ 'B': date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'C': date_range('20130101', periods=3,
+ tz='CET')})
+ df.iloc[1, 1] = NaT
+ df.iloc[1, 2] = NaT
+ return df
+
+
[email protected]
+def empty_frame():
+ """
+ Fixture for empty DataFrame
+ """
+ return DataFrame({})
+
+
[email protected]
+def datetime_series():
+ """
+ Fixture for Series of floats with DatetimeIndex
+ """
+ return tm.makeTimeSeries(nper=30)
+
+
[email protected]
+def datetime_series_short():
+ """
+ Fixture for Series of floats with DatetimeIndex
+ """
+ return tm.makeTimeSeries(nper=30)[5:]
+
+
[email protected]
+def simple_frame():
+ """
+ Fixture for simple 3x3 DataFrame
+
+ Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c'].
+ """
+ arr = np.array([[1., 2., 3.],
+ [4., 5., 6.],
+ [7., 8., 9.]])
+
+ return DataFrame(arr, columns=['one', 'two', 'three'],
+ index=['a', 'b', 'c'])
+
+
[email protected]
+def frame_of_index_cols():
+ """
+ Fixture for DataFrame of columns that can be used for indexing
+
+ Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
+ 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
+ """
+ df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
+ 'B': ['one', 'two', 'three', 'one', 'two'],
+ 'C': ['a', 'b', 'c', 'd', 'e'],
+ 'D': np.random.randn(5),
+ 'E': np.random.randn(5),
+ ('tuple', 'as', 'label'): np.random.randn(5)})
+ return df
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_alter_axes.py b/contrib/python/pandas/py2/pandas/tests/frame/test_alter_axes.py
new file mode 100644
index 00000000000..cc3687f856b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_alter_axes.py
@@ -0,0 +1,1444 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from datetime import datetime, timedelta
+import inspect
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY2, lrange
+
+from pandas.core.dtypes.common import (
+ is_categorical_dtype, is_interval_dtype, is_object_dtype)
+
+from pandas import (
+ Categorical, DataFrame, DatetimeIndex, Index, IntervalIndex, MultiIndex,
+ RangeIndex, Series, Timestamp, cut, date_range, to_datetime)
+import pandas.util.testing as tm
+
+
+class TestDataFrameAlterAxes(object):
+
+ def test_set_index_directly(self, float_string_frame):
+ df = float_string_frame
+ idx = Index(np.arange(len(df))[::-1])
+
+ df.index = idx
+ tm.assert_index_equal(df.index, idx)
+ with pytest.raises(ValueError, match='Length mismatch'):
+ df.index = idx[::2]
+
+ def test_set_index(self, float_string_frame):
+ df = float_string_frame
+ idx = Index(np.arange(len(df))[::-1])
+
+ df = df.set_index(idx)
+ tm.assert_index_equal(df.index, idx)
+ with pytest.raises(ValueError, match='Length mismatch'):
+ df.set_index(idx[::2])
+
+ def test_set_index_cast(self):
+ # issue casting an index then set_index
+ df = DataFrame({'A': [1.1, 2.2, 3.3], 'B': [5.0, 6.1, 7.2]},
+ index=[2010, 2011, 2012])
+ df2 = df.set_index(df.index.astype(np.int32))
+ tm.assert_frame_equal(df, df2)
+
+ # A has duplicate values, C does not
+ @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
+ ('tuple', 'as', 'label')])
+ @pytest.mark.parametrize('inplace', [True, False])
+ @pytest.mark.parametrize('drop', [True, False])
+ def test_set_index_drop_inplace(self, frame_of_index_cols,
+ drop, inplace, keys):
+ df = frame_of_index_cols
+
+ if isinstance(keys, list):
+ idx = MultiIndex.from_arrays([df[x] for x in keys], names=keys)
+ else:
+ idx = Index(df[keys], name=keys)
+ expected = df.drop(keys, axis=1) if drop else df
+ expected.index = idx
+
+ if inplace:
+ result = df.copy()
+ result.set_index(keys, drop=drop, inplace=True)
+ else:
+ result = df.set_index(keys, drop=drop)
+
+ tm.assert_frame_equal(result, expected)
+
+ # A has duplicate values, C does not
+ @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
+ ('tuple', 'as', 'label')])
+ @pytest.mark.parametrize('drop', [True, False])
+ def test_set_index_append(self, frame_of_index_cols, drop, keys):
+ df = frame_of_index_cols
+
+ keys = keys if isinstance(keys, list) else [keys]
+ idx = MultiIndex.from_arrays([df.index] + [df[x] for x in keys],
+ names=[None] + keys)
+ expected = df.drop(keys, axis=1) if drop else df.copy()
+ expected.index = idx
+
+ result = df.set_index(keys, drop=drop, append=True)
+
+ tm.assert_frame_equal(result, expected)
+
+ # A has duplicate values, C does not
+ @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
+ ('tuple', 'as', 'label')])
+ @pytest.mark.parametrize('drop', [True, False])
+ def test_set_index_append_to_multiindex(self, frame_of_index_cols,
+ drop, keys):
+ # append to existing multiindex
+ df = frame_of_index_cols.set_index(['D'], drop=drop, append=True)
+
+ keys = keys if isinstance(keys, list) else [keys]
+ expected = frame_of_index_cols.set_index(['D'] + keys,
+ drop=drop, append=True)
+
+ result = df.set_index(keys, drop=drop, append=True)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_set_index_after_mutation(self):
+ # GH1590
+ df = DataFrame({'val': [0, 1, 2], 'key': ['a', 'b', 'c']})
+ expected = DataFrame({'val': [1, 2]},
+ Index(['b', 'c'], name='key'))
+
+ df2 = df.loc[df.index.map(lambda indx: indx >= 1)]
+ result = df2.set_index('key')
+ tm.assert_frame_equal(result, expected)
+
+ # MultiIndex constructor does not work directly on Series -> lambda
+ # Add list-of-list constructor because list is ambiguous -> lambda
+ # also test index name if append=True (name is duplicate here for B)
+ @pytest.mark.parametrize('box', [Series, Index, np.array,
+ list, lambda x: [list(x)],
+ lambda x: MultiIndex.from_arrays([x])])
+ @pytest.mark.parametrize('append, index_name', [(True, None),
+ (True, 'B'), (True, 'test'), (False, None)])
+ @pytest.mark.parametrize('drop', [True, False])
+ def test_set_index_pass_single_array(self, frame_of_index_cols,
+ drop, append, index_name, box):
+ df = frame_of_index_cols
+ df.index.name = index_name
+
+ key = box(df['B'])
+ if box == list:
+ # list of strings gets interpreted as list of keys
+ msg = "['one', 'two', 'three', 'one', 'two']"
+ with pytest.raises(KeyError, match=msg):
+ df.set_index(key, drop=drop, append=append)
+ else:
+ # np.array/list-of-list "forget" the name of B
+ name_mi = getattr(key, 'names', None)
+ name = [getattr(key, 'name', None)] if name_mi is None else name_mi
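+            # e.g. Series(df['B']).name == 'B', while np.array(df['B']) and
+            # [list(...)] carry no name, so `name` becomes [None] for those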
+
+ result = df.set_index(key, drop=drop, append=append)
+
+ # only valid column keys are dropped
+ # since B is always passed as array above, nothing is dropped
+ expected = df.set_index(['B'], drop=False, append=append)
+ expected.index.names = [index_name] + name if append else name
+
+ tm.assert_frame_equal(result, expected)
+
+ # MultiIndex constructor does not work directly on Series -> lambda
+ # also test index name if append=True (name is duplicate here for A & B)
+ @pytest.mark.parametrize('box', [Series, Index, np.array, list,
+ lambda x: MultiIndex.from_arrays([x])])
+ @pytest.mark.parametrize('append, index_name',
+ [(True, None), (True, 'A'), (True, 'B'),
+ (True, 'test'), (False, None)])
+ @pytest.mark.parametrize('drop', [True, False])
+ def test_set_index_pass_arrays(self, frame_of_index_cols,
+ drop, append, index_name, box):
+ df = frame_of_index_cols
+ df.index.name = index_name
+
+ keys = ['A', box(df['B'])]
+ # np.array/list "forget" the name of B
+ names = ['A', None if box in [np.array, list, tuple, iter] else 'B']
+
+ result = df.set_index(keys, drop=drop, append=append)
+
+ # only valid column keys are dropped
+ # since B is always passed as array above, only A is dropped, if at all
+ expected = df.set_index(['A', 'B'], drop=False, append=append)
+ expected = expected.drop('A', axis=1) if drop else expected
+ expected.index.names = [index_name] + names if append else names
+
+ tm.assert_frame_equal(result, expected)
+
+ # MultiIndex constructor does not work directly on Series -> lambda
+ # We also emulate a "constructor" for the label -> lambda
+ # also test index name if append=True (name is duplicate here for A)
+ @pytest.mark.parametrize('box2', [Series, Index, np.array, list,
+ lambda x: MultiIndex.from_arrays([x]),
+ lambda x: x.name])
+ @pytest.mark.parametrize('box1', [Series, Index, np.array, list,
+ lambda x: MultiIndex.from_arrays([x]),
+ lambda x: x.name])
+ @pytest.mark.parametrize('append, index_name', [(True, None),
+ (True, 'A'), (True, 'test'), (False, None)])
+ @pytest.mark.parametrize('drop', [True, False])
+ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
+ append, index_name, box1, box2):
+ df = frame_of_index_cols
+ df.index.name = index_name
+
+ keys = [box1(df['A']), box2(df['A'])]
+ result = df.set_index(keys, drop=drop, append=append)
+
+ # need to adapt first drop for case that both keys are 'A' --
+ # cannot drop the same column twice;
+ # use "is" because == would give ambiguous Boolean error for containers
+ first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop
+
+ # to test against already-tested behaviour, we add sequentially,
+ # hence second append always True; must wrap keys in list, otherwise
+ # box = list would be interpreted as keys
+ expected = df.set_index([keys[0]], drop=first_drop, append=append)
+ expected = expected.set_index([keys[1]], drop=drop, append=True)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('append', [True, False])
+ @pytest.mark.parametrize('drop', [True, False])
+ def test_set_index_pass_multiindex(self, frame_of_index_cols,
+ drop, append):
+ df = frame_of_index_cols
+ keys = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B'])
+
+ result = df.set_index(keys, drop=drop, append=append)
+
+ # setting with a MultiIndex will never drop columns
+ expected = df.set_index(['A', 'B'], drop=False, append=append)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_set_index_verify_integrity(self, frame_of_index_cols):
+ df = frame_of_index_cols
+
+ with pytest.raises(ValueError, match='Index has duplicate keys'):
+ df.set_index('A', verify_integrity=True)
+ # with MultiIndex
+ with pytest.raises(ValueError, match='Index has duplicate keys'):
+ df.set_index([df['A'], df['A']], verify_integrity=True)
+
+ @pytest.mark.parametrize('append', [True, False])
+ @pytest.mark.parametrize('drop', [True, False])
+ def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):
+ df = frame_of_index_cols
+
+ with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"):
+ # column names are A-E, as well as one tuple
+ df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append)
+
+ # non-existent key in list with arrays
+ with pytest.raises(KeyError, match='X'):
+ df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)
+
+ msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]"
+ # tuples always raise KeyError
+ with pytest.raises(KeyError, match=msg):
+ df.set_index(tuple(df['A']), drop=drop, append=append)
+
+ # also within a list
+ with pytest.raises(KeyError, match=msg):
+ df.set_index(['A', df['A'], tuple(df['A'])],
+ drop=drop, append=append)
+
+ @pytest.mark.xfail(reason='broken due to revert, see GH 25085')
+ @pytest.mark.parametrize('append', [True, False])
+ @pytest.mark.parametrize('drop', [True, False])
+ @pytest.mark.parametrize('box', [set, iter, lambda x: (y for y in x)],
+ ids=['set', 'iter', 'generator'])
+ def test_set_index_raise_on_type(self, frame_of_index_cols, box,
+ drop, append):
+ df = frame_of_index_cols
+
+ msg = 'The parameter "keys" may be a column key, .*'
+ # forbidden type, e.g. set/iter/generator
+ with pytest.raises(TypeError, match=msg):
+ df.set_index(box(df['A']), drop=drop, append=append)
+
+ # forbidden type in list, e.g. set/iter/generator
+ with pytest.raises(TypeError, match=msg):
+ df.set_index(['A', df['A'], box(df['A'])],
+ drop=drop, append=append)
+
+ def test_set_index_custom_label_type(self):
+ # GH 24969
+
+ class Thing(object):
+ def __init__(self, name, color):
+ self.name = name
+ self.color = color
+
+ def __str__(self):
+ return "<Thing %r>" % (self.name,)
+
+ # necessary for pretty KeyError
+ __repr__ = __str__
+
+ thing1 = Thing('One', 'red')
+ thing2 = Thing('Two', 'blue')
+ df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
+ expected = DataFrame({thing1: [0, 1]},
+ index=Index([2, 3], name=thing2))
+
+ # use custom label directly
+ result = df.set_index(thing2)
+ tm.assert_frame_equal(result, expected)
+
+ # custom label wrapped in list
+ result = df.set_index([thing2])
+ tm.assert_frame_equal(result, expected)
+
+ # missing key
+ thing3 = Thing('Three', 'pink')
+ msg = "<Thing 'Three'>"
+ with pytest.raises(KeyError, match=msg):
+ # missing label directly
+ df.set_index(thing3)
+
+ with pytest.raises(KeyError, match=msg):
+ # missing label in list
+ df.set_index([thing3])
+
+ def test_set_index_custom_label_hashable_iterable(self):
+ # GH 24969
+
+ # actual example discussed in GH 24984 was e.g. for shapely.geometry
+ # objects (e.g. a collection of Points) that can be both hashable and
+ # iterable; using frozenset as a stand-in for testing here
+
+ class Thing(frozenset):
+ # need to stabilize repr for KeyError (due to random order in sets)
+ def __repr__(self):
+ tmp = sorted(list(self))
+ # double curly brace prints one brace in format string
+ return "frozenset({{{}}})".format(', '.join(map(repr, tmp)))
+
+ thing1 = Thing(['One', 'red'])
+ thing2 = Thing(['Two', 'blue'])
+ df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
+ expected = DataFrame({thing1: [0, 1]},
+ index=Index([2, 3], name=thing2))
+
+ # use custom label directly
+ result = df.set_index(thing2)
+ tm.assert_frame_equal(result, expected)
+
+ # custom label wrapped in list
+ result = df.set_index([thing2])
+ tm.assert_frame_equal(result, expected)
+
+ # missing key
+ thing3 = Thing(['Three', 'pink'])
+ msg = '.*' # due to revert, see GH 25085
+ with pytest.raises(KeyError, match=msg):
+ # missing label directly
+ df.set_index(thing3)
+
+ with pytest.raises(KeyError, match=msg):
+ # missing label in list
+ df.set_index([thing3])
+
+ def test_set_index_custom_label_type_raises(self):
+ # GH 24969
+
+ # purposefully inherit from something unhashable
+ class Thing(set):
+ def __init__(self, name, color):
+ self.name = name
+ self.color = color
+
+ def __str__(self):
+ return "<Thing %r>" % (self.name,)
+
+ thing1 = Thing('One', 'red')
+ thing2 = Thing('Two', 'blue')
+ df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2])
+
+ msg = 'unhashable type.*'
+
+ with pytest.raises(TypeError, match=msg):
+ # use custom label directly
+ df.set_index(thing2)
+
+ with pytest.raises(TypeError, match=msg):
+ # custom label wrapped in list
+ df.set_index([thing2])
+
+ def test_construction_with_categorical_index(self):
+ ci = tm.makeCategoricalIndex(10)
+ ci.name = 'B'
+
+ # with Categorical
+ df = DataFrame({'A': np.random.randn(10),
+ 'B': ci.values})
+ idf = df.set_index('B')
+ tm.assert_index_equal(idf.index, ci)
+
+ # from a CategoricalIndex
+ df = DataFrame({'A': np.random.randn(10),
+ 'B': ci})
+ idf = df.set_index('B')
+ tm.assert_index_equal(idf.index, ci)
+
+ # round-trip
+ idf = idf.reset_index().set_index('B')
+ tm.assert_index_equal(idf.index, ci)
+
+ def test_set_index_cast_datetimeindex(self):
+ df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
+ for i in range(1000)],
+ 'B': np.random.randn(1000)})
+
+ idf = df.set_index('A')
+ assert isinstance(idf.index, DatetimeIndex)
+
+ def test_convert_dti_to_series(self):
+ # don't cast a DatetimeIndex WITH a tz, leave as object
+ # GH 6032
+ idx = DatetimeIndex(to_datetime(['2013-1-1 13:00',
+ '2013-1-2 14:00']),
+ name='B').tz_localize('US/Pacific')
+ df = DataFrame(np.random.randn(2, 1), columns=['A'])
+
+ expected = Series(np.array([Timestamp('2013-01-01 13:00:00-0800',
+ tz='US/Pacific'),
+ Timestamp('2013-01-02 14:00:00-0800',
+ tz='US/Pacific')],
+ dtype="object"), name='B')
+
+ # convert index to series
+ result = Series(idx)
+ tm.assert_series_equal(result, expected)
+
+ # assign to frame
+ df['B'] = idx
+ result = df['B']
+ tm.assert_series_equal(result, expected)
+
+ # convert to series while keeping the timezone
+ result = idx.to_series(keep_tz=True, index=[0, 1])
+ tm.assert_series_equal(result, expected)
+
+ # convert to utc
+ with tm.assert_produces_warning(FutureWarning):
+ df['B'] = idx.to_series(keep_tz=False, index=[0, 1])
+ result = df['B']
+ comp = Series(DatetimeIndex(expected.values).tz_localize(None),
+ name='B')
+ tm.assert_series_equal(result, comp)
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = idx.to_series(index=[0, 1])
+ tm.assert_series_equal(result, expected.dt.tz_convert(None))
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = idx.to_series(keep_tz=False, index=[0, 1])
+ tm.assert_series_equal(result, expected.dt.tz_convert(None))
+
+ # list of datetimes with a tz
+ df['B'] = idx.to_pydatetime()
+ result = df['B']
+ tm.assert_series_equal(result, expected)
+
+ # GH 6785
+ # set the index manually
+ import pytz
+ df = DataFrame(
+ [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
+ expected = df.set_index('ts')
+ df.index = df['ts']
+ df.pop('ts')
+ tm.assert_frame_equal(df, expected)
+
+ def test_reset_index_tz(self, tz_aware_fixture):
+ # GH 3950
+ # reset_index with single level
+ tz = tz_aware_fixture
+ idx = date_range('1/1/2011', periods=5,
+ freq='D', tz=tz, name='idx')
+ df = DataFrame({'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']},
+ index=idx)
+
+ expected = DataFrame({'idx': [datetime(2011, 1, 1),
+ datetime(2011, 1, 2),
+ datetime(2011, 1, 3),
+ datetime(2011, 1, 4),
+ datetime(2011, 1, 5)],
+ 'a': range(5),
+ 'b': ['A', 'B', 'C', 'D', 'E']},
+ columns=['idx', 'a', 'b'])
+ expected['idx'] = expected['idx'].apply(lambda d: Timestamp(d, tz=tz))
+ tm.assert_frame_equal(df.reset_index(), expected)
+
+ def test_set_index_timezone(self):
+ # GH 12358
+ # tz-aware Series should retain the tz
+ idx = to_datetime(["2014-01-01 10:10:10"],
+ utc=True).tz_convert('Europe/Rome')
+ df = DataFrame({'A': idx})
+ assert df.set_index(idx).index[0].hour == 11
+ assert DatetimeIndex(Series(df.A))[0].hour == 11
+ assert df.set_index(df.A).index[0].hour == 11
+
+ def test_set_index_dst(self):
+ di = date_range('2006-10-29 00:00:00', periods=3,
+ freq='H', tz='US/Pacific')
+
+ df = DataFrame(data={'a': [0, 1, 2], 'b': [3, 4, 5]},
+ index=di).reset_index()
+ # single level
+ res = df.set_index('index')
+ exp = DataFrame(data={'a': [0, 1, 2], 'b': [3, 4, 5]},
+ index=Index(di, name='index'))
+ tm.assert_frame_equal(res, exp)
+
+ # GH 12920
+ res = df.set_index(['index', 'a'])
+ exp_index = MultiIndex.from_arrays([di, [0, 1, 2]],
+ names=['index', 'a'])
+ exp = DataFrame({'b': [3, 4, 5]}, index=exp_index)
+ tm.assert_frame_equal(res, exp)
+
+ def test_reset_index_with_intervals(self):
+ idx = IntervalIndex.from_breaks(np.arange(11), name='x')
+ original = DataFrame({'x': idx, 'y': np.arange(10)})[['x', 'y']]
+
+ result = original.set_index('x')
+ expected = DataFrame({'y': np.arange(10)}, index=idx)
+ tm.assert_frame_equal(result, expected)
+
+ result2 = result.reset_index()
+ tm.assert_frame_equal(result2, original)
+
+ def test_set_index_multiindexcolumns(self):
+ columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)])
+ df = DataFrame(np.random.randn(3, 3), columns=columns)
+ result = df.set_index(df.columns[0])
+ expected = df.iloc[:, 1:]
+ expected.index = df.iloc[:, 0].values
+ expected.index.names = [df.columns[0]]
+ tm.assert_frame_equal(result, expected)
+
+ def test_set_index_empty_column(self):
+ # GH 1971
+ df = DataFrame([
+ {'a': 1, 'p': 0},
+ {'a': 2, 'm': 10},
+ {'a': 3, 'm': 11, 'p': 20},
+ {'a': 4, 'm': 12, 'p': 21}
+ ], columns=('a', 'm', 'p', 'x'))
+
+ result = df.set_index(['a', 'x'])
+ expected = df[['m', 'p']]
+ expected.index = MultiIndex.from_arrays([df['a'], df['x']],
+ names=['a', 'x'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_set_columns(self, float_string_frame):
+ cols = Index(np.arange(len(float_string_frame.columns)))
+ float_string_frame.columns = cols
+ with pytest.raises(ValueError, match='Length mismatch'):
+ float_string_frame.columns = cols[::2]
+
+ def test_dti_set_index_reindex(self):
+ # GH 6631
+ df = DataFrame(np.random.random(6))
+ idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern')
+ idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo')
+
+ df = df.set_index(idx1)
+ tm.assert_index_equal(df.index, idx1)
+ df = df.reindex(idx2)
+ tm.assert_index_equal(df.index, idx2)
+
+ # GH 11314
+ # with tz
+ index = date_range(datetime(2015, 10, 1),
+ datetime(2015, 10, 1, 23),
+ freq='H', tz='US/Eastern')
+ df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index)
+ new_index = date_range(datetime(2015, 10, 2),
+ datetime(2015, 10, 2, 23),
+ freq='H', tz='US/Eastern')
+
+ result = df.set_index(new_index)
+ assert result.index.freq == index.freq
+
+ # Renaming
+
+ def test_rename(self, float_frame):
+ mapping = {
+ 'A': 'a',
+ 'B': 'b',
+ 'C': 'c',
+ 'D': 'd'
+ }
+
+ renamed = float_frame.rename(columns=mapping)
+ renamed2 = float_frame.rename(columns=str.lower)
+
+ tm.assert_frame_equal(renamed, renamed2)
+ tm.assert_frame_equal(renamed2.rename(columns=str.upper),
+ float_frame, check_names=False)
+
+ # index
+ data = {
+ 'A': {'foo': 0, 'bar': 1}
+ }
+
+        # gets sorted alphabetically
+ df = DataFrame(data)
+ renamed = df.rename(index={'foo': 'bar', 'bar': 'foo'})
+ tm.assert_index_equal(renamed.index, Index(['foo', 'bar']))
+
+ renamed = df.rename(index=str.upper)
+ tm.assert_index_equal(renamed.index, Index(['BAR', 'FOO']))
+
+ # have to pass something
+ pytest.raises(TypeError, float_frame.rename)
+
+ # partial columns
+ renamed = float_frame.rename(columns={'C': 'foo', 'D': 'bar'})
+ tm.assert_index_equal(renamed.columns, Index(['A', 'B', 'foo', 'bar']))
+
+ # other axis
+ renamed = float_frame.T.rename(index={'C': 'foo', 'D': 'bar'})
+ tm.assert_index_equal(renamed.index, Index(['A', 'B', 'foo', 'bar']))
+
+ # index with name
+ index = Index(['foo', 'bar'], name='name')
+ renamer = DataFrame(data, index=index)
+ renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'})
+ tm.assert_index_equal(renamed.index,
+ Index(['bar', 'foo'], name='name'))
+ assert renamed.index.name == renamer.index.name
+
+ def test_rename_axis_inplace(self, float_frame):
+ # GH 15704
+ expected = float_frame.rename_axis('foo')
+ result = float_frame.copy()
+ no_return = result.rename_axis('foo', inplace=True)
+
+ assert no_return is None
+ tm.assert_frame_equal(result, expected)
+
+ expected = float_frame.rename_axis('bar', axis=1)
+ result = float_frame.copy()
+ no_return = result.rename_axis('bar', axis=1, inplace=True)
+
+ assert no_return is None
+ tm.assert_frame_equal(result, expected)
+
+ def test_rename_axis_warns(self):
+ # https://github.com/pandas-dev/pandas/issues/17833
+ df = DataFrame({"A": [1, 2], "B": [1, 2]})
+ with tm.assert_produces_warning(FutureWarning) as w:
+ df.rename_axis(id, axis=0)
+ assert 'rename' in str(w[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as w:
+ df.rename_axis({0: 10, 1: 20}, axis=0)
+ assert 'rename' in str(w[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as w:
+ df.rename_axis(id, axis=1)
+ assert 'rename' in str(w[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as w:
+ df['A'].rename_axis(id)
+ assert 'rename' in str(w[0].message)
+
+ def test_rename_axis_mapper(self):
+ # GH 19978
+ mi = MultiIndex.from_product([['a', 'b', 'c'], [1, 2]],
+ names=['ll', 'nn'])
+ df = DataFrame({'x': [i for i in range(len(mi))],
+ 'y': [i * 10 for i in range(len(mi))]},
+ index=mi)
+
+ # Test for rename of the Index object of columns
+ result = df.rename_axis('cols', axis=1)
+ tm.assert_index_equal(result.columns,
+ Index(['x', 'y'], name='cols'))
+
+ # Test for rename of the Index object of columns using dict
+ result = result.rename_axis(columns={'cols': 'new'}, axis=1)
+ tm.assert_index_equal(result.columns,
+ Index(['x', 'y'], name='new'))
+
+ # Test for renaming index using dict
+ result = df.rename_axis(index={'ll': 'foo'})
+ assert result.index.names == ['foo', 'nn']
+
+ # Test for renaming index using a function
+ result = df.rename_axis(index=str.upper, axis=0)
+ assert result.index.names == ['LL', 'NN']
+
+ # Test for renaming index providing complete list
+ result = df.rename_axis(index=['foo', 'goo'])
+ assert result.index.names == ['foo', 'goo']
+
+ # Test for changing index and columns at same time
+ sdf = df.reset_index().set_index('nn').drop(columns=['ll', 'y'])
+ result = sdf.rename_axis(index='foo', columns='meh')
+ assert result.index.name == 'foo'
+ assert result.columns.name == 'meh'
+
+ # Test different error cases
+ with pytest.raises(TypeError, match='Must pass'):
+ df.rename_axis(index='wrong')
+
+ with pytest.raises(ValueError, match='Length of names'):
+ df.rename_axis(index=['wrong'])
+
+ with pytest.raises(TypeError, match='bogus'):
+ df.rename_axis(bogus=None)
+
+ @pytest.mark.parametrize('kwargs, rename_index, rename_columns', [
+ ({'mapper': None, 'axis': 0}, True, False),
+ ({'mapper': None, 'axis': 1}, False, True),
+ ({'index': None}, True, False),
+ ({'columns': None}, False, True),
+ ({'index': None, 'columns': None}, True, True),
+ ({}, False, False)])
+ def test_rename_axis_none(self, kwargs, rename_index, rename_columns):
+ # GH 25034
+ index = Index(list('abc'), name='foo')
+ columns = Index(['col1', 'col2'], name='bar')
+ data = np.arange(6).reshape(3, 2)
+ df = DataFrame(data, index, columns)
+
+ result = df.rename_axis(**kwargs)
+ expected_index = index.rename(None) if rename_index else index
+ expected_columns = columns.rename(None) if rename_columns else columns
+ expected = DataFrame(data, expected_index, expected_columns)
+ tm.assert_frame_equal(result, expected)
+
+ def test_rename_multiindex(self):
+
+ tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')]
+ tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')]
+ index = MultiIndex.from_tuples(tuples_index, names=['foo', 'bar'])
+ columns = MultiIndex.from_tuples(
+ tuples_columns, names=['fizz', 'buzz'])
+ df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns)
+
+ #
+ # without specifying level -> across all levels
+
+ renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'},
+ columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'})
+ new_index = MultiIndex.from_tuples([('foo3', 'bar1'),
+ ('foo2', 'bar3')],
+ names=['foo', 'bar'])
+ new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'),
+ ('fizz2', 'buzz3')],
+ names=['fizz', 'buzz'])
+ tm.assert_index_equal(renamed.index, new_index)
+ tm.assert_index_equal(renamed.columns, new_columns)
+ assert renamed.index.names == df.index.names
+ assert renamed.columns.names == df.columns.names
+
+ #
+ # with specifying a level (GH13766)
+
+ # dict
+ new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'),
+ ('fizz2', 'buzz2')],
+ names=['fizz', 'buzz'])
+ renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
+ level=0)
+ tm.assert_index_equal(renamed.columns, new_columns)
+ renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
+ level='fizz')
+ tm.assert_index_equal(renamed.columns, new_columns)
+
+ new_columns = MultiIndex.from_tuples([('fizz1', 'buzz1'),
+ ('fizz2', 'buzz3')],
+ names=['fizz', 'buzz'])
+ renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
+ level=1)
+ tm.assert_index_equal(renamed.columns, new_columns)
+ renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
+ level='buzz')
+ tm.assert_index_equal(renamed.columns, new_columns)
+
+ # function
+ func = str.upper
+ new_columns = MultiIndex.from_tuples([('FIZZ1', 'buzz1'),
+ ('FIZZ2', 'buzz2')],
+ names=['fizz', 'buzz'])
+ renamed = df.rename(columns=func, level=0)
+ tm.assert_index_equal(renamed.columns, new_columns)
+ renamed = df.rename(columns=func, level='fizz')
+ tm.assert_index_equal(renamed.columns, new_columns)
+
+ new_columns = MultiIndex.from_tuples([('fizz1', 'BUZZ1'),
+ ('fizz2', 'BUZZ2')],
+ names=['fizz', 'buzz'])
+ renamed = df.rename(columns=func, level=1)
+ tm.assert_index_equal(renamed.columns, new_columns)
+ renamed = df.rename(columns=func, level='buzz')
+ tm.assert_index_equal(renamed.columns, new_columns)
+
+ # index
+ new_index = MultiIndex.from_tuples([('foo3', 'bar1'),
+ ('foo2', 'bar2')],
+ names=['foo', 'bar'])
+ renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'},
+ level=0)
+ tm.assert_index_equal(renamed.index, new_index)
+
+ def test_rename_nocopy(self, float_frame):
+ renamed = float_frame.rename(columns={'C': 'foo'}, copy=False)
+ renamed['foo'] = 1.
+ assert (float_frame['C'] == 1.).all()
+
+ def test_rename_inplace(self, float_frame):
+ float_frame.rename(columns={'C': 'foo'})
+ assert 'C' in float_frame
+ assert 'foo' not in float_frame
+
+ c_id = id(float_frame['C'])
+ float_frame = float_frame.copy()
+ float_frame.rename(columns={'C': 'foo'}, inplace=True)
+
+ assert 'C' not in float_frame
+ assert 'foo' in float_frame
+ assert id(float_frame['foo']) != c_id
+
+ def test_rename_bug(self):
+ # GH 5344
+ # rename set ref_locs, and set_index was not resetting
+ df = DataFrame({0: ['foo', 'bar'], 1: ['bah', 'bas'], 2: [1, 2]})
+ df = df.rename(columns={0: 'a'})
+ df = df.rename(columns={1: 'b'})
+ df = df.set_index(['a', 'b'])
+ df.columns = ['2001-01-01']
+ expected = DataFrame([[1], [2]],
+ index=MultiIndex.from_tuples(
+ [('foo', 'bah'), ('bar', 'bas')],
+ names=['a', 'b']),
+ columns=['2001-01-01'])
+ tm.assert_frame_equal(df, expected)
+
+ def test_rename_bug2(self):
+ # GH 19497
+ # rename was changing Index to MultiIndex if Index contained tuples
+
+ df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)],
+ columns=["a"])
+ df = df.rename({(1, 1): (5, 4)}, axis="index")
+ expected = DataFrame(data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)],
+ columns=["a"])
+ tm.assert_frame_equal(df, expected)
+
+ def test_reorder_levels(self):
+ index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
+ codes=[[0, 0, 0, 0, 0, 0],
+ [0, 1, 2, 0, 1, 2],
+ [0, 1, 0, 1, 0, 1]],
+ names=['L0', 'L1', 'L2'])
+ df = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=index)
+
+ # no change, position
+ result = df.reorder_levels([0, 1, 2])
+ tm.assert_frame_equal(df, result)
+
+ # no change, labels
+ result = df.reorder_levels(['L0', 'L1', 'L2'])
+ tm.assert_frame_equal(df, result)
+
+ # rotate, position
+ result = df.reorder_levels([1, 2, 0])
+ e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']],
+ codes=[[0, 1, 2, 0, 1, 2],
+ [0, 1, 0, 1, 0, 1],
+ [0, 0, 0, 0, 0, 0]],
+ names=['L1', 'L2', 'L0'])
+ expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)},
+ index=e_idx)
+ tm.assert_frame_equal(result, expected)
+
+ result = df.reorder_levels([0, 0, 0])
+ e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']],
+ codes=[[0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0]],
+ names=['L0', 'L0', 'L0'])
+ expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)},
+ index=e_idx)
+ tm.assert_frame_equal(result, expected)
+
+ result = df.reorder_levels(['L0', 'L0', 'L0'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_reset_index(self, float_frame):
+ stacked = float_frame.stack()[::2]
+ stacked = DataFrame({'foo': stacked, 'bar': stacked})
+
+ names = ['first', 'second']
+ stacked.index.names = names
+ deleveled = stacked.reset_index()
+ for i, (lev, level_codes) in enumerate(zip(stacked.index.levels,
+ stacked.index.codes)):
+ values = lev.take(level_codes)
+ name = names[i]
+ tm.assert_index_equal(values, Index(deleveled[name]))
+
+ stacked.index.names = [None, None]
+ deleveled2 = stacked.reset_index()
+ tm.assert_series_equal(deleveled['first'], deleveled2['level_0'],
+ check_names=False)
+ tm.assert_series_equal(deleveled['second'], deleveled2['level_1'],
+ check_names=False)
+
+ # default name assigned
+ rdf = float_frame.reset_index()
+ exp = Series(float_frame.index.values, name='index')
+ tm.assert_series_equal(rdf['index'], exp)
+
+ # default name assigned, corner case
+ df = float_frame.copy()
+ df['index'] = 'foo'
+ rdf = df.reset_index()
+ exp = Series(float_frame.index.values, name='level_0')
+ tm.assert_series_equal(rdf['level_0'], exp)
+
+ # but this is ok
+ float_frame.index.name = 'index'
+ deleveled = float_frame.reset_index()
+ tm.assert_series_equal(deleveled['index'], Series(float_frame.index))
+ tm.assert_index_equal(deleveled.index,
+ Index(np.arange(len(deleveled))))
+
+ # preserve column names
+ float_frame.columns.name = 'columns'
+ resetted = float_frame.reset_index()
+ assert resetted.columns.name == 'columns'
+
+ # only remove certain columns
+ df = float_frame.reset_index().set_index(['index', 'A', 'B'])
+ rs = df.reset_index(['A', 'B'])
+
+ # TODO should reset_index check_names ?
+ tm.assert_frame_equal(rs, float_frame, check_names=False)
+
+ rs = df.reset_index(['index', 'A', 'B'])
+ tm.assert_frame_equal(rs, float_frame.reset_index(), check_names=False)
+
+ rs = df.reset_index('A')
+ xp = float_frame.reset_index().set_index(['index', 'B'])
+ tm.assert_frame_equal(rs, xp, check_names=False)
+
+ # test resetting in place
+ df = float_frame.copy()
+ resetted = float_frame.reset_index()
+ df.reset_index(inplace=True)
+ tm.assert_frame_equal(df, resetted, check_names=False)
+
+ df = float_frame.reset_index().set_index(['index', 'A', 'B'])
+ rs = df.reset_index('A', drop=True)
+ xp = float_frame.copy()
+ del xp['A']
+ xp = xp.set_index(['B'], append=True)
+ tm.assert_frame_equal(rs, xp, check_names=False)
+
+ def test_reset_index_name(self):
+ df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]],
+ columns=['A', 'B', 'C', 'D'],
+ index=Index(range(2), name='x'))
+ assert df.reset_index().index.name is None
+ assert df.reset_index(drop=True).index.name is None
+ df.reset_index(inplace=True)
+ assert df.index.name is None
+
+ def test_reset_index_level(self):
+ df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]],
+ columns=['A', 'B', 'C', 'D'])
+
+ for levels in ['A', 'B'], [0, 1]:
+ # With MultiIndex
+ result = df.set_index(['A', 'B']).reset_index(level=levels[0])
+ tm.assert_frame_equal(result, df.set_index('B'))
+
+ result = df.set_index(['A', 'B']).reset_index(level=levels[:1])
+ tm.assert_frame_equal(result, df.set_index('B'))
+
+ result = df.set_index(['A', 'B']).reset_index(level=levels)
+ tm.assert_frame_equal(result, df)
+
+ result = df.set_index(['A', 'B']).reset_index(level=levels,
+ drop=True)
+ tm.assert_frame_equal(result, df[['C', 'D']])
+
+ # With single-level Index (GH 16263)
+ result = df.set_index('A').reset_index(level=levels[0])
+ tm.assert_frame_equal(result, df)
+
+ result = df.set_index('A').reset_index(level=levels[:1])
+ tm.assert_frame_equal(result, df)
+
+ result = df.set_index(['A']).reset_index(level=levels[0],
+ drop=True)
+ tm.assert_frame_equal(result, df[['B', 'C', 'D']])
+
+ # Missing levels - for both MultiIndex and single-level Index:
+ for idx_lev in ['A', 'B'], ['A']:
+ with pytest.raises(KeyError, match='Level E '):
+ df.set_index(idx_lev).reset_index(level=['A', 'E'])
+ with pytest.raises(IndexError, match='Too many levels'):
+ df.set_index(idx_lev).reset_index(level=[0, 1, 2])
+
+ def test_reset_index_right_dtype(self):
+ time = np.arange(0.0, 10, np.sqrt(2) / 2)
+ s1 = Series((9.81 * time ** 2) / 2,
+ index=Index(time, name='time'),
+ name='speed')
+ df = DataFrame(s1)
+
+ resetted = s1.reset_index()
+ assert resetted['time'].dtype == np.float64
+
+ resetted = df.reset_index()
+ assert resetted['time'].dtype == np.float64
+
+ def test_reset_index_multiindex_col(self):
+ vals = np.random.randn(3, 3).astype(object)
+ idx = ['x', 'y', 'z']
+ full = np.hstack(([[x] for x in idx], vals))
+ df = DataFrame(vals, Index(idx, name='a'),
+ columns=[['b', 'b', 'c'], ['mean', 'median', 'mean']])
+ rs = df.reset_index()
+ xp = DataFrame(full, columns=[['a', 'b', 'b', 'c'],
+ ['', 'mean', 'median', 'mean']])
+ tm.assert_frame_equal(rs, xp)
+
+ rs = df.reset_index(col_fill=None)
+ xp = DataFrame(full, columns=[['a', 'b', 'b', 'c'],
+ ['a', 'mean', 'median', 'mean']])
+ tm.assert_frame_equal(rs, xp)
+
+ rs = df.reset_index(col_level=1, col_fill='blah')
+ xp = DataFrame(full, columns=[['blah', 'b', 'b', 'c'],
+ ['a', 'mean', 'median', 'mean']])
+ tm.assert_frame_equal(rs, xp)
+
+ df = DataFrame(vals,
+ MultiIndex.from_arrays([[0, 1, 2], ['x', 'y', 'z']],
+ names=['d', 'a']),
+ columns=[['b', 'b', 'c'], ['mean', 'median', 'mean']])
+    rs = df.reset_index('a')
+ xp = DataFrame(full, Index([0, 1, 2], name='d'),
+ columns=[['a', 'b', 'b', 'c'],
+ ['', 'mean', 'median', 'mean']])
+ tm.assert_frame_equal(rs, xp)
+
+ rs = df.reset_index('a', col_fill=None)
+ xp = DataFrame(full, Index(lrange(3), name='d'),
+ columns=[['a', 'b', 'b', 'c'],
+ ['a', 'mean', 'median', 'mean']])
+ tm.assert_frame_equal(rs, xp)
+
+ rs = df.reset_index('a', col_fill='blah', col_level=1)
+ xp = DataFrame(full, Index(lrange(3), name='d'),
+ columns=[['blah', 'b', 'b', 'c'],
+ ['a', 'mean', 'median', 'mean']])
+ tm.assert_frame_equal(rs, xp)
+
+ def test_reset_index_multiindex_nan(self):
+ # GH6322, testing reset_index on MultiIndexes
+ # when we have a nan or all nan
+ df = DataFrame({'A': ['a', 'b', 'c'],
+ 'B': [0, 1, np.nan],
+ 'C': np.random.rand(3)})
+ rs = df.set_index(['A', 'B']).reset_index()
+ tm.assert_frame_equal(rs, df)
+
+ df = DataFrame({'A': [np.nan, 'b', 'c'],
+ 'B': [0, 1, 2],
+ 'C': np.random.rand(3)})
+ rs = df.set_index(['A', 'B']).reset_index()
+ tm.assert_frame_equal(rs, df)
+
+ df = DataFrame({'A': ['a', 'b', 'c'],
+ 'B': [0, 1, 2],
+ 'C': [np.nan, 1.1, 2.2]})
+ rs = df.set_index(['A', 'B']).reset_index()
+ tm.assert_frame_equal(rs, df)
+
+ df = DataFrame({'A': ['a', 'b', 'c'],
+ 'B': [np.nan, np.nan, np.nan],
+ 'C': np.random.rand(3)})
+ rs = df.set_index(['A', 'B']).reset_index()
+ tm.assert_frame_equal(rs, df)
+
+ def test_reset_index_with_datetimeindex_cols(self):
+        # GH5818
+ df = DataFrame([[1, 2], [3, 4]],
+ columns=date_range('1/1/2013', '1/2/2013'),
+ index=['A', 'B'])
+
+ result = df.reset_index()
+ expected = DataFrame([['A', 1, 2], ['B', 3, 4]],
+ columns=['index', datetime(2013, 1, 1),
+ datetime(2013, 1, 2)])
+ tm.assert_frame_equal(result, expected)
+
+ def test_reset_index_range(self):
+ # GH 12071
+ df = DataFrame([[0, 0], [1, 1]], columns=['A', 'B'],
+ index=RangeIndex(stop=2))
+ result = df.reset_index()
+ assert isinstance(result.index, RangeIndex)
+ expected = DataFrame([[0, 0, 0], [1, 1, 1]],
+ columns=['index', 'A', 'B'],
+ index=RangeIndex(stop=2))
+ tm.assert_frame_equal(result, expected)
+
+ def test_set_index_names(self):
+ df = tm.makeDataFrame()
+ df.index.name = 'name'
+
+ assert df.set_index(df.index).index.names == ['name']
+
+ mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B'])
+ mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values,
+ names=['A', 'B', 'C', 'D'])
+
+ df = df.set_index(['A', 'B'])
+
+ assert df.set_index(df.index).index.names == ['A', 'B']
+
+ # Check that set_index isn't converting a MultiIndex into an Index
+ assert isinstance(df.set_index(df.index).index, MultiIndex)
+
+ # Check actual equality
+ tm.assert_index_equal(df.set_index(df.index).index, mi)
+
+ idx2 = df.index.rename(['C', 'D'])
+
+ # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather
+ # than a pair of tuples
+ assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex)
+
+ # Check equality
+ tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2)
+
+ def test_rename_objects(self, float_string_frame):
+ renamed = float_string_frame.rename(columns=str.upper)
+
+ assert 'FOO' in renamed
+ assert 'foo' not in renamed
+
+ def test_rename_axis_style(self):
+ # https://github.com/pandas-dev/pandas/issues/12392
+ df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['X', 'Y'])
+ expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=['X', 'Y'])
+
+ result = df.rename(str.lower, axis=1)
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rename(str.lower, axis='columns')
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rename({"A": 'a', 'B': 'b'}, axis=1)
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rename({"A": 'a', 'B': 'b'}, axis='columns')
+ tm.assert_frame_equal(result, expected)
+
+ # Index
+ expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['x', 'y'])
+ result = df.rename(str.lower, axis=0)
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rename(str.lower, axis='index')
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rename({'X': 'x', 'Y': 'y'}, axis=0)
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rename({'X': 'x', 'Y': 'y'}, axis='index')
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rename(mapper=str.lower, axis='index')
+ tm.assert_frame_equal(result, expected)
+
+ def test_rename_mapper_multi(self):
+ df = DataFrame({"A": ['a', 'b'], "B": ['c', 'd'],
+ 'C': [1, 2]}).set_index(["A", "B"])
+ result = df.rename(str.upper)
+ expected = df.rename(index=str.upper)
+ tm.assert_frame_equal(result, expected)
+
+ def test_rename_positional_named(self):
+ # https://github.com/pandas-dev/pandas/issues/12392
+ df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=['X', 'Y'])
+ result = df.rename(str.lower, columns=str.upper)
+ expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['x', 'y'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_rename_axis_style_raises(self):
+ # see gh-12392
+ df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["0", "1"])
+
+ # Named target and axis
+ over_spec_msg = ("Cannot specify both 'axis' and "
+ "any of 'index' or 'columns'")
+ with pytest.raises(TypeError, match=over_spec_msg):
+ df.rename(index=str.lower, axis=1)
+
+ with pytest.raises(TypeError, match=over_spec_msg):
+ df.rename(index=str.lower, axis="columns")
+
+ with pytest.raises(TypeError, match=over_spec_msg):
+ df.rename(columns=str.lower, axis="columns")
+
+ with pytest.raises(TypeError, match=over_spec_msg):
+ df.rename(index=str.lower, axis=0)
+
+ # Multiple targets and axis
+ with pytest.raises(TypeError, match=over_spec_msg):
+ df.rename(str.lower, str.lower, axis="columns")
+
+ # Too many targets
+ over_spec_msg = "Cannot specify all of 'mapper', 'index', 'columns'."
+ with pytest.raises(TypeError, match=over_spec_msg):
+ df.rename(str.lower, str.lower, str.lower)
+
+ # Duplicates
+ with pytest.raises(TypeError, match="multiple values"):
+ df.rename(id, mapper=id)
+
+ def test_reindex_api_equivalence(self):
+ # equivalence of the labels/axis and index/columns API's
+ df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
+ index=['a', 'b', 'c'],
+ columns=['d', 'e', 'f'])
+
+ res1 = df.reindex(['b', 'a'])
+ res2 = df.reindex(index=['b', 'a'])
+ res3 = df.reindex(labels=['b', 'a'])
+ res4 = df.reindex(labels=['b', 'a'], axis=0)
+ res5 = df.reindex(['b', 'a'], axis=0)
+ for res in [res2, res3, res4, res5]:
+ tm.assert_frame_equal(res1, res)
+
+ res1 = df.reindex(columns=['e', 'd'])
+ res2 = df.reindex(['e', 'd'], axis=1)
+ res3 = df.reindex(labels=['e', 'd'], axis=1)
+ for res in [res2, res3]:
+ tm.assert_frame_equal(res1, res)
+
+ res1 = df.reindex(index=['b', 'a'], columns=['e', 'd'])
+ res2 = df.reindex(columns=['e', 'd'], index=['b', 'a'])
+ res3 = df.reindex(labels=['b', 'a'], axis=0).reindex(labels=['e', 'd'],
+ axis=1)
+ for res in [res2, res3]:
+ tm.assert_frame_equal(res1, res)
+
+ def test_rename_positional(self):
+ df = DataFrame(columns=['A', 'B'])
+ with tm.assert_produces_warning(FutureWarning) as rec:
+ result = df.rename(None, str.lower)
+ expected = DataFrame(columns=['a', 'b'])
+ tm.assert_frame_equal(result, expected)
+ assert len(rec) == 1
+ message = str(rec[0].message)
+ assert 'rename' in message
+ assert 'Use named arguments' in message
+
+ def test_assign_columns(self, float_frame):
+ float_frame['hi'] = 'there'
+
+ df = float_frame.copy()
+ df.columns = ['foo', 'bar', 'baz', 'quux', 'foo2']
+ tm.assert_series_equal(float_frame['C'], df['baz'], check_names=False)
+ tm.assert_series_equal(float_frame['hi'], df['foo2'],
+ check_names=False)
+
+ def test_set_index_preserve_categorical_dtype(self):
+ # GH13743, GH13854
+ df = DataFrame({'A': [1, 2, 1, 1, 2],
+ 'B': [10, 16, 22, 28, 34],
+ 'C1': Categorical(list("abaab"),
+ categories=list("bac"),
+ ordered=False),
+ 'C2': Categorical(list("abaab"),
+ categories=list("bac"),
+ ordered=True)})
+ for cols in ['C1', 'C2', ['A', 'C1'], ['A', 'C2'], ['C1', 'C2']]:
+ result = df.set_index(cols).reset_index()
+ result = result.reindex(columns=df.columns)
+ tm.assert_frame_equal(result, df)
+
+ def test_ambiguous_warns(self):
+ df = DataFrame({"A": [1, 2]})
+ with tm.assert_produces_warning(FutureWarning):
+ df.rename(id, id)
+
+ with tm.assert_produces_warning(FutureWarning):
+ df.rename({0: 10}, {"A": "B"})
+
+ @pytest.mark.skipif(PY2, reason="inspect.signature")
+ def test_rename_signature(self):
+ sig = inspect.signature(DataFrame.rename)
+ parameters = set(sig.parameters)
+ assert parameters == {"self", "mapper", "index", "columns", "axis",
+ "inplace", "copy", "level"}
+
+ @pytest.mark.skipif(PY2, reason="inspect.signature")
+ def test_reindex_signature(self):
+ sig = inspect.signature(DataFrame.reindex)
+ parameters = set(sig.parameters)
+ assert parameters == {"self", "labels", "index", "columns", "axis",
+ "limit", "copy", "level", "method",
+ "fill_value", "tolerance"}
+
+ def test_droplevel(self):
+ # GH20342
+ df = DataFrame([
+ [1, 2, 3, 4],
+ [5, 6, 7, 8],
+ [9, 10, 11, 12]
+ ])
+ df = df.set_index([0, 1]).rename_axis(['a', 'b'])
+ df.columns = MultiIndex.from_tuples([('c', 'e'), ('d', 'f')],
+ names=['level_1', 'level_2'])
+
+ # test that dropping of a level in index works
+ expected = df.reset_index('a', drop=True)
+ result = df.droplevel('a', axis='index')
+ tm.assert_frame_equal(result, expected)
+
+ # test that dropping of a level in columns works
+ expected = df.copy()
+ expected.columns = Index(['c', 'd'], name='level_1')
+ result = df.droplevel('level_2', axis='columns')
+ tm.assert_frame_equal(result, expected)
+
+
+class TestIntervalIndex(object):
+
+ def test_setitem(self):
+
+ df = DataFrame({'A': range(10)})
+ s = cut(df.A, 5)
+ assert isinstance(s.cat.categories, IntervalIndex)
+
+ # B & D end up as Categoricals
+        # the remainder are converted to in-line objects
+        # containing the IntervalIndex's values
+ df['B'] = s
+ df['C'] = np.array(s)
+ df['D'] = s.values
+ df['E'] = np.array(s.values)
+
+ assert is_categorical_dtype(df['B'])
+ assert is_interval_dtype(df['B'].cat.categories)
+ assert is_categorical_dtype(df['D'])
+ assert is_interval_dtype(df['D'].cat.categories)
+
+ assert is_object_dtype(df['C'])
+ assert is_object_dtype(df['E'])
+
+ # they compare equal as Index
+ # when converted to numpy objects
+ c = lambda x: Index(np.array(x))
+ tm.assert_index_equal(c(df.B), c(df.B), check_names=False)
+ tm.assert_index_equal(c(df.B), c(df.C), check_names=False)
+ tm.assert_index_equal(c(df.B), c(df.D), check_names=False)
+        tm.assert_index_equal(c(df.B), c(df.E), check_names=False)
+
+ # B & D are the same Series
+ tm.assert_series_equal(df['B'], df['B'], check_names=False)
+ tm.assert_series_equal(df['B'], df['D'], check_names=False)
+
+ # C & E are the same Series
+ tm.assert_series_equal(df['C'], df['C'], check_names=False)
+ tm.assert_series_equal(df['C'], df['E'], check_names=False)
+
+ def test_set_reset_index(self):
+
+ df = DataFrame({'A': range(10)})
+ s = cut(df.A, 5)
+ df['B'] = s
+ df = df.set_index('B')
+
+ df = df.reset_index()
+
+ def test_set_axis_inplace(self):
+ # GH14636
+ df = DataFrame({'A': [1.1, 2.2, 3.3],
+ 'B': [5.0, 6.1, 7.2],
+ 'C': [4.4, 5.5, 6.6]},
+ index=[2010, 2011, 2012])
+
+ expected = {0: df.copy(),
+ 1: df.copy()}
+ expected[0].index = list('abc')
+ expected[1].columns = list('abc')
+ expected['index'] = expected[0]
+ expected['columns'] = expected[1]
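+        # each expectation is registered under both the positional (0/1)
+        # and the named ('index'/'columns') spelling of the axis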
+
+ for axis in expected:
+ # inplace=True
+ # The FutureWarning comes from the fact that we would like to have
+ # inplace default to False some day
+ for inplace, warn in (None, FutureWarning), (True, None):
+ kwargs = {'inplace': inplace}
+
+ result = df.copy()
+ with tm.assert_produces_warning(warn):
+ result.set_axis(list('abc'), axis=axis, **kwargs)
+ tm.assert_frame_equal(result, expected[axis])
+
+ # inplace=False
+ result = df.set_axis(list('abc'), axis=axis, inplace=False)
+ tm.assert_frame_equal(expected[axis], result)
+
+ # omitting the "axis" parameter
+ with tm.assert_produces_warning(None):
+ result = df.set_axis(list('abc'), inplace=False)
+ tm.assert_frame_equal(result, expected[0])
+
+ # wrong values for the "axis" parameter
+ for axis in 3, 'foo':
+ with pytest.raises(ValueError, match='No axis named'):
+ df.set_axis(list('abc'), axis=axis, inplace=False)
+
+ def test_set_axis_prior_to_deprecation_signature(self):
+ df = DataFrame({'A': [1.1, 2.2, 3.3],
+ 'B': [5.0, 6.1, 7.2],
+ 'C': [4.4, 5.5, 6.6]},
+ index=[2010, 2011, 2012])
+
+ expected = {0: df.copy(),
+ 1: df.copy()}
+ expected[0].index = list('abc')
+ expected[1].columns = list('abc')
+ expected['index'] = expected[0]
+ expected['columns'] = expected[1]
+
+ # old signature
+ for axis in expected:
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.set_axis(axis, list('abc'), inplace=False)
+ tm.assert_frame_equal(result, expected[axis])
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_analytics.py b/contrib/python/pandas/py2/pandas/tests/frame/test_analytics.py
new file mode 100644
index 00000000000..456af34e749
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_analytics.py
@@ -0,0 +1,2393 @@
+# -*- coding: utf-8 -*-
+
+from datetime import timedelta
+import operator
+from string import ascii_lowercase
+import warnings
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY35, lrange
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, MultiIndex, Series, Timestamp, compat, date_range,
+ isna, notna, to_datetime, to_timedelta)
+import pandas.core.algorithms as algorithms
+import pandas.core.nanops as nanops
+import pandas.util.testing as tm
+
+
+def assert_stat_op_calc(opname, alternative, frame, has_skipna=True,
+ check_dtype=True, check_dates=False,
+ check_less_precise=False, skipna_alternative=None):
+ """
+ Check that operator opname works as advertised on frame
+
+ Parameters
+ ----------
+ opname : string
+ Name of the operator to test on frame
+ alternative : function
+ Function that opname is tested against; i.e. "frame.opname()" should
+ equal "alternative(frame)".
+ frame : DataFrame
+ The object that the tests are executed on
+ has_skipna : bool, default True
+ Whether the method "opname" has the kwarg "skip_na"
+ check_dtype : bool, default True
+ Whether the dtypes of the result of "frame.opname()" and
+ "alternative(frame)" should be checked.
+    check_dates : bool, default False
+ Whether opname should be tested on a Datetime Series
+ check_less_precise : bool, default False
+ Whether results should only be compared approximately;
+ passed on to tm.assert_series_equal
+ skipna_alternative : function, default None
+ NaN-safe version of alternative
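+
+    Examples
+    --------
+    A typical invocation (mirroring ``test_sum`` below)::
+
+        assert_stat_op_calc('sum', np.sum, float_frame_with_na,
+                            skipna_alternative=np.nansum)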
+ """
+
+ f = getattr(frame, opname)
+
+ if check_dates:
+ df = DataFrame({'b': date_range('1/1/2001', periods=2)})
+ result = getattr(df, opname)()
+ assert isinstance(result, Series)
+
+ df['a'] = lrange(len(df))
+ result = getattr(df, opname)()
+ assert isinstance(result, Series)
+ assert len(result)
+
+ if has_skipna:
+ def wrapper(x):
+ return alternative(x.values)
+
+ skipna_wrapper = tm._make_skipna_wrapper(alternative,
+ skipna_alternative)
+ result0 = f(axis=0, skipna=False)
+ result1 = f(axis=1, skipna=False)
+ tm.assert_series_equal(result0, frame.apply(wrapper),
+ check_dtype=check_dtype,
+ check_less_precise=check_less_precise)
+ # HACK: win32
+ tm.assert_series_equal(result1, frame.apply(wrapper, axis=1),
+ check_dtype=False,
+ check_less_precise=check_less_precise)
+ else:
+ skipna_wrapper = alternative
+
+ result0 = f(axis=0)
+ result1 = f(axis=1)
+ tm.assert_series_equal(result0, frame.apply(skipna_wrapper),
+ check_dtype=check_dtype,
+ check_less_precise=check_less_precise)
+
+ if opname in ['sum', 'prod']:
+ expected = frame.apply(skipna_wrapper, axis=1)
+ tm.assert_series_equal(result1, expected, check_dtype=False,
+ check_less_precise=check_less_precise)
+
+ # check dtypes
+ if check_dtype:
+ lcd_dtype = frame.values.dtype
+ assert lcd_dtype == result0.dtype
+ assert lcd_dtype == result1.dtype
+
+ # bad axis
+ with pytest.raises(ValueError, match='No axis named 2'):
+ f(axis=2)
+
+ # all NA case
+ if has_skipna:
+ all_na = frame * np.NaN
+ r0 = getattr(all_na, opname)(axis=0)
+ r1 = getattr(all_na, opname)(axis=1)
+ if opname in ['sum', 'prod']:
+ unit = 1 if opname == 'prod' else 0 # result for empty sum/prod
+ expected = pd.Series(unit, index=r0.index, dtype=r0.dtype)
+ tm.assert_series_equal(r0, expected)
+ expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
+ tm.assert_series_equal(r1, expected)
+
+
+def assert_stat_op_api(opname, float_frame, float_string_frame,
+ has_numeric_only=False):
+ """
+ Check that API for operator opname works as advertised on frame
+
+ Parameters
+ ----------
+ opname : string
+ Name of the operator to test on frame
+ float_frame : DataFrame
+ DataFrame with columns of type float
+ float_string_frame : DataFrame
+ DataFrame with both float and string columns
+ has_numeric_only : bool, default False
+ Whether the method "opname" has the kwarg "numeric_only"
+ """
+
+ # make sure works on mixed-type frame
+ getattr(float_string_frame, opname)(axis=0)
+ getattr(float_string_frame, opname)(axis=1)
+
+ if has_numeric_only:
+ getattr(float_string_frame, opname)(axis=0, numeric_only=True)
+ getattr(float_string_frame, opname)(axis=1, numeric_only=True)
+ getattr(float_frame, opname)(axis=0, numeric_only=False)
+ getattr(float_frame, opname)(axis=1, numeric_only=False)
+
+
+def assert_bool_op_calc(opname, alternative, frame, has_skipna=True):
+ """
+ Check that bool operator opname works as advertised on frame
+
+ Parameters
+ ----------
+ opname : string
+ Name of the operator to test on frame
+ alternative : function
+ Function that opname is tested against; i.e. "frame.opname()" should
+ equal "alternative(frame)".
+ frame : DataFrame
+ The object that the tests are executed on
+ has_skipna : bool, default True
+ Whether the method "opname" has the kwarg "skip_na"
+ """
+
+ f = getattr(frame, opname)
+
+ if has_skipna:
+ def skipna_wrapper(x):
+ nona = x.dropna().values
+ return alternative(nona)
+
+ def wrapper(x):
+ return alternative(x.values)
+
+ result0 = f(axis=0, skipna=False)
+ result1 = f(axis=1, skipna=False)
+
+ tm.assert_series_equal(result0, frame.apply(wrapper))
+ tm.assert_series_equal(result1, frame.apply(wrapper, axis=1),
+ check_dtype=False) # HACK: win32
+ else:
+ skipna_wrapper = alternative
+ wrapper = alternative
+
+ result0 = f(axis=0)
+ result1 = f(axis=1)
+
+ tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
+ tm.assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
+ check_dtype=False)
+
+ # bad axis
+ with pytest.raises(ValueError, match='No axis named 2'):
+ f(axis=2)
+
+ # all NA case
+ if has_skipna:
+ all_na = frame * np.NaN
+ r0 = getattr(all_na, opname)(axis=0)
+ r1 = getattr(all_na, opname)(axis=1)
+ if opname == 'any':
+ assert not r0.any()
+ assert not r1.any()
+ else:
+ assert r0.all()
+ assert r1.all()
+
+
+def assert_bool_op_api(opname, bool_frame_with_na, float_string_frame,
+ has_bool_only=False):
+ """
+ Check that API for boolean operator opname works as advertised on frame
+
+ Parameters
+ ----------
+ opname : string
+ Name of the operator to test on frame
+    bool_frame_with_na : DataFrame
+        DataFrame with boolean columns and some NA values
+ float_string_frame : DataFrame
+ DataFrame with both float and string columns
+ has_bool_only : bool, default False
+ Whether the method "opname" has the kwarg "bool_only"
+ """
+ # make sure op works on mixed-type frame
+ mixed = float_string_frame
+ mixed['_bool_'] = np.random.randn(len(mixed)) > 0.5
+ getattr(mixed, opname)(axis=0)
+ getattr(mixed, opname)(axis=1)
+
+ if has_bool_only:
+ getattr(mixed, opname)(axis=0, bool_only=True)
+ getattr(mixed, opname)(axis=1, bool_only=True)
+ getattr(bool_frame_with_na, opname)(axis=0, bool_only=False)
+ getattr(bool_frame_with_na, opname)(axis=1, bool_only=False)
+
+
+class TestDataFrameAnalytics(object):
+
+    # ----------------------------------------------------------------------
+ # Correlation and covariance
+
+ @td.skip_if_no_scipy
+ def test_corr_pearson(self, float_frame):
+ float_frame['A'][:5] = np.nan
+ float_frame['B'][5:10] = np.nan
+
+ self._check_method(float_frame, 'pearson')
+
+ @td.skip_if_no_scipy
+ def test_corr_kendall(self, float_frame):
+ float_frame['A'][:5] = np.nan
+ float_frame['B'][5:10] = np.nan
+
+ self._check_method(float_frame, 'kendall')
+
+ @td.skip_if_no_scipy
+ def test_corr_spearman(self, float_frame):
+ float_frame['A'][:5] = np.nan
+ float_frame['B'][5:10] = np.nan
+
+ self._check_method(float_frame, 'spearman')
+
+ def _check_method(self, frame, method='pearson'):
+ correls = frame.corr(method=method)
+ expected = frame['A'].corr(frame['C'], method=method)
+ tm.assert_almost_equal(correls['A']['C'], expected)
+
+ @td.skip_if_no_scipy
+ def test_corr_non_numeric(self, float_frame, float_string_frame):
+ float_frame['A'][:5] = np.nan
+ float_frame['B'][5:10] = np.nan
+
+ # exclude non-numeric types
+ result = float_string_frame.corr()
+ expected = float_string_frame.loc[:, ['A', 'B', 'C', 'D']].corr()
+ tm.assert_frame_equal(result, expected)
+
+ @td.skip_if_no_scipy
+ @pytest.mark.parametrize('meth', ['pearson', 'kendall', 'spearman'])
+ def test_corr_nooverlap(self, meth):
+ # nothing in common
+ df = DataFrame({'A': [1, 1.5, 1, np.nan, np.nan, np.nan],
+ 'B': [np.nan, np.nan, np.nan, 1, 1.5, 1],
+ 'C': [np.nan, np.nan, np.nan, np.nan,
+ np.nan, np.nan]})
+ rs = df.corr(meth)
+ assert isna(rs.loc['A', 'B'])
+ assert isna(rs.loc['B', 'A'])
+ assert rs.loc['A', 'A'] == 1
+ assert rs.loc['B', 'B'] == 1
+ assert isna(rs.loc['C', 'C'])
+
+ @td.skip_if_no_scipy
+ @pytest.mark.parametrize('meth', ['pearson', 'spearman'])
+ def test_corr_constant(self, meth):
+ # constant --> all NA
+
+ df = DataFrame({'A': [1, 1, 1, np.nan, np.nan, np.nan],
+ 'B': [np.nan, np.nan, np.nan, 1, 1, 1]})
+ rs = df.corr(meth)
+ assert isna(rs.values).all()
+
+ def test_corr_int(self):
+ # dtypes other than float64 #1761
+ df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})
+
+ df3.cov()
+ df3.corr()
+
+ @td.skip_if_no_scipy
+ def test_corr_int_and_boolean(self):
+        # when the dtypes of the pandas Series differ,
+        # the intermediate ndarray will have dtype=object,
+        # so it needs to be handled properly
+ df = DataFrame({"a": [True, False], "b": [1, 0]})
+
+        expected = DataFrame(np.ones((2, 2)),
+                             index=['a', 'b'], columns=['a', 'b'])
+ for meth in ['pearson', 'kendall', 'spearman']:
+
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", RuntimeWarning)
+ result = df.corr(meth)
+ tm.assert_frame_equal(result, expected)
+
+ def test_corr_cov_independent_index_column(self):
+ # GH 14617
+ df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4),
+ columns=list("abcd"))
+ for method in ['cov', 'corr']:
+ result = getattr(df, method)()
+ assert result.index is not result.columns
+ assert result.index.equals(result.columns)
+
+ def test_corr_invalid_method(self):
+ # GH 22298
+ df = pd.DataFrame(np.random.normal(size=(10, 2)))
+ msg = ("method must be either 'pearson', 'spearman', "
+ "or 'kendall'")
+ with pytest.raises(ValueError, match=msg):
+ df.corr(method="____")
+
+ def test_cov(self, float_frame, float_string_frame):
+ # min_periods no NAs (corner case)
+ expected = float_frame.cov()
+ result = float_frame.cov(min_periods=len(float_frame))
+
+ tm.assert_frame_equal(expected, result)
+
+ result = float_frame.cov(min_periods=len(float_frame) + 1)
+ assert isna(result.values).all()
+
+        # with NAs
+        frame = float_frame.copy()
+        frame['A'][:5] = np.nan
+        frame['B'][5:10] = np.nan
+        result = frame.cov(min_periods=len(frame) - 8)
+        expected = frame.cov()
+        expected.loc['A', 'B'] = np.nan
+        expected.loc['B', 'A'] = np.nan
+        tm.assert_frame_equal(result, expected)
+
+ # regular
+ float_frame['A'][:5] = np.nan
+ float_frame['B'][:10] = np.nan
+ cov = float_frame.cov()
+
+ tm.assert_almost_equal(cov['A']['C'],
+ float_frame['A'].cov(float_frame['C']))
+
+ # exclude non-numeric types
+ result = float_string_frame.cov()
+ expected = float_string_frame.loc[:, ['A', 'B', 'C', 'D']].cov()
+ tm.assert_frame_equal(result, expected)
+
+ # Single column frame
+ df = DataFrame(np.linspace(0.0, 1.0, 10))
+ result = df.cov()
+ expected = DataFrame(np.cov(df.values.T).reshape((1, 1)),
+ index=df.columns, columns=df.columns)
+ tm.assert_frame_equal(result, expected)
+ df.loc[0] = np.nan
+ result = df.cov()
+ expected = DataFrame(np.cov(df.values[1:].T).reshape((1, 1)),
+ index=df.columns, columns=df.columns)
+ tm.assert_frame_equal(result, expected)
+
+ def test_corrwith(self, datetime_frame):
+ a = datetime_frame
+ noise = Series(np.random.randn(len(a)), index=a.index)
+
+ b = datetime_frame.add(noise, axis=0)
+
+ # make sure order does not matter
+ b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
+ del b['B']
+
+ colcorr = a.corrwith(b, axis=0)
+ tm.assert_almost_equal(colcorr['A'], a['A'].corr(b['A']))
+
+ rowcorr = a.corrwith(b, axis=1)
+ tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))
+
+ dropped = a.corrwith(b, axis=0, drop=True)
+ tm.assert_almost_equal(dropped['A'], a['A'].corr(b['A']))
+ assert 'B' not in dropped
+
+ dropped = a.corrwith(b, axis=1, drop=True)
+ assert a.index[-1] not in dropped.index
+
+ # non time-series data
+ index = ['a', 'b', 'c', 'd', 'e']
+ columns = ['one', 'two', 'three', 'four']
+ df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns)
+ df2 = DataFrame(np.random.randn(4, 4),
+ index=index[:4], columns=columns)
+ correls = df1.corrwith(df2, axis=1)
+ for row in index[:4]:
+ tm.assert_almost_equal(correls[row],
+ df1.loc[row].corr(df2.loc[row]))
+
+ def test_corrwith_with_objects(self):
+ df1 = tm.makeTimeDataFrame()
+ df2 = tm.makeTimeDataFrame()
+ cols = ['A', 'B', 'C', 'D']
+
+ df1['obj'] = 'foo'
+ df2['obj'] = 'bar'
+
+ result = df1.corrwith(df2)
+ expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
+ tm.assert_series_equal(result, expected)
+
+ result = df1.corrwith(df2, axis=1)
+ expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
+ tm.assert_series_equal(result, expected)
+
+ def test_corrwith_series(self, datetime_frame):
+ result = datetime_frame.corrwith(datetime_frame['A'])
+ expected = datetime_frame.apply(datetime_frame['A'].corr)
+
+ tm.assert_series_equal(result, expected)
+
+ def test_corrwith_matches_corrcoef(self):
+ df1 = DataFrame(np.arange(10000), columns=['a'])
+ df2 = DataFrame(np.arange(10000) ** 2, columns=['a'])
+ c1 = df1.corrwith(df2)['a']
+ c2 = np.corrcoef(df1['a'], df2['a'])[0][1]
+
+ tm.assert_almost_equal(c1, c2)
+ assert c1 < 1
+
+ def test_corrwith_mixed_dtypes(self):
+ # GH 18570
+ df = pd.DataFrame({'a': [1, 4, 3, 2], 'b': [4, 6, 7, 3],
+ 'c': ['a', 'b', 'c', 'd']})
+ s = pd.Series([0, 6, 7, 3])
+ result = df.corrwith(s)
+ corrs = [df['a'].corr(s), df['b'].corr(s)]
+ expected = pd.Series(data=corrs, index=['a', 'b'])
+ tm.assert_series_equal(result, expected)
+
+ def test_corrwith_index_intersection(self):
+ df1 = pd.DataFrame(np.random.random(size=(10, 2)),
+ columns=["a", "b"])
+ df2 = pd.DataFrame(np.random.random(size=(10, 3)),
+ columns=["a", "b", "c"])
+
+ result = df1.corrwith(df2, drop=True).index.sort_values()
+ expected = df1.columns.intersection(df2.columns).sort_values()
+ tm.assert_index_equal(result, expected)
+
+ def test_corrwith_index_union(self):
+ df1 = pd.DataFrame(np.random.random(size=(10, 2)),
+ columns=["a", "b"])
+ df2 = pd.DataFrame(np.random.random(size=(10, 3)),
+ columns=["a", "b", "c"])
+
+ result = df1.corrwith(df2, drop=False).index.sort_values()
+ expected = df1.columns.union(df2.columns).sort_values()
+ tm.assert_index_equal(result, expected)
+
+ def test_corrwith_dup_cols(self):
+ # GH 21925
+ df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T)
+ df2 = df1.copy()
+ df2 = pd.concat((df2, df2[0]), axis=1)
+
+ result = df1.corrwith(df2)
+ expected = pd.Series(np.ones(4), index=[0, 0, 1, 2])
+ tm.assert_series_equal(result, expected)
+
+ @td.skip_if_no_scipy
+ def test_corrwith_spearman(self):
+ # GH 21925
+ df = pd.DataFrame(np.random.random(size=(100, 3)))
+ result = df.corrwith(df**2, method="spearman")
+ expected = Series(np.ones(len(result)))
+ tm.assert_series_equal(result, expected)
+
+ @td.skip_if_no_scipy
+ def test_corrwith_kendall(self):
+ # GH 21925
+ df = pd.DataFrame(np.random.random(size=(100, 3)))
+ result = df.corrwith(df**2, method="kendall")
+ expected = Series(np.ones(len(result)))
+ tm.assert_series_equal(result, expected)
+
+ def test_bool_describe_in_mixed_frame(self):
+ df = DataFrame({
+ 'string_data': ['a', 'b', 'c', 'd', 'e'],
+ 'bool_data': [True, True, False, False, False],
+ 'int_data': [10, 20, 30, 40, 50],
+ })
+
+ # Integer data are included in .describe() output,
+ # Boolean and string data are not.
+ result = df.describe()
+ expected = DataFrame({'int_data': [5, 30, df.int_data.std(),
+ 10, 20, 30, 40, 50]},
+ index=['count', 'mean', 'std', 'min', '25%',
+ '50%', '75%', 'max'])
+ tm.assert_frame_equal(result, expected)
+
+ # Top value is a boolean value that is False
+ result = df.describe(include=['bool'])
+
+ expected = DataFrame({'bool_data': [5, 2, False, 3]},
+ index=['count', 'unique', 'top', 'freq'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_describe_bool_frame(self):
+ # GH 13891
+ df = pd.DataFrame({
+ 'bool_data_1': [False, False, True, True],
+ 'bool_data_2': [False, True, True, True]
+ })
+ result = df.describe()
+ expected = DataFrame({'bool_data_1': [4, 2, True, 2],
+ 'bool_data_2': [4, 2, True, 3]},
+ index=['count', 'unique', 'top', 'freq'])
+ tm.assert_frame_equal(result, expected)
+
+ df = pd.DataFrame({
+ 'bool_data': [False, False, True, True, False],
+ 'int_data': [0, 1, 2, 3, 4]
+ })
+ result = df.describe()
+ expected = DataFrame({'int_data': [5, 2, df.int_data.std(), 0, 1,
+ 2, 3, 4]},
+ index=['count', 'mean', 'std', 'min', '25%',
+ '50%', '75%', 'max'])
+ tm.assert_frame_equal(result, expected)
+
+ df = pd.DataFrame({
+ 'bool_data': [False, False, True, True],
+ 'str_data': ['a', 'b', 'c', 'a']
+ })
+ result = df.describe()
+ expected = DataFrame({'bool_data': [4, 2, True, 2],
+ 'str_data': [4, 3, 'a', 2]},
+ index=['count', 'unique', 'top', 'freq'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_describe_categorical(self):
+ df = DataFrame({'value': np.random.randint(0, 10000, 100)})
+ labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
+ cat_labels = Categorical(labels, labels)
+
+ df = df.sort_values(by=['value'], ascending=True)
+ df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
+ right=False, labels=cat_labels)
+ cat = df
+
+ # Categoricals should not show up together with numerical columns
+ result = cat.describe()
+ assert len(result.columns) == 1
+
+ # In a frame, describe() for the cat should be the same as for string
+ # arrays (count, unique, top, freq)
+
+ cat = Categorical(["a", "b", "b", "b"], categories=['a', 'b', 'c'],
+ ordered=True)
+ s = Series(cat)
+ result = s.describe()
+ expected = Series([4, 2, "b", 3],
+ index=['count', 'unique', 'top', 'freq'])
+ tm.assert_series_equal(result, expected)
+
+ cat = Series(Categorical(["a", "b", "c", "c"]))
+ df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
+ result = df3.describe()
+ tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
+
+ def test_describe_categorical_columns(self):
+ # GH 11558
+ columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
+ ordered=True, name='XXX')
+ df = DataFrame({'int1': [10, 20, 30, 40, 50],
+ 'int2': [10, 20, 30, 40, 50],
+ 'obj': ['A', 0, None, 'X', 1]},
+ columns=columns)
+ result = df.describe()
+
+ exp_columns = pd.CategoricalIndex(['int1', 'int2'],
+ categories=['int1', 'int2', 'obj'],
+ ordered=True, name='XXX')
+ expected = DataFrame({'int1': [5, 30, df.int1.std(),
+ 10, 20, 30, 40, 50],
+ 'int2': [5, 30, df.int2.std(),
+ 10, 20, 30, 40, 50]},
+ index=['count', 'mean', 'std', 'min', '25%',
+ '50%', '75%', 'max'],
+ columns=exp_columns)
+ tm.assert_frame_equal(result, expected)
+ tm.assert_categorical_equal(result.columns.values,
+ expected.columns.values)
+
+ def test_describe_datetime_columns(self):
+ columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
+ freq='MS', tz='US/Eastern', name='XXX')
+ df = DataFrame({0: [10, 20, 30, 40, 50],
+ 1: [10, 20, 30, 40, 50],
+ 2: ['A', 0, None, 'X', 1]})
+ df.columns = columns
+ result = df.describe()
+
+ exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'],
+ freq='MS', tz='US/Eastern', name='XXX')
+ expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(),
+ 10, 20, 30, 40, 50],
+ 1: [5, 30, df.iloc[:, 1].std(),
+ 10, 20, 30, 40, 50]},
+ index=['count', 'mean', 'std', 'min', '25%',
+ '50%', '75%', 'max'])
+ expected.columns = exp_columns
+ tm.assert_frame_equal(result, expected)
+ assert result.columns.freq == 'MS'
+ assert result.columns.tz == expected.columns.tz
+
+ def test_describe_timedelta_values(self):
+ # GH 6145
+ t1 = pd.timedelta_range('1 days', freq='D', periods=5)
+ t2 = pd.timedelta_range('1 hours', freq='H', periods=5)
+ df = pd.DataFrame({'t1': t1, 't2': t2})
+
+ expected = DataFrame({'t1': [5, pd.Timedelta('3 days'),
+ df.iloc[:, 0].std(),
+ pd.Timedelta('1 days'),
+ pd.Timedelta('2 days'),
+ pd.Timedelta('3 days'),
+ pd.Timedelta('4 days'),
+ pd.Timedelta('5 days')],
+ 't2': [5, pd.Timedelta('3 hours'),
+ df.iloc[:, 1].std(),
+ pd.Timedelta('1 hours'),
+ pd.Timedelta('2 hours'),
+ pd.Timedelta('3 hours'),
+ pd.Timedelta('4 hours'),
+ pd.Timedelta('5 hours')]},
+ index=['count', 'mean', 'std', 'min', '25%',
+ '50%', '75%', 'max'])
+
+ result = df.describe()
+ tm.assert_frame_equal(result, expected)
+
+ exp_repr = (" t1 t2\n"
+ "count 5 5\n"
+ "mean 3 days 00:00:00 0 days 03:00:00\n"
+ "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n"
+ "min 1 days 00:00:00 0 days 01:00:00\n"
+ "25% 2 days 00:00:00 0 days 02:00:00\n"
+ "50% 3 days 00:00:00 0 days 03:00:00\n"
+ "75% 4 days 00:00:00 0 days 04:00:00\n"
+ "max 5 days 00:00:00 0 days 05:00:00")
+ assert repr(result) == exp_repr
+
+ def test_describe_tz_values(self, tz_naive_fixture):
+ # GH 21332
+ tz = tz_naive_fixture
+ s1 = Series(range(5))
+ start = Timestamp(2018, 1, 1)
+ end = Timestamp(2018, 1, 5)
+ s2 = Series(date_range(start, end, tz=tz))
+ df = pd.DataFrame({'s1': s1, 's2': s2})
+
+ expected = DataFrame({'s1': [5, np.nan, np.nan, np.nan, np.nan, np.nan,
+ 2, 1.581139, 0, 1, 2, 3, 4],
+ 's2': [5, 5, s2.value_counts().index[0], 1,
+ start.tz_localize(tz),
+ end.tz_localize(tz), np.nan, np.nan,
+ np.nan, np.nan, np.nan, np.nan, np.nan]},
+ index=['count', 'unique', 'top', 'freq', 'first',
+ 'last', 'mean', 'std', 'min', '25%', '50%',
+ '75%', 'max']
+ )
+ result = df.describe(include='all')
+ tm.assert_frame_equal(result, expected)
+
+ def test_reduce_mixed_frame(self):
+ # GH 6806
+ df = DataFrame({
+ 'bool_data': [True, True, False, False, False],
+ 'int_data': [10, 20, 30, 40, 50],
+ 'string_data': ['a', 'b', 'c', 'd', 'e'],
+ })
+ df.reindex(columns=['bool_data', 'int_data', 'string_data'])
+ test = df.sum(axis=0)
+ tm.assert_numpy_array_equal(test.values,
+ np.array([2, 150, 'abcde'], dtype=object))
+ tm.assert_series_equal(test, df.T.sum(axis=1))
+
+ def test_count(self, float_frame_with_na, float_frame, float_string_frame):
+ f = lambda s: notna(s).sum()
+ assert_stat_op_calc('count', f, float_frame_with_na, has_skipna=False,
+ check_dtype=False, check_dates=True)
+ assert_stat_op_api('count', float_frame, float_string_frame,
+ has_numeric_only=True)
+
+ # corner case
+ frame = DataFrame()
+ ct1 = frame.count(1)
+ assert isinstance(ct1, Series)
+
+ ct2 = frame.count(0)
+ assert isinstance(ct2, Series)
+
+ # GH 423
+ df = DataFrame(index=lrange(10))
+ result = df.count(1)
+ expected = Series(0, index=df.index)
+ tm.assert_series_equal(result, expected)
+
+ df = DataFrame(columns=lrange(10))
+ result = df.count(0)
+ expected = Series(0, index=df.columns)
+ tm.assert_series_equal(result, expected)
+
+ df = DataFrame()
+ result = df.count()
+ expected = Series(0, index=[])
+ tm.assert_series_equal(result, expected)
+
+ def test_nunique(self, float_frame_with_na, float_frame,
+ float_string_frame):
+ f = lambda s: len(algorithms.unique1d(s.dropna()))
+ assert_stat_op_calc('nunique', f, float_frame_with_na,
+ has_skipna=False, check_dtype=False,
+ check_dates=True)
+ assert_stat_op_api('nunique', float_frame, float_string_frame)
+
+ df = DataFrame({'A': [1, 1, 1],
+ 'B': [1, 2, 3],
+ 'C': [1, np.nan, 3]})
+ tm.assert_series_equal(df.nunique(), Series({'A': 1, 'B': 3, 'C': 2}))
+ tm.assert_series_equal(df.nunique(dropna=False),
+ Series({'A': 1, 'B': 3, 'C': 3}))
+ tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2}))
+ tm.assert_series_equal(df.nunique(axis=1, dropna=False),
+ Series({0: 1, 1: 3, 2: 2}))
+
+ def test_sum(self, float_frame_with_na, mixed_float_frame,
+ float_frame, float_string_frame):
+ assert_stat_op_api('sum', float_frame, float_string_frame,
+ has_numeric_only=True)
+ assert_stat_op_calc('sum', np.sum, float_frame_with_na,
+ skipna_alternative=np.nansum)
+ # mixed types (with upcasting happening)
+ assert_stat_op_calc('sum', np.sum, mixed_float_frame.astype('float32'),
+ check_dtype=False, check_less_precise=True)
+
+ @pytest.mark.parametrize('method', ['sum', 'mean', 'prod', 'var',
+ 'std', 'skew', 'min', 'max'])
+ def test_stat_operators_attempt_obj_array(self, method):
+ # GH 676
+ data = {
+ 'a': [-0.00049987540199591344, -0.0016467257772919831,
+ 0.00067695870775883013],
+ 'b': [-0, -0, 0.0],
+ 'c': [0.00031111847529610595, 0.0014902627951905339,
+ -0.00094099200035979691]
+ }
+ df1 = DataFrame(data, index=['foo', 'bar', 'baz'], dtype='O')
+
+ df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3],
+ 2: [np.nan, 4]}, dtype=object)
+
+ for df in [df1, df2]:
+ assert df.values.dtype == np.object_
+ result = getattr(df, method)(1)
+ expected = getattr(df.astype('f8'), method)(1)
+
+ if method in ['sum', 'prod']:
+ tm.assert_series_equal(result, expected)
+
+ def test_mean(self, float_frame_with_na, float_frame, float_string_frame):
+ assert_stat_op_calc('mean', np.mean, float_frame_with_na,
+ check_dates=True)
+ assert_stat_op_api('mean', float_frame, float_string_frame)
+
+ @pytest.mark.parametrize('tz', [None, 'UTC'])
+ def test_mean_mixed_datetime_numeric(self, tz):
+ # https://github.com/pandas-dev/pandas/issues/24752
+ df = pd.DataFrame({"A": [1, 1],
+ "B": [pd.Timestamp('2000', tz=tz)] * 2})
+ result = df.mean()
+ expected = pd.Series([1.0], index=['A'])
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('tz', [None, 'UTC'])
+    def test_mean_excludes_datetimes(self, tz):
+ # https://github.com/pandas-dev/pandas/issues/24752
+ # Our long-term desired behavior is unclear, but the behavior in
+ # 0.24.0rc1 was buggy.
+ df = pd.DataFrame({"A": [pd.Timestamp('2000', tz=tz)] * 2})
+ result = df.mean()
+ expected = pd.Series()
+ tm.assert_series_equal(result, expected)
+
+ def test_product(self, float_frame_with_na, float_frame,
+ float_string_frame):
+ assert_stat_op_calc('product', np.prod, float_frame_with_na)
+ assert_stat_op_api('product', float_frame, float_string_frame)
+
+ # TODO: Ensure warning isn't emitted in the first place
+ @pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning")
+ def test_median(self, float_frame_with_na, float_frame,
+ float_string_frame):
+ def wrapper(x):
+ if isna(x).any():
+ return np.nan
+ return np.median(x)
+
+ assert_stat_op_calc('median', wrapper, float_frame_with_na,
+ check_dates=True)
+ assert_stat_op_api('median', float_frame, float_string_frame)
+
+ def test_min(self, float_frame_with_na, int_frame,
+ float_frame, float_string_frame):
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", RuntimeWarning)
+ assert_stat_op_calc('min', np.min, float_frame_with_na,
+ check_dates=True)
+ assert_stat_op_calc('min', np.min, int_frame)
+ assert_stat_op_api('min', float_frame, float_string_frame)
+
+ def test_cummin(self, datetime_frame):
+ datetime_frame.loc[5:10, 0] = np.nan
+ datetime_frame.loc[10:15, 1] = np.nan
+ datetime_frame.loc[15:, 2] = np.nan
+
+ # axis = 0
+ cummin = datetime_frame.cummin()
+ expected = datetime_frame.apply(Series.cummin)
+ tm.assert_frame_equal(cummin, expected)
+
+ # axis = 1
+ cummin = datetime_frame.cummin(axis=1)
+ expected = datetime_frame.apply(Series.cummin, axis=1)
+ tm.assert_frame_equal(cummin, expected)
+
+ # it works
+ df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
+ result = df.cummin() # noqa
+
+ # fix issue
+ cummin_xs = datetime_frame.cummin(axis=1)
+ assert np.shape(cummin_xs) == np.shape(datetime_frame)
+
+ def test_cummax(self, datetime_frame):
+ datetime_frame.loc[5:10, 0] = np.nan
+ datetime_frame.loc[10:15, 1] = np.nan
+ datetime_frame.loc[15:, 2] = np.nan
+
+ # axis = 0
+ cummax = datetime_frame.cummax()
+ expected = datetime_frame.apply(Series.cummax)
+ tm.assert_frame_equal(cummax, expected)
+
+ # axis = 1
+ cummax = datetime_frame.cummax(axis=1)
+ expected = datetime_frame.apply(Series.cummax, axis=1)
+ tm.assert_frame_equal(cummax, expected)
+
+ # it works
+ df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
+ result = df.cummax() # noqa
+
+ # fix issue
+ cummax_xs = datetime_frame.cummax(axis=1)
+ assert np.shape(cummax_xs) == np.shape(datetime_frame)
+
+ def test_max(self, float_frame_with_na, int_frame,
+ float_frame, float_string_frame):
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", RuntimeWarning)
+ assert_stat_op_calc('max', np.max, float_frame_with_na,
+ check_dates=True)
+ assert_stat_op_calc('max', np.max, int_frame)
+ assert_stat_op_api('max', float_frame, float_string_frame)
+
+ def test_mad(self, float_frame_with_na, float_frame, float_string_frame):
+ f = lambda x: np.abs(x - x.mean()).mean()
+ assert_stat_op_calc('mad', f, float_frame_with_na)
+ assert_stat_op_api('mad', float_frame, float_string_frame)
+
+ def test_var_std(self, float_frame_with_na, datetime_frame, float_frame,
+ float_string_frame):
+ alt = lambda x: np.var(x, ddof=1)
+ assert_stat_op_calc('var', alt, float_frame_with_na)
+ assert_stat_op_api('var', float_frame, float_string_frame)
+
+ alt = lambda x: np.std(x, ddof=1)
+ assert_stat_op_calc('std', alt, float_frame_with_na)
+ assert_stat_op_api('std', float_frame, float_string_frame)
+
+ result = datetime_frame.std(ddof=4)
+ expected = datetime_frame.apply(lambda x: x.std(ddof=4))
+ tm.assert_almost_equal(result, expected)
+
+ result = datetime_frame.var(ddof=4)
+ expected = datetime_frame.apply(lambda x: x.var(ddof=4))
+ tm.assert_almost_equal(result, expected)
+
+ arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
+ result = nanops.nanvar(arr, axis=0)
+ assert not (result < 0).any()
+
+ with pd.option_context('use_bottleneck', False):
+ result = nanops.nanvar(arr, axis=0)
+ assert not (result < 0).any()
+
+ @pytest.mark.parametrize(
+ "meth", ['sem', 'var', 'std'])
+ def test_numeric_only_flag(self, meth):
+ # GH 9201
+ df1 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
+ # set one entry to a number in str format
+ df1.loc[0, 'foo'] = '100'
+
+ df2 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
+ # set one entry to a non-number str
+ df2.loc[0, 'foo'] = 'a'
+
+ result = getattr(df1, meth)(axis=1, numeric_only=True)
+ expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
+ tm.assert_series_equal(expected, result)
+
+ result = getattr(df2, meth)(axis=1, numeric_only=True)
+ expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
+ tm.assert_series_equal(expected, result)
+
+ # df1 has all numbers, df2 has a letter inside
+ pytest.raises(TypeError, lambda: getattr(df1, meth)(
+ axis=1, numeric_only=False))
+ pytest.raises(TypeError, lambda: getattr(df2, meth)(
+ axis=1, numeric_only=False))
+
+ @pytest.mark.parametrize('op', ['mean', 'std', 'var',
+ 'skew', 'kurt', 'sem'])
+ def test_mixed_ops(self, op):
+ # GH 16116
+ df = DataFrame({'int': [1, 2, 3, 4],
+ 'float': [1., 2., 3., 4.],
+ 'str': ['a', 'b', 'c', 'd']})
+
+ result = getattr(df, op)()
+ assert len(result) == 2
+
+ with pd.option_context('use_bottleneck', False):
+ result = getattr(df, op)()
+ assert len(result) == 2
+
+ def test_cumsum(self, datetime_frame):
+ datetime_frame.loc[5:10, 0] = np.nan
+ datetime_frame.loc[10:15, 1] = np.nan
+ datetime_frame.loc[15:, 2] = np.nan
+
+ # axis = 0
+ cumsum = datetime_frame.cumsum()
+ expected = datetime_frame.apply(Series.cumsum)
+ tm.assert_frame_equal(cumsum, expected)
+
+ # axis = 1
+ cumsum = datetime_frame.cumsum(axis=1)
+ expected = datetime_frame.apply(Series.cumsum, axis=1)
+ tm.assert_frame_equal(cumsum, expected)
+
+ # works
+ df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
+ result = df.cumsum() # noqa
+
+ # fix issue
+ cumsum_xs = datetime_frame.cumsum(axis=1)
+ assert np.shape(cumsum_xs) == np.shape(datetime_frame)
+
+ def test_cumprod(self, datetime_frame):
+ datetime_frame.loc[5:10, 0] = np.nan
+ datetime_frame.loc[10:15, 1] = np.nan
+ datetime_frame.loc[15:, 2] = np.nan
+
+ # axis = 0
+ cumprod = datetime_frame.cumprod()
+ expected = datetime_frame.apply(Series.cumprod)
+ tm.assert_frame_equal(cumprod, expected)
+
+ # axis = 1
+ cumprod = datetime_frame.cumprod(axis=1)
+ expected = datetime_frame.apply(Series.cumprod, axis=1)
+ tm.assert_frame_equal(cumprod, expected)
+
+ # fix issue
+ cumprod_xs = datetime_frame.cumprod(axis=1)
+ assert np.shape(cumprod_xs) == np.shape(datetime_frame)
+
+ # ints
+ df = datetime_frame.fillna(0).astype(int)
+ df.cumprod(0)
+ df.cumprod(1)
+
+ # ints32
+ df = datetime_frame.fillna(0).astype(np.int32)
+ df.cumprod(0)
+ df.cumprod(1)
+
+ def test_sem(self, float_frame_with_na, datetime_frame,
+ float_frame, float_string_frame):
+ alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
+ assert_stat_op_calc('sem', alt, float_frame_with_na)
+ assert_stat_op_api('sem', float_frame, float_string_frame)
+
+ result = datetime_frame.sem(ddof=4)
+ expected = datetime_frame.apply(
+ lambda x: x.std(ddof=4) / np.sqrt(len(x)))
+ tm.assert_almost_equal(result, expected)
+
+ arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
+ result = nanops.nansem(arr, axis=0)
+ assert not (result < 0).any()
+
+ with pd.option_context('use_bottleneck', False):
+ result = nanops.nansem(arr, axis=0)
+ assert not (result < 0).any()
+
+ @td.skip_if_no_scipy
+ def test_skew(self, float_frame_with_na, float_frame, float_string_frame):
+ from scipy.stats import skew
+
+ def alt(x):
+ if len(x) < 3:
+ return np.nan
+ return skew(x, bias=False)
+
+ assert_stat_op_calc('skew', alt, float_frame_with_na)
+ assert_stat_op_api('skew', float_frame, float_string_frame)
+
+ @td.skip_if_no_scipy
+ def test_kurt(self, float_frame_with_na, float_frame, float_string_frame):
+ from scipy.stats import kurtosis
+
+ def alt(x):
+ if len(x) < 4:
+ return np.nan
+ return kurtosis(x, bias=False)
+
+ assert_stat_op_calc('kurt', alt, float_frame_with_na)
+ assert_stat_op_api('kurt', float_frame, float_string_frame)
+
+ index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
+ codes=[[0, 0, 0, 0, 0, 0],
+ [0, 1, 2, 0, 1, 2],
+ [0, 1, 0, 1, 0, 1]])
+ df = DataFrame(np.random.randn(6, 3), index=index)
+
+ kurt = df.kurt()
+ kurt2 = df.kurt(level=0).xs('bar')
+ tm.assert_series_equal(kurt, kurt2, check_names=False)
+ assert kurt.name is None
+ assert kurt2.name == 'bar'
+
+ @pytest.mark.parametrize("dropna, expected", [
+ (True, {'A': [12],
+ 'B': [10.0],
+ 'C': [1.0],
+ 'D': ['a'],
+ 'E': Categorical(['a'], categories=['a']),
+ 'F': to_datetime(['2000-1-2']),
+ 'G': to_timedelta(['1 days'])}),
+ (False, {'A': [12],
+ 'B': [10.0],
+ 'C': [np.nan],
+ 'D': np.array([np.nan], dtype=object),
+ 'E': Categorical([np.nan], categories=['a']),
+ 'F': [pd.NaT],
+ 'G': to_timedelta([pd.NaT])}),
+ (True, {'H': [8, 9, np.nan, np.nan],
+ 'I': [8, 9, np.nan, np.nan],
+ 'J': [1, np.nan, np.nan, np.nan],
+ 'K': Categorical(['a', np.nan, np.nan, np.nan],
+ categories=['a']),
+ 'L': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']),
+ 'M': to_timedelta(['1 days', 'nan', 'nan', 'nan']),
+ 'N': [0, 1, 2, 3]}),
+ (False, {'H': [8, 9, np.nan, np.nan],
+ 'I': [8, 9, np.nan, np.nan],
+ 'J': [1, np.nan, np.nan, np.nan],
+ 'K': Categorical([np.nan, 'a', np.nan, np.nan],
+ categories=['a']),
+ 'L': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
+ 'M': to_timedelta(['nan', '1 days', 'nan', 'nan']),
+ 'N': [0, 1, 2, 3]})
+ ])
+ def test_mode_dropna(self, dropna, expected):
+
+ df = DataFrame({"A": [12, 12, 19, 11],
+ "B": [10, 10, np.nan, 3],
+ "C": [1, np.nan, np.nan, np.nan],
+ "D": [np.nan, np.nan, 'a', np.nan],
+ "E": Categorical([np.nan, np.nan, 'a', np.nan]),
+ "F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
+ "G": to_timedelta(['1 days', 'nan', 'nan', 'nan']),
+ "H": [8, 8, 9, 9],
+ "I": [9, 9, 8, 8],
+ "J": [1, 1, np.nan, np.nan],
+ "K": Categorical(['a', np.nan, 'a', np.nan]),
+ "L": to_datetime(['2000-1-2', '2000-1-2',
+ 'NaT', 'NaT']),
+ "M": to_timedelta(['1 days', 'nan',
+ '1 days', 'nan']),
+ "N": np.arange(4, dtype='int64')})
+
+ result = df[sorted(list(expected.keys()))].mode(dropna=dropna)
+ expected = DataFrame(expected)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.skipif(not compat.PY3, reason="only PY3")
+ def test_mode_sortwarning(self):
+ # Check for the warning that is raised when the mode
+ # results cannot be sorted
+
+ df = DataFrame({"A": [np.nan, np.nan, 'a', 'a']})
+ expected = DataFrame({'A': ['a', np.nan]})
+
+ with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+ result = df.mode(dropna=False)
+ result = result.sort_values(by='A').reset_index(drop=True)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_operators_timedelta64(self):
+ df = DataFrame(dict(A=date_range('2012-1-1', periods=3, freq='D'),
+ B=date_range('2012-1-2', periods=3, freq='D'),
+ C=Timestamp('20120101') -
+ timedelta(minutes=5, seconds=5)))
+
+ diffs = DataFrame(dict(A=df['A'] - df['C'],
+ B=df['A'] - df['B']))
+
+ # min
+ result = diffs.min()
+ assert result[0] == diffs.loc[0, 'A']
+ assert result[1] == diffs.loc[0, 'B']
+
+ result = diffs.min(axis=1)
+ assert (result == diffs.loc[0, 'B']).all()
+
+ # max
+ result = diffs.max()
+ assert result[0] == diffs.loc[2, 'A']
+ assert result[1] == diffs.loc[2, 'B']
+
+ result = diffs.max(axis=1)
+ assert (result == diffs['A']).all()
+
+ # abs
+ result = diffs.abs()
+ result2 = abs(diffs)
+ expected = DataFrame(dict(A=df['A'] - df['C'],
+ B=df['B'] - df['A']))
+ tm.assert_frame_equal(result, expected)
+ tm.assert_frame_equal(result2, expected)
+
+ # mixed frame
+ mixed = diffs.copy()
+ mixed['C'] = 'foo'
+ mixed['D'] = 1
+ mixed['E'] = 1.
+ mixed['F'] = Timestamp('20130101')
+
+ # results in an object array
+ result = mixed.min()
+ expected = Series([pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
+ pd.Timedelta(timedelta(days=-1)),
+ 'foo', 1, 1.0,
+ Timestamp('20130101')],
+ index=mixed.columns)
+ tm.assert_series_equal(result, expected)
+
+        # excludes non-numeric columns
+ result = mixed.min(axis=1)
+ expected = Series([1, 1, 1.], index=[0, 1, 2])
+ tm.assert_series_equal(result, expected)
+
+ # works when only those columns are selected
+ result = mixed[['A', 'B']].min(1)
+ expected = Series([timedelta(days=-1)] * 3)
+ tm.assert_series_equal(result, expected)
+
+ result = mixed[['A', 'B']].min()
+ expected = Series([timedelta(seconds=5 * 60 + 5),
+ timedelta(days=-1)], index=['A', 'B'])
+ tm.assert_series_equal(result, expected)
+
+ # GH 3106
+ df = DataFrame({'time': date_range('20130102', periods=5),
+ 'time2': date_range('20130105', periods=5)})
+ df['off1'] = df['time2'] - df['time']
+ assert df['off1'].dtype == 'timedelta64[ns]'
+
+ df['off2'] = df['time'] - df['time2']
+ df._consolidate_inplace()
+ assert df['off1'].dtype == 'timedelta64[ns]'
+ assert df['off2'].dtype == 'timedelta64[ns]'
+
+ def test_sum_corner(self, empty_frame):
+ axis0 = empty_frame.sum(0)
+ axis1 = empty_frame.sum(1)
+ assert isinstance(axis0, Series)
+ assert isinstance(axis1, Series)
+ assert len(axis0) == 0
+ assert len(axis1) == 0
+
+ @pytest.mark.parametrize('method, unit', [
+ ('sum', 0),
+ ('prod', 1),
+ ])
+ def test_sum_prod_nanops(self, method, unit):
+ idx = ['a', 'b', 'c']
+ df = pd.DataFrame({"a": [unit, unit],
+ "b": [unit, np.nan],
+ "c": [np.nan, np.nan]})
+        # The default is min_count=0
+        result = getattr(df, method)()
+        expected = pd.Series([unit, unit, unit], index=idx, dtype='float64')
+        tm.assert_series_equal(result, expected)
+
+ # min_count=1
+ result = getattr(df, method)(min_count=1)
+ expected = pd.Series([unit, unit, np.nan], index=idx)
+ tm.assert_series_equal(result, expected)
+
+ # min_count=0
+ result = getattr(df, method)(min_count=0)
+ expected = pd.Series([unit, unit, unit], index=idx, dtype='float64')
+ tm.assert_series_equal(result, expected)
+
+ result = getattr(df.iloc[1:], method)(min_count=1)
+ expected = pd.Series([unit, np.nan, np.nan], index=idx)
+ tm.assert_series_equal(result, expected)
+
+ # min_count > 1
+ df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})
+ result = getattr(df, method)(min_count=5)
+        expected = pd.Series([unit, unit], index=['A', 'B'], dtype='float64')
+ tm.assert_series_equal(result, expected)
+
+ result = getattr(df, method)(min_count=6)
+        expected = pd.Series([unit, np.nan], index=['A', 'B'])
+ tm.assert_series_equal(result, expected)
+
+ def test_sum_nanops_timedelta(self):
+ # prod isn't defined on timedeltas
+ idx = ['a', 'b', 'c']
+ df = pd.DataFrame({"a": [0, 0],
+ "b": [0, np.nan],
+ "c": [np.nan, np.nan]})
+
+ df2 = df.apply(pd.to_timedelta)
+
+ # 0 by default
+ result = df2.sum()
+ expected = pd.Series([0, 0, 0], dtype='m8[ns]', index=idx)
+ tm.assert_series_equal(result, expected)
+
+ # min_count=0
+ result = df2.sum(min_count=0)
+ tm.assert_series_equal(result, expected)
+
+ # min_count=1
+ result = df2.sum(min_count=1)
+ expected = pd.Series([0, 0, np.nan], dtype='m8[ns]', index=idx)
+ tm.assert_series_equal(result, expected)
+
+ def test_sum_object(self, float_frame):
+ values = float_frame.values.astype(int)
+ frame = DataFrame(values, index=float_frame.index,
+ columns=float_frame.columns)
+ deltas = frame * timedelta(1)
+ deltas.sum()
+
+ def test_sum_bool(self, float_frame):
+ # ensure this works, bug report
+ bools = np.isnan(float_frame)
+ bools.sum(1)
+ bools.sum(0)
+
+ def test_mean_corner(self, float_frame, float_string_frame):
+ # unit test when have object data
+ the_mean = float_string_frame.mean(axis=0)
+ the_sum = float_string_frame.sum(axis=0, numeric_only=True)
+ tm.assert_index_equal(the_sum.index, the_mean.index)
+ assert len(the_mean.index) < len(float_string_frame.columns)
+
+ # xs sum mixed type, just want to know it works...
+ the_mean = float_string_frame.mean(axis=1)
+ the_sum = float_string_frame.sum(axis=1, numeric_only=True)
+ tm.assert_index_equal(the_sum.index, the_mean.index)
+
+ # take mean of boolean column
+ float_frame['bool'] = float_frame['A'] > 0
+ means = float_frame.mean(0)
+ assert means['bool'] == float_frame['bool'].values.mean()
+
+ def test_stats_mixed_type(self, float_string_frame):
+ # don't blow up
+ float_string_frame.std(1)
+ float_string_frame.var(1)
+ float_string_frame.mean(1)
+ float_string_frame.skew(1)
+
+ # TODO: Ensure warning isn't emitted in the first place
+ @pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning")
+ def test_median_corner(self, int_frame, float_frame, float_string_frame):
+ def wrapper(x):
+ if isna(x).any():
+ return np.nan
+ return np.median(x)
+
+ assert_stat_op_calc('median', wrapper, int_frame, check_dtype=False,
+ check_dates=True)
+ assert_stat_op_api('median', float_frame, float_string_frame)
+
+ # Miscellanea
+
+ def test_count_objects(self, float_string_frame):
+ dm = DataFrame(float_string_frame._series)
+ df = DataFrame(float_string_frame._series)
+
+ tm.assert_series_equal(dm.count(), df.count())
+ tm.assert_series_equal(dm.count(1), df.count(1))
+
+ def test_cumsum_corner(self):
+ dm = DataFrame(np.arange(20).reshape(4, 5),
+ index=lrange(4), columns=lrange(5))
+ # ?(wesm)
+ result = dm.cumsum() # noqa
+
+ def test_sum_bools(self):
+ df = DataFrame(index=lrange(1), columns=lrange(10))
+ bools = isna(df)
+ assert bools.sum(axis=1)[0] == 10
+
+ # Index of max / min
+
+ def test_idxmin(self, float_frame, int_frame):
+ frame = float_frame
+ frame.loc[5:10] = np.nan
+ frame.loc[15:20, -2:] = np.nan
+ for skipna in [True, False]:
+ for axis in [0, 1]:
+ for df in [frame, int_frame]:
+ result = df.idxmin(axis=axis, skipna=skipna)
+ expected = df.apply(Series.idxmin, axis=axis,
+ skipna=skipna)
+ tm.assert_series_equal(result, expected)
+
+ pytest.raises(ValueError, frame.idxmin, axis=2)
+
+ def test_idxmax(self, float_frame, int_frame):
+ frame = float_frame
+ frame.loc[5:10] = np.nan
+ frame.loc[15:20, -2:] = np.nan
+ for skipna in [True, False]:
+ for axis in [0, 1]:
+ for df in [frame, int_frame]:
+ result = df.idxmax(axis=axis, skipna=skipna)
+ expected = df.apply(Series.idxmax, axis=axis,
+ skipna=skipna)
+ tm.assert_series_equal(result, expected)
+
+ pytest.raises(ValueError, frame.idxmax, axis=2)
+
+ # ----------------------------------------------------------------------
+ # Logical reductions
+
+ @pytest.mark.parametrize('opname', ['any', 'all'])
+ def test_any_all(self, opname, bool_frame_with_na, float_string_frame):
+ assert_bool_op_calc(opname, getattr(np, opname), bool_frame_with_na,
+ has_skipna=True)
+ assert_bool_op_api(opname, bool_frame_with_na, float_string_frame,
+ has_bool_only=True)
+
+ def test_any_all_extra(self):
+ df = DataFrame({
+ 'A': [True, False, False],
+ 'B': [True, True, False],
+ 'C': [True, True, True],
+ }, index=['a', 'b', 'c'])
+ result = df[['A', 'B']].any(1)
+ expected = Series([True, True, False], index=['a', 'b', 'c'])
+ tm.assert_series_equal(result, expected)
+
+ result = df[['A', 'B']].any(1, bool_only=True)
+ tm.assert_series_equal(result, expected)
+
+ result = df.all(1)
+ expected = Series([True, False, False], index=['a', 'b', 'c'])
+ tm.assert_series_equal(result, expected)
+
+ result = df.all(1, bool_only=True)
+ tm.assert_series_equal(result, expected)
+
+ # Axis is None
+ result = df.all(axis=None).item()
+ assert result is False
+
+ result = df.any(axis=None).item()
+ assert result is True
+
+ result = df[['C']].all(axis=None).item()
+ assert result is True
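+ # (axis=None reduces over both axes and returns a scalar; .item()
+ # converts it to a plain Python bool)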
+
+ def test_any_datetime(self):
+
+ # GH 23070
+ float_data = [1, np.nan, 3, np.nan]
+ datetime_data = [pd.Timestamp('1960-02-15'),
+ pd.Timestamp('1960-02-16'),
+ pd.NaT,
+ pd.NaT]
+ df = DataFrame({
+ "A": float_data,
+ "B": datetime_data
+ })
+
+ result = df.any(1)
+ expected = Series([True, True, True, False])
+ tm.assert_series_equal(result, expected)
+
+ def test_any_all_bool_only(self):
+
+ # GH 25101
+ df = DataFrame({"col1": [1, 2, 3],
+ "col2": [4, 5, 6],
+ "col3": [None, None, None]})
+
+ result = df.all(bool_only=True)
+ expected = Series(dtype=np.bool)
+ tm.assert_series_equal(result, expected)
+
+ df = DataFrame({"col1": [1, 2, 3],
+ "col2": [4, 5, 6],
+ "col3": [None, None, None],
+ "col4": [False, False, True]})
+
+ result = df.all(bool_only=True)
+ expected = Series({"col4": False})
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('func, data, expected', [
+ (np.any, {}, False),
+ (np.all, {}, True),
+ (np.any, {'A': []}, False),
+ (np.all, {'A': []}, True),
+ (np.any, {'A': [False, False]}, False),
+ (np.all, {'A': [False, False]}, False),
+ (np.any, {'A': [True, False]}, True),
+ (np.all, {'A': [True, False]}, False),
+ (np.any, {'A': [True, True]}, True),
+ (np.all, {'A': [True, True]}, True),
+
+ (np.any, {'A': [False], 'B': [False]}, False),
+ (np.all, {'A': [False], 'B': [False]}, False),
+
+ (np.any, {'A': [False, False], 'B': [False, True]}, True),
+ (np.all, {'A': [False, False], 'B': [False, True]}, False),
+
+ # other types
+ (np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False),
+ (np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True),
+ (np.all, {'A': pd.Series([0, 1], dtype=int)}, False),
+ (np.any, {'A': pd.Series([0, 1], dtype=int)}, True),
+ pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False,
+ marks=[td.skip_if_np_lt_115]),
+ pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True,
+ marks=[td.skip_if_np_lt_115]),
+ pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True,
+ marks=[td.skip_if_np_lt_115]),
+ pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True,
+ marks=[td.skip_if_np_lt_115]),
+ pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False,
+ marks=[td.skip_if_np_lt_115]),
+ pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True,
+ marks=[td.skip_if_np_lt_115]),
+ pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True,
+ marks=[td.skip_if_np_lt_115]),
+ pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True,
+ marks=[td.skip_if_np_lt_115]),
+ (np.all, {'A': pd.Series([0, 1], dtype='category')}, False),
+ (np.any, {'A': pd.Series([0, 1], dtype='category')}, True),
+ (np.all, {'A': pd.Series([1, 2], dtype='category')}, True),
+ (np.any, {'A': pd.Series([1, 2], dtype='category')}, True),
+
+ # # Mix
+ # GH 21484
+ # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'),
+ # 'B': pd.Series([10, 20], dtype='m8[ns]')}, True),
+ ])
+ def test_any_all_np_func(self, func, data, expected):
+ # GH 19976
+ data = DataFrame(data)
+ result = func(data)
+ assert isinstance(result, np.bool_)
+ assert result.item() is expected
+
+ # method version
+ result = getattr(DataFrame(data), func.__name__)(axis=None)
+ assert isinstance(result, np.bool_)
+ assert result.item() is expected
+
+ def test_any_all_object(self):
+ # GH 19976
+ result = np.all(DataFrame(columns=['a', 'b'])).item()
+ assert result is True
+
+ result = np.any(DataFrame(columns=['a', 'b'])).item()
+ assert result is False
+
+ @pytest.mark.parametrize('method', ['any', 'all'])
+ def test_any_all_level_axis_none_raises(self, method):
+ df = DataFrame(
+ {"A": 1},
+ index=MultiIndex.from_product([['A', 'B'], ['a', 'b']],
+ names=['out', 'in'])
+ )
+ xpr = "Must specify 'axis' when aggregating by level."
+ with pytest.raises(ValueError, match=xpr):
+ getattr(df, method)(axis=None, level='out')
+
+ # ----------------------------------------------------------------------
+ # Isin
+
+ def test_isin(self):
+ # GH 4211
+ df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
+ 'ids2': ['a', 'n', 'c', 'n']},
+ index=['foo', 'bar', 'baz', 'qux'])
+ other = ['a', 'b', 'c']
+
+ result = df.isin(other)
+ expected = DataFrame([df.loc[s].isin(other) for s in df.index])
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("empty", [[], Series(), np.array([])])
+ def test_isin_empty(self, empty):
+ # GH 16991
+ df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
+ expected = DataFrame(False, df.index, df.columns)
+
+ result = df.isin(empty)
+ tm.assert_frame_equal(result, expected)
+
+ def test_isin_dict(self):
+ df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
+ d = {'A': ['a']}
+
+ expected = DataFrame(False, df.index, df.columns)
+ expected.loc[0, 'A'] = True
+
+ result = df.isin(d)
+ tm.assert_frame_equal(result, expected)
+
+ # non unique columns
+ df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
+ df.columns = ['A', 'A']
+ expected = DataFrame(False, df.index, df.columns)
+ expected.loc[0, 'A'] = True
+ result = df.isin(d)
+ tm.assert_frame_equal(result, expected)
+
+ def test_isin_with_string_scalar(self):
+ # GH 4763
+ df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
+ 'ids2': ['a', 'n', 'c', 'n']},
+ index=['foo', 'bar', 'baz', 'qux'])
+ with pytest.raises(TypeError):
+ df.isin('a')
+
+ with pytest.raises(TypeError):
+ df.isin('aaa')
+
+ def test_isin_df(self):
+ df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
+ df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]})
+ expected = DataFrame(False, df1.index, df1.columns)
+ result = df1.isin(df2)
+ expected['A'].loc[[1, 3]] = True
+ expected['B'].loc[[0, 2]] = True
+ tm.assert_frame_equal(result, expected)
+
+ # partial overlapping columns
+ df2.columns = ['A', 'C']
+ result = df1.isin(df2)
+ expected['B'] = False
+ tm.assert_frame_equal(result, expected)
+
+ def test_isin_tuples(self):
+ # GH 16394
+ df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
+ df['C'] = list(zip(df['A'], df['B']))
+ result = df['C'].isin([(1, 'a')])
+ tm.assert_series_equal(result,
+ Series([True, False, False], name="C"))
+
+ def test_isin_df_dupe_values(self):
+ df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
+ # just cols duped
+ df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]],
+ columns=['B', 'B'])
+ with pytest.raises(ValueError):
+ df1.isin(df2)
+
+ # just index duped
+ df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]],
+ columns=['A', 'B'], index=[0, 0, 1, 1])
+ with pytest.raises(ValueError):
+ df1.isin(df2)
+
+ # cols and index:
+ df2.columns = ['B', 'B']
+ with pytest.raises(ValueError):
+ df1.isin(df2)
+
+ def test_isin_dupe_self(self):
+ other = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]})
+ df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A', 'A'])
+ result = df.isin(other)
+ expected = DataFrame(False, index=df.index, columns=df.columns)
+ expected.loc[0] = True
+ expected.iloc[1, 1] = True
+ tm.assert_frame_equal(result, expected)
+
+ def test_isin_against_series(self):
+ df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]},
+ index=['a', 'b', 'c', 'd'])
+ s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd'])
+ expected = DataFrame(False, index=df.index, columns=df.columns)
+ expected['A'].loc['a'] = True
+ expected.loc['d'] = True
+ result = df.isin(s)
+ tm.assert_frame_equal(result, expected)
+
+ def test_isin_multiIndex(self):
+ idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'),
+ (0, 'b', 'bar'), (0, 'b', 'baz'),
+ (2, 'a', 'foo'), (2, 'a', 'bar'),
+ (2, 'c', 'bar'), (2, 'c', 'baz'),
+ (1, 'b', 'foo'), (1, 'b', 'bar'),
+ (1, 'c', 'bar'), (1, 'c', 'baz')])
+ df1 = DataFrame({'A': np.ones(12),
+ 'B': np.zeros(12)}, index=idx)
+ df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
+ 'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]})
+ # against regular index
+ expected = DataFrame(False, index=df1.index, columns=df1.columns)
+ result = df1.isin(df2)
+ tm.assert_frame_equal(result, expected)
+
+ df2.index = idx
+ expected = df2.values.astype(np.bool)
+ expected[:, 1] = ~expected[:, 1]
+ expected = DataFrame(expected, columns=['A', 'B'], index=idx)
+
+ result = df1.isin(df2)
+ tm.assert_frame_equal(result, expected)
+
+ def test_isin_empty_datetimelike(self):
+ # GH 15473
+ df1_ts = DataFrame({'date':
+ pd.to_datetime(['2014-01-01', '2014-01-02'])})
+ df1_td = DataFrame({'date':
+ [pd.Timedelta(1, 's'), pd.Timedelta(2, 's')]})
+ df2 = DataFrame({'date': []})
+ df3 = DataFrame()
+
+ expected = DataFrame({'date': [False, False]})
+
+ result = df1_ts.isin(df2)
+ tm.assert_frame_equal(result, expected)
+ result = df1_ts.isin(df3)
+ tm.assert_frame_equal(result, expected)
+
+ result = df1_td.isin(df2)
+ tm.assert_frame_equal(result, expected)
+ result = df1_td.isin(df3)
+ tm.assert_frame_equal(result, expected)
+
+ # Rounding
+ def test_round(self):
+ # GH 2665
+
+ # Test that rounding an empty DataFrame does nothing
+ df = DataFrame()
+ tm.assert_frame_equal(df, df.round())
+
+ # Here's the test frame we'll be working with
+ df = DataFrame({'col1': [1.123, 2.123, 3.123],
+ 'col2': [1.234, 2.234, 3.234]})
+
+ # Default round to integer (i.e. decimals=0)
+ expected_rounded = DataFrame(
+ {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]})
+ tm.assert_frame_equal(df.round(), expected_rounded)
+
+ # Round with an integer
+ decimals = 2
+ expected_rounded = DataFrame({'col1': [1.12, 2.12, 3.12],
+ 'col2': [1.23, 2.23, 3.23]})
+ tm.assert_frame_equal(df.round(decimals), expected_rounded)
+
+ # This should also work with np.round (since np.round dispatches to
+ # df.round)
+ tm.assert_frame_equal(np.round(df, decimals), expected_rounded)
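+ # (np.round has no native DataFrame path; it defers to the object's
+ # own ``round`` method, which is why index and columns are preserved)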
+
+ # Round with a list
+ round_list = [1, 2]
+ with pytest.raises(TypeError):
+ df.round(round_list)
+
+ # Round with a dictionary
+ expected_rounded = DataFrame(
+ {'col1': [1.1, 2.1, 3.1], 'col2': [1.23, 2.23, 3.23]})
+ round_dict = {'col1': 1, 'col2': 2}
+ tm.assert_frame_equal(df.round(round_dict), expected_rounded)
+
+ # Incomplete dict
+ expected_partially_rounded = DataFrame(
+ {'col1': [1.123, 2.123, 3.123], 'col2': [1.2, 2.2, 3.2]})
+ partial_round_dict = {'col2': 1}
+ tm.assert_frame_equal(df.round(partial_round_dict),
+ expected_partially_rounded)
+
+ # Dict with unknown elements
+ wrong_round_dict = {'col3': 2, 'col2': 1}
+ tm.assert_frame_equal(df.round(wrong_round_dict),
+ expected_partially_rounded)
+
+ # float input to `decimals`
+ non_int_round_dict = {'col1': 1, 'col2': 0.5}
+ with pytest.raises(TypeError):
+ df.round(non_int_round_dict)
+
+ # String input
+ non_int_round_dict = {'col1': 1, 'col2': 'foo'}
+ with pytest.raises(TypeError):
+ df.round(non_int_round_dict)
+
+ non_int_round_Series = Series(non_int_round_dict)
+ with pytest.raises(TypeError):
+ df.round(non_int_round_Series)
+
+ # List input
+ non_int_round_dict = {'col1': 1, 'col2': [1, 2]}
+ with pytest.raises(TypeError):
+ df.round(non_int_round_dict)
+
+ non_int_round_Series = Series(non_int_round_dict)
+ with pytest.raises(TypeError):
+ df.round(non_int_round_Series)
+
+ # Non integer Series inputs
+ non_int_round_Series = Series(non_int_round_dict)
+ with pytest.raises(TypeError):
+ df.round(non_int_round_Series)
+
+ non_int_round_Series = Series(non_int_round_dict)
+ with pytest.raises(TypeError):
+ df.round(non_int_round_Series)
+
+ # Negative numbers
+ negative_round_dict = {'col1': -1, 'col2': -2}
+ big_df = df * 100
+ expected_neg_rounded = DataFrame(
+ {'col1': [110., 210, 310], 'col2': [100., 200, 300]})
+ tm.assert_frame_equal(big_df.round(negative_round_dict),
+ expected_neg_rounded)
+
+ # nan in Series round
+ nan_round_Series = Series({'col1': np.nan, 'col2': 1})
+
+ # TODO(wesm): unused?
+ expected_nan_round = DataFrame({ # noqa
+ 'col1': [1.123, 2.123, 3.123],
+ 'col2': [1.2, 2.2, 3.2]})
+
+ with pytest.raises(TypeError):
+ df.round(nan_round_Series)
+
+ # Make sure this doesn't break existing Series.round
+ tm.assert_series_equal(df['col1'].round(1), expected_rounded['col1'])
+
+ # named columns
+ # GH 11986
+ decimals = 2
+ expected_rounded = DataFrame(
+ {'col1': [1.12, 2.12, 3.12], 'col2': [1.23, 2.23, 3.23]})
+ df.columns.name = "cols"
+ expected_rounded.columns.name = "cols"
+ tm.assert_frame_equal(df.round(decimals), expected_rounded)
+
+ # interaction of named columns & series
+ tm.assert_series_equal(df['col1'].round(decimals),
+ expected_rounded['col1'])
+ tm.assert_series_equal(df.round(decimals)['col1'],
+ expected_rounded['col1'])
+
+ def test_numpy_round(self):
+ # GH 12600
+ df = DataFrame([[1.53, 1.36], [0.06, 7.01]])
+ out = np.round(df, decimals=0)
+ expected = DataFrame([[2., 1.], [0., 7.]])
+ tm.assert_frame_equal(out, expected)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.round(df, decimals=0, out=df)
+
+ def test_round_mixed_type(self):
+ # GH 11885
+ df = DataFrame({'col1': [1.1, 2.2, 3.3, 4.4],
+ 'col2': ['1', 'a', 'c', 'f'],
+ 'col3': date_range('20111111', periods=4)})
+ round_0 = DataFrame({'col1': [1., 2., 3., 4.],
+ 'col2': ['1', 'a', 'c', 'f'],
+ 'col3': date_range('20111111', periods=4)})
+ tm.assert_frame_equal(df.round(), round_0)
+ tm.assert_frame_equal(df.round(1), df)
+ tm.assert_frame_equal(df.round({'col1': 1}), df)
+ tm.assert_frame_equal(df.round({'col1': 0}), round_0)
+ tm.assert_frame_equal(df.round({'col1': 0, 'col2': 1}), round_0)
+ tm.assert_frame_equal(df.round({'col3': 1}), df)
+
+ def test_round_issue(self):
+ # GH 11611
+
+ df = pd.DataFrame(np.random.random([3, 3]), columns=['A', 'B', 'C'],
+ index=['first', 'second', 'third'])
+
+ dfs = pd.concat((df, df), axis=1)
+ rounded = dfs.round()
+ tm.assert_index_equal(rounded.index, dfs.index)
+
+ decimals = pd.Series([1, 0, 2], index=['A', 'B', 'A'])
+ pytest.raises(ValueError, df.round, decimals)
+
+ def test_built_in_round(self):
+ if not compat.PY3:
+ pytest.skip("build in round cannot be overridden "
+ "prior to Python 3")
+
+ # GH 11763
+ # Here's the test frame we'll be working with
+ df = DataFrame(
+ {'col1': [1.123, 2.123, 3.123], 'col2': [1.234, 2.234, 3.234]})
+
+ # Default round to integer (i.e. decimals=0)
+ expected_rounded = DataFrame(
+ {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]})
+ tm.assert_frame_equal(round(df), expected_rounded)
+
+ def test_round_nonunique_categorical(self):
+ # See GH21809
+ idx = pd.CategoricalIndex(['low'] * 3 + ['hi'] * 3)
+ df = pd.DataFrame(np.random.rand(6, 3), columns=list('abc'))
+
+ expected = df.round(3)
+ expected.index = idx
+
+ df_categorical = df.copy().set_index(idx)
+ assert df_categorical.shape == (6, 3)
+ result = df_categorical.round(3)
+ assert result.shape == (6, 3)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_pct_change(self):
+ # GH 11150
+ pnl = DataFrame([np.arange(0, 40, 10),
+ np.arange(0, 40, 10),
+ np.arange(0, 40, 10)]).astype(np.float64)
+ pnl.iat[1, 0] = np.nan
+ pnl.iat[1, 1] = np.nan
+ pnl.iat[2, 3] = 60
+
+ for axis in range(2):
+ expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(
+ axis=axis) - 1
+ result = pnl.pct_change(axis=axis, fill_method='pad')
+
+ tm.assert_frame_equal(result, expected)
+
+ # Clip
+ def test_clip(self, float_frame):
+ median = float_frame.median().median()
+ original = float_frame.copy()
+
+ with tm.assert_produces_warning(FutureWarning):
+ capped = float_frame.clip_upper(median)
+ assert not (capped.values > median).any()
+
+ with tm.assert_produces_warning(FutureWarning):
+ floored = float_frame.clip_lower(median)
+ assert not (floored.values < median).any()
+
+ double = float_frame.clip(upper=median, lower=median)
+ assert not (double.values != median).any()
+
+ # Verify that float_frame was not changed inplace
+ assert (float_frame.values == original.values).all()
+
+ def test_inplace_clip(self, float_frame):
+ # GH 15388
+ median = float_frame.median().median()
+ frame_copy = float_frame.copy()
+
+ with tm.assert_produces_warning(FutureWarning):
+ frame_copy.clip_upper(median, inplace=True)
+ assert not (frame_copy.values > median).any()
+ frame_copy = float_frame.copy()
+
+ with tm.assert_produces_warning(FutureWarning):
+ frame_copy.clip_lower(median, inplace=True)
+ assert not (frame_copy.values < median).any()
+ frame_copy = float_frame.copy()
+
+ frame_copy.clip(upper=median, lower=median, inplace=True)
+ assert not (frame_copy.values != median).any()
+
+ def test_dataframe_clip(self):
+ # GH 2747
+ df = DataFrame(np.random.randn(1000, 2))
+
+ for lb, ub in [(-1, 1), (1, -1)]:
+ clipped_df = df.clip(lb, ub)
+
+ lb, ub = min(lb, ub), max(ub, lb)
+ lb_mask = df.values <= lb
+ ub_mask = df.values >= ub
+ mask = ~lb_mask & ~ub_mask
+ assert (clipped_df.values[lb_mask] == lb).all()
+ assert (clipped_df.values[ub_mask] == ub).all()
+ assert (clipped_df.values[mask] == df.values[mask]).all()
+
+ def test_clip_mixed_numeric(self):
+ # TODO(jreback)
+ # clip on mixed integer or floats
+ # with integer clippers coerces to float
+ df = DataFrame({'A': [1, 2, 3],
+ 'B': [1., np.nan, 3.]})
+ result = df.clip(1, 2)
+ expected = DataFrame({'A': [1, 2, 2],
+ 'B': [1., np.nan, 2.]})
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ # GH 24162, clipping now preserves numeric types per column
+ df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]],
+ columns=['foo', 'bar', 'baz'])
+ expected = df.dtypes
+ result = df.clip(upper=3).dtypes
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("inplace", [True, False])
+ def test_clip_against_series(self, inplace):
+ # GH 6966
+
+ df = DataFrame(np.random.randn(1000, 2))
+ lb = Series(np.random.randn(1000))
+ ub = lb + 1
+
+ original = df.copy()
+ clipped_df = df.clip(lb, ub, axis=0, inplace=inplace)
+
+ if inplace:
+ clipped_df = df
+
+ for i in range(2):
+ lb_mask = original.iloc[:, i] <= lb
+ ub_mask = original.iloc[:, i] >= ub
+ mask = ~lb_mask & ~ub_mask
+
+ result = clipped_df.loc[lb_mask, i]
+ tm.assert_series_equal(result, lb[lb_mask], check_names=False)
+ assert result.name == i
+
+ result = clipped_df.loc[ub_mask, i]
+ tm.assert_series_equal(result, ub[ub_mask], check_names=False)
+ assert result.name == i
+
+ tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i])
+
+ @pytest.mark.parametrize("inplace", [True, False])
+ @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])])
+ @pytest.mark.parametrize("axis,res", [
+ (0, [[2., 2., 3.], [4., 5., 6.], [7., 7., 7.]]),
+ (1, [[2., 3., 4.], [4., 5., 6.], [5., 6., 7.]])
+ ])
+ def test_clip_against_list_like(self, simple_frame,
+ inplace, lower, axis, res):
+ # GH 15390
+ original = simple_frame.copy(deep=True)
+
+ result = original.clip(lower=lower, upper=[5, 6, 7],
+ axis=axis, inplace=inplace)
+
+ expected = pd.DataFrame(res,
+ columns=original.columns,
+ index=original.index)
+ if inplace:
+ result = original
+ tm.assert_frame_equal(result, expected, check_exact=True)
+
+ @pytest.mark.parametrize("axis", [0, 1, None])
+ def test_clip_against_frame(self, axis):
+ df = DataFrame(np.random.randn(1000, 2))
+ lb = DataFrame(np.random.randn(1000, 2))
+ ub = lb + 1
+
+ clipped_df = df.clip(lb, ub, axis=axis)
+
+ lb_mask = df <= lb
+ ub_mask = df >= ub
+ mask = ~lb_mask & ~ub_mask
+
+ tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
+ tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
+ tm.assert_frame_equal(clipped_df[mask], df[mask])
+
+ def test_clip_against_unordered_columns(self):
+ # GH 20911
+ df1 = DataFrame(np.random.randn(1000, 4), columns=['A', 'B', 'C', 'D'])
+ df2 = DataFrame(np.random.randn(1000, 4), columns=['D', 'A', 'B', 'C'])
+ df3 = DataFrame(df2.values - 1, columns=['B', 'D', 'C', 'A'])
+ result_upper = df1.clip(lower=0, upper=df2)
+ expected_upper = df1.clip(lower=0, upper=df2[df1.columns])
+ result_lower = df1.clip(lower=df3, upper=3)
+ expected_lower = df1.clip(lower=df3[df1.columns], upper=3)
+ result_lower_upper = df1.clip(lower=df3, upper=df2)
+ expected_lower_upper = df1.clip(lower=df3[df1.columns],
+ upper=df2[df1.columns])
+ tm.assert_frame_equal(result_upper, expected_upper)
+ tm.assert_frame_equal(result_lower, expected_lower)
+ tm.assert_frame_equal(result_lower_upper, expected_lower_upper)
+
+ def test_clip_with_na_args(self, float_frame):
+ """Should process np.nan argument as None """
+ # GH 17276
+ tm.assert_frame_equal(float_frame.clip(np.nan), float_frame)
+ tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan),
+ float_frame)
+
+ # GH 19992
+ df = DataFrame({'col_0': [1, 2, 3], 'col_1': [4, 5, 6],
+ 'col_2': [7, 8, 9]})
+
+ result = df.clip(lower=[4, 5, np.nan], axis=0)
+ expected = DataFrame({'col_0': [4, 5, np.nan], 'col_1': [4, 5, np.nan],
+ 'col_2': [7, 8, np.nan]})
+ tm.assert_frame_equal(result, expected)
+
+ result = df.clip(lower=[4, 5, np.nan], axis=1)
+ expected = DataFrame({'col_0': [4, 4, 4], 'col_1': [5, 5, 6],
+ 'col_2': [np.nan, np.nan, np.nan]})
+ tm.assert_frame_equal(result, expected)
+
+ # Matrix-like
+ def test_dot(self):
+ a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'],
+ columns=['p', 'q', 'r', 's'])
+ b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'],
+ columns=['one', 'two'])
+
+ result = a.dot(b)
+ expected = DataFrame(np.dot(a.values, b.values),
+ index=['a', 'b', 'c'],
+ columns=['one', 'two'])
+ # Check alignment
+ b1 = b.reindex(index=reversed(b.index))
+ result = a.dot(b1)
+ tm.assert_frame_equal(result, expected)
+
+ # Check series argument
+ result = a.dot(b['one'])
+ tm.assert_series_equal(result, expected['one'], check_names=False)
+ assert result.name is None
+
+ result = a.dot(b1['one'])
+ tm.assert_series_equal(result, expected['one'], check_names=False)
+ assert result.name is None
+
+ # can pass correct-length arrays
+ row = a.iloc[0].values
+
+ result = a.dot(row)
+ expected = a.dot(a.iloc[0])
+ tm.assert_series_equal(result, expected)
+
+ with pytest.raises(ValueError, match='Dot product shape mismatch'):
+ a.dot(row[:-1])
+
+ a = np.random.rand(1, 5)
+ b = np.random.rand(5, 1)
+ A = DataFrame(a)
+
+ # TODO(wesm): unused
+ B = DataFrame(b) # noqa
+
+ # it works
+ result = A.dot(b)
+
+ # unaligned
+ df = DataFrame(np.random.randn(3, 4),
+ index=[1, 2, 3], columns=lrange(4))
+ df2 = DataFrame(np.random.randn(5, 3),
+ index=lrange(5), columns=[1, 2, 3])
+
+ with pytest.raises(ValueError, match='aligned'):
+ df.dot(df2)
+
+ @pytest.mark.skipif(not PY35,
+ reason='matmul supported for Python>=3.5')
+ def test_matmul(self):
+ # matmul test is for GH 10259
+ a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'],
+ columns=['p', 'q', 'r', 's'])
+ b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'],
+ columns=['one', 'two'])
+
+ # DataFrame @ DataFrame
+ result = operator.matmul(a, b)
+ expected = DataFrame(np.dot(a.values, b.values),
+ index=['a', 'b', 'c'],
+ columns=['one', 'two'])
+ tm.assert_frame_equal(result, expected)
+
+ # DataFrame @ Series
+ result = operator.matmul(a, b.one)
+ expected = Series(np.dot(a.values, b.one.values),
+ index=['a', 'b', 'c'])
+ tm.assert_series_equal(result, expected)
+
+ # np.array @ DataFrame
+ result = operator.matmul(a.values, b)
+ assert isinstance(result, DataFrame)
+ assert result.columns.equals(b.columns)
+ assert result.index.equals(pd.Index(range(3)))
+ expected = np.dot(a.values, b.values)
+ tm.assert_almost_equal(result.values, expected)
+
+ # nested list @ DataFrame (__rmatmul__)
+ result = operator.matmul(a.values.tolist(), b)
+ expected = DataFrame(np.dot(a.values, b.values),
+ index=['a', 'b', 'c'],
+ columns=['one', 'two'])
+ tm.assert_almost_equal(result.values, expected.values)
+
+ # mixed dtype DataFrame @ DataFrame
+ a['q'] = a.q.round().astype(int)
+ result = operator.matmul(a, b)
+ expected = DataFrame(np.dot(a.values, b.values),
+ index=['a', 'b', 'c'],
+ columns=['one', 'two'])
+ tm.assert_frame_equal(result, expected)
+
+ # different dtypes DataFrame @ DataFrame
+ a = a.astype(int)
+ result = operator.matmul(a, b)
+ expected = DataFrame(np.dot(a.values, b.values),
+ index=['a', 'b', 'c'],
+ columns=['one', 'two'])
+ tm.assert_frame_equal(result, expected)
+
+ # unaligned
+ df = DataFrame(np.random.randn(3, 4),
+ index=[1, 2, 3], columns=lrange(4))
+ df2 = DataFrame(np.random.randn(5, 3),
+ index=lrange(5), columns=[1, 2, 3])
+
+ with pytest.raises(ValueError, match='aligned'):
+ operator.matmul(df, df2)
+
+
+@pytest.fixture
+def df_duplicates():
+ return pd.DataFrame({'a': [1, 2, 3, 4, 4],
+ 'b': [1, 1, 1, 1, 1],
+ 'c': [0, 1, 2, 5, 4]},
+ index=[0, 0, 1, 1, 1])
+
+
+@pytest.fixture
+def df_strings():
+ return pd.DataFrame({'a': np.random.permutation(10),
+ 'b': list(ascii_lowercase[:10]),
+ 'c': np.random.permutation(10).astype('float64')})
+
+
+@pytest.fixture
+def df_main_dtypes():
+ return pd.DataFrame(
+ {'group': [1, 1, 2],
+ 'int': [1, 2, 3],
+ 'float': [4., 5., 6.],
+ 'string': list('abc'),
+ 'category_string': pd.Series(list('abc')).astype('category'),
+ 'category_int': [7, 8, 9],
+ 'datetime': pd.date_range('20130101', periods=3),
+ 'datetimetz': pd.date_range('20130101',
+ periods=3,
+ tz='US/Eastern'),
+ 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')},
+ columns=['group', 'int', 'float', 'string',
+ 'category_string', 'category_int',
+ 'datetime', 'datetimetz',
+ 'timedelta'])
+
+
+class TestNLargestNSmallest(object):
+
+ dtype_error_msg_template = ("Column {column!r} has dtype {dtype}, cannot "
+ "use method {method!r} with this dtype")
+
+ # ----------------------------------------------------------------------
+ # Top / bottom
+ @pytest.mark.parametrize('order', [
+ ['a'],
+ ['c'],
+ ['a', 'b'],
+ ['a', 'c'],
+ ['b', 'a'],
+ ['b', 'c'],
+ ['a', 'b', 'c'],
+ ['c', 'a', 'b'],
+ ['c', 'b', 'a'],
+ ['b', 'c', 'a'],
+ ['b', 'a', 'c'],
+
+ # dups!
+ ['b', 'c', 'c']])
+ @pytest.mark.parametrize('n', range(1, 11))
+ def test_n(self, df_strings, nselect_method, n, order):
+ # GH 10393
+ df = df_strings
+ if 'b' in order:
+
+ error_msg = self.dtype_error_msg_template.format(
+ column='b', method=nselect_method, dtype='object')
+ with pytest.raises(TypeError, match=error_msg):
+ getattr(df, nselect_method)(n, order)
+ else:
+ ascending = nselect_method == 'nsmallest'
+ result = getattr(df, nselect_method)(n, order)
+ expected = df.sort_values(order, ascending=ascending).head(n)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('columns', [
+ ['group', 'category_string'], ['group', 'string']])
+ def test_n_error(self, df_main_dtypes, nselect_method, columns):
+ df = df_main_dtypes
+ col = columns[1]
+ error_msg = self.dtype_error_msg_template.format(
+ column=col, method=nselect_method, dtype=df[col].dtype)
+ # escape some characters that may be in the repr
+ error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)")
+ .replace("[", "\\[").replace("]", "\\]"))
+ with pytest.raises(TypeError, match=error_msg):
+ getattr(df, nselect_method)(2, columns)
+
+ def test_n_all_dtypes(self, df_main_dtypes):
+ df = df_main_dtypes
+ df.nsmallest(2, list(set(df) - {'category_string', 'string'}))
+ df.nlargest(2, list(set(df) - {'category_string', 'string'}))
+
+ @pytest.mark.parametrize('method,expected', [
+ ('nlargest',
+ pd.DataFrame({'a': [2, 2, 2, 1], 'b': [3, 2, 1, 3]},
+ index=[2, 1, 0, 3])),
+ ('nsmallest',
+ pd.DataFrame({'a': [1, 1, 1, 2], 'b': [1, 2, 3, 1]},
+ index=[5, 4, 3, 0]))])
+ def test_duplicates_on_starter_columns(self, method, expected):
+ # regression test for #22752
+
+ df = pd.DataFrame({
+ 'a': [2, 2, 2, 1, 1, 1],
+ 'b': [1, 2, 3, 3, 2, 1]
+ })
+
+ result = getattr(df, method)(4, columns=['a', 'b'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_n_identical_values(self):
+ # GH 15297
+ df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]})
+
+ result = df.nlargest(3, 'a')
+ expected = pd.DataFrame(
+ {'a': [1] * 3, 'b': [1, 2, 3]}, index=[0, 1, 2]
+ )
+ tm.assert_frame_equal(result, expected)
+
+ result = df.nsmallest(3, 'a')
+ expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('order', [
+ ['a', 'b', 'c'],
+ ['c', 'b', 'a'],
+ ['a'],
+ ['b'],
+ ['a', 'b'],
+ ['c', 'b']])
+ @pytest.mark.parametrize('n', range(1, 6))
+ def test_n_duplicate_index(self, df_duplicates, n, order):
+ # GH 13412
+
+ df = df_duplicates
+ result = df.nsmallest(n, order)
+ expected = df.sort_values(order).head(n)
+ tm.assert_frame_equal(result, expected)
+
+ result = df.nlargest(n, order)
+ expected = df.sort_values(order, ascending=False).head(n)
+ tm.assert_frame_equal(result, expected)
+
+ def test_duplicate_keep_all_ties(self):
+ # GH 16818
+ df = pd.DataFrame({'a': [5, 4, 4, 2, 3, 3, 3, 3],
+ 'b': [10, 9, 8, 7, 5, 50, 10, 20]})
+ result = df.nlargest(4, 'a', keep='all')
+ expected = pd.DataFrame({'a': {0: 5, 1: 4, 2: 4, 4: 3,
+ 5: 3, 6: 3, 7: 3},
+ 'b': {0: 10, 1: 9, 2: 8, 4: 5,
+ 5: 50, 6: 10, 7: 20}})
+ tm.assert_frame_equal(result, expected)
+
+ result = df.nsmallest(2, 'a', keep='all')
+ expected = pd.DataFrame({'a': {3: 2, 4: 3, 5: 3, 6: 3, 7: 3},
+ 'b': {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}})
+ tm.assert_frame_equal(result, expected)
+
+ def test_series_broadcasting(self):
+ # smoke test for numpy warnings
+ # GH 16378, GH 16306
+ df = DataFrame([1.0, 1.0, 1.0])
+ df_nan = DataFrame({'A': [np.nan, 2.0, np.nan]})
+ s = Series([1, 1, 1])
+ s_nan = Series([np.nan, np.nan, 1])
+
+ with tm.assert_produces_warning(None):
+ with tm.assert_produces_warning(FutureWarning):
+ df_nan.clip_lower(s, axis=0)
+ for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']:
+ getattr(df, op)(s_nan, axis=0)
+
+ def test_series_nat_conversion(self):
+ # GH 18521
+ # Check rank does not mutate DataFrame
+ df = DataFrame(np.random.randn(10, 3), dtype='float64')
+ expected = df.copy()
+ df.rank()
+ result = df
+ tm.assert_frame_equal(result, expected)
+
+ def test_multiindex_column_lookup(self):
+ # Check whether tuples are correctly treated as multi-level lookups.
+ # GH 23033
+ df = pd.DataFrame(
+ columns=pd.MultiIndex.from_product([['x'], ['a', 'b']]),
+ data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]])
+
+ # nsmallest
+ result = df.nsmallest(3, ('x', 'a'))
+ expected = df.iloc[[2, 0, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ # nlargest
+ result = df.nlargest(3, ('x', 'b'))
+ expected = df.iloc[[3, 2, 1]]
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_api.py b/contrib/python/pandas/py2/pandas/tests/frame/test_api.py
new file mode 100644
index 00000000000..0934dd20638
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_api.py
@@ -0,0 +1,534 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+# pylint: disable-msg=W0612,E1101
+from copy import deepcopy
+import pydoc
+
+import numpy as np
+import pytest
+
+from pandas.compat import long, lrange, range
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Series, SparseDataFrame, compat, date_range,
+ timedelta_range)
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal)
+
+
+class SharedWithSparse(object):
+ """
+ A collection of tests DataFrame and SparseDataFrame can share.
+
+ In generic tests on this class, use ``self._assert_frame_equal()`` and
+ ``self._assert_series_equal()`` which are implemented in sub-classes
+ and dispatch correctly.
+ """
+ def _assert_frame_equal(self, left, right):
+ """Dispatch to frame class dependent assertion"""
+ raise NotImplementedError
+
+ def _assert_series_equal(self, left, right):
+ """Dispatch to series class dependent assertion"""
+ raise NotImplementedError
+
+ def test_copy_index_name_checking(self, float_frame):
+ # don't want to be able to modify the index stored elsewhere after
+ # making a copy
+ for attr in ('index', 'columns'):
+ ind = getattr(float_frame, attr)
+ ind.name = None
+ cp = float_frame.copy()
+ getattr(cp, attr).name = 'foo'
+ assert getattr(float_frame, attr).name is None
+
+ def test_getitem_pop_assign_name(self, float_frame):
+ s = float_frame['A']
+ assert s.name == 'A'
+
+ s = float_frame.pop('A')
+ assert s.name == 'A'
+
+ s = float_frame.loc[:, 'B']
+ assert s.name == 'B'
+
+ s2 = s.loc[:]
+ assert s2.name == 'B'
+
+ def test_get_value(self, float_frame):
+ for idx in float_frame.index:
+ for col in float_frame.columns:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = float_frame.get_value(idx, col)
+ expected = float_frame[col][idx]
+ tm.assert_almost_equal(result, expected)
+
+ def test_add_prefix_suffix(self, float_frame):
+ with_prefix = float_frame.add_prefix('foo#')
+ expected = pd.Index(['foo#%s' % c for c in float_frame.columns])
+ tm.assert_index_equal(with_prefix.columns, expected)
+
+ with_suffix = float_frame.add_suffix('#foo')
+ expected = pd.Index(['%s#foo' % c for c in float_frame.columns])
+ tm.assert_index_equal(with_suffix.columns, expected)
+
+ with_pct_prefix = float_frame.add_prefix('%')
+ expected = pd.Index(['%{}'.format(c) for c in float_frame.columns])
+ tm.assert_index_equal(with_pct_prefix.columns, expected)
+
+ with_pct_suffix = float_frame.add_suffix('%')
+ expected = pd.Index(['{}%'.format(c) for c in float_frame.columns])
+ tm.assert_index_equal(with_pct_suffix.columns, expected)
+
+ def test_get_axis(self, float_frame):
+ f = float_frame
+ assert f._get_axis_number(0) == 0
+ assert f._get_axis_number(1) == 1
+ assert f._get_axis_number('index') == 0
+ assert f._get_axis_number('rows') == 0
+ assert f._get_axis_number('columns') == 1
+
+ assert f._get_axis_name(0) == 'index'
+ assert f._get_axis_name(1) == 'columns'
+ assert f._get_axis_name('index') == 'index'
+ assert f._get_axis_name('rows') == 'index'
+ assert f._get_axis_name('columns') == 'columns'
+
+ assert f._get_axis(0) is f.index
+ assert f._get_axis(1) is f.columns
+
+ with pytest.raises(ValueError, match='No axis named'):
+ f._get_axis_number(2)
+
+ with pytest.raises(ValueError, match='No axis.*foo'):
+ f._get_axis_name('foo')
+
+ with pytest.raises(ValueError, match='No axis.*None'):
+ f._get_axis_name(None)
+
+ with pytest.raises(ValueError, match='No axis named'):
+ f._get_axis_number(None)
+
+ def test_keys(self, float_frame):
+ getkeys = float_frame.keys
+ assert getkeys() is float_frame.columns
+
+ def test_column_contains_typeerror(self, float_frame):
+ try:
+ float_frame.columns in float_frame
+ except TypeError:
+ pass
+
+ def test_tab_completion(self):
+ # DataFrame whose columns are identifiers shall have them in __dir__.
+ df = pd.DataFrame([list('abcd'), list('efgh')], columns=list('ABCD'))
+ for key in list('ABCD'):
+ assert key in dir(df)
+ assert isinstance(df.__getitem__('A'), pd.Series)
+
+ # DataFrame whose first-level columns are identifiers shall have
+ # them in __dir__.
+ df = pd.DataFrame(
+ [list('abcd'), list('efgh')],
+ columns=pd.MultiIndex.from_tuples(list(zip('ABCD', 'EFGH'))))
+ for key in list('ABCD'):
+ assert key in dir(df)
+ for key in list('EFGH'):
+ assert key not in dir(df)
+ assert isinstance(df.__getitem__('A'), pd.DataFrame)
+
+ def test_not_hashable(self, empty_frame):
+ df = self.klass([1])
+ pytest.raises(TypeError, hash, df)
+ pytest.raises(TypeError, hash, empty_frame)
+
+ def test_new_empty_index(self):
+ df1 = self.klass(np.random.randn(0, 3))
+ df2 = self.klass(np.random.randn(0, 3))
+ df1.index.name = 'foo'
+ assert df2.index.name is None
+
+ def test_array_interface(self, float_frame):
+ with np.errstate(all='ignore'):
+ result = np.sqrt(float_frame)
+ assert isinstance(result, type(float_frame))
+ assert result.index is float_frame.index
+ assert result.columns is float_frame.columns
+
+ self._assert_frame_equal(result, float_frame.apply(np.sqrt))
+
+ def test_get_agg_axis(self, float_frame):
+ cols = float_frame._get_agg_axis(0)
+ assert cols is float_frame.columns
+
+ idx = float_frame._get_agg_axis(1)
+ assert idx is float_frame.index
+
+ pytest.raises(ValueError, float_frame._get_agg_axis, 2)
+
+ def test_nonzero(self, float_frame, float_string_frame, empty_frame):
+ assert empty_frame.empty
+
+ assert not float_frame.empty
+ assert not float_string_frame.empty
+
+ # corner case
+ df = DataFrame({'A': [1., 2., 3.],
+ 'B': ['a', 'b', 'c']},
+ index=np.arange(3))
+ del df['A']
+ assert not df.empty
+
+ def test_iteritems(self):
+ df = self.klass([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
+ for k, v in compat.iteritems(df):
+ assert isinstance(v, self.klass._constructor_sliced)
+
+ def test_items(self):
+ # GH 17213, GH 13918
+ cols = ['a', 'b', 'c']
+ df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
+ for c, (k, v) in zip(cols, df.items()):
+ assert c == k
+ assert isinstance(v, Series)
+ assert (df[k] == v).all()
+
+ def test_iter(self, float_frame):
+ assert tm.equalContents(list(float_frame), float_frame.columns)
+
+ def test_iterrows(self, float_frame, float_string_frame):
+ for k, v in float_frame.iterrows():
+ exp = float_frame.loc[k]
+ self._assert_series_equal(v, exp)
+
+ for k, v in float_string_frame.iterrows():
+ exp = float_string_frame.loc[k]
+ self._assert_series_equal(v, exp)
+
+ def test_iterrows_iso8601(self):
+ # GH 19671
+ if self.klass == SparseDataFrame:
+ pytest.xfail(reason='SparseBlock datetime type not implemented.')
+
+ s = self.klass(
+ {'non_iso8601': ['M1701', 'M1802', 'M1903', 'M2004'],
+ 'iso8601': date_range('2000-01-01', periods=4, freq='M')})
+ for k, v in s.iterrows():
+ exp = s.loc[k]
+ self._assert_series_equal(v, exp)
+
+ def test_itertuples(self, float_frame):
+ for i, tup in enumerate(float_frame.itertuples()):
+ s = self.klass._constructor_sliced(tup[1:])
+ s.name = tup[0]
+ expected = float_frame.iloc[i, :].reset_index(drop=True)
+ self._assert_series_equal(s, expected)
+
+ df = self.klass({'floats': np.random.randn(5),
+ 'ints': lrange(5)}, columns=['floats', 'ints'])
+
+ for tup in df.itertuples(index=False):
+ assert isinstance(tup[1], (int, long))
+
+ df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
+ dfaa = df[['a', 'a']]
+
+ assert (list(dfaa.itertuples()) ==
+ [(0, 1, 1), (1, 2, 2), (2, 3, 3)])
+
+ # repr will be int/long on 32-bit/windows
+ if not (compat.is_platform_windows() or compat.is_platform_32bit()):
+ assert (repr(list(df.itertuples(name=None))) ==
+ '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')
+
+ tup = next(df.itertuples(name='TestName'))
+ assert tup._fields == ('Index', 'a', 'b')
+ assert (tup.Index, tup.a, tup.b) == tup
+ assert type(tup).__name__ == 'TestName'
+
+ df.columns = ['def', 'return']
+ tup2 = next(df.itertuples(name='TestName'))
+ assert tup2 == (0, 1, 4)
+ assert tup2._fields == ('Index', '_1', '_2')
+
+ df3 = DataFrame({'f' + str(i): [i] for i in range(1024)})
+ # will raise SyntaxError if trying to create namedtuple
+ tup3 = next(df3.itertuples())
+ assert not hasattr(tup3, '_fields')
+ assert isinstance(tup3, tuple)
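+ # (CPython < 3.7 caps functions at 255 arguments, so itertuples
+ # falls back to plain tuples for frames this wide)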
+
+ def test_sequence_like_with_categorical(self):
+
+ # GH 7839
+ # make sure can iterate
+ df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
+ "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
+ df['grade'] = Categorical(df['raw_grade'])
+
+ # basic sequencing testing
+ result = list(df.grade.values)
+ expected = np.array(df.grade.values).tolist()
+ tm.assert_almost_equal(result, expected)
+
+ # iteration
+ for t in df.itertuples(index=False):
+ str(t)
+
+ for row, s in df.iterrows():
+ str(s)
+
+ for c, col in df.iteritems():
+ str(col)
+
+ def test_len(self, float_frame):
+ assert len(float_frame) == len(float_frame.index)
+
+ def test_values(self, float_frame, float_string_frame):
+ frame = float_frame
+ arr = frame.values
+
+ frame_cols = frame.columns
+ for i, row in enumerate(arr):
+ for j, value in enumerate(row):
+ col = frame_cols[j]
+ if np.isnan(value):
+ assert np.isnan(frame[col][i])
+ else:
+ assert value == frame[col][i]
+
+ # mixed type
+ arr = float_string_frame[['foo', 'A']].values
+ assert arr[0, 0] == 'bar'
+
+ df = self.klass({'complex': [1j, 2j, 3j], 'real': [1, 2, 3]})
+ arr = df.values
+ assert arr[0, 0] == 1j
+
+ # single block corner case
+ arr = float_frame[['A', 'B']].values
+ expected = float_frame.reindex(columns=['A', 'B']).values
+ assert_almost_equal(arr, expected)
+
+ def test_to_numpy(self):
+ df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
+ expected = np.array([[1, 3], [2, 4.5]])
+ result = df.to_numpy()
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_to_numpy_dtype(self):
+ df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
+ expected = np.array([[1, 3], [2, 4]], dtype="int64")
+ result = df.to_numpy(dtype="int64")
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_to_numpy_copy(self):
+ arr = np.random.randn(4, 3)
+ df = pd.DataFrame(arr)
+ assert df.values.base is arr
+ assert df.to_numpy(copy=False).base is arr
+ assert df.to_numpy(copy=True).base is None
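+ # ``.base is arr`` means the returned array is a view onto the
+ # original data, so only copy=True detaches the memory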
+
+ def test_transpose(self, float_frame):
+ frame = float_frame
+ dft = frame.T
+ for idx, series in compat.iteritems(dft):
+ for col, value in compat.iteritems(series):
+ if np.isnan(value):
+ assert np.isnan(frame[col][idx])
+ else:
+ assert value == frame[col][idx]
+
+ # mixed type
+ index, data = tm.getMixedTypeDict()
+ mixed = self.klass(data, index=index)
+
+ mixed_T = mixed.T
+ for col, s in compat.iteritems(mixed_T):
+ assert s.dtype == np.object_
+
+ def test_swapaxes(self):
+ df = self.klass(np.random.randn(10, 5))
+ self._assert_frame_equal(df.T, df.swapaxes(0, 1))
+ self._assert_frame_equal(df.T, df.swapaxes(1, 0))
+ self._assert_frame_equal(df, df.swapaxes(0, 0))
+ pytest.raises(ValueError, df.swapaxes, 2, 5)
+
+ def test_axis_aliases(self, float_frame):
+ f = float_frame
+
+ # reg name
+ expected = f.sum(axis=0)
+ result = f.sum(axis='index')
+ assert_series_equal(result, expected)
+
+ expected = f.sum(axis=1)
+ result = f.sum(axis='columns')
+ assert_series_equal(result, expected)
+
+ def test_class_axis(self):
+ # GH 18147
+ # no exception and no empty docstring
+ assert pydoc.getdoc(DataFrame.index)
+ assert pydoc.getdoc(DataFrame.columns)
+
+ def test_more_values(self, float_string_frame):
+ values = float_string_frame.values
+ assert values.shape[1] == len(float_string_frame.columns)
+
+ def test_repr_with_mi_nat(self, float_string_frame):
+ df = self.klass({'X': [1, 2]},
+ index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']])
+ result = repr(df)
+ expected = ' X\nNaT a 1\n2013-01-01 b 2'
+ assert result == expected
+
+ def test_iteritems_names(self, float_string_frame):
+ for k, v in compat.iteritems(float_string_frame):
+ assert v.name == k
+
+ def test_series_put_names(self, float_string_frame):
+ series = float_string_frame._series
+ for k, v in compat.iteritems(series):
+ assert v.name == k
+
+ def test_empty_nonzero(self):
+ df = self.klass([1, 2, 3])
+ assert not df.empty
+ df = self.klass(index=[1], columns=[1])
+ assert not df.empty
+ df = self.klass(index=['a', 'b'], columns=['c', 'd']).dropna()
+ assert df.empty
+ assert df.T.empty
+ empty_frames = [self.klass(),
+ self.klass(index=[1]),
+ self.klass(columns=[1]),
+ self.klass({1: []})]
+ for df in empty_frames:
+ assert df.empty
+ assert df.T.empty
+
+ def test_with_datetimelikes(self):
+
+ df = self.klass({'A': date_range('20130101', periods=10),
+ 'B': timedelta_range('1 day', periods=10)})
+ t = df.T
+
+ result = t.get_dtype_counts()
+ if self.klass is DataFrame:
+ expected = Series({'object': 10})
+ else:
+ expected = Series({'Sparse[object, nan]': 10})
+ tm.assert_series_equal(result, expected)
+
+
+class TestDataFrameMisc(SharedWithSparse):
+
+ klass = DataFrame
+ # SharedWithSparse tests use generic, klass-agnostic assertion
+ _assert_frame_equal = staticmethod(assert_frame_equal)
+ _assert_series_equal = staticmethod(assert_series_equal)
+
+ def test_values(self, float_frame):
+ float_frame.values[:, 0] = 5.
+ assert (float_frame.values[:, 0] == 5).all()
+
+ def test_as_matrix_deprecated(self, float_frame):
+ # GH 18458
+ with tm.assert_produces_warning(FutureWarning):
+ cols = float_frame.columns.tolist()
+ result = float_frame.as_matrix(columns=cols)
+ expected = float_frame.values
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_deepcopy(self, float_frame):
+ cp = deepcopy(float_frame)
+ series = cp['A']
+ series[:] = 10
+ for idx, value in compat.iteritems(series):
+ assert float_frame['A'][idx] != value
+
+ def test_transpose_get_view(self, float_frame):
+ dft = float_frame.T
+ dft.values[:, 5:10] = 5
+
+ assert (float_frame.values[5:10] == 5).all()
+
+ def test_inplace_return_self(self):
+ # GH 1893
+
+ data = DataFrame({'a': ['foo', 'bar', 'baz', 'qux'],
+ 'b': [0, 0, 1, 1],
+ 'c': [1, 2, 3, 4]})
+
+ def _check_f(base, f):
+ result = f(base)
+ assert result is None
+
+ # -----DataFrame-----
+
+ # set_index
+ f = lambda x: x.set_index('a', inplace=True)
+ _check_f(data.copy(), f)
+
+ # reset_index
+ f = lambda x: x.reset_index(inplace=True)
+ _check_f(data.set_index('a'), f)
+
+ # drop_duplicates
+ f = lambda x: x.drop_duplicates(inplace=True)
+ _check_f(data.copy(), f)
+
+ # sort
+ f = lambda x: x.sort_values('b', inplace=True)
+ _check_f(data.copy(), f)
+
+ # sort_index
+ f = lambda x: x.sort_index(inplace=True)
+ _check_f(data.copy(), f)
+
+ # fillna
+ f = lambda x: x.fillna(0, inplace=True)
+ _check_f(data.copy(), f)
+
+ # replace
+ f = lambda x: x.replace(1, 0, inplace=True)
+ _check_f(data.copy(), f)
+
+ # rename
+ f = lambda x: x.rename({1: 'foo'}, inplace=True)
+ _check_f(data.copy(), f)
+
+ # -----Series-----
+ d = data.copy()['c']
+
+ # reset_index
+ f = lambda x: x.reset_index(inplace=True, drop=True)
+ _check_f(data.set_index('a')['c'], f)
+
+ # fillna
+ f = lambda x: x.fillna(0, inplace=True)
+ _check_f(d.copy(), f)
+
+ # replace
+ f = lambda x: x.replace(1, 0, inplace=True)
+ _check_f(d.copy(), f)
+
+ # rename
+ f = lambda x: x.rename({1: 'foo'}, inplace=True)
+ _check_f(d.copy(), f)
+
+ def test_tab_complete_warning(self, ip):
+ # GH 16409
+ pytest.importorskip('IPython', minversion="6.0.0")
+ from IPython.core.completer import provisionalcompleter
+
+ code = "import pandas as pd; df = pd.DataFrame()"
+ ip.run_code(code)
+ with tm.assert_produces_warning(None):
+ with provisionalcompleter('ignore'):
+ list(ip.Completer.completions('df.', 1))
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_apply.py b/contrib/python/pandas/py2/pandas/tests/frame/test_apply.py
new file mode 100644
index 00000000000..a4cd1aa3bac
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_apply.py
@@ -0,0 +1,1154 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from collections import OrderedDict
+from datetime import datetime
+from itertools import chain
+import operator
+import warnings
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import (
+ DataFrame, MultiIndex, Series, Timestamp, compat, date_range, notna)
+from pandas.conftest import _get_cython_table_params
+from pandas.core.apply import frame_apply
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+@pytest.fixture
+def int_frame_const_col():
+ """
+ Fixture for DataFrame of ints which are constant per column
+
+ Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3]
+ """
+ df = DataFrame(np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1,
+ columns=['A', 'B', 'C'])
+ return df
+
+
+class TestDataFrameApply(object):
+
+ def test_apply(self, float_frame):
+ with np.errstate(all='ignore'):
+ # ufunc
+ applied = float_frame.apply(np.sqrt)
+ tm.assert_series_equal(np.sqrt(float_frame['A']), applied['A'])
+
+ # aggregator
+ applied = float_frame.apply(np.mean)
+ assert applied['A'] == np.mean(float_frame['A'])
+
+ d = float_frame.index[0]
+ applied = float_frame.apply(np.mean, axis=1)
+ assert applied[d] == np.mean(float_frame.xs(d))
+ assert applied.index is float_frame.index # want this
+
+ # invalid axis
+ df = DataFrame(
+ [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c'])
+ with pytest.raises(ValueError):
+ df.apply(lambda x: x, 2)
+
+ # GH 9573
+ df = DataFrame({'c0': ['A', 'A', 'B', 'B'],
+ 'c1': ['C', 'C', 'D', 'D']})
+ df = df.apply(lambda ts: ts.astype('category'))
+
+ assert df.shape == (4, 2)
+ assert isinstance(df['c0'].dtype, CategoricalDtype)
+ assert isinstance(df['c1'].dtype, CategoricalDtype)
+
+ def test_apply_mixed_datetimelike(self):
+ # mixed datetimelike
+ # GH 7778
+ df = DataFrame({'A': date_range('20130101', periods=3),
+ 'B': pd.to_timedelta(np.arange(3), unit='s')})
+ result = df.apply(lambda x: x, axis=1)
+ assert_frame_equal(result, df)
+
+ def test_apply_empty(self, float_frame, empty_frame):
+ # empty
+ applied = empty_frame.apply(np.sqrt)
+ assert applied.empty
+
+ applied = empty_frame.apply(np.mean)
+ assert applied.empty
+
+ no_rows = float_frame[:0]
+ result = no_rows.apply(lambda x: x.mean())
+ expected = Series(np.nan, index=float_frame.columns)
+ assert_series_equal(result, expected)
+
+ no_cols = float_frame.loc[:, []]
+ result = no_cols.apply(lambda x: x.mean(), axis=1)
+ expected = Series(np.nan, index=float_frame.index)
+ assert_series_equal(result, expected)
+
+ # GH 2476
+ expected = DataFrame(index=['a'])
+ result = expected.apply(lambda x: x['a'], axis=1)
+ assert_frame_equal(expected, result)
+
+ def test_apply_with_reduce_empty(self, empty_frame):
+ # reduce with an empty DataFrame
+ x = []
+ result = empty_frame.apply(x.append, axis=1, result_type='expand')
+ assert_frame_equal(result, empty_frame)
+ result = empty_frame.apply(x.append, axis=1, result_type='reduce')
+ assert_series_equal(result, Series(
+ [], index=pd.Index([], dtype=object)))
+
+ empty_with_cols = DataFrame(columns=['a', 'b', 'c'])
+ result = empty_with_cols.apply(x.append, axis=1, result_type='expand')
+ assert_frame_equal(result, empty_with_cols)
+ result = empty_with_cols.apply(x.append, axis=1, result_type='reduce')
+ assert_series_equal(result, Series(
+ [], index=pd.Index([], dtype=object)))
+
+ # Ensure that x.append hasn't been called
+ assert x == []
+
+ def test_apply_deprecate_reduce(self, empty_frame):
+ x = []
+ with tm.assert_produces_warning(FutureWarning):
+ empty_frame.apply(x.append, axis=1, reduce=True)
+
+ def test_apply_standard_nonunique(self):
+ df = DataFrame(
+ [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c'])
+
+ result = df.apply(lambda s: s[0], axis=1)
+ expected = Series([1, 4, 7], ['a', 'a', 'c'])
+ assert_series_equal(result, expected)
+
+ result = df.T.apply(lambda s: s[0], axis=0)
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('func', ['sum', 'mean', 'min', 'max', 'std'])
+ @pytest.mark.parametrize('args,kwds', [
+ pytest.param([], {}, id='no_args_or_kwds'),
+ pytest.param([1], {}, id='axis_from_args'),
+ pytest.param([], {'axis': 1}, id='axis_from_kwds'),
+ pytest.param([], {'numeric_only': True}, id='optional_kwds'),
+ pytest.param([1, None], {'numeric_only': True}, id='args_and_kwds')
+ ])
+ def test_apply_with_string_funcs(self, float_frame, func, args, kwds):
+ result = float_frame.apply(func, *args, **kwds)
+ expected = getattr(float_frame, func)(*args, **kwds)
+ tm.assert_series_equal(result, expected)
+
+ def test_apply_broadcast_deprecated(self, float_frame):
+ with tm.assert_produces_warning(FutureWarning):
+ float_frame.apply(np.mean, broadcast=True)
+
+ def test_apply_broadcast(self, float_frame, int_frame_const_col):
+
+ # scalars
+ result = float_frame.apply(np.mean, result_type='broadcast')
+ expected = DataFrame([float_frame.mean()], index=float_frame.index)
+ tm.assert_frame_equal(result, expected)
+
+ result = float_frame.apply(np.mean, axis=1, result_type='broadcast')
+ m = float_frame.mean(axis=1)
+ expected = DataFrame({c: m for c in float_frame.columns})
+ tm.assert_frame_equal(result, expected)
+
+ # lists
+ result = float_frame.apply(
+ lambda x: list(range(len(float_frame.columns))),
+ axis=1,
+ result_type='broadcast')
+ m = list(range(len(float_frame.columns)))
+ expected = DataFrame([m] * len(float_frame.index),
+ dtype='float64',
+ index=float_frame.index,
+ columns=float_frame.columns)
+ tm.assert_frame_equal(result, expected)
+
+ result = float_frame.apply(lambda x:
+ list(range(len(float_frame.index))),
+ result_type='broadcast')
+ m = list(range(len(float_frame.index)))
+ expected = DataFrame({c: m for c in float_frame.columns},
+ dtype='float64',
+ index=float_frame.index)
+ tm.assert_frame_equal(result, expected)
+
+ # preserve columns
+ df = int_frame_const_col
+ result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast')
+ tm.assert_frame_equal(result, df)
+
+ df = int_frame_const_col
+ result = df.apply(lambda x: Series([1, 2, 3], index=list('abc')),
+ axis=1, result_type='broadcast')
+ expected = df.copy()
+ tm.assert_frame_equal(result, expected)
+
+ def test_apply_broadcast_error(self, int_frame_const_col):
+ df = int_frame_const_col
+
+ # > 1 ndim
+ with pytest.raises(ValueError):
+ df.apply(lambda x: np.array([1, 2]).reshape(-1, 2),
+ axis=1, result_type='broadcast')
+
+ # cannot broadcast
+ with pytest.raises(ValueError):
+ df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
+
+ with pytest.raises(ValueError):
+ df.apply(lambda x: Series([1, 2]), axis=1, result_type='broadcast')
+
+ def test_apply_raw(self, float_frame):
+ result0 = float_frame.apply(np.mean, raw=True)
+ result1 = float_frame.apply(np.mean, axis=1, raw=True)
+
+ expected0 = float_frame.apply(lambda x: x.values.mean())
+ expected1 = float_frame.apply(lambda x: x.values.mean(), axis=1)
+
+ assert_series_equal(result0, expected0)
+ assert_series_equal(result1, expected1)
+
+ # no reduction
+ result = float_frame.apply(lambda x: x * 2, raw=True)
+ expected = float_frame * 2
+ assert_frame_equal(result, expected)
+
+ def test_apply_axis1(self, float_frame):
+ d = float_frame.index[0]
+ tapplied = float_frame.apply(np.mean, axis=1)
+ assert tapplied[d] == np.mean(float_frame.xs(d))
+
+ def test_apply_ignore_failures(self, float_string_frame):
+ result = frame_apply(float_string_frame, np.mean, 0,
+ ignore_failures=True).apply_standard()
+ expected = float_string_frame._get_numeric_data().apply(np.mean)
+ assert_series_equal(result, expected)
+
+ def test_apply_mixed_dtype_corner(self):
+ df = DataFrame({'A': ['foo'],
+ 'B': [1.]})
+ result = df[:0].apply(np.mean, axis=1)
+ # the result here is actually kind of ambiguous, should it be a Series
+ # or a DataFrame?
+ expected = Series(np.nan, index=pd.Index([], dtype='int64'))
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': ['foo'],
+ 'B': [1.]})
+ result = df.apply(lambda x: x['A'], axis=1)
+ expected = Series(['foo'], index=[0])
+ assert_series_equal(result, expected)
+
+ result = df.apply(lambda x: x['B'], axis=1)
+ expected = Series([1.], index=[0])
+ assert_series_equal(result, expected)
+
+ def test_apply_empty_infer_type(self):
+ no_cols = DataFrame(index=['a', 'b', 'c'])
+ no_index = DataFrame(columns=['a', 'b', 'c'])
+
+ def _check(df, f):
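+            # probe f on an empty float array: a non-ndarray return value
+            # means f reduces, so apply should yield a Series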
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", RuntimeWarning)
+ test_res = f(np.array([], dtype='f8'))
+ is_reduction = not isinstance(test_res, np.ndarray)
+
+ def _checkit(axis=0, raw=False):
+ result = df.apply(f, axis=axis, raw=raw)
+ if is_reduction:
+ agg_axis = df._get_agg_axis(axis)
+ assert isinstance(result, Series)
+ assert result.index is agg_axis
+ else:
+ assert isinstance(result, DataFrame)
+
+ _checkit()
+ _checkit(axis=1)
+ _checkit(raw=True)
+ _checkit(axis=0, raw=True)
+
+ with np.errstate(all='ignore'):
+ _check(no_cols, lambda x: x)
+ _check(no_cols, lambda x: x.mean())
+ _check(no_index, lambda x: x)
+ _check(no_index, lambda x: x.mean())
+
+ result = no_cols.apply(lambda x: x.mean(), result_type='broadcast')
+ assert isinstance(result, DataFrame)
+
+ def test_apply_with_args_kwds(self, float_frame):
+ def add_some(x, howmuch=0):
+ return x + howmuch
+
+ def agg_and_add(x, howmuch=0):
+ return x.mean() + howmuch
+
+ def subtract_and_divide(x, sub, divide=1):
+ return (x - sub) / divide
+
+ result = float_frame.apply(add_some, howmuch=2)
+ expected = float_frame.apply(lambda x: x + 2)
+ assert_frame_equal(result, expected)
+
+ result = float_frame.apply(agg_and_add, howmuch=2)
+ expected = float_frame.apply(lambda x: x.mean() + 2)
+ assert_series_equal(result, expected)
+
+ result = float_frame.apply(subtract_and_divide, args=(2,), divide=2)
+ expected = float_frame.apply(lambda x: (x - 2.) / 2.)
+ assert_frame_equal(result, expected)
+
+ def test_apply_yield_list(self, float_frame):
+ result = float_frame.apply(list)
+ assert_frame_equal(result, float_frame)
+
+ def test_apply_reduce_Series(self, float_frame):
+ float_frame.loc[::2, 'A'] = np.nan
+ expected = float_frame.mean(1)
+ result = float_frame.apply(np.mean, axis=1)
+ assert_series_equal(result, expected)
+
+ def test_apply_reduce_rows_to_dict(self):
+ # GH 25196
+ data = pd.DataFrame([[1, 2], [3, 4]])
+ expected = pd.Series([{0: 1, 1: 3}, {0: 2, 1: 4}])
+ result = data.apply(dict)
+ assert_series_equal(result, expected)
+
+ def test_apply_differently_indexed(self):
+ df = DataFrame(np.random.randn(20, 10))
+
+ result0 = df.apply(Series.describe, axis=0)
+ expected0 = DataFrame({i: v.describe()
+ for i, v in compat.iteritems(df)},
+ columns=df.columns)
+ assert_frame_equal(result0, expected0)
+
+ result1 = df.apply(Series.describe, axis=1)
+ expected1 = DataFrame({i: v.describe()
+ for i, v in compat.iteritems(df.T)},
+ columns=df.index).T
+ assert_frame_equal(result1, expected1)
+
+ def test_apply_modify_traceback(self):
+ data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
+ 'bar', 'bar', 'bar', 'bar',
+ 'foo', 'foo', 'foo'],
+ 'B': ['one', 'one', 'one', 'two',
+ 'one', 'one', 'one', 'two',
+ 'two', 'two', 'one'],
+ 'C': ['dull', 'dull', 'shiny', 'dull',
+ 'dull', 'shiny', 'shiny', 'dull',
+ 'shiny', 'shiny', 'shiny'],
+ 'D': np.random.randn(11),
+ 'E': np.random.randn(11),
+ 'F': np.random.randn(11)})
+
+ data.loc[4, 'C'] = np.nan
+
+ def transform(row):
+ if row['C'].startswith('shin') and row['A'] == 'foo':
+ row['D'] = 7
+ return row
+
+ def transform2(row):
+ if (notna(row['C']) and row['C'].startswith('shin') and
+ row['A'] == 'foo'):
+ row['D'] = 7
+ return row
+
+        with pytest.raises(AttributeError) as excinfo:
+            data.apply(transform, axis=1)
+        err = excinfo.value
+        assert len(err.args) == 2
+        assert err.args[1] == 'occurred at index 4'
+        assert err.args[0] == "'float' object has no attribute 'startswith'"
+
+ def test_apply_bug(self):
+
+ # GH 6125
+ positions = pd.DataFrame([[1, 'ABC0', 50], [1, 'YUM0', 20],
+ [1, 'DEF0', 20], [2, 'ABC1', 50],
+ [2, 'YUM1', 20], [2, 'DEF1', 20]],
+ columns=['a', 'market', 'position'])
+
+ def f(r):
+ return r['market']
+ expected = positions.apply(f, axis=1)
+
+ positions = DataFrame([[datetime(2013, 1, 1), 'ABC0', 50],
+ [datetime(2013, 1, 2), 'YUM0', 20],
+ [datetime(2013, 1, 3), 'DEF0', 20],
+ [datetime(2013, 1, 4), 'ABC1', 50],
+ [datetime(2013, 1, 5), 'YUM1', 20],
+ [datetime(2013, 1, 6), 'DEF1', 20]],
+ columns=['a', 'market', 'position'])
+ result = positions.apply(f, axis=1)
+ assert_series_equal(result, expected)
+
+ def test_apply_convert_objects(self):
+ data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
+ 'bar', 'bar', 'bar', 'bar',
+ 'foo', 'foo', 'foo'],
+ 'B': ['one', 'one', 'one', 'two',
+ 'one', 'one', 'one', 'two',
+ 'two', 'two', 'one'],
+ 'C': ['dull', 'dull', 'shiny', 'dull',
+ 'dull', 'shiny', 'shiny', 'dull',
+ 'shiny', 'shiny', 'shiny'],
+ 'D': np.random.randn(11),
+ 'E': np.random.randn(11),
+ 'F': np.random.randn(11)})
+
+ result = data.apply(lambda x: x, axis=1)
+ assert_frame_equal(result._convert(datetime=True), data)
+
+ def test_apply_attach_name(self, float_frame):
+ result = float_frame.apply(lambda x: x.name)
+ expected = Series(float_frame.columns, index=float_frame.columns)
+ assert_series_equal(result, expected)
+
+ result = float_frame.apply(lambda x: x.name, axis=1)
+ expected = Series(float_frame.index, index=float_frame.index)
+ assert_series_equal(result, expected)
+
+ # non-reductions
+ result = float_frame.apply(lambda x: np.repeat(x.name, len(x)))
+ expected = DataFrame(np.tile(float_frame.columns,
+ (len(float_frame.index), 1)),
+ index=float_frame.index,
+ columns=float_frame.columns)
+ assert_frame_equal(result, expected)
+
+ result = float_frame.apply(lambda x: np.repeat(x.name, len(x)),
+ axis=1)
+ expected = Series(np.repeat(t[0], len(float_frame.columns))
+ for t in float_frame.itertuples())
+ expected.index = float_frame.index
+ assert_series_equal(result, expected)
+
+ def test_apply_multi_index(self, float_frame):
+ index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']])
+ s = DataFrame([[1, 2], [3, 4], [5, 6]],
+ index=index,
+ columns=['col1', 'col2'])
+ result = s.apply(
+ lambda x: Series({'min': min(x), 'max': max(x)}), 1)
+ expected = DataFrame([[1, 2], [3, 4], [5, 6]],
+ index=index,
+ columns=['min', 'max'])
+ assert_frame_equal(result, expected, check_like=True)
+
+ def test_apply_dict(self):
+
+ # GH 8735
+ A = DataFrame([['foo', 'bar'], ['spam', 'eggs']])
+ A_dicts = Series([dict([(0, 'foo'), (1, 'spam')]),
+ dict([(0, 'bar'), (1, 'eggs')])])
+ B = DataFrame([[0, 1], [2, 3]])
+ B_dicts = Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])])
+ fn = lambda x: x.to_dict()
+
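+        # result_type='reduce' forces a Series of dicts, 'expand' forces a
+        # DataFrame, and the default infers reduction from the return value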
+ for df, dicts in [(A, A_dicts), (B, B_dicts)]:
+ reduce_true = df.apply(fn, result_type='reduce')
+ reduce_false = df.apply(fn, result_type='expand')
+ reduce_none = df.apply(fn)
+
+ assert_series_equal(reduce_true, dicts)
+ assert_frame_equal(reduce_false, df)
+ assert_series_equal(reduce_none, dicts)
+
+ def test_applymap(self, float_frame):
+ applied = float_frame.applymap(lambda x: x * 2)
+ tm.assert_frame_equal(applied, float_frame * 2)
+ float_frame.applymap(type)
+
+ # GH 465: function returning tuples
+ result = float_frame.applymap(lambda x: (x, x))
+ assert isinstance(result['A'][0], tuple)
+
+ # GH 2909: object conversion to float in constructor?
+ df = DataFrame(data=[1, 'a'])
+ result = df.applymap(lambda x: x)
+ assert result.dtypes[0] == object
+
+ df = DataFrame(data=[1., 'a'])
+ result = df.applymap(lambda x: x)
+ assert result.dtypes[0] == object
+
+ # GH 2786
+ df = DataFrame(np.random.random((3, 4)))
+ df2 = df.copy()
+ cols = ['a', 'a', 'a', 'a']
+ df.columns = cols
+
+ expected = df2.applymap(str)
+ expected.columns = cols
+ result = df.applymap(str)
+ tm.assert_frame_equal(result, expected)
+
+ # datetime/timedelta
+ df['datetime'] = Timestamp('20130101')
+ df['timedelta'] = pd.Timedelta('1 min')
+ result = df.applymap(str)
+ for f in ['datetime', 'timedelta']:
+ assert result.loc[0, f] == str(df.loc[0, f])
+
+ # GH 8222
+ empty_frames = [pd.DataFrame(),
+ pd.DataFrame(columns=list('ABC')),
+ pd.DataFrame(index=list('ABC')),
+ pd.DataFrame({'A': [], 'B': [], 'C': []})]
+ for frame in empty_frames:
+ for func in [round, lambda x: x]:
+ result = frame.applymap(func)
+ tm.assert_frame_equal(result, frame)
+
+ def test_applymap_box_timestamps(self):
+ # GH 2689, GH 2627
+ ser = pd.Series(date_range('1/1/2000', periods=10))
+
+ def func(x):
+ return (x.hour, x.day, x.month)
+
+ # it works!
+ pd.DataFrame(ser).applymap(func)
+
+ def test_applymap_box(self):
+ # ufunc will not be boxed. Same test cases as the test_map_box
+ df = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-01-02')],
+ 'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern')],
+ 'c': [pd.Timedelta('1 days'),
+ pd.Timedelta('2 days')],
+ 'd': [pd.Period('2011-01-01', freq='M'),
+ pd.Period('2011-01-02', freq='M')]})
+
+ result = df.applymap(lambda x: '{0}'.format(x.__class__.__name__))
+ expected = pd.DataFrame({'a': ['Timestamp', 'Timestamp'],
+ 'b': ['Timestamp', 'Timestamp'],
+ 'c': ['Timedelta', 'Timedelta'],
+ 'd': ['Period', 'Period']})
+ tm.assert_frame_equal(result, expected)
+
+ def test_frame_apply_dont_convert_datetime64(self):
+ from pandas.tseries.offsets import BDay
+ df = DataFrame({'x1': [datetime(1996, 1, 1)]})
+
+ df = df.applymap(lambda x: x + BDay())
+ df = df.applymap(lambda x: x + BDay())
+
+ assert df.x1.dtype == 'M8[ns]'
+
+ def test_apply_non_numpy_dtype(self):
+ # GH 12244
+ df = DataFrame({'dt': pd.date_range(
+ "2015-01-01", periods=3, tz='Europe/Brussels')})
+ result = df.apply(lambda x: x)
+ assert_frame_equal(result, df)
+
+ result = df.apply(lambda x: x + pd.Timedelta('1day'))
+ expected = DataFrame({'dt': pd.date_range(
+ "2015-01-02", periods=3, tz='Europe/Brussels')})
+ assert_frame_equal(result, expected)
+
+ df = DataFrame({'dt': ['a', 'b', 'c', 'a']}, dtype='category')
+ result = df.apply(lambda x: x)
+ assert_frame_equal(result, df)
+
+ def test_apply_dup_names_multi_agg(self):
+ # GH 21063
+ df = pd.DataFrame([[0, 1], [2, 3]], columns=['a', 'a'])
+ expected = pd.DataFrame([[0, 1]], columns=['a', 'a'], index=['min'])
+ result = df.agg(['min'])
+
+ tm.assert_frame_equal(result, expected)
+
+
+class TestInferOutputShape(object):
+    # the user has supplied an opaque UDF that transforms the input,
+    # which requires us to infer the shape of the output
+
+ def test_infer_row_shape(self):
+ # GH 17437
+ # if row shape is changing, infer it
+ df = pd.DataFrame(np.random.rand(10, 2))
+ result = df.apply(np.fft.fft, axis=0)
+ assert result.shape == (10, 2)
+
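+        # np.fft.rfft returns n // 2 + 1 rows for real input of length n,
+        # hence 6 rows here for the 10-row frame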
+ result = df.apply(np.fft.rfft, axis=0)
+ assert result.shape == (6, 2)
+
+ def test_with_dictlike_columns(self):
+ # GH 17602
+ df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+ result = df.apply(lambda x: {'s': x['a'] + x['b']},
+ axis=1)
+ expected = Series([{'s': 3} for t in df.itertuples()])
+ assert_series_equal(result, expected)
+
+ df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
+ pd.Timestamp('2017-05-02 00:00:00')]
+ result = df.apply(lambda x: {'s': x['a'] + x['b']},
+ axis=1)
+ assert_series_equal(result, expected)
+
+ # compose a series
+ result = (df['a'] + df['b']).apply(lambda x: {'s': x})
+ expected = Series([{'s': 3}, {'s': 3}])
+ assert_series_equal(result, expected)
+
+ # GH 18775
+ df = DataFrame()
+ df["author"] = ["X", "Y", "Z"]
+ df["publisher"] = ["BBC", "NBC", "N24"]
+ df["date"] = pd.to_datetime(['17-10-2010 07:15:30',
+ '13-05-2011 08:20:35',
+ '15-01-2013 09:09:09'])
+ result = df.apply(lambda x: {}, axis=1)
+ expected = Series([{}, {}, {}])
+ assert_series_equal(result, expected)
+
+ def test_with_dictlike_columns_with_infer(self):
+ # GH 17602
+ df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+ result = df.apply(lambda x: {'s': x['a'] + x['b']},
+ axis=1, result_type='expand')
+ expected = DataFrame({'s': [3, 3]})
+ assert_frame_equal(result, expected)
+
+ df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
+ pd.Timestamp('2017-05-02 00:00:00')]
+ result = df.apply(lambda x: {'s': x['a'] + x['b']},
+ axis=1, result_type='expand')
+ assert_frame_equal(result, expected)
+
+ def test_with_listlike_columns(self):
+ # GH 17348
+ df = DataFrame({'a': Series(np.random.randn(4)),
+ 'b': ['a', 'list', 'of', 'words'],
+ 'ts': date_range('2016-10-01', periods=4, freq='H')})
+
+ result = df[['a', 'b']].apply(tuple, axis=1)
+ expected = Series([t[1:] for t in df[['a', 'b']].itertuples()])
+ assert_series_equal(result, expected)
+
+ result = df[['a', 'ts']].apply(tuple, axis=1)
+ expected = Series([t[1:] for t in df[['a', 'ts']].itertuples()])
+ assert_series_equal(result, expected)
+
+ # GH 18919
+ df = DataFrame({'x': Series([['a', 'b'], ['q']]),
+ 'y': Series([['z'], ['q', 't']])})
+ df.index = MultiIndex.from_tuples([('i0', 'j0'), ('i1', 'j1')])
+
+ result = df.apply(
+ lambda row: [el for el in row['x'] if el in row['y']],
+ axis=1)
+ expected = Series([[], ['q']], index=df.index)
+ assert_series_equal(result, expected)
+
+ def test_infer_output_shape_columns(self):
+ # GH 18573
+
+ df = DataFrame({'number': [1., 2.],
+ 'string': ['foo', 'bar'],
+ 'datetime': [pd.Timestamp('2017-11-29 03:30:00'),
+ pd.Timestamp('2017-11-29 03:45:00')]})
+ result = df.apply(lambda row: (row.number, row.string), axis=1)
+ expected = Series([(t.number, t.string) for t in df.itertuples()])
+ assert_series_equal(result, expected)
+
+ def test_infer_output_shape_listlike_columns(self):
+ # GH 16353
+
+ df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
+
+ result = df.apply(lambda x: [1, 2, 3], axis=1)
+ expected = Series([[1, 2, 3] for t in df.itertuples()])
+ assert_series_equal(result, expected)
+
+ result = df.apply(lambda x: [1, 2], axis=1)
+ expected = Series([[1, 2] for t in df.itertuples()])
+ assert_series_equal(result, expected)
+
+ # GH 17970
+ df = DataFrame({"a": [1, 2, 3]}, index=list('abc'))
+
+ result = df.apply(lambda row: np.ones(1), axis=1)
+ expected = Series([np.ones(1) for t in df.itertuples()],
+ index=df.index)
+ assert_series_equal(result, expected)
+
+ result = df.apply(lambda row: np.ones(2), axis=1)
+ expected = Series([np.ones(2) for t in df.itertuples()],
+ index=df.index)
+ assert_series_equal(result, expected)
+
+ # GH 17892
+ df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
+ pd.Timestamp('2010-02-04'),
+ pd.Timestamp('2010-02-05'),
+ pd.Timestamp('2010-02-06')],
+ 'b': [9, 5, 4, 3],
+ 'c': [5, 3, 4, 2],
+ 'd': [1, 2, 3, 4]})
+
+ def fun(x):
+ return (1, 2)
+
+ result = df.apply(fun, axis=1)
+ expected = Series([(1, 2) for t in df.itertuples()])
+ assert_series_equal(result, expected)
+
+ def test_consistent_coerce_for_shapes(self):
+ # we want column names to NOT be propagated
+ # just because the shape matches the input shape
+ df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])
+
+ result = df.apply(lambda x: [1, 2, 3], axis=1)
+ expected = Series([[1, 2, 3] for t in df.itertuples()])
+ assert_series_equal(result, expected)
+
+ result = df.apply(lambda x: [1, 2], axis=1)
+ expected = Series([[1, 2] for t in df.itertuples()])
+ assert_series_equal(result, expected)
+
+ def test_consistent_names(self, int_frame_const_col):
+ # if a Series is returned, we should use the resulting index names
+ df = int_frame_const_col
+
+ result = df.apply(lambda x: Series([1, 2, 3],
+ index=['test', 'other', 'cols']),
+ axis=1)
+ expected = int_frame_const_col.rename(columns={'A': 'test',
+ 'B': 'other',
+ 'C': 'cols'})
+ assert_frame_equal(result, expected)
+
+ result = df.apply(lambda x: Series([1, 2], index=['test', 'other']),
+ axis=1)
+ expected = expected[['test', 'other']]
+ assert_frame_equal(result, expected)
+
+ def test_result_type(self, int_frame_const_col):
+ # result_type should be consistent no matter which
+ # path we take in the code
+ df = int_frame_const_col
+
+ result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand')
+ expected = df.copy()
+ expected.columns = [0, 1, 2]
+ assert_frame_equal(result, expected)
+
+ result = df.apply(lambda x: [1, 2], axis=1, result_type='expand')
+ expected = df[['A', 'B']].copy()
+ expected.columns = [0, 1]
+ assert_frame_equal(result, expected)
+
+ # broadcast result
+ result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast')
+ expected = df.copy()
+ assert_frame_equal(result, expected)
+
+ columns = ['other', 'col', 'names']
+ result = df.apply(lambda x: Series([1, 2, 3], index=columns),
+ axis=1, result_type='broadcast')
+ expected = df.copy()
+ assert_frame_equal(result, expected)
+
+ # series result
+ result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1)
+ expected = df.copy()
+ assert_frame_equal(result, expected)
+
+ # series result with other index
+ columns = ['other', 'col', 'names']
+ result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1)
+ expected = df.copy()
+ expected.columns = columns
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("result_type", ['foo', 1])
+ def test_result_type_error(self, result_type, int_frame_const_col):
+ # allowed result_type
+ df = int_frame_const_col
+
+ with pytest.raises(ValueError):
+ df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type)
+
+ @pytest.mark.parametrize(
+ "box",
+ [lambda x: list(x),
+ lambda x: tuple(x),
+ lambda x: np.array(x, dtype='int64')],
+ ids=['list', 'tuple', 'array'])
+ def test_consistency_for_boxed(self, box, int_frame_const_col):
+ # passing an array or list should not affect the output shape
+ df = int_frame_const_col
+
+ result = df.apply(lambda x: box([1, 2]), axis=1)
+ expected = Series([box([1, 2]) for t in df.itertuples()])
+ assert_series_equal(result, expected)
+
+ result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand')
+ expected = int_frame_const_col[['A', 'B']].rename(columns={'A': 0,
+ 'B': 1})
+ assert_frame_equal(result, expected)
+
+
+def zip_frames(frames, axis=1):
+ """
+    Take a list of frames and zip them together, under the
+    assumption that they all share the first frame's index/columns.
+
+ Returns
+ -------
+ new_frame : DataFrame
+ """
+ if axis == 1:
+ columns = frames[0].columns
+ zipped = [f.loc[:, c] for c in columns for f in frames]
+ return pd.concat(zipped, axis=1)
+ else:
+ index = frames[0].index
+ zipped = [f.loc[i, :] for i in index for f in frames]
+ return pd.DataFrame(zipped)
+
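+# Illustrative sketch: for frames f_abs and f_sqrt sharing columns
+# ['A', 'B'], zip_frames([f_abs, f_sqrt]) concatenates columns in the
+# order [abs.A, sqrt.A, abs.B, sqrt.B] -- the same layout produced by
+# DataFrame.apply with a list of functions.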
+
+class TestDataFrameAggregate(object):
+
+ def test_agg_transform(self, axis, float_frame):
+ other_axis = 1 if axis in {0, 'index'} else 0
+
+ with np.errstate(all='ignore'):
+
+ f_abs = np.abs(float_frame)
+ f_sqrt = np.sqrt(float_frame)
+
+ # ufunc
+ result = float_frame.transform(np.sqrt, axis=axis)
+ expected = f_sqrt.copy()
+ assert_frame_equal(result, expected)
+
+ result = float_frame.apply(np.sqrt, axis=axis)
+ assert_frame_equal(result, expected)
+
+ result = float_frame.transform(np.sqrt, axis=axis)
+ assert_frame_equal(result, expected)
+
+ # list-like
+ result = float_frame.apply([np.sqrt], axis=axis)
+ expected = f_sqrt.copy()
+ if axis in {0, 'index'}:
+ expected.columns = pd.MultiIndex.from_product(
+ [float_frame.columns, ['sqrt']])
+ else:
+ expected.index = pd.MultiIndex.from_product(
+ [float_frame.index, ['sqrt']])
+ assert_frame_equal(result, expected)
+
+ result = float_frame.transform([np.sqrt], axis=axis)
+ assert_frame_equal(result, expected)
+
+ # multiple items in list
+ # these are in the order as if we are applying both
+ # functions per series and then concatting
+ result = float_frame.apply([np.abs, np.sqrt], axis=axis)
+ expected = zip_frames([f_abs, f_sqrt], axis=other_axis)
+ if axis in {0, 'index'}:
+ expected.columns = pd.MultiIndex.from_product(
+ [float_frame.columns, ['absolute', 'sqrt']])
+ else:
+ expected.index = pd.MultiIndex.from_product(
+ [float_frame.index, ['absolute', 'sqrt']])
+ assert_frame_equal(result, expected)
+
+ result = float_frame.transform([np.abs, 'sqrt'], axis=axis)
+ assert_frame_equal(result, expected)
+
+ def test_transform_and_agg_err(self, axis, float_frame):
+ # cannot both transform and agg
+ with pytest.raises(ValueError):
+ float_frame.transform(['max', 'min'], axis=axis)
+
+ with pytest.raises(ValueError):
+ with np.errstate(all='ignore'):
+ float_frame.agg(['max', 'sqrt'], axis=axis)
+
+ with pytest.raises(ValueError):
+ with np.errstate(all='ignore'):
+ float_frame.transform(['max', 'sqrt'], axis=axis)
+
+ df = pd.DataFrame({'A': range(5), 'B': 5})
+
+        with pytest.raises(ValueError):
+            with np.errstate(all='ignore'):
+                df.agg({'A': ['abs', 'sum'], 'B': ['mean', 'max']},
+                       axis=axis)
+
+ @pytest.mark.parametrize('method', [
+ 'abs', 'shift', 'pct_change', 'cumsum', 'rank',
+ ])
+ def test_transform_method_name(self, method):
+ # GH 19760
+ df = pd.DataFrame({"A": [-1, 2]})
+ result = df.transform(method)
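+        # operator.methodcaller(method)(df) is equivalent to
+        # getattr(df, method)(), e.g. df.abs() when method == 'abs'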
+ expected = operator.methodcaller(method)(df)
+ tm.assert_frame_equal(result, expected)
+
+ def test_demo(self):
+ # demonstration tests
+ df = pd.DataFrame({'A': range(5), 'B': 5})
+
+ result = df.agg(['min', 'max'])
+ expected = DataFrame({'A': [0, 4], 'B': [5, 5]},
+ columns=['A', 'B'],
+ index=['min', 'max'])
+ tm.assert_frame_equal(result, expected)
+
+ result = df.agg({'A': ['min', 'max'], 'B': ['sum', 'max']})
+ expected = DataFrame({'A': [4.0, 0.0, np.nan],
+ 'B': [5.0, np.nan, 25.0]},
+ columns=['A', 'B'],
+ index=['max', 'min', 'sum'])
+ tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+ def test_agg_multiple_mixed_no_warning(self):
+ # GH 20909
+ mdf = pd.DataFrame({'A': [1, 2, 3],
+ 'B': [1., 2., 3.],
+ 'C': ['foo', 'bar', 'baz'],
+ 'D': pd.date_range('20130101', periods=3)})
+ expected = pd.DataFrame({"A": [1, 6], 'B': [1.0, 6.0],
+ "C": ['bar', 'foobarbaz'],
+ "D": [pd.Timestamp('2013-01-01'), pd.NaT]},
+ index=['min', 'sum'])
+ # sorted index
+ with tm.assert_produces_warning(None):
+ result = mdf.agg(['min', 'sum'])
+
+ tm.assert_frame_equal(result, expected)
+
+ with tm.assert_produces_warning(None):
+ result = mdf[['D', 'C', 'B', 'A']].agg(['sum', 'min'])
+
+ # For backwards compatibility, the result's index is
+ # still sorted by function name, so it's ['min', 'sum']
+ # not ['sum', 'min'].
+ expected = expected[['D', 'C', 'B', 'A']]
+ tm.assert_frame_equal(result, expected)
+
+ def test_agg_dict_nested_renaming_depr(self):
+
+ df = pd.DataFrame({'A': range(5), 'B': 5})
+
+ # nested renaming
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ df.agg({'A': {'foo': 'min'},
+ 'B': {'bar': 'max'}})
+
+ def test_agg_reduce(self, axis, float_frame):
+ other_axis = 1 if axis in {0, 'index'} else 0
+ name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values()
+
+ # all reducers
+ expected = pd.concat([float_frame.mean(axis=axis),
+ float_frame.max(axis=axis),
+ float_frame.sum(axis=axis),
+ ], axis=1)
+ expected.columns = ['mean', 'max', 'sum']
+ expected = expected.T if axis in {0, 'index'} else expected
+
+ result = float_frame.agg(['mean', 'max', 'sum'], axis=axis)
+ assert_frame_equal(result, expected)
+
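+        # df.loc(axis) binds the indexer to an explicit axis, so
+        # float_frame.loc(other_axis)[name1] selects a single label along
+        # the axis that is not being reduced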
+ # dict input with scalars
+ func = OrderedDict([(name1, 'mean'), (name2, 'sum')])
+ result = float_frame.agg(func, axis=axis)
+ expected = Series([float_frame.loc(other_axis)[name1].mean(),
+ float_frame.loc(other_axis)[name2].sum()],
+ index=[name1, name2])
+ assert_series_equal(result, expected)
+
+ # dict input with lists
+ func = OrderedDict([(name1, ['mean']), (name2, ['sum'])])
+ result = float_frame.agg(func, axis=axis)
+ expected = DataFrame({
+ name1: Series([float_frame.loc(other_axis)[name1].mean()],
+ index=['mean']),
+ name2: Series([float_frame.loc(other_axis)[name2].sum()],
+ index=['sum'])})
+ expected = expected.T if axis in {1, 'columns'} else expected
+ assert_frame_equal(result, expected)
+
+ # dict input with lists with multiple
+ func = OrderedDict([(name1, ['mean', 'sum']), (name2, ['sum', 'max'])])
+ result = float_frame.agg(func, axis=axis)
+ expected = DataFrame(OrderedDict([
+ (name1, Series([float_frame.loc(other_axis)[name1].mean(),
+ float_frame.loc(other_axis)[name1].sum()],
+ index=['mean', 'sum'])),
+ (name2, Series([float_frame.loc(other_axis)[name2].sum(),
+ float_frame.loc(other_axis)[name2].max()],
+ index=['sum', 'max'])),
+ ]))
+ expected = expected.T if axis in {1, 'columns'} else expected
+ assert_frame_equal(result, expected)
+
+    def test_nuisance_columns(self):
+
+ # GH 15015
+ df = DataFrame({'A': [1, 2, 3],
+ 'B': [1., 2., 3.],
+ 'C': ['foo', 'bar', 'baz'],
+ 'D': pd.date_range('20130101', periods=3)})
+
+ result = df.agg('min')
+ expected = Series([1, 1., 'bar', pd.Timestamp('20130101')],
+ index=df.columns)
+ assert_series_equal(result, expected)
+
+ result = df.agg(['min'])
+ expected = DataFrame([[1, 1., 'bar', pd.Timestamp('20130101')]],
+ index=['min'], columns=df.columns)
+ assert_frame_equal(result, expected)
+
+ result = df.agg('sum')
+ expected = Series([6, 6., 'foobarbaz'],
+ index=['A', 'B', 'C'])
+ assert_series_equal(result, expected)
+
+ result = df.agg(['sum'])
+ expected = DataFrame([[6, 6., 'foobarbaz']],
+ index=['sum'], columns=['A', 'B', 'C'])
+ assert_frame_equal(result, expected)
+
+ def test_non_callable_aggregates(self):
+
+ # GH 16405
+ # 'size' is a property of frame/series
+ # validate that this is working
+ df = DataFrame({'A': [None, 2, 3],
+ 'B': [1.0, np.nan, 3.0],
+ 'C': ['foo', None, 'bar']})
+
+ # Function aggregate
+ result = df.agg({'A': 'count'})
+ expected = Series({'A': 2})
+
+ assert_series_equal(result, expected)
+
+ # Non-function aggregate
+ result = df.agg({'A': 'size'})
+ expected = Series({'A': 3})
+
+ assert_series_equal(result, expected)
+
+ # Mix function and non-function aggs
+ result1 = df.agg(['count', 'size'])
+ result2 = df.agg({'A': ['count', 'size'],
+ 'B': ['count', 'size'],
+ 'C': ['count', 'size']})
+ expected = pd.DataFrame({'A': {'count': 2, 'size': 3},
+ 'B': {'count': 2, 'size': 3},
+ 'C': {'count': 2, 'size': 3}})
+
+ assert_frame_equal(result1, result2, check_like=True)
+ assert_frame_equal(result2, expected, check_like=True)
+
+        # a bare string function name is the same as calling df.count()
+ result = df.agg('count')
+ expected = df.count()
+
+ assert_series_equal(result, expected)
+
+        # a bare string attribute name is the same as accessing df.size
+ result = df.agg('size')
+ expected = df.size
+
+ assert result == expected
+
+ @pytest.mark.parametrize("df, func, expected", chain(
+ _get_cython_table_params(
+ DataFrame(), [
+ ('sum', Series()),
+ ('max', Series()),
+ ('min', Series()),
+ ('all', Series(dtype=bool)),
+ ('any', Series(dtype=bool)),
+ ('mean', Series()),
+ ('prod', Series()),
+ ('std', Series()),
+ ('var', Series()),
+ ('median', Series()),
+ ]),
+ _get_cython_table_params(
+ DataFrame([[np.nan, 1], [1, 2]]), [
+ ('sum', Series([1., 3])),
+ ('max', Series([1., 2])),
+ ('min', Series([1., 1])),
+ ('all', Series([True, True])),
+ ('any', Series([True, True])),
+ ('mean', Series([1, 1.5])),
+ ('prod', Series([1., 2])),
+ ('std', Series([np.nan, 0.707107])),
+ ('var', Series([np.nan, 0.5])),
+ ('median', Series([1, 1.5])),
+ ]),
+ ))
+ def test_agg_cython_table(self, df, func, expected, axis):
+ # GH 21224
+ # test reducing functions in
+ # pandas.core.base.SelectionMixin._cython_table
+ result = df.agg(func, axis=axis)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("df, func, expected", chain(
+ _get_cython_table_params(
+ DataFrame(), [
+ ('cumprod', DataFrame()),
+ ('cumsum', DataFrame()),
+ ]),
+ _get_cython_table_params(
+ DataFrame([[np.nan, 1], [1, 2]]), [
+ ('cumprod', DataFrame([[np.nan, 1], [1., 2.]])),
+ ('cumsum', DataFrame([[np.nan, 1], [1., 3.]])),
+ ]),
+ ))
+ def test_agg_cython_table_transform(self, df, func, expected, axis):
+ # GH 21224
+ # test transforming functions in
+ # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
+ result = df.agg(func, axis=axis)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("df, func, expected", _get_cython_table_params(
+ DataFrame([['a', 'b'], ['b', 'a']]), [
+ ['cumprod', TypeError],
+ ]),
+ )
+ def test_agg_cython_table_raises(self, df, func, expected, axis):
+ # GH 21224
+ with pytest.raises(expected):
+ df.agg(func, axis=axis)
+
+ @pytest.mark.parametrize("num_cols", [2, 3, 5])
+ def test_frequency_is_original(self, num_cols):
+ # GH 22150
+ index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"])
+ original = index.copy()
+ df = DataFrame(1, index=index, columns=range(num_cols))
+ df.apply(lambda x: x)
+ assert index.freq == original.freq
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_arithmetic.py b/contrib/python/pandas/py2/pandas/tests/frame/test_arithmetic.py
new file mode 100644
index 00000000000..f14ecae4487
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_arithmetic.py
@@ -0,0 +1,636 @@
+# -*- coding: utf-8 -*-
+from collections import deque
+from datetime import datetime
+import operator
+
+import numpy as np
+import pytest
+
+from pandas.compat import range
+
+import pandas as pd
+from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int
+import pandas.util.testing as tm
+
+# -------------------------------------------------------------------
+# Comparisons
+
+
+class TestFrameComparisons(object):
+ # Specifically _not_ flex-comparisons
+
+ def test_comparison_invalid(self):
+
+ def check(df, df2):
+
+ for (x, y) in [(df, df2), (df2, df)]:
+ # we expect the result to match Series comparisons for
+ # == and !=, inequalities should raise
+ result = x == y
+ expected = pd.DataFrame({col: x[col] == y[col]
+ for col in x.columns},
+ index=x.index, columns=x.columns)
+ tm.assert_frame_equal(result, expected)
+
+ result = x != y
+ expected = pd.DataFrame({col: x[col] != y[col]
+ for col in x.columns},
+ index=x.index, columns=x.columns)
+ tm.assert_frame_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ x >= y
+ with pytest.raises(TypeError):
+ x > y
+ with pytest.raises(TypeError):
+ x < y
+ with pytest.raises(TypeError):
+ x <= y
+
+ # GH4968
+ # invalid date/int comparisons
+ df = pd.DataFrame(np.random.randint(10, size=(10, 1)), columns=['a'])
+ df['dates'] = pd.date_range('20010101', periods=len(df))
+
+ df2 = df.copy()
+ df2['dates'] = df['a']
+ check(df, df2)
+
+ df = pd.DataFrame(np.random.randint(10, size=(10, 2)),
+ columns=['a', 'b'])
+ df2 = pd.DataFrame({'a': pd.date_range('20010101', periods=len(df)),
+ 'b': pd.date_range('20100101', periods=len(df))})
+ check(df, df2)
+
+ def test_timestamp_compare(self):
+ # make sure we can compare Timestamps on the right AND left hand side
+ # GH#4982
+        df = pd.DataFrame({'dates1': pd.date_range('20010101', periods=10),
+ 'dates2': pd.date_range('20010102', periods=10),
+ 'intcol': np.random.randint(1000000000, size=10),
+ 'floatcol': np.random.randn(10),
+ 'stringcol': list(tm.rands(10))})
+ df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT
+ ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq',
+ 'ne': 'ne'}
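+        # each key maps to its reflected op: (ts < df) must agree with
+        # (df > ts), and so on for each pair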
+
+ for left, right in ops.items():
+ left_f = getattr(operator, left)
+ right_f = getattr(operator, right)
+
+ # no nats
+ if left in ['eq', 'ne']:
+ expected = left_f(df, pd.Timestamp('20010109'))
+ result = right_f(pd.Timestamp('20010109'), df)
+ tm.assert_frame_equal(result, expected)
+ else:
+ with pytest.raises(TypeError):
+ left_f(df, pd.Timestamp('20010109'))
+ with pytest.raises(TypeError):
+ right_f(pd.Timestamp('20010109'), df)
+ # nats
+ expected = left_f(df, pd.Timestamp('nat'))
+ result = right_f(pd.Timestamp('nat'), df)
+ tm.assert_frame_equal(result, expected)
+
+ def test_mixed_comparison(self):
+        # GH#13128, GH#22163: comparing datetime64 vs non-dt64 should give
+        # False for == and True for != rather than raising TypeError
+        # (this appears to have been fixed before GH#22163, not sure when)
+ df = pd.DataFrame([['1989-08-01', 1], ['1989-08-01', 2]])
+ other = pd.DataFrame([['a', 'b'], ['c', 'd']])
+
+ result = df == other
+ assert not result.any().any()
+
+ result = df != other
+ assert result.all().all()
+
+ def test_df_boolean_comparison_error(self):
+ # GH#4576, GH#22880
+ # comparing DataFrame against list/tuple with len(obj) matching
+ # len(df.columns) is supported as of GH#22800
+ df = pd.DataFrame(np.arange(6).reshape((3, 2)))
+
+ expected = pd.DataFrame([[False, False],
+ [True, False],
+ [False, False]])
+
+ result = df == (2, 2)
+ tm.assert_frame_equal(result, expected)
+
+ result = df == [2, 2]
+ tm.assert_frame_equal(result, expected)
+
+ def test_df_float_none_comparison(self):
+ df = pd.DataFrame(np.random.randn(8, 3), index=range(8),
+ columns=['A', 'B', 'C'])
+
+ result = df.__eq__(None)
+ assert not result.any().any()
+
+ def test_df_string_comparison(self):
+ df = pd.DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}])
+ mask_a = df.a > 1
+ tm.assert_frame_equal(df[mask_a], df.loc[1:1, :])
+ tm.assert_frame_equal(df[-mask_a], df.loc[0:0, :])
+
+ mask_b = df.b == "foo"
+ tm.assert_frame_equal(df[mask_b], df.loc[0:0, :])
+ tm.assert_frame_equal(df[-mask_b], df.loc[1:1, :])
+
+
+class TestFrameFlexComparisons(object):
+ # TODO: test_bool_flex_frame needs a better name
+ def test_bool_flex_frame(self):
+ data = np.random.randn(5, 3)
+ other_data = np.random.randn(5, 3)
+ df = pd.DataFrame(data)
+ other = pd.DataFrame(other_data)
+ ndim_5 = np.ones(df.shape + (1, 3))
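+        # a higher-dimensional array cannot be aligned with a DataFrame,
+        # so the flex comparison methods are expected to raise below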
+
+ # Unaligned
+ def _check_unaligned_frame(meth, op, df, other):
+ part_o = other.loc[3:, 1:].copy()
+ rs = meth(part_o)
+ xp = op(df, part_o.reindex(index=df.index, columns=df.columns))
+ tm.assert_frame_equal(rs, xp)
+
+ # DataFrame
+ assert df.eq(df).values.all()
+ assert not df.ne(df).values.any()
+ for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']:
+ f = getattr(df, op)
+ o = getattr(operator, op)
+ # No NAs
+ tm.assert_frame_equal(f(other), o(df, other))
+ _check_unaligned_frame(f, o, df, other)
+ # ndarray
+ tm.assert_frame_equal(f(other.values), o(df, other.values))
+ # scalar
+ tm.assert_frame_equal(f(0), o(df, 0))
+ # NAs
+ msg = "Unable to coerce to Series/DataFrame"
+ tm.assert_frame_equal(f(np.nan), o(df, np.nan))
+ with pytest.raises(ValueError, match=msg):
+ f(ndim_5)
+
+ # Series
+ def _test_seq(df, idx_ser, col_ser):
+ idx_eq = df.eq(idx_ser, axis=0)
+ col_eq = df.eq(col_ser)
+ idx_ne = df.ne(idx_ser, axis=0)
+ col_ne = df.ne(col_ser)
+ tm.assert_frame_equal(col_eq, df == pd.Series(col_ser))
+ tm.assert_frame_equal(col_eq, -col_ne)
+ tm.assert_frame_equal(idx_eq, -idx_ne)
+ tm.assert_frame_equal(idx_eq, df.T.eq(idx_ser).T)
+ tm.assert_frame_equal(col_eq, df.eq(list(col_ser)))
+ tm.assert_frame_equal(idx_eq, df.eq(pd.Series(idx_ser), axis=0))
+ tm.assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0))
+
+ idx_gt = df.gt(idx_ser, axis=0)
+ col_gt = df.gt(col_ser)
+ idx_le = df.le(idx_ser, axis=0)
+ col_le = df.le(col_ser)
+
+ tm.assert_frame_equal(col_gt, df > pd.Series(col_ser))
+ tm.assert_frame_equal(col_gt, -col_le)
+ tm.assert_frame_equal(idx_gt, -idx_le)
+ tm.assert_frame_equal(idx_gt, df.T.gt(idx_ser).T)
+
+ idx_ge = df.ge(idx_ser, axis=0)
+ col_ge = df.ge(col_ser)
+ idx_lt = df.lt(idx_ser, axis=0)
+ col_lt = df.lt(col_ser)
+ tm.assert_frame_equal(col_ge, df >= pd.Series(col_ser))
+ tm.assert_frame_equal(col_ge, -col_lt)
+ tm.assert_frame_equal(idx_ge, -idx_lt)
+ tm.assert_frame_equal(idx_ge, df.T.ge(idx_ser).T)
+
+ idx_ser = pd.Series(np.random.randn(5))
+ col_ser = pd.Series(np.random.randn(3))
+ _test_seq(df, idx_ser, col_ser)
+
+ # list/tuple
+ _test_seq(df, idx_ser.values, col_ser.values)
+
+ # NA
+ df.loc[0, 0] = np.nan
+ rs = df.eq(df)
+ assert not rs.loc[0, 0]
+ rs = df.ne(df)
+ assert rs.loc[0, 0]
+ rs = df.gt(df)
+ assert not rs.loc[0, 0]
+ rs = df.lt(df)
+ assert not rs.loc[0, 0]
+ rs = df.ge(df)
+ assert not rs.loc[0, 0]
+ rs = df.le(df)
+ assert not rs.loc[0, 0]
+
+ # complex
+ arr = np.array([np.nan, 1, 6, np.nan])
+ arr2 = np.array([2j, np.nan, 7, None])
+ df = pd.DataFrame({'a': arr})
+ df2 = pd.DataFrame({'a': arr2})
+ rs = df.gt(df2)
+ assert not rs.values.any()
+ rs = df.ne(df2)
+ assert rs.values.all()
+
+ arr3 = np.array([2j, np.nan, None])
+ df3 = pd.DataFrame({'a': arr3})
+ rs = df3.gt(2j)
+ assert not rs.values.any()
+
+ # corner, dtype=object
+ df1 = pd.DataFrame({'col': ['foo', np.nan, 'bar']})
+ df2 = pd.DataFrame({'col': ['foo', datetime.now(), 'bar']})
+ result = df1.ne(df2)
+ exp = pd.DataFrame({'col': [False, True, False]})
+ tm.assert_frame_equal(result, exp)
+
+ def test_flex_comparison_nat(self):
+ # GH 15697, GH 22163 df.eq(pd.NaT) should behave like df == pd.NaT,
+ # and _definitely_ not be NaN
+ df = pd.DataFrame([pd.NaT])
+
+ result = df == pd.NaT
+ # result.iloc[0, 0] is a np.bool_ object
+ assert result.iloc[0, 0].item() is False
+
+ result = df.eq(pd.NaT)
+ assert result.iloc[0, 0].item() is False
+
+ result = df != pd.NaT
+ assert result.iloc[0, 0].item() is True
+
+ result = df.ne(pd.NaT)
+ assert result.iloc[0, 0].item() is True
+
+ @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
+ def test_df_flex_cmp_constant_return_types(self, opname):
+ # GH 15077, non-empty DataFrame
+ df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
+ const = 2
+
+ result = getattr(df, opname)(const).get_dtype_counts()
+ tm.assert_series_equal(result, pd.Series([2], ['bool']))
+
+ @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
+ def test_df_flex_cmp_constant_return_types_empty(self, opname):
+ # GH 15077 empty DataFrame
+ df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
+ const = 2
+
+ empty = df.iloc[:0]
+ result = getattr(empty, opname)(const).get_dtype_counts()
+ tm.assert_series_equal(result, pd.Series([2], ['bool']))
+
+
+# -------------------------------------------------------------------
+# Arithmetic
+
+class TestFrameFlexArithmetic(object):
+
+ def test_df_add_td64_columnwise(self):
+ # GH 22534 Check that column-wise addition broadcasts correctly
+ dti = pd.date_range('2016-01-01', periods=10)
+ tdi = pd.timedelta_range('1', periods=10)
+ tser = pd.Series(tdi)
+ df = pd.DataFrame({0: dti, 1: tdi})
+
+ result = df.add(tser, axis=0)
+ expected = pd.DataFrame({0: dti + tdi,
+ 1: tdi + tdi})
+ tm.assert_frame_equal(result, expected)
+
+ def test_df_add_flex_filled_mixed_dtypes(self):
+ # GH 19611
+ dti = pd.date_range('2016-01-01', periods=3)
+ ser = pd.Series(['1 Day', 'NaT', '2 Days'], dtype='timedelta64[ns]')
+ df = pd.DataFrame({'A': dti, 'B': ser})
+ other = pd.DataFrame({'A': ser, 'B': ser})
+ fill = pd.Timedelta(days=1).to_timedelta64()
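+        # fill_value applies where exactly one operand is missing: the NaT
+        # in other['A'] is filled with one day, while B (NaT on both sides)
+        # keeps its NaT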
+ result = df.add(other, fill_value=fill)
+
+ expected = pd.DataFrame(
+ {'A': pd.Series(['2016-01-02', '2016-01-03', '2016-01-05'],
+ dtype='datetime64[ns]'),
+ 'B': ser * 2})
+ tm.assert_frame_equal(result, expected)
+
+ def test_arith_flex_frame(self, all_arithmetic_operators, float_frame,
+ mixed_float_frame):
+ # one instance of parametrized fixture
+ op = all_arithmetic_operators
+
+ def f(x, y):
+            # reflected versions are not in the operator module; strip the
+            # "r" and swap the operands
+ if op.startswith('__r'):
+ return getattr(operator, op.replace('__r', '__'))(y, x)
+ return getattr(operator, op)(x, y)
+
+ result = getattr(float_frame, op)(2 * float_frame)
+ expected = f(float_frame, 2 * float_frame)
+ tm.assert_frame_equal(result, expected)
+
+ # vs mix float
+ result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
+ expected = f(mixed_float_frame, 2 * mixed_float_frame)
+ tm.assert_frame_equal(result, expected)
+ _check_mixed_float(result, dtype=dict(C=None))
+
+ @pytest.mark.parametrize('op', ['__add__', '__sub__', '__mul__'])
+ def test_arith_flex_frame_mixed(self, op, int_frame, mixed_int_frame,
+ mixed_float_frame):
+ f = getattr(operator, op)
+
+ # vs mix int
+ result = getattr(mixed_int_frame, op)(2 + mixed_int_frame)
+ expected = f(mixed_int_frame, 2 + mixed_int_frame)
+
+ # no overflow in the uint
+ dtype = None
+ if op in ['__sub__']:
+ dtype = dict(B='uint64', C=None)
+ elif op in ['__add__', '__mul__']:
+ dtype = dict(C=None)
+ tm.assert_frame_equal(result, expected)
+ _check_mixed_int(result, dtype=dtype)
+
+ # vs mix float
+ result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
+ expected = f(mixed_float_frame, 2 * mixed_float_frame)
+ tm.assert_frame_equal(result, expected)
+ _check_mixed_float(result, dtype=dict(C=None))
+
+ # vs plain int
+ result = getattr(int_frame, op)(2 * int_frame)
+ expected = f(int_frame, 2 * int_frame)
+ tm.assert_frame_equal(result, expected)
+
+ def test_arith_flex_frame_raise(self, all_arithmetic_operators,
+ float_frame):
+ # one instance of parametrized fixture
+ op = all_arithmetic_operators
+
+ # Check that arrays with dim >= 3 raise
+ for dim in range(3, 6):
+ arr = np.ones((1,) * dim)
+ msg = "Unable to coerce to Series/DataFrame"
+ with pytest.raises(ValueError, match=msg):
+ getattr(float_frame, op)(arr)
+
+ def test_arith_flex_frame_corner(self, float_frame):
+
+ const_add = float_frame.add(1)
+ tm.assert_frame_equal(const_add, float_frame + 1)
+
+ # corner cases
+ result = float_frame.add(float_frame[:0])
+ tm.assert_frame_equal(result, float_frame * np.nan)
+
+ result = float_frame[:0].add(float_frame)
+ tm.assert_frame_equal(result, float_frame * np.nan)
+
+ with pytest.raises(NotImplementedError, match='fill_value'):
+ float_frame.add(float_frame.iloc[0], fill_value=3)
+
+ with pytest.raises(NotImplementedError, match='fill_value'):
+ float_frame.add(float_frame.iloc[0], axis='index', fill_value=3)
+
+ def test_arith_flex_series(self, simple_frame):
+ df = simple_frame
+
+ row = df.xs('a')
+ col = df['two']
+ # after arithmetic refactor, add truediv here
+ ops = ['add', 'sub', 'mul', 'mod']
+ for op in ops:
+ f = getattr(df, op)
+ op = getattr(operator, op)
+ tm.assert_frame_equal(f(row), op(df, row))
+ tm.assert_frame_equal(f(col, axis=0), op(df.T, col).T)
+
+ # special case for some reason
+ tm.assert_frame_equal(df.add(row, axis=None), df + row)
+
+ # cases which will be refactored after big arithmetic refactor
+ tm.assert_frame_equal(df.div(row), df / row)
+ tm.assert_frame_equal(df.div(col, axis=0), (df.T / col).T)
+
+ # broadcasting issue in GH 7325
+ df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='int64')
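+        # dividing row-wise by column 0 ([0, 2, 4]): 0/0 -> NaN and
+        # 1/0 -> inf in the first row, then [1.0, 1.5] and [1.0, 1.25]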
+ expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
+ result = df.div(df[0], axis='index')
+ tm.assert_frame_equal(result, expected)
+
+ df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='float64')
+ expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
+ result = df.div(df[0], axis='index')
+ tm.assert_frame_equal(result, expected)
+
+ def test_arith_flex_zero_len_raises(self):
+ # GH 19522 passing fill_value to frame flex arith methods should
+ # raise even in the zero-length special cases
+ ser_len0 = pd.Series([])
+ df_len0 = pd.DataFrame([], columns=['A', 'B'])
+ df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
+
+ with pytest.raises(NotImplementedError, match='fill_value'):
+ df.add(ser_len0, fill_value='E')
+
+ with pytest.raises(NotImplementedError, match='fill_value'):
+ df_len0.sub(df['A'], axis=None, fill_value=3)
+
+
+class TestFrameArithmetic(object):
+ def test_df_add_2d_array_rowlike_broadcasts(self):
+ # GH#23000
+ arr = np.arange(6).reshape(3, 2)
+ df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
+
+ rowlike = arr[[1], :] # shape --> (1, ncols)
+ assert rowlike.shape == (1, df.shape[1])
+
+ expected = pd.DataFrame([[2, 4],
+ [4, 6],
+ [6, 8]],
+ columns=df.columns, index=df.index,
+ # specify dtype explicitly to avoid failing
+ # on 32bit builds
+ dtype=arr.dtype)
+ result = df + rowlike
+ tm.assert_frame_equal(result, expected)
+ result = rowlike + df
+ tm.assert_frame_equal(result, expected)
+
+ def test_df_add_2d_array_collike_broadcasts(self):
+ # GH#23000
+ arr = np.arange(6).reshape(3, 2)
+ df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
+
+ collike = arr[:, [1]] # shape --> (nrows, 1)
+ assert collike.shape == (df.shape[0], 1)
+
+ expected = pd.DataFrame([[1, 2],
+ [5, 6],
+ [9, 10]],
+ columns=df.columns, index=df.index,
+ # specify dtype explicitly to avoid failing
+ # on 32bit builds
+ dtype=arr.dtype)
+ result = df + collike
+ tm.assert_frame_equal(result, expected)
+ result = collike + df
+ tm.assert_frame_equal(result, expected)
+
+ def test_df_arith_2d_array_rowlike_broadcasts(self,
+ all_arithmetic_operators):
+ # GH#23000
+ opname = all_arithmetic_operators
+
+ arr = np.arange(6).reshape(3, 2)
+ df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
+
+ rowlike = arr[[1], :] # shape --> (1, ncols)
+ assert rowlike.shape == (1, df.shape[1])
+
+ exvals = [getattr(df.loc['A'], opname)(rowlike.squeeze()),
+ getattr(df.loc['B'], opname)(rowlike.squeeze()),
+ getattr(df.loc['C'], opname)(rowlike.squeeze())]
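+        # build the expected frame by applying the op row by row against
+        # the squeezed 1-D values, mirroring numpy broadcasting of a
+        # (1, ncols) operand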
+
+ expected = pd.DataFrame(exvals, columns=df.columns, index=df.index)
+
+ if opname in ['__rmod__', '__rfloordiv__']:
+ # exvals will have dtypes [f8, i8, i8] so expected will be
+ # all-f8, but the DataFrame operation will return mixed dtypes
+ # use exvals[-1].dtype instead of "i8" for compat with 32-bit
+ # systems/pythons
+ expected[False] = expected[False].astype(exvals[-1].dtype)
+
+ result = getattr(df, opname)(rowlike)
+ tm.assert_frame_equal(result, expected)
+
+ def test_df_arith_2d_array_collike_broadcasts(self,
+ all_arithmetic_operators):
+ # GH#23000
+ opname = all_arithmetic_operators
+
+ arr = np.arange(6).reshape(3, 2)
+ df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
+
+ collike = arr[:, [1]] # shape --> (nrows, 1)
+ assert collike.shape == (df.shape[0], 1)
+
+ exvals = {True: getattr(df[True], opname)(collike.squeeze()),
+ False: getattr(df[False], opname)(collike.squeeze())}
+
+ dtype = None
+ if opname in ['__rmod__', '__rfloordiv__']:
+ # Series ops may return mixed int/float dtypes in cases where
+ # DataFrame op will return all-float. So we upcast `expected`
+ dtype = np.common_type(*[x.values for x in exvals.values()])
+
+ expected = pd.DataFrame(exvals, columns=df.columns, index=df.index,
+ dtype=dtype)
+
+ result = getattr(df, opname)(collike)
+ tm.assert_frame_equal(result, expected)
+
+ def test_df_bool_mul_int(self):
+ # GH 22047, GH 22163 multiplication by 1 should result in int dtype,
+ # not object dtype
+ df = pd.DataFrame([[False, True], [False, False]])
+ result = df * 1
+
+ # On appveyor this comes back as np.int32 instead of np.int64,
+ # so we check dtype.kind instead of just dtype
+ kinds = result.dtypes.apply(lambda x: x.kind)
+ assert (kinds == 'i').all()
+
+ result = 1 * df
+ kinds = result.dtypes.apply(lambda x: x.kind)
+ assert (kinds == 'i').all()
+
+ def test_arith_mixed(self):
+
+ left = pd.DataFrame({'A': ['a', 'b', 'c'],
+ 'B': [1, 2, 3]})
+
+ result = left + left
+ expected = pd.DataFrame({'A': ['aa', 'bb', 'cc'],
+ 'B': [2, 4, 6]})
+ tm.assert_frame_equal(result, expected)
+
+ def test_arith_getitem_commute(self):
+ df = pd.DataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]})
+
+ def _test_op(df, op):
+ result = op(df, 1)
+
+ if not df.columns.is_unique:
+ raise ValueError("Only unique columns supported by this test")
+
+ for col in result.columns:
+ tm.assert_series_equal(result[col], op(df[col], 1))
+
+ _test_op(df, operator.add)
+ _test_op(df, operator.sub)
+ _test_op(df, operator.mul)
+ _test_op(df, operator.truediv)
+ _test_op(df, operator.floordiv)
+ _test_op(df, operator.pow)
+
+ _test_op(df, lambda x, y: y + x)
+ _test_op(df, lambda x, y: y - x)
+ _test_op(df, lambda x, y: y * x)
+ _test_op(df, lambda x, y: y / x)
+ _test_op(df, lambda x, y: y ** x)
+
+ _test_op(df, lambda x, y: x + y)
+ _test_op(df, lambda x, y: x - y)
+ _test_op(df, lambda x, y: x * y)
+ _test_op(df, lambda x, y: x / y)
+ _test_op(df, lambda x, y: x ** y)
+
+ @pytest.mark.parametrize('values', [[1, 2], (1, 2), np.array([1, 2]),
+ range(1, 3), deque([1, 2])])
+ def test_arith_alignment_non_pandas_object(self, values):
+ # GH#17901
+ df = pd.DataFrame({'A': [1, 1], 'B': [1, 1]})
+ expected = pd.DataFrame({'A': [2, 2], 'B': [3, 3]})
+ result = df + values
+ tm.assert_frame_equal(result, expected)
+
+ def test_arith_non_pandas_object(self):
+ df = pd.DataFrame(np.arange(1, 10, dtype='f8').reshape(3, 3),
+ columns=['one', 'two', 'three'],
+ index=['a', 'b', 'c'])
+
+ val1 = df.xs('a').values
+ added = pd.DataFrame(df.values + val1,
+ index=df.index, columns=df.columns)
+ tm.assert_frame_equal(df + val1, added)
+
+ added = pd.DataFrame((df.values.T + val1).T,
+ index=df.index, columns=df.columns)
+ tm.assert_frame_equal(df.add(val1, axis=0), added)
+
+ val2 = list(df['two'])
+
+ added = pd.DataFrame(df.values + val2,
+ index=df.index, columns=df.columns)
+ tm.assert_frame_equal(df + val2, added)
+
+ added = pd.DataFrame((df.values.T + val2).T, index=df.index,
+ columns=df.columns)
+ tm.assert_frame_equal(df.add(val2, axis='index'), added)
+
+ val3 = np.random.rand(*df.shape)
+ added = pd.DataFrame(df.values + val3,
+ index=df.index, columns=df.columns)
+ tm.assert_frame_equal(df.add(val3), added)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_asof.py b/contrib/python/pandas/py2/pandas/tests/frame/test_asof.py
new file mode 100644
index 00000000000..0947e6f252d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_asof.py
@@ -0,0 +1,126 @@
+# coding=utf-8
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Series, Timestamp, date_range, to_datetime
+import pandas.util.testing as tm
+
+from .common import TestData
+
+
+class TestFrameAsof(TestData):
+ def setup_method(self, method):
+ self.N = N = 50
+ self.rng = date_range('1/1/1990', periods=N, freq='53s')
+ self.df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
+ index=self.rng)
+
+ def test_basic(self):
+ df = self.df.copy()
+ df.loc[15:30, 'A'] = np.nan
+ dates = date_range('1/1/1990', periods=self.N * 3,
+ freq='25s')
+
+ result = df.asof(dates)
+ assert result.notna().all(1).all()
+ lb = df.index[14]
+ ub = df.index[30]
+
+ dates = list(dates)
+ result = df.asof(dates)
+ assert result.notna().all(1).all()
+
+ mask = (result.index >= lb) & (result.index < ub)
+ rs = result[mask]
+ assert (rs == 14).all(1).all()
+
+ def test_subset(self):
+ N = 10
+ rng = date_range('1/1/1990', periods=N, freq='53s')
+ df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
+ index=rng)
+ df.loc[4:8, 'A'] = np.nan
+ dates = date_range('1/1/1990', periods=N * 3,
+ freq='25s')
+
+ # with a subset of A should be the same
+ result = df.asof(dates, subset='A')
+ expected = df.asof(dates)
+ tm.assert_frame_equal(result, expected)
+
+ # same with A/B
+ result = df.asof(dates, subset=['A', 'B'])
+ expected = df.asof(dates)
+ tm.assert_frame_equal(result, expected)
+
+ # B gives self.df.asof
+ result = df.asof(dates, subset='B')
+ expected = df.resample('25s', closed='right').ffill().reindex(dates)
+ expected.iloc[20:] = 9
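+        # B has no NaNs, so asof reduces to forward-filling the 53s series
+        # onto the 25s grid; beyond the last timestamp the value stays at
+        # the final row (9)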
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_missing(self):
+ # GH 15118
+ # no match found - `where` value before earliest date in index
+ N = 10
+ rng = date_range('1/1/1990', periods=N, freq='53s')
+ df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
+ index=rng)
+ result = df.asof('1989-12-31')
+
+ expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31'))
+ tm.assert_series_equal(result, expected)
+
+ result = df.asof(to_datetime(['1989-12-31']))
+ expected = DataFrame(index=to_datetime(['1989-12-31']),
+ columns=['A', 'B'], dtype='float64')
+ tm.assert_frame_equal(result, expected)
+
+ def test_all_nans(self):
+ # GH 15713
+ # DataFrame is all nans
+ result = DataFrame([np.nan]).asof([0])
+ expected = DataFrame([np.nan])
+ tm.assert_frame_equal(result, expected)
+
+ # testing non-default indexes, multiple inputs
+ dates = date_range('1/1/1990', periods=self.N * 3, freq='25s')
+ result = DataFrame(np.nan, index=self.rng, columns=['A']).asof(dates)
+ expected = DataFrame(np.nan, index=dates, columns=['A'])
+ tm.assert_frame_equal(result, expected)
+
+ # testing multiple columns
+ dates = date_range('1/1/1990', periods=self.N * 3, freq='25s')
+ result = DataFrame(np.nan, index=self.rng,
+ columns=['A', 'B', 'C']).asof(dates)
+ expected = DataFrame(np.nan, index=dates, columns=['A', 'B', 'C'])
+ tm.assert_frame_equal(result, expected)
+
+ # testing scalar input
+ result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof([3])
+ expected = DataFrame(np.nan, index=[3], columns=['A', 'B'])
+ tm.assert_frame_equal(result, expected)
+
+ result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof(3)
+ expected = Series(np.nan, index=['A', 'B'], name=3)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "stamp,expected",
+ [(Timestamp('2018-01-01 23:22:43.325+00:00'),
+ Series(2.0, name=Timestamp('2018-01-01 23:22:43.325+00:00'))),
+ (Timestamp('2018-01-01 22:33:20.682+01:00'),
+ Series(1.0, name=Timestamp('2018-01-01 22:33:20.682+01:00'))),
+ ]
+ )
+ def test_time_zone_aware_index(self, stamp, expected):
+ # GH21194
+ # Testing awareness of DataFrame index considering different
+ # UTC and timezone
+ df = DataFrame(data=[1, 2],
+ index=[Timestamp('2018-01-01 21:00:05.001+00:00'),
+ Timestamp('2018-01-01 22:35:10.550+00:00')])
+ result = df.asof(stamp)
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_axis_select_reindex.py b/contrib/python/pandas/py2/pandas/tests/frame/test_axis_select_reindex.py
new file mode 100644
index 00000000000..dea925dcde6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_axis_select_reindex.py
@@ -0,0 +1,1159 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange, lzip, u
+from pandas.errors import PerformanceWarning
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Index, MultiIndex, Series, compat, date_range,
+ isna)
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal
+
+
+class TestDataFrameSelectReindex(TestData):
+ # These are specific reindex-based tests; other indexing tests should go in
+ # test_indexing
+
+ def test_drop_names(self):
+ df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
+ index=['a', 'b', 'c'],
+ columns=['d', 'e', 'f'])
+ df.index.name, df.columns.name = 'first', 'second'
+ df_dropped_b = df.drop('b')
+ df_dropped_e = df.drop('e', axis=1)
+ df_inplace_b, df_inplace_e = df.copy(), df.copy()
+ df_inplace_b.drop('b', inplace=True)
+ df_inplace_e.drop('e', axis=1, inplace=True)
+ for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e):
+ assert obj.index.name == 'first'
+ assert obj.columns.name == 'second'
+ assert list(df.columns) == ['d', 'e', 'f']
+
+ pytest.raises(KeyError, df.drop, ['g'])
+ pytest.raises(KeyError, df.drop, ['g'], 1)
+
+ # errors = 'ignore'
+ dropped = df.drop(['g'], errors='ignore')
+ expected = Index(['a', 'b', 'c'], name='first')
+ tm.assert_index_equal(dropped.index, expected)
+
+ dropped = df.drop(['b', 'g'], errors='ignore')
+ expected = Index(['a', 'c'], name='first')
+ tm.assert_index_equal(dropped.index, expected)
+
+ dropped = df.drop(['g'], axis=1, errors='ignore')
+ expected = Index(['d', 'e', 'f'], name='second')
+ tm.assert_index_equal(dropped.columns, expected)
+
+ dropped = df.drop(['d', 'g'], axis=1, errors='ignore')
+ expected = Index(['e', 'f'], name='second')
+ tm.assert_index_equal(dropped.columns, expected)
+
+ # GH 16398
+ dropped = df.drop([], errors='ignore')
+ expected = Index(['a', 'b', 'c'], name='first')
+ tm.assert_index_equal(dropped.index, expected)
+
+ def test_drop_col_still_multiindex(self):
+ arrays = [['a', 'b', 'c', 'top'],
+ ['', '', '', 'OD'],
+ ['', '', '', 'wx']]
+
+ tuples = sorted(zip(*arrays))
+ index = MultiIndex.from_tuples(tuples)
+
+ df = DataFrame(np.random.randn(3, 4), columns=index)
+ del df[('a', '', '')]
+        assert isinstance(df.columns, MultiIndex)
+
+ def test_drop(self):
+ simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]})
+ assert_frame_equal(simple.drop("A", axis=1), simple[['B']])
+ assert_frame_equal(simple.drop(["A", "B"], axis='columns'),
+ simple[[]])
+ assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
+ assert_frame_equal(simple.drop(
+ [0, 3], axis='index'), simple.loc[[1, 2], :])
+
+ pytest.raises(KeyError, simple.drop, 5)
+ pytest.raises(KeyError, simple.drop, 'C', 1)
+ pytest.raises(KeyError, simple.drop, [1, 5])
+ pytest.raises(KeyError, simple.drop, ['A', 'C'], 1)
+
+ # errors = 'ignore'
+ assert_frame_equal(simple.drop(5, errors='ignore'), simple)
+ assert_frame_equal(simple.drop([0, 5], errors='ignore'),
+ simple.loc[[1, 2, 3], :])
+ assert_frame_equal(simple.drop('C', axis=1, errors='ignore'), simple)
+ assert_frame_equal(simple.drop(['A', 'C'], axis=1, errors='ignore'),
+ simple[['B']])
+
+ # non-unique - wheee!
+ nu_df = DataFrame(lzip(range(3), range(-3, 1), list('abc')),
+ columns=['a', 'a', 'b'])
+ assert_frame_equal(nu_df.drop('a', axis=1), nu_df[['b']])
+ assert_frame_equal(nu_df.drop('b', axis='columns'), nu_df['a'])
+ assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398
+
+ nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X']))
+ nu_df.columns = list('abc')
+ assert_frame_equal(nu_df.drop('X', axis='rows'), nu_df.loc[["Y"], :])
+ assert_frame_equal(nu_df.drop(['X', 'Y'], axis=0), nu_df.loc[[], :])
+
+ # inplace cache issue
+ # GH 5628
+ df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc'))
+ expected = df[~(df.b > 0)]
+ df.drop(labels=df[df.b > 0].index, inplace=True)
+ assert_frame_equal(df, expected)
+
+ def test_drop_multiindex_not_lexsorted(self):
+ # GH 11640
+
+ # define the lexsorted version
+ lexsorted_mi = MultiIndex.from_tuples(
+ [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
+ lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
+ assert lexsorted_df.columns.is_lexsorted()
+
+ # define the non-lexsorted version
+ not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
+ data=[[1, 'b1', 'c1', 3],
+ [1, 'b2', 'c2', 4]])
+ not_lexsorted_df = not_lexsorted_df.pivot_table(
+ index='a', columns=['b', 'c'], values='d')
+ not_lexsorted_df = not_lexsorted_df.reset_index()
+ assert not not_lexsorted_df.columns.is_lexsorted()
+
+ # compare the results
+ tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)
+
+ expected = lexsorted_df.drop('a', axis=1)
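+        # dropping from a MultiIndex that is not lexsorted cannot use the
+        # fast sorted path, which pandas flags with a PerformanceWarning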
+ with tm.assert_produces_warning(PerformanceWarning):
+ result = not_lexsorted_df.drop('a', axis=1)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_drop_api_equivalence(self):
+ # equivalence of the labels/axis and index/columns API's (GH12392)
+ df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
+ index=['a', 'b', 'c'],
+ columns=['d', 'e', 'f'])
+
+ res1 = df.drop('a')
+ res2 = df.drop(index='a')
+ tm.assert_frame_equal(res1, res2)
+
+ res1 = df.drop('d', 1)
+ res2 = df.drop(columns='d')
+ tm.assert_frame_equal(res1, res2)
+
+ res1 = df.drop(labels='e', axis=1)
+ res2 = df.drop(columns='e')
+ tm.assert_frame_equal(res1, res2)
+
+ res1 = df.drop(['a'], axis=0)
+ res2 = df.drop(index=['a'])
+ tm.assert_frame_equal(res1, res2)
+
+ res1 = df.drop(['a'], axis=0).drop(['d'], axis=1)
+ res2 = df.drop(index=['a'], columns=['d'])
+ tm.assert_frame_equal(res1, res2)
+
+ with pytest.raises(ValueError):
+ df.drop(labels='a', index='b')
+
+ with pytest.raises(ValueError):
+ df.drop(labels='a', columns='b')
+
+ with pytest.raises(ValueError):
+ df.drop(axis=1)
+
+ def test_merge_join_different_levels(self):
+ # GH 9455
+
+ # first dataframe
+ df1 = DataFrame(columns=['a', 'b'], data=[[1, 11], [0, 22]])
+
+ # second dataframe
+ columns = MultiIndex.from_tuples([('a', ''), ('c', 'c1')])
+ df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]])
+
+ # merge
+ columns = ['a', 'b', ('c', 'c1')]
+ expected = DataFrame(columns=columns, data=[[1, 11, 33], [0, 22, 44]])
+ with tm.assert_produces_warning(UserWarning):
+ result = pd.merge(df1, df2, on='a')
+ tm.assert_frame_equal(result, expected)
+
+ # join, see discussion in GH 12219
+ columns = ['a', 'b', ('a', ''), ('c', 'c1')]
+ expected = DataFrame(columns=columns,
+ data=[[1, 11, 0, 44], [0, 22, 1, 33]])
+ with tm.assert_produces_warning(UserWarning):
+ result = df1.join(df2, on='a')
+ tm.assert_frame_equal(result, expected)
+
+ def test_reindex(self):
+ newFrame = self.frame.reindex(self.ts1.index)
+
+ for col in newFrame.columns:
+ for idx, val in compat.iteritems(newFrame[col]):
+ if idx in self.frame.index:
+ if np.isnan(val):
+ assert np.isnan(self.frame[col][idx])
+ else:
+ assert val == self.frame[col][idx]
+ else:
+ assert np.isnan(val)
+
+ for col, series in compat.iteritems(newFrame):
+ assert tm.equalContents(series.index, newFrame.index)
+ emptyFrame = self.frame.reindex(Index([]))
+ assert len(emptyFrame.index) == 0
+
+        # non-contiguous reindex; the underlying Cython code should be
+        # unit-tested directly
+ nonContigFrame = self.frame.reindex(self.ts1.index[::2])
+
+ for col in nonContigFrame.columns:
+ for idx, val in compat.iteritems(nonContigFrame[col]):
+ if idx in self.frame.index:
+ if np.isnan(val):
+ assert np.isnan(self.frame[col][idx])
+ else:
+ assert val == self.frame[col][idx]
+ else:
+ assert np.isnan(val)
+
+ for col, series in compat.iteritems(nonContigFrame):
+ assert tm.equalContents(series.index, nonContigFrame.index)
+
+ # corner cases
+
+        # Same index: with copy=False the original index object is reused
+ newFrame = self.frame.reindex(self.frame.index, copy=False)
+ assert newFrame.index is self.frame.index
+
+ # length zero
+ newFrame = self.frame.reindex([])
+ assert newFrame.empty
+ assert len(newFrame.columns) == len(self.frame.columns)
+
+ # length zero with columns reindexed with non-empty index
+ newFrame = self.frame.reindex([])
+ newFrame = newFrame.reindex(self.frame.index)
+ assert len(newFrame.index) == len(self.frame.index)
+ assert len(newFrame.columns) == len(self.frame.columns)
+
+ # pass non-Index
+ newFrame = self.frame.reindex(list(self.ts1.index))
+ tm.assert_index_equal(newFrame.index, self.ts1.index)
+
+ # copy with no axes
+ result = self.frame.reindex()
+ assert_frame_equal(result, self.frame)
+ assert result is not self.frame
+
+ def test_reindex_nan(self):
+ df = pd.DataFrame([[1, 2], [3, 5], [7, 11], [9, 23]],
+ index=[2, np.nan, 1, 5],
+ columns=['joe', 'jim'])
+
+ i, j = [np.nan, 5, 5, np.nan, 1, 2, np.nan], [1, 3, 3, 1, 2, 0, 1]
+ assert_frame_equal(df.reindex(i), df.iloc[j])
+
+ df.index = df.index.astype('object')
+ assert_frame_equal(df.reindex(i), df.iloc[j], check_index_type=False)
+
+ # GH10388
+ df = pd.DataFrame({'other': ['a', 'b', np.nan, 'c'],
+ 'date': ['2015-03-22', np.nan,
+ '2012-01-08', np.nan],
+ 'amount': [2, 3, 4, 5]})
+
+ df['date'] = pd.to_datetime(df.date)
+ df['delta'] = (pd.to_datetime('2015-06-18') - df['date']).shift(1)
+
+ left = df.set_index(['delta', 'other', 'date']).reset_index()
+ right = df.reindex(columns=['delta', 'other', 'date', 'amount'])
+ assert_frame_equal(left, right)
+
+ def test_reindex_name_remains(self):
+ s = Series(np.random.rand(10))
+ df = DataFrame(s, index=np.arange(len(s)))
+ i = Series(np.arange(10), name='iname')
+
+ df = df.reindex(i)
+ assert df.index.name == 'iname'
+
+ df = df.reindex(Index(np.arange(10), name='tmpname'))
+ assert df.index.name == 'tmpname'
+
+ s = Series(np.random.rand(10))
+ df = DataFrame(s.T, index=np.arange(len(s)))
+ i = Series(np.arange(10), name='iname')
+ df = df.reindex(columns=i)
+ assert df.columns.name == 'iname'
+
+ def test_reindex_int(self):
+ smaller = self.intframe.reindex(self.intframe.index[::2])
+
+ assert smaller['A'].dtype == np.int64
+
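+        # reindexing back to the full index reintroduces missing rows as
+        # NaN, which upcasts int64 to float64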
+ bigger = smaller.reindex(self.intframe.index)
+ assert bigger['A'].dtype == np.float64
+
+ smaller = self.intframe.reindex(columns=['A', 'B'])
+ assert smaller['A'].dtype == np.int64
+
+ def test_reindex_like(self):
+ other = self.frame.reindex(index=self.frame.index[:10],
+ columns=['C', 'B'])
+
+ assert_frame_equal(other, self.frame.reindex_like(other))
+
+ def test_reindex_columns(self):
+ new_frame = self.frame.reindex(columns=['A', 'B', 'E'])
+
+ tm.assert_series_equal(new_frame['B'], self.frame['B'])
+ assert np.isnan(new_frame['E']).all()
+ assert 'C' not in new_frame
+
+ # Length zero
+ new_frame = self.frame.reindex(columns=[])
+ assert new_frame.empty
+
+ def test_reindex_columns_method(self):
+
+ # GH 14992, reindexing over columns ignored method
+ df = DataFrame(data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]],
+ index=[1, 2, 4],
+ columns=[1, 2, 4],
+ dtype=float)
+
+ # default method
+ result = df.reindex(columns=range(6))
+ expected = DataFrame(data=[[np.nan, 11, 12, np.nan, 13, np.nan],
+ [np.nan, 21, 22, np.nan, 23, np.nan],
+ [np.nan, 31, 32, np.nan, 33, np.nan]],
+ index=[1, 2, 4],
+ columns=range(6),
+ dtype=float)
+ assert_frame_equal(result, expected)
+
+ # method='ffill'
+ result = df.reindex(columns=range(6), method='ffill')
+ expected = DataFrame(data=[[np.nan, 11, 12, 12, 13, 13],
+ [np.nan, 21, 22, 22, 23, 23],
+ [np.nan, 31, 32, 32, 33, 33]],
+ index=[1, 2, 4],
+ columns=range(6),
+ dtype=float)
+ assert_frame_equal(result, expected)
+
+ # method='bfill'
+ result = df.reindex(columns=range(6), method='bfill')
+ expected = DataFrame(data=[[11, 11, 12, 13, 13, np.nan],
+ [21, 21, 22, 23, 23, np.nan],
+ [31, 31, 32, 33, 33, np.nan]],
+ index=[1, 2, 4],
+ columns=range(6),
+ dtype=float)
+ assert_frame_equal(result, expected)
+
+ def test_reindex_axes(self):
+ # GH 3317, reindexing by both axes loses freq of the index
+ df = DataFrame(np.ones((3, 3)),
+ index=[datetime(2012, 1, 1),
+ datetime(2012, 1, 2),
+ datetime(2012, 1, 3)],
+ columns=['a', 'b', 'c'])
+ time_freq = date_range('2012-01-01', '2012-01-03', freq='d')
+ some_cols = ['a', 'b']
+
+ index_freq = df.reindex(index=time_freq).index.freq
+ both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq
+ seq_freq = df.reindex(index=time_freq).reindex(
+ columns=some_cols).index.freq
+ assert index_freq == both_freq
+ assert index_freq == seq_freq
+
+ def test_reindex_fill_value(self):
+ df = DataFrame(np.random.randn(10, 4))
+
+ # axis=0
+ result = df.reindex(lrange(15))
+ assert np.isnan(result.values[-5:]).all()
+
+ result = df.reindex(lrange(15), fill_value=0)
+ expected = df.reindex(lrange(15)).fillna(0)
+ assert_frame_equal(result, expected)
+
+ # axis=1
+ result = df.reindex(columns=lrange(5), fill_value=0.)
+ expected = df.copy()
+ expected[4] = 0.
+ assert_frame_equal(result, expected)
+
+ result = df.reindex(columns=lrange(5), fill_value=0)
+ expected = df.copy()
+ expected[4] = 0
+ assert_frame_equal(result, expected)
+
+ result = df.reindex(columns=lrange(5), fill_value='foo')
+ expected = df.copy()
+ expected[4] = 'foo'
+ assert_frame_equal(result, expected)
+
+ # reindex_axis
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.reindex_axis(lrange(15), fill_value=0., axis=0)
+ expected = df.reindex(lrange(15)).fillna(0)
+ assert_frame_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.reindex_axis(lrange(5), fill_value=0., axis=1)
+ expected = df.reindex(columns=lrange(5)).fillna(0)
+ assert_frame_equal(result, expected)
+
+ # other dtypes
+ df['foo'] = 'foo'
+ result = df.reindex(lrange(15), fill_value=0)
+ expected = df.reindex(lrange(15)).fillna(0)
+ assert_frame_equal(result, expected)
+
+ def test_reindex_dups(self):
+
+ # GH4746, reindex on duplicate index error messages
+ arr = np.random.randn(10)
+ df = DataFrame(arr, index=[1, 2, 3, 4, 5, 1, 2, 3, 4, 5])
+
+ # set index is ok
+ result = df.copy()
+ result.index = list(range(len(df)))
+ expected = DataFrame(arr, index=list(range(len(df))))
+ assert_frame_equal(result, expected)
+
+        # reindex raises: label-based reindexing is ambiguous on a
+        # duplicate index
+ pytest.raises(ValueError, df.reindex, index=list(range(len(df))))
+
+ def test_reindex_axis_style(self):
+ # https://github.com/pandas-dev/pandas/issues/12392
+ df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+ expected = pd.DataFrame({"A": [1, 2, np.nan], "B": [4, 5, np.nan]},
+ index=[0, 1, 3])
+ result = df.reindex([0, 1, 3])
+ assert_frame_equal(result, expected)
+
+ result = df.reindex([0, 1, 3], axis=0)
+ assert_frame_equal(result, expected)
+
+ result = df.reindex([0, 1, 3], axis='index')
+ assert_frame_equal(result, expected)
+
+ def test_reindex_positional_warns(self):
+ # https://github.com/pandas-dev/pandas/issues/12392
+ df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+ expected = pd.DataFrame({"A": [1., 2], 'B': [4., 5],
+ "C": [np.nan, np.nan]})
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.reindex([0, 1], ['A', 'B', 'C'])
+
+ assert_frame_equal(result, expected)
+
+ def test_reindex_axis_style_raises(self):
+ # https://github.com/pandas-dev/pandas/issues/12392
+ df = pd.DataFrame({"A": [1, 2, 3], 'B': [4, 5, 6]})
+ with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
+ df.reindex([0, 1], ['A'], axis=1)
+
+ with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
+ df.reindex([0, 1], ['A'], axis='index')
+
+ with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
+ df.reindex(index=[0, 1], axis='index')
+
+ with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
+ df.reindex(index=[0, 1], axis='columns')
+
+ with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
+ df.reindex(columns=[0, 1], axis='columns')
+
+ with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
+ df.reindex(index=[0, 1], columns=[0, 1], axis='columns')
+
+ with pytest.raises(TypeError, match='Cannot specify all'):
+ df.reindex([0, 1], [0], ['A'])
+
+ # Mixing styles
+ with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
+ df.reindex(index=[0, 1], axis='index')
+
+ with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
+ df.reindex(index=[0, 1], axis='columns')
+
+ # Duplicates
+ with pytest.raises(TypeError, match="multiple values"):
+ df.reindex([0, 1], labels=[0, 1])
+
+ def test_reindex_single_named_indexer(self):
+ # https://github.com/pandas-dev/pandas/issues/12392
+ df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]})
+ result = df.reindex([0, 1], columns=['A'])
+ expected = pd.DataFrame({"A": [1, 2]})
+ assert_frame_equal(result, expected)
+
+ def test_reindex_api_equivalence(self):
+ # https://github.com/pandas-dev/pandas/issues/12392
+ # equivalence of the labels/axis and index/columns API's
+ df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
+ index=['a', 'b', 'c'],
+ columns=['d', 'e', 'f'])
+
+ res1 = df.reindex(['b', 'a'])
+ res2 = df.reindex(index=['b', 'a'])
+ res3 = df.reindex(labels=['b', 'a'])
+ res4 = df.reindex(labels=['b', 'a'], axis=0)
+ res5 = df.reindex(['b', 'a'], axis=0)
+ for res in [res2, res3, res4, res5]:
+ tm.assert_frame_equal(res1, res)
+
+ res1 = df.reindex(columns=['e', 'd'])
+ res2 = df.reindex(['e', 'd'], axis=1)
+ res3 = df.reindex(labels=['e', 'd'], axis=1)
+ for res in [res2, res3]:
+ tm.assert_frame_equal(res1, res)
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ res1 = df.reindex(['b', 'a'], ['e', 'd'])
+ assert 'reindex' in str(m[0].message)
+ res2 = df.reindex(columns=['e', 'd'], index=['b', 'a'])
+ res3 = df.reindex(labels=['b', 'a'], axis=0).reindex(labels=['e', 'd'],
+ axis=1)
+ for res in [res2, res3]:
+ tm.assert_frame_equal(res1, res)
+
+ def test_align(self):
+ af, bf = self.frame.align(self.frame)
+ assert af._data is not self.frame._data
+
+ af, bf = self.frame.align(self.frame, copy=False)
+ assert af._data is self.frame._data
+
+ # axis = 0
+ other = self.frame.iloc[:-5, :3]
+ af, bf = self.frame.align(other, axis=0, fill_value=-1)
+
+ tm.assert_index_equal(bf.columns, other.columns)
+
+ # test fill value
+ join_idx = self.frame.index.join(other.index)
+ diff_a = self.frame.index.difference(join_idx)
+ diff_b = other.index.difference(join_idx)
+ diff_a_vals = af.reindex(diff_a).values
+ diff_b_vals = bf.reindex(diff_b).values
+ assert (diff_a_vals == -1).all()
+
+ af, bf = self.frame.align(other, join='right', axis=0)
+ tm.assert_index_equal(bf.columns, other.columns)
+ tm.assert_index_equal(bf.index, other.index)
+ tm.assert_index_equal(af.index, other.index)
+
+ # axis = 1
+ other = self.frame.iloc[:-5, :3].copy()
+ af, bf = self.frame.align(other, axis=1)
+ tm.assert_index_equal(bf.columns, self.frame.columns)
+ tm.assert_index_equal(bf.index, other.index)
+
+ # test fill value
+ join_idx = self.frame.index.join(other.index)
+ diff_a = self.frame.index.difference(join_idx)
+ diff_b = other.index.difference(join_idx)
+ diff_a_vals = af.reindex(diff_a).values
+
+ # TODO(wesm): unused?
+ diff_b_vals = bf.reindex(diff_b).values # noqa
+
+ assert (diff_a_vals == -1).all()
+
+ af, bf = self.frame.align(other, join='inner', axis=1)
+ tm.assert_index_equal(bf.columns, other.columns)
+
+ af, bf = self.frame.align(other, join='inner', axis=1, method='pad')
+ tm.assert_index_equal(bf.columns, other.columns)
+
+ # test other non-float types
+ af, bf = self.intframe.align(other, join='inner', axis=1, method='pad')
+ tm.assert_index_equal(bf.columns, other.columns)
+
+ af, bf = self.mixed_frame.align(self.mixed_frame,
+ join='inner', axis=1, method='pad')
+ tm.assert_index_equal(bf.columns, self.mixed_frame.columns)
+
+ af, bf = self.frame.align(other.iloc[:, 0], join='inner', axis=1,
+ method=None, fill_value=None)
+ tm.assert_index_equal(bf.index, Index([]))
+
+ af, bf = self.frame.align(other.iloc[:, 0], join='inner', axis=1,
+ method=None, fill_value=0)
+ tm.assert_index_equal(bf.index, Index([]))
+
+ # mixed floats/ints
+ af, bf = self.mixed_float.align(other.iloc[:, 0], join='inner', axis=1,
+ method=None, fill_value=0)
+ tm.assert_index_equal(bf.index, Index([]))
+
+ af, bf = self.mixed_int.align(other.iloc[:, 0], join='inner', axis=1,
+ method=None, fill_value=0)
+ tm.assert_index_equal(bf.index, Index([]))
+
+ # Try to align DataFrame to Series along bad axis
+ with pytest.raises(ValueError):
+ self.frame.align(af.iloc[0, :3], join='inner', axis=2)
+
+        # align DataFrame to Series, with and without broadcasting
+ idx = self.frame.index
+ s = Series(range(len(idx)), index=idx)
+
+ left, right = self.frame.align(s, axis=0)
+ tm.assert_index_equal(left.index, self.frame.index)
+ tm.assert_index_equal(right.index, self.frame.index)
+ assert isinstance(right, Series)
+
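+        # broadcast_axis=1 broadcasts the Series into a frame that shares
+        # the DataFrame's columns, so both sides align element-wise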
+ left, right = self.frame.align(s, broadcast_axis=1)
+ tm.assert_index_equal(left.index, self.frame.index)
+ expected = {c: s for c in self.frame.columns}
+ expected = DataFrame(expected, index=self.frame.index,
+ columns=self.frame.columns)
+ tm.assert_frame_equal(right, expected)
+
+ # see gh-9558
+ df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
+ result = df[df['a'] == 2]
+ expected = DataFrame([[2, 5]], index=[1], columns=['a', 'b'])
+ tm.assert_frame_equal(result, expected)
+
+ result = df.where(df['a'] == 2, 0)
+ expected = DataFrame({'a': [0, 2, 0], 'b': [0, 5, 0]})
+ tm.assert_frame_equal(result, expected)
+
+ def _check_align(self, a, b, axis, fill_axis, how, method, limit=None):
+ aa, ab = a.align(b, axis=axis, join=how, method=method, limit=limit,
+ fill_axis=fill_axis)
+
+ join_index, join_columns = None, None
+
+ ea, eb = a, b
+ if axis is None or axis == 0:
+ join_index = a.index.join(b.index, how=how)
+ ea = ea.reindex(index=join_index)
+ eb = eb.reindex(index=join_index)
+
+ if axis is None or axis == 1:
+ join_columns = a.columns.join(b.columns, how=how)
+ ea = ea.reindex(columns=join_columns)
+ eb = eb.reindex(columns=join_columns)
+
+ ea = ea.fillna(axis=fill_axis, method=method, limit=limit)
+ eb = eb.fillna(axis=fill_axis, method=method, limit=limit)
+
+ assert_frame_equal(aa, ea)
+ assert_frame_equal(ab, eb)
+
+ @pytest.mark.parametrize('meth', ['pad', 'bfill'])
+ @pytest.mark.parametrize('ax', [0, 1, None])
+ @pytest.mark.parametrize('fax', [0, 1])
+ @pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right'])
+ def test_align_fill_method(self, how, meth, ax, fax):
+ self._check_align_fill(how, meth, ax, fax)
+
+ def _check_align_fill(self, kind, meth, ax, fax):
+ left = self.frame.iloc[0:4, :10]
+ right = self.frame.iloc[2:, 6:]
+ empty = self.frame.iloc[:0, :0]
+
+ self._check_align(left, right, axis=ax, fill_axis=fax,
+ how=kind, method=meth)
+ self._check_align(left, right, axis=ax, fill_axis=fax,
+ how=kind, method=meth, limit=1)
+
+ # empty left
+ self._check_align(empty, right, axis=ax, fill_axis=fax,
+ how=kind, method=meth)
+ self._check_align(empty, right, axis=ax, fill_axis=fax,
+ how=kind, method=meth, limit=1)
+
+ # empty right
+ self._check_align(left, empty, axis=ax, fill_axis=fax,
+ how=kind, method=meth)
+ self._check_align(left, empty, axis=ax, fill_axis=fax,
+ how=kind, method=meth, limit=1)
+
+ # both empty
+ self._check_align(empty, empty, axis=ax, fill_axis=fax,
+ how=kind, method=meth)
+ self._check_align(empty, empty, axis=ax, fill_axis=fax,
+ how=kind, method=meth, limit=1)
+
+ def test_align_int_fill_bug(self):
+ # GH #910
+ X = np.arange(10 * 10, dtype='float64').reshape(10, 10)
+ Y = np.ones((10, 1), dtype=int)
+
+ df1 = DataFrame(X)
+ df1['0.X'] = Y.squeeze()
+
+ df2 = df1.astype(float)
+
+ result = df1 - df1.mean()
+ expected = df2 - df2.mean()
+ assert_frame_equal(result, expected)
+
+ def test_align_multiindex(self):
+ # GH 10665
+ # same test cases as test_align_multiindex in test_series.py
+
+ midx = pd.MultiIndex.from_product([range(2), range(3), range(2)],
+ names=('a', 'b', 'c'))
+ idx = pd.Index(range(2), name='b')
+ df1 = pd.DataFrame(np.arange(12, dtype='int64'), index=midx)
+ df2 = pd.DataFrame(np.arange(2, dtype='int64'), index=idx)
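+        # df2's flat index is named 'b', so alignment matches it against
+        # the 'b' level of df1's MultiIndex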
+
+ # these must be the same results (but flipped)
+ res1l, res1r = df1.align(df2, join='left')
+ res2l, res2r = df2.align(df1, join='right')
+
+ expl = df1
+ assert_frame_equal(expl, res1l)
+ assert_frame_equal(expl, res2r)
+ expr = pd.DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx)
+ assert_frame_equal(expr, res1r)
+ assert_frame_equal(expr, res2l)
+
+ res1l, res1r = df1.align(df2, join='right')
+ res2l, res2r = df2.align(df1, join='left')
+
+ exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)],
+ names=('a', 'b', 'c'))
+ expl = pd.DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx)
+ assert_frame_equal(expl, res1l)
+ assert_frame_equal(expl, res2r)
+ expr = pd.DataFrame([0, 0, 1, 1] * 2, index=exp_idx)
+ assert_frame_equal(expr, res1r)
+ assert_frame_equal(expr, res2l)
+
+ def test_align_series_combinations(self):
+ df = pd.DataFrame({'a': [1, 3, 5],
+ 'b': [1, 3, 5]}, index=list('ACE'))
+ s = pd.Series([1, 2, 4], index=list('ABD'), name='x')
+
+ # frame + series
+ res1, res2 = df.align(s, axis=0)
+ exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5],
+ 'b': [1, np.nan, 3, np.nan, 5]},
+ index=list('ABCDE'))
+ exp2 = pd.Series([1, 2, np.nan, 4, np.nan],
+ index=list('ABCDE'), name='x')
+
+ tm.assert_frame_equal(res1, exp1)
+ tm.assert_series_equal(res2, exp2)
+
+ # series + frame
+ res1, res2 = s.align(df)
+ tm.assert_series_equal(res1, exp2)
+ tm.assert_frame_equal(res2, exp1)
+
+ def test_filter(self):
+ # Items
+ filtered = self.frame.filter(['A', 'B', 'E'])
+ assert len(filtered.columns) == 2
+ assert 'E' not in filtered
+
+ filtered = self.frame.filter(['A', 'B', 'E'], axis='columns')
+ assert len(filtered.columns) == 2
+ assert 'E' not in filtered
+
+ # Other axis
+ idx = self.frame.index[0:4]
+ filtered = self.frame.filter(idx, axis='index')
+ expected = self.frame.reindex(index=idx)
+ tm.assert_frame_equal(filtered, expected)
+
+ # like
+ fcopy = self.frame.copy()
+ fcopy['AA'] = 1
+
+ filtered = fcopy.filter(like='A')
+ assert len(filtered.columns) == 2
+ assert 'AA' in filtered
+
+ # like with ints in column names
+ df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B'])
+ filtered = df.filter(like='_')
+ assert len(filtered.columns) == 2
+
+ # regex with ints in column names
+ # from PR #10384
+ df = DataFrame(0., index=[0, 1, 2], columns=['A1', 1, 'B', 2, 'C'])
+ expected = DataFrame(
+ 0., index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object))
+ filtered = df.filter(regex='^[0-9]+$')
+ tm.assert_frame_equal(filtered, expected)
+
+ expected = DataFrame(0., index=[0, 1, 2], columns=[0, '0', 1, '1'])
+ # shouldn't remove anything
+ filtered = expected.filter(regex='^[0-9]+$')
+ tm.assert_frame_equal(filtered, expected)
+
+ # pass in None
+ with pytest.raises(TypeError, match='Must pass'):
+ self.frame.filter()
+ with pytest.raises(TypeError, match='Must pass'):
+ self.frame.filter(items=None)
+ with pytest.raises(TypeError, match='Must pass'):
+ self.frame.filter(axis=1)
+
+ # test mutually exclusive arguments
+ with pytest.raises(TypeError, match='mutually exclusive'):
+ self.frame.filter(items=['one', 'three'], regex='e$', like='bbi')
+ with pytest.raises(TypeError, match='mutually exclusive'):
+ self.frame.filter(items=['one', 'three'], regex='e$', axis=1)
+ with pytest.raises(TypeError, match='mutually exclusive'):
+ self.frame.filter(items=['one', 'three'], regex='e$')
+ with pytest.raises(TypeError, match='mutually exclusive'):
+ self.frame.filter(items=['one', 'three'], like='bbi', axis=0)
+ with pytest.raises(TypeError, match='mutually exclusive'):
+ self.frame.filter(items=['one', 'three'], like='bbi')
+
+ # objects
+ filtered = self.mixed_frame.filter(like='foo')
+ assert 'foo' in filtered
+
+ # unicode columns, won't ascii-encode
+ df = self.frame.rename(columns={'B': u('\u2202')})
+ filtered = df.filter(like='C')
+ assert 'C' in filtered
+
+ def test_filter_regex_search(self):
+ fcopy = self.frame.copy()
+ fcopy['AA'] = 1
+
+ # regex
+ filtered = fcopy.filter(regex='[A]+')
+ assert len(filtered.columns) == 2
+ assert 'AA' in filtered
+
+ # doesn't have to be at beginning
+ df = DataFrame({'aBBa': [1, 2],
+ 'BBaBB': [1, 2],
+ 'aCCa': [1, 2],
+ 'aCCaBB': [1, 2]})
+
+ result = df.filter(regex='BB')
+ exp = df[[x for x in df.columns if 'BB' in x]]
+ assert_frame_equal(result, exp)
+
+ @pytest.mark.parametrize('name,expected', [
+ ('a', DataFrame({u'a': [1, 2]})),
+ (u'a', DataFrame({u'a': [1, 2]})),
+ (u'あ', DataFrame({u'あ': [3, 4]}))
+ ])
+ def test_filter_unicode(self, name, expected):
+ # GH13101
+ df = DataFrame({u'a': [1, 2], u'あ': [3, 4]})
+
+ assert_frame_equal(df.filter(like=name), expected)
+ assert_frame_equal(df.filter(regex=name), expected)
+
+ @pytest.mark.parametrize('name', ['a', u'a'])
+ def test_filter_bytestring(self, name):
+ # GH13101
+ df = DataFrame({b'a': [1, 2], b'b': [3, 4]})
+ expected = DataFrame({b'a': [1, 2]})
+
+ assert_frame_equal(df.filter(like=name), expected)
+ assert_frame_equal(df.filter(regex=name), expected)
+
+ def test_filter_corner(self):
+ empty = DataFrame()
+
+ result = empty.filter([])
+ assert_frame_equal(result, empty)
+
+ result = empty.filter(like='foo')
+ assert_frame_equal(result, empty)
+
+ def test_select(self):
+
+ # deprecated: gh-12410
+ f = lambda x: x.weekday() == 2
+ index = self.tsframe.index[[f(x) for x in self.tsframe.index]]
+ expected_weekdays = self.tsframe.reindex(index=index)
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = self.tsframe.select(f, axis=0)
+ assert_frame_equal(result, expected_weekdays)
+
+ result = self.frame.select(lambda x: x in ('B', 'D'), axis=1)
+ expected = self.frame.reindex(columns=['B', 'D'])
+ assert_frame_equal(result, expected, check_names=False)
+
+ # replacement
+ f = lambda x: x.weekday == 2
+ result = self.tsframe.loc(axis=0)[f(self.tsframe.index)]
+ assert_frame_equal(result, expected_weekdays)
+
+ crit = lambda x: x in ['B', 'D']
+ result = self.frame.loc(axis=1)[(self.frame.columns.map(crit))]
+ expected = self.frame.reindex(columns=['B', 'D'])
+ assert_frame_equal(result, expected, check_names=False)
+
+ # doc example
+ df = DataFrame({'A': [1, 2, 3]}, index=['foo', 'bar', 'baz'])
+
+ crit = lambda x: x in ['bar', 'baz']
+ with tm.assert_produces_warning(FutureWarning):
+ expected = df.select(crit)
+ result = df.loc[df.index.map(crit)]
+ assert_frame_equal(result, expected, check_names=False)
+
+ def test_take(self):
+ # homogeneous
+ order = [3, 1, 2, 0]
+ for df in [self.frame]:
+
+ result = df.take(order, axis=0)
+ expected = df.reindex(df.index.take(order))
+ assert_frame_equal(result, expected)
+
+ # axis = 1
+ result = df.take(order, axis=1)
+ expected = df.loc[:, ['D', 'B', 'C', 'A']]
+ assert_frame_equal(result, expected, check_names=False)
+
+ # negative indices
+ order = [2, 1, -1]
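+        # take() interprets -1 positionally (the last row), unlike
+        # label-based reindexing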
+ for df in [self.frame]:
+
+ result = df.take(order, axis=0)
+ expected = df.reindex(df.index.take(order))
+ assert_frame_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.take(order, convert=True, axis=0)
+ assert_frame_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.take(order, convert=False, axis=0)
+ assert_frame_equal(result, expected)
+
+ # axis = 1
+ result = df.take(order, axis=1)
+ expected = df.loc[:, ['C', 'B', 'D']]
+ assert_frame_equal(result, expected, check_names=False)
+
+ # illegal indices
+ pytest.raises(IndexError, df.take, [3, 1, 2, 30], axis=0)
+ pytest.raises(IndexError, df.take, [3, 1, 2, -31], axis=0)
+ pytest.raises(IndexError, df.take, [3, 1, 2, 5], axis=1)
+ pytest.raises(IndexError, df.take, [3, 1, 2, -5], axis=1)
+
+ # mixed-dtype
+ order = [4, 1, 2, 0, 3]
+ for df in [self.mixed_frame]:
+
+ result = df.take(order, axis=0)
+ expected = df.reindex(df.index.take(order))
+ assert_frame_equal(result, expected)
+
+ # axis = 1
+ result = df.take(order, axis=1)
+ expected = df.loc[:, ['foo', 'B', 'C', 'A', 'D']]
+ assert_frame_equal(result, expected)
+
+ # negative indices
+ order = [4, 1, -2]
+ for df in [self.mixed_frame]:
+
+ result = df.take(order, axis=0)
+ expected = df.reindex(df.index.take(order))
+ assert_frame_equal(result, expected)
+
+ # axis = 1
+ result = df.take(order, axis=1)
+ expected = df.loc[:, ['foo', 'B', 'D']]
+ assert_frame_equal(result, expected)
+
+ # by dtype
+ order = [1, 2, 0, 3]
+ for df in [self.mixed_float, self.mixed_int]:
+
+ result = df.take(order, axis=0)
+ expected = df.reindex(df.index.take(order))
+ assert_frame_equal(result, expected)
+
+ # axis = 1
+ result = df.take(order, axis=1)
+ expected = df.loc[:, ['B', 'C', 'A', 'D']]
+ assert_frame_equal(result, expected)
+
+ def test_reindex_boolean(self):
+ frame = DataFrame(np.ones((10, 2), dtype=bool),
+ index=np.arange(0, 20, 2),
+ columns=[0, 2])
+
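+        # the reindexed-in rows are NaN; bool cannot hold NaN, so the
+        # values upcast to object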
+ reindexed = frame.reindex(np.arange(10))
+ assert reindexed.values.dtype == np.object_
+ assert isna(reindexed[0][1])
+
+ reindexed = frame.reindex(columns=lrange(3))
+ assert reindexed.values.dtype == np.object_
+ assert isna(reindexed[1]).all()
+
+ def test_reindex_objects(self):
+ reindexed = self.mixed_frame.reindex(columns=['foo', 'A', 'B'])
+ assert 'foo' in reindexed
+
+ reindexed = self.mixed_frame.reindex(columns=['A', 'B'])
+ assert 'foo' not in reindexed
+
+ def test_reindex_corner(self):
+ index = Index(['a', 'b', 'c'])
+ dm = self.empty.reindex(index=[1, 2, 3])
+ reindexed = dm.reindex(columns=index)
+ tm.assert_index_equal(reindexed.columns, index)
+
+        # a newly introduced column in an int frame is all-NaN, hence float64
+ smaller = self.intframe.reindex(columns=['A', 'B', 'E'])
+ assert smaller['E'].dtype == np.float64
+
+ def test_reindex_axis(self):
+ cols = ['A', 'B', 'E']
+ with tm.assert_produces_warning(FutureWarning) as m:
+ reindexed1 = self.intframe.reindex_axis(cols, axis=1)
+ assert 'reindex' in str(m[0].message)
+ reindexed2 = self.intframe.reindex(columns=cols)
+ assert_frame_equal(reindexed1, reindexed2)
+
+ rows = self.intframe.index[0:5]
+ with tm.assert_produces_warning(FutureWarning) as m:
+ reindexed1 = self.intframe.reindex_axis(rows, axis=0)
+ assert 'reindex' in str(m[0].message)
+ reindexed2 = self.intframe.reindex(index=rows)
+ assert_frame_equal(reindexed1, reindexed2)
+
+ pytest.raises(ValueError, self.intframe.reindex_axis, rows, axis=2)
+
+ # no-op case
+ cols = self.frame.columns.copy()
+ with tm.assert_produces_warning(FutureWarning) as m:
+ newFrame = self.frame.reindex_axis(cols, axis=1)
+ assert 'reindex' in str(m[0].message)
+ assert_frame_equal(newFrame, self.frame)
+
+ def test_reindex_with_nans(self):
+ df = DataFrame([[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]],
+ columns=['a', 'b'],
+ index=[100.0, 101.0, np.nan, 102.0, 103.0])
+
+ result = df.reindex(index=[101.0, 102.0, 103.0])
+ expected = df.iloc[[1, 3, 4]]
+ assert_frame_equal(result, expected)
+
+ result = df.reindex(index=[103.0])
+ expected = df.iloc[[4]]
+ assert_frame_equal(result, expected)
+
+ result = df.reindex(index=[101.0])
+ expected = df.iloc[[1]]
+ assert_frame_equal(result, expected)
+
+ def test_reindex_multi(self):
+ df = DataFrame(np.random.randn(3, 3))
+
+ result = df.reindex(index=lrange(4), columns=lrange(4))
+ expected = df.reindex(lrange(4)).reindex(columns=lrange(4))
+
+ assert_frame_equal(result, expected)
+
+ df = DataFrame(np.random.randint(0, 10, (3, 3)))
+
+ result = df.reindex(index=lrange(4), columns=lrange(4))
+ expected = df.reindex(lrange(4)).reindex(columns=lrange(4))
+
+ assert_frame_equal(result, expected)
+
+ df = DataFrame(np.random.randint(0, 10, (3, 3)))
+
+ result = df.reindex(index=lrange(2), columns=lrange(2))
+ expected = df.reindex(lrange(2)).reindex(columns=lrange(2))
+
+ assert_frame_equal(result, expected)
+
+ df = DataFrame(np.random.randn(5, 3) + 1j, columns=['a', 'b', 'c'])
+
+ result = df.reindex(index=[0, 1], columns=['a', 'b'])
+ expected = df.reindex([0, 1]).reindex(columns=['a', 'b'])
+
+ assert_frame_equal(result, expected)
+
+ def test_reindex_multi_categorical_time(self):
+ # https://github.com/pandas-dev/pandas/issues/21390
+ midx = pd.MultiIndex.from_product(
+ [Categorical(['a', 'b', 'c']),
+ Categorical(date_range("2012-01-01", periods=3, freq='H'))])
+ df = pd.DataFrame({'a': range(len(midx))}, index=midx)
+ df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]]
+
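+        # position 7 is left out, so reindexing on the full categorical
+        # MultiIndex should restore it as NaN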
+ result = df2.reindex(midx)
+ expected = pd.DataFrame(
+ {'a': [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx)
+ assert_frame_equal(result, expected)
+
+ data = [[1, 2, 3], [1, 2, 3]]
+
+ @pytest.mark.parametrize('actual', [
+ DataFrame(data=data, index=['a', 'a']),
+ DataFrame(data=data, index=['a', 'b']),
+ DataFrame(data=data, index=['a', 'b']).set_index([0, 1]),
+ DataFrame(data=data, index=['a', 'a']).set_index([0, 1])
+ ])
+ def test_raise_on_drop_duplicate_index(self, actual):
+
+ # issue 19186
+ level = 0 if isinstance(actual.index, MultiIndex) else None
+ with pytest.raises(KeyError):
+ actual.drop('c', level=level, axis=0)
+ with pytest.raises(KeyError):
+ actual.T.drop('c', level=level, axis=1)
+ expected_no_err = actual.drop('c', axis=0, level=level,
+ errors='ignore')
+ assert_frame_equal(expected_no_err, actual)
+ expected_no_err = actual.T.drop('c', axis=1, level=level,
+ errors='ignore')
+ assert_frame_equal(expected_no_err.T, actual)
+
+ @pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 2]])
+ @pytest.mark.parametrize('drop_labels', [[], [1], [2]])
+ def test_drop_empty_list(self, index, drop_labels):
+ # GH 21494
+ expected_index = [i for i in index if i not in drop_labels]
+ frame = pd.DataFrame(index=index).drop(drop_labels)
+ tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index))
+
+ @pytest.mark.parametrize('index', [[1, 2, 3], [1, 2, 2]])
+ @pytest.mark.parametrize('drop_labels', [[1, 4], [4, 5]])
+ def test_drop_non_empty_list(self, index, drop_labels):
+ # GH 21494
+ with pytest.raises(KeyError, match='not found in axis'):
+ pd.DataFrame(index=index).drop(drop_labels)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_block_internals.py b/contrib/python/pandas/py2/pandas/tests/frame/test_block_internals.py
new file mode 100644
index 00000000000..5419f4d5127
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_block_internals.py
@@ -0,0 +1,587 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from datetime import datetime, timedelta
+import itertools
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Series, Timestamp, compat, date_range,
+ option_context)
+from pandas.core.arrays import IntervalArray, integer_array
+from pandas.core.internals.blocks import IntBlock
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal)
+
+# Segregated collection of methods that require the BlockManager internal data
+# structure
+
+
+class TestDataFrameBlockInternals():
+ def test_setitem_invalidates_datetime_index_freq(self):
+ # GH#24096 altering a datetime64tz column inplace invalidates the
+ # `freq` attribute on the underlying DatetimeIndex
+
+ dti = date_range('20130101', periods=3, tz='US/Eastern')
+ ts = dti[1]
+
+ df = DataFrame({'B': dti})
+ assert df['B']._values.freq == 'D'
+
+ df.iloc[1, 0] = pd.NaT
+ assert df['B']._values.freq is None
+
+ # check that the DatetimeIndex was not altered in place
+ assert dti.freq == 'D'
+ assert dti[1] == ts
+
+ def test_cast_internals(self, float_frame):
+ casted = DataFrame(float_frame._data, dtype=int)
+ expected = DataFrame(float_frame._series, dtype=int)
+ assert_frame_equal(casted, expected)
+
+ casted = DataFrame(float_frame._data, dtype=np.int32)
+ expected = DataFrame(float_frame._series, dtype=np.int32)
+ assert_frame_equal(casted, expected)
+
+ def test_consolidate(self, float_frame):
+ float_frame['E'] = 7.
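+        # adding a column creates a second float block; _consolidate merges
+        # same-dtype blocks back into a single block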
+ consolidated = float_frame._consolidate()
+ assert len(consolidated._data.blocks) == 1
+
+        # _consolidate returns a copy, so repeated calls give distinct objects
+ recons = consolidated._consolidate()
+ assert recons is not consolidated
+ tm.assert_frame_equal(recons, consolidated)
+
+ float_frame['F'] = 8.
+ assert len(float_frame._data.blocks) == 3
+
+ float_frame._consolidate(inplace=True)
+ assert len(float_frame._data.blocks) == 1
+
+ def test_consolidate_inplace(self, float_frame):
+ frame = float_frame.copy() # noqa
+
+ # triggers in-place consolidation
+ for letter in range(ord('A'), ord('Z')):
+ float_frame[chr(letter)] = chr(letter)
+
+ def test_values_consolidate(self, float_frame):
+ float_frame['E'] = 7.
+ assert not float_frame._data.is_consolidated()
+ _ = float_frame.values # noqa
+ assert float_frame._data.is_consolidated()
+
+ def test_modify_values(self, float_frame):
+ float_frame.values[5] = 5
+ assert (float_frame.values[5] == 5).all()
+
+ # unconsolidated
+ float_frame['E'] = 7.
+ float_frame.values[6] = 6
+ assert (float_frame.values[6] == 6).all()
+
+ def test_boolean_set_uncons(self, float_frame):
+ float_frame['E'] = 7.
+
+ expected = float_frame.values.copy()
+ expected[expected > 1] = 2
+
+ float_frame[float_frame > 1] = 2
+ assert_almost_equal(expected, float_frame.values)
+
+ def test_values_numeric_cols(self, float_frame):
+ float_frame['foo'] = 'bar'
+
+ values = float_frame[['A', 'B', 'C', 'D']].values
+ assert values.dtype == np.float64
+
+ def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
+
+        # mixed lowest-common-dtype (lcd) upcasting
+ values = mixed_float_frame[['A', 'B', 'C', 'D']].values
+ assert values.dtype == np.float64
+
+ values = mixed_float_frame[['A', 'B', 'C']].values
+ assert values.dtype == np.float32
+
+ values = mixed_float_frame[['C']].values
+ assert values.dtype == np.float16
+
+ # GH 10364
+ # B uint64 forces float because there are other signed int types
+ values = mixed_int_frame[['A', 'B', 'C', 'D']].values
+ assert values.dtype == np.float64
+
+ values = mixed_int_frame[['A', 'D']].values
+ assert values.dtype == np.int64
+
+ # B uint64 forces float because there are other signed int types
+ values = mixed_int_frame[['A', 'B', 'C']].values
+ assert values.dtype == np.float64
+
+ # as B and C are both unsigned, no forcing to float is needed
+ values = mixed_int_frame[['B', 'C']].values
+ assert values.dtype == np.uint64
+
+ values = mixed_int_frame[['A', 'C']].values
+ assert values.dtype == np.int32
+
+ values = mixed_int_frame[['C', 'D']].values
+ assert values.dtype == np.int64
+
+ values = mixed_int_frame[['A']].values
+ assert values.dtype == np.int32
+
+ values = mixed_int_frame[['C']].values
+ assert values.dtype == np.uint8
+
+ def test_constructor_with_convert(self):
+ # this is actually mostly a test of lib.maybe_convert_objects
+ # #2845
+ df = DataFrame({'A': [2 ** 63 - 1]})
+ result = df['A']
+ expected = Series(np.asarray([2 ** 63 - 1], np.int64), name='A')
+ assert_series_equal(result, expected)
+
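+        # 2 ** 63 overflows int64, so inference falls back to uint64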
+ df = DataFrame({'A': [2 ** 63]})
+ result = df['A']
+ expected = Series(np.asarray([2 ** 63], np.uint64), name='A')
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': [datetime(2005, 1, 1), True]})
+ result = df['A']
+ expected = Series(np.asarray([datetime(2005, 1, 1), True], np.object_),
+ name='A')
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': [None, 1]})
+ result = df['A']
+ expected = Series(np.asarray([np.nan, 1], np.float_), name='A')
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': [1.0, 2]})
+ result = df['A']
+ expected = Series(np.asarray([1.0, 2], np.float_), name='A')
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': [1.0 + 2.0j, 3]})
+ result = df['A']
+ expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name='A')
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': [1.0 + 2.0j, 3.0]})
+ result = df['A']
+ expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name='A')
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': [1.0 + 2.0j, True]})
+ result = df['A']
+ expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name='A')
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': [1.0, None]})
+ result = df['A']
+ expected = Series(np.asarray([1.0, np.nan], np.float_), name='A')
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': [1.0 + 2.0j, None]})
+ result = df['A']
+ expected = Series(np.asarray(
+ [1.0 + 2.0j, np.nan], np.complex_), name='A')
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': [2.0, 1, True, None]})
+ result = df['A']
+ expected = Series(np.asarray(
+ [2.0, 1, True, None], np.object_), name='A')
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': [2.0, 1, datetime(2006, 1, 1), None]})
+ result = df['A']
+ expected = Series(np.asarray([2.0, 1, datetime(2006, 1, 1),
+ None], np.object_), name='A')
+ assert_series_equal(result, expected)
+
+ def test_construction_with_mixed(self, float_string_frame):
+ # test construction edge cases with mixed types
+
+        # constructing from rows that mix datetimes and NaN does not work
+        # without extensive workarounds
+ data = [[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
+ [datetime(2000, 1, 2), datetime(2000, 1, 3),
+ datetime(2000, 1, 1)]]
+ df = DataFrame(data)
+
+ # check dtypes
+        result = df.get_dtype_counts().sort_values()
+        expected = Series({'datetime64[ns]': 3})
+        assert_series_equal(result, expected)
+
+ # mixed-type frames
+ float_string_frame['datetime'] = datetime.now()
+ float_string_frame['timedelta'] = timedelta(days=1, seconds=1)
+ assert float_string_frame['datetime'].dtype == 'M8[ns]'
+ assert float_string_frame['timedelta'].dtype == 'm8[ns]'
+ result = float_string_frame.get_dtype_counts().sort_values()
+ expected = Series({'float64': 4,
+ 'object': 1,
+ 'datetime64[ns]': 1,
+ 'timedelta64[ns]': 1}).sort_values()
+ assert_series_equal(result, expected)
+
+ def test_construction_with_conversions(self):
+
+ # convert from a numpy array of non-ns timedelta64
+ arr = np.array([1, 2, 3], dtype='timedelta64[s]')
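+        # pandas stores timedeltas at nanosecond resolution, so the
+        # second-resolution values are converted on assignment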
+ df = DataFrame(index=range(3))
+ df['A'] = arr
+ expected = DataFrame({'A': pd.timedelta_range('00:00:01', periods=3,
+ freq='s')},
+ index=range(3))
+ assert_frame_equal(df, expected)
+
+ expected = DataFrame({
+ 'dt1': Timestamp('20130101'),
+ 'dt2': date_range('20130101', periods=3),
+ # 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
+ }, index=range(3))
+
+ df = DataFrame(index=range(3))
+ df['dt1'] = np.datetime64('2013-01-01')
+ df['dt2'] = np.array(['2013-01-01', '2013-01-02', '2013-01-03'],
+ dtype='datetime64[D]')
+
+ # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01
+ # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')
+
+ assert_frame_equal(df, expected)
+
+ def test_constructor_compound_dtypes(self):
+ # GH 5191
+        # compound dtypes should raise NotImplementedError
+
+ def f(dtype):
+ data = list(itertools.repeat((datetime(2001, 1, 1),
+ "aa", 20), 9))
+ return DataFrame(data=data,
+ columns=["A", "B", "C"],
+ dtype=dtype)
+
+ pytest.raises(NotImplementedError, f,
+ [("A", "datetime64[h]"),
+ ("B", "str"),
+ ("C", "int32")])
+
+ # these work (though results may be unexpected)
+ f('int64')
+ f('float64')
+
+ # 10822
+ # invalid error message on dt inference
+ if not compat.is_platform_windows():
+ f('M8[ns]')
+
+ def test_equals_different_blocks(self):
+ # GH 9330
+ df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2],
+ "C": ["w", "z"]})
+ df1 = df0.reset_index()[["A", "B", "C"]]
+ # this assert verifies that the above operations have
+ # induced a block rearrangement
+ assert (df0._data.blocks[0].dtype != df1._data.blocks[0].dtype)
+
+ # do the real tests
+ assert_frame_equal(df0, df1)
+ assert df0.equals(df1)
+ assert df1.equals(df0)
+
+ def test_copy_blocks(self, float_frame):
+ # API/ENH 9607
+ df = DataFrame(float_frame, copy=True)
+ column = df.columns[0]
+
+ # use the default copy=True, change a column
+
+ # deprecated 0.21.0
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ blocks = df.as_blocks()
+ for dtype, _df in blocks.items():
+ if column in _df:
+ _df.loc[:, column] = _df[column] + 1
+
+ # make sure we did not change the original DataFrame
+ assert not _df[column].equals(df[column])
+
+ def test_no_copy_blocks(self, float_frame):
+ # API/ENH 9607
+ df = DataFrame(float_frame, copy=True)
+ column = df.columns[0]
+
+ # use the copy=False, change a column
+
+ # deprecated 0.21.0
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ blocks = df.as_blocks(copy=False)
+ for dtype, _df in blocks.items():
+ if column in _df:
+ _df.loc[:, column] = _df[column] + 1
+
+ # make sure we did change the original DataFrame
+ assert _df[column].equals(df[column])
+
+ def test_copy(self, float_frame, float_string_frame):
+ cop = float_frame.copy()
+ cop['E'] = cop['A']
+ assert 'E' not in float_frame
+
+ # copy objects
+ copy = float_string_frame.copy()
+ assert copy._data is not float_string_frame._data
+
+ def test_pickle(self, float_string_frame, empty_frame, timezone_frame):
+ unpickled = tm.round_trip_pickle(float_string_frame)
+ assert_frame_equal(float_string_frame, unpickled)
+
+        # buglet: BlockManager.ndim should be accessible without error
+ float_string_frame._data.ndim
+
+ # empty
+ unpickled = tm.round_trip_pickle(empty_frame)
+ repr(unpickled)
+
+ # tz frame
+ unpickled = tm.round_trip_pickle(timezone_frame)
+ assert_frame_equal(timezone_frame, unpickled)
+
+ def test_consolidate_datetime64(self):
+ # numpy vstack bug
+
+ data = """\
+starting,ending,measure
+2012-06-21 00:00,2012-06-23 07:00,77
+2012-06-23 07:00,2012-06-23 16:30,65
+2012-06-23 16:30,2012-06-25 08:00,77
+2012-06-25 08:00,2012-06-26 12:00,0
+2012-06-26 12:00,2012-06-27 08:00,77
+"""
+ df = pd.read_csv(StringIO(data), parse_dates=[0, 1])
+
+ ser_starting = df.starting
+ ser_starting.index = ser_starting.values
+ ser_starting = ser_starting.tz_localize('US/Eastern')
+ ser_starting = ser_starting.tz_convert('UTC')
+ ser_starting.index.name = 'starting'
+
+ ser_ending = df.ending
+ ser_ending.index = ser_ending.values
+ ser_ending = ser_ending.tz_localize('US/Eastern')
+ ser_ending = ser_ending.tz_convert('UTC')
+ ser_ending.index.name = 'ending'
+
+ df.starting = ser_starting.index
+ df.ending = ser_ending.index
+
+ tm.assert_index_equal(pd.DatetimeIndex(
+ df.starting), ser_starting.index)
+ tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index)
+
+ def test_is_mixed_type(self, float_frame, float_string_frame):
+ assert not float_frame._is_mixed_type
+ assert float_string_frame._is_mixed_type
+
+ def test_get_numeric_data(self):
+ # TODO(wesm): unused?
+ intname = np.dtype(np.int_).name # noqa
+ floatname = np.dtype(np.float_).name # noqa
+
+ datetime64name = np.dtype('M8[ns]').name
+ objectname = np.dtype(np.object_).name
+
+ df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
+ 'f': Timestamp('20010102')},
+ index=np.arange(10))
+ result = df.get_dtype_counts()
+ expected = Series({'int64': 1, 'float64': 1,
+ datetime64name: 1, objectname: 1})
+ result = result.sort_index()
+ expected = expected.sort_index()
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
+ 'd': np.array([1.] * 10, dtype='float32'),
+ 'e': np.array([1] * 10, dtype='int32'),
+ 'f': np.array([1] * 10, dtype='int16'),
+ 'g': Timestamp('20010102')},
+ index=np.arange(10))
+
+ result = df._get_numeric_data()
+ expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
+ assert_frame_equal(result, expected)
+
+ only_obj = df.loc[:, ['c', 'g']]
+ result = only_obj._get_numeric_data()
+ expected = df.loc[:, []]
+ assert_frame_equal(result, expected)
+
+ df = DataFrame.from_dict(
+ {'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]})
+ result = df._get_numeric_data()
+ expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
+ assert_frame_equal(result, expected)
+
+ df = result.copy()
+ result = df._get_numeric_data()
+ expected = df
+ assert_frame_equal(result, expected)
+
+ def test_get_numeric_data_extension_dtype(self):
+ # GH 22290
+ df = DataFrame({
+ 'A': integer_array([-10, np.nan, 0, 10, 20, 30], dtype='Int64'),
+ 'B': Categorical(list('abcabc')),
+ 'C': integer_array([0, 1, 2, 3, np.nan, 5], dtype='UInt8'),
+ 'D': IntervalArray.from_breaks(range(7))})
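+        # the nullable-integer columns count as numeric; the Categorical
+        # and Interval columns do not, so only A and C survive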
+ result = df._get_numeric_data()
+ expected = df.loc[:, ['A', 'C']]
+ assert_frame_equal(result, expected)
+
+ def test_convert_objects(self, float_string_frame):
+
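+        # a double transpose of a mixed frame coerces every block to
+        # object dtype, giving _convert real work to do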
+ oops = float_string_frame.T.T
+ converted = oops._convert(datetime=True)
+ assert_frame_equal(converted, float_string_frame)
+ assert converted['A'].dtype == np.float64
+
+ # force numeric conversion
+ float_string_frame['H'] = '1.'
+ float_string_frame['I'] = '1'
+
+ # add in some items that will be nan
+ length = len(float_string_frame)
+ float_string_frame['J'] = '1.'
+ float_string_frame['K'] = '1'
+ float_string_frame.loc[0:5, ['J', 'K']] = 'garbled'
+ converted = float_string_frame._convert(datetime=True, numeric=True)
+ assert converted['H'].dtype == 'float64'
+ assert converted['I'].dtype == 'int64'
+ assert converted['J'].dtype == 'float64'
+ assert converted['K'].dtype == 'float64'
+ assert len(converted['J'].dropna()) == length - 5
+ assert len(converted['K'].dropna()) == length - 5
+
+ # via astype
+ converted = float_string_frame.copy()
+ converted['H'] = converted['H'].astype('float64')
+ converted['I'] = converted['I'].astype('int64')
+ assert converted['H'].dtype == 'float64'
+ assert converted['I'].dtype == 'int64'
+
+ # via astype, but errors
+ converted = float_string_frame.copy()
+ with pytest.raises(ValueError, match='invalid literal'):
+ converted['H'].astype('int32')
+
+ # mixed in a single column
+ df = DataFrame(dict(s=Series([1, 'na', 3, 4])))
+ result = df._convert(datetime=True, numeric=True)
+ expected = DataFrame(dict(s=Series([1, np.nan, 3, 4])))
+ assert_frame_equal(result, expected)
+
+ def test_convert_objects_no_conversion(self):
+ mixed1 = DataFrame(
+ {'a': [1, 2, 3], 'b': [4.0, 5, 6], 'c': ['x', 'y', 'z']})
+ mixed2 = mixed1._convert(datetime=True)
+ assert_frame_equal(mixed1, mixed2)
+
+ def test_infer_objects(self):
+ # GH 11221
+ df = DataFrame({'a': ['a', 1, 2, 3],
+ 'b': ['b', 2.0, 3.0, 4.1],
+ 'c': ['c', datetime(2016, 1, 1),
+ datetime(2016, 1, 2),
+ datetime(2016, 1, 3)],
+ 'd': [1, 2, 3, 'd']},
+ columns=['a', 'b', 'c', 'd'])
+ df = df.iloc[1:].infer_objects()
+
+ assert df['a'].dtype == 'int64'
+ assert df['b'].dtype == 'float64'
+ assert df['c'].dtype == 'M8[ns]'
+ assert df['d'].dtype == 'object'
+
+ expected = DataFrame({'a': [1, 2, 3],
+ 'b': [2.0, 3.0, 4.1],
+ 'c': [datetime(2016, 1, 1),
+ datetime(2016, 1, 2),
+ datetime(2016, 1, 3)],
+ 'd': [2, 3, 'd']},
+ columns=['a', 'b', 'c', 'd'])
+ # reconstruct frame to verify inference is same
+ tm.assert_frame_equal(df.reset_index(drop=True), expected)
+
+ def test_stale_cached_series_bug_473(self):
+
+ # this is chained, but ok
+ with option_context('chained_assignment', None):
+ Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'),
+ columns=('e', 'f', 'g', 'h'))
+ repr(Y)
+ Y['e'] = Y['e'].astype('object')
+ Y['g']['c'] = np.NaN
+ repr(Y)
+ result = Y.sum() # noqa
+ exp = Y['g'].sum() # noqa
+ assert pd.isna(Y['g']['c'])
+
+ def test_get_X_columns(self):
+ # numeric and object columns
+
+ df = DataFrame({'a': [1, 2, 3],
+ 'b': [True, False, True],
+ 'c': ['foo', 'bar', 'baz'],
+ 'd': [None, None, None],
+ 'e': [3.14, 0.577, 2.773]})
+
+ tm.assert_index_equal(df._get_numeric_data().columns,
+ pd.Index(['a', 'b', 'e']))
+
+ def test_strange_column_corruption_issue(self):
+ # (wesm) Unclear how exactly this is related to internal matters
+ df = DataFrame(index=[0, 1])
+ df[0] = np.nan
+ wasCol = {}
+ # uncommenting these makes the results match
+ # for col in xrange(100, 200):
+ # wasCol[col] = 1
+ # df[col] = np.nan
+
+ for i, dt in enumerate(df.index):
+ for col in range(100, 200):
+ if col not in wasCol:
+ wasCol[col] = 1
+ df[col] = np.nan
+ df[col][dt] = i
+
+ myid = 100
+
+ first = len(df.loc[pd.isna(df[myid]), [myid]])
+ second = len(df.loc[pd.isna(df[myid]), [myid]])
+ assert first == second == 0
+
+ def test_constructor_no_pandas_array(self):
+ # Ensure that PandasArray isn't allowed inside Series
+ # See https://github.com/pandas-dev/pandas/issues/23995 for more.
+ arr = pd.Series([1, 2, 3]).array
+ result = pd.DataFrame({"A": arr})
+ expected = pd.DataFrame({"A": [1, 2, 3]})
+ tm.assert_frame_equal(result, expected)
+ assert isinstance(result._data.blocks[0], IntBlock)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_combine_concat.py b/contrib/python/pandas/py2/pandas/tests/frame/test_combine_concat.py
new file mode 100644
index 00000000000..59497153c85
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_combine_concat.py
@@ -0,0 +1,863 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+
+import pandas as pd
+from pandas import DataFrame, Index, Series, Timestamp, date_range
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestDataFrameConcatCommon(TestData):
+
+ def test_concat_multiple_frames_dtypes(self):
+
+ # GH 2759
+ A = DataFrame(data=np.ones((10, 2)), columns=[
+ 'foo', 'bar'], dtype=np.float64)
+ B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
+ results = pd.concat((A, B), axis=1).get_dtype_counts()
+ expected = Series(dict(float64=2, float32=2))
+ assert_series_equal(results, expected)
+
+ @pytest.mark.parametrize('data', [
+ pd.date_range('2000', periods=4),
+ pd.date_range('2000', periods=4, tz="US/Central"),
+ pd.period_range('2000', periods=4),
+ pd.timedelta_range(0, periods=4),
+ ])
+    def test_combine_datetimelike_udf(self, data):
+ # https://github.com/pandas-dev/pandas/issues/23079
+ df = pd.DataFrame({"A": data})
+ other = df.copy()
+ df.iloc[1, 0] = None
+
+ def combiner(a, b):
+ return b
+
+ result = df.combine(other, combiner)
+ tm.assert_frame_equal(result, other)
+
+ def test_concat_multiple_tzs(self):
+ # GH 12467
+ # combining datetime tz-aware and naive DataFrames
+ ts1 = Timestamp('2015-01-01', tz=None)
+ ts2 = Timestamp('2015-01-01', tz='UTC')
+ ts3 = Timestamp('2015-01-01', tz='EST')
+
+ df1 = DataFrame(dict(time=[ts1]))
+ df2 = DataFrame(dict(time=[ts2]))
+ df3 = DataFrame(dict(time=[ts3]))
+
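+        # mixing tz-naive with tz-aware values (or two different time
+        # zones) cannot keep a single datetime64 dtype, so the result
+        # falls back to object; identical tzs stay datetime64[ns, tz]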
+ results = pd.concat([df1, df2]).reset_index(drop=True)
+ expected = DataFrame(dict(time=[ts1, ts2]), dtype=object)
+ assert_frame_equal(results, expected)
+
+ results = pd.concat([df1, df3]).reset_index(drop=True)
+ expected = DataFrame(dict(time=[ts1, ts3]), dtype=object)
+ assert_frame_equal(results, expected)
+
+ results = pd.concat([df2, df3]).reset_index(drop=True)
+ expected = DataFrame(dict(time=[ts2, ts3]))
+ assert_frame_equal(results, expected)
+
+ @pytest.mark.parametrize(
+ 't1',
+ [
+ '2015-01-01',
+ pytest.param(pd.NaT, marks=pytest.mark.xfail(
+ reason='GH23037 incorrect dtype when concatenating'))])
+ def test_concat_tz_NaT(self, t1):
+ # GH 22796
+        # Concatenating tz-aware multicolumn DataFrames
+ ts1 = Timestamp(t1, tz='UTC')
+ ts2 = Timestamp('2015-01-01', tz='UTC')
+ ts3 = Timestamp('2015-01-01', tz='UTC')
+
+ df1 = DataFrame([[ts1, ts2]])
+ df2 = DataFrame([[ts3]])
+
+ result = pd.concat([df1, df2])
+ expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
+
+ assert_frame_equal(result, expected)
+
+ def test_concat_tz_not_aligned(self):
+ # GH 22796
+ ts = pd.to_datetime([1, 2]).tz_localize("UTC")
+ a = pd.DataFrame({"A": ts})
+ b = pd.DataFrame({"A": ts, "B": ts})
+ result = pd.concat([a, b], sort=True, ignore_index=True)
+ expected = pd.DataFrame({"A": list(ts) + list(ts),
+ "B": [pd.NaT, pd.NaT] + list(ts)})
+ assert_frame_equal(result, expected)
+
+ def test_concat_tuple_keys(self):
+ # GH 14438
+ df1 = pd.DataFrame(np.ones((2, 2)), columns=list('AB'))
+ df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list('AB'))
+ results = pd.concat((df1, df2), keys=[('bee', 'bah'), ('bee', 'boo')])
+ expected = pd.DataFrame(
+ {'A': {('bee', 'bah', 0): 1.0,
+ ('bee', 'bah', 1): 1.0,
+ ('bee', 'boo', 0): 2.0,
+ ('bee', 'boo', 1): 2.0,
+ ('bee', 'boo', 2): 2.0},
+ 'B': {('bee', 'bah', 0): 1.0,
+ ('bee', 'bah', 1): 1.0,
+ ('bee', 'boo', 0): 2.0,
+ ('bee', 'boo', 1): 2.0,
+ ('bee', 'boo', 2): 2.0}})
+ assert_frame_equal(results, expected)
+
+ def test_append_series_dict(self):
+ df = DataFrame(np.random.randn(5, 4),
+ columns=['foo', 'bar', 'baz', 'qux'])
+
+ series = df.loc[4]
+ msg = 'Indexes have overlapping values'
+ with pytest.raises(ValueError, match=msg):
+ df.append(series, verify_integrity=True)
+
+ series.name = None
+ msg = 'Can only append a Series if ignore_index=True'
+ with pytest.raises(TypeError, match=msg):
+ df.append(series, verify_integrity=True)
+
+ result = df.append(series[::-1], ignore_index=True)
+ expected = df.append(DataFrame({0: series[::-1]}, index=df.columns).T,
+ ignore_index=True)
+ assert_frame_equal(result, expected)
+
+ # dict
+ result = df.append(series.to_dict(), ignore_index=True)
+ assert_frame_equal(result, expected)
+
+ result = df.append(series[::-1][:3], ignore_index=True)
+ expected = df.append(DataFrame({0: series[::-1][:3]}).T,
+ ignore_index=True, sort=True)
+ assert_frame_equal(result, expected.loc[:, result.columns])
+
+ # can append when name set
+ row = df.loc[4]
+ row.name = 5
+ result = df.append(row)
+ expected = df.append(df[-1:], ignore_index=True)
+ assert_frame_equal(result, expected)
+
+ def test_append_list_of_series_dicts(self):
+ df = DataFrame(np.random.randn(5, 4),
+ columns=['foo', 'bar', 'baz', 'qux'])
+
+ dicts = [x.to_dict() for idx, x in df.iterrows()]
+
+ result = df.append(dicts, ignore_index=True)
+ expected = df.append(df, ignore_index=True)
+ assert_frame_equal(result, expected)
+
+ # different columns
+ dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
+ {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
+ result = df.append(dicts, ignore_index=True, sort=True)
+ expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
+ assert_frame_equal(result, expected)
+
+ def test_append_empty_dataframe(self):
+
+ # Empty df append empty df
+ df1 = DataFrame([])
+ df2 = DataFrame([])
+ result = df1.append(df2)
+ expected = df1.copy()
+ assert_frame_equal(result, expected)
+
+ # Non-empty df append empty df
+ df1 = DataFrame(np.random.randn(5, 2))
+ df2 = DataFrame()
+ result = df1.append(df2)
+ expected = df1.copy()
+ assert_frame_equal(result, expected)
+
+ # Empty df with columns append empty df
+ df1 = DataFrame(columns=['bar', 'foo'])
+ df2 = DataFrame()
+ result = df1.append(df2)
+ expected = df1.copy()
+ assert_frame_equal(result, expected)
+
+ # Non-Empty df with columns append empty df
+ df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo'])
+ df2 = DataFrame()
+ result = df1.append(df2)
+ expected = df1.copy()
+ assert_frame_equal(result, expected)
+
+ def test_append_dtypes(self):
+
+ # GH 5754
+        # row appends of different dtypes (so they need to be handled
+        # item-by-item) can sometimes infer the correct type
+
+ df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(5))
+ df2 = DataFrame()
+ result = df1.append(df2)
+ expected = df1.copy()
+ assert_frame_equal(result, expected)
+
+ df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
+ df2 = DataFrame({'bar': 'foo'}, index=lrange(1, 2))
+ result = df1.append(df2)
+ expected = DataFrame({'bar': [Timestamp('20130101'), 'foo']})
+ assert_frame_equal(result, expected)
+
+ df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
+ df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2))
+ result = df1.append(df2)
+ expected = DataFrame(
+ {'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
+ assert_frame_equal(result, expected)
+
+ df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
+ df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2), dtype=object)
+ result = df1.append(df2)
+ expected = DataFrame(
+ {'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
+ assert_frame_equal(result, expected)
+
+ df1 = DataFrame({'bar': np.nan}, index=lrange(1))
+ df2 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1, 2))
+ result = df1.append(df2)
+ expected = DataFrame(
+ {'bar': Series([np.nan, Timestamp('20130101')], dtype='M8[ns]')})
+ assert_frame_equal(result, expected)
+
+ df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
+ df2 = DataFrame({'bar': 1}, index=lrange(1, 2), dtype=object)
+ result = df1.append(df2)
+ expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])})
+ assert_frame_equal(result, expected)
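+
+    # Pattern in the cases above: appended values that are compatible
+    # with an existing datetime64 column (e.g. NaN, which becomes NaT)
+    # keep the 'M8[ns]' dtype, while genuinely mixed values (strings,
+    # ints) upcast the column to object.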
+
+ def test_update(self):
+ df = DataFrame([[1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3],
+ [1.5, np.nan, 3]])
+
+ other = DataFrame([[3.6, 2., np.nan],
+ [np.nan, np.nan, 7]], index=[1, 3])
+
+ df.update(other)
+
+ expected = DataFrame([[1.5, np.nan, 3],
+ [3.6, 2, 3],
+ [1.5, np.nan, 3],
+ [1.5, np.nan, 7.]])
+ assert_frame_equal(df, expected)
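+
+    # A minimal sketch (illustrative values) of the semantics the update
+    # tests rely on: DataFrame.update aligns `other` on index/columns
+    # and, by default, overwrites self only where `other` is non-NaN:
+    #
+    #   >>> d = DataFrame({'x': [1., 2.]})
+    #   >>> d.update(DataFrame({'x': [np.nan, 9.]}))
+    #   >>> d['x'].tolist()
+    #   [1.0, 9.0]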
+
+ def test_update_dtypes(self):
+
+ # gh 3016
+ df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
+ columns=['A', 'B', 'bool1', 'bool2'])
+
+ other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
+ df.update(other)
+
+ expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
+ columns=['A', 'B', 'bool1', 'bool2'])
+ assert_frame_equal(df, expected)
+
+ def test_update_nooverwrite(self):
+ df = DataFrame([[1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3],
+ [1.5, np.nan, 3]])
+
+ other = DataFrame([[3.6, 2., np.nan],
+ [np.nan, np.nan, 7]], index=[1, 3])
+
+ df.update(other, overwrite=False)
+
+ expected = DataFrame([[1.5, np.nan, 3],
+ [1.5, 2, 3],
+ [1.5, np.nan, 3],
+ [1.5, np.nan, 3.]])
+ assert_frame_equal(df, expected)
+
+ def test_update_filtered(self):
+ df = DataFrame([[1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3],
+ [1.5, np.nan, 3]])
+
+ other = DataFrame([[3.6, 2., np.nan],
+ [np.nan, np.nan, 7]], index=[1, 3])
+
+ df.update(other, filter_func=lambda x: x > 2)
+
+ expected = DataFrame([[1.5, np.nan, 3],
+ [1.5, np.nan, 3],
+ [1.5, np.nan, 3],
+ [1.5, np.nan, 7.]])
+ assert_frame_equal(df, expected)
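+
+    # With filter_func, a cell of `df` is replaced only where
+    # filter_func(df) is True *and* the aligned value in `other` is
+    # non-NaN; everything else is left untouched, which is why only the
+    # last row changes above.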
+
+ @pytest.mark.parametrize('bad_kwarg, exception, msg', [
+ # errors must be 'ignore' or 'raise'
+ ({'errors': 'something'}, ValueError, 'The parameter errors must.*'),
+ ({'join': 'inner'}, NotImplementedError, 'Only left join is supported')
+ ])
+ def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
+ df = DataFrame([[1.5, 1, 3.]])
+ with pytest.raises(exception, match=msg):
+ df.update(df, **bad_kwarg)
+
+ def test_update_raise_on_overlap(self):
+ df = DataFrame([[1.5, 1, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3],
+ [1.5, np.nan, 3]])
+
+ other = DataFrame([[2., np.nan],
+ [np.nan, 7]], index=[1, 3], columns=[1, 2])
+ with pytest.raises(ValueError, match="Data overlaps"):
+ df.update(other, errors='raise')
+
+ @pytest.mark.parametrize('raise_conflict', [True, False])
+ def test_update_deprecation(self, raise_conflict):
+ df = DataFrame([[1.5, 1, 3.]])
+ other = DataFrame()
+ with tm.assert_produces_warning(FutureWarning):
+ df.update(other, raise_conflict=raise_conflict)
+
+ def test_update_from_non_df(self):
+ d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])}
+ df = DataFrame(d)
+
+ d['a'] = Series([5, 6, 7, 8])
+ df.update(d)
+
+ expected = DataFrame(d)
+
+ assert_frame_equal(df, expected)
+
+ d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}
+ df = DataFrame(d)
+
+ d['a'] = [5, 6, 7, 8]
+ df.update(d)
+
+ expected = DataFrame(d)
+
+ assert_frame_equal(df, expected)
+
+ def test_join_str_datetime(self):
+ str_dates = ['20120209', '20120222']
+ dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
+
+ A = DataFrame(str_dates, index=lrange(2), columns=['aa'])
+ C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)
+
+ tst = A.join(C, on='aa')
+
+ assert len(tst.columns) == 3
+
+ def test_join_multiindex_leftright(self):
+ # GH 10741
+ df1 = (pd.DataFrame([['a', 'x', 0.471780], ['a', 'y', 0.774908],
+ ['a', 'z', 0.563634], ['b', 'x', -0.353756],
+ ['b', 'y', 0.368062], ['b', 'z', -1.721840],
+ ['c', 'x', 1], ['c', 'y', 2], ['c', 'z', 3]],
+ columns=['first', 'second', 'value1'])
+ .set_index(['first', 'second']))
+
+ df2 = (pd.DataFrame([['a', 10], ['b', 20]],
+ columns=['first', 'value2'])
+ .set_index(['first']))
+
+ exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
+ [-0.353756, 20], [0.368062, 20],
+ [-1.721840, 20],
+ [1.000000, np.nan], [2.000000, np.nan],
+ [3.000000, np.nan]],
+ index=df1.index, columns=['value1', 'value2'])
+
+        # these must give the same result (with the columns flipped)
+ assert_frame_equal(df1.join(df2, how='left'), exp)
+ assert_frame_equal(df2.join(df1, how='right'),
+ exp[['value2', 'value1']])
+
+ exp_idx = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y', 'z']],
+ names=['first', 'second'])
+ exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
+ [-0.353756, 20], [0.368062, 20], [-1.721840, 20]],
+ index=exp_idx, columns=['value1', 'value2'])
+
+ assert_frame_equal(df1.join(df2, how='right'), exp)
+ assert_frame_equal(df2.join(df1, how='left'),
+ exp[['value2', 'value1']])
+
+ def test_concat_named_keys(self):
+ # GH 14252
+ df = pd.DataFrame({'foo': [1, 2], 'bar': [0.1, 0.2]})
+ index = Index(['a', 'b'], name='baz')
+ concatted_named_from_keys = pd.concat([df, df], keys=index)
+ expected_named = pd.DataFrame(
+ {'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
+ index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
+ names=['baz', None]))
+ assert_frame_equal(concatted_named_from_keys, expected_named)
+
+ index_no_name = Index(['a', 'b'], name=None)
+ concatted_named_from_names = pd.concat(
+ [df, df], keys=index_no_name, names=['baz'])
+ assert_frame_equal(concatted_named_from_names, expected_named)
+
+ concatted_unnamed = pd.concat([df, df], keys=index_no_name)
+ expected_unnamed = pd.DataFrame(
+ {'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
+ index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
+ names=[None, None]))
+ assert_frame_equal(concatted_unnamed, expected_unnamed)
+
+ def test_concat_axis_parameter(self):
+ # GH 14369
+ df1 = pd.DataFrame({'A': [0.1, 0.2]}, index=range(2))
+ df2 = pd.DataFrame({'A': [0.3, 0.4]}, index=range(2))
+
+ # Index/row/0 DataFrame
+ expected_index = pd.DataFrame(
+ {'A': [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
+
+ concatted_index = pd.concat([df1, df2], axis='index')
+ assert_frame_equal(concatted_index, expected_index)
+
+ concatted_row = pd.concat([df1, df2], axis='rows')
+ assert_frame_equal(concatted_row, expected_index)
+
+ concatted_0 = pd.concat([df1, df2], axis=0)
+ assert_frame_equal(concatted_0, expected_index)
+
+ # Columns/1 DataFrame
+ expected_columns = pd.DataFrame(
+ [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=['A', 'A'])
+
+ concatted_columns = pd.concat([df1, df2], axis='columns')
+ assert_frame_equal(concatted_columns, expected_columns)
+
+ concatted_1 = pd.concat([df1, df2], axis=1)
+ assert_frame_equal(concatted_1, expected_columns)
+
+ series1 = pd.Series([0.1, 0.2])
+ series2 = pd.Series([0.3, 0.4])
+
+ # Index/row/0 Series
+ expected_index_series = pd.Series(
+ [0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
+
+ concatted_index_series = pd.concat([series1, series2], axis='index')
+ assert_series_equal(concatted_index_series, expected_index_series)
+
+ concatted_row_series = pd.concat([series1, series2], axis='rows')
+ assert_series_equal(concatted_row_series, expected_index_series)
+
+ concatted_0_series = pd.concat([series1, series2], axis=0)
+ assert_series_equal(concatted_0_series, expected_index_series)
+
+ # Columns/1 Series
+ expected_columns_series = pd.DataFrame(
+ [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1])
+
+ concatted_columns_series = pd.concat(
+ [series1, series2], axis='columns')
+ assert_frame_equal(concatted_columns_series, expected_columns_series)
+
+ concatted_1_series = pd.concat([series1, series2], axis=1)
+ assert_frame_equal(concatted_1_series, expected_columns_series)
+
+ # Testing ValueError
+ with pytest.raises(ValueError, match='No axis named'):
+ pd.concat([series1, series2], axis='something')
+
+ def test_concat_numerical_names(self):
+        # GH 15262, GH 12223
+ df = pd.DataFrame({'col': range(9)},
+ dtype='int32',
+ index=(pd.MultiIndex
+ .from_product([['A0', 'A1', 'A2'],
+ ['B0', 'B1', 'B2']],
+ names=[1, 2])))
+ result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
+ expected = pd.DataFrame({'col': [0, 1, 7, 8]},
+ dtype='int32',
+ index=pd.MultiIndex.from_tuples([('A0', 'B0'),
+ ('A0', 'B1'),
+ ('A2', 'B1'),
+ ('A2', 'B2')],
+ names=[1, 2]))
+ tm.assert_frame_equal(result, expected)
+
+
+class TestDataFrameCombineFirst(TestData):
+
+ def test_combine_first_mixed(self):
+ a = Series(['a', 'b'], index=lrange(2))
+ b = Series(lrange(2), index=lrange(2))
+ f = DataFrame({'A': a, 'B': b})
+
+ a = Series(['a', 'b'], index=lrange(5, 7))
+ b = Series(lrange(2), index=lrange(5, 7))
+ g = DataFrame({'A': a, 'B': b})
+
+ exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]},
+ index=[0, 1, 5, 6])
+ combined = f.combine_first(g)
+ tm.assert_frame_equal(combined, exp)
+
+ def test_combine_first(self):
+ # disjoint
+ head, tail = self.frame[:5], self.frame[5:]
+
+ combined = head.combine_first(tail)
+ reordered_frame = self.frame.reindex(combined.index)
+ assert_frame_equal(combined, reordered_frame)
+ assert tm.equalContents(combined.columns, self.frame.columns)
+ assert_series_equal(combined['A'], reordered_frame['A'])
+
+ # same index
+ fcopy = self.frame.copy()
+ fcopy['A'] = 1
+ del fcopy['C']
+
+ fcopy2 = self.frame.copy()
+ fcopy2['B'] = 0
+ del fcopy2['D']
+
+ combined = fcopy.combine_first(fcopy2)
+
+ assert (combined['A'] == 1).all()
+ assert_series_equal(combined['B'], fcopy['B'])
+ assert_series_equal(combined['C'], fcopy2['C'])
+ assert_series_equal(combined['D'], fcopy['D'])
+
+ # overlap
+ head, tail = reordered_frame[:10].copy(), reordered_frame
+ head['A'] = 1
+
+ combined = head.combine_first(tail)
+ assert (combined['A'][:10] == 1).all()
+
+ # reverse overlap
+ tail['A'][:10] = 0
+ combined = tail.combine_first(head)
+ assert (combined['A'][:10] == 0).all()
+
+ # no overlap
+ f = self.frame[:10]
+ g = self.frame[10:]
+ combined = f.combine_first(g)
+ assert_series_equal(combined['A'].reindex(f.index), f['A'])
+ assert_series_equal(combined['A'].reindex(g.index), g['A'])
+
+ # corner cases
+ comb = self.frame.combine_first(self.empty)
+ assert_frame_equal(comb, self.frame)
+
+ comb = self.empty.combine_first(self.frame)
+ assert_frame_equal(comb, self.frame)
+
+ comb = self.frame.combine_first(DataFrame(index=["faz", "boo"]))
+ assert "faz" in comb.index
+
+ # #2525
+ df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)])
+ df2 = DataFrame({}, columns=['b'])
+ result = df.combine_first(df2)
+ assert 'b' in result
+
+ def test_combine_first_mixed_bug(self):
+ idx = Index(['a', 'b', 'c', 'e'])
+ ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
+ ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
+ ser3 = Series([12, 4, 5, 97], index=idx)
+
+ frame1 = DataFrame({"col0": ser1,
+ "col2": ser2,
+ "col3": ser3})
+
+ idx = Index(['a', 'b', 'c', 'f'])
+ ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
+ ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
+ ser3 = Series([12, 4, 5, 97], index=idx)
+
+ frame2 = DataFrame({"col1": ser1,
+ "col2": ser2,
+ "col5": ser3})
+
+ combined = frame1.combine_first(frame2)
+ assert len(combined.columns) == 5
+
+ # gh 3016 (same as in update)
+ df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
+ columns=['A', 'B', 'bool1', 'bool2'])
+
+ other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
+ result = df.combine_first(other)
+ assert_frame_equal(result, df)
+
+ df.loc[0, 'A'] = np.nan
+ result = df.combine_first(other)
+ df.loc[0, 'A'] = 45
+ assert_frame_equal(result, df)
+
+ # doc example
+ df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
+ 'B': [np.nan, 2., 3., np.nan, 6.]})
+
+ df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
+ 'B': [np.nan, np.nan, 3., 4., 6., 8.]})
+
+ result = df1.combine_first(df2)
+ expected = DataFrame(
+ {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]})
+ assert_frame_equal(result, expected)
+
+ # GH3552, return object dtype with bools
+ df1 = DataFrame(
+ [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
+ df2 = DataFrame(
+ [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2])
+
+ result = df1.combine_first(df2)[2]
+ expected = Series([True, True, False], name=2)
+ assert_series_equal(result, expected)
+
+        # GH 3593, converting datetime64[ns] incorrectly
+ df0 = DataFrame({"a": [datetime(2000, 1, 1),
+ datetime(2000, 1, 2),
+ datetime(2000, 1, 3)]})
+ df1 = DataFrame({"a": [None, None, None]})
+ df2 = df1.combine_first(df0)
+ assert_frame_equal(df2, df0)
+
+ df2 = df0.combine_first(df1)
+ assert_frame_equal(df2, df0)
+
+ df0 = DataFrame({"a": [datetime(2000, 1, 1),
+ datetime(2000, 1, 2),
+ datetime(2000, 1, 3)]})
+ df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
+ df2 = df1.combine_first(df0)
+ result = df0.copy()
+ result.iloc[0, :] = df1.iloc[0, :]
+ assert_frame_equal(df2, result)
+
+ df2 = df0.combine_first(df1)
+ assert_frame_equal(df2, df0)
+
+ def test_combine_first_align_nan(self):
+ # GH 7509 (not fixed)
+ dfa = pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]],
+ columns=['a', 'b'])
+ dfb = pd.DataFrame([[4], [5]], columns=['b'])
+ assert dfa['a'].dtype == 'datetime64[ns]'
+ assert dfa['b'].dtype == 'int64'
+
+ res = dfa.combine_first(dfb)
+ exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT],
+ 'b': [2., 5.]}, columns=['a', 'b'])
+ tm.assert_frame_equal(res, exp)
+ assert res['a'].dtype == 'datetime64[ns]'
+        # TODO: this should be int64
+ assert res['b'].dtype == 'float64'
+
+ res = dfa.iloc[:0].combine_first(dfb)
+ exp = pd.DataFrame({'a': [np.nan, np.nan],
+ 'b': [4, 5]}, columns=['a', 'b'])
+ tm.assert_frame_equal(res, exp)
+        # TODO: this should be datetime64
+ assert res['a'].dtype == 'float64'
+        # TODO: this should be int64
+ assert res['b'].dtype == 'int64'
+
+ def test_combine_first_timezone(self):
+ # see gh-7630
+ data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC')
+ df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'],
+ data=data1,
+ index=pd.date_range('20140627', periods=1))
+ data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC')
+ df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'],
+ data=data2,
+ index=pd.date_range('20140628', periods=1))
+ res = df2[['UTCdatetime']].combine_first(df1)
+ exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01',
+ tz='UTC'),
+ pd.Timestamp('2012-12-12 12:12',
+ tz='UTC')],
+ 'abc': [pd.Timestamp('2010-01-01 01:01:00',
+ tz='UTC'), pd.NaT]},
+ columns=['UTCdatetime', 'abc'],
+ index=pd.date_range('20140627', periods=2,
+ freq='D'))
+ tm.assert_frame_equal(res, exp)
+ assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]'
+ assert res['abc'].dtype == 'datetime64[ns, UTC]'
+
+ # see gh-10567
+ dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC')
+ df1 = pd.DataFrame({'DATE': dts1})
+ dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC')
+ df2 = pd.DataFrame({'DATE': dts2})
+
+ res = df1.combine_first(df2)
+ tm.assert_frame_equal(res, df1)
+ assert res['DATE'].dtype == 'datetime64[ns, UTC]'
+
+ dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03',
+ '2011-01-04'], tz='US/Eastern')
+ df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7])
+ dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02',
+ '2012-01-03'], tz='US/Eastern')
+ df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5])
+
+ res = df1.combine_first(df2)
+ exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT',
+ '2012-01-02', '2011-01-03', '2011-01-04'],
+ tz='US/Eastern')
+ exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7])
+ tm.assert_frame_equal(res, exp)
+
+ # different tz
+ dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern')
+ df1 = pd.DataFrame({'DATE': dts1})
+ dts2 = pd.date_range('2015-01-03', '2015-01-05')
+ df2 = pd.DataFrame({'DATE': dts2})
+
+ # if df1 doesn't have NaN, keep its dtype
+ res = df1.combine_first(df2)
+ tm.assert_frame_equal(res, df1)
+ assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]'
+
+ dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern')
+ df1 = pd.DataFrame({'DATE': dts1})
+ dts2 = pd.date_range('2015-01-01', '2015-01-03')
+ df2 = pd.DataFrame({'DATE': dts2})
+
+ res = df1.combine_first(df2)
+ exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'),
+ pd.Timestamp('2015-01-02', tz='US/Eastern'),
+ pd.Timestamp('2015-01-03')]
+ exp = pd.DataFrame({'DATE': exp_dts})
+ tm.assert_frame_equal(res, exp)
+ assert res['DATE'].dtype == 'object'
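+
+    # Dtype summary for the tz cases above: combine_first keeps
+    # datetime64[ns, tz] as long as both inputs share a single timezone;
+    # mixing a tz-aware column with a tz-naive one falls back to object
+    # dtype, as the final assertion shows.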
+
+ def test_combine_first_timedelta(self):
+        data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4 day'])
+ df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7])
+ data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day'])
+ df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5])
+
+ res = df1.combine_first(df2)
+ exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT',
+ '11 day', '3 day', '4 day'])
+ exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7])
+ tm.assert_frame_equal(res, exp)
+ assert res['TD'].dtype == 'timedelta64[ns]'
+
+ def test_combine_first_period(self):
+ data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03',
+ '2011-04'], freq='M')
+ df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7])
+ data2 = pd.PeriodIndex(['2012-01-01', '2012-02',
+ '2012-03'], freq='M')
+ df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5])
+
+ res = df1.combine_first(df2)
+ exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT',
+ '2012-02', '2011-03', '2011-04'],
+ freq='M')
+ exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
+ tm.assert_frame_equal(res, exp)
+ assert res['P'].dtype == data1.dtype
+
+ # different freq
+ dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02',
+ '2012-01-03'], freq='D')
+ df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5])
+
+ res = df1.combine_first(df2)
+ exp_dts = [pd.Period('2011-01', freq='M'),
+ pd.Period('2012-01-01', freq='D'),
+ pd.NaT,
+ pd.Period('2012-01-02', freq='D'),
+ pd.Period('2011-03', freq='M'),
+ pd.Period('2011-04', freq='M')]
+ exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
+ tm.assert_frame_equal(res, exp)
+ assert res['P'].dtype == 'object'
+
+ def test_combine_first_int(self):
+        # GH14687 - integer series that do not align exactly
+
+ df1 = pd.DataFrame({'a': [0, 1, 3, 5]}, dtype='int64')
+ df2 = pd.DataFrame({'a': [1, 4]}, dtype='int64')
+
+ res = df1.combine_first(df2)
+ tm.assert_frame_equal(res, df1)
+ assert res['a'].dtype == 'int64'
+
+ @pytest.mark.parametrize("val", [1, 1.0])
+ def test_combine_first_with_asymmetric_other(self, val):
+ # see gh-20699
+ df1 = pd.DataFrame({'isNum': [val]})
+ df2 = pd.DataFrame({'isBool': [True]})
+
+ res = df1.combine_first(df2)
+ exp = pd.DataFrame({'isBool': [True], 'isNum': [val]})
+
+ tm.assert_frame_equal(res, exp)
+
+ def test_concat_datetime_datetime64_frame(self):
+ # #2624
+ rows = []
+ rows.append([datetime(2010, 1, 1), 1])
+ rows.append([datetime(2010, 1, 2), 'hi'])
+
+ df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
+
+ ind = date_range(start="2000/1/1", freq="D", periods=10)
+ df1 = DataFrame({'date': ind, 'test': lrange(10)})
+
+ # it works!
+ pd.concat([df1, df2_obj])
+
+
+class TestDataFrameUpdate(TestData):
+
+ def test_update_nan(self):
+ # #15593 #15617
+ # test 1
+ df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)})
+ df2 = DataFrame({'A': [None, 2, 3]})
+ expected = df1.copy()
+ df1.update(df2, overwrite=False)
+
+ tm.assert_frame_equal(df1, expected)
+
+ # test 2
+ df1 = DataFrame({'A': [1.0, None, 3],
+ 'B': date_range('2000', periods=3)})
+ df2 = DataFrame({'A': [None, 2, 3]})
+ expected = DataFrame({'A': [1.0, 2, 3],
+ 'B': date_range('2000', periods=3)})
+ df1.update(df2, overwrite=False)
+
+ tm.assert_frame_equal(df1, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_constructors.py b/contrib/python/pandas/py2/pandas/tests/frame/test_constructors.py
new file mode 100644
index 00000000000..40a942c96ea
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_constructors.py
@@ -0,0 +1,2316 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from datetime import datetime, timedelta
+import functools
+import itertools
+
+import numpy as np
+import numpy.ma as ma
+import pytest
+
+from pandas.compat import (
+ PY2, PY3, PY36, OrderedDict, is_platform_little_endian, lmap, long, lrange,
+ lzip, range, zip)
+
+from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
+from pandas.core.dtypes.common import is_integer_dtype
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Index, MultiIndex, Series, Timedelta, Timestamp,
+ _np_version_under1p13, compat, date_range, isna)
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+
+MIXED_FLOAT_DTYPES = ['float16', 'float32', 'float64']
+MIXED_INT_DTYPES = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16',
+ 'int32', 'int64']
+
+
+class TestDataFrameConstructors(TestData):
+
+ def test_constructor(self):
+ df = DataFrame()
+ assert len(df.index) == 0
+
+ df = DataFrame(data={})
+ assert len(df.index) == 0
+
+ def test_constructor_mixed(self):
+ index, data = tm.getMixedTypeDict()
+
+ # TODO(wesm), incomplete test?
+ indexed_frame = DataFrame(data, index=index) # noqa
+ unindexed_frame = DataFrame(data) # noqa
+
+ assert self.mixed_frame['foo'].dtype == np.object_
+
+ def test_constructor_cast_failure(self):
+ foo = DataFrame({'a': ['a', 'b', 'c']}, dtype=np.float64)
+ assert foo['a'].dtype == object
+
+ # GH 3010, constructing with odd arrays
+ df = DataFrame(np.ones((4, 2)))
+
+ # this is ok
+ df['foo'] = np.ones((4, 2)).tolist()
+
+ # this is not ok
+        with pytest.raises(ValueError):
+            df[tuple(['test'])] = np.ones((4, 2))
+
+ # this is ok
+ df['foo2'] = np.ones((4, 2)).tolist()
+
+ def test_constructor_dtype_copy(self):
+ orig_df = DataFrame({
+ 'col1': [1.],
+ 'col2': [2.],
+ 'col3': [3.]})
+
+ new_df = pd.DataFrame(orig_df, dtype=float, copy=True)
+
+ new_df['col1'] = 200.
+ assert orig_df['col1'][0] == 1.
+
+ def test_constructor_dtype_nocast_view(self):
+ df = DataFrame([[1, 2]])
+ should_be_view = DataFrame(df, dtype=df[0].dtype)
+ should_be_view[0][0] = 99
+ assert df.values[0, 0] == 99
+
+ should_be_view = DataFrame(df.values, dtype=df[0].dtype)
+ should_be_view[0][0] = 97
+ assert df.values[0, 0] == 97
+
+ def test_constructor_dtype_list_data(self):
+ df = DataFrame([[1, '2'],
+ [None, 'a']], dtype=object)
+ assert df.loc[1, 0] is None
+ assert df.loc[0, 1] == '2'
+
+ def test_constructor_list_frames(self):
+ # see gh-3243
+ result = DataFrame([DataFrame([])])
+ assert result.shape == (1, 0)
+
+ result = DataFrame([DataFrame(dict(A=lrange(5)))])
+ assert isinstance(result.iloc[0, 0], DataFrame)
+
+ def test_constructor_mixed_dtypes(self):
+
+        def _make_mixed_dtypes_df(typ, ad=None):
+
+            if typ == 'int':
+                dtypes = MIXED_INT_DTYPES
+                arrays = [np.array(np.random.randint(10, size=10), dtype=d)
+                          for d in dtypes]
+            elif typ == 'float':
+                dtypes = MIXED_FLOAT_DTYPES
+                arrays = [np.array(np.random.rand(10), dtype=d)
+                          for d in dtypes]
+
+ zipper = lzip(dtypes, arrays)
+ for d, a in zipper:
+                assert a.dtype == d
+ if ad is None:
+ ad = dict()
+ ad.update({d: a for d, a in zipper})
+ return DataFrame(ad)
+
+ def _check_mixed_dtypes(df, dtypes=None):
+ if dtypes is None:
+ dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES
+ for d in dtypes:
+ if d in df:
+                    assert df.dtypes[d] == d
+
+        # mixed floating and integer coexist in the same frame
+ df = _make_mixed_dtypes_df('float')
+ _check_mixed_dtypes(df)
+
+ # add lots of types
+ df = _make_mixed_dtypes_df('float', dict(A=1, B='foo', C='bar'))
+ _check_mixed_dtypes(df)
+
+ # GH 622
+ df = _make_mixed_dtypes_df('int')
+ _check_mixed_dtypes(df)
+
+ def test_constructor_complex_dtypes(self):
+ # GH10952
+ a = np.random.rand(10).astype(np.complex64)
+ b = np.random.rand(10).astype(np.complex128)
+
+ df = DataFrame({'a': a, 'b': b})
+ assert a.dtype == df.a.dtype
+ assert b.dtype == df.b.dtype
+
+ def test_constructor_dtype_str_na_values(self, string_dtype):
+ # https://github.com/pandas-dev/pandas/issues/21083
+ df = DataFrame({'A': ['x', None]}, dtype=string_dtype)
+ result = df.isna()
+ expected = DataFrame({"A": [False, True]})
+ tm.assert_frame_equal(result, expected)
+ assert df.iloc[1, 0] is None
+
+ df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype)
+ assert np.isnan(df.iloc[1, 0])
+
+ def test_constructor_rec(self):
+ rec = self.frame.to_records(index=False)
+ if PY3:
+ # unicode error under PY2
+ rec.dtype.names = list(rec.dtype.names)[::-1]
+
+ index = self.frame.index
+
+ df = DataFrame(rec)
+ tm.assert_index_equal(df.columns, pd.Index(rec.dtype.names))
+
+ df2 = DataFrame(rec, index=index)
+ tm.assert_index_equal(df2.columns, pd.Index(rec.dtype.names))
+ tm.assert_index_equal(df2.index, index)
+
+ rng = np.arange(len(rec))[::-1]
+ df3 = DataFrame(rec, index=rng, columns=['C', 'B'])
+ expected = DataFrame(rec, index=rng).reindex(columns=['C', 'B'])
+ tm.assert_frame_equal(df3, expected)
+
+ def test_constructor_bool(self):
+ df = DataFrame({0: np.ones(10, dtype=bool),
+ 1: np.zeros(10, dtype=bool)})
+ assert df.values.dtype == np.bool_
+
+ def test_constructor_overflow_int64(self):
+ # see gh-14881
+ values = np.array([2 ** 64 - i for i in range(1, 10)],
+ dtype=np.uint64)
+
+ result = DataFrame({'a': values})
+ assert result['a'].dtype == np.uint64
+
+ # see gh-2355
+ data_scores = [(6311132704823138710, 273), (2685045978526272070, 23),
+ (8921811264899370420, 45),
+ (long(17019687244989530680), 270),
+ (long(9930107427299601010), 273)]
+ dtype = [('uid', 'u8'), ('score', 'u8')]
+ data = np.zeros((len(data_scores),), dtype=dtype)
+ data[:] = data_scores
+ df_crawls = DataFrame(data)
+ assert df_crawls['uid'].dtype == np.uint64
+
+ @pytest.mark.parametrize("values", [np.array([2**64], dtype=object),
+ np.array([2**65]), [2**64 + 1],
+ np.array([-2**63 - 4], dtype=object),
+ np.array([-2**64 - 1]), [-2**65 - 2]])
+ def test_constructor_int_overflow(self, values):
+ # see gh-18584
+ value = values[0]
+ result = DataFrame(values)
+
+ assert result[0].dtype == object
+ assert result[0][0] == value
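+
+    # Illustrative note: values outside the int64/uint64 range fit no
+    # fixed-width integer dtype, so construction falls back to Python
+    # objects, e.g. (sketch):
+    #
+    #   >>> DataFrame([2 ** 64 + 1])[0].dtype
+    #   dtype('O')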
+
+ def test_constructor_ordereddict(self):
+ import random
+ nitems = 100
+ nums = lrange(nitems)
+ random.shuffle(nums)
+ expected = ['A%d' % i for i in nums]
+ df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems)))
+ assert expected == list(df.columns)
+
+ def test_constructor_dict(self):
+ frame = DataFrame({'col1': self.ts1,
+ 'col2': self.ts2})
+
+ # col2 is padded with NaN
+ assert len(self.ts1) == 30
+ assert len(self.ts2) == 25
+
+ tm.assert_series_equal(self.ts1, frame['col1'], check_names=False)
+
+ exp = pd.Series(np.concatenate([[np.nan] * 5, self.ts2.values]),
+ index=self.ts1.index, name='col2')
+ tm.assert_series_equal(exp, frame['col2'])
+
+ frame = DataFrame({'col1': self.ts1,
+ 'col2': self.ts2},
+ columns=['col2', 'col3', 'col4'])
+
+ assert len(frame) == len(self.ts2)
+ assert 'col1' not in frame
+ assert isna(frame['col3']).all()
+
+ # Corner cases
+ assert len(DataFrame({})) == 0
+
+        # mix dict and array, wrong size - no spec for which error should
+        # be raised first
+ with pytest.raises(ValueError):
+ DataFrame({'A': {'a': 'a', 'b': 'b'}, 'B': ['a', 'b', 'c']})
+
+ # Length-one dict micro-optimization
+ frame = DataFrame({'A': {'1': 1, '2': 2}})
+ tm.assert_index_equal(frame.index, pd.Index(['1', '2']))
+
+ # empty dict plus index
+ idx = Index([0, 1, 2])
+ frame = DataFrame({}, index=idx)
+ assert frame.index is idx
+
+ # empty with index and columns
+ idx = Index([0, 1, 2])
+ frame = DataFrame({}, index=idx, columns=idx)
+ assert frame.index is idx
+ assert frame.columns is idx
+ assert len(frame._series) == 3
+
+ # with dict of empty list and Series
+ frame = DataFrame({'A': [], 'B': []}, columns=['A', 'B'])
+ tm.assert_index_equal(frame.index, Index([], dtype=np.int64))
+
+ # GH 14381
+ # Dict with None value
+ frame_none = DataFrame(dict(a=None), index=[0])
+ frame_none_list = DataFrame(dict(a=[None]), index=[0])
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ assert frame_none.get_value(0, 'a') is None
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ assert frame_none_list.get_value(0, 'a') is None
+ tm.assert_frame_equal(frame_none, frame_none_list)
+
+ # GH10856
+ # dict with scalar values should raise error, even if columns passed
+ msg = 'If using all scalar values, you must pass an index'
+ with pytest.raises(ValueError, match=msg):
+ DataFrame({'a': 0.7})
+
+ with pytest.raises(ValueError, match=msg):
+ DataFrame({'a': 0.7}, columns=['a'])
+
+ @pytest.mark.parametrize("scalar", [2, np.nan, None, 'D'])
+ def test_constructor_invalid_items_unused(self, scalar):
+ # No error if invalid (scalar) value is in fact not used:
+ result = DataFrame({'a': scalar}, columns=['b'])
+ expected = DataFrame(columns=['b'])
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
+ def test_constructor_dict_nan_key(self, value):
+ # GH 18455
+ cols = [1, value, 3]
+ idx = ['a', value]
+ values = [[0, 3], [1, 4], [2, 5]]
+ data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
+ result = DataFrame(data).sort_values(1).sort_values('a', axis=1)
+ expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
+ index=idx, columns=cols)
+ tm.assert_frame_equal(result, expected)
+
+ result = DataFrame(data, index=idx).sort_values('a', axis=1)
+ tm.assert_frame_equal(result, expected)
+
+ result = DataFrame(data, index=idx, columns=cols)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("value", [np.nan, None, float('nan')])
+ def test_constructor_dict_nan_tuple_key(self, value):
+ # GH 18455
+ cols = Index([(11, 21), (value, 22), (13, value)])
+ idx = Index([('a', value), (value, 2)])
+ values = [[0, 3], [1, 4], [2, 5]]
+ data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
+ result = (DataFrame(data)
+ .sort_values((11, 21))
+ .sort_values(('a', value), axis=1))
+ expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
+ index=idx, columns=cols)
+ tm.assert_frame_equal(result, expected)
+
+ result = DataFrame(data, index=idx).sort_values(('a', value), axis=1)
+ tm.assert_frame_equal(result, expected)
+
+ result = DataFrame(data, index=idx, columns=cols)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6')
+ def test_constructor_dict_order_insertion(self):
+ # GH19018
+ # initialization ordering: by insertion order if python>= 3.6
+ d = {'b': self.ts2, 'a': self.ts1}
+ frame = DataFrame(data=d)
+ expected = DataFrame(data=d, columns=list('ba'))
+ tm.assert_frame_equal(frame, expected)
+
+ @pytest.mark.skipif(PY36, reason='order by value for Python<3.6')
+ def test_constructor_dict_order_by_values(self):
+ # GH19018
+ # initialization ordering: by value if python<3.6
+ d = {'b': self.ts2, 'a': self.ts1}
+ frame = DataFrame(data=d)
+ expected = DataFrame(data=d, columns=list('ab'))
+ tm.assert_frame_equal(frame, expected)
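+
+    # Together, the two ordering tests above pin down GH19018: dict key
+    # order is preserved on Python >= 3.6 (insertion order), while older
+    # interpreters fall back to a sorted column order.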
+
+ def test_constructor_multi_index(self):
+ # GH 4078
+ # construction error with mi and all-nan frame
+ tuples = [(2, 3), (3, 3), (3, 3)]
+ mi = MultiIndex.from_tuples(tuples)
+ df = DataFrame(index=mi, columns=mi)
+ assert pd.isna(df).values.ravel().all()
+
+ tuples = [(3, 3), (2, 3), (3, 3)]
+ mi = MultiIndex.from_tuples(tuples)
+ df = DataFrame(index=mi, columns=mi)
+ assert pd.isna(df).values.ravel().all()
+
+ def test_constructor_error_msgs(self):
+ msg = "Empty data passed with indices specified."
+ # passing an empty array with columns specified.
+ with pytest.raises(ValueError, match=msg):
+ DataFrame(np.empty(0), columns=list('abc'))
+
+ msg = "Mixing dicts with non-Series may lead to ambiguous ordering."
+ # mix dict and array, wrong size
+ with pytest.raises(ValueError, match=msg):
+ DataFrame({'A': {'a': 'a', 'b': 'b'},
+ 'B': ['a', 'b', 'c']})
+
+ # wrong size ndarray, GH 3105
+ msg = r"Shape of passed values is \(4, 3\), indices imply \(3, 3\)"
+ with pytest.raises(ValueError, match=msg):
+ DataFrame(np.arange(12).reshape((4, 3)),
+ columns=['foo', 'bar', 'baz'],
+ index=pd.date_range('2000-01-01', periods=3))
+
+ arr = np.array([[4, 5, 6]])
+ msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)"
+ with pytest.raises(ValueError, match=msg):
+ DataFrame(index=[0], columns=range(0, 4), data=arr)
+
+ arr = np.array([4, 5, 6])
+ msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)"
+ with pytest.raises(ValueError, match=msg):
+ DataFrame(index=[0], columns=range(0, 4), data=arr)
+
+ # higher dim raise exception
+ with pytest.raises(ValueError, match='Must pass 2-d input'):
+ DataFrame(np.zeros((3, 3, 3)), columns=['A', 'B', 'C'], index=[1])
+
+ # wrong size axis labels
+ msg = ("Shape of passed values "
+ r"is \(2, 3\), indices "
+ r"imply \(1, 3\)")
+ with pytest.raises(ValueError, match=msg):
+ DataFrame(np.random.rand(2, 3), columns=['A', 'B', 'C'], index=[1])
+
+ msg = ("Shape of passed values "
+ r"is \(2, 3\), indices "
+ r"imply \(2, 2\)")
+ with pytest.raises(ValueError, match=msg):
+ DataFrame(np.random.rand(2, 3), columns=['A', 'B'], index=[1, 2])
+
+ msg = ("If using all scalar "
+ "values, you must pass "
+ "an index")
+ with pytest.raises(ValueError, match=msg):
+ DataFrame({'a': False, 'b': True})
+
+ def test_constructor_with_embedded_frames(self):
+
+ # embedded data frames
+ df1 = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
+ df2 = DataFrame([df1, df1 + 10])
+
+ df2.dtypes
+ str(df2)
+
+ result = df2.loc[0, 0]
+ tm.assert_frame_equal(result, df1)
+
+ result = df2.loc[1, 0]
+ tm.assert_frame_equal(result, df1 + 10)
+
+ def test_constructor_subclass_dict(self):
+ # Test for passing dict subclass to constructor
+ data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in range(10)),
+ 'col2': tm.TestSubDict((x, 20.0 * x) for x in range(10))}
+ df = DataFrame(data)
+ refdf = DataFrame({col: dict(compat.iteritems(val))
+ for col, val in compat.iteritems(data)})
+ tm.assert_frame_equal(refdf, df)
+
+ data = tm.TestSubDict(compat.iteritems(data))
+ df = DataFrame(data)
+ tm.assert_frame_equal(refdf, df)
+
+ # try with defaultdict
+ from collections import defaultdict
+ data = {}
+ self.frame['B'][:10] = np.nan
+ for k, v in compat.iteritems(self.frame):
+ dct = defaultdict(dict)
+ dct.update(v.to_dict())
+ data[k] = dct
+ frame = DataFrame(data)
+ tm.assert_frame_equal(self.frame.sort_index(), frame)
+
+ def test_constructor_dict_block(self):
+ expected = np.array([[4., 3., 2., 1.]])
+ df = DataFrame({'d': [4.], 'c': [3.], 'b': [2.], 'a': [1.]},
+ columns=['d', 'c', 'b', 'a'])
+ tm.assert_numpy_array_equal(df.values, expected)
+
+ def test_constructor_dict_cast(self):
+ # cast float tests
+ test_data = {
+ 'A': {'1': 1, '2': 2},
+ 'B': {'1': '1', '2': '2', '3': '3'},
+ }
+ frame = DataFrame(test_data, dtype=float)
+ assert len(frame) == 3
+ assert frame['B'].dtype == np.float64
+ assert frame['A'].dtype == np.float64
+
+ frame = DataFrame(test_data)
+ assert len(frame) == 3
+ assert frame['B'].dtype == np.object_
+ assert frame['A'].dtype == np.float64
+
+ # can't cast to float
+ test_data = {
+ 'A': dict(zip(range(20), tm.makeStringIndex(20))),
+ 'B': dict(zip(range(15), np.random.randn(15)))
+ }
+ frame = DataFrame(test_data, dtype=float)
+ assert len(frame) == 20
+ assert frame['A'].dtype == np.object_
+ assert frame['B'].dtype == np.float64
+
+ def test_constructor_dict_dont_upcast(self):
+ d = {'Col1': {'Row1': 'A String', 'Row2': np.nan}}
+ df = DataFrame(d)
+ assert isinstance(df['Col1']['Row2'], float)
+
+ dm = DataFrame([[1, 2], ['a', 'b']], index=[1, 2], columns=[1, 2])
+ assert isinstance(dm[1][1], int)
+
+ def test_constructor_dict_of_tuples(self):
+ # GH #1491
+ data = {'a': (1, 2, 3), 'b': (4, 5, 6)}
+
+ result = DataFrame(data)
+ expected = DataFrame({k: list(v) for k, v in compat.iteritems(data)})
+ tm.assert_frame_equal(result, expected, check_dtype=False)
+
+ def test_constructor_dict_multiindex(self):
+ def check(result, expected):
+ return tm.assert_frame_equal(result, expected, check_dtype=True,
+ check_index_type=True,
+ check_column_type=True,
+ check_names=True)
+ d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2},
+ ('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4},
+ ('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}}
+ _d = sorted(d.items())
+ df = DataFrame(d)
+ expected = DataFrame(
+ [x[1] for x in _d],
+ index=MultiIndex.from_tuples([x[0] for x in _d])).T
+ expected.index = MultiIndex.from_tuples(expected.index)
+ check(df, expected)
+
+ d['z'] = {'y': 123., ('i', 'i'): 111, ('i', 'j'): 111, ('j', 'i'): 111}
+ _d.insert(0, ('z', d['z']))
+ expected = DataFrame(
+ [x[1] for x in _d],
+ index=Index([x[0] for x in _d], tupleize_cols=False)).T
+ expected.index = Index(expected.index, tupleize_cols=False)
+ df = DataFrame(d)
+ df = df.reindex(columns=expected.columns, index=expected.index)
+ check(df, expected)
+
+ def test_constructor_dict_datetime64_index(self):
+ # GH 10160
+ dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15']
+
+ def create_data(constructor):
+ return {i: {constructor(s): 2 * i}
+ for i, s in enumerate(dates_as_str)}
+
+ data_datetime64 = create_data(np.datetime64)
+ data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d'))
+ data_Timestamp = create_data(Timestamp)
+
+ expected = DataFrame([{0: 0, 1: None, 2: None, 3: None},
+ {0: None, 1: 2, 2: None, 3: None},
+ {0: None, 1: None, 2: 4, 3: None},
+ {0: None, 1: None, 2: None, 3: 6}],
+ index=[Timestamp(dt) for dt in dates_as_str])
+
+ result_datetime64 = DataFrame(data_datetime64)
+ result_datetime = DataFrame(data_datetime)
+ result_Timestamp = DataFrame(data_Timestamp)
+ tm.assert_frame_equal(result_datetime64, expected)
+ tm.assert_frame_equal(result_datetime, expected)
+ tm.assert_frame_equal(result_Timestamp, expected)
+
+ def test_constructor_dict_timedelta64_index(self):
+ # GH 10160
+ td_as_int = [1, 2, 3, 4]
+
+ def create_data(constructor):
+ return {i: {constructor(s): 2 * i}
+ for i, s in enumerate(td_as_int)}
+
+ data_timedelta64 = create_data(lambda x: np.timedelta64(x, 'D'))
+ data_timedelta = create_data(lambda x: timedelta(days=x))
+ data_Timedelta = create_data(lambda x: Timedelta(x, 'D'))
+
+ expected = DataFrame([{0: 0, 1: None, 2: None, 3: None},
+ {0: None, 1: 2, 2: None, 3: None},
+ {0: None, 1: None, 2: 4, 3: None},
+ {0: None, 1: None, 2: None, 3: 6}],
+ index=[Timedelta(td, 'D') for td in td_as_int])
+
+ result_timedelta64 = DataFrame(data_timedelta64)
+ result_timedelta = DataFrame(data_timedelta)
+ result_Timedelta = DataFrame(data_Timedelta)
+ tm.assert_frame_equal(result_timedelta64, expected)
+ tm.assert_frame_equal(result_timedelta, expected)
+ tm.assert_frame_equal(result_Timedelta, expected)
+
+ def test_constructor_period(self):
+ # PeriodIndex
+ a = pd.PeriodIndex(['2012-01', 'NaT', '2012-04'], freq='M')
+ b = pd.PeriodIndex(['2012-02-01', '2012-03-01', 'NaT'], freq='D')
+ df = pd.DataFrame({'a': a, 'b': b})
+ assert df['a'].dtype == a.dtype
+ assert df['b'].dtype == b.dtype
+
+ # list of periods
+ df = pd.DataFrame({'a': a.astype(object).tolist(),
+ 'b': b.astype(object).tolist()})
+ assert df['a'].dtype == a.dtype
+ assert df['b'].dtype == b.dtype
+
+ def test_nested_dict_frame_constructor(self):
+ rng = pd.period_range('1/1/2000', periods=5)
+ df = DataFrame(np.random.randn(10, 5), columns=rng)
+
+ data = {}
+ for col in df.columns:
+ for row in df.index:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ data.setdefault(col, {})[row] = df.get_value(row, col)
+
+ result = DataFrame(data, columns=rng)
+ tm.assert_frame_equal(result, df)
+
+ data = {}
+ for col in df.columns:
+ for row in df.index:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ data.setdefault(row, {})[col] = df.get_value(row, col)
+
+ result = DataFrame(data, index=rng).T
+ tm.assert_frame_equal(result, df)
+
+ def _check_basic_constructor(self, empty):
+        # mat: 2-d matrix with shape (2, 3) as input; `empty` is a
+        # factory that makes sized objects
+ mat = empty((2, 3), dtype=float)
+ # 2-D input
+ frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
+
+ assert len(frame.index) == 2
+ assert len(frame.columns) == 3
+
+ # 1-D input
+ frame = DataFrame(empty((3,)), columns=['A'], index=[1, 2, 3])
+ assert len(frame.index) == 3
+ assert len(frame.columns) == 1
+
+ # cast type
+ frame = DataFrame(mat, columns=['A', 'B', 'C'],
+ index=[1, 2], dtype=np.int64)
+ assert frame.values.dtype == np.int64
+
+ # wrong size axis labels
+ msg = r'Shape of passed values is \(2, 3\), indices imply \(1, 3\)'
+ with pytest.raises(ValueError, match=msg):
+ DataFrame(mat, columns=['A', 'B', 'C'], index=[1])
+ msg = r'Shape of passed values is \(2, 3\), indices imply \(2, 2\)'
+ with pytest.raises(ValueError, match=msg):
+ DataFrame(mat, columns=['A', 'B'], index=[1, 2])
+
+ # higher dim raise exception
+ with pytest.raises(ValueError, match='Must pass 2-d input'):
+ DataFrame(empty((3, 3, 3)), columns=['A', 'B', 'C'],
+ index=[1])
+
+ # automatic labeling
+ frame = DataFrame(mat)
+ tm.assert_index_equal(frame.index, pd.Index(lrange(2)))
+ tm.assert_index_equal(frame.columns, pd.Index(lrange(3)))
+
+ frame = DataFrame(mat, index=[1, 2])
+ tm.assert_index_equal(frame.columns, pd.Index(lrange(3)))
+
+ frame = DataFrame(mat, columns=['A', 'B', 'C'])
+ tm.assert_index_equal(frame.index, pd.Index(lrange(2)))
+
+ # 0-length axis
+ frame = DataFrame(empty((0, 3)))
+ assert len(frame.index) == 0
+
+ frame = DataFrame(empty((3, 0)))
+ assert len(frame.columns) == 0
+
+ def test_constructor_ndarray(self):
+ self._check_basic_constructor(np.ones)
+
+ frame = DataFrame(['foo', 'bar'], index=[0, 1], columns=['A'])
+ assert len(frame) == 2
+
+ @pytest.mark.skipif(PY2 and _np_version_under1p13,
+ reason="old numpy & py2")
+ def test_constructor_maskedarray(self):
+ self._check_basic_constructor(ma.masked_all)
+
+ # Check non-masked values
+ mat = ma.masked_all((2, 3), dtype=float)
+ mat[0, 0] = 1.0
+ mat[1, 2] = 2.0
+ frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
+ assert 1.0 == frame['A'][1]
+ assert 2.0 == frame['C'][2]
+
+        # an all-masked frame is all-NaN, and NaN != NaN, so every
+        # element of (frame == frame) is False
+ mat = ma.masked_all((2, 3), dtype=float)
+ frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
+ assert np.all(~np.asarray(frame == frame))
+
+ @pytest.mark.skipif(PY2 and _np_version_under1p13,
+ reason="old numpy & py2")
+ def test_constructor_maskedarray_nonfloat(self):
+ # masked int promoted to float
+ mat = ma.masked_all((2, 3), dtype=int)
+ # 2-D input
+ frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
+
+ assert len(frame.index) == 2
+ assert len(frame.columns) == 3
+ assert np.all(~np.asarray(frame == frame))
+
+ # cast type
+ frame = DataFrame(mat, columns=['A', 'B', 'C'],
+ index=[1, 2], dtype=np.float64)
+ assert frame.values.dtype == np.float64
+
+ # Check non-masked values
+ mat2 = ma.copy(mat)
+ mat2[0, 0] = 1
+ mat2[1, 2] = 2
+ frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2])
+ assert 1 == frame['A'][1]
+ assert 2 == frame['C'][2]
+
+        # masked np.datetime64 keeps its dtype (NaT is used as the null)
+ mat = ma.masked_all((2, 3), dtype='M8[ns]')
+ # 2-D input
+ frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
+
+ assert len(frame.index) == 2
+ assert len(frame.columns) == 3
+ assert isna(frame).values.all()
+
+ # cast type
+ frame = DataFrame(mat, columns=['A', 'B', 'C'],
+ index=[1, 2], dtype=np.int64)
+ assert frame.values.dtype == np.int64
+
+ # Check non-masked values
+ mat2 = ma.copy(mat)
+ mat2[0, 0] = 1
+ mat2[1, 2] = 2
+ frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2])
+ assert 1 == frame['A'].view('i8')[1]
+ assert 2 == frame['C'].view('i8')[2]
+
+ # masked bool promoted to object
+ mat = ma.masked_all((2, 3), dtype=bool)
+ # 2-D input
+ frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
+
+ assert len(frame.index) == 2
+ assert len(frame.columns) == 3
+ assert np.all(~np.asarray(frame == frame))
+
+ # cast type
+ frame = DataFrame(mat, columns=['A', 'B', 'C'],
+ index=[1, 2], dtype=object)
+ assert frame.values.dtype == object
+
+ # Check non-masked values
+ mat2 = ma.copy(mat)
+ mat2[0, 0] = True
+ mat2[1, 2] = False
+ frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2])
+ assert frame['A'][1] is True
+ assert frame['C'][2] is False
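+
+    # Null-promotion summary for the masked cases above: masked ints are
+    # promoted to float (NaN has no integer representation), masked
+    # np.datetime64 keeps its dtype via NaT, and masked bools are
+    # promoted to object.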
+
+ @pytest.mark.skipif(PY2 and _np_version_under1p13,
+ reason="old numpy & py2")
+ def test_constructor_maskedarray_hardened(self):
+ # Check numpy masked arrays with hard masks -- from GH24574
+ mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask()
+ result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2])
+ expected = pd.DataFrame({
+ 'A': [np.nan, np.nan],
+ 'B': [np.nan, np.nan]},
+ columns=['A', 'B'],
+ index=[1, 2],
+ dtype=float)
+ tm.assert_frame_equal(result, expected)
+ # Check case where mask is hard but no data are masked
+ mat_hard = ma.ones((2, 2), dtype=float).harden_mask()
+ result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2])
+ expected = pd.DataFrame({
+ 'A': [1.0, 1.0],
+ 'B': [1.0, 1.0]},
+ columns=['A', 'B'],
+ index=[1, 2],
+ dtype=float)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.skipif(PY2 and _np_version_under1p13,
+ reason="old numpy & py2")
+ def test_constructor_maskedrecarray_dtype(self):
+ # Ensure constructor honors dtype
+ data = np.ma.array(
+ np.ma.zeros(5, dtype=[('date', '<f8'), ('price', '<f8')]),
+ mask=[False] * 5)
+ data = data.view(ma.mrecords.mrecarray)
+ result = pd.DataFrame(data, dtype=int)
+ expected = pd.DataFrame(np.zeros((5, 2), dtype=int),
+ columns=['date', 'price'])
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.skipif(PY2 and _np_version_under1p13,
+ reason="old numpy & py2")
+ def test_constructor_mrecarray(self):
+ # Ensure mrecarray produces frame identical to dict of masked arrays
+ # from GH3479
+
+ assert_fr_equal = functools.partial(tm.assert_frame_equal,
+ check_index_type=True,
+ check_column_type=True,
+ check_frame_type=True)
+ arrays = [
+ ('float', np.array([1.5, 2.0])),
+ ('int', np.array([1, 2])),
+ ('str', np.array(['abc', 'def'])),
+ ]
+ for name, arr in arrays[:]:
+ arrays.append(('masked1_' + name,
+ np.ma.masked_array(arr, mask=[False, True])))
+ arrays.append(('masked_all', np.ma.masked_all((2,))))
+ arrays.append(('masked_none',
+ np.ma.masked_array([1.0, 2.5], mask=False)))
+
+ # call assert_frame_equal for all selections of 3 arrays
+ for comb in itertools.combinations(arrays, 3):
+ names, data = zip(*comb)
+ mrecs = ma.mrecords.fromarrays(data, names=names)
+
+            # fill the masked entries so `comb` can serve as expected data
+ comb = {k: (v.filled() if hasattr(v, 'filled') else v)
+ for k, v in comb}
+
+ expected = DataFrame(comb, columns=names)
+ result = DataFrame(mrecs)
+ assert_fr_equal(result, expected)
+
+ # specify columns
+ expected = DataFrame(comb, columns=names[::-1])
+ result = DataFrame(mrecs, columns=names[::-1])
+ assert_fr_equal(result, expected)
+
+ # specify index
+ expected = DataFrame(comb, columns=names, index=[1, 2])
+ result = DataFrame(mrecs, index=[1, 2])
+ assert_fr_equal(result, expected)
+
+ def test_constructor_corner_shape(self):
+ df = DataFrame(index=[])
+ assert df.values.shape == (0, 0)
+
+ @pytest.mark.parametrize("data, index, columns, dtype, expected", [
+ (None, lrange(10), ['a', 'b'], object, np.object_),
+ (None, None, ['a', 'b'], 'int64', np.dtype('int64')),
+ (None, lrange(10), ['a', 'b'], int, np.dtype('float64')),
+ ({}, None, ['foo', 'bar'], None, np.object_),
+ ({'b': 1}, lrange(10), list('abc'), int, np.dtype('float64'))
+ ])
+ def test_constructor_dtype(self, data, index, columns, dtype, expected):
+ df = DataFrame(data, index, columns, dtype)
+ assert df.values.dtype == expected
+
+ def test_constructor_scalar_inference(self):
+ data = {'int': 1, 'bool': True,
+ 'float': 3., 'complex': 4j, 'object': 'foo'}
+ df = DataFrame(data, index=np.arange(10))
+
+ assert df['int'].dtype == np.int64
+ assert df['bool'].dtype == np.bool_
+ assert df['float'].dtype == np.float64
+ assert df['complex'].dtype == np.complex128
+ assert df['object'].dtype == np.object_
+
+ def test_constructor_arrays_and_scalars(self):
+ df = DataFrame({'a': np.random.randn(10), 'b': True})
+ exp = DataFrame({'a': df['a'].values, 'b': [True] * 10})
+
+ tm.assert_frame_equal(df, exp)
+ with pytest.raises(ValueError, match='must pass an index'):
+ DataFrame({'a': False, 'b': True})
+
+ def test_constructor_DataFrame(self):
+ df = DataFrame(self.frame)
+ tm.assert_frame_equal(df, self.frame)
+
+ df_casted = DataFrame(self.frame, dtype=np.int64)
+ assert df_casted.values.dtype == np.int64
+
+ def test_constructor_more(self):
+ # used to be in test_matrix.py
+ arr = np.random.randn(10)
+ dm = DataFrame(arr, columns=['A'], index=np.arange(10))
+ assert dm.values.ndim == 2
+
+ arr = np.random.randn(0)
+ dm = DataFrame(arr)
+ assert dm.values.ndim == 2
+
+ # no data specified
+ dm = DataFrame(columns=['A', 'B'], index=np.arange(10))
+ assert dm.values.shape == (10, 2)
+
+ dm = DataFrame(columns=['A', 'B'])
+ assert dm.values.shape == (0, 2)
+
+ dm = DataFrame(index=np.arange(10))
+ assert dm.values.shape == (10, 0)
+
+ # can't cast
+ mat = np.array(['foo', 'bar'], dtype=object).reshape(2, 1)
+ with pytest.raises(ValueError, match='cast'):
+ DataFrame(mat, index=[0, 1], columns=[0], dtype=float)
+
+ dm = DataFrame(DataFrame(self.frame._series))
+ tm.assert_frame_equal(dm, self.frame)
+
+ # int cast
+ dm = DataFrame({'A': np.ones(10, dtype=int),
+ 'B': np.ones(10, dtype=np.float64)},
+ index=np.arange(10))
+
+ assert len(dm.columns) == 2
+ assert dm.values.dtype == np.float64
+
+ def test_constructor_empty_list(self):
+ df = DataFrame([], index=[])
+ expected = DataFrame(index=[])
+ tm.assert_frame_equal(df, expected)
+
+ # GH 9939
+ df = DataFrame([], columns=['A', 'B'])
+ expected = DataFrame({}, columns=['A', 'B'])
+ tm.assert_frame_equal(df, expected)
+
+ # Empty generator: list(empty_gen()) == []
+ def empty_gen():
+ return
+ yield
+
+ df = DataFrame(empty_gen(), columns=['A', 'B'])
+ tm.assert_frame_equal(df, expected)
+
+ def test_constructor_list_of_lists(self):
+ # GH #484
+ df = DataFrame(data=[[1, 'a'], [2, 'b']], columns=["num", "str"])
+ assert is_integer_dtype(df['num'])
+ assert df['str'].dtype == np.object_
+
+ # GH 4851
+ # list of 0-dim ndarrays
+ expected = DataFrame({0: np.arange(10)})
+ data = [np.array(x) for x in range(10)]
+ result = DataFrame(data)
+ tm.assert_frame_equal(result, expected)
+
+ def test_constructor_sequence_like(self):
+ # GH 3783
+        # collections.Sequence-like
+
+ class DummyContainer(compat.Sequence):
+
+ def __init__(self, lst):
+ self._lst = lst
+
+ def __getitem__(self, n):
+ return self._lst.__getitem__(n)
+
+            def __len__(self):
+                return self._lst.__len__()
+
+ lst_containers = [DummyContainer([1, 'a']), DummyContainer([2, 'b'])]
+ columns = ["num", "str"]
+ result = DataFrame(lst_containers, columns=columns)
+ expected = DataFrame([[1, 'a'], [2, 'b']], columns=columns)
+ tm.assert_frame_equal(result, expected, check_dtype=False)
+
+ # GH 4297
+        # support array.array
+ import array
+ result = DataFrame({'A': array.array('i', range(10))})
+ expected = DataFrame({'A': list(range(10))})
+ tm.assert_frame_equal(result, expected, check_dtype=False)
+
+ expected = DataFrame([list(range(10)), list(range(10))])
+ result = DataFrame([array.array('i', range(10)),
+ array.array('i', range(10))])
+ tm.assert_frame_equal(result, expected, check_dtype=False)
+
+ def test_constructor_iterable(self):
+ # GH 21987
+ class Iter():
+ def __iter__(self):
+ for i in range(10):
+ yield [1, 2, 3]
+
+ expected = DataFrame([[1, 2, 3]] * 10)
+ result = DataFrame(Iter())
+ tm.assert_frame_equal(result, expected)
+
+ def test_constructor_iterator(self):
+
+ expected = DataFrame([list(range(10)), list(range(10))])
+ result = DataFrame([range(10), range(10)])
+ tm.assert_frame_equal(result, expected)
+
+ def test_constructor_generator(self):
+ # related #2305
+
+ gen1 = (i for i in range(10))
+ gen2 = (i for i in range(10))
+
+ expected = DataFrame([list(range(10)), list(range(10))])
+ result = DataFrame([gen1, gen2])
+ tm.assert_frame_equal(result, expected)
+
+ gen = ([i, 'a'] for i in range(10))
+ result = DataFrame(gen)
+ expected = DataFrame({0: range(10), 1: 'a'})
+ tm.assert_frame_equal(result, expected, check_dtype=False)
+
+ def test_constructor_list_of_dicts(self):
+ data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]),
+ OrderedDict([['a', 1.5], ['b', 3], ['d', 6]]),
+ OrderedDict([['a', 1.5], ['d', 6]]),
+ OrderedDict(),
+ OrderedDict([['a', 1.5], ['b', 3], ['c', 4]]),
+ OrderedDict([['b', 3], ['c', 4], ['d', 6]])]
+
+ result = DataFrame(data)
+ expected = DataFrame.from_dict(dict(zip(range(len(data)), data)),
+ orient='index')
+ tm.assert_frame_equal(result, expected.reindex(result.index))
+
+ result = DataFrame([{}])
+ expected = DataFrame(index=[0])
+ tm.assert_frame_equal(result, expected)
+
+ def test_constructor_ordered_dict_preserve_order(self):
+ # see gh-13304
+ expected = DataFrame([[2, 1]], columns=['b', 'a'])
+
+ data = OrderedDict()
+ data['b'] = [2]
+ data['a'] = [1]
+
+ result = DataFrame(data)
+ tm.assert_frame_equal(result, expected)
+
+ data = OrderedDict()
+ data['b'] = 2
+ data['a'] = 1
+
+ result = DataFrame([data])
+ tm.assert_frame_equal(result, expected)
+
+ def test_constructor_ordered_dict_conflicting_orders(self):
+ # the first dict element sets the ordering for the DataFrame,
+ # even if there are conflicting orders from subsequent ones
+ row_one = OrderedDict()
+ row_one['b'] = 2
+ row_one['a'] = 1
+
+ row_two = OrderedDict()
+ row_two['a'] = 1
+ row_two['b'] = 2
+
+ row_three = {'b': 2, 'a': 1}
+
+ expected = DataFrame([[2, 1], [2, 1]], columns=['b', 'a'])
+ result = DataFrame([row_one, row_two])
+ tm.assert_frame_equal(result, expected)
+
+ expected = DataFrame([[2, 1], [2, 1], [2, 1]], columns=['b', 'a'])
+ result = DataFrame([row_one, row_two, row_three])
+ tm.assert_frame_equal(result, expected)
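+        # (row_three is a plain dict; its key order is arbitrary on
+        # Python 2, but values are aligned by key, so only the first
+        # row's order matters)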
+
+ def test_constructor_list_of_series(self):
+ data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]),
+ OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])]
+ sdict = OrderedDict(zip(['x', 'y'], data))
+ idx = Index(['a', 'b', 'c'])
+
+ # all named
+ data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'),
+ Series([1.5, 3, 6], idx, name='y')]
+ result = DataFrame(data2)
+ expected = DataFrame.from_dict(sdict, orient='index')
+ tm.assert_frame_equal(result, expected)
+
+ # some unnamed
+ data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'),
+ Series([1.5, 3, 6], idx)]
+ result = DataFrame(data2)
+
+ sdict = OrderedDict(zip(['x', 'Unnamed 0'], data))
+ expected = DataFrame.from_dict(sdict, orient='index')
+ tm.assert_frame_equal(result.sort_index(), expected)
+
+ # none named
+ data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]),
+ OrderedDict([['a', 1.5], ['b', 3], ['d', 6]]),
+ OrderedDict([['a', 1.5], ['d', 6]]),
+ OrderedDict(),
+ OrderedDict([['a', 1.5], ['b', 3], ['c', 4]]),
+ OrderedDict([['b', 3], ['c', 4], ['d', 6]])]
+ data = [Series(d) for d in data]
+
+ result = DataFrame(data)
+ sdict = OrderedDict(zip(range(len(data)), data))
+ expected = DataFrame.from_dict(sdict, orient='index')
+ tm.assert_frame_equal(result, expected.reindex(result.index))
+
+ result2 = DataFrame(data, index=np.arange(6))
+ tm.assert_frame_equal(result, result2)
+
+ result = DataFrame([Series({})])
+ expected = DataFrame(index=[0])
+ tm.assert_frame_equal(result, expected)
+
+ data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]),
+ OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])]
+ sdict = OrderedDict(zip(range(len(data)), data))
+
+ idx = Index(['a', 'b', 'c'])
+ data2 = [Series([1.5, 3, 4], idx, dtype='O'),
+ Series([1.5, 3, 6], idx)]
+ result = DataFrame(data2)
+ expected = DataFrame.from_dict(sdict, orient='index')
+ tm.assert_frame_equal(result, expected)
+
+ def test_constructor_list_of_series_aligned_index(self):
+ series = [pd.Series(i, index=['b', 'a', 'c'], name=str(i))
+ for i in range(3)]
+ result = pd.DataFrame(series)
+ expected = pd.DataFrame({'b': [0, 1, 2],
+ 'a': [0, 1, 2],
+ 'c': [0, 1, 2]},
+ columns=['b', 'a', 'c'],
+ index=['0', '1', '2'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_constructor_list_of_derived_dicts(self):
+ class CustomDict(dict):
+ pass
+ d = {'a': 1.5, 'b': 3}
+
+ data_custom = [CustomDict(d)]
+ data = [d]
+
+ result_custom = DataFrame(data_custom)
+ result = DataFrame(data)
+ tm.assert_frame_equal(result, result_custom)
+
+ def test_constructor_ragged(self):
+ data = {'A': np.random.randn(10),
+ 'B': np.random.randn(8)}
+ with pytest.raises(ValueError, match='arrays must all be same length'):
+ DataFrame(data)
+
+ def test_constructor_scalar(self):
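+        # a scalar value is broadcast to every row of the given index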
+ idx = Index(lrange(3))
+ df = DataFrame({"a": 0}, index=idx)
+ expected = DataFrame({"a": [0, 0, 0]}, index=idx)
+ tm.assert_frame_equal(df, expected, check_dtype=False)
+
+ def test_constructor_Series_copy_bug(self):
+ df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A'])
+ df.copy()
+
+ def test_constructor_mixed_dict_and_Series(self):
+ data = {}
+ data['A'] = {'foo': 1, 'bar': 2, 'baz': 3}
+ data['B'] = Series([4, 3, 2, 1], index=['bar', 'qux', 'baz', 'foo'])
+
+ result = DataFrame(data)
+ assert result.index.is_monotonic
+
+ # ordering ambiguous, raise exception
+ with pytest.raises(ValueError, match='ambiguous ordering'):
+ DataFrame({'A': ['a', 'b'], 'B': {'a': 'a', 'b': 'b'}})
+
+ # this is OK though
+ result = DataFrame({'A': ['a', 'b'],
+ 'B': Series(['a', 'b'], index=['a', 'b'])})
+ expected = DataFrame({'A': ['a', 'b'], 'B': ['a', 'b']},
+ index=['a', 'b'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_constructor_mixed_type_rows(self):
+ # Issue 25075
+ data = [[1, 2], (3, 4)]
+ result = DataFrame(data)
+ expected = DataFrame([[1, 2], [3, 4]])
+ tm.assert_frame_equal(result, expected)
+
+ def test_constructor_tuples(self):
+ result = DataFrame({'A': [(1, 2), (3, 4)]})
+ expected = DataFrame({'A': Series([(1, 2), (3, 4)])})
+ tm.assert_frame_equal(result, expected)
+
+ def test_constructor_namedtuples(self):
+ # GH11181
+ from collections import namedtuple
+ named_tuple = namedtuple("Pandas", list('ab'))
+ tuples = [named_tuple(1, 3), named_tuple(2, 4)]
+ expected = DataFrame({'a': [1, 2], 'b': [3, 4]})
+ result = DataFrame(tuples)
+ tm.assert_frame_equal(result, expected)
+
+ # with columns
+ expected = DataFrame({'y': [1, 2], 'z': [3, 4]})
+ result = DataFrame(tuples, columns=['y', 'z'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_constructor_orient(self):
+ data_dict = self.mixed_frame.T._series
+ recons = DataFrame.from_dict(data_dict, orient='index')
+ expected = self.mixed_frame.sort_index()
+ tm.assert_frame_equal(recons, expected)
+
+ # dict of sequence
+ a = {'hi': [32, 3, 3],
+ 'there': [3, 5, 3]}
+ rs = DataFrame.from_dict(a, orient='index')
+ xp = DataFrame.from_dict(a).T.reindex(list(a.keys()))
+ tm.assert_frame_equal(rs, xp)
+
+ def test_from_dict_columns_parameter(self):
+ # GH 18529
+ # Test new columns parameter for from_dict that was added to make
+ # from_items(..., orient='index', columns=[...]) easier to replicate
+ result = DataFrame.from_dict(OrderedDict([('A', [1, 2]),
+ ('B', [4, 5])]),
+ orient='index', columns=['one', 'two'])
+ expected = DataFrame([[1, 2], [4, 5]], index=['A', 'B'],
+ columns=['one', 'two'])
+ tm.assert_frame_equal(result, expected)
+
+ msg = "cannot use columns parameter with orient='columns'"
+ with pytest.raises(ValueError, match=msg):
+ DataFrame.from_dict(dict([('A', [1, 2]), ('B', [4, 5])]),
+ orient='columns', columns=['one', 'two'])
+ with pytest.raises(ValueError, match=msg):
+ DataFrame.from_dict(dict([('A', [1, 2]), ('B', [4, 5])]),
+ columns=['one', 'two'])
+
+ def test_constructor_Series_named(self):
+ a = Series([1, 2, 3], index=['a', 'b', 'c'], name='x')
+ df = DataFrame(a)
+ assert df.columns[0] == 'x'
+ tm.assert_index_equal(df.index, a.index)
+
+ # ndarray like
+ arr = np.random.randn(10)
+ s = Series(arr, name='x')
+ df = DataFrame(s)
+ expected = DataFrame(dict(x=s))
+ tm.assert_frame_equal(df, expected)
+
+ s = Series(arr, index=range(3, 13))
+ df = DataFrame(s)
+ expected = DataFrame({0: s})
+ tm.assert_frame_equal(df, expected)
+
+ pytest.raises(ValueError, DataFrame, s, columns=[1, 2])
+
+ # #2234
+ a = Series([], name='x')
+ df = DataFrame(a)
+ assert df.columns[0] == 'x'
+
+ # series with name and w/o
+ s1 = Series(arr, name='x')
+ df = DataFrame([s1, arr]).T
+ expected = DataFrame({'x': s1, 'Unnamed 0': arr},
+ columns=['x', 'Unnamed 0'])
+ tm.assert_frame_equal(df, expected)
+
+        # a bit non-intuitive here: the Series collapses down to an
+        # array, so its name is lost
+ df = DataFrame([arr, s1]).T
+ expected = DataFrame({1: s1, 0: arr}, columns=[0, 1])
+ tm.assert_frame_equal(df, expected)
+
+ def test_constructor_Series_named_and_columns(self):
+ # GH 9232 validation
+
+ s0 = Series(range(5), name=0)
+ s1 = Series(range(5), name=1)
+
+ # matching name and column gives standard frame
+ tm.assert_frame_equal(pd.DataFrame(s0, columns=[0]),
+ s0.to_frame())
+ tm.assert_frame_equal(pd.DataFrame(s1, columns=[1]),
+ s1.to_frame())
+
+ # non-matching produces empty frame
+ assert pd.DataFrame(s0, columns=[1]).empty
+ assert pd.DataFrame(s1, columns=[0]).empty
+
+ def test_constructor_Series_differently_indexed(self):
+ # name
+ s1 = Series([1, 2, 3], index=['a', 'b', 'c'], name='x')
+
+ # no name
+ s2 = Series([1, 2, 3], index=['a', 'b', 'c'])
+
+ other_index = Index(['a', 'b'])
+
+ df1 = DataFrame(s1, index=other_index)
+ exp1 = DataFrame(s1.reindex(other_index))
+ assert df1.columns[0] == 'x'
+ tm.assert_frame_equal(df1, exp1)
+
+ df2 = DataFrame(s2, index=other_index)
+ exp2 = DataFrame(s2.reindex(other_index))
+ assert df2.columns[0] == 0
+ tm.assert_index_equal(df2.index, other_index)
+ tm.assert_frame_equal(df2, exp2)
+
+ def test_constructor_manager_resize(self):
+ index = list(self.frame.index[:5])
+ columns = list(self.frame.columns[:3])
+
+ result = DataFrame(self.frame._data, index=index,
+ columns=columns)
+ tm.assert_index_equal(result.index, Index(index))
+ tm.assert_index_equal(result.columns, Index(columns))
+
+ def test_constructor_from_items(self):
+ items = [(c, self.frame[c]) for c in self.frame.columns]
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ recons = DataFrame.from_items(items)
+ tm.assert_frame_equal(recons, self.frame)
+
+ # pass some columns
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ recons = DataFrame.from_items(items, columns=['C', 'B', 'A'])
+ tm.assert_frame_equal(recons, self.frame.loc[:, ['C', 'B', 'A']])
+
+ # orient='index'
+
+ row_items = [(idx, self.mixed_frame.xs(idx))
+ for idx in self.mixed_frame.index]
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ recons = DataFrame.from_items(row_items,
+ columns=self.mixed_frame.columns,
+ orient='index')
+ tm.assert_frame_equal(recons, self.mixed_frame)
+ assert recons['A'].dtype == np.float64
+
+ msg = "Must pass columns with orient='index'"
+ with pytest.raises(TypeError, match=msg):
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ DataFrame.from_items(row_items, orient='index')
+
+        # orient='index', but with tuple values
+ arr = construct_1d_object_array_from_listlike(
+ [('bar', 'baz')] * len(self.mixed_frame))
+ self.mixed_frame['foo'] = arr
+ row_items = [(idx, list(self.mixed_frame.xs(idx)))
+ for idx in self.mixed_frame.index]
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ recons = DataFrame.from_items(row_items,
+ columns=self.mixed_frame.columns,
+ orient='index')
+ tm.assert_frame_equal(recons, self.mixed_frame)
+ assert isinstance(recons['foo'][0], tuple)
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ rs = DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])],
+ orient='index',
+ columns=['one', 'two', 'three'])
+ xp = DataFrame([[1, 2, 3], [4, 5, 6]], index=['A', 'B'],
+ columns=['one', 'two', 'three'])
+ tm.assert_frame_equal(rs, xp)
+
+ def test_constructor_from_items_scalars(self):
+ # GH 17312
+ msg = (r'The value in each \(key, value\) '
+ 'pair must be an array, Series, or dict')
+ with pytest.raises(ValueError, match=msg):
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ DataFrame.from_items([('A', 1), ('B', 4)])
+
+ msg = (r'The value in each \(key, value\) '
+ 'pair must be an array, Series, or dict')
+ with pytest.raises(ValueError, match=msg):
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ DataFrame.from_items([('A', 1), ('B', 2)], columns=['col1'],
+ orient='index')
+
+ def test_from_items_deprecation(self):
+ # GH 17320
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])])
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])],
+ columns=['col1', 'col2', 'col3'],
+ orient='index')
+
+ def test_constructor_mix_series_nonseries(self):
+ df = DataFrame({'A': self.frame['A'],
+ 'B': list(self.frame['B'])}, columns=['A', 'B'])
+ tm.assert_frame_equal(df, self.frame.loc[:, ['A', 'B']])
+
+ msg = 'does not match index length'
+ with pytest.raises(ValueError, match=msg):
+ DataFrame({'A': self.frame['A'], 'B': list(self.frame['B'])[:-2]})
+
+ def test_constructor_miscast_na_int_dtype(self):
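+        # int64 cannot represent NaN, so the requested dtype is
+        # discarded and the data is upcast to float64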
+ df = DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64)
+ expected = DataFrame([[np.nan, 1], [1, 0]])
+ tm.assert_frame_equal(df, expected)
+
+ def test_constructor_column_duplicates(self):
+ # it works! #2079
+ df = DataFrame([[8, 5]], columns=['a', 'a'])
+ edf = DataFrame([[8, 5]])
+ edf.columns = ['a', 'a']
+
+ tm.assert_frame_equal(df, edf)
+
+ idf = DataFrame.from_records([(8, 5)],
+ columns=['a', 'a'])
+
+ tm.assert_frame_equal(idf, edf)
+
+ pytest.raises(ValueError, DataFrame.from_dict,
+ OrderedDict([('b', 8), ('a', 5), ('a', 6)]))
+
+ def test_constructor_empty_with_string_dtype(self):
+ # GH 9428
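+        # any string-like dtype request on an empty frame ends up as
+        # object dtype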
+ expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object)
+
+ df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str)
+ tm.assert_frame_equal(df, expected)
+ df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_)
+ tm.assert_frame_equal(df, expected)
+ df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.unicode_)
+ tm.assert_frame_equal(df, expected)
+ df = DataFrame(index=[0, 1], columns=[0, 1], dtype='U5')
+ tm.assert_frame_equal(df, expected)
+
+ def test_constructor_single_value(self):
+ # expecting single value upcasting here
+ df = DataFrame(0., index=[1, 2, 3], columns=['a', 'b', 'c'])
+ tm.assert_frame_equal(df,
+ DataFrame(np.zeros(df.shape).astype('float64'),
+ df.index, df.columns))
+
+ df = DataFrame(0, index=[1, 2, 3], columns=['a', 'b', 'c'])
+ tm.assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('int64'),
+ df.index, df.columns))
+
+ df = DataFrame('a', index=[1, 2], columns=['a', 'c'])
+ tm.assert_frame_equal(df, DataFrame(np.array([['a', 'a'], ['a', 'a']],
+ dtype=object),
+ index=[1, 2], columns=['a', 'c']))
+
+ pytest.raises(ValueError, DataFrame, 'a', [1, 2])
+ pytest.raises(ValueError, DataFrame, 'a', columns=['a', 'c'])
+
+ msg = 'incompatible data and dtype'
+ with pytest.raises(TypeError, match=msg):
+ DataFrame('a', [1, 2], ['a', 'c'], float)
+
+ def test_constructor_with_datetimes(self):
+ intname = np.dtype(np.int_).name
+ floatname = np.dtype(np.float_).name
+ datetime64name = np.dtype('M8[ns]').name
+ objectname = np.dtype(np.object_).name
+
+ # single item
+ df = DataFrame({'A': 1, 'B': 'foo', 'C': 'bar',
+ 'D': Timestamp("20010101"),
+ 'E': datetime(2001, 1, 2, 0, 0)},
+ index=np.arange(10))
+ result = df.get_dtype_counts()
+ expected = Series({'int64': 1, datetime64name: 2, objectname: 2})
+        result = result.sort_index()
+        expected = expected.sort_index()
+ tm.assert_series_equal(result, expected)
+
+        # check with ndarray construction ndim==0 (i.e. we are passing a
+        # 0-dim ndarray with a dtype specified)
+ df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
+ floatname: np.array(1., dtype=floatname),
+ intname: np.array(1, dtype=intname)},
+ index=np.arange(10))
+ result = df.get_dtype_counts()
+ expected = {objectname: 1}
+ if intname == 'int64':
+ expected['int64'] = 2
+ else:
+ expected['int64'] = 1
+ expected[intname] = 1
+ if floatname == 'float64':
+ expected['float64'] = 2
+ else:
+ expected['float64'] = 1
+ expected[floatname] = 1
+
+ result = result.sort_index()
+ expected = Series(expected).sort_index()
+ tm.assert_series_equal(result, expected)
+
+ # check with ndarray construction ndim>0
+ df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
+ floatname: np.array([1.] * 10, dtype=floatname),
+ intname: np.array([1] * 10, dtype=intname)},
+ index=np.arange(10))
+ result = df.get_dtype_counts()
+ result = result.sort_index()
+ tm.assert_series_equal(result, expected)
+
+ # GH 2809
+ ind = date_range(start="2000-01-01", freq="D", periods=10)
+ datetimes = [ts.to_pydatetime() for ts in ind]
+ datetime_s = Series(datetimes)
+ assert datetime_s.dtype == 'M8[ns]'
+ df = DataFrame({'datetime_s': datetime_s})
+ result = df.get_dtype_counts()
+ expected = Series({datetime64name: 1})
+ result = result.sort_index()
+ expected = expected.sort_index()
+ tm.assert_series_equal(result, expected)
+
+ # GH 2810
+ ind = date_range(start="2000-01-01", freq="D", periods=10)
+ datetimes = [ts.to_pydatetime() for ts in ind]
+ dates = [ts.date() for ts in ind]
+ df = DataFrame({'datetimes': datetimes, 'dates': dates})
+ result = df.get_dtype_counts()
+ expected = Series({datetime64name: 1, objectname: 1})
+ result = result.sort_index()
+ expected = expected.sort_index()
+ tm.assert_series_equal(result, expected)
+
+ # GH 7594
+ # don't coerce tz-aware
+ import pytz
+ tz = pytz.timezone('US/Eastern')
+ dt = tz.localize(datetime(2012, 1, 1))
+
+ df = DataFrame({'End Date': dt}, index=[0])
+ assert df.iat[0, 0] == dt
+ tm.assert_series_equal(df.dtypes, Series(
+ {'End Date': 'datetime64[ns, US/Eastern]'}))
+
+ df = DataFrame([{'End Date': dt}])
+ assert df.iat[0, 0] == dt
+ tm.assert_series_equal(df.dtypes, Series(
+ {'End Date': 'datetime64[ns, US/Eastern]'}))
+
+ # tz-aware (UTC and other tz's)
+ # GH 8411
+ dr = date_range('20130101', periods=3)
+ df = DataFrame({'value': dr})
+ assert df.iat[0, 0].tz is None
+ dr = date_range('20130101', periods=3, tz='UTC')
+ df = DataFrame({'value': dr})
+ assert str(df.iat[0, 0].tz) == 'UTC'
+ dr = date_range('20130101', periods=3, tz='US/Eastern')
+ df = DataFrame({'value': dr})
+ assert str(df.iat[0, 0].tz) == 'US/Eastern'
+
+ # GH 7822
+        # preserve an index with a tz on dict construction
+ i = date_range('1/1/2011', periods=5, freq='10s', tz='US/Eastern')
+
+ expected = DataFrame(
+ {'a': i.to_series(keep_tz=True).reset_index(drop=True)})
+ df = DataFrame()
+ df['a'] = i
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame({'a': i})
+ tm.assert_frame_equal(df, expected)
+
+ # multiples
+ i_no_tz = date_range('1/1/2011', periods=5, freq='10s')
+ df = DataFrame({'a': i, 'b': i_no_tz})
+ expected = DataFrame({'a': i.to_series(keep_tz=True)
+ .reset_index(drop=True), 'b': i_no_tz})
+ tm.assert_frame_equal(df, expected)
+
+ def test_constructor_datetimes_with_nulls(self):
+ # gh-15869
+ for arr in [np.array([None, None, None, None,
+ datetime.now(), None]),
+ np.array([None, None, datetime.now(), None])]:
+ result = DataFrame(arr).get_dtype_counts()
+ expected = Series({'datetime64[ns]': 1})
+ tm.assert_series_equal(result, expected)
+
+ def test_constructor_for_list_with_dtypes(self):
+ # TODO(wesm): unused
+ intname = np.dtype(np.int_).name # noqa
+ floatname = np.dtype(np.float_).name # noqa
+ datetime64name = np.dtype('M8[ns]').name
+ objectname = np.dtype(np.object_).name
+
+ # test list of lists/ndarrays
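+        # (note: the next two blocks build `expected` but never assert
+        # against it; effectively they are construction smoke checks)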
+ df = DataFrame([np.arange(5) for x in range(5)])
+ result = df.get_dtype_counts()
+ expected = Series({'int64': 5})
+
+ df = DataFrame([np.array(np.arange(5), dtype='int32')
+ for x in range(5)])
+ result = df.get_dtype_counts()
+ expected = Series({'int32': 5})
+
+        # overflow issue? (we always expect int64 upcasting here)
+ df = DataFrame({'a': [2 ** 31, 2 ** 31 + 1]})
+ result = df.get_dtype_counts()
+ expected = Series({'int64': 1})
+ tm.assert_series_equal(result, expected)
+
+ # GH #2751 (construction with no index specified), make sure we cast to
+ # platform values
+ df = DataFrame([1, 2])
+ result = df.get_dtype_counts()
+ expected = Series({'int64': 1})
+ tm.assert_series_equal(result, expected)
+
+ df = DataFrame([1., 2.])
+ result = df.get_dtype_counts()
+ expected = Series({'float64': 1})
+ tm.assert_series_equal(result, expected)
+
+ df = DataFrame({'a': [1, 2]})
+ result = df.get_dtype_counts()
+ expected = Series({'int64': 1})
+ tm.assert_series_equal(result, expected)
+
+ df = DataFrame({'a': [1., 2.]})
+ result = df.get_dtype_counts()
+ expected = Series({'float64': 1})
+ tm.assert_series_equal(result, expected)
+
+ df = DataFrame({'a': 1}, index=lrange(3))
+ result = df.get_dtype_counts()
+ expected = Series({'int64': 1})
+ tm.assert_series_equal(result, expected)
+
+ df = DataFrame({'a': 1.}, index=lrange(3))
+ result = df.get_dtype_counts()
+ expected = Series({'float64': 1})
+ tm.assert_series_equal(result, expected)
+
+ # with object list
+ df = DataFrame({'a': [1, 2, 4, 7], 'b': [1.2, 2.3, 5.1, 6.3],
+ 'c': list('abcd'),
+ 'd': [datetime(2000, 1, 1) for i in range(4)],
+ 'e': [1., 2, 4., 7]})
+ result = df.get_dtype_counts()
+ expected = Series(
+ {'int64': 1, 'float64': 2, datetime64name: 1, objectname: 1})
+ result = result.sort_index()
+ expected = expected.sort_index()
+ tm.assert_series_equal(result, expected)
+
+ def test_constructor_frame_copy(self):
+ cop = DataFrame(self.frame, copy=True)
+ cop['A'] = 5
+ assert (cop['A'] == 5).all()
+ assert not (self.frame['A'] == 5).all()
+
+ def test_constructor_ndarray_copy(self):
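+        # without copy=True the frame shares memory with the source
+        # ndarray, so mutating the ndarray is visible through the frame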
+ df = DataFrame(self.frame.values)
+
+ self.frame.values[5] = 5
+ assert (df.values[5] == 5).all()
+
+ df = DataFrame(self.frame.values, copy=True)
+ self.frame.values[6] = 6
+ assert not (df.values[6] == 6).all()
+
+ def test_constructor_series_copy(self):
+ series = self.frame._series
+
+ df = DataFrame({'A': series['A']})
+ df['A'][:] = 5
+
+ assert not (series['A'] == 5).all()
+
+ def test_constructor_with_nas(self):
+ # GH 5016
+ # na's in indices
+
+ def check(df):
+ for i in range(len(df.columns)):
+ df.iloc[:, i]
+
+ indexer = np.arange(len(df.columns))[isna(df.columns)]
+
+ # No NaN found -> error
+ if len(indexer) == 0:
+ def f():
+ df.loc[:, np.nan]
+ pytest.raises(TypeError, f)
+ # single nan should result in Series
+ elif len(indexer) == 1:
+ tm.assert_series_equal(df.iloc[:, indexer[0]],
+ df.loc[:, np.nan])
+ # multiple nans should result in DataFrame
+ else:
+ tm.assert_frame_equal(df.iloc[:, indexer],
+ df.loc[:, np.nan])
+
+ df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan])
+ check(df)
+
+ df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1.1, 2.2, np.nan])
+ check(df)
+
+ df = DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]],
+ columns=[np.nan, 1.1, 2.2, np.nan])
+ check(df)
+
+ df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]],
+ columns=[np.nan, 1.1, 2.2, np.nan])
+ check(df)
+
+ # GH 21428 (non-unique columns)
+ df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]],
+ columns=[np.nan, 1, 2, 2])
+ check(df)
+
+ def test_constructor_lists_to_object_dtype(self):
+ # from #1074
+ d = DataFrame({'a': [np.nan, False]})
+ assert d['a'].dtype == np.object_
+ assert not d['a'][1]
+
+ def test_constructor_categorical(self):
+
+ # GH8626
+
+ # dict creation
+ df = DataFrame({'A': list('abc')}, dtype='category')
+ expected = Series(list('abc'), dtype='category', name='A')
+ tm.assert_series_equal(df['A'], expected)
+
+ # to_frame
+ s = Series(list('abc'), dtype='category')
+ result = s.to_frame()
+ expected = Series(list('abc'), dtype='category', name=0)
+ tm.assert_series_equal(result[0], expected)
+ result = s.to_frame(name='foo')
+ expected = Series(list('abc'), dtype='category', name='foo')
+ tm.assert_series_equal(result['foo'], expected)
+
+ # list-like creation
+ df = DataFrame(list('abc'), dtype='category')
+ expected = Series(list('abc'), dtype='category', name=0)
+ tm.assert_series_equal(df[0], expected)
+
+ # ndim != 1
+ df = DataFrame([Categorical(list('abc'))])
+ expected = DataFrame({0: Series(list('abc'), dtype='category')})
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame([Categorical(list('abc')), Categorical(list('abd'))])
+ expected = DataFrame({0: Series(list('abc'), dtype='category'),
+ 1: Series(list('abd'), dtype='category')},
+ columns=[0, 1])
+ tm.assert_frame_equal(df, expected)
+
+ # mixed
+ df = DataFrame([Categorical(list('abc')), list('def')])
+ expected = DataFrame({0: Series(list('abc'), dtype='category'),
+ 1: list('def')}, columns=[0, 1])
+ tm.assert_frame_equal(df, expected)
+
+ # invalid (shape)
+ pytest.raises(ValueError,
+ lambda: DataFrame([Categorical(list('abc')),
+ Categorical(list('abdefg'))]))
+
+ # ndim > 1
+ pytest.raises(NotImplementedError,
+ lambda: Categorical(np.array([list('abcd')])))
+
+ def test_constructor_categorical_series(self):
+
+ items = [1, 2, 3, 1]
+ exp = Series(items).astype('category')
+ res = Series(items, dtype='category')
+ tm.assert_series_equal(res, exp)
+
+ items = ["a", "b", "c", "a"]
+ exp = Series(items).astype('category')
+ res = Series(items, dtype='category')
+ tm.assert_series_equal(res, exp)
+
+ # insert into frame with different index
+ # GH 8076
+ index = date_range('20000101', periods=3)
+ expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
+ categories=['a', 'b', 'c']))
+ expected.index = index
+
+ expected = DataFrame({'x': expected})
+ df = DataFrame(
+ {'x': Series(['a', 'b', 'c'], dtype='category')}, index=index)
+ tm.assert_frame_equal(df, expected)
+
+ def test_from_records_to_records(self):
+ # from numpy documentation
+ arr = np.zeros((2,), dtype=('i4,f4,a10'))
+ arr[:] = [(1, 2., 'Hello'), (2, 3., "World")]
+
+ # TODO(wesm): unused
+ frame = DataFrame.from_records(arr) # noqa
+
+ index = pd.Index(np.arange(len(arr))[::-1])
+ indexed_frame = DataFrame.from_records(arr, index=index)
+ tm.assert_index_equal(indexed_frame.index, index)
+
+        # without names, it should fall back to the default constructor path
+ arr2 = np.zeros((2, 3))
+ tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2))
+
+ # wrong length
+ msg = r'Shape of passed values is \(2, 3\), indices imply \(1, 3\)'
+ with pytest.raises(ValueError, match=msg):
+ DataFrame.from_records(arr, index=index[:-1])
+
+ indexed_frame = DataFrame.from_records(arr, index='f1')
+
+ # what to do?
+ records = indexed_frame.to_records()
+ assert len(records.dtype.names) == 3
+
+ records = indexed_frame.to_records(index=False)
+ assert len(records.dtype.names) == 2
+ assert 'index' not in records.dtype.names
+
+ def test_from_records_nones(self):
+ tuples = [(1, 2, None, 3),
+ (1, 2, None, 3),
+ (None, 2, 5, 3)]
+
+ df = DataFrame.from_records(tuples, columns=['a', 'b', 'c', 'd'])
+ assert np.isnan(df['c'][0])
+
+ def test_from_records_iterator(self):
+ arr = np.array([(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5., 5., 6, 6),
+ (7., 7., 8, 8)],
+ dtype=[('x', np.float64), ('u', np.float32),
+ ('y', np.int64), ('z', np.int32)])
+ df = DataFrame.from_records(iter(arr), nrows=2)
+ xp = DataFrame({'x': np.array([1.0, 3.0], dtype=np.float64),
+ 'u': np.array([1.0, 3.0], dtype=np.float32),
+ 'y': np.array([2, 4], dtype=np.int64),
+ 'z': np.array([2, 4], dtype=np.int32)})
+ tm.assert_frame_equal(df.reindex_like(xp), xp)
+
+ # no dtypes specified here, so just compare with the default
+ arr = [(1.0, 2), (3.0, 4), (5., 6), (7., 8)]
+ df = DataFrame.from_records(iter(arr), columns=['x', 'y'],
+ nrows=2)
+ tm.assert_frame_equal(df, xp.reindex(columns=['x', 'y']),
+ check_dtype=False)
+
+ def test_from_records_tuples_generator(self):
+ def tuple_generator(length):
+ for i in range(length):
+ letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ yield (i, letters[i % len(letters)], i / length)
+
+ columns_names = ['Integer', 'String', 'Float']
+ columns = [[i[j] for i in tuple_generator(
+ 10)] for j in range(len(columns_names))]
+ data = {'Integer': columns[0],
+ 'String': columns[1], 'Float': columns[2]}
+ expected = DataFrame(data, columns=columns_names)
+
+ generator = tuple_generator(10)
+ result = DataFrame.from_records(generator, columns=columns_names)
+ tm.assert_frame_equal(result, expected)
+
+ def test_from_records_lists_generator(self):
+ def list_generator(length):
+ for i in range(length):
+ letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ yield [i, letters[i % len(letters)], i / length]
+
+ columns_names = ['Integer', 'String', 'Float']
+ columns = [[i[j] for i in list_generator(
+ 10)] for j in range(len(columns_names))]
+ data = {'Integer': columns[0],
+ 'String': columns[1], 'Float': columns[2]}
+ expected = DataFrame(data, columns=columns_names)
+
+ generator = list_generator(10)
+ result = DataFrame.from_records(generator, columns=columns_names)
+ tm.assert_frame_equal(result, expected)
+
+ def test_from_records_columns_not_modified(self):
+ tuples = [(1, 2, 3),
+ (1, 2, 3),
+ (2, 5, 3)]
+
+ columns = ['a', 'b', 'c']
+ original_columns = list(columns)
+
+ df = DataFrame.from_records(tuples, columns=columns, index='a') # noqa
+
+ assert columns == original_columns
+
+ def test_from_records_decimal(self):
+ from decimal import Decimal
+
+ tuples = [(Decimal('1.5'),), (Decimal('2.5'),), (None,)]
+
+ df = DataFrame.from_records(tuples, columns=['a'])
+ assert df['a'].dtype == object
+
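+        # coerce_float=True converts the Decimal values to float64 and
+        # maps None to NaN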
+ df = DataFrame.from_records(tuples, columns=['a'], coerce_float=True)
+ assert df['a'].dtype == np.float64
+ assert np.isnan(df['a'].values[-1])
+
+ def test_from_records_duplicates(self):
+ result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)],
+ columns=['a', 'b', 'a'])
+
+ expected = DataFrame([(1, 2, 3), (4, 5, 6)],
+ columns=['a', 'b', 'a'])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_from_records_set_index_name(self):
+ def create_dict(order_id):
+ return {'order_id': order_id, 'quantity': np.random.randint(1, 10),
+ 'price': np.random.randint(1, 10)}
+ documents = [create_dict(i) for i in range(10)]
+ # demo missing data
+ documents.append({'order_id': 10, 'quantity': 5})
+
+ result = DataFrame.from_records(documents, index='order_id')
+ assert result.index.name == 'order_id'
+
+ # MultiIndex
+ result = DataFrame.from_records(documents,
+ index=['order_id', 'quantity'])
+ assert result.index.names == ('order_id', 'quantity')
+
+ def test_from_records_misc_brokenness(self):
+ # #2179
+
+ data = {1: ['foo'], 2: ['bar']}
+
+ result = DataFrame.from_records(data, columns=['a', 'b'])
+ exp = DataFrame(data, columns=['a', 'b'])
+ tm.assert_frame_equal(result, exp)
+
+ # overlap in index/index_names
+
+ data = {'a': [1, 2, 3], 'b': [4, 5, 6]}
+
+ result = DataFrame.from_records(data, index=['a', 'b', 'c'])
+ exp = DataFrame(data, index=['a', 'b', 'c'])
+ tm.assert_frame_equal(result, exp)
+
+ # GH 2623
+ rows = []
+ rows.append([datetime(2010, 1, 1), 1])
+ rows.append([datetime(2010, 1, 2), 'hi']) # test col upconverts to obj
+ df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
+ results = df2_obj.get_dtype_counts()
+        expected = Series({'datetime64[ns]': 1, 'object': 1})
+        tm.assert_series_equal(results.sort_index(), expected.sort_index())
+
+ rows = []
+ rows.append([datetime(2010, 1, 1), 1])
+ rows.append([datetime(2010, 1, 2), 1])
+ df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
+ results = df2_obj.get_dtype_counts().sort_index()
+ expected = Series({'datetime64[ns]': 1, 'int64': 1})
+ tm.assert_series_equal(results, expected)
+
+ def test_from_records_empty(self):
+ # 3562
+ result = DataFrame.from_records([], columns=['a', 'b', 'c'])
+ expected = DataFrame(columns=['a', 'b', 'c'])
+ tm.assert_frame_equal(result, expected)
+
+ result = DataFrame.from_records([], columns=['a', 'b', 'b'])
+ expected = DataFrame(columns=['a', 'b', 'b'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_from_records_empty_with_nonempty_fields_gh3682(self):
+ a = np.array([(1, 2)], dtype=[('id', np.int64), ('value', np.int64)])
+ df = DataFrame.from_records(a, index='id')
+ tm.assert_index_equal(df.index, Index([1], name='id'))
+ assert df.index.name == 'id'
+ tm.assert_index_equal(df.columns, Index(['value']))
+
+ b = np.array([], dtype=[('id', np.int64), ('value', np.int64)])
+ df = DataFrame.from_records(b, index='id')
+ tm.assert_index_equal(df.index, Index([], name='id'))
+ assert df.index.name == 'id'
+
+ def test_from_records_with_datetimes(self):
+
+ # this may fail on certain platforms because of a numpy issue
+        # related to GH6140
+ if not is_platform_little_endian():
+ pytest.skip("known failure of test on non-little endian")
+
+ # construction with a null in a recarray
+ # GH 6140
+ expected = DataFrame({'EXPIRY': [datetime(2005, 3, 1, 0, 0), None]})
+
+ arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])]
+ dtypes = [('EXPIRY', '<M8[ns]')]
+
+ try:
+ recarray = np.core.records.fromarrays(arrdata, dtype=dtypes)
+ except (ValueError):
+ pytest.skip("known failure of numpy rec array creation")
+
+ result = DataFrame.from_records(recarray)
+ tm.assert_frame_equal(result, expected)
+
+ # coercion should work too
+ arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])]
+ dtypes = [('EXPIRY', '<M8[m]')]
+ recarray = np.core.records.fromarrays(arrdata, dtype=dtypes)
+ result = DataFrame.from_records(recarray)
+ tm.assert_frame_equal(result, expected)
+
+ def test_from_records_sequencelike(self):
+ df = DataFrame({'A': np.array(np.random.randn(6), dtype=np.float64),
+ 'A1': np.array(np.random.randn(6), dtype=np.float64),
+ 'B': np.array(np.arange(6), dtype=np.int64),
+ 'C': ['foo'] * 6,
+ 'D': np.array([True, False] * 3, dtype=bool),
+ 'E': np.array(np.random.randn(6), dtype=np.float32),
+ 'E1': np.array(np.random.randn(6), dtype=np.float32),
+ 'F': np.array(np.arange(6), dtype=np.int32)})
+
+        # it is actually tricky to create the record-like arrays and
+        # keep the dtypes intact
+ blocks = df._to_dict_of_blocks()
+ tuples = []
+ columns = []
+ dtypes = []
+ for dtype, b in compat.iteritems(blocks):
+ columns.extend(b.columns)
+ dtypes.extend([(c, np.dtype(dtype).descr[0][1])
+ for c in b.columns])
+ for i in range(len(df.index)):
+ tup = []
+ for _, b in compat.iteritems(blocks):
+ tup.extend(b.iloc[i].values)
+ tuples.append(tuple(tup))
+
+ recarray = np.array(tuples, dtype=dtypes).view(np.recarray)
+ recarray2 = df.to_records()
+ lists = [list(x) for x in tuples]
+
+ # tuples (lose the dtype info)
+ result = (DataFrame.from_records(tuples, columns=columns)
+ .reindex(columns=df.columns))
+
+ # created recarray and with to_records recarray (have dtype info)
+ result2 = (DataFrame.from_records(recarray, columns=columns)
+ .reindex(columns=df.columns))
+ result3 = (DataFrame.from_records(recarray2, columns=columns)
+ .reindex(columns=df.columns))
+
+        # list of tuples (no dtype info)
+ result4 = (DataFrame.from_records(lists, columns=columns)
+ .reindex(columns=df.columns))
+
+ tm.assert_frame_equal(result, df, check_dtype=False)
+ tm.assert_frame_equal(result2, df)
+ tm.assert_frame_equal(result3, df)
+ tm.assert_frame_equal(result4, df, check_dtype=False)
+
+ # tuples is in the order of the columns
+ result = DataFrame.from_records(tuples)
+ tm.assert_index_equal(result.columns, pd.Index(lrange(8)))
+
+ # test exclude parameter & we are casting the results here (as we don't
+ # have dtype info to recover)
+ columns_to_test = [columns.index('C'), columns.index('E1')]
+
+ exclude = list(set(range(8)) - set(columns_to_test))
+ result = DataFrame.from_records(tuples, exclude=exclude)
+ result.columns = [columns[i] for i in sorted(columns_to_test)]
+ tm.assert_series_equal(result['C'], df['C'])
+ tm.assert_series_equal(result['E1'], df['E1'].astype('float64'))
+
+ # empty case
+ result = DataFrame.from_records([], columns=['foo', 'bar', 'baz'])
+ assert len(result) == 0
+ tm.assert_index_equal(result.columns,
+ pd.Index(['foo', 'bar', 'baz']))
+
+ result = DataFrame.from_records([])
+ assert len(result) == 0
+ assert len(result.columns) == 0
+
+ def test_from_records_dictlike(self):
+
+ # test the dict methods
+ df = DataFrame({'A': np.array(np.random.randn(6), dtype=np.float64),
+ 'A1': np.array(np.random.randn(6), dtype=np.float64),
+ 'B': np.array(np.arange(6), dtype=np.int64),
+ 'C': ['foo'] * 6,
+ 'D': np.array([True, False] * 3, dtype=bool),
+ 'E': np.array(np.random.randn(6), dtype=np.float32),
+ 'E1': np.array(np.random.randn(6), dtype=np.float32),
+ 'F': np.array(np.arange(6), dtype=np.int32)})
+
+ # columns is in a different order here than the actual items iterated
+ # from the dict
+ blocks = df._to_dict_of_blocks()
+ columns = []
+ for dtype, b in compat.iteritems(blocks):
+ columns.extend(b.columns)
+
+ asdict = {x: y for x, y in compat.iteritems(df)}
+ asdict2 = {x: y.values for x, y in compat.iteritems(df)}
+
+ # dict of series & dict of ndarrays (have dtype info)
+ results = []
+ results.append(DataFrame.from_records(
+ asdict).reindex(columns=df.columns))
+ results.append(DataFrame.from_records(asdict, columns=columns)
+ .reindex(columns=df.columns))
+ results.append(DataFrame.from_records(asdict2, columns=columns)
+ .reindex(columns=df.columns))
+
+ for r in results:
+ tm.assert_frame_equal(r, df)
+
+ def test_from_records_with_index_data(self):
+ df = DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
+
+ data = np.random.randn(10)
+ df1 = DataFrame.from_records(df, index=data)
+ tm.assert_index_equal(df1.index, Index(data))
+
+ def test_from_records_bad_index_column(self):
+ df = DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
+
+ # should pass
+ df1 = DataFrame.from_records(df, index=['C'])
+ tm.assert_index_equal(df1.index, Index(df.C))
+
+ df1 = DataFrame.from_records(df, index='C')
+ tm.assert_index_equal(df1.index, Index(df.C))
+
+ # should fail
+ pytest.raises(ValueError, DataFrame.from_records, df, index=[2])
+ pytest.raises(KeyError, DataFrame.from_records, df, index=2)
+
+ def test_from_records_non_tuple(self):
+ class Record(object):
+
+ def __init__(self, *args):
+ self.args = args
+
+ def __getitem__(self, i):
+ return self.args[i]
+
+ def __iter__(self):
+ return iter(self.args)
+
+ recs = [Record(1, 2, 3), Record(4, 5, 6), Record(7, 8, 9)]
+ tups = lmap(tuple, recs)
+
+ result = DataFrame.from_records(recs)
+ expected = DataFrame.from_records(tups)
+ tm.assert_frame_equal(result, expected)
+
+ def test_from_records_len0_with_columns(self):
+ # #2633
+ result = DataFrame.from_records([], index='foo',
+ columns=['foo', 'bar'])
+ expected = Index(['bar'])
+
+ assert len(result) == 0
+ assert result.index.name == 'foo'
+ tm.assert_index_equal(result.columns, expected)
+
+ def test_to_frame_with_falsey_names(self):
+ # GH 16114
+ result = Series(name=0).to_frame().dtypes
+ expected = Series({0: np.float64})
+ tm.assert_series_equal(result, expected)
+
+ result = DataFrame(Series(name=0)).dtypes
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', [None, 'uint8', 'category'])
+ def test_constructor_range_dtype(self, dtype):
+ # GH 16804
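+        # the range is materialized; with dtype=None inference gives
+        # int64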
+ expected = DataFrame({'A': [0, 1, 2, 3, 4]}, dtype=dtype or 'int64')
+ result = DataFrame({'A': range(5)}, dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+ def test_frame_from_list_subclass(self):
+ # GH21226
+ class List(list):
+ pass
+
+ expected = DataFrame([[1, 2, 3], [4, 5, 6]])
+ result = DataFrame(List([List([1, 2, 3]), List([4, 5, 6])]))
+ tm.assert_frame_equal(result, expected)
+
+
+class TestDataFrameConstructorWithDatetimeTZ(TestData):
+
+ def test_from_dict(self):
+
+ # 8260
+ # support datetime64 with tz
+
+ idx = Index(date_range('20130101', periods=3, tz='US/Eastern'),
+ name='foo')
+ dr = date_range('20130110', periods=3)
+
+ # construction
+ df = DataFrame({'A': idx, 'B': dr})
+        assert df['A'].dtype == 'datetime64[ns, US/Eastern]'
+ assert df['A'].name == 'A'
+ tm.assert_series_equal(df['A'], Series(idx, name='A'))
+ tm.assert_series_equal(df['B'], Series(dr, name='B'))
+
+ def test_from_index(self):
+
+ # from index
+ idx2 = date_range('20130101', periods=3, tz='US/Eastern', name='foo')
+ df2 = DataFrame(idx2)
+ tm.assert_series_equal(df2['foo'], Series(idx2, name='foo'))
+ df2 = DataFrame(Series(idx2))
+ tm.assert_series_equal(df2['foo'], Series(idx2, name='foo'))
+
+ idx2 = date_range('20130101', periods=3, tz='US/Eastern')
+ df2 = DataFrame(idx2)
+ tm.assert_series_equal(df2[0], Series(idx2, name=0))
+ df2 = DataFrame(Series(idx2))
+ tm.assert_series_equal(df2[0], Series(idx2, name=0))
+
+ def test_frame_dict_constructor_datetime64_1680(self):
+ dr = date_range('1/1/2012', periods=10)
+ s = Series(dr, index=dr)
+
+ # it works!
+ DataFrame({'a': 'foo', 'b': s}, index=dr)
+ DataFrame({'a': 'foo', 'b': s.values}, index=dr)
+
+ def test_frame_datetime64_mixed_index_ctor_1681(self):
+ dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
+ ts = Series(dr)
+
+ # it works!
+ d = DataFrame({'A': 'foo', 'B': ts}, index=dr)
+ assert d['B'].isna().all()
+
+ def test_frame_timeseries_to_records(self):
+ index = date_range('1/1/2000', periods=10)
+ df = DataFrame(np.random.randn(10, 3), index=index,
+ columns=['a', 'b', 'c'])
+
+ result = df.to_records()
+        assert result['index'].dtype == 'M8[ns]'
+
+ result = df.to_records(index=False)
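+        # smoke check only: no assertion, just ensure this does not raise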
+
+ def test_frame_timeseries_column(self):
+ # GH19157
+ dr = date_range(start='20130101T10:00:00', periods=3, freq='T',
+ tz='US/Eastern')
+ result = DataFrame(dr, columns=['timestamps'])
+ expected = DataFrame({'timestamps': [
+ Timestamp('20130101T10:00:00', tz='US/Eastern'),
+ Timestamp('20130101T10:01:00', tz='US/Eastern'),
+ Timestamp('20130101T10:02:00', tz='US/Eastern')]})
+ tm.assert_frame_equal(result, expected)
+
+ def test_nested_dict_construction(self):
+ # GH22227
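+        # index entries missing from the nested dicts (here 2003) are
+        # filled with NaN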
+ columns = ['Nevada', 'Ohio']
+ pop = {'Nevada': {2001: 2.4, 2002: 2.9},
+ 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
+ result = pd.DataFrame(pop, index=[2001, 2002, 2003], columns=columns)
+ expected = pd.DataFrame(
+ [(2.4, 1.7), (2.9, 3.6), (np.nan, np.nan)],
+ columns=columns,
+ index=pd.Index([2001, 2002, 2003])
+ )
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_convert_to.py b/contrib/python/pandas/py2/pandas/tests/frame/test_convert_to.py
new file mode 100644
index 00000000000..7b98395dd6d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_convert_to.py
@@ -0,0 +1,504 @@
+# -*- coding: utf-8 -*-
+
+import collections
+from collections import OrderedDict, defaultdict
+from datetime import datetime
+
+import numpy as np
+import pytest
+import pytz
+
+from pandas.compat import long
+
+from pandas import DataFrame, MultiIndex, Series, Timestamp, compat, date_range
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+
+
+class TestDataFrameConvertTo(TestData):
+
+ def test_to_dict_timestamp(self):
+
+ # GH11247
+        # split/records orients were producing np.datetime64 values
+        # rather than Timestamps for datetime64[ns] dtypes
+
+ tsmp = Timestamp('20130101')
+ test_data = DataFrame({'A': [tsmp, tsmp], 'B': [tsmp, tsmp]})
+ test_data_mixed = DataFrame({'A': [tsmp, tsmp], 'B': [1, 2]})
+
+ expected_records = [{'A': tsmp, 'B': tsmp},
+ {'A': tsmp, 'B': tsmp}]
+ expected_records_mixed = [{'A': tsmp, 'B': 1},
+ {'A': tsmp, 'B': 2}]
+
+ assert (test_data.to_dict(orient='records') ==
+ expected_records)
+ assert (test_data_mixed.to_dict(orient='records') ==
+ expected_records_mixed)
+
+ expected_series = {
+ 'A': Series([tsmp, tsmp], name='A'),
+ 'B': Series([tsmp, tsmp], name='B'),
+ }
+ expected_series_mixed = {
+ 'A': Series([tsmp, tsmp], name='A'),
+ 'B': Series([1, 2], name='B'),
+ }
+
+ tm.assert_dict_equal(test_data.to_dict(orient='series'),
+ expected_series)
+ tm.assert_dict_equal(test_data_mixed.to_dict(orient='series'),
+ expected_series_mixed)
+
+ expected_split = {
+ 'index': [0, 1],
+ 'data': [[tsmp, tsmp],
+ [tsmp, tsmp]],
+ 'columns': ['A', 'B']
+ }
+ expected_split_mixed = {
+ 'index': [0, 1],
+ 'data': [[tsmp, 1],
+ [tsmp, 2]],
+ 'columns': ['A', 'B']
+ }
+
+ tm.assert_dict_equal(test_data.to_dict(orient='split'),
+ expected_split)
+ tm.assert_dict_equal(test_data_mixed.to_dict(orient='split'),
+ expected_split_mixed)
+
+ def test_to_dict_index_not_unique_with_index_orient(self):
+ # GH22801
+ # Data loss when indexes are not unique. Raise ValueError.
+ df = DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
+ pytest.raises(ValueError, df.to_dict, orient='index')
+
+ def test_to_dict_invalid_orient(self):
+ df = DataFrame({'A': [0, 1]})
+ pytest.raises(ValueError, df.to_dict, orient='xinvalid')
+
+ def test_to_records_dt64(self):
+ df = DataFrame([["one", "two", "three"],
+ ["four", "five", "six"]],
+ index=date_range("2012-01-01", "2012-01-02"))
+
+ # convert_datetime64 defaults to None
+ expected = df.index.values[0]
+ result = df.to_records()['index'][0]
+ assert expected == result
+
+ # check for FutureWarning if convert_datetime64=False is passed
+ with tm.assert_produces_warning(FutureWarning):
+ expected = df.index.values[0]
+ result = df.to_records(convert_datetime64=False)['index'][0]
+ assert expected == result
+
+ # check for FutureWarning if convert_datetime64=True is passed
+ with tm.assert_produces_warning(FutureWarning):
+ expected = df.index[0]
+ result = df.to_records(convert_datetime64=True)['index'][0]
+ assert expected == result
+
+ def test_to_records_with_multindex(self):
+ # GH3189
+ index = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+ ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+ data = np.zeros((8, 4))
+ df = DataFrame(data, index=index)
+ r = df.to_records(index=True)['level_0']
+ assert 'bar' in r
+ assert 'one' not in r
+
+ def test_to_records_with_Mapping_type(self):
+ import email
+ from email.parser import Parser
+
+ compat.Mapping.register(email.message.Message)
+
+        headers = Parser().parsestr('From: <[email protected]>\n'
+                                    'To: <[email protected]>\n'
+ 'Subject: Test message\n'
+ '\n'
+ 'Body would go here\n')
+
+ frame = DataFrame.from_records([headers])
+        assert all(x in frame for x in ['To', 'Subject', 'From'])
+
+ def test_to_records_floats(self):
+ df = DataFrame(np.random.rand(10, 10))
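+        # smoke test: just ensure to_records() does not raise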
+ df.to_records()
+
+ def test_to_records_index_name(self):
+ df = DataFrame(np.random.randn(3, 3))
+ df.index.name = 'X'
+ rs = df.to_records()
+ assert 'X' in rs.dtype.fields
+
+ df = DataFrame(np.random.randn(3, 3))
+ rs = df.to_records()
+ assert 'index' in rs.dtype.fields
+
+ df.index = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')])
+ df.index.names = ['A', None]
+ rs = df.to_records()
+ assert 'level_0' in rs.dtype.fields
+
+ def test_to_records_with_unicode_index(self):
+ # GH13172
+ # unicode_literals conflict with to_records
+ result = DataFrame([{u'a': u'x', u'b': 'y'}]).set_index(u'a') \
+ .to_records()
+ expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')])
+ tm.assert_almost_equal(result, expected)
+
+ def test_to_records_with_unicode_column_names(self):
+ # xref issue: https://github.com/numpy/numpy/issues/2407
+ # Issue #11879. to_records used to raise an exception when used
+ # with column names containing non-ascii characters in Python 2
+ result = DataFrame(data={u"accented_name_é": [1.0]}).to_records()
+
+ # Note that numpy allows for unicode field names but dtypes need
+ # to be specified using dictionary instead of list of tuples.
+ expected = np.rec.array(
+ [(0, 1.0)],
+ dtype={"names": ["index", u"accented_name_é"],
+ "formats": ['=i8', '=f8']}
+ )
+ tm.assert_almost_equal(result, expected)
+
+ def test_to_records_with_categorical(self):
+
+ # GH8626
+
+ # dict creation
+ df = DataFrame({'A': list('abc')}, dtype='category')
+ expected = Series(list('abc'), dtype='category', name='A')
+ tm.assert_series_equal(df['A'], expected)
+
+ # list-like creation
+ df = DataFrame(list('abc'), dtype='category')
+ expected = Series(list('abc'), dtype='category', name=0)
+ tm.assert_series_equal(df[0], expected)
+
+ # to record array
+ # this coerces
+ result = df.to_records()
+ expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
+ dtype=[('index', '=i8'), ('0', 'O')])
+ tm.assert_almost_equal(result, expected)
+
+ @pytest.mark.parametrize("kwargs,expected", [
+ # No dtypes --> default to array dtypes.
+ (dict(),
+ np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+ dtype=[("index", "<i8"), ("A", "<i8"),
+ ("B", "<f8"), ("C", "O")])),
+
+ # Should have no effect in this case.
+ (dict(index=True),
+ np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+ dtype=[("index", "<i8"), ("A", "<i8"),
+ ("B", "<f8"), ("C", "O")])),
+
+ # Column dtype applied across the board. Index unaffected.
+ (dict(column_dtypes="<U4"),
+ np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+ dtype=[("index", "<i8"), ("A", "<U4"),
+ ("B", "<U4"), ("C", "<U4")])),
+
+ # Index dtype applied across the board. Columns unaffected.
+ (dict(index_dtypes="<U1"),
+ np.rec.array([("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
+ dtype=[("index", "<U1"), ("A", "<i8"),
+ ("B", "<f8"), ("C", "O")])),
+
+ # Pass in a type instance.
+ (dict(column_dtypes=np.unicode),
+ np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+ dtype=[("index", "<i8"), ("A", "<U"),
+ ("B", "<U"), ("C", "<U")])),
+
+ # Pass in a dictionary (name-only).
+ (dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
+ np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+ dtype=[("index", "<i8"), ("A", "i1"),
+ ("B", "<f4"), ("C", "<U2")])),
+
+ # Pass in a dictionary (indices-only).
+ (dict(index_dtypes={0: "int16"}),
+ np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+ dtype=[("index", "i2"), ("A", "<i8"),
+ ("B", "<f8"), ("C", "O")])),
+
+ # Ignore index mappings if index is not True.
+ (dict(index=False, index_dtypes="<U2"),
+ np.rec.array([(1, 0.2, "a"), (2, 1.5, "bc")],
+ dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")])),
+
+ # Non-existent names / indices in mapping should not error.
+ (dict(index_dtypes={0: "int16", "not-there": "float32"}),
+ np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+ dtype=[("index", "i2"), ("A", "<i8"),
+ ("B", "<f8"), ("C", "O")])),
+
+ # Names / indices not in mapping default to array dtype.
+ (dict(column_dtypes={"A": np.int8, "B": np.float32}),
+ np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+ dtype=[("index", "<i8"), ("A", "i1"),
+ ("B", "<f4"), ("C", "O")])),
+
+ # Mixture of everything.
+ (dict(column_dtypes={"A": np.int8, "B": np.float32},
+ index_dtypes="<U2"),
+ np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+ dtype=[("index", "<U2"), ("A", "i1"),
+ ("B", "<f4"), ("C", "O")])),
+
+        # Invalid dtype values.
+ (dict(index=False, column_dtypes=list()),
+ "Invalid dtype \\[\\] specified for column A"),
+
+ (dict(index=False, column_dtypes={"A": "int32", "B": 5}),
+ "Invalid dtype 5 specified for column B"),
+ ])
+ def test_to_records_dtype(self, kwargs, expected):
+ # see gh-18146
+ df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
+
+ if isinstance(expected, str):
+ with pytest.raises(ValueError, match=expected):
+ df.to_records(**kwargs)
+ else:
+ result = df.to_records(**kwargs)
+ tm.assert_almost_equal(result, expected)
+
+ @pytest.mark.parametrize("df,kwargs,expected", [
+ # MultiIndex in the index.
+ (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ columns=list("abc")).set_index(["a", "b"]),
+ dict(column_dtypes="float64", index_dtypes={0: "int32", 1: "int8"}),
+ np.rec.array([(1, 2, 3.), (4, 5, 6.), (7, 8, 9.)],
+ dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")])),
+
+ # MultiIndex in the columns.
+ (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
+ ("c", "f")])),
+ dict(column_dtypes={0: "<U1", 2: "float32"}, index_dtypes="float32"),
+ np.rec.array([(0., u"1", 2, 3.), (1., u"4", 5, 6.),
+ (2., u"7", 8, 9.)],
+ dtype=[("index", "<f4"),
+ ("('a', 'd')", "<U1"),
+ ("('b', 'e')", "<i8"),
+ ("('c', 'f')", "<f4")])),
+
+ # MultiIndex in both the columns and index.
+ (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ columns=MultiIndex.from_tuples([
+ ("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")),
+ index=MultiIndex.from_tuples([
+ ("d", -4), ("d", -5), ("f", -6)], names=list("cd"))),
+ dict(column_dtypes="float64", index_dtypes={0: "<U2", 1: "int8"}),
+ np.rec.array([("d", -4, 1., 2., 3.), ("d", -5, 4., 5., 6.),
+ ("f", -6, 7, 8, 9.)],
+ dtype=[("c", "<U2"), ("d", "i1"),
+ ("('a', 'd')", "<f8"), ("('b', 'e')", "<f8"),
+ ("('c', 'f')", "<f8")]))
+ ])
+ def test_to_records_dtype_mi(self, df, kwargs, expected):
+ # see gh-18146
+ result = df.to_records(**kwargs)
+ tm.assert_almost_equal(result, expected)
+
+ def test_to_records_dict_like(self):
+ # see gh-18146
+ class DictLike(object):
+ def __init__(self, **kwargs):
+ self.d = kwargs.copy()
+
+ def __getitem__(self, key):
+ return self.d.__getitem__(key)
+
+ def __contains__(self, key):
+ return key in self.d
+
+ def keys(self):
+ return self.d.keys()
+
+ df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
+
+ dtype_mappings = dict(column_dtypes=DictLike(**{"A": np.int8,
+ "B": np.float32}),
+ index_dtypes="<U2")
+
+ result = df.to_records(**dtype_mappings)
+ expected = np.rec.array([("0", "1", "0.2", "a"),
+ ("1", "2", "1.5", "bc")],
+ dtype=[("index", "<U2"), ("A", "i1"),
+ ("B", "<f4"), ("C", "O")])
+ tm.assert_almost_equal(result, expected)
+
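+    # the single-letter orients below are accepted abbreviations:
+    # 'l' -> 'list', 's' -> 'series', 'sp' -> 'split', 'r' -> 'records',
+    # 'i' -> 'index'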
+ @pytest.mark.parametrize('mapping', [
+ dict,
+ collections.defaultdict(list),
+ collections.OrderedDict])
+ def test_to_dict(self, mapping):
+ test_data = {
+ 'A': {'1': 1, '2': 2},
+ 'B': {'1': '1', '2': '2', '3': '3'},
+ }
+
+ # GH16122
+ recons_data = DataFrame(test_data).to_dict(into=mapping)
+
+ for k, v in compat.iteritems(test_data):
+ for k2, v2 in compat.iteritems(v):
+ assert (v2 == recons_data[k][k2])
+
+ recons_data = DataFrame(test_data).to_dict("l", mapping)
+
+ for k, v in compat.iteritems(test_data):
+ for k2, v2 in compat.iteritems(v):
+ assert (v2 == recons_data[k][int(k2) - 1])
+
+ recons_data = DataFrame(test_data).to_dict("s", mapping)
+
+ for k, v in compat.iteritems(test_data):
+ for k2, v2 in compat.iteritems(v):
+ assert (v2 == recons_data[k][k2])
+
+ recons_data = DataFrame(test_data).to_dict("sp", mapping)
+ expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
+ 'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
+ tm.assert_dict_equal(recons_data, expected_split)
+
+ recons_data = DataFrame(test_data).to_dict("r", mapping)
+ expected_records = [{'A': 1.0, 'B': '1'},
+ {'A': 2.0, 'B': '2'},
+ {'A': np.nan, 'B': '3'}]
+ assert isinstance(recons_data, list)
+ assert (len(recons_data) == 3)
+ for l, r in zip(recons_data, expected_records):
+ tm.assert_dict_equal(l, r)
+
+ # GH10844
+ recons_data = DataFrame(test_data).to_dict("i")
+
+ for k, v in compat.iteritems(test_data):
+ for k2, v2 in compat.iteritems(v):
+ assert (v2 == recons_data[k2][k])
+
+ df = DataFrame(test_data)
+ df['duped'] = df[df.columns[0]]
+ recons_data = df.to_dict("i")
+ comp_data = test_data.copy()
+ comp_data['duped'] = comp_data[df.columns[0]]
+ for k, v in compat.iteritems(comp_data):
+ for k2, v2 in compat.iteritems(v):
+ assert (v2 == recons_data[k2][k])
+
+ @pytest.mark.parametrize('mapping', [
+ list,
+ collections.defaultdict,
+ []])
+ def test_to_dict_errors(self, mapping):
+ # GH16122
+ df = DataFrame(np.random.randn(3, 3))
+ with pytest.raises(TypeError):
+ df.to_dict(into=mapping)
+
+ def test_to_dict_not_unique_warning(self):
+        # GH16927: when converting to a dict, columns with duplicate
+        # names are dropped and a UserWarning is raised
+ df = DataFrame([[1, 2, 3]], columns=['a', 'a', 'b'])
+ with tm.assert_produces_warning(UserWarning):
+ df.to_dict()
+
+ @pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
+ def test_to_records_datetimeindex_with_tz(self, tz):
+ # GH13937
+ dr = date_range('2016-01-01', periods=10,
+ freq='S', tz=tz)
+
+ df = DataFrame({'datetime': dr}, index=dr)
+
+ expected = df.to_records()
+ result = df.tz_convert("UTC").to_records()
+
+ # both converted to UTC, so they are equal
+ tm.assert_numpy_array_equal(result, expected)
+
+ # orient - orient argument to to_dict function
+ # item_getter - function for extracting value from
+ # the resulting dict using column name and index
+ @pytest.mark.parametrize('orient,item_getter', [
+ ('dict', lambda d, col, idx: d[col][idx]),
+ ('records', lambda d, col, idx: d[idx][col]),
+ ('list', lambda d, col, idx: d[col][idx]),
+ ('split', lambda d, col, idx: d['data'][idx][d['columns'].index(col)]),
+ ('index', lambda d, col, idx: d[idx][col])
+ ])
+ def test_to_dict_box_scalars(self, orient, item_getter):
+ # 14216, 23753
+ # make sure that we are boxing properly
+ df = DataFrame({'a': [1, 2], 'b': [.1, .2]})
+ result = df.to_dict(orient=orient)
+ assert isinstance(item_getter(result, 'a', 0), (int, long))
+ assert isinstance(item_getter(result, 'b', 0), float)
+
+ def test_frame_to_dict_tz(self):
+        # GH18372: when converting to dict with orient='records',
+        # tz-aware datetime columns were not converted as required
+ data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
+ (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)]
+ df = DataFrame(list(data), columns=["d", ])
+
+ result = df.to_dict(orient='records')
+ expected = [
+ {'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)},
+ {'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)},
+ ]
+ tm.assert_dict_equal(result[0], expected[0])
+ tm.assert_dict_equal(result[1], expected[1])
+
+ @pytest.mark.parametrize('into, expected', [
+ (dict, {0: {'int_col': 1, 'float_col': 1.0},
+ 1: {'int_col': 2, 'float_col': 2.0},
+ 2: {'int_col': 3, 'float_col': 3.0}}),
+ (OrderedDict, OrderedDict([(0, {'int_col': 1, 'float_col': 1.0}),
+ (1, {'int_col': 2, 'float_col': 2.0}),
+ (2, {'int_col': 3, 'float_col': 3.0})])),
+ (defaultdict(list), defaultdict(list,
+ {0: {'int_col': 1, 'float_col': 1.0},
+ 1: {'int_col': 2, 'float_col': 2.0},
+ 2: {'int_col': 3, 'float_col': 3.0}}))
+ ])
+ def test_to_dict_index_dtypes(self, into, expected):
+ # GH 18580
+        # When using to_dict(orient='index') on a DataFrame with both
+        # int and float columns, the int columns were incorrectly cast
+        # to float
+
+ df = DataFrame({'int_col': [1, 2, 3],
+ 'float_col': [1.0, 2.0, 3.0]})
+
+ result = df.to_dict(orient='index', into=into)
+ cols = ['int_col', 'float_col']
+ result = DataFrame.from_dict(result, orient='index')[cols]
+ expected = DataFrame.from_dict(expected, orient='index')[cols]
+ tm.assert_frame_equal(result, expected)
+
+ def test_to_dict_numeric_names(self):
+ # https://github.com/pandas-dev/pandas/issues/24940
+ df = DataFrame({str(i): [i] for i in range(5)})
+ result = set(df.to_dict('records')[0].keys())
+ expected = set(df.columns)
+ assert result == expected
+
+ def test_to_dict_wide(self):
+ # https://github.com/pandas-dev/pandas/issues/24939
+ df = DataFrame({('A_{:d}'.format(i)): [i] for i in range(256)})
+ result = df.to_dict('records')[0]
+ expected = {'A_{:d}'.format(i): i for i in range(256)}
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_dtypes.py b/contrib/python/pandas/py2/pandas/tests/frame/test_dtypes.py
new file mode 100644
index 00000000000..a9f8ab47b16
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_dtypes.py
@@ -0,0 +1,989 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+from pandas.compat import u
+
+from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Series, Timedelta, Timestamp,
+ _np_version_under1p14, compat, concat, date_range, option_context)
+from pandas.core.arrays import integer_array
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_frame_equal, assert_series_equal, makeCustomDataframe as mkdf)
+
+
[email protected](params=[str, compat.text_type])
+def text_dtype(request):
+ return request.param
+
+
+class TestDataFrameDataTypes(TestData):
+
+ def test_concat_empty_dataframe_dtypes(self):
+ df = DataFrame(columns=list("abc"))
+ df['a'] = df['a'].astype(np.bool_)
+ df['b'] = df['b'].astype(np.int32)
+ df['c'] = df['c'].astype(np.float64)
+
+ result = pd.concat([df, df])
+ assert result['a'].dtype == np.bool_
+ assert result['b'].dtype == np.int32
+ assert result['c'].dtype == np.float64
+
+ result = pd.concat([df, df.astype(np.float64)])
+ assert result['a'].dtype == np.object_
+ assert result['b'].dtype == np.float64
+ assert result['c'].dtype == np.float64
+
+ def test_empty_frame_dtypes_ftypes(self):
+ empty_df = pd.DataFrame()
+ assert_series_equal(empty_df.dtypes, pd.Series(dtype=np.object))
+ assert_series_equal(empty_df.ftypes, pd.Series(dtype=np.object))
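+        # informal note: ftypes pairs each dtype with its storage,
+        # e.g. 'float64:dense' vs 'float64:sparse'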
+
+ nocols_df = pd.DataFrame(index=[1, 2, 3])
+ assert_series_equal(nocols_df.dtypes, pd.Series(dtype=np.object))
+ assert_series_equal(nocols_df.ftypes, pd.Series(dtype=np.object))
+
+ norows_df = pd.DataFrame(columns=list("abc"))
+ assert_series_equal(norows_df.dtypes, pd.Series(
+ np.object, index=list("abc")))
+ assert_series_equal(norows_df.ftypes, pd.Series(
+ 'object:dense', index=list("abc")))
+
+ norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32)
+ assert_series_equal(norows_int_df.dtypes, pd.Series(
+ np.dtype('int32'), index=list("abc")))
+ assert_series_equal(norows_int_df.ftypes, pd.Series(
+ 'int32:dense', index=list("abc")))
+
+ odict = compat.OrderedDict
+ df = pd.DataFrame(odict([('a', 1), ('b', True), ('c', 1.0)]),
+ index=[1, 2, 3])
+ ex_dtypes = pd.Series(odict([('a', np.int64),
+ ('b', np.bool),
+ ('c', np.float64)]))
+ ex_ftypes = pd.Series(odict([('a', 'int64:dense'),
+ ('b', 'bool:dense'),
+ ('c', 'float64:dense')]))
+ assert_series_equal(df.dtypes, ex_dtypes)
+ assert_series_equal(df.ftypes, ex_ftypes)
+
+ # same but for empty slice of df
+ assert_series_equal(df[:0].dtypes, ex_dtypes)
+ assert_series_equal(df[:0].ftypes, ex_ftypes)
+
+ def test_datetime_with_tz_dtypes(self):
+ tzframe = DataFrame({'A': date_range('20130101', periods=3),
+ 'B': date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'C': date_range('20130101', periods=3, tz='CET')})
+ tzframe.iloc[1, 1] = pd.NaT
+ tzframe.iloc[1, 2] = pd.NaT
+ result = tzframe.dtypes.sort_index()
+ expected = Series([np.dtype('datetime64[ns]'),
+ DatetimeTZDtype('ns', 'US/Eastern'),
+ DatetimeTZDtype('ns', 'CET')],
+ ['A', 'B', 'C'])
+
+ assert_series_equal(result, expected)
+
+ def test_dtypes_are_correct_after_column_slice(self):
+ # GH6525
+ df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_)
+ odict = compat.OrderedDict
+ assert_series_equal(df.dtypes,
+ pd.Series(odict([('a', np.float_),
+ ('b', np.float_),
+ ('c', np.float_)])))
+ assert_series_equal(df.iloc[:, 2:].dtypes,
+ pd.Series(odict([('c', np.float_)])))
+ assert_series_equal(df.dtypes,
+ pd.Series(odict([('a', np.float_),
+ ('b', np.float_),
+ ('c', np.float_)])))
+
+ def test_select_dtypes_include_using_list_like(self):
+ df = DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.Categorical(list('abc')),
+ 'g': pd.date_range('20130101', periods=3),
+ 'h': pd.date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'i': pd.date_range('20130101', periods=3,
+ tz='CET'),
+ 'j': pd.period_range('2013-01', periods=3,
+ freq='M'),
+ 'k': pd.timedelta_range('1 day', periods=3)})
+
+ ri = df.select_dtypes(include=[np.number])
+ ei = df[['b', 'c', 'd', 'k']]
+ assert_frame_equal(ri, ei)
+
+ ri = df.select_dtypes(include=[np.number], exclude=['timedelta'])
+ ei = df[['b', 'c', 'd']]
+ assert_frame_equal(ri, ei)
+
+ ri = df.select_dtypes(include=[np.number, 'category'],
+ exclude=['timedelta'])
+ ei = df[['b', 'c', 'd', 'f']]
+ assert_frame_equal(ri, ei)
+
+ ri = df.select_dtypes(include=['datetime'])
+ ei = df[['g']]
+ assert_frame_equal(ri, ei)
+
+ ri = df.select_dtypes(include=['datetime64'])
+ ei = df[['g']]
+ assert_frame_equal(ri, ei)
+
+ ri = df.select_dtypes(include=['datetimetz'])
+ ei = df[['h', 'i']]
+ assert_frame_equal(ri, ei)
+
+ pytest.raises(NotImplementedError,
+ lambda: df.select_dtypes(include=['period']))
+
+ def test_select_dtypes_exclude_using_list_like(self):
+ df = DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True]})
+ re = df.select_dtypes(exclude=[np.number])
+ ee = df[['a', 'e']]
+ assert_frame_equal(re, ee)
+
+ def test_select_dtypes_exclude_include_using_list_like(self):
+ df = DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.date_range('now', periods=3).values})
+ exclude = np.datetime64,
+ include = np.bool_, 'integer'
+ r = df.select_dtypes(include=include, exclude=exclude)
+ e = df[['b', 'c', 'e']]
+ assert_frame_equal(r, e)
+
+ exclude = 'datetime',
+ include = 'bool', 'int64', 'int32'
+ r = df.select_dtypes(include=include, exclude=exclude)
+ e = df[['b', 'e']]
+ assert_frame_equal(r, e)
+
+ def test_select_dtypes_include_using_scalars(self):
+ df = DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.Categorical(list('abc')),
+ 'g': pd.date_range('20130101', periods=3),
+ 'h': pd.date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'i': pd.date_range('20130101', periods=3,
+ tz='CET'),
+ 'j': pd.period_range('2013-01', periods=3,
+ freq='M'),
+ 'k': pd.timedelta_range('1 day', periods=3)})
+
+ ri = df.select_dtypes(include=np.number)
+ ei = df[['b', 'c', 'd', 'k']]
+ assert_frame_equal(ri, ei)
+
+ ri = df.select_dtypes(include='datetime')
+ ei = df[['g']]
+ assert_frame_equal(ri, ei)
+
+ ri = df.select_dtypes(include='datetime64')
+ ei = df[['g']]
+ assert_frame_equal(ri, ei)
+
+ ri = df.select_dtypes(include='category')
+ ei = df[['f']]
+ assert_frame_equal(ri, ei)
+
+ pytest.raises(NotImplementedError,
+ lambda: df.select_dtypes(include='period'))
+
+ def test_select_dtypes_exclude_using_scalars(self):
+ df = DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.Categorical(list('abc')),
+ 'g': pd.date_range('20130101', periods=3),
+ 'h': pd.date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'i': pd.date_range('20130101', periods=3,
+ tz='CET'),
+ 'j': pd.period_range('2013-01', periods=3,
+ freq='M'),
+ 'k': pd.timedelta_range('1 day', periods=3)})
+
+ ri = df.select_dtypes(exclude=np.number)
+ ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']]
+ assert_frame_equal(ri, ei)
+
+ ri = df.select_dtypes(exclude='category')
+ ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']]
+ assert_frame_equal(ri, ei)
+
+ pytest.raises(NotImplementedError,
+ lambda: df.select_dtypes(exclude='period'))
+
+ def test_select_dtypes_include_exclude_using_scalars(self):
+ df = DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.Categorical(list('abc')),
+ 'g': pd.date_range('20130101', periods=3),
+ 'h': pd.date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'i': pd.date_range('20130101', periods=3,
+ tz='CET'),
+ 'j': pd.period_range('2013-01', periods=3,
+ freq='M'),
+ 'k': pd.timedelta_range('1 day', periods=3)})
+
+ ri = df.select_dtypes(include=np.number, exclude='floating')
+ ei = df[['b', 'c', 'k']]
+ assert_frame_equal(ri, ei)
+
+ def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
+ df = DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.Categorical(list('abc')),
+ 'g': pd.date_range('20130101', periods=3),
+ 'h': pd.date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'i': pd.date_range('20130101', periods=3,
+ tz='CET'),
+ 'j': pd.period_range('2013-01', periods=3,
+ freq='M'),
+ 'k': pd.timedelta_range('1 day', periods=3)})
+
+ ri = df.select_dtypes(include=np.number,
+ exclude=['floating', 'timedelta'])
+ ei = df[['b', 'c']]
+ assert_frame_equal(ri, ei)
+
+ ri = df.select_dtypes(include=[np.number, 'category'],
+ exclude='floating')
+ ei = df[['b', 'c', 'f', 'k']]
+ assert_frame_equal(ri, ei)
+
+ def test_select_dtypes_duplicate_columns(self):
+ # GH20839
+ odict = compat.OrderedDict
+ df = DataFrame(odict([('a', list('abc')),
+ ('b', list(range(1, 4))),
+ ('c', np.arange(3, 6).astype('u1')),
+ ('d', np.arange(4.0, 7.0, dtype='float64')),
+ ('e', [True, False, True]),
+ ('f', pd.date_range('now', periods=3).values)]))
+ df.columns = ['a', 'a', 'b', 'b', 'b', 'c']
+
+ expected = DataFrame({'a': list(range(1, 4)),
+ 'b': np.arange(3, 6).astype('u1')})
+
+ result = df.select_dtypes(include=[np.number], exclude=['floating'])
+ assert_frame_equal(result, expected)
+
+ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
+ df = DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.date_range('now', periods=3).values})
+ df['g'] = df.f.diff()
+ assert not hasattr(np, 'u8')
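+        # informal note: 'i8', 'u8' and 'O' are dtype strings understood by
+        # numpy, not numpy attributes; the assert above guards that
+        # distinction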
+ r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta'])
+ e = df[['a', 'b']]
+ assert_frame_equal(r, e)
+
+ r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]'])
+ e = df[['a', 'b', 'g']]
+ assert_frame_equal(r, e)
+
+ def test_select_dtypes_empty(self):
+ df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
+ msg = 'at least one of include or exclude must be nonempty'
+ with pytest.raises(ValueError, match=msg):
+ df.select_dtypes()
+
+ def test_select_dtypes_bad_datetime64(self):
+ df = DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.date_range('now', periods=3).values})
+ with pytest.raises(ValueError, match='.+ is too specific'):
+ df.select_dtypes(include=['datetime64[D]'])
+
+ with pytest.raises(ValueError, match='.+ is too specific'):
+ df.select_dtypes(exclude=['datetime64[as]'])
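+        # informal note: only the generic 'datetime64' (i.e. datetime64[ns])
+        # may be selected; sized units such as [D] or [as] are rejected as
+        # too specific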
+
+ def test_select_dtypes_datetime_with_tz(self):
+
+ df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
+ B=Timestamp('20130603', tz='CET')),
+ index=range(5))
+ df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
+ result = df3.select_dtypes(include=['datetime64[ns]'])
+ expected = df3.reindex(columns=[])
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("dtype", [
+ str, "str", np.string_, "S1", "unicode", np.unicode_, "U1",
+ compat.text_type
+ ])
+ @pytest.mark.parametrize("arg", ["include", "exclude"])
+ def test_select_dtypes_str_raises(self, dtype, arg):
+ df = DataFrame({"a": list("abc"),
+ "g": list(u("abc")),
+ "b": list(range(1, 4)),
+ "c": np.arange(3, 6).astype("u1"),
+ "d": np.arange(4.0, 7.0, dtype="float64"),
+ "e": [True, False, True],
+ "f": pd.date_range("now", periods=3).values})
+ msg = "string dtypes are not allowed"
+ kwargs = {arg: [dtype]}
+
+ with pytest.raises(TypeError, match=msg):
+ df.select_dtypes(**kwargs)
+
+ def test_select_dtypes_bad_arg_raises(self):
+ df = DataFrame({'a': list('abc'),
+ 'g': list(u('abc')),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.date_range('now', periods=3).values})
+
+ msg = 'data type.*not understood'
+ with pytest.raises(TypeError, match=msg):
+ df.select_dtypes(['blargy, blarg, blarg'])
+
+ def test_select_dtypes_typecodes(self):
+ # GH 11990
+ df = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
+ expected = df
+ FLOAT_TYPES = list(np.typecodes['AllFloat'])
+ assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected)
+
+ def test_dtypes_gh8722(self):
+ self.mixed_frame['bool'] = self.mixed_frame['A'] > 0
+ result = self.mixed_frame.dtypes
+ expected = Series({k: v.dtype
+ for k, v in compat.iteritems(self.mixed_frame)},
+ index=result.index)
+ assert_series_equal(result, expected)
+
+ # compat, GH 8722
+ with option_context('use_inf_as_na', True):
+ df = DataFrame([[1]])
+ result = df.dtypes
+ assert_series_equal(result, Series({0: np.dtype('int64')}))
+
+ def test_ftypes(self):
+ frame = self.mixed_float
+ expected = Series(dict(A='float32:dense',
+ B='float32:dense',
+ C='float16:dense',
+ D='float64:dense')).sort_values()
+ result = frame.ftypes.sort_values()
+ assert_series_equal(result, expected)
+
+ def test_astype(self):
+ casted = self.frame.astype(int)
+ expected = DataFrame(self.frame.values.astype(int),
+ index=self.frame.index,
+ columns=self.frame.columns)
+ assert_frame_equal(casted, expected)
+
+ casted = self.frame.astype(np.int32)
+ expected = DataFrame(self.frame.values.astype(np.int32),
+ index=self.frame.index,
+ columns=self.frame.columns)
+ assert_frame_equal(casted, expected)
+
+ self.frame['foo'] = '5'
+ casted = self.frame.astype(int)
+ expected = DataFrame(self.frame.values.astype(int),
+ index=self.frame.index,
+ columns=self.frame.columns)
+ assert_frame_equal(casted, expected)
+
+ # mixed casting
+ def _check_cast(df, v):
+ assert (list({s.dtype.name for
+ _, s in compat.iteritems(df)})[0] == v)
+
+ mn = self.all_mixed._get_numeric_data().copy()
+ mn['little_float'] = np.array(12345., dtype='float16')
+ mn['big_float'] = np.array(123456789101112., dtype='float64')
+
+ casted = mn.astype('float64')
+ _check_cast(casted, 'float64')
+
+ casted = mn.astype('int64')
+ _check_cast(casted, 'int64')
+
+ casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float32')
+ _check_cast(casted, 'float32')
+
+ casted = mn.reindex(columns=['little_float']).astype('float16')
+ _check_cast(casted, 'float16')
+
+ casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float16')
+ _check_cast(casted, 'float16')
+
+ casted = mn.astype('float32')
+ _check_cast(casted, 'float32')
+
+ casted = mn.astype('int32')
+ _check_cast(casted, 'int32')
+
+ # to object
+ casted = mn.astype('O')
+ _check_cast(casted, 'object')
+
+ def test_astype_with_exclude_string(self):
+ df = self.frame.copy()
+ expected = self.frame.astype(int)
+ df['string'] = 'foo'
+ casted = df.astype(int, errors='ignore')
+
+ expected['string'] = 'foo'
+ assert_frame_equal(casted, expected)
+
+ df = self.frame.copy()
+ expected = self.frame.astype(np.int32)
+ df['string'] = 'foo'
+ casted = df.astype(np.int32, errors='ignore')
+
+ expected['string'] = 'foo'
+ assert_frame_equal(casted, expected)
+
+ def test_astype_with_view(self):
+
+ tf = self.mixed_float.reindex(columns=['A', 'B', 'C'])
+
+ casted = tf.astype(np.int64)
+
+ casted = tf.astype(np.float32)
+
+ # this is the only real reason to do it this way
+ tf = np.round(self.frame).astype(np.int32)
+ casted = tf.astype(np.float32, copy=False)
+
+ # TODO(wesm): verification?
+ tf = self.frame.astype(np.float64)
+ casted = tf.astype(np.int64, copy=False) # noqa
+
+ @pytest.mark.parametrize("dtype", [np.int32, np.int64])
+ @pytest.mark.parametrize("val", [np.nan, np.inf])
+ def test_astype_cast_nan_inf_int(self, val, dtype):
+ # see gh-14265
+ #
+ # Check NaN and inf --> raise error when converting to int.
+ msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
+ df = DataFrame([val])
+
+ with pytest.raises(ValueError, match=msg):
+ df.astype(dtype)
+
+ def test_astype_str(self, text_dtype):
+ # see gh-9757
+ a = Series(date_range("2010-01-04", periods=5))
+ b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
+ c = Series([Timedelta(x, unit="d") for x in range(5)])
+ d = Series(range(5))
+ e = Series([0.0, 0.2, 0.4, 0.6, 0.8])
+
+ df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})
+
+ # Datetime-like
+ # Test str and unicode on Python 2.x and just str on Python 3.x
+ result = df.astype(text_dtype)
+
+ expected = DataFrame({
+ "a": list(map(text_dtype,
+ map(lambda x: Timestamp(x)._date_repr, a._values))),
+ "b": list(map(text_dtype, map(Timestamp, b._values))),
+ "c": list(map(text_dtype,
+ map(lambda x: Timedelta(x)._repr_base(format="all"),
+ c._values))),
+ "d": list(map(text_dtype, d._values)),
+ "e": list(map(text_dtype, e._values)),
+ })
+
+ assert_frame_equal(result, expected)
+
+ def test_astype_str_float(self, text_dtype):
+ # see gh-11302
+ result = DataFrame([np.NaN]).astype(text_dtype)
+ expected = DataFrame(["nan"])
+
+ assert_frame_equal(result, expected)
+ result = DataFrame([1.12345678901234567890]).astype(text_dtype)
+
+ # < 1.14 truncates
+ # >= 1.14 preserves the full repr
+ val = ("1.12345678901" if _np_version_under1p14
+ else "1.1234567890123457")
+ expected = DataFrame([val])
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("dtype_class", [dict, Series])
+ def test_astype_dict_like(self, dtype_class):
+ # GH7271 & GH16717
+ a = Series(date_range('2010-01-04', periods=5))
+ b = Series(range(5))
+ c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
+ d = Series(['1.0', '2', '3.14', '4', '5.4'])
+ df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
+ original = df.copy(deep=True)
+
+ # change type of a subset of columns
+ dt1 = dtype_class({'b': 'str', 'd': 'float32'})
+ result = df.astype(dt1)
+ expected = DataFrame({
+ 'a': a,
+ 'b': Series(['0', '1', '2', '3', '4']),
+ 'c': c,
+ 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
+ assert_frame_equal(result, expected)
+ assert_frame_equal(df, original)
+
+ dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64})
+ result = df.astype(dt2)
+ expected = DataFrame({
+ 'a': a,
+ 'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
+ 'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
+ 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
+ assert_frame_equal(result, expected)
+ assert_frame_equal(df, original)
+
+ # change all columns
+ dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str})
+ assert_frame_equal(df.astype(dt3),
+ df.astype(str))
+ assert_frame_equal(df, original)
+
+ # error should be raised when using something other than column labels
+ # in the keys of the dtype dict
+ dt4 = dtype_class({'b': str, 2: str})
+ dt5 = dtype_class({'e': str})
+ pytest.raises(KeyError, df.astype, dt4)
+ pytest.raises(KeyError, df.astype, dt5)
+ assert_frame_equal(df, original)
+
+ # if the dtypes provided are the same as the original dtypes, the
+ # resulting DataFrame should be the same as the original DataFrame
+ dt6 = dtype_class({col: df[col].dtype for col in df.columns})
+ equiv = df.astype(dt6)
+ assert_frame_equal(df, equiv)
+ assert_frame_equal(df, original)
+
+ # GH 16717
+ # if dtypes provided is empty, the resulting DataFrame
+ # should be the same as the original DataFrame
+ dt7 = dtype_class({})
+ result = df.astype(dt7)
+ assert_frame_equal(df, equiv)
+ assert_frame_equal(df, original)
+
+ def test_astype_duplicate_col(self):
+ a1 = Series([1, 2, 3, 4, 5], name='a')
+ b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b')
+ a2 = Series([0, 1, 2, 3, 4], name='a')
+ df = concat([a1, b, a2], axis=1)
+
+ result = df.astype(str)
+ a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a')
+ b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str,
+ name='b')
+ a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a')
+ expected = concat([a1_str, b_str, a2_str], axis=1)
+ assert_frame_equal(result, expected)
+
+ result = df.astype({'a': 'str'})
+ expected = concat([a1_str, b, a2_str], axis=1)
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', [
+ 'category',
+ CategoricalDtype(),
+ CategoricalDtype(ordered=True),
+ CategoricalDtype(ordered=False),
+ CategoricalDtype(categories=list('abcdef')),
+ CategoricalDtype(categories=list('edba'), ordered=False),
+ CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr)
+ def test_astype_categorical(self, dtype):
+ # GH 18099
+ d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')}
+ df = DataFrame(d)
+ result = df.astype(dtype)
+ expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("cls", [
+ pd.api.types.CategoricalDtype,
+ pd.api.types.DatetimeTZDtype,
+ pd.api.types.IntervalDtype
+ ])
+ def test_astype_categoricaldtype_class_raises(self, cls):
+ df = DataFrame({"A": ['a', 'a', 'b', 'c']})
+ xpr = "Expected an instance of {}".format(cls.__name__)
+ with pytest.raises(TypeError, match=xpr):
+ df.astype({"A": cls})
+
+ with pytest.raises(TypeError, match=xpr):
+ df['A'].astype(cls)
+
+ @pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16'])
+ def test_astype_extension_dtypes(self, dtype):
+ # GH 22578
+ df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b'])
+
+ expected1 = pd.DataFrame({'a': integer_array([1, 3, 5],
+ dtype=dtype),
+ 'b': integer_array([2, 4, 6],
+ dtype=dtype)})
+ tm.assert_frame_equal(df.astype(dtype), expected1)
+ tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
+ tm.assert_frame_equal(df.astype(dtype).astype('float64'), df)
+
+ df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b'])
+ df['b'] = df['b'].astype(dtype)
+ expected2 = pd.DataFrame({'a': [1., 3., 5.],
+ 'b': integer_array([2, 4, 6],
+ dtype=dtype)})
+ tm.assert_frame_equal(df, expected2)
+
+ tm.assert_frame_equal(df.astype(dtype), expected1)
+ tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
+
+ @pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16'])
+ def test_astype_extension_dtypes_1d(self, dtype):
+ # GH 22578
+ df = pd.DataFrame({'a': [1., 2., 3.]})
+
+ expected1 = pd.DataFrame({'a': integer_array([1, 2, 3],
+ dtype=dtype)})
+ tm.assert_frame_equal(df.astype(dtype), expected1)
+ tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
+
+ df = pd.DataFrame({'a': [1., 2., 3.]})
+ df['a'] = df['a'].astype(dtype)
+ expected2 = pd.DataFrame({'a': integer_array([1, 2, 3],
+ dtype=dtype)})
+ tm.assert_frame_equal(df, expected2)
+
+ tm.assert_frame_equal(df.astype(dtype), expected1)
+ tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
+
+ @pytest.mark.parametrize("dtype", ['category', 'Int64'])
+ def test_astype_extension_dtypes_duplicate_col(self, dtype):
+ # GH 24704
+ a1 = Series([0, np.nan, 4], name='a')
+ a2 = Series([np.nan, 3, 5], name='a')
+ df = concat([a1, a2], axis=1)
+
+ result = df.astype(dtype)
+ expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', [
+ {100: 'float64', 200: 'uint64'}, 'category', 'float64'])
+ def test_astype_column_metadata(self, dtype):
+ # GH 19920
+ columns = pd.UInt64Index([100, 200, 300], name='foo')
+ df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
+ df = df.astype(dtype)
+ tm.assert_index_equal(df.columns, columns)
+
+ @pytest.mark.parametrize("dtype", ["M8", "m8"])
+ @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
+    def test_astype_from_datetimelike_to_object(self, dtype, unit):
+ # tests astype to object dtype
+ # gh-19223 / gh-12425
+ dtype = "{}[{}]".format(dtype, unit)
+ arr = np.array([[1, 2, 3]], dtype=dtype)
+ df = DataFrame(arr)
+ result = df.astype(object)
+ assert (result.dtypes == object).all()
+
+ if dtype.startswith('M8'):
+ assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit)
+ else:
+ assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit)
+
+ @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
+ @pytest.mark.parametrize("dtype", ["M8", "m8"])
+ @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
+ def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
+ # tests all units from numeric origination
+ # gh-19223 / gh-12425
+ dtype = "{}[{}]".format(dtype, unit)
+ arr = np.array([[1, 2, 3]], dtype=arr_dtype)
+ df = DataFrame(arr)
+ result = df.astype(dtype)
+ expected = DataFrame(arr.astype(dtype))
+
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
+ def test_astype_to_datetime_unit(self, unit):
+ # tests all units from datetime origination
+ # gh-19223
+ dtype = "M8[{}]".format(unit)
+ arr = np.array([[1, 2, 3]], dtype=dtype)
+ df = DataFrame(arr)
+ result = df.astype(dtype)
+ expected = DataFrame(arr.astype(dtype))
+
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("unit", ['ns'])
+ def test_astype_to_timedelta_unit_ns(self, unit):
+        # preserve the timedelta conversion
+ # gh-19223
+ dtype = "m8[{}]".format(unit)
+ arr = np.array([[1, 2, 3]], dtype=dtype)
+ df = DataFrame(arr)
+ result = df.astype(dtype)
+ expected = DataFrame(arr.astype(dtype))
+
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D'])
+ def test_astype_to_timedelta_unit(self, unit):
+ # coerce to float
+ # gh-19223
+ dtype = "m8[{}]".format(unit)
+ arr = np.array([[1, 2, 3]], dtype=dtype)
+ df = DataFrame(arr)
+ result = df.astype(dtype)
+ expected = DataFrame(df.values.astype(dtype).astype(float))
+
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
+ def test_astype_to_incorrect_datetimelike(self, unit):
+ # trying to astype a m to a M, or vice-versa
+ # gh-19224
+ dtype = "M8[{}]".format(unit)
+ other = "m8[{}]".format(unit)
+
+ df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
+ with pytest.raises(TypeError):
+ df.astype(other)
+
+ df = DataFrame(np.array([[1, 2, 3]], dtype=other))
+ with pytest.raises(TypeError):
+ df.astype(dtype)
+
+ def test_timedeltas(self):
+ df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3,
+ freq='D')),
+ B=Series([timedelta(days=i) for i in range(3)])))
+ result = df.get_dtype_counts().sort_index()
+ expected = Series(
+ {'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_index()
+ assert_series_equal(result, expected)
+
+ df['C'] = df['A'] + df['B']
+ expected = Series(
+ {'datetime64[ns]': 2, 'timedelta64[ns]': 1}).sort_values()
+ result = df.get_dtype_counts().sort_values()
+ assert_series_equal(result, expected)
+
+ # mixed int types
+ df['D'] = 1
+ expected = Series({'datetime64[ns]': 2,
+ 'timedelta64[ns]': 1,
+ 'int64': 1}).sort_values()
+ result = df.get_dtype_counts().sort_values()
+ assert_series_equal(result, expected)
+
+ def test_arg_for_errors_in_astype(self):
+ # issue #14878
+
+ df = DataFrame([1, 2, 3])
+
+ with pytest.raises(ValueError):
+ df.astype(np.float64, errors=True)
+
+ df.astype(np.int8, errors='ignore')
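+        # informal note: 'errors' accepts only 'raise' (the default) and
+        # 'ignore'; any other value, including the boolean above, raises
+        # ValueError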
+
+ @pytest.mark.parametrize('input_vals', [
+ ([1, 2]),
+ (['1', '2']),
+ (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
+ (list(pd.date_range('1/1/2011', periods=2, freq='H',
+ tz='US/Eastern'))),
+ ([pd.Interval(left=0, right=5)]),
+ ])
+ def test_constructor_list_str(self, input_vals, string_dtype):
+ # GH 16605
+ # Ensure that data elements are converted to strings when
+ # dtype is str, 'str', or 'U'
+
+ result = DataFrame({'A': input_vals}, dtype=string_dtype)
+ expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
+ assert_frame_equal(result, expected)
+
+ def test_constructor_list_str_na(self, string_dtype):
+
+ result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
+ expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("data, expected", [
+ # empty
+ (DataFrame(), True),
+ # multi-same
+ (DataFrame({"A": [1, 2], "B": [1, 2]}), True),
+ # multi-object
+ (DataFrame({"A": np.array([1, 2], dtype=object),
+ "B": np.array(["a", "b"], dtype=object)}), True),
+ # multi-extension
+ (DataFrame({"A": pd.Categorical(['a', 'b']),
+ "B": pd.Categorical(['a', 'b'])}), True),
+ # differ types
+ (DataFrame({"A": [1, 2], "B": [1., 2.]}), False),
+ # differ sizes
+ (DataFrame({"A": np.array([1, 2], dtype=np.int32),
+ "B": np.array([1, 2], dtype=np.int64)}), False),
+ # multi-extension differ
+ (DataFrame({"A": pd.Categorical(['a', 'b']),
+ "B": pd.Categorical(['b', 'c'])}), False),
+
+ ])
+ def test_is_homogeneous_type(self, data, expected):
+ assert data._is_homogeneous_type is expected
+
+ def test_asarray_homogenous(self):
+ df = pd.DataFrame({"A": pd.Categorical([1, 2]),
+ "B": pd.Categorical([1, 2])})
+ result = np.asarray(df)
+ # may change from object in the future
+ expected = np.array([[1, 1], [2, 2]], dtype='object')
+ tm.assert_numpy_array_equal(result, expected)
+
+
+class TestDataFrameDatetimeWithTZ(TestData):
+
+ def test_interleave(self):
+
+ # interleave with object
+ result = self.tzframe.assign(D='foo').values
+ expected = np.array([[Timestamp('2013-01-01 00:00:00'),
+ Timestamp('2013-01-02 00:00:00'),
+ Timestamp('2013-01-03 00:00:00')],
+ [Timestamp('2013-01-01 00:00:00-0500',
+ tz='US/Eastern'),
+ pd.NaT,
+ Timestamp('2013-01-03 00:00:00-0500',
+ tz='US/Eastern')],
+ [Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
+ pd.NaT,
+ Timestamp('2013-01-03 00:00:00+0100', tz='CET')],
+ ['foo', 'foo', 'foo']], dtype=object).T
+ tm.assert_numpy_array_equal(result, expected)
+
+ # interleave with only datetime64[ns]
+ result = self.tzframe.values
+ expected = np.array([[Timestamp('2013-01-01 00:00:00'),
+ Timestamp('2013-01-02 00:00:00'),
+ Timestamp('2013-01-03 00:00:00')],
+ [Timestamp('2013-01-01 00:00:00-0500',
+ tz='US/Eastern'),
+ pd.NaT,
+ Timestamp('2013-01-03 00:00:00-0500',
+ tz='US/Eastern')],
+ [Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
+ pd.NaT,
+ Timestamp('2013-01-03 00:00:00+0100',
+ tz='CET')]], dtype=object).T
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_astype(self):
+ # astype
+ expected = np.array([[Timestamp('2013-01-01 00:00:00'),
+ Timestamp('2013-01-02 00:00:00'),
+ Timestamp('2013-01-03 00:00:00')],
+ [Timestamp('2013-01-01 00:00:00-0500',
+ tz='US/Eastern'),
+ pd.NaT,
+ Timestamp('2013-01-03 00:00:00-0500',
+ tz='US/Eastern')],
+ [Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
+ pd.NaT,
+ Timestamp('2013-01-03 00:00:00+0100',
+ tz='CET')]],
+ dtype=object).T
+ result = self.tzframe.astype(object)
+ assert_frame_equal(result, DataFrame(
+ expected, index=self.tzframe.index, columns=self.tzframe.columns))
+
+ result = self.tzframe.astype('datetime64[ns]')
+ expected = DataFrame({'A': date_range('20130101', periods=3),
+ 'B': (date_range('20130101', periods=3,
+ tz='US/Eastern')
+ .tz_convert('UTC')
+ .tz_localize(None)),
+ 'C': (date_range('20130101', periods=3,
+ tz='CET')
+ .tz_convert('UTC')
+ .tz_localize(None))})
+ expected.iloc[1, 1] = pd.NaT
+ expected.iloc[1, 2] = pd.NaT
+ assert_frame_equal(result, expected)
+
+ def test_astype_str(self):
+ # str formatting
+ result = self.tzframe.astype(str)
+ expected = DataFrame([['2013-01-01', '2013-01-01 00:00:00-05:00',
+ '2013-01-01 00:00:00+01:00'],
+ ['2013-01-02', 'NaT', 'NaT'],
+ ['2013-01-03', '2013-01-03 00:00:00-05:00',
+ '2013-01-03 00:00:00+01:00']],
+ columns=self.tzframe.columns)
+ tm.assert_frame_equal(result, expected)
+
+ with option_context('display.max_columns', 20):
+ result = str(self.tzframe)
+ assert ('0 2013-01-01 2013-01-01 00:00:00-05:00 '
+ '2013-01-01 00:00:00+01:00') in result
+ assert ('1 2013-01-02 '
+ 'NaT NaT') in result
+ assert ('2 2013-01-03 2013-01-03 00:00:00-05:00 '
+ '2013-01-03 00:00:00+01:00') in result
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_duplicates.py b/contrib/python/pandas/py2/pandas/tests/frame/test_duplicates.py
new file mode 100644
index 00000000000..3396670fb58
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_duplicates.py
@@ -0,0 +1,466 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange, string_types
+
+from pandas import DataFrame, Series
+import pandas.util.testing as tm
+
+
[email protected]('subset', ['a', ['a'], ['a', 'B']])
+def test_duplicated_with_misspelled_column_name(subset):
+ # GH 19730
+ df = DataFrame({'A': [0, 0, 1],
+ 'B': [0, 0, 1],
+ 'C': [0, 0, 1]})
+
+ with pytest.raises(KeyError):
+ df.duplicated(subset)
+
+ with pytest.raises(KeyError):
+ df.drop_duplicates(subset)
+
+
+def test_duplicated_do_not_fail_on_wide_dataframes():
+ # gh-21524
+    # Given a wide DataFrame with many columns
+    # holding different (important!) values
+ data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
+ for i in range(100)}
+ df = DataFrame(data).T
+ result = df.duplicated()
+
+    # Then duplicated() returns a boolean Series and does not fail during
+    # the calculation. The actual values don't matter here, though they are
+    # usually all False in this case
+ assert isinstance(result, Series)
+ assert result.dtype == np.bool
+
+
[email protected]('keep, expected', [
+ ('first', Series([False, False, True, False, True])),
+ ('last', Series([True, True, False, False, False])),
+ (False, Series([True, True, True, False, True]))
+])
+def test_duplicated_keep(keep, expected):
+ df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
+
+ result = df.duplicated(keep=keep)
+ tm.assert_series_equal(result, expected)
+
+
[email protected](reason="GH#21720; nan/None falsely considered equal")
[email protected]('keep, expected', [
+ ('first', Series([False, False, True, False, True])),
+ ('last', Series([True, True, False, False, False])),
+ (False, Series([True, True, True, False, True]))
+])
+def test_duplicated_nan_none(keep, expected):
+ df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object)
+
+ result = df.duplicated(keep=keep)
+ tm.assert_series_equal(result, expected)
+
+
[email protected]('keep', ['first', 'last', False])
[email protected]('subset', [None, ['A', 'B'], 'A'])
+def test_duplicated_subset(subset, keep):
+ df = DataFrame({'A': [0, 1, 1, 2, 0],
+ 'B': ['a', 'b', 'b', 'c', 'a'],
+ 'C': [np.nan, 3, 3, None, np.nan]})
+
+ if subset is None:
+ subset = list(df.columns)
+ elif isinstance(subset, string_types):
+ # need to have a DataFrame, not a Series
+ # -> select columns with singleton list, not string
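+        # (df[['A']] keeps a DataFrame, while df['A'] would give a Series)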
+ subset = [subset]
+
+ expected = df[subset].duplicated(keep=keep)
+ result = df.duplicated(keep=keep, subset=subset)
+ tm.assert_series_equal(result, expected)
+
+
+def test_drop_duplicates():
+ df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'bar', 'foo'],
+ 'B': ['one', 'one', 'two', 'two',
+ 'two', 'two', 'one', 'two'],
+ 'C': [1, 1, 2, 2, 2, 2, 1, 2],
+ 'D': lrange(8)})
+
+ # single column
+ result = df.drop_duplicates('AAA')
+ expected = df[:2]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('AAA', keep='last')
+ expected = df.loc[[6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('AAA', keep=False)
+ expected = df.loc[[]]
+ tm.assert_frame_equal(result, expected)
+ assert len(result) == 0
+
+ # multi column
+ expected = df.loc[[0, 1, 2, 3]]
+ result = df.drop_duplicates(np.array(['AAA', 'B']))
+ tm.assert_frame_equal(result, expected)
+ result = df.drop_duplicates(['AAA', 'B'])
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates(('AAA', 'B'), keep='last')
+ expected = df.loc[[0, 5, 6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates(('AAA', 'B'), keep=False)
+ expected = df.loc[[0]]
+ tm.assert_frame_equal(result, expected)
+
+ # consider everything
+ df2 = df.loc[:, ['AAA', 'B', 'C']]
+
+ result = df2.drop_duplicates()
+    # for this data only: deduplicating on all columns coincides with
+    # deduplicating on ['AAA', 'B']
+ expected = df2.drop_duplicates(['AAA', 'B'])
+ tm.assert_frame_equal(result, expected)
+
+ result = df2.drop_duplicates(keep='last')
+ expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
+ tm.assert_frame_equal(result, expected)
+
+ result = df2.drop_duplicates(keep=False)
+ expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
+ tm.assert_frame_equal(result, expected)
+
+ # integers
+ result = df.drop_duplicates('C')
+ expected = df.iloc[[0, 2]]
+ tm.assert_frame_equal(result, expected)
+ result = df.drop_duplicates('C', keep='last')
+ expected = df.iloc[[-2, -1]]
+ tm.assert_frame_equal(result, expected)
+
+ df['E'] = df['C'].astype('int8')
+ result = df.drop_duplicates('E')
+ expected = df.iloc[[0, 2]]
+ tm.assert_frame_equal(result, expected)
+ result = df.drop_duplicates('E', keep='last')
+ expected = df.iloc[[-2, -1]]
+ tm.assert_frame_equal(result, expected)
+
+ # GH 11376
+ df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
+ 'y': [0, 6, 5, 5, 9, 1, 2]})
+ expected = df.loc[df.index != 3]
+ tm.assert_frame_equal(df.drop_duplicates(), expected)
+
+ df = DataFrame([[1, 0], [0, 2]])
+ tm.assert_frame_equal(df.drop_duplicates(), df)
+
+ df = DataFrame([[-2, 0], [0, -4]])
+ tm.assert_frame_equal(df.drop_duplicates(), df)
+
+ x = np.iinfo(np.int64).max / 3 * 2
+ df = DataFrame([[-x, x], [0, x + 4]])
+ tm.assert_frame_equal(df.drop_duplicates(), df)
+
+ df = DataFrame([[-x, x], [x, x + 4]])
+ tm.assert_frame_equal(df.drop_duplicates(), df)
+
+ # GH 11864
+ df = DataFrame([i] * 9 for i in range(16))
+ df = df.append([[1] + [0] * 8], ignore_index=True)
+
+ for keep in ['first', 'last', False]:
+ assert df.duplicated(keep=keep).sum() == 0
+
+
+def test_duplicated_on_empty_frame():
+ # GH 25184
+
+ df = DataFrame(columns=['a', 'b'])
+ dupes = df.duplicated('a')
+
+ result = df[dupes]
+ expected = df.copy()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_drop_duplicates_with_duplicate_column_names():
+ # GH17836
+ df = DataFrame([
+ [1, 2, 5],
+ [3, 4, 6],
+ [3, 4, 7]
+ ], columns=['a', 'a', 'b'])
+
+ result0 = df.drop_duplicates()
+ tm.assert_frame_equal(result0, df)
+
+ result1 = df.drop_duplicates('a')
+ expected1 = df[:2]
+ tm.assert_frame_equal(result1, expected1)
+
+
+def test_drop_duplicates_for_take_all():
+ df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
+ 'foo', 'bar', 'qux', 'foo'],
+ 'B': ['one', 'one', 'two', 'two',
+ 'two', 'two', 'one', 'two'],
+ 'C': [1, 1, 2, 2, 2, 2, 1, 2],
+ 'D': lrange(8)})
+
+ # single column
+ result = df.drop_duplicates('AAA')
+ expected = df.iloc[[0, 1, 2, 6]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('AAA', keep='last')
+ expected = df.iloc[[2, 5, 6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('AAA', keep=False)
+ expected = df.iloc[[2, 6]]
+ tm.assert_frame_equal(result, expected)
+
+ # multiple columns
+ result = df.drop_duplicates(['AAA', 'B'])
+ expected = df.iloc[[0, 1, 2, 3, 4, 6]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates(['AAA', 'B'], keep='last')
+ expected = df.iloc[[0, 1, 2, 5, 6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates(['AAA', 'B'], keep=False)
+ expected = df.iloc[[0, 1, 2, 6]]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_drop_duplicates_tuple():
+ df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'bar', 'foo'],
+ 'B': ['one', 'one', 'two', 'two',
+ 'two', 'two', 'one', 'two'],
+ 'C': [1, 1, 2, 2, 2, 2, 1, 2],
+ 'D': lrange(8)})
+
+ # single column
+ result = df.drop_duplicates(('AA', 'AB'))
+ expected = df[:2]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates(('AA', 'AB'), keep='last')
+ expected = df.loc[[6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates(('AA', 'AB'), keep=False)
+ expected = df.loc[[]] # empty df
+ assert len(result) == 0
+ tm.assert_frame_equal(result, expected)
+
+ # multi column
+ expected = df.loc[[0, 1, 2, 3]]
+ result = df.drop_duplicates((('AA', 'AB'), 'B'))
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]('df', [
+    DataFrame(),
+ DataFrame(columns=[]),
+ DataFrame(columns=['A', 'B', 'C']),
+ DataFrame(index=[]),
+ DataFrame(index=['A', 'B', 'C'])
+])
+def test_drop_duplicates_empty(df):
+ # GH 20516
+ result = df.drop_duplicates()
+ tm.assert_frame_equal(result, df)
+
+ result = df.copy()
+ result.drop_duplicates(inplace=True)
+ tm.assert_frame_equal(result, df)
+
+
+def test_drop_duplicates_NA():
+ # none
+ df = DataFrame({'A': [None, None, 'foo', 'bar',
+ 'foo', 'bar', 'bar', 'foo'],
+ 'B': ['one', 'one', 'two', 'two',
+ 'two', 'two', 'one', 'two'],
+ 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
+ 'D': lrange(8)})
+
+ # single column
+ result = df.drop_duplicates('A')
+ expected = df.loc[[0, 2, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('A', keep='last')
+ expected = df.loc[[1, 6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('A', keep=False)
+ expected = df.loc[[]] # empty df
+ tm.assert_frame_equal(result, expected)
+ assert len(result) == 0
+
+ # multi column
+ result = df.drop_duplicates(['A', 'B'])
+ expected = df.loc[[0, 2, 3, 6]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates(['A', 'B'], keep='last')
+ expected = df.loc[[1, 5, 6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates(['A', 'B'], keep=False)
+ expected = df.loc[[6]]
+ tm.assert_frame_equal(result, expected)
+
+ # nan
+ df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'bar', 'foo'],
+ 'B': ['one', 'one', 'two', 'two',
+ 'two', 'two', 'one', 'two'],
+ 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
+ 'D': lrange(8)})
+
+ # single column
+ result = df.drop_duplicates('C')
+ expected = df[:2]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('C', keep='last')
+ expected = df.loc[[3, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('C', keep=False)
+ expected = df.loc[[]] # empty df
+ tm.assert_frame_equal(result, expected)
+ assert len(result) == 0
+
+ # multi column
+ result = df.drop_duplicates(['C', 'B'])
+ expected = df.loc[[0, 1, 2, 4]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates(['C', 'B'], keep='last')
+ expected = df.loc[[1, 3, 6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates(['C', 'B'], keep=False)
+ expected = df.loc[[1]]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_drop_duplicates_NA_for_take_all():
+ # none
+ df = DataFrame({'A': [None, None, 'foo', 'bar',
+ 'foo', 'baz', 'bar', 'qux'],
+ 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]})
+
+ # single column
+ result = df.drop_duplicates('A')
+ expected = df.iloc[[0, 2, 3, 5, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('A', keep='last')
+ expected = df.iloc[[1, 4, 5, 6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('A', keep=False)
+ expected = df.iloc[[5, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ # nan
+
+ # single column
+ result = df.drop_duplicates('C')
+ expected = df.iloc[[0, 1, 5, 6]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('C', keep='last')
+ expected = df.iloc[[3, 5, 6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates('C', keep=False)
+ expected = df.iloc[[5, 6]]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_drop_duplicates_inplace():
+ orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'bar', 'foo'],
+ 'B': ['one', 'one', 'two', 'two',
+ 'two', 'two', 'one', 'two'],
+ 'C': [1, 1, 2, 2, 2, 2, 1, 2],
+ 'D': lrange(8)})
+
+ # single column
+ df = orig.copy()
+ df.drop_duplicates('A', inplace=True)
+ expected = orig[:2]
+ result = df
+ tm.assert_frame_equal(result, expected)
+
+ df = orig.copy()
+ df.drop_duplicates('A', keep='last', inplace=True)
+ expected = orig.loc[[6, 7]]
+ result = df
+ tm.assert_frame_equal(result, expected)
+
+ df = orig.copy()
+ df.drop_duplicates('A', keep=False, inplace=True)
+ expected = orig.loc[[]]
+ result = df
+ tm.assert_frame_equal(result, expected)
+ assert len(df) == 0
+
+ # multi column
+ df = orig.copy()
+ df.drop_duplicates(['A', 'B'], inplace=True)
+ expected = orig.loc[[0, 1, 2, 3]]
+ result = df
+ tm.assert_frame_equal(result, expected)
+
+ df = orig.copy()
+ df.drop_duplicates(['A', 'B'], keep='last', inplace=True)
+ expected = orig.loc[[0, 5, 6, 7]]
+ result = df
+ tm.assert_frame_equal(result, expected)
+
+ df = orig.copy()
+ df.drop_duplicates(['A', 'B'], keep=False, inplace=True)
+ expected = orig.loc[[0]]
+ result = df
+ tm.assert_frame_equal(result, expected)
+
+ # consider everything
+ orig2 = orig.loc[:, ['A', 'B', 'C']].copy()
+
+ df2 = orig2.copy()
+ df2.drop_duplicates(inplace=True)
+    # for this data only: matches deduplicating on ['A', 'B']
+ expected = orig2.drop_duplicates(['A', 'B'])
+ result = df2
+ tm.assert_frame_equal(result, expected)
+
+ df2 = orig2.copy()
+ df2.drop_duplicates(keep='last', inplace=True)
+ expected = orig2.drop_duplicates(['A', 'B'], keep='last')
+ result = df2
+ tm.assert_frame_equal(result, expected)
+
+ df2 = orig2.copy()
+ df2.drop_duplicates(keep=False, inplace=True)
+ expected = orig2.drop_duplicates(['A', 'B'], keep=False)
+ result = df2
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_indexing.py b/contrib/python/pandas/py2/pandas/tests/frame/test_indexing.py
new file mode 100644
index 00000000000..19b8ae4eb6e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_indexing.py
@@ -0,0 +1,3684 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from datetime import date, datetime, time, timedelta
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import iNaT
+from pandas.compat import long, lrange, lzip, map, range, zip
+
+from pandas.core.dtypes.common import is_float_dtype, is_integer, is_scalar
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series,
+ Timestamp, compat, date_range, isna, notna)
+import pandas.core.common as com
+from pandas.core.indexing import IndexingError
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal)
+
+from pandas.tseries.offsets import BDay
+
+
+class TestDataFrameIndexing(TestData):
+
+ def test_getitem(self):
+ # Slicing
+ sl = self.frame[:20]
+ assert len(sl.index) == 20
+
+ # Column access
+ for _, series in compat.iteritems(sl):
+ assert len(series.index) == 20
+ assert tm.equalContents(series.index, sl.index)
+
+ for key, _ in compat.iteritems(self.frame._series):
+ assert self.frame[key] is not None
+
+ assert 'random' not in self.frame
+ with pytest.raises(KeyError, match='random'):
+ self.frame['random']
+
+ df = self.frame.copy()
+ df['$10'] = np.random.randn(len(df))
+
+ ad = np.random.randn(len(df))
+ df['@awesome_domain'] = ad
+
+ with pytest.raises(KeyError):
+ df.__getitem__('df["$10"]')
+
+ res = df['@awesome_domain']
+ tm.assert_numpy_array_equal(ad, res.values)
+
+ def test_getitem_dupe_cols(self):
+ df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
+ with pytest.raises(KeyError):
+ df[['baf']]
+
+ def test_get(self):
+ b = self.frame.get('B')
+ assert_series_equal(b, self.frame['B'])
+
+ assert self.frame.get('foo') is None
+ assert_series_equal(self.frame.get('foo', self.frame['B']),
+ self.frame['B'])
+
+ @pytest.mark.parametrize("df", [
+ DataFrame(),
+ DataFrame(columns=list("AB")),
+ DataFrame(columns=list("AB"), index=range(3))
+ ])
+ def test_get_none(self, df):
+ # see gh-5652
+ assert df.get(None) is None
+
+ def test_loc_iterable(self):
+ idx = iter(['A', 'B', 'C'])
+ result = self.frame.loc[:, idx]
+ expected = self.frame.loc[:, ['A', 'B', 'C']]
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "idx_type",
+ [list, iter, Index, set,
+ lambda l: dict(zip(l, range(len(l)))),
+ lambda l: dict(zip(l, range(len(l)))).keys()],
+ ids=["list", "iter", "Index", "set", "dict", "dict_keys"])
+ @pytest.mark.parametrize("levels", [1, 2])
+ def test_getitem_listlike(self, idx_type, levels):
+ # GH 21294
+
+ if levels == 1:
+ frame, missing = self.frame, 'food'
+ else:
+ # MultiIndex columns
+ frame = DataFrame(np.random.randn(8, 3),
+ columns=Index([('foo', 'bar'), ('baz', 'qux'),
+ ('peek', 'aboo')],
+ name=('sth', 'sth2')))
+ missing = ('good', 'food')
+
+ keys = [frame.columns[1], frame.columns[0]]
+ idx = idx_type(keys)
+ idx_check = list(idx_type(keys))
+
+ result = frame[idx]
+
+ expected = frame.loc[:, idx_check]
+ expected.columns.names = frame.columns.names
+
+ assert_frame_equal(result, expected)
+
+ idx = idx_type(keys + [missing])
+ with pytest.raises(KeyError, match='not in index'):
+ frame[idx]
+
+ @pytest.mark.parametrize("val,expected", [
+ (2**63 - 1, Series([1])),
+ (2**63, Series([2])),
+ ])
+ def test_loc_uint64(self, val, expected):
+ # see gh-19399
+ df = DataFrame([1, 2], index=[2**63 - 1, 2**63])
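+        # informal note: 2**63 does not fit in int64, so this index is
+        # backed by uint64, which is the point of gh-19399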
+ result = df.loc[val]
+
+ expected.name = val
+ tm.assert_series_equal(result, expected)
+
+ def test_getitem_callable(self):
+ # GH 12533
+ result = self.frame[lambda x: 'A']
+ tm.assert_series_equal(result, self.frame.loc[:, 'A'])
+
+ result = self.frame[lambda x: ['A', 'B']]
+ tm.assert_frame_equal(result, self.frame.loc[:, ['A', 'B']])
+
+ df = self.frame[:3]
+ result = df[lambda x: [True, False, True]]
+ tm.assert_frame_equal(result, self.frame.iloc[[0, 2], :])
+
+ def test_setitem_list(self):
+
+ self.frame['E'] = 'foo'
+ data = self.frame[['A', 'B']]
+ self.frame[['B', 'A']] = data
+
+ assert_series_equal(self.frame['B'], data['A'], check_names=False)
+ assert_series_equal(self.frame['A'], data['B'], check_names=False)
+
+ msg = 'Columns must be same length as key'
+ with pytest.raises(ValueError, match=msg):
+ data[['A']] = self.frame[['A', 'B']]
+
+ msg = 'Length of values does not match length of index'
+ with pytest.raises(ValueError, match=msg):
+ data['A'] = range(len(data.index) - 1)
+
+ df = DataFrame(0, lrange(3), ['tt1', 'tt2'], dtype=np.int_)
+ df.loc[1, ['tt1', 'tt2']] = [1, 2]
+
+ result = df.loc[df.index[1], ['tt1', 'tt2']]
+ expected = Series([1, 2], df.columns, dtype=np.int_, name=1)
+ assert_series_equal(result, expected)
+
+ df['tt1'] = df['tt2'] = '0'
+ df.loc[df.index[1], ['tt1', 'tt2']] = ['1', '2']
+ result = df.loc[df.index[1], ['tt1', 'tt2']]
+ expected = Series(['1', '2'], df.columns, name=1)
+ assert_series_equal(result, expected)
+
+ def test_setitem_list_not_dataframe(self):
+ data = np.random.randn(len(self.frame), 2)
+ self.frame[['A', 'B']] = data
+ assert_almost_equal(self.frame[['A', 'B']].values, data)
+
+ def test_setitem_list_of_tuples(self):
+ tuples = lzip(self.frame['A'], self.frame['B'])
+ self.frame['tuples'] = tuples
+
+ result = self.frame['tuples']
+ expected = Series(tuples, index=self.frame.index, name='tuples')
+ assert_series_equal(result, expected)
+
+    def test_setitem_multi_index(self):
+ # GH7655, test that assigning to a sub-frame of a frame
+ # with multi-index columns aligns both rows and columns
+ it = ['jim', 'joe', 'jolie'], ['first', 'last'], \
+ ['left', 'center', 'right']
+
+ cols = MultiIndex.from_product(it)
+ index = pd.date_range('20141006', periods=20)
+ vals = np.random.randint(1, 1000, (len(index), len(cols)))
+ df = pd.DataFrame(vals, columns=cols, index=index)
+
+ i, j = df.index.values.copy(), it[-1][:]
+
+ np.random.shuffle(i)
+ df['jim'] = df['jolie'].loc[i, ::-1]
+ assert_frame_equal(df['jim'], df['jolie'])
+
+ np.random.shuffle(j)
+ df[('joe', 'first')] = df[('jolie', 'last')].loc[i, j]
+ assert_frame_equal(df[('joe', 'first')], df[('jolie', 'last')])
+
+ np.random.shuffle(j)
+ df[('joe', 'last')] = df[('jolie', 'first')].loc[i, j]
+ assert_frame_equal(df[('joe', 'last')], df[('jolie', 'first')])
+
+ def test_setitem_callable(self):
+ # GH 12533
+ df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})
+ df[lambda x: 'A'] = [11, 12, 13, 14]
+
+ exp = pd.DataFrame({'A': [11, 12, 13, 14], 'B': [5, 6, 7, 8]})
+ tm.assert_frame_equal(df, exp)
+
+ def test_setitem_other_callable(self):
+ # GH 13299
+ def inc(x):
+ return x + 1
+
+ df = pd.DataFrame([[-1, 1], [1, -1]])
+ df[df > 0] = inc
+
+ expected = pd.DataFrame([[-1, inc], [inc, -1]])
+ tm.assert_frame_equal(df, expected)
+
+ def test_getitem_boolean(self):
+ # boolean indexing
+ d = self.tsframe.index[10]
+ indexer = self.tsframe.index > d
+ indexer_obj = indexer.astype(object)
+
+ subindex = self.tsframe.index[indexer]
+ subframe = self.tsframe[indexer]
+
+ tm.assert_index_equal(subindex, subframe.index)
+ with pytest.raises(ValueError, match='Item wrong length'):
+ self.tsframe[indexer[:-1]]
+
+ subframe_obj = self.tsframe[indexer_obj]
+ assert_frame_equal(subframe_obj, subframe)
+
+ with pytest.raises(ValueError, match='boolean values only'):
+ self.tsframe[self.tsframe]
+
+ # test that Series work
+ indexer_obj = Series(indexer_obj, self.tsframe.index)
+
+ subframe_obj = self.tsframe[indexer_obj]
+ assert_frame_equal(subframe_obj, subframe)
+
+        # test that Series indexers reindex
+        # a warning is produced because the passed boolean key does not
+        # match the frame's index, so the key is reindexed first
+        # (not sure this is really necessary)
+ with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+ indexer_obj = indexer_obj.reindex(self.tsframe.index[::-1])
+ subframe_obj = self.tsframe[indexer_obj]
+ assert_frame_equal(subframe_obj, subframe)
+
+ # test df[df > 0]
+ for df in [self.tsframe, self.mixed_frame,
+ self.mixed_float, self.mixed_int]:
+ if compat.PY3 and df is self.mixed_frame:
+ continue
+
+ data = df._get_numeric_data()
+ bif = df[df > 0]
+ bifw = DataFrame({c: np.where(data[c] > 0, data[c], np.nan)
+ for c in data.columns},
+ index=data.index, columns=data.columns)
+
+ # add back other columns to compare
+ for c in df.columns:
+ if c not in bifw:
+ bifw[c] = df[c]
+ bifw = bifw.reindex(columns=df.columns)
+
+ assert_frame_equal(bif, bifw, check_dtype=False)
+ for c in df.columns:
+ if bif[c].dtype != bifw[c].dtype:
+ assert bif[c].dtype == df[c].dtype
+
+ def test_getitem_boolean_casting(self):
+
+ # don't upcast if we don't need to
+ df = self.tsframe.copy()
+ df['E'] = 1
+ df['E'] = df['E'].astype('int32')
+ df['E1'] = df['E'].copy()
+ df['F'] = 1
+ df['F'] = df['F'].astype('int64')
+ df['F1'] = df['F'].copy()
+
+ casted = df[df > 0]
+ result = casted.get_dtype_counts()
+ expected = Series({'float64': 4, 'int32': 2, 'int64': 2})
+ assert_series_equal(result, expected)
+
+ # int block splitting
+ df.loc[df.index[1:3], ['E1', 'F1']] = 0
+ casted = df[df > 0]
+ result = casted.get_dtype_counts()
+ expected = Series({'float64': 6, 'int32': 1, 'int64': 1})
+ assert_series_equal(result, expected)
+
+ # where dtype conversions
+ # GH 3733
+ df = DataFrame(data=np.random.randn(100, 50))
+ df = df.where(df > 0) # create nans
+ bools = df > 0
+ mask = isna(df)
+ expected = bools.astype(float).mask(mask)
+ result = bools.mask(mask)
+ assert_frame_equal(result, expected)
+
+ def test_getitem_boolean_list(self):
+ df = DataFrame(np.arange(12).reshape(3, 4))
+
+ def _checkit(lst):
+ result = df[lst]
+ expected = df.loc[df.index[lst]]
+ assert_frame_equal(result, expected)
+
+ _checkit([True, False, True])
+ _checkit([True, True, True])
+ _checkit([False, False, False])
+
+ def test_getitem_boolean_iadd(self):
+ arr = np.random.randn(5, 5)
+
+ df = DataFrame(arr.copy(), columns=['A', 'B', 'C', 'D', 'E'])
+
+ df[df < 0] += 1
+ arr[arr < 0] += 1
+
+ assert_almost_equal(df.values, arr)
+
+ def test_boolean_index_empty_corner(self):
+ # #2096
+ blah = DataFrame(np.empty([0, 1]), columns=['A'],
+ index=DatetimeIndex([]))
+
+ # both of these should succeed trivially
+ k = np.array([], bool)
+
+ blah[k]
+ blah[k] = 0
+
+ def test_getitem_ix_mixed_integer(self):
+ df = DataFrame(np.random.randn(4, 3),
+ index=[1, 10, 'C', 'E'], columns=[1, 2, 3])
+
+ result = df.iloc[:-1]
+ expected = df.loc[df.index[:-1]]
+ assert_frame_equal(result, expected)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = df.ix[[1, 10]]
+ expected = df.ix[Index([1, 10], dtype=object)]
+ assert_frame_equal(result, expected)
+
+ # 11320
+ df = pd.DataFrame({"rna": (1.5, 2.2, 3.2, 4.5),
+ -1000: [11, 21, 36, 40],
+ 0: [10, 22, 43, 34],
+ 1000: [0, 10, 20, 30]},
+ columns=['rna', -1000, 0, 1000])
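+        # integer column labels are looked up by label, not by position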
+ result = df[[1000]]
+ expected = df.iloc[:, [3]]
+ assert_frame_equal(result, expected)
+ result = df[[-1000]]
+ expected = df.iloc[:, [1]]
+ assert_frame_equal(result, expected)
+
+ def test_getitem_setitem_ix_negative_integers(self):
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = self.frame.ix[:, -1]
+ assert_series_equal(result, self.frame['D'])
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = self.frame.ix[:, [-1]]
+ assert_frame_equal(result, self.frame[['D']])
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = self.frame.ix[:, [-1, -2]]
+ assert_frame_equal(result, self.frame[['D', 'C']])
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ self.frame.ix[:, [-1]] = 0
+ assert (self.frame['D'] == 0).all()
+
+ df = DataFrame(np.random.randn(8, 4))
+ # ix does label-based indexing when having an integer index
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ with pytest.raises(KeyError):
+ df.ix[[-1]]
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ with pytest.raises(KeyError):
+ df.ix[:, [-1]]
+
+ # #1942
+ a = DataFrame(np.random.randn(20, 2),
+ index=[chr(x + 65) for x in range(20)])
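+        # index labels are 'A'..'T'; with a non-integer index, ix treats
+        # integer keys positionally, so -1 refers to row 'T'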
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ a.ix[-1] = a.ix[-2]
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ assert_series_equal(a.ix[-1], a.ix[-2], check_names=False)
+ assert a.ix[-1].name == 'T'
+ assert a.ix[-2].name == 'S'
+
+ def test_getattr(self):
+ assert_series_equal(self.frame.A, self.frame['A'])
+ pytest.raises(AttributeError, getattr, self.frame,
+ 'NONEXISTENT_NAME')
+
+ def test_setattr_column(self):
+ df = DataFrame({'foobar': 1}, index=lrange(10))
+
+ df.foobar = 5
+ assert (df.foobar == 5).all()
+
+ def test_setitem(self):
+ # not sure what else to do here
+ series = self.frame['A'][::2]
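+        # the half-length series aligns on the index; rows it lacks get NaN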
+ self.frame['col5'] = series
+ assert 'col5' in self.frame
+
+ assert len(series) == 15
+ assert len(self.frame) == 30
+
+ exp = np.ravel(np.column_stack((series.values, [np.nan] * 15)))
+ exp = Series(exp, index=self.frame.index, name='col5')
+ tm.assert_series_equal(self.frame['col5'], exp)
+
+ series = self.frame['A']
+ self.frame['col6'] = series
+ tm.assert_series_equal(series, self.frame['col6'], check_names=False)
+
+ with pytest.raises(KeyError):
+ self.frame[np.random.randn(len(self.frame) + 1)] = 1
+
+ # set ndarray
+ arr = np.random.randn(len(self.frame))
+ self.frame['col9'] = arr
+ assert (self.frame['col9'] == arr).all()
+
+ self.frame['col7'] = 5
+ assert((self.frame['col7'] == 5).all())
+
+ self.frame['col0'] = 3.14
+ assert((self.frame['col0'] == 3.14).all())
+
+ self.frame['col8'] = 'foo'
+ assert((self.frame['col8'] == 'foo').all())
+
+        # this is partially a view (e.g. some blocks are views),
+        # so setting should raise/warn
+ smaller = self.frame[:2]
+
+ with pytest.raises(com.SettingWithCopyError):
+ smaller['col10'] = ['1', '2']
+
+ assert smaller['col10'].dtype == np.object_
+ assert (smaller['col10'] == ['1', '2']).all()
+
+ # dtype changing GH4204
+ df = DataFrame([[0, 0]])
+ df.iloc[0] = np.nan
+ expected = DataFrame([[np.nan, np.nan]])
+ assert_frame_equal(df, expected)
+
+ df = DataFrame([[0, 0]])
+ df.loc[0] = np.nan
+ assert_frame_equal(df, expected)
+
+ @pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"])
+ def test_setitem_dtype(self, dtype):
+ arr = np.random.randn(len(self.frame))
+
+ self.frame[dtype] = np.array(arr, dtype=dtype)
+ assert self.frame[dtype].dtype.name == dtype
+
+ def test_setitem_tuple(self):
+ self.frame['A', 'B'] = self.frame['A']
+ assert_series_equal(self.frame['A', 'B'], self.frame[
+ 'A'], check_names=False)
+
+ def test_setitem_always_copy(self):
+ s = self.frame['A'].copy()
+ self.frame['E'] = s
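+        # the new column holds a copy, so writing NaN into it below must
+        # leave s untouched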
+
+ self.frame['E'][5:10] = np.nan
+ assert notna(s[5:10]).all()
+
+ def test_setitem_boolean(self):
+ df = self.frame.copy()
+ values = self.frame.values
+
+ df[df['A'] > 0] = 4
+ values[values[:, 0] > 0] = 4
+ assert_almost_equal(df.values, values)
+
+ # test that column reindexing works
+ series = df['A'] == 4
+ series = series.reindex(df.index[::-1])
+ df[series] = 1
+ values[values[:, 0] == 4] = 1
+ assert_almost_equal(df.values, values)
+
+ df[df > 0] = 5
+ values[values > 0] = 5
+ assert_almost_equal(df.values, values)
+
+ df[df == 5] = 0
+ values[values == 5] = 0
+ assert_almost_equal(df.values, values)
+
+ # a df that needs alignment first
+ df[df[:-1] < 0] = 2
+ np.putmask(values[:-1], values[:-1] < 0, 2)
+ assert_almost_equal(df.values, values)
+
+ # indexed with same shape but rows-reversed df
+ df[df[::-1] == 2] = 3
+ values[values == 2] = 3
+ assert_almost_equal(df.values, values)
+
+ msg = "Must pass DataFrame or 2-d ndarray with boolean values only"
+ with pytest.raises(TypeError, match=msg):
+ df[df * 0] = 2
+
+ # index with DataFrame
+ mask = df > np.abs(df)
+ expected = df.copy()
+ df[df > np.abs(df)] = np.nan
+ expected.values[mask.values] = np.nan
+ assert_frame_equal(df, expected)
+
+ # set from DataFrame
+ expected = df.copy()
+ df[df > np.abs(df)] = df * 2
+ np.putmask(expected.values, mask.values, df.values * 2)
+ assert_frame_equal(df, expected)
+
+ @pytest.mark.parametrize(
+ "mask_type",
+ [lambda df: df > np.abs(df) / 2,
+ lambda df: (df > np.abs(df) / 2).values],
+ ids=['dataframe', 'array'])
+ def test_setitem_boolean_mask(self, mask_type):
+
+ # Test for issue #18582
+ df = self.frame.copy()
+ mask = mask_type(df)
+
+ # index with boolean mask
+ result = df.copy()
+ result[mask] = np.nan
+
+ expected = df.copy()
+ expected.values[np.array(mask)] = np.nan
+ assert_frame_equal(result, expected)
+
+ def test_setitem_cast(self):
+ self.frame['D'] = self.frame['D'].astype('i8')
+ assert self.frame['D'].dtype == np.int64
+
+        # #669: should this not cast? setting a scalar now replaces the
+        # column with the value's dtype (int64 here), regardless of the
+        # existing dtype
+ self.frame['B'] = 0
+ assert self.frame['B'].dtype == np.int64
+
+ # cast if pass array of course
+ self.frame['B'] = np.arange(len(self.frame))
+ assert issubclass(self.frame['B'].dtype.type, np.integer)
+
+ self.frame['foo'] = 'bar'
+ self.frame['foo'] = 0
+ assert self.frame['foo'].dtype == np.int64
+
+ self.frame['foo'] = 'bar'
+ self.frame['foo'] = 2.5
+ assert self.frame['foo'].dtype == np.float64
+
+ self.frame['something'] = 0
+ assert self.frame['something'].dtype == np.int64
+ self.frame['something'] = 2
+ assert self.frame['something'].dtype == np.int64
+ self.frame['something'] = 2.5
+ assert self.frame['something'].dtype == np.float64
+
+ # GH 7704
+ # dtype conversion on setting
+ df = DataFrame(np.random.rand(30, 3), columns=tuple('ABC'))
+ df['event'] = np.nan
+ df.loc[10, 'event'] = 'foo'
+ result = df.get_dtype_counts().sort_values()
+ expected = Series({'float64': 3, 'object': 1}).sort_values()
+ assert_series_equal(result, expected)
+
+        # Test that the data type is preserved. #5782
+ df = DataFrame({'one': np.arange(6, dtype=np.int8)})
+ df.loc[1, 'one'] = 6
+ assert df.dtypes.one == np.dtype(np.int8)
+ df.one = np.int8(7)
+ assert df.dtypes.one == np.dtype(np.int8)
+
+ def test_setitem_boolean_column(self):
+ expected = self.frame.copy()
+ mask = self.frame['A'] > 0
+
+ self.frame.loc[mask, 'B'] = 0
+ expected.values[mask.values, 1] = 0
+
+ assert_frame_equal(self.frame, expected)
+
+ def test_frame_setitem_timestamp(self):
+ # GH#2155
+ columns = date_range(start='1/1/2012', end='2/1/2012', freq=BDay())
+ index = lrange(10)
+ data = DataFrame(columns=columns, index=index)
+ t = datetime(2012, 11, 1)
+ ts = Timestamp(t)
+ data[ts] = np.nan # works, mostly a smoke-test
+ assert np.isnan(data[ts]).all()
+
+ def test_setitem_corner(self):
+ # corner case
+ df = DataFrame({'B': [1., 2., 3.],
+ 'C': ['a', 'b', 'c']},
+ index=np.arange(3))
+ del df['B']
+ df['B'] = [1., 2., 3.]
+ assert 'B' in df
+ assert len(df.columns) == 2
+
+ df['A'] = 'beginning'
+ df['E'] = 'foo'
+ df['D'] = 'bar'
+ df[datetime.now()] = 'date'
+ df[datetime.now()] = 5.
+
+        # setting columns on an empty frame that has an index
+ dm = DataFrame(index=self.frame.index)
+ dm['A'] = 'foo'
+ dm['B'] = 'bar'
+ assert len(dm.columns) == 2
+ assert dm.values.dtype == np.object_
+
+ # upcast
+ dm['C'] = 1
+ assert dm['C'].dtype == np.int64
+
+ dm['E'] = 1.
+ assert dm['E'].dtype == np.float64
+
+ # set existing column
+ dm['A'] = 'bar'
+ assert 'bar' == dm['A'][0]
+
+ dm = DataFrame(index=np.arange(3))
+ dm['A'] = 1
+ dm['foo'] = 'bar'
+ del dm['foo']
+ dm['foo'] = 'bar'
+ assert dm['foo'].dtype == np.object_
+
+        dm['coercible'] = ['1', '2', '3']
+        assert dm['coercible'].dtype == np.object_
+
+ def test_setitem_corner2(self):
+ data = {"title": ['foobar', 'bar', 'foobar'] + ['foobar'] * 17,
+ "cruft": np.random.random(20)}
+
+ df = DataFrame(data)
+ ix = df[df['title'] == 'bar'].index
+
+ df.loc[ix, ['title']] = 'foobar'
+ df.loc[ix, ['cruft']] = 0
+
+ assert df.loc[1, 'title'] == 'foobar'
+ assert df.loc[1, 'cruft'] == 0
+
+ def test_setitem_ambig(self):
+ # Difficulties with mixed-type data
+ from decimal import Decimal
+
+ # Created as float type
+ dm = DataFrame(index=lrange(3), columns=lrange(3))
+
+        coercible_series = Series([Decimal(1) for _ in range(3)],
+                                  index=lrange(3))
+        uncoercible_series = Series(['foo', 'bzr', 'baz'], index=lrange(3))
+
+ dm[0] = np.ones(3)
+ assert len(dm.columns) == 3
+
+        dm[1] = coercible_series
+ assert len(dm.columns) == 3
+
+        dm[2] = uncoercible_series
+ assert len(dm.columns) == 3
+ assert dm[2].dtype == np.object_
+
+ def test_setitem_clear_caches(self):
+ # see gh-304
+ df = DataFrame({'x': [1.1, 2.1, 3.1, 4.1], 'y': [5.1, 6.1, 7.1, 8.1]},
+ index=[0, 1, 2, 3])
+ df.insert(2, 'z', np.nan)
+
+ # cache it
+ foo = df['z']
+ df.loc[df.index[2:], 'z'] = 42
+
+ expected = Series([np.nan, np.nan, 42, 42], index=df.index, name='z')
+
+ assert df['z'] is not foo
+ tm.assert_series_equal(df['z'], expected)
+
+ def test_setitem_None(self):
+ # GH #766
+ self.frame[None] = self.frame['A']
+ assert_series_equal(
+ self.frame.iloc[:, -1], self.frame['A'], check_names=False)
+ assert_series_equal(self.frame.loc[:, None], self.frame[
+ 'A'], check_names=False)
+ assert_series_equal(self.frame[None], self.frame[
+ 'A'], check_names=False)
+ repr(self.frame)
+
+ def test_setitem_empty(self):
+ # GH 9596
+ df = pd.DataFrame({'a': ['1', '2', '3'],
+ 'b': ['11', '22', '33'],
+ 'c': ['111', '222', '333']})
+
+ result = df.copy()
+ result.loc[result.b.isna(), 'a'] = result.a
+ assert_frame_equal(result, df)
+
+ @pytest.mark.parametrize("dtype", ["float", "int64"])
+ @pytest.mark.parametrize("kwargs", [
+ dict(),
+ dict(index=[1]),
+ dict(columns=["A"])
+ ])
+ def test_setitem_empty_frame_with_boolean(self, dtype, kwargs):
+ # see gh-10126
+ kwargs["dtype"] = dtype
+ df = DataFrame(**kwargs)
+
+ df2 = df.copy()
+ df[df > df2] = 47
+ assert_frame_equal(df, df2)
+
+ def test_setitem_scalars_no_index(self):
+ # GH16823 / 17894
+ df = DataFrame()
+ df['foo'] = 1
+ expected = DataFrame(columns=['foo']).astype(np.int64)
+ assert_frame_equal(df, expected)
+
+ def test_getitem_empty_frame_with_boolean(self):
+ # Test for issue #11859
+
+ df = pd.DataFrame()
+ df2 = df[df > 0]
+ assert_frame_equal(df, df2)
+
+ def test_delitem_corner(self):
+ f = self.frame.copy()
+ del f['D']
+ assert len(f.columns) == 3
+ pytest.raises(KeyError, f.__delitem__, 'D')
+ del f['B']
+ assert len(f.columns) == 2
+
+ def test_getitem_fancy_2d(self):
+ f = self.frame
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ assert_frame_equal(f.ix[:, ['B', 'A']],
+ f.reindex(columns=['B', 'A']))
+
+ subidx = self.frame.index[[5, 4, 1]]
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ assert_frame_equal(f.ix[subidx, ['B', 'A']],
+ f.reindex(index=subidx, columns=['B', 'A']))
+
+ # slicing rows, etc.
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ assert_frame_equal(f.ix[5:10], f[5:10])
+ assert_frame_equal(f.ix[5:10, :], f[5:10])
+ assert_frame_equal(f.ix[:5, ['A', 'B']],
+ f.reindex(index=f.index[:5],
+ columns=['A', 'B']))
+
+ # slice rows with labels, inclusive!
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ expected = f.ix[5:11]
+ result = f.ix[f.index[5]:f.index[10]]
+ assert_frame_equal(expected, result)
+
+ # slice columns
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ assert_frame_equal(f.ix[:, :2], f.reindex(columns=['A', 'B']))
+
+ # get view
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ exp = f.copy()
+ f.ix[5:10].values[:] = 5
+ exp.values[5:10] = 5
+ assert_frame_equal(f, exp)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ pytest.raises(ValueError, f.ix.__getitem__, f > 0.5)
+
+ def test_slice_floats(self):
+ index = [52195.504153, 52196.303147, 52198.369883]
+ df = DataFrame(np.random.rand(3, 2), index=index)
+
+ s1 = df.loc[52195.1:52196.5]
+ assert len(s1) == 2
+
+ s1 = df.loc[52195.1:52196.6]
+ assert len(s1) == 2
+
+ s1 = df.loc[52195.1:52198.9]
+ assert len(s1) == 3
+
+ def test_getitem_fancy_slice_integers_step(self):
+ df = DataFrame(np.random.randn(10, 5))
+
+ # this is OK
+ result = df.iloc[:8:2] # noqa
+ df.iloc[:8:2] = np.nan
+ assert isna(df.iloc[:8:2]).values.all()
+
+ def test_getitem_setitem_integer_slice_keyerrors(self):
+ df = DataFrame(np.random.randn(10, 5), index=lrange(0, 20, 2))
+
+ # this is OK
+ cp = df.copy()
+ cp.iloc[4:10] = 0
+ assert (cp.iloc[4:10] == 0).values.all()
+
+ # so is this
+ cp = df.copy()
+ cp.iloc[3:11] = 0
+ assert (cp.iloc[3:11] == 0).values.all()
+
+ result = df.iloc[2:6]
+ result2 = df.loc[3:11]
+ expected = df.reindex([4, 6, 8, 10])
+
+ assert_frame_equal(result, expected)
+ assert_frame_equal(result2, expected)
+
+ # non-monotonic, raise KeyError
+ df2 = df.iloc[lrange(5) + lrange(5, 10)[::-1]]
+ pytest.raises(KeyError, df2.loc.__getitem__, slice(3, 11))
+ pytest.raises(KeyError, df2.loc.__setitem__, slice(3, 11), 0)
+
+ def test_setitem_fancy_2d(self):
+
+ # case 1
+ frame = self.frame.copy()
+ expected = frame.copy()
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame.ix[:, ['B', 'A']] = 1
+ expected['B'] = 1.
+ expected['A'] = 1.
+ assert_frame_equal(frame, expected)
+
+ # case 2
+ frame = self.frame.copy()
+ frame2 = self.frame.copy()
+
+ expected = frame.copy()
+
+ subidx = self.frame.index[[5, 4, 1]]
+ values = np.random.randn(3, 2)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame.ix[subidx, ['B', 'A']] = values
+ frame2.ix[[5, 4, 1], ['B', 'A']] = values
+
+ expected['B'].ix[subidx] = values[:, 0]
+ expected['A'].ix[subidx] = values[:, 1]
+
+ assert_frame_equal(frame, expected)
+ assert_frame_equal(frame2, expected)
+
+ # case 3: slicing rows, etc.
+ frame = self.frame.copy()
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ expected1 = self.frame.copy()
+ frame.ix[5:10] = 1.
+ expected1.values[5:10] = 1.
+ assert_frame_equal(frame, expected1)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ expected2 = self.frame.copy()
+ arr = np.random.randn(5, len(frame.columns))
+ frame.ix[5:10] = arr
+ expected2.values[5:10] = arr
+ assert_frame_equal(frame, expected2)
+
+ # case 4
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame = self.frame.copy()
+ frame.ix[5:10, :] = 1.
+ assert_frame_equal(frame, expected1)
+ frame.ix[5:10, :] = arr
+ assert_frame_equal(frame, expected2)
+
+ # case 5
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame = self.frame.copy()
+ frame2 = self.frame.copy()
+
+ expected = self.frame.copy()
+ values = np.random.randn(5, 2)
+
+ frame.ix[:5, ['A', 'B']] = values
+ expected['A'][:5] = values[:, 0]
+ expected['B'][:5] = values[:, 1]
+ assert_frame_equal(frame, expected)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame2.ix[:5, [0, 1]] = values
+ assert_frame_equal(frame2, expected)
+
+ # case 6: slice rows with labels, inclusive!
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame = self.frame.copy()
+ expected = self.frame.copy()
+
+ frame.ix[frame.index[5]:frame.index[10]] = 5.
+ expected.values[5:11] = 5
+ assert_frame_equal(frame, expected)
+
+ # case 7: slice columns
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame = self.frame.copy()
+ frame2 = self.frame.copy()
+ expected = self.frame.copy()
+
+ # slice indices
+ frame.ix[:, 1:3] = 4.
+ expected.values[:, 1:3] = 4.
+ assert_frame_equal(frame, expected)
+
+ # slice with labels
+ frame.ix[:, 'B':'C'] = 4.
+ assert_frame_equal(frame, expected)
+
+ # new corner case of boolean slicing / setting
+ frame = DataFrame(lzip([2, 3, 9, 6, 7], [np.nan] * 5),
+ columns=['a', 'b'])
+ lst = [100]
+ lst.extend([np.nan] * 4)
+ expected = DataFrame(lzip([100, 3, 9, 6, 7], lst),
+ columns=['a', 'b'])
+ frame[frame['a'] == 2] = 100
+ assert_frame_equal(frame, expected)
+
+ def test_fancy_getitem_slice_mixed(self):
+ sliced = self.mixed_frame.iloc[:, -3:]
+ assert sliced['D'].dtype == np.float64
+
+ # get view with single block
+ # setting it triggers setting with copy
+ sliced = self.frame.iloc[:, -3:]
+
+ with pytest.raises(com.SettingWithCopyError):
+ sliced['C'] = 4.
+
+ assert (self.frame['C'] == 4).all()
+
+ def test_fancy_setitem_int_labels(self):
+ # integer index defers to label-based indexing
+
+ df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2))
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ tmp = df.copy()
+ exp = df.copy()
+ tmp.ix[[0, 2, 4]] = 5
+ exp.values[:3] = 5
+ assert_frame_equal(tmp, exp)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ tmp = df.copy()
+ exp = df.copy()
+ tmp.ix[6] = 5
+ exp.values[3] = 5
+ assert_frame_equal(tmp, exp)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ tmp = df.copy()
+ exp = df.copy()
+ tmp.ix[:, 2] = 5
+
+            # tmp correctly sets the dtype,
+            # so build exp the same way
+ exp[2] = 5
+ assert_frame_equal(tmp, exp)
+
+ def test_fancy_getitem_int_labels(self):
+ df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2))
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = df.ix[[4, 2, 0], [2, 0]]
+ expected = df.reindex(index=[4, 2, 0], columns=[2, 0])
+ assert_frame_equal(result, expected)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = df.ix[[4, 2, 0]]
+ expected = df.reindex(index=[4, 2, 0])
+ assert_frame_equal(result, expected)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = df.ix[4]
+ expected = df.xs(4)
+ assert_series_equal(result, expected)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = df.ix[:, 3]
+ expected = df[3]
+ assert_series_equal(result, expected)
+
+ def test_fancy_index_int_labels_exceptions(self):
+ df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2))
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+
+ # labels that aren't contained
+ pytest.raises(KeyError, df.ix.__setitem__,
+ ([0, 1, 2], [2, 3, 4]), 5)
+
+ # try to set indices not contained in frame
+ pytest.raises(KeyError, self.frame.ix.__setitem__,
+ ['foo', 'bar', 'baz'], 1)
+ pytest.raises(KeyError, self.frame.ix.__setitem__,
+ (slice(None, None), ['E']), 1)
+
+ # partial setting now allows this GH2578
+ # pytest.raises(KeyError, self.frame.ix.__setitem__,
+ # (slice(None, None), 'E'), 1)
+
+ def test_setitem_fancy_mixed_2d(self):
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ self.mixed_frame.ix[:5, ['C', 'B', 'A']] = 5
+ result = self.mixed_frame.ix[:5, ['C', 'B', 'A']]
+ assert (result.values == 5).all()
+
+ self.mixed_frame.ix[5] = np.nan
+ assert isna(self.mixed_frame.ix[5]).all()
+
+ self.mixed_frame.ix[5] = self.mixed_frame.ix[6]
+ assert_series_equal(self.mixed_frame.ix[5], self.mixed_frame.ix[6],
+ check_names=False)
+
+ # #1432
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ df = DataFrame({1: [1., 2., 3.],
+ 2: [3, 4, 5]})
+ assert df._is_mixed_type
+
+ df.ix[1] = [5, 10]
+
+ expected = DataFrame({1: [1., 5., 3.],
+ 2: [3, 10, 5]})
+
+ assert_frame_equal(df, expected)
+
+ def test_ix_align(self):
+ b = Series(np.random.randn(10), name=0).sort_values()
+ df_orig = DataFrame(np.random.randn(10, 4))
+ df = df_orig.copy()
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ df.ix[:, 0] = b
+ assert_series_equal(df.ix[:, 0].reindex(b.index), b)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ dft = df_orig.T
+ dft.ix[0, :] = b
+ assert_series_equal(dft.ix[0, :].reindex(b.index), b)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ df = df_orig.copy()
+ df.ix[:5, 0] = b
+ s = df.ix[:5, 0]
+ assert_series_equal(s, b.reindex(s.index))
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ dft = df_orig.T
+ dft.ix[0, :5] = b
+ s = dft.ix[0, :5]
+ assert_series_equal(s, b.reindex(s.index))
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ df = df_orig.copy()
+ idx = [0, 1, 3, 5]
+ df.ix[idx, 0] = b
+ s = df.ix[idx, 0]
+ assert_series_equal(s, b.reindex(s.index))
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ dft = df_orig.T
+ dft.ix[0, idx] = b
+ s = dft.ix[0, idx]
+ assert_series_equal(s, b.reindex(s.index))
+
+ def test_ix_frame_align(self):
+ b = DataFrame(np.random.randn(3, 4))
+ df_orig = DataFrame(np.random.randn(10, 4))
+ df = df_orig.copy()
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ df.ix[:3] = b
+ out = b.ix[:3]
+ assert_frame_equal(out, b)
+
+ b.sort_index(inplace=True)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ df = df_orig.copy()
+ df.ix[[0, 1, 2]] = b
+ out = df.ix[[0, 1, 2]].reindex(b.index)
+ assert_frame_equal(out, b)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ df = df_orig.copy()
+ df.ix[:3] = b
+ out = df.ix[:3]
+ assert_frame_equal(out, b.reindex(out.index))
+
+ def test_getitem_setitem_non_ix_labels(self):
+ df = tm.makeTimeDataFrame()
+
+ start, end = df.index[[5, 10]]
+
+ result = df.loc[start:end]
+ result2 = df[start:end]
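+        # label-based slicing is endpoint-inclusive, hence positions 5..10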
+ expected = df[5:11]
+ assert_frame_equal(result, expected)
+ assert_frame_equal(result2, expected)
+
+ result = df.copy()
+ result.loc[start:end] = 0
+ result2 = df.copy()
+ result2[start:end] = 0
+ expected = df.copy()
+ expected[5:11] = 0
+ assert_frame_equal(result, expected)
+ assert_frame_equal(result2, expected)
+
+ def test_ix_multi_take(self):
+ df = DataFrame(np.random.randn(3, 2))
+ rs = df.loc[df.index == 0, :]
+ xp = df.reindex([0])
+ assert_frame_equal(rs, xp)
+
+ """ #1321
+ df = DataFrame(np.random.randn(3, 2))
+ rs = df.loc[df.index==0, df.columns==1]
+ xp = df.reindex([0], [1])
+ assert_frame_equal(rs, xp)
+ """
+
+ def test_ix_multi_take_nonint_index(self):
+ df = DataFrame(np.random.randn(3, 2), index=['x', 'y', 'z'],
+ columns=['a', 'b'])
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ rs = df.ix[[0], [0]]
+ xp = df.reindex(['x'], columns=['a'])
+ assert_frame_equal(rs, xp)
+
+ def test_ix_multi_take_multiindex(self):
+ df = DataFrame(np.random.randn(3, 2), index=['x', 'y', 'z'],
+ columns=[['a', 'b'], ['1', '2']])
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ rs = df.ix[[0], [0]]
+ xp = df.reindex(['x'], columns=[('a', '1')])
+ assert_frame_equal(rs, xp)
+
+ def test_ix_dup(self):
+ idx = Index(['a', 'a', 'b', 'c', 'd', 'd'])
+ df = DataFrame(np.random.randn(len(idx), 3), idx)
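+        # label slices over a duplicated index include every occurrence of
+        # the boundary label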
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ sub = df.ix[:'d']
+ assert_frame_equal(sub, df)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ sub = df.ix['a':'c']
+ assert_frame_equal(sub, df.ix[0:4])
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ sub = df.ix['b':'d']
+ assert_frame_equal(sub, df.ix[2:])
+
+ def test_getitem_fancy_1d(self):
+ f = self.frame
+
+ # return self if no slicing...for now
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ assert f.ix[:, :] is f
+
+ # low dimensional slice
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ xs1 = f.ix[2, ['C', 'B', 'A']]
+ xs2 = f.xs(f.index[2]).reindex(['C', 'B', 'A'])
+ tm.assert_series_equal(xs1, xs2)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ ts1 = f.ix[5:10, 2]
+ ts2 = f[f.columns[2]][5:10]
+ tm.assert_series_equal(ts1, ts2)
+
+ # positional xs
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ xs1 = f.ix[0]
+ xs2 = f.xs(f.index[0])
+ tm.assert_series_equal(xs1, xs2)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ xs1 = f.ix[f.index[5]]
+ xs2 = f.xs(f.index[5])
+ tm.assert_series_equal(xs1, xs2)
+
+ # single column
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ assert_series_equal(f.ix[:, 'A'], f['A'])
+
+ # return view
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ exp = f.copy()
+ exp.values[5] = 4
+ f.ix[5][:] = 4
+ tm.assert_frame_equal(exp, f)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ exp.values[:, 1] = 6
+ f.ix[:, 1][:] = 6
+ tm.assert_frame_equal(exp, f)
+
+ # slice of mixed-frame
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ xs = self.mixed_frame.ix[5]
+ exp = self.mixed_frame.xs(self.mixed_frame.index[5])
+ tm.assert_series_equal(xs, exp)
+
+ def test_setitem_fancy_1d(self):
+
+ # case 1: set cross-section for indices
+ frame = self.frame.copy()
+ expected = self.frame.copy()
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame.ix[2, ['C', 'B', 'A']] = [1., 2., 3.]
+ expected['C'][2] = 1.
+ expected['B'][2] = 2.
+ expected['A'][2] = 3.
+ assert_frame_equal(frame, expected)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame2 = self.frame.copy()
+ frame2.ix[2, [3, 2, 1]] = [1., 2., 3.]
+ assert_frame_equal(frame, expected)
+
+ # case 2, set a section of a column
+ frame = self.frame.copy()
+ expected = self.frame.copy()
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ vals = np.random.randn(5)
+ expected.values[5:10, 2] = vals
+ frame.ix[5:10, 2] = vals
+ assert_frame_equal(frame, expected)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame2 = self.frame.copy()
+ frame2.ix[5:10, 'B'] = vals
+ assert_frame_equal(frame, expected)
+
+ # case 3: full xs
+ frame = self.frame.copy()
+ expected = self.frame.copy()
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame.ix[4] = 5.
+ expected.values[4] = 5.
+ assert_frame_equal(frame, expected)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame.ix[frame.index[4]] = 6.
+ expected.values[4] = 6.
+ assert_frame_equal(frame, expected)
+
+ # single column
+ frame = self.frame.copy()
+ expected = self.frame.copy()
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ frame.ix[:, 'A'] = 7.
+ expected['A'] = 7.
+ assert_frame_equal(frame, expected)
+
+ def test_getitem_fancy_scalar(self):
+ f = self.frame
+ ix = f.loc
+
+ # individual value
+ for col in f.columns:
+ ts = f[col]
+ for idx in f.index[::5]:
+ assert ix[idx, col] == ts[idx]
+
+ def test_setitem_fancy_scalar(self):
+ f = self.frame
+ expected = self.frame.copy()
+ ix = f.loc
+
+ # individual value
+ for j, col in enumerate(f.columns):
+ ts = f[col] # noqa
+ for idx in f.index[::5]:
+ i = f.index.get_loc(idx)
+ val = np.random.randn()
+ expected.values[i, j] = val
+
+ ix[idx, col] = val
+ assert_frame_equal(f, expected)
+
+ def test_getitem_fancy_boolean(self):
+ f = self.frame
+ ix = f.loc
+
+ expected = f.reindex(columns=['B', 'D'])
+ result = ix[:, [False, True, False, True]]
+ assert_frame_equal(result, expected)
+
+ expected = f.reindex(index=f.index[5:10], columns=['B', 'D'])
+ result = ix[f.index[5:10], [False, True, False, True]]
+ assert_frame_equal(result, expected)
+
+ boolvec = f.index > f.index[7]
+ expected = f.reindex(index=f.index[boolvec])
+ result = ix[boolvec]
+ assert_frame_equal(result, expected)
+ result = ix[boolvec, :]
+ assert_frame_equal(result, expected)
+
+ result = ix[boolvec, f.columns[2:]]
+ expected = f.reindex(index=f.index[boolvec],
+ columns=['C', 'D'])
+ assert_frame_equal(result, expected)
+
+ def test_setitem_fancy_boolean(self):
+ # from 2d, set with booleans
+ frame = self.frame.copy()
+ expected = self.frame.copy()
+
+ mask = frame['A'] > 0
+ frame.loc[mask] = 0.
+ expected.values[mask.values] = 0.
+ assert_frame_equal(frame, expected)
+
+ frame = self.frame.copy()
+ expected = self.frame.copy()
+ frame.loc[mask, ['A', 'B']] = 0.
+ expected.values[mask.values, :2] = 0.
+ assert_frame_equal(frame, expected)
+
+ def test_getitem_fancy_ints(self):
+ result = self.frame.iloc[[1, 4, 7]]
+ expected = self.frame.loc[self.frame.index[[1, 4, 7]]]
+ assert_frame_equal(result, expected)
+
+ result = self.frame.iloc[:, [2, 0, 1]]
+ expected = self.frame.loc[:, self.frame.columns[[2, 0, 1]]]
+ assert_frame_equal(result, expected)
+
+ def test_getitem_setitem_fancy_exceptions(self):
+ ix = self.frame.iloc
+ with pytest.raises(IndexingError, match='Too many indexers'):
+ ix[:, :, :]
+
+ with pytest.raises(IndexingError):
+ ix[:, :, :] = 1
+
+ def test_getitem_setitem_boolean_misaligned(self):
+ # boolean index misaligned labels
+ mask = self.frame['A'][::-1] > 1
+
+ result = self.frame.loc[mask]
+ expected = self.frame.loc[mask[::-1]]
+ assert_frame_equal(result, expected)
+
+ cp = self.frame.copy()
+ expected = self.frame.copy()
+ cp.loc[mask] = 0
+ expected.loc[mask] = 0
+ assert_frame_equal(cp, expected)
+
+ def test_getitem_setitem_boolean_multi(self):
+ df = DataFrame(np.random.randn(3, 2))
+
+ # get
+ k1 = np.array([True, False, True])
+ k2 = np.array([False, True])
+ result = df.loc[k1, k2]
+ expected = df.loc[[0, 2], [1]]
+ assert_frame_equal(result, expected)
+
+ expected = df.copy()
+ df.loc[np.array([True, False, True]),
+ np.array([False, True])] = 5
+ expected.loc[[0, 2], [1]] = 5
+ assert_frame_equal(df, expected)
+
+ def test_getitem_setitem_float_labels(self):
+ index = Index([1.5, 2, 3, 4, 5])
+ df = DataFrame(np.random.randn(5, 5), index=index)
+
+ result = df.loc[1.5:4]
+ expected = df.reindex([1.5, 2, 3, 4])
+ assert_frame_equal(result, expected)
+ assert len(result) == 4
+
+ result = df.loc[4:5]
+ expected = df.reindex([4, 5]) # reindex with int
+ assert_frame_equal(result, expected, check_index_type=False)
+ assert len(result) == 2
+
+ result = df.loc[4:5]
+ expected = df.reindex([4.0, 5.0]) # reindex with float
+ assert_frame_equal(result, expected)
+ assert len(result) == 2
+
+ # loc_float changes this to work properly
+ result = df.loc[1:2]
+ expected = df.iloc[0:2]
+ assert_frame_equal(result, expected)
+
+ df.loc[1:2] = 0
+ result = df[1:2]
+ assert (result == 0).all().all()
+
+ # #2727
+ index = Index([1.0, 2.5, 3.5, 4.5, 5.0])
+ df = DataFrame(np.random.randn(5, 5), index=index)
+
+        # positional slicing only via iloc; float bounds raise TypeError
+ pytest.raises(TypeError, lambda: df.iloc[1.0:5])
+
+ result = df.iloc[4:5]
+ expected = df.reindex([5.0])
+ assert_frame_equal(result, expected)
+ assert len(result) == 1
+
+ cp = df.copy()
+
+ with pytest.raises(TypeError):
+ cp.iloc[1.0:5] = 0
+
+ with pytest.raises(TypeError):
+ result = cp.iloc[1.0:5] == 0 # noqa
+
+ assert result.values.all()
+ assert (cp.iloc[0:1] == df.iloc[0:1]).values.all()
+
+ cp = df.copy()
+ cp.iloc[4:5] = 0
+ assert (cp.iloc[4:5] == 0).values.all()
+ assert (cp.iloc[0:4] == df.iloc[0:4]).values.all()
+
+ # float slicing
+ result = df.loc[1.0:5]
+ expected = df
+ assert_frame_equal(result, expected)
+ assert len(result) == 5
+
+ result = df.loc[1.1:5]
+ expected = df.reindex([2.5, 3.5, 4.5, 5.0])
+ assert_frame_equal(result, expected)
+ assert len(result) == 4
+
+ result = df.loc[4.51:5]
+ expected = df.reindex([5.0])
+ assert_frame_equal(result, expected)
+ assert len(result) == 1
+
+ result = df.loc[1.0:5.0]
+ expected = df.reindex([1.0, 2.5, 3.5, 4.5, 5.0])
+ assert_frame_equal(result, expected)
+ assert len(result) == 5
+
+ cp = df.copy()
+ cp.loc[1.0:5.0] = 0
+ result = cp.loc[1.0:5.0]
+ assert (result == 0).values.all()
+
+ def test_setitem_single_column_mixed(self):
+ df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
+ columns=['foo', 'bar', 'baz'])
+ df['str'] = 'qux'
+ df.loc[df.index[::2], 'str'] = np.nan
+ expected = np.array([np.nan, 'qux', np.nan, 'qux', np.nan],
+ dtype=object)
+ assert_almost_equal(df['str'].values, expected)
+
+ def test_setitem_single_column_mixed_datetime(self):
+ df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
+ columns=['foo', 'bar', 'baz'])
+
+ df['timestamp'] = Timestamp('20010102')
+
+ # check our dtypes
+ result = df.get_dtype_counts()
+ expected = Series({'float64': 3, 'datetime64[ns]': 1})
+ assert_series_equal(result, expected)
+
+ # set an allowable datetime64 type
+ df.loc['b', 'timestamp'] = iNaT
+ assert isna(df.loc['b', 'timestamp'])
+
+ # allow this syntax
+ df.loc['c', 'timestamp'] = np.nan
+ assert isna(df.loc['c', 'timestamp'])
+
+ # allow this syntax
+ df.loc['d', :] = np.nan
+ assert not isna(df.loc['c', :]).all()
+
+ # as of GH 3216 this will now work!
+        # try to set with a list-like item
+ # pytest.raises(
+ # Exception, df.loc.__setitem__, ('d', 'timestamp'), [np.nan])
+
+ def test_setitem_mixed_datetime(self):
+ # GH 9336
+ expected = DataFrame({'a': [0, 0, 0, 0, 13, 14],
+ 'b': [pd.datetime(2012, 1, 1),
+ 1,
+ 'x',
+ 'y',
+ pd.datetime(2013, 1, 1),
+ pd.datetime(2014, 1, 1)]})
+ df = pd.DataFrame(0, columns=list('ab'), index=range(6))
+ df['b'] = pd.NaT
+ df.loc[0, 'b'] = pd.datetime(2012, 1, 1)
+ df.loc[1, 'b'] = 1
+ df.loc[[2, 3], 'b'] = 'x', 'y'
+ A = np.array([[13, np.datetime64('2013-01-01T00:00:00')],
+ [14, np.datetime64('2014-01-01T00:00:00')]])
+ df.loc[[4, 5], ['a', 'b']] = A
+ assert_frame_equal(df, expected)
+
+ def test_setitem_frame(self):
+ piece = self.frame.loc[self.frame.index[:2], ['A', 'B']]
+ self.frame.loc[self.frame.index[-2]:, ['A', 'B']] = piece.values
+ result = self.frame.loc[self.frame.index[-2:], ['A', 'B']].values
+ expected = piece.values
+ assert_almost_equal(result, expected)
+
+ # GH 3216
+
+ # already aligned
+ f = self.mixed_frame.copy()
+ piece = DataFrame([[1., 2.], [3., 4.]],
+ index=f.index[0:2], columns=['A', 'B'])
+ key = (slice(None, 2), ['A', 'B'])
+ f.loc[key] = piece
+ assert_almost_equal(f.loc[f.index[0:2], ['A', 'B']].values,
+ piece.values)
+
+ # rows unaligned
+ f = self.mixed_frame.copy()
+ piece = DataFrame([[1., 2.], [3., 4.], [5., 6.], [7., 8.]],
+ index=list(f.index[0:2]) + ['foo', 'bar'],
+ columns=['A', 'B'])
+ key = (slice(None, 2), ['A', 'B'])
+ f.loc[key] = piece
+ assert_almost_equal(f.loc[f.index[0:2:], ['A', 'B']].values,
+ piece.values[0:2])
+
+ # key is unaligned with values
+ f = self.mixed_frame.copy()
+ piece = f.loc[f.index[:2], ['A']]
+ piece.index = f.index[-2:]
+ key = (slice(-2, None), ['A', 'B'])
+ f.loc[key] = piece
+ piece['B'] = np.nan
+ assert_almost_equal(f.loc[f.index[-2:], ['A', 'B']].values,
+ piece.values)
+
+ # ndarray
+ f = self.mixed_frame.copy()
+ piece = self.mixed_frame.loc[f.index[:2], ['A', 'B']]
+ key = (slice(-2, None), ['A', 'B'])
+ f.loc[key] = piece.values
+ assert_almost_equal(f.loc[f.index[-2:], ['A', 'B']].values,
+ piece.values)
+
+ # needs upcasting
+ df = DataFrame([[1, 2, 'foo'], [3, 4, 'bar']], columns=['A', 'B', 'C'])
+ df2 = df.copy()
+ df2.loc[:, ['A', 'B']] = df.loc[:, ['A', 'B']] + 0.5
+ expected = df.reindex(columns=['A', 'B'])
+ expected += 0.5
+ expected['C'] = df['C']
+ assert_frame_equal(df2, expected)
+
+ def test_setitem_frame_align(self):
+ piece = self.frame.loc[self.frame.index[:2], ['A', 'B']]
+ piece.index = self.frame.index[-2:]
+ piece.columns = ['A', 'B']
+ self.frame.loc[self.frame.index[-2:], ['A', 'B']] = piece
+ result = self.frame.loc[self.frame.index[-2:], ['A', 'B']].values
+ expected = piece.values
+ assert_almost_equal(result, expected)
+
+ def test_getitem_setitem_ix_duplicates(self):
+ # #1201
+ df = DataFrame(np.random.randn(5, 3),
+ index=['foo', 'foo', 'bar', 'baz', 'bar'])
+
+ result = df.loc['foo']
+ expected = df[:2]
+ assert_frame_equal(result, expected)
+
+ result = df.loc['bar']
+ expected = df.iloc[[2, 4]]
+ assert_frame_equal(result, expected)
+
+ result = df.loc['baz']
+ expected = df.iloc[3]
+ assert_series_equal(result, expected)
+
+ def test_getitem_ix_boolean_duplicates_multiple(self):
+ # #1201
+ df = DataFrame(np.random.randn(5, 3),
+ index=['foo', 'foo', 'bar', 'baz', 'bar'])
+
+ result = df.loc[['bar']]
+ exp = df.iloc[[2, 4]]
+ assert_frame_equal(result, exp)
+
+ result = df.loc[df[1] > 0]
+ exp = df[df[1] > 0]
+ assert_frame_equal(result, exp)
+
+ result = df.loc[df[0] > 0]
+ exp = df[df[0] > 0]
+ assert_frame_equal(result, exp)
+
+ def test_getitem_setitem_ix_bool_keyerror(self):
+ # #2199
+ df = DataFrame({'a': [1, 2, 3]})
+
+ pytest.raises(KeyError, df.loc.__getitem__, False)
+ pytest.raises(KeyError, df.loc.__getitem__, True)
+
+ pytest.raises(KeyError, df.loc.__setitem__, False, 0)
+ pytest.raises(KeyError, df.loc.__setitem__, True, 0)
+
+ def test_getitem_list_duplicates(self):
+ # #1943
+ df = DataFrame(np.random.randn(4, 4), columns=list('AABC'))
+ df.columns.name = 'foo'
+
+ result = df[['B', 'C']]
+ assert result.columns.name == 'foo'
+
+ expected = df.iloc[:, 2:]
+ assert_frame_equal(result, expected)
+
+ def test_get_value(self):
+ for idx in self.frame.index:
+ for col in self.frame.columns:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = self.frame.get_value(idx, col)
+ expected = self.frame[col][idx]
+ assert result == expected
+
+ def test_lookup(self):
+ def alt(df, rows, cols, dtype):
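+        # the frames hold no cells under any of the kwargs, so the mask has
+        # no True entries and the assignment below is a no-op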
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = [df.get_value(r, c) for r, c in zip(rows, cols)]
+ return np.array(result, dtype=dtype)
+
+ def testit(df):
+ rows = list(df.index) * len(df.columns)
+ cols = list(df.columns) * len(df.index)
+ result = df.lookup(rows, cols)
+ expected = alt(df, rows, cols, dtype=np.object_)
+ tm.assert_almost_equal(result, expected, check_dtype=False)
+
+ testit(self.mixed_frame)
+ testit(self.frame)
+
+ df = DataFrame({'label': ['a', 'b', 'a', 'c'],
+ 'mask_a': [True, True, False, True],
+ 'mask_b': [True, False, False, False],
+ 'mask_c': [False, True, False, True]})
+ df['mask'] = df.lookup(df.index, 'mask_' + df['label'])
+ exp_mask = alt(df, df.index, 'mask_' + df['label'], dtype=np.bool_)
+ tm.assert_series_equal(df['mask'], pd.Series(exp_mask, name='mask'))
+ assert df['mask'].dtype == np.bool_
+
+ with pytest.raises(KeyError):
+ self.frame.lookup(['xyz'], ['A'])
+
+ with pytest.raises(KeyError):
+ self.frame.lookup([self.frame.index[0]], ['xyz'])
+
+ with pytest.raises(ValueError, match='same size'):
+ self.frame.lookup(['a', 'b', 'c'], ['a'])
+
+ def test_set_value(self):
+ for idx in self.frame.index:
+ for col in self.frame.columns:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ self.frame.set_value(idx, col, 1)
+ assert self.frame[col][idx] == 1
+
+ def test_set_value_resize(self):
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ res = self.frame.set_value('foobar', 'B', 0)
+ assert res is self.frame
+ assert res.index[-1] == 'foobar'
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ assert res.get_value('foobar', 'B') == 0
+
+ self.frame.loc['foobar', 'qux'] = 0
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ assert self.frame.get_value('foobar', 'qux') == 0
+
+ res = self.frame.copy()
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ res3 = res.set_value('foobar', 'baz', 'sam')
+ assert res3['baz'].dtype == np.object_
+
+ res = self.frame.copy()
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ res3 = res.set_value('foobar', 'baz', True)
+ assert res3['baz'].dtype == np.object_
+
+ res = self.frame.copy()
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ res3 = res.set_value('foobar', 'baz', 5)
+ assert is_float_dtype(res3['baz'])
+ assert isna(res3['baz'].drop(['foobar'])).all()
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ pytest.raises(ValueError, res3.set_value, 'foobar', 'baz', 'sam')
+
+ def test_set_value_with_index_dtype_change(self):
+ df_orig = DataFrame(np.random.randn(3, 3),
+ index=lrange(3), columns=list('ABC'))
+
+        # this is actually ambiguous, as the 2 is interpreted positionally,
+        # so the column is not created
+ df = df_orig.copy()
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ df.set_value('C', 2, 1.0)
+ assert list(df.index) == list(df_orig.index) + ['C']
+ # assert list(df.columns) == list(df_orig.columns) + [2]
+
+ df = df_orig.copy()
+ df.loc['C', 2] = 1.0
+ assert list(df.index) == list(df_orig.index) + ['C']
+ # assert list(df.columns) == list(df_orig.columns) + [2]
+
+ # create both new
+ df = df_orig.copy()
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ df.set_value('C', 'D', 1.0)
+ assert list(df.index) == list(df_orig.index) + ['C']
+ assert list(df.columns) == list(df_orig.columns) + ['D']
+
+ df = df_orig.copy()
+ df.loc['C', 'D'] = 1.0
+ assert list(df.index) == list(df_orig.index) + ['C']
+ assert list(df.columns) == list(df_orig.columns) + ['D']
+
+ def test_get_set_value_no_partial_indexing(self):
+        # partial indexing with a MultiIndex raises an exception
+ index = MultiIndex.from_tuples([(0, 1), (0, 2), (1, 1), (1, 2)])
+ df = DataFrame(index=index, columns=lrange(4))
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ pytest.raises(KeyError, df.get_value, 0, 1)
+
+ def test_single_element_ix_dont_upcast(self):
+ self.frame['E'] = 1
+ assert issubclass(self.frame['E'].dtype.type, (int, np.integer))
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = self.frame.ix[self.frame.index[5], 'E']
+ assert is_integer(result)
+
+ result = self.frame.loc[self.frame.index[5], 'E']
+ assert is_integer(result)
+
+ # GH 11617
+ df = pd.DataFrame(dict(a=[1.23]))
+ df["b"] = 666
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = df.ix[0, "b"]
+ assert is_integer(result)
+ result = df.loc[0, "b"]
+ assert is_integer(result)
+
+ expected = Series([666], [0], name='b')
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = df.ix[[0], "b"]
+ assert_series_equal(result, expected)
+ result = df.loc[[0], "b"]
+ assert_series_equal(result, expected)
+
+ def test_iloc_row(self):
+ df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2))
+
+ result = df.iloc[1]
+ exp = df.loc[2]
+ assert_series_equal(result, exp)
+
+ result = df.iloc[2]
+ exp = df.loc[4]
+ assert_series_equal(result, exp)
+
+ # slice
+ result = df.iloc[slice(4, 8)]
+ expected = df.loc[8:14]
+ assert_frame_equal(result, expected)
+
+ # verify slice is view
+ # setting it makes it raise/warn
+ with pytest.raises(com.SettingWithCopyError):
+ result[2] = 0.
+
+ exp_col = df[2].copy()
+ exp_col[4:8] = 0.
+ assert_series_equal(df[2], exp_col)
+
+ # list of integers
+ result = df.iloc[[1, 2, 4, 6]]
+ expected = df.reindex(df.index[[1, 2, 4, 6]])
+ assert_frame_equal(result, expected)
+
+ def test_iloc_col(self):
+
+ df = DataFrame(np.random.randn(4, 10), columns=lrange(0, 20, 2))
+
+ result = df.iloc[:, 1]
+ exp = df.loc[:, 2]
+ assert_series_equal(result, exp)
+
+ result = df.iloc[:, 2]
+ exp = df.loc[:, 4]
+ assert_series_equal(result, exp)
+
+ # slice
+ result = df.iloc[:, slice(4, 8)]
+ expected = df.loc[:, 8:14]
+ assert_frame_equal(result, expected)
+
+        # verify the slice is a view
+        # and that setting it raises SettingWithCopyError
+ with pytest.raises(com.SettingWithCopyError):
+ result[8] = 0.
+
+ assert (df[8] == 0).all()
+
+ # list of integers
+ result = df.iloc[:, [1, 2, 4, 6]]
+ expected = df.reindex(columns=df.columns[[1, 2, 4, 6]])
+ assert_frame_equal(result, expected)
+
+ def test_iloc_duplicates(self):
+
+ df = DataFrame(np.random.rand(3, 3), columns=list('ABC'),
+ index=list('aab'))
+
+ result = df.iloc[0]
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result2 = df.ix[0]
+ assert isinstance(result, Series)
+ assert_almost_equal(result.values, df.values[0])
+ assert_series_equal(result, result2)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = df.T.iloc[:, 0]
+ result2 = df.T.ix[:, 0]
+ assert isinstance(result, Series)
+ assert_almost_equal(result.values, df.values[0])
+ assert_series_equal(result, result2)
+
+ # multiindex
+ df = DataFrame(np.random.randn(3, 3),
+ columns=[['i', 'i', 'j'], ['A', 'A', 'B']],
+ index=[['i', 'i', 'j'], ['X', 'X', 'Y']])
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ rs = df.iloc[0]
+ xp = df.ix[0]
+ assert_series_equal(rs, xp)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ rs = df.iloc[:, 0]
+ xp = df.T.ix[0]
+ assert_series_equal(rs, xp)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ rs = df.iloc[:, [0]]
+ xp = df.ix[:, [0]]
+ assert_frame_equal(rs, xp)
+
+ # #2259
+ df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1, 1, 2])
+ result = df.iloc[:, [0]]
+ expected = df.take([0], axis=1)
+ assert_frame_equal(result, expected)
+
+ def test_loc_duplicates(self):
+ # gh-17105
+
+ # insert a duplicate element to the index
+ trange = pd.date_range(start=pd.Timestamp(year=2017, month=1, day=1),
+ end=pd.Timestamp(year=2017, month=1, day=5))
+
+ trange = trange.insert(loc=5,
+ item=pd.Timestamp(year=2017, month=1, day=5))
+
+ df = pd.DataFrame(0, index=trange, columns=["A", "B"])
+ bool_idx = np.array([False, False, False, False, False, True])
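+        # the selected label is the duplicated timestamp, so loc assigns to
+        # both matching rows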
+
+ # assignment
+ df.loc[trange[bool_idx], "A"] = 6
+
+ expected = pd.DataFrame({'A': [0, 0, 0, 0, 6, 6],
+ 'B': [0, 0, 0, 0, 0, 0]},
+ index=trange)
+ tm.assert_frame_equal(df, expected)
+
+ # in-place
+ df = pd.DataFrame(0, index=trange, columns=["A", "B"])
+ df.loc[trange[bool_idx], "A"] += 6
+ tm.assert_frame_equal(df, expected)
+
+    def test_iloc_sparse_propagate_fill_value(self):
+ from pandas.core.sparse.api import SparseDataFrame
+ df = SparseDataFrame({'A': [999, 1]}, default_fill_value=999)
+ assert len(df['A'].sp_values) == len(df.iloc[:, 0].sp_values)
+
+ def test_iat(self):
+
+ for i, row in enumerate(self.frame.index):
+ for j, col in enumerate(self.frame.columns):
+ result = self.frame.iat[i, j]
+ expected = self.frame.at[row, col]
+ assert result == expected
+
+ def test_nested_exception(self):
+        # Ignore the strange way of triggering the problem
+        # (which may get fixed); it's just a way to reproduce
+        # the issue of re-raising an outer exception without
+        # a named argument
+ df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6],
+ "c": [7, 8, 9]}).set_index(["a", "b"])
+ index = list(df.index)
+ index[0] = ["a", "b"]
+ df.index = index
+
+ try:
+ repr(df)
+ except Exception as e:
+ assert type(e) != UnboundLocalError
+
+ @pytest.mark.parametrize("method,expected_values", [
+ ("nearest", [0, 1, 1, 2]),
+ ("pad", [np.nan, 0, 1, 1]),
+ ("backfill", [0, 1, 2, 2])
+ ])
+ def test_reindex_methods(self, method, expected_values):
+ df = pd.DataFrame({"x": list(range(5))})
+ target = np.array([-0.1, 0.9, 1.1, 1.5])
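+        # the source index is 0..4; e.g. 'pad' takes the last source label
+        # <= each target, so -0.1 maps to NaN and 1.5 maps to 1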
+
+ expected = pd.DataFrame({'x': expected_values}, index=target)
+ actual = df.reindex(target, method=method)
+ assert_frame_equal(expected, actual)
+
+ actual = df.reindex_like(df, method=method, tolerance=0)
+ assert_frame_equal(df, actual)
+ actual = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0])
+ assert_frame_equal(df, actual)
+
+ actual = df.reindex(target, method=method, tolerance=1)
+ assert_frame_equal(expected, actual)
+ actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1])
+ assert_frame_equal(expected, actual)
+
+ e2 = expected[::-1]
+ actual = df.reindex(target[::-1], method=method)
+ assert_frame_equal(e2, actual)
+
+ new_order = [3, 0, 2, 1]
+ e2 = expected.iloc[new_order]
+ actual = df.reindex(target[new_order], method=method)
+ assert_frame_equal(e2, actual)
+
+ switched_method = ('pad' if method == 'backfill'
+ else 'backfill' if method == 'pad'
+ else method)
+ actual = df[::-1].reindex(target, method=switched_method)
+ assert_frame_equal(expected, actual)
+
+ def test_reindex_methods_nearest_special(self):
+ df = pd.DataFrame({"x": list(range(5))})
+ target = np.array([-0.1, 0.9, 1.1, 1.5])
+
+ expected = pd.DataFrame({"x": [0, 1, 1, np.nan]}, index=target)
+ actual = df.reindex(target, method="nearest", tolerance=0.2)
+ assert_frame_equal(expected, actual)
+
+ expected = pd.DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target)
+ actual = df.reindex(target, method="nearest",
+ tolerance=[0.5, 0.01, 0.4, 0.1])
+ assert_frame_equal(expected, actual)
+
+ def test_reindex_frame_add_nat(self):
+ rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s')
+ df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng})
+
+ result = df.reindex(lrange(15))
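+        # rows 10..14 did not exist before the reindex, so B is NaT there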
+ assert np.issubdtype(result['B'].dtype, np.dtype('M8[ns]'))
+
+ mask = com.isna(result)['B']
+ assert mask[-5:].all()
+ assert not mask[:-5].any()
+
+ def test_set_dataframe_column_ns_dtype(self):
+ x = DataFrame([datetime.now(), datetime.now()])
+ assert x[0].dtype == np.dtype('M8[ns]')
+
+ def test_non_monotonic_reindex_methods(self):
+ dr = pd.date_range('2013-08-01', periods=6, freq='B')
+ data = np.random.randn(6, 1)
+ df = pd.DataFrame(data, index=dr, columns=list('A'))
+ df_rev = pd.DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]],
+ columns=list('A'))
+ # index is not monotonic increasing or decreasing
+ pytest.raises(ValueError, df_rev.reindex, df.index, method='pad')
+ pytest.raises(ValueError, df_rev.reindex, df.index, method='ffill')
+ pytest.raises(ValueError, df_rev.reindex, df.index, method='bfill')
+ pytest.raises(ValueError, df_rev.reindex, df.index, method='nearest')
+
+ def test_reindex_level(self):
+ from itertools import permutations
+ icol = ['jim', 'joe', 'jolie']
+
+ def verify_first_level(df, level, idx, check_index_type=True):
+ def f(val):
+ return np.nonzero((df[level] == val).to_numpy())[0]
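+            # gather the row positions for each idx value, in idx order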
+ i = np.concatenate(list(map(f, idx)))
+ left = df.set_index(icol).reindex(idx, level=level)
+ right = df.iloc[i].set_index(icol)
+ assert_frame_equal(left, right, check_index_type=check_index_type)
+
+ def verify(df, level, idx, indexer, check_index_type=True):
+ left = df.set_index(icol).reindex(idx, level=level)
+ right = df.iloc[indexer].set_index(icol)
+ assert_frame_equal(left, right, check_index_type=check_index_type)
+
+ df = pd.DataFrame({'jim': list('B' * 4 + 'A' * 2 + 'C' * 3),
+ 'joe': list('abcdeabcd')[::-1],
+ 'jolie': [10, 20, 30] * 3,
+ 'joline': np.random.randint(0, 1000, 9)})
+
+ target = [['C', 'B', 'A'], ['F', 'C', 'A', 'D'], ['A'],
+ ['A', 'B', 'C'], ['C', 'A', 'B'], ['C', 'B'], ['C', 'A'],
+ ['A', 'B'], ['B', 'A', 'C']]
+
+ for idx in target:
+ verify_first_level(df, 'jim', idx)
+
+ # reindex by these causes different MultiIndex levels
+ for idx in [['D', 'F'], ['A', 'C', 'B']]:
+ verify_first_level(df, 'jim', idx, check_index_type=False)
+
+ verify(df, 'joe', list('abcde'), [3, 2, 1, 0, 5, 4, 8, 7, 6])
+ verify(df, 'joe', list('abcd'), [3, 2, 1, 0, 5, 8, 7, 6])
+ verify(df, 'joe', list('abc'), [3, 2, 1, 8, 7, 6])
+ verify(df, 'joe', list('eca'), [1, 3, 4, 6, 8])
+ verify(df, 'joe', list('edc'), [0, 1, 4, 5, 6])
+ verify(df, 'joe', list('eadbc'), [3, 0, 2, 1, 4, 5, 8, 7, 6])
+ verify(df, 'joe', list('edwq'), [0, 4, 5])
+ verify(df, 'joe', list('wq'), [], check_index_type=False)
+
+ df = DataFrame({'jim': ['mid'] * 5 + ['btm'] * 8 + ['top'] * 7,
+ 'joe': ['3rd'] * 2 + ['1st'] * 3 + ['2nd'] * 3 +
+ ['1st'] * 2 + ['3rd'] * 3 + ['1st'] * 2 +
+ ['3rd'] * 3 + ['2nd'] * 2,
+                        # this needs to be jointly unique with jim and joe, or
+                        # reindexing will fail ~1.5% of the time; this works
+                        # out to needing unique groups of the same size as joe
+ 'jolie': np.concatenate([
+ np.random.choice(1000, x, replace=False)
+ for x in [2, 3, 3, 2, 3, 2, 3, 2]]),
+ 'joline': np.random.randn(20).round(3) * 10})
+
+ for idx in permutations(df['jim'].unique()):
+ for i in range(3):
+ verify_first_level(df, 'jim', idx[:i + 1])
+
+ i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10,
+ 11, 12, 13, 14, 18, 19, 15, 16, 17]
+ verify(df, 'joe', ['1st', '2nd', '3rd'], i)
+
+ i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6,
+ 7, 8, 9, 15, 16, 17, 18, 19, 13, 14]
+ verify(df, 'joe', ['3rd', '2nd', '1st'], i)
+
+ i = [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17]
+ verify(df, 'joe', ['2nd', '3rd'], i)
+
+ i = [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14]
+ verify(df, 'joe', ['3rd', '1st'], i)
+
+ def test_getitem_ix_float_duplicates(self):
+ df = pd.DataFrame(np.random.randn(3, 3),
+ index=[0.1, 0.2, 0.2], columns=list('abc'))
+ expect = df.iloc[1:]
+ assert_frame_equal(df.loc[0.2], expect)
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ assert_frame_equal(df.ix[0.2], expect)
+
+ expect = df.iloc[1:, 0]
+ assert_series_equal(df.loc[0.2, 'a'], expect)
+
+ df.index = [1, 0.2, 0.2]
+ expect = df.iloc[1:]
+ assert_frame_equal(df.loc[0.2], expect)
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ assert_frame_equal(df.ix[0.2], expect)
+
+ expect = df.iloc[1:, 0]
+ assert_series_equal(df.loc[0.2, 'a'], expect)
+
+ df = pd.DataFrame(np.random.randn(4, 3),
+ index=[1, 0.2, 0.2, 1], columns=list('abc'))
+ expect = df.iloc[1:-1]
+ assert_frame_equal(df.loc[0.2], expect)
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ assert_frame_equal(df.ix[0.2], expect)
+
+ expect = df.iloc[1:-1, 0]
+ assert_series_equal(df.loc[0.2, 'a'], expect)
+
+ df.index = [0.1, 0.2, 2, 0.2]
+ expect = df.iloc[[1, -1]]
+ assert_frame_equal(df.loc[0.2], expect)
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ assert_frame_equal(df.ix[0.2], expect)
+
+ expect = df.iloc[[1, -1], 0]
+ assert_series_equal(df.loc[0.2, 'a'], expect)
+
+ def test_getitem_sparse_column(self):
+ # https://github.com/pandas-dev/pandas/issues/23559
+ data = pd.SparseArray([0, 1])
+ df = pd.DataFrame({"A": data})
+ expected = pd.Series(data, name="A")
+ result = df['A']
+ tm.assert_series_equal(result, expected)
+
+ result = df.iloc[:, 0]
+ tm.assert_series_equal(result, expected)
+
+ result = df.loc[:, 'A']
+ tm.assert_series_equal(result, expected)
+
+ def test_setitem_with_sparse_value(self):
+ # GH8131
+ df = pd.DataFrame({'c_1': ['a', 'b', 'c'], 'n_1': [1., 2., 3.]})
+ sp_array = pd.SparseArray([0, 0, 1])
+ df['new_column'] = sp_array
+ assert_series_equal(df['new_column'],
+ pd.Series(sp_array, name='new_column'),
+ check_names=False)
+
+ def test_setitem_with_unaligned_sparse_value(self):
+ df = pd.DataFrame({'c_1': ['a', 'b', 'c'], 'n_1': [1., 2., 3.]})
+ sp_series = pd.Series(pd.SparseArray([0, 0, 1]), index=[2, 1, 0])
+ df['new_column'] = sp_series
+ exp = pd.Series(pd.SparseArray([1, 0, 0]), name='new_column')
+ assert_series_equal(df['new_column'], exp)
+
+ def test_setitem_with_unaligned_tz_aware_datetime_column(self):
+ # GH 12981
+ # Assignment of unaligned offset-aware datetime series.
+ # Make sure timezone isn't lost
+ column = pd.Series(pd.date_range('2015-01-01', periods=3, tz='utc'),
+ name='dates')
+ df = pd.DataFrame({'dates': column})
+ df['dates'] = column[[1, 0, 2]]
+ assert_series_equal(df['dates'], column)
+
+ df = pd.DataFrame({'dates': column})
+ df.loc[[0, 1, 2], 'dates'] = column[[1, 0, 2]]
+ assert_series_equal(df['dates'], column)
+
+ def test_setitem_datetime_coercion(self):
+ # gh-1048
+ df = pd.DataFrame({'c': [pd.Timestamp('2010-10-01')] * 3})
+ df.loc[0:1, 'c'] = np.datetime64('2008-08-08')
+ assert pd.Timestamp('2008-08-08') == df.loc[0, 'c']
+ assert pd.Timestamp('2008-08-08') == df.loc[1, 'c']
+ df.loc[2, 'c'] = date(2005, 5, 5)
+ assert pd.Timestamp('2005-05-05') == df.loc[2, 'c']
+
+ def test_setitem_datetimelike_with_inference(self):
+ # GH 7592
+ # assignment of timedeltas with NaT
+
+ one_hour = timedelta(hours=1)
+ df = DataFrame(index=date_range('20130101', periods=4))
+ df['A'] = np.array([1 * one_hour] * 4, dtype='m8[ns]')
+ df.loc[:, 'B'] = np.array([2 * one_hour] * 4, dtype='m8[ns]')
+ df.loc[:3, 'C'] = np.array([3 * one_hour] * 3, dtype='m8[ns]')
+ df.loc[:, 'D'] = np.array([4 * one_hour] * 4, dtype='m8[ns]')
+ df.loc[df.index[:3], 'E'] = np.array([5 * one_hour] * 3,
+ dtype='m8[ns]')
+ df['F'] = np.timedelta64('NaT')
+ df.loc[df.index[:-1], 'F'] = np.array([6 * one_hour] * 3,
+ dtype='m8[ns]')
+ df.loc[df.index[-3]:, 'G'] = date_range('20130101', periods=3)
+ df['H'] = np.datetime64('NaT')
+ result = df.dtypes
+ expected = Series([np.dtype('timedelta64[ns]')] * 6 +
+ [np.dtype('datetime64[ns]')] * 2,
+ index=list('ABCDEFGH'))
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('idxer', ['var', ['var']])
+ def test_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture):
+ # GH 11365
+ tz = tz_naive_fixture
+ idx = date_range(start='2015-07-12', periods=3, freq='H', tz=tz)
+ expected = DataFrame(1.2, index=idx, columns=['var'])
+ result = DataFrame(index=idx, columns=['var'])
+ result.loc[:, idxer] = expected
+ tm.assert_frame_equal(result, expected)
+
+ def test_at_time_between_time_datetimeindex(self):
+ index = date_range("2012-01-01", "2012-01-05", freq='30min')
+ df = DataFrame(np.random.randn(len(index), 5), index=index)
+ akey = time(12, 0, 0)
+ bkey = slice(time(13, 0, 0), time(14, 0, 0))
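+        # at 30min frequency each day spans 48 rows: 12:00 falls at positions
+        # 24, 72, 120, 168, and 13:00-14:00 covers three rows per day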
+ ainds = [24, 72, 120, 168]
+ binds = [26, 27, 28, 74, 75, 76, 122, 123, 124, 170, 171, 172]
+
+ result = df.at_time(akey)
+ expected = df.loc[akey]
+ expected2 = df.iloc[ainds]
+ assert_frame_equal(result, expected)
+ assert_frame_equal(result, expected2)
+ assert len(result) == 4
+
+ result = df.between_time(bkey.start, bkey.stop)
+ expected = df.loc[bkey]
+ expected2 = df.iloc[binds]
+ assert_frame_equal(result, expected)
+ assert_frame_equal(result, expected2)
+ assert len(result) == 12
+
+ result = df.copy()
+ result.loc[akey] = 0
+ result = result.loc[akey]
+ expected = df.loc[akey].copy()
+ expected.loc[:] = 0
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.loc[akey] = 0
+ result.loc[akey] = df.iloc[ainds]
+ assert_frame_equal(result, df)
+
+ result = df.copy()
+ result.loc[bkey] = 0
+ result = result.loc[bkey]
+ expected = df.loc[bkey].copy()
+ expected.loc[:] = 0
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.loc[bkey] = 0
+ result.loc[bkey] = df.iloc[binds]
+ assert_frame_equal(result, df)
+
+ def test_xs(self):
+ idx = self.frame.index[5]
+ xs = self.frame.xs(idx)
+ for item, value in compat.iteritems(xs):
+ if np.isnan(value):
+ assert np.isnan(self.frame[item][idx])
+ else:
+ assert value == self.frame[item][idx]
+
+ # mixed-type xs
+ test_data = {
+ 'A': {'1': 1, '2': 2},
+ 'B': {'1': '1', '2': '2', '3': '3'},
+ }
+ frame = DataFrame(test_data)
+ xs = frame.xs('1')
+ assert xs.dtype == np.object_
+ assert xs['A'] == 1
+ assert xs['B'] == '1'
+
+ with pytest.raises(KeyError):
+ self.tsframe.xs(self.tsframe.index[0] - BDay())
+
+ # xs get column
+ series = self.frame.xs('A', axis=1)
+ expected = self.frame['A']
+ assert_series_equal(series, expected)
+
+ # view is returned if possible
+ series = self.frame.xs('A', axis=1)
+ series[:] = 5
+ assert (expected == 5).all()
+
+ def test_xs_corner(self):
+ # pathological mixed-type reordering case
+ df = DataFrame(index=[0])
+ df['A'] = 1.
+ df['B'] = 'foo'
+ df['C'] = 2.
+ df['D'] = 'bar'
+ df['E'] = 3.
+
+ xs = df.xs(0)
+ exp = pd.Series([1., 'foo', 2., 'bar', 3.],
+ index=list('ABCDE'), name=0)
+ tm.assert_series_equal(xs, exp)
+
+ # no columns but Index(dtype=object)
+ df = DataFrame(index=['a', 'b', 'c'])
+ result = df.xs('a')
+ expected = Series([], name='a', index=pd.Index([], dtype=object))
+ assert_series_equal(result, expected)
+
+ def test_xs_duplicates(self):
+ df = DataFrame(np.random.randn(5, 2), index=['b', 'b', 'c', 'b', 'a'])
+
+ cross = df.xs('c')
+ exp = df.iloc[2]
+ assert_series_equal(cross, exp)
+
+ def test_xs_keep_level(self):
+ df = (DataFrame({'day': {0: 'sat', 1: 'sun'},
+ 'flavour': {0: 'strawberry', 1: 'strawberry'},
+ 'sales': {0: 10, 1: 12},
+ 'year': {0: 2008, 1: 2008}})
+ .set_index(['year', 'flavour', 'day']))
+ result = df.xs('sat', level='day', drop_level=False)
+ expected = df[:1]
+ assert_frame_equal(result, expected)
+
+ result = df.xs([2008, 'sat'], level=['year', 'day'], drop_level=False)
+ assert_frame_equal(result, expected)
+
+ def test_xs_view(self):
+        # in 0.14 this will return a view if possible, a copy otherwise, but
+        # this is numpy-dependent
+
+ dm = DataFrame(np.arange(20.).reshape(4, 5),
+ index=lrange(4), columns=lrange(5))
+
+ dm.xs(2)[:] = 10
+ assert (dm.xs(2) == 10).all()
+
+ def test_index_namedtuple(self):
+ from collections import namedtuple
+ IndexType = namedtuple("IndexType", ["a", "b"])
+ idx1 = IndexType("foo", "bar")
+ idx2 = IndexType("baz", "bof")
+ index = Index([idx1, idx2],
+ name="composite_index", tupleize_cols=False)
+ df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"])
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = df.ix[IndexType("foo", "bar")]["A"]
+ assert result == 1
+
+ result = df.loc[IndexType("foo", "bar")]["A"]
+ assert result == 1
+
+ def test_boolean_indexing(self):
+ idx = lrange(3)
+ cols = ['A', 'B', 'C']
+ df1 = DataFrame(index=idx, columns=cols,
+ data=np.array([[0.0, 0.5, 1.0],
+ [1.5, 2.0, 2.5],
+ [3.0, 3.5, 4.0]],
+ dtype=float))
+ df2 = DataFrame(index=idx, columns=cols,
+ data=np.ones((len(idx), len(cols))))
+
+ expected = DataFrame(index=idx, columns=cols,
+ data=np.array([[0.0, 0.5, 1.0],
+ [1.5, 2.0, -1],
+ [-1, -1, -1]], dtype=float))
+
+ df1[df1 > 2.0 * df2] = -1
+ assert_frame_equal(df1, expected)
+ with pytest.raises(ValueError, match='Item wrong length'):
+ df1[df1.index[:-1] > 2] = -1
+
+ def test_boolean_indexing_mixed(self):
+ df = DataFrame({
+ long(0): {35: np.nan, 40: np.nan, 43: np.nan,
+ 49: np.nan, 50: np.nan},
+ long(1): {35: np.nan,
+ 40: 0.32632316859446198,
+ 43: np.nan,
+ 49: 0.32632316859446198,
+ 50: 0.39114724480578139},
+ long(2): {35: np.nan, 40: np.nan, 43: 0.29012581014105987,
+ 49: np.nan, 50: np.nan},
+ long(3): {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan,
+ 50: np.nan},
+ long(4): {35: 0.34215328467153283, 40: np.nan, 43: np.nan,
+ 49: np.nan, 50: np.nan},
+ 'y': {35: 0, 40: 0, 43: 0, 49: 0, 50: 1}})
+
+ # mixed int/float ok
+ df2 = df.copy()
+ df2[df2 > 0.3] = 1
+ expected = df.copy()
+ expected.loc[40, 1] = 1
+ expected.loc[49, 1] = 1
+ expected.loc[50, 1] = 1
+ expected.loc[35, 4] = 1
+ assert_frame_equal(df2, expected)
+
+ df['foo'] = 'test'
+ msg = ("boolean setting on mixed-type|"
+ "not supported between|"
+ "unorderable types")
+ with pytest.raises(TypeError, match=msg):
+ # TODO: This message should be the same in PY2/PY3
+ df[df > 0.3] = 1
+
+ def test_where(self):
+ default_frame = DataFrame(np.random.randn(5, 3),
+ columns=['A', 'B', 'C'])
+
+ def _safe_add(df):
+ # only add to the numeric items
+ def is_ok(s):
+ return (issubclass(s.dtype.type, (np.integer, np.floating)) and
+ s.dtype != 'uint8')
+
+ return DataFrame(dict((c, s + 1) if is_ok(s) else (c, s)
+ for c, s in compat.iteritems(df)))
+
+ def _check_get(df, cond, check_dtypes=True):
+ other1 = _safe_add(df)
+ rs = df.where(cond, other1)
+ rs2 = df.where(cond.values, other1)
+ for k, v in rs.iteritems():
+ exp = Series(
+ np.where(cond[k], df[k], other1[k]), index=v.index)
+ assert_series_equal(v, exp, check_names=False)
+ assert_frame_equal(rs, rs2)
+
+ # dtypes
+ if check_dtypes:
+ assert (rs.dtypes == df.dtypes).all()
+
+ # check getting
+ for df in [default_frame, self.mixed_frame,
+ self.mixed_float, self.mixed_int]:
+ if compat.PY3 and df is self.mixed_frame:
+ with pytest.raises(TypeError):
+ df > 0
+ continue
+ cond = df > 0
+ _check_get(df, cond)
+
+        # upcasting case (GH #2794)
+ df = DataFrame({c: Series([1] * 3, dtype=c)
+ for c in ['float32', 'float64',
+ 'int32', 'int64']})
+ df.iloc[1, :] = 0
+ result = df.where(df >= 0).get_dtype_counts()
+
+ # when we don't preserve boolean casts
+ #
+ # expected = Series({ 'float32' : 1, 'float64' : 3 })
+
+        expected = Series({'float32': 1, 'float64': 1,
+                           'int32': 1, 'int64': 1})
+ assert_series_equal(result, expected)
+
+ # aligning
+ def _check_align(df, cond, other, check_dtypes=True):
+ rs = df.where(cond, other)
+ for i, k in enumerate(rs.columns):
+ result = rs[k]
+ d = df[k].values
+ c = cond[k].reindex(df[k].index).fillna(False).values
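+                # rows dropped by the alignment count as False, as where() does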
+
+ if is_scalar(other):
+ o = other
+ else:
+ if isinstance(other, np.ndarray):
+ o = Series(other[:, i], index=result.index).values
+ else:
+ o = other[k].values
+
+ new_values = d if c.all() else np.where(c, d, o)
+ expected = Series(new_values, index=result.index, name=k)
+
+ # since we can't always have the correct numpy dtype
+ # as numpy doesn't know how to downcast, don't check
+ assert_series_equal(result, expected, check_dtype=False)
+
+ # dtypes
+ # can't check dtype when other is an ndarray
+
+ if check_dtypes and not isinstance(other, np.ndarray):
+ assert (rs.dtypes == df.dtypes).all()
+
+ for df in [self.mixed_frame, self.mixed_float, self.mixed_int]:
+ if compat.PY3 and df is self.mixed_frame:
+ with pytest.raises(TypeError):
+ df > 0
+ continue
+
+ # other is a frame
+ cond = (df > 0)[1:]
+ _check_align(df, cond, _safe_add(df))
+
+ # check other is ndarray
+ cond = df > 0
+ _check_align(df, cond, (_safe_add(df).values))
+
+ # integers are upcast, so don't check the dtypes
+ cond = df > 0
+ check_dtypes = all(not issubclass(s.type, np.integer)
+ for s in df.dtypes)
+ _check_align(df, cond, np.nan, check_dtypes=check_dtypes)
+
+ # invalid conditions
+ df = default_frame
+ err1 = (df + 1).values[0:2, :]
+ pytest.raises(ValueError, df.where, cond, err1)
+
+ err2 = cond.iloc[:2, :].values
+ other1 = _safe_add(df)
+ pytest.raises(ValueError, df.where, err2, other1)
+
+ pytest.raises(ValueError, df.mask, True)
+ pytest.raises(ValueError, df.mask, 0)
+
+ # where inplace
+ def _check_set(df, cond, check_dtypes=True):
+ dfi = df.copy()
+ econd = cond.reindex_like(df).fillna(True)
+ expected = dfi.mask(~econd)
+
+ dfi.where(cond, np.nan, inplace=True)
+ assert_frame_equal(dfi, expected)
+
+            # dtypes (and confirm upcasts)
+ if check_dtypes:
+ for k, v in compat.iteritems(df.dtypes):
+ if issubclass(v.type, np.integer) and not cond[k].all():
+ v = np.dtype('float64')
+ assert dfi[k].dtype == v
+
+ for df in [default_frame, self.mixed_frame, self.mixed_float,
+ self.mixed_int]:
+ if compat.PY3 and df is self.mixed_frame:
+ with pytest.raises(TypeError):
+ df > 0
+ continue
+
+ cond = df > 0
+ _check_set(df, cond)
+
+ cond = df >= 0
+ _check_set(df, cond)
+
+            # aligning
+ cond = (df >= 0)[1:]
+ _check_set(df, cond)
+
+ # GH 10218
+ # test DataFrame.where with Series slicing
+ df = DataFrame({'a': range(3), 'b': range(4, 7)})
+ result = df.where(df['a'] == 1)
+ expected = df[df['a'] == 1].reindex(df.index)
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("klass", [list, tuple, np.array])
+ def test_where_array_like(self, klass):
+ # see gh-15414
+ df = DataFrame({"a": [1, 2, 3]})
+ cond = [[False], [True], [True]]
+ expected = DataFrame({"a": [np.nan, 2, 3]})
+
+ result = df.where(klass(cond))
+ assert_frame_equal(result, expected)
+
+ df["b"] = 2
+ expected["b"] = [2, np.nan, 2]
+ cond = [[False, True], [True, False], [True, True]]
+
+ result = df.where(klass(cond))
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("cond", [
+ [[1], [0], [1]],
+ Series([[2], [5], [7]]),
+ DataFrame({"a": [2, 5, 7]}),
+ [["True"], ["False"], ["True"]],
+ [[Timestamp("2017-01-01")],
+ [pd.NaT], [Timestamp("2017-01-02")]]
+ ])
+ def test_where_invalid_input_single(self, cond):
+ # see gh-15414: only boolean arrays accepted
+ df = DataFrame({"a": [1, 2, 3]})
+ msg = "Boolean array expected for the condition"
+
+ with pytest.raises(ValueError, match=msg):
+ df.where(cond)
+
+ @pytest.mark.parametrize("cond", [
+ [[0, 1], [1, 0], [1, 1]],
+ Series([[0, 2], [5, 0], [4, 7]]),
+ [["False", "True"], ["True", "False"],
+ ["True", "True"]],
+ DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}),
+ [[pd.NaT, Timestamp("2017-01-01")],
+ [Timestamp("2017-01-02"), pd.NaT],
+ [Timestamp("2017-01-03"), Timestamp("2017-01-03")]]
+ ])
+ def test_where_invalid_input_multiple(self, cond):
+ # see gh-15414: only boolean arrays accepted
+ df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]})
+ msg = "Boolean array expected for the condition"
+
+ with pytest.raises(ValueError, match=msg):
+ df.where(cond)
+
+ def test_where_dataframe_col_match(self):
+ df = DataFrame([[1, 2, 3], [4, 5, 6]])
+ cond = DataFrame([[True, False, True], [False, False, True]])
+
+ result = df.where(cond)
+ expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]])
+ tm.assert_frame_equal(result, expected)
+
+ # this *does* align, though has no matching columns
+ cond.columns = ["a", "b", "c"]
+ result = df.where(cond)
+ expected = DataFrame(np.nan, index=df.index, columns=df.columns)
+ tm.assert_frame_equal(result, expected)
+
+ def test_where_ndframe_align(self):
+ msg = "Array conditional must be same shape as self"
+ df = DataFrame([[1, 2, 3], [4, 5, 6]])
+
+ cond = [True]
+ with pytest.raises(ValueError, match=msg):
+ df.where(cond)
+
+ expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]])
+
+ out = df.where(Series(cond))
+ tm.assert_frame_equal(out, expected)
+
+ cond = np.array([False, True, False, True])
+ with pytest.raises(ValueError, match=msg):
+ df.where(cond)
+
+ expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]])
+
+ out = df.where(Series(cond))
+ tm.assert_frame_equal(out, expected)
+
+ def test_where_bug(self):
+ # see gh-2793
+ df = DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [
+ 4.0, 3.0, 2.0, 1.0]}, dtype='float64')
+ expected = DataFrame({'a': [np.nan, np.nan, 3.0, 4.0], 'b': [
+ 4.0, 3.0, np.nan, np.nan]}, dtype='float64')
+ result = df.where(df > 2, np.nan)
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(result > 2, np.nan, inplace=True)
+ assert_frame_equal(result, expected)
+
+ def test_where_bug_mixed(self, sint_dtype):
+ # see gh-2793
+ df = DataFrame({"a": np.array([1, 2, 3, 4], dtype=sint_dtype),
+ "b": np.array([4.0, 3.0, 2.0, 1.0],
+ dtype="float64")})
+
+ expected = DataFrame({"a": [np.nan, np.nan, 3.0, 4.0],
+ "b": [4.0, 3.0, np.nan, np.nan]},
+ dtype="float64")
+
+ result = df.where(df > 2, np.nan)
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(result > 2, np.nan, inplace=True)
+ assert_frame_equal(result, expected)
+
+ def test_where_bug_transposition(self):
+ # see gh-7506
+ a = DataFrame({0: [1, 2], 1: [3, 4], 2: [5, 6]})
+ b = DataFrame({0: [np.nan, 8], 1: [9, np.nan], 2: [np.nan, np.nan]})
+ do_not_replace = b.isna() | (a > b)
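+        # keep a where b is NaN or a > b; take b everywhere else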
+
+ expected = a.copy()
+ expected[~do_not_replace] = b
+
+ result = a.where(do_not_replace, b)
+ assert_frame_equal(result, expected)
+
+ a = DataFrame({0: [4, 6], 1: [1, 0]})
+ b = DataFrame({0: [np.nan, 3], 1: [3, np.nan]})
+ do_not_replace = b.isna() | (a > b)
+
+ expected = a.copy()
+ expected[~do_not_replace] = b
+
+ result = a.where(do_not_replace, b)
+ assert_frame_equal(result, expected)
+
+ def test_where_datetime(self):
+
+ # GH 3311
+ df = DataFrame(dict(A=date_range('20130102', periods=5),
+ B=date_range('20130104', periods=5),
+ C=np.random.randn(5)))
+
+ stamp = datetime(2013, 1, 3)
+ with pytest.raises(TypeError):
+ df > stamp
+
+ result = df[df.iloc[:, :-1] > stamp]
+
+ expected = df.copy()
+ expected.loc[[0, 1], 'A'] = np.nan
+ expected.loc[:, 'C'] = np.nan
+ assert_frame_equal(result, expected)
+
+ def test_where_none(self):
+ # GH 4667
+ # setting with None changes dtype
+ df = DataFrame({'series': Series(range(10))}).astype(float)
+ df[df > 7] = None
+ expected = DataFrame(
+ {'series': Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])})
+ assert_frame_equal(df, expected)
+
+ # GH 7656
+ df = DataFrame([{'A': 1, 'B': np.nan, 'C': 'Test'}, {
+ 'A': np.nan, 'B': 'Test', 'C': np.nan}])
+ msg = 'boolean setting on mixed-type'
+
+ with pytest.raises(TypeError, match=msg):
+ df.where(~isna(df), None, inplace=True)
+
+ def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self):
+ # see gh-21947
+ df = pd.DataFrame(columns=["a"])
+ cond = df.applymap(lambda x: x > 0)
+
+ result = df.where(cond)
+ tm.assert_frame_equal(result, df)
+
+ def test_where_align(self):
+
+ def create():
+ df = DataFrame(np.random.randn(10, 3))
+ df.iloc[3:5, 0] = np.nan
+ df.iloc[4:6, 1] = np.nan
+ df.iloc[5:8, 2] = np.nan
+ return df
+
+ # series
+ df = create()
+ expected = df.fillna(df.mean())
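+        # df.mean() is a Series indexed by column label; axis='columns'
+        # aligns it with the columns, so each NaN gets its column mean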
+ result = df.where(pd.notna(df), df.mean(), axis='columns')
+ assert_frame_equal(result, expected)
+
+ df.where(pd.notna(df), df.mean(), inplace=True, axis='columns')
+ assert_frame_equal(df, expected)
+
+ df = create().fillna(0)
+ expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0])
+ result = df.where(df > 0, df[0], axis='index')
+ assert_frame_equal(result, expected)
+ result = df.where(df > 0, df[0], axis='rows')
+ assert_frame_equal(result, expected)
+
+ # frame
+ df = create()
+ expected = df.fillna(1)
+ result = df.where(pd.notna(df), DataFrame(
+ 1, index=df.index, columns=df.columns))
+ assert_frame_equal(result, expected)
+
+ def test_where_complex(self):
+ # GH 6345
+ expected = DataFrame(
+ [[1 + 1j, 2], [np.nan, 4 + 1j]], columns=['a', 'b'])
+ df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=['a', 'b'])
+ df[df.abs() >= 5] = np.nan
+ assert_frame_equal(df, expected)
+
+ def test_where_axis(self):
+ # GH 9736
+ df = DataFrame(np.random.randn(2, 2))
+ mask = DataFrame([[False, False], [False, False]])
+ s = Series([0, 1])
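+        # mask is all-False, so every value comes from s; the axis keyword
+        # decides whether s is aligned with the rows or the columns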
+
+ expected = DataFrame([[0, 0], [1, 1]], dtype='float64')
+ result = df.where(mask, s, axis='index')
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(mask, s, axis='index', inplace=True)
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame([[0, 1], [0, 1]], dtype='float64')
+ result = df.where(mask, s, axis='columns')
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(mask, s, axis='columns', inplace=True)
+ assert_frame_equal(result, expected)
+
+ # Upcast needed
+ df = DataFrame([[1, 2], [3, 4]], dtype='int64')
+ mask = DataFrame([[False, False], [False, False]])
+ s = Series([0, np.nan])
+
+ expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype='float64')
+ result = df.where(mask, s, axis='index')
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(mask, s, axis='index', inplace=True)
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame([[0, np.nan], [0, np.nan]])
+ result = df.where(mask, s, axis='columns')
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame({0: np.array([0, 0], dtype='int64'),
+ 1: np.array([np.nan, np.nan], dtype='float64')})
+ result = df.copy()
+ result.where(mask, s, axis='columns', inplace=True)
+ assert_frame_equal(result, expected)
+
+ # Multiple dtypes (=> multiple Blocks)
+ df = pd.concat([
+ DataFrame(np.random.randn(10, 2)),
+ DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype='int64')],
+ ignore_index=True, axis=1)
+ mask = DataFrame(False, columns=df.columns, index=df.index)
+ s1 = Series(1, index=df.columns)
+ s2 = Series(2, index=df.index)
+
+ result = df.where(mask, s1, axis='columns')
+ expected = DataFrame(1.0, columns=df.columns, index=df.index)
+ expected[2] = expected[2].astype('int64')
+ expected[3] = expected[3].astype('int64')
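+        # filling with the integer 1 preserves the int64 columns' dtype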
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(mask, s1, axis='columns', inplace=True)
+ assert_frame_equal(result, expected)
+
+ result = df.where(mask, s2, axis='index')
+ expected = DataFrame(2.0, columns=df.columns, index=df.index)
+ expected[2] = expected[2].astype('int64')
+ expected[3] = expected[3].astype('int64')
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.where(mask, s2, axis='index', inplace=True)
+ assert_frame_equal(result, expected)
+
+ # DataFrame vs DataFrame
+ d1 = df.copy().drop(1, axis=0)
+ expected = df.copy()
+ expected.loc[1, :] = np.nan
+
+ result = df.where(mask, d1)
+ assert_frame_equal(result, expected)
+ result = df.where(mask, d1, axis='index')
+ assert_frame_equal(result, expected)
+ result = df.copy()
+ result.where(mask, d1, inplace=True)
+ assert_frame_equal(result, expected)
+ result = df.copy()
+ result.where(mask, d1, inplace=True, axis='index')
+ assert_frame_equal(result, expected)
+
+ d2 = df.copy().drop(1, axis=1)
+ expected = df.copy()
+ expected.loc[:, 1] = np.nan
+
+ result = df.where(mask, d2)
+ assert_frame_equal(result, expected)
+ result = df.where(mask, d2, axis='columns')
+ assert_frame_equal(result, expected)
+ result = df.copy()
+ result.where(mask, d2, inplace=True)
+ assert_frame_equal(result, expected)
+ result = df.copy()
+ result.where(mask, d2, inplace=True, axis='columns')
+ assert_frame_equal(result, expected)
+
+ def test_where_callable(self):
+ # GH 12533
+ df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+ result = df.where(lambda x: x > 4, lambda x: x + 1)
+ exp = DataFrame([[2, 3, 4], [5, 5, 6], [7, 8, 9]])
+ tm.assert_frame_equal(result, exp)
+ tm.assert_frame_equal(result, df.where(df > 4, df + 1))
+
+ # return ndarray and scalar
+ result = df.where(lambda x: (x % 2 == 0).values, lambda x: 99)
+ exp = DataFrame([[99, 2, 99], [4, 99, 6], [99, 8, 99]])
+ tm.assert_frame_equal(result, exp)
+ tm.assert_frame_equal(result, df.where(df % 2 == 0, 99))
+
+ # chain
+ result = (df + 2).where(lambda x: x > 8, lambda x: x + 10)
+ exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]])
+ tm.assert_frame_equal(result, exp)
+ tm.assert_frame_equal(result,
+ (df + 2).where((df + 2) > 8, (df + 2) + 10))
+
+ def test_where_tz_values(self, tz_naive_fixture):
+ df1 = DataFrame(DatetimeIndex(['20150101', '20150102', '20150103'],
+ tz=tz_naive_fixture),
+ columns=['date'])
+ df2 = DataFrame(DatetimeIndex(['20150103', '20150104', '20150105'],
+ tz=tz_naive_fixture),
+ columns=['date'])
+ mask = DataFrame([True, True, False], columns=['date'])
+ exp = DataFrame(DatetimeIndex(['20150101', '20150102', '20150105'],
+ tz=tz_naive_fixture),
+ columns=['date'])
+ result = df1.where(mask, df2)
+ assert_frame_equal(exp, result)
+
+ def test_mask(self):
+ df = DataFrame(np.random.randn(5, 3))
+ cond = df > 0
+
+ rs = df.where(cond, np.nan)
+ assert_frame_equal(rs, df.mask(df <= 0))
+ assert_frame_equal(rs, df.mask(~cond))
+
+ other = DataFrame(np.random.randn(5, 3))
+ rs = df.where(cond, other)
+ assert_frame_equal(rs, df.mask(df <= 0, other))
+ assert_frame_equal(rs, df.mask(~cond, other))
+
+ # see gh-21891
+ df = DataFrame([1, 2])
+ res = df.mask([[True], [False]])
+
+ exp = DataFrame([np.nan, 2])
+ tm.assert_frame_equal(res, exp)
+
+ def test_mask_inplace(self):
+ # GH8801
+ df = DataFrame(np.random.randn(5, 3))
+ cond = df > 0
+
+ rdf = df.copy()
+
+ rdf.where(cond, inplace=True)
+ assert_frame_equal(rdf, df.where(cond))
+ assert_frame_equal(rdf, df.mask(~cond))
+
+ rdf = df.copy()
+ rdf.where(cond, -df, inplace=True)
+ assert_frame_equal(rdf, df.where(cond, -df))
+ assert_frame_equal(rdf, df.mask(~cond, -df))
+
+ def test_mask_edge_case_1xN_frame(self):
+ # GH4071
+ df = DataFrame([[1, 2]])
+ res = df.mask(DataFrame([[True, False]]))
+ expec = DataFrame([[np.nan, 2]])
+ assert_frame_equal(res, expec)
+
+ def test_mask_callable(self):
+ # GH 12533
+ df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+ result = df.mask(lambda x: x > 4, lambda x: x + 1)
+ exp = DataFrame([[1, 2, 3], [4, 6, 7], [8, 9, 10]])
+ tm.assert_frame_equal(result, exp)
+ tm.assert_frame_equal(result, df.mask(df > 4, df + 1))
+
+ # return ndarray and scalar
+ result = df.mask(lambda x: (x % 2 == 0).values, lambda x: 99)
+ exp = DataFrame([[1, 99, 3], [99, 5, 99], [7, 99, 9]])
+ tm.assert_frame_equal(result, exp)
+ tm.assert_frame_equal(result, df.mask(df % 2 == 0, 99))
+
+ # chain
+ result = (df + 2).mask(lambda x: x > 8, lambda x: x + 10)
+ exp = DataFrame([[3, 4, 5], [6, 7, 8], [19, 20, 21]])
+ tm.assert_frame_equal(result, exp)
+ tm.assert_frame_equal(result,
+ (df + 2).mask((df + 2) > 8, (df + 2) + 10))
+
+ def test_head_tail(self):
+ assert_frame_equal(self.frame.head(), self.frame[:5])
+ assert_frame_equal(self.frame.tail(), self.frame[-5:])
+
+ assert_frame_equal(self.frame.head(0), self.frame[0:0])
+ assert_frame_equal(self.frame.tail(0), self.frame[0:0])
+
+ assert_frame_equal(self.frame.head(-1), self.frame[:-1])
+ assert_frame_equal(self.frame.tail(-1), self.frame[1:])
+ assert_frame_equal(self.frame.head(1), self.frame[:1])
+ assert_frame_equal(self.frame.tail(1), self.frame[-1:])
+ # with a float index
+ df = self.frame.copy()
+ df.index = np.arange(len(self.frame)) + 0.1
+ assert_frame_equal(df.head(), df.iloc[:5])
+ assert_frame_equal(df.tail(), df.iloc[-5:])
+ assert_frame_equal(df.head(0), df[0:0])
+ assert_frame_equal(df.tail(0), df[0:0])
+ assert_frame_equal(df.head(-1), df.iloc[:-1])
+ assert_frame_equal(df.tail(-1), df.iloc[1:])
+ # test empty dataframe
+ empty_df = DataFrame()
+ assert_frame_equal(empty_df.tail(), empty_df)
+ assert_frame_equal(empty_df.head(), empty_df)
+
+ def test_type_error_multiindex(self):
+ # See gh-12218
+ df = DataFrame(columns=['i', 'c', 'x', 'y'],
+ data=[[0, 0, 1, 2], [1, 0, 3, 4],
+ [0, 1, 1, 2], [1, 1, 3, 4]])
+ dg = df.pivot_table(index='i', columns='c',
+ values=['x', 'y'])
+
+ with pytest.raises(TypeError, match="is an invalid key"):
+ str(dg[:, 0])
+
+ index = Index(range(2), name='i')
+ columns = MultiIndex(levels=[['x', 'y'], [0, 1]],
+ codes=[[0, 1], [0, 0]],
+ names=[None, 'c'])
+ expected = DataFrame([[1, 2], [3, 4]], columns=columns, index=index)
+
+ result = dg.loc[:, (slice(None), 0)]
+ assert_frame_equal(result, expected)
+
+ name = ('x', 0)
+ index = Index(range(2), name='i')
+ expected = Series([1, 3], index=index, name=name)
+
+ result = dg['x', 0]
+ assert_series_equal(result, expected)
+
+ def test_interval_index(self):
+ # GH 19977
+ index = pd.interval_range(start=0, periods=3)
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ index=index,
+ columns=['A', 'B', 'C'])
+
+ expected = 1
+ result = df.loc[0.5, 'A']
+ assert_almost_equal(result, expected)
+
+ index = pd.interval_range(start=0, periods=3, closed='both')
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ index=index,
+ columns=['A', 'B', 'C'])
+
+ index_exp = pd.interval_range(start=0, periods=2,
+ freq=1, closed='both')
+ expected = pd.Series([1, 4], index=index_exp, name='A')
+ result = df.loc[1, 'A']
+ assert_series_equal(result, expected)
+
+
+class TestDataFrameIndexingDatetimeWithTZ(TestData):
+
+ def setup_method(self, method):
+ self.idx = Index(date_range('20130101', periods=3, tz='US/Eastern'),
+ name='foo')
+ self.dr = date_range('20130110', periods=3)
+ self.df = DataFrame({'A': self.idx, 'B': self.dr})
+
+ def test_setitem(self):
+
+ df = self.df
+ idx = self.idx
+
+ # setitem
+ df['C'] = idx
+ assert_series_equal(df['C'], Series(idx, name='C'))
+
+ df['D'] = 'foo'
+ df['D'] = idx
+ assert_series_equal(df['D'], Series(idx, name='D'))
+ del df['D']
+
+        # assert that A & C are not sharing the same base (i.e. they
+        # are copies)
+ b1 = df._data.blocks[1]
+ b2 = df._data.blocks[2]
+ tm.assert_extension_array_equal(b1.values, b2.values)
+ assert id(b1.values._data.base) != id(b2.values._data.base)
+
+ # with nan
+ df2 = df.copy()
+ df2.iloc[1, 1] = pd.NaT
+ df2.iloc[1, 2] = pd.NaT
+ result = df2['B']
+ assert_series_equal(notna(result), Series(
+ [True, False, True], name='B'))
+ assert_series_equal(df2.dtypes, df.dtypes)
+
+ def test_set_reset(self):
+
+ idx = self.idx
+
+ # set/reset
+ df = DataFrame({'A': [0, 1, 2]}, index=idx)
+ result = df.reset_index()
+        assert result['foo'].dtype == 'datetime64[ns, US/Eastern]'
+
+ df = result.set_index('foo')
+ tm.assert_index_equal(df.index, idx)
+
+ def test_transpose(self):
+
+ result = self.df.T
+ expected = DataFrame(self.df.values.T)
+ expected.index = ['A', 'B']
+ assert_frame_equal(result, expected)
+
+ def test_scalar_assignment(self):
+ # issue #19843
+ df = pd.DataFrame(index=(0, 1, 2))
+ df['now'] = pd.Timestamp('20130101', tz='UTC')
+ expected = pd.DataFrame(
+ {'now': pd.Timestamp('20130101', tz='UTC')}, index=[0, 1, 2])
+ tm.assert_frame_equal(df, expected)
+
+
+class TestDataFrameIndexingUInt64(TestData):
+
+ def setup_method(self, method):
+ self.ir = Index(np.arange(3), dtype=np.uint64)
+ self.idx = Index([2**63, 2**63 + 5, 2**63 + 10], name='foo')
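+        # values above 2**63 - 1 do not fit in int64, forcing a uint64 Index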
+
+ self.df = DataFrame({'A': self.idx, 'B': self.ir})
+
+ def test_setitem(self):
+
+ df = self.df
+ idx = self.idx
+
+ # setitem
+ df['C'] = idx
+ assert_series_equal(df['C'], Series(idx, name='C'))
+
+ df['D'] = 'foo'
+ df['D'] = idx
+ assert_series_equal(df['D'], Series(idx, name='D'))
+ del df['D']
+
+        # With NaN: because uint64 cannot represent NaN,
+        # the column should be cast to object.
+ df2 = df.copy()
+ df2.iloc[1, 1] = pd.NaT
+ df2.iloc[1, 2] = pd.NaT
+ result = df2['B']
+ assert_series_equal(notna(result), Series(
+ [True, False, True], name='B'))
+ assert_series_equal(df2.dtypes, Series([np.dtype('uint64'),
+ np.dtype('O'), np.dtype('O')],
+ index=['A', 'B', 'C']))
+
+ def test_set_reset(self):
+
+ idx = self.idx
+
+ # set/reset
+ df = DataFrame({'A': [0, 1, 2]}, index=idx)
+ result = df.reset_index()
+ assert result['foo'].dtype == np.dtype('uint64')
+
+ df = result.set_index('foo')
+ tm.assert_index_equal(df.index, idx)
+
+ def test_transpose(self):
+
+ result = self.df.T
+ expected = DataFrame(self.df.values.T)
+ expected.index = ['A', 'B']
+ assert_frame_equal(result, expected)
+
+
+class TestDataFrameIndexingCategorical(object):
+
+ def test_assignment(self):
+ # assignment
+ df = DataFrame({'value': np.array(
+ np.random.randint(0, 10000, 100), dtype='int32')})
+ labels = Categorical(["{0} - {1}".format(i, i + 499)
+ for i in range(0, 10000, 500)])
+
+ df = df.sort_values(by=['value'], ascending=True)
+ s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels)
+ d = s.values
+ df['D'] = d
+        str(df)  # smoke test: the repr should not raise
+
+ result = df.dtypes
+ expected = Series(
+ [np.dtype('int32'), CategoricalDtype(categories=labels,
+ ordered=False)],
+ index=['value', 'D'])
+ tm.assert_series_equal(result, expected)
+
+ df['E'] = s
+ str(df)
+
+ result = df.dtypes
+ expected = Series([np.dtype('int32'),
+ CategoricalDtype(categories=labels, ordered=False),
+ CategoricalDtype(categories=labels, ordered=False)],
+ index=['value', 'D', 'E'])
+ tm.assert_series_equal(result, expected)
+
+ result1 = df['D']
+ result2 = df['E']
+ tm.assert_categorical_equal(result1._data._block.values, d)
+
+ # sorting
+ s.name = 'E'
+ tm.assert_series_equal(result2.sort_index(), s.sort_index())
+
+ cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
+ df = DataFrame(Series(cat))
+
+ def test_assigning_ops(self):
+ # systematically test the assigning operations:
+ # for all slicing ops:
+ # for value in categories and value not in categories:
+
+ # - assign a single value -> exp_single_cats_value
+
+ # - assign a complete row (mixed values) -> exp_single_row
+
+ # assign multiple rows (mixed values) (-> array) -> exp_multi_row
+
+ # assign a part of a column with dtype == categorical ->
+ # exp_parts_cats_col
+
+ # assign a part of a column with dtype != categorical ->
+ # exp_parts_cats_col
+
+ cats = Categorical(["a", "a", "a", "a", "a", "a", "a"],
+ categories=["a", "b"])
+ idx = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values = [1, 1, 1, 1, 1, 1, 1]
+ orig = DataFrame({"cats": cats, "values": values}, index=idx)
+
+ # the expected values
+ # changed single row
+ cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"],
+ categories=["a", "b"])
+ idx1 = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values1 = [1, 1, 2, 1, 1, 1, 1]
+ exp_single_row = DataFrame({"cats": cats1,
+ "values": values1}, index=idx1)
+
+ # changed multiple rows
+ cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"],
+ categories=["a", "b"])
+ idx2 = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values2 = [1, 1, 2, 2, 1, 1, 1]
+ exp_multi_row = DataFrame({"cats": cats2,
+ "values": values2}, index=idx2)
+
+ # changed part of the cats column
+ cats3 = Categorical(
+ ["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
+ idx3 = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values3 = [1, 1, 1, 1, 1, 1, 1]
+ exp_parts_cats_col = DataFrame({"cats": cats3,
+ "values": values3}, index=idx3)
+
+ # changed single value in cats col
+ cats4 = Categorical(
+ ["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
+ idx4 = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values4 = [1, 1, 1, 1, 1, 1, 1]
+ exp_single_cats_value = DataFrame({"cats": cats4,
+ "values": values4}, index=idx4)
+
+ # iloc
+ # ###############
+ # - assign a single value -> exp_single_cats_value
+ df = orig.copy()
+ df.iloc[2, 0] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ df = orig.copy()
+ df.iloc[df.index == "j", 0] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ # - assign a single value not in the current categories set
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.iloc[2, 0] = "c"
+
+ # - assign a complete row (mixed values) -> exp_single_row
+ df = orig.copy()
+ df.iloc[2, :] = ["b", 2]
+ tm.assert_frame_equal(df, exp_single_row)
+
+ # - assign a complete row (mixed values) not in categories set
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.iloc[2, :] = ["c", 2]
+
+ # - assign multiple rows (mixed values) -> exp_multi_row
+ df = orig.copy()
+ df.iloc[2:4, :] = [["b", 2], ["b", 2]]
+ tm.assert_frame_equal(df, exp_multi_row)
+
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.iloc[2:4, :] = [["c", 2], ["c", 2]]
+
+ # assign a part of a column with dtype == categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"])
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ # different categories -> not sure if this should fail or pass
+ df = orig.copy()
+ df.iloc[2:4, 0] = Categorical(list('bb'), categories=list('abc'))
+
+ with pytest.raises(ValueError):
+ # different values
+ df = orig.copy()
+ df.iloc[2:4, 0] = Categorical(list('cc'), categories=list('abc'))
+
+ # assign a part of a column with dtype != categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.iloc[2:4, 0] = ["b", "b"]
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ df.iloc[2:4, 0] = ["c", "c"]
+
+ # loc
+ # ##############
+ # - assign a single value -> exp_single_cats_value
+ df = orig.copy()
+ df.loc["j", "cats"] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ df = orig.copy()
+ df.loc[df.index == "j", "cats"] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ # - assign a single value not in the current categories set
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.loc["j", "cats"] = "c"
+
+ # - assign a complete row (mixed values) -> exp_single_row
+ df = orig.copy()
+ df.loc["j", :] = ["b", 2]
+ tm.assert_frame_equal(df, exp_single_row)
+
+ # - assign a complete row (mixed values) not in categories set
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.loc["j", :] = ["c", 2]
+
+ # - assign multiple rows (mixed values) -> exp_multi_row
+ df = orig.copy()
+ df.loc["j":"k", :] = [["b", 2], ["b", 2]]
+ tm.assert_frame_equal(df, exp_multi_row)
+
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.loc["j":"k", :] = [["c", 2], ["c", 2]]
+
+ # assign a part of a column with dtype == categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.loc["j":"k", "cats"] = Categorical(
+ ["b", "b"], categories=["a", "b"])
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ # different categories -> not sure if this should fail or pass
+ df = orig.copy()
+ df.loc["j":"k", "cats"] = Categorical(
+ ["b", "b"], categories=["a", "b", "c"])
+
+ with pytest.raises(ValueError):
+ # different values
+ df = orig.copy()
+ df.loc["j":"k", "cats"] = Categorical(
+ ["c", "c"], categories=["a", "b", "c"])
+
+ # assign a part of a column with dtype != categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.loc["j":"k", "cats"] = ["b", "b"]
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ df.loc["j":"k", "cats"] = ["c", "c"]
+
+        # loc, selecting the column positionally via df.columns[0]
+        # ##############
+ # - assign a single value -> exp_single_cats_value
+ df = orig.copy()
+ df.loc["j", df.columns[0]] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ df = orig.copy()
+ df.loc[df.index == "j", df.columns[0]] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ # - assign a single value not in the current categories set
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.loc["j", df.columns[0]] = "c"
+
+ # - assign a complete row (mixed values) -> exp_single_row
+ df = orig.copy()
+ df.loc["j", :] = ["b", 2]
+ tm.assert_frame_equal(df, exp_single_row)
+
+ # - assign a complete row (mixed values) not in categories set
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.loc["j", :] = ["c", 2]
+
+ # - assign multiple rows (mixed values) -> exp_multi_row
+ df = orig.copy()
+ df.loc["j":"k", :] = [["b", 2], ["b", 2]]
+ tm.assert_frame_equal(df, exp_multi_row)
+
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.loc["j":"k", :] = [["c", 2], ["c", 2]]
+
+ # assign a part of a column with dtype == categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.loc["j":"k", df.columns[0]] = Categorical(
+ ["b", "b"], categories=["a", "b"])
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ # different categories -> not sure if this should fail or pass
+ df = orig.copy()
+ df.loc["j":"k", df.columns[0]] = Categorical(
+ ["b", "b"], categories=["a", "b", "c"])
+
+ with pytest.raises(ValueError):
+ # different values
+ df = orig.copy()
+ df.loc["j":"k", df.columns[0]] = Categorical(
+ ["c", "c"], categories=["a", "b", "c"])
+
+ # assign a part of a column with dtype != categorical ->
+ # exp_parts_cats_col
+ df = orig.copy()
+ df.loc["j":"k", df.columns[0]] = ["b", "b"]
+ tm.assert_frame_equal(df, exp_parts_cats_col)
+
+ with pytest.raises(ValueError):
+ df.loc["j":"k", df.columns[0]] = ["c", "c"]
+
+ # iat
+ df = orig.copy()
+ df.iat[2, 0] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ # - assign a single value not in the current categories set
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.iat[2, 0] = "c"
+
+ # at
+ # - assign a single value -> exp_single_cats_value
+ df = orig.copy()
+ df.at["j", "cats"] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ # - assign a single value not in the current categories set
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.at["j", "cats"] = "c"
+
+ # fancy indexing
+ catsf = Categorical(["a", "a", "c", "c", "a", "a", "a"],
+ categories=["a", "b", "c"])
+ idxf = Index(["h", "i", "j", "k", "l", "m", "n"])
+ valuesf = [1, 1, 3, 3, 1, 1, 1]
+ df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf)
+
+ exp_fancy = exp_multi_row.copy()
+ exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True)
+
+ df[df["cats"] == "c"] = ["b", 2]
+ # category c is kept in .categories
+ tm.assert_frame_equal(df, exp_fancy)
+
+ # set_value
+ df = orig.copy()
+ df.at["j", "cats"] = "b"
+ tm.assert_frame_equal(df, exp_single_cats_value)
+
+ with pytest.raises(ValueError):
+ df = orig.copy()
+ df.at["j", "cats"] = "c"
+
+        # Assigning a Categorical to part of an int/... column uses the
+        # values of the Categorical
+ df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")})
+ exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")})
+ df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
+ df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
+ tm.assert_frame_equal(df, exp)
+
+ def test_functions_no_warnings(self):
+ df = DataFrame({'value': np.random.randint(0, 100, 20)})
+ labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
+ with tm.assert_produces_warning(False):
+ df['group'] = pd.cut(df.value, range(0, 105, 10), right=False,
+ labels=labels)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_join.py b/contrib/python/pandas/py2/pandas/tests/frame/test_join.py
new file mode 100644
index 00000000000..0508658766c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_join.py
@@ -0,0 +1,185 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Index, period_range
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+
+
+@pytest.fixture
+def frame_with_period_index():
+ return DataFrame(
+ data=np.arange(20).reshape(4, 5),
+ columns=list('abcde'),
+ index=period_range(start='2000', freq='A', periods=4))
+
+
+@pytest.fixture
+def frame():
+ return TestData().frame
+
+
+@pytest.fixture
+def left():
+ return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
+
+
+@pytest.fixture
+def right():
+ return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2])
+
+
+ "how, sort, expected",
+ [('inner', False, DataFrame({'a': [20, 10],
+ 'b': [200, 100]},
+ index=[2, 1])),
+ ('inner', True, DataFrame({'a': [10, 20],
+ 'b': [100, 200]},
+ index=[1, 2])),
+ ('left', False, DataFrame({'a': [20, 10, 0],
+ 'b': [200, 100, np.nan]},
+ index=[2, 1, 0])),
+ ('left', True, DataFrame({'a': [0, 10, 20],
+ 'b': [np.nan, 100, 200]},
+ index=[0, 1, 2])),
+ ('right', False, DataFrame({'a': [np.nan, 10, 20],
+ 'b': [300, 100, 200]},
+ index=[3, 1, 2])),
+ ('right', True, DataFrame({'a': [10, 20, np.nan],
+ 'b': [100, 200, 300]},
+ index=[1, 2, 3])),
+ ('outer', False, DataFrame({'a': [0, 10, 20, np.nan],
+ 'b': [np.nan, 100, 200, 300]},
+ index=[0, 1, 2, 3])),
+ ('outer', True, DataFrame({'a': [0, 10, 20, np.nan],
+ 'b': [np.nan, 100, 200, 300]},
+ index=[0, 1, 2, 3]))])
+def test_join(left, right, how, sort, expected):
+
+ result = left.join(right, how=how, sort=sort)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_join_index(frame):
+ # left / right
+
+ f = frame.loc[frame.index[:10], ['A', 'B']]
+ f2 = frame.loc[frame.index[5:], ['C', 'D']].iloc[::-1]
+
+ joined = f.join(f2)
+ tm.assert_index_equal(f.index, joined.index)
+ expected_columns = Index(['A', 'B', 'C', 'D'])
+ tm.assert_index_equal(joined.columns, expected_columns)
+
+ joined = f.join(f2, how='left')
+ tm.assert_index_equal(joined.index, f.index)
+ tm.assert_index_equal(joined.columns, expected_columns)
+
+ joined = f.join(f2, how='right')
+ tm.assert_index_equal(joined.index, f2.index)
+ tm.assert_index_equal(joined.columns, expected_columns)
+
+ # inner
+
+ joined = f.join(f2, how='inner')
+ tm.assert_index_equal(joined.index, f.index[5:10])
+ tm.assert_index_equal(joined.columns, expected_columns)
+
+ # outer
+
+ joined = f.join(f2, how='outer')
+ tm.assert_index_equal(joined.index, frame.index.sort_values())
+ tm.assert_index_equal(joined.columns, expected_columns)
+
+ with pytest.raises(ValueError, match='join method'):
+ f.join(f2, how='foo')
+
+ # corner case - overlapping columns
+ msg = 'columns overlap but no suffix'
+ for how in ('outer', 'left', 'inner'):
+ with pytest.raises(ValueError, match=msg):
+ frame.join(frame, how=how)
+
+
+def test_join_index_more(frame):
+ af = frame.loc[:, ['A', 'B']]
+ bf = frame.loc[::2, ['C', 'D']]
+
+ expected = af.copy()
+ expected['C'] = frame['C'][::2]
+ expected['D'] = frame['D'][::2]
+
+ result = af.join(bf)
+ tm.assert_frame_equal(result, expected)
+
+ result = af.join(bf, how='right')
+ tm.assert_frame_equal(result, expected[::2])
+
+ result = bf.join(af, how='right')
+ tm.assert_frame_equal(result, expected.loc[:, result.columns])
+
+
+def test_join_index_series(frame):
+ df = frame.copy()
+ s = df.pop(frame.columns[-1])
+ joined = df.join(s)
+
+ # TODO should this check_names ?
+ tm.assert_frame_equal(joined, frame, check_names=False)
+
+ s.name = None
+ with pytest.raises(ValueError, match='must have a name'):
+ df.join(s)
+
+
+def test_join_overlap(frame):
+ df1 = frame.loc[:, ['A', 'B', 'C']]
+ df2 = frame.loc[:, ['B', 'C', 'D']]
+
+ joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2')
+ df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1')
+ df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2')
+
+ no_overlap = frame.loc[:, ['A', 'D']]
+ expected = df1_suf.join(df2_suf).join(no_overlap)
+
+ # column order not necessarily sorted
+ tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
+
+
+def test_join_period_index(frame_with_period_index):
+ other = frame_with_period_index.rename(
+ columns=lambda x: '{key}{key}'.format(key=x))
+
+ joined_values = np.concatenate(
+ [frame_with_period_index.values] * 2, axis=1)
+
+ joined_cols = frame_with_period_index.columns.append(other.columns)
+
+ joined = frame_with_period_index.join(other)
+ expected = DataFrame(
+ data=joined_values,
+ columns=joined_cols,
+ index=frame_with_period_index.index)
+
+ tm.assert_frame_equal(joined, expected)
+
+
+def test_join_left_sequence_non_unique_index():
+ # https://github.com/pandas-dev/pandas/issues/19607
+ df1 = DataFrame({'a': [0, 10, 20]}, index=[1, 2, 3])
+ df2 = DataFrame({'b': [100, 200, 300]}, index=[4, 3, 2])
+ df3 = DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4])
+
+ joined = df1.join([df2, df3], how='left')
+
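+    # label 2 appears twice in df3's index, so df1's row 2 is duplicated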
+ expected = DataFrame({
+ 'a': [0, 10, 10, 20],
+ 'b': [np.nan, 300, 300, 200],
+ 'c': [np.nan, 400, 500, np.nan]
+ }, index=[1, 2, 2, 3])
+
+ tm.assert_frame_equal(joined, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_missing.py b/contrib/python/pandas/py2/pandas/tests/frame/test_missing.py
new file mode 100644
index 00000000000..77a3d4785d2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_missing.py
@@ -0,0 +1,863 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import datetime
+from distutils.version import LooseVersion
+
+import dateutil
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import Categorical, DataFrame, Series, Timestamp, date_range
+from pandas.tests.frame.common import TestData, _check_mixed_float
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+try:
+ import scipy
+ _is_scipy_ge_0190 = (LooseVersion(scipy.__version__) >=
+ LooseVersion('0.19.0'))
+except ImportError:
+ _is_scipy_ge_0190 = False
+
+
+def _skip_if_no_pchip():
+ try:
+ from scipy.interpolate import pchip_interpolate # noqa
+    except ImportError:
+        pytest.skip('scipy.interpolate.pchip missing')
+
+
+class TestDataFrameMissingData(TestData):
+
+ def test_dropEmptyRows(self):
+ N = len(self.frame.index)
+ mat = np.random.randn(N)
+ mat[:5] = np.nan
+
+ frame = DataFrame({'foo': mat}, index=self.frame.index)
+ original = Series(mat, index=self.frame.index, name='foo')
+ expected = original.dropna()
+ inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
+
+ smaller_frame = frame.dropna(how='all')
+ # check that original was preserved
+ assert_series_equal(frame['foo'], original)
+ inplace_frame1.dropna(how='all', inplace=True)
+ assert_series_equal(smaller_frame['foo'], expected)
+ assert_series_equal(inplace_frame1['foo'], expected)
+
+ smaller_frame = frame.dropna(how='all', subset=['foo'])
+ inplace_frame2.dropna(how='all', subset=['foo'], inplace=True)
+ assert_series_equal(smaller_frame['foo'], expected)
+ assert_series_equal(inplace_frame2['foo'], expected)
+
+ def test_dropIncompleteRows(self):
+ N = len(self.frame.index)
+ mat = np.random.randn(N)
+ mat[:5] = np.nan
+
+ frame = DataFrame({'foo': mat}, index=self.frame.index)
+ frame['bar'] = 5
+ original = Series(mat, index=self.frame.index, name='foo')
+ inp_frame1, inp_frame2 = frame.copy(), frame.copy()
+
+ smaller_frame = frame.dropna()
+ assert_series_equal(frame['foo'], original)
+ inp_frame1.dropna(inplace=True)
+
+ exp = Series(mat[5:], index=self.frame.index[5:], name='foo')
+ tm.assert_series_equal(smaller_frame['foo'], exp)
+ tm.assert_series_equal(inp_frame1['foo'], exp)
+
+ samesize_frame = frame.dropna(subset=['bar'])
+ assert_series_equal(frame['foo'], original)
+ assert (frame['bar'] == 5).all()
+ inp_frame2.dropna(subset=['bar'], inplace=True)
+ tm.assert_index_equal(samesize_frame.index, self.frame.index)
+ tm.assert_index_equal(inp_frame2.index, self.frame.index)
+
+ def test_dropna(self):
+ df = DataFrame(np.random.randn(6, 4))
+ df[2][:2] = np.nan
+
+ dropped = df.dropna(axis=1)
+ expected = df.loc[:, [0, 1, 3]]
+ inp = df.copy()
+ inp.dropna(axis=1, inplace=True)
+ assert_frame_equal(dropped, expected)
+ assert_frame_equal(inp, expected)
+
+ dropped = df.dropna(axis=0)
+ expected = df.loc[lrange(2, 6)]
+ inp = df.copy()
+ inp.dropna(axis=0, inplace=True)
+ assert_frame_equal(dropped, expected)
+ assert_frame_equal(inp, expected)
+
+        # thresh=N keeps labels with at least N non-NA values
+ dropped = df.dropna(axis=1, thresh=5)
+ expected = df.loc[:, [0, 1, 3]]
+ inp = df.copy()
+ inp.dropna(axis=1, thresh=5, inplace=True)
+ assert_frame_equal(dropped, expected)
+ assert_frame_equal(inp, expected)
+
+ dropped = df.dropna(axis=0, thresh=4)
+ expected = df.loc[lrange(2, 6)]
+ inp = df.copy()
+ inp.dropna(axis=0, thresh=4, inplace=True)
+ assert_frame_equal(dropped, expected)
+ assert_frame_equal(inp, expected)
+
+ dropped = df.dropna(axis=1, thresh=4)
+ assert_frame_equal(dropped, df)
+
+ dropped = df.dropna(axis=1, thresh=3)
+ assert_frame_equal(dropped, df)
+
+ # subset
+ dropped = df.dropna(axis=0, subset=[0, 1, 3])
+ inp = df.copy()
+ inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
+ assert_frame_equal(dropped, df)
+ assert_frame_equal(inp, df)
+
+ # all
+ dropped = df.dropna(axis=1, how='all')
+ assert_frame_equal(dropped, df)
+
+ df[2] = np.nan
+ dropped = df.dropna(axis=1, how='all')
+ expected = df.loc[:, [0, 1, 3]]
+ assert_frame_equal(dropped, expected)
+
+ # bad input
+ pytest.raises(ValueError, df.dropna, axis=3)
+
+ def test_drop_and_dropna_caching(self):
+        # test that the cacher updates
+ original = Series([1, 2, np.nan], name='A')
+ expected = Series([1, 2], dtype=original.dtype, name='A')
+ df = pd.DataFrame({'A': original.values.copy()})
+ df2 = df.copy()
+ df['A'].dropna()
+ assert_series_equal(df['A'], original)
+ df['A'].dropna(inplace=True)
+ assert_series_equal(df['A'], expected)
+ df2['A'].drop([1])
+ assert_series_equal(df2['A'], original)
+ df2['A'].drop([1], inplace=True)
+ assert_series_equal(df2['A'], original.drop([1]))
+
+ def test_dropna_corner(self):
+ # bad input
+ pytest.raises(ValueError, self.frame.dropna, how='foo')
+ pytest.raises(TypeError, self.frame.dropna, how=None)
+        # non-existent column - GH 8303
+ pytest.raises(KeyError, self.frame.dropna, subset=['A', 'X'])
+
+ def test_dropna_multiple_axes(self):
+ df = DataFrame([[1, np.nan, 2, 3],
+ [4, np.nan, 5, 6],
+ [np.nan, np.nan, np.nan, np.nan],
+ [7, np.nan, 8, 9]])
+ cp = df.copy()
+
+ # GH20987
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.dropna(how='all', axis=[0, 1])
+ with tm.assert_produces_warning(FutureWarning):
+ result2 = df.dropna(how='all', axis=(0, 1))
+ expected = df.dropna(how='all').dropna(how='all', axis=1)
+
+ assert_frame_equal(result, expected)
+ assert_frame_equal(result2, expected)
+ assert_frame_equal(df, cp)
+
+ inp = df.copy()
+ with tm.assert_produces_warning(FutureWarning):
+ inp.dropna(how='all', axis=(0, 1), inplace=True)
+ assert_frame_equal(inp, expected)
+
+ def test_dropna_tz_aware_datetime(self):
+ # GH13407
+ df = DataFrame()
+ dt1 = datetime.datetime(2015, 1, 1,
+ tzinfo=dateutil.tz.tzutc())
+ dt2 = datetime.datetime(2015, 2, 2,
+ tzinfo=dateutil.tz.tzutc())
+ df['Time'] = [dt1]
+ result = df.dropna(axis=0)
+ expected = DataFrame({'Time': [dt1]})
+ assert_frame_equal(result, expected)
+
+ # Ex2
+ df = DataFrame({'Time': [dt1, None, np.nan, dt2]})
+ result = df.dropna(axis=0)
+ expected = DataFrame([dt1, dt2],
+ columns=['Time'],
+ index=[0, 3])
+ assert_frame_equal(result, expected)
+
+ def test_fillna(self):
+ tf = self.tsframe
+ tf.loc[tf.index[:5], 'A'] = np.nan
+ tf.loc[tf.index[-5:], 'A'] = np.nan
+
+ zero_filled = self.tsframe.fillna(0)
+ assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all()
+
+ padded = self.tsframe.fillna(method='pad')
+ assert np.isnan(padded.loc[padded.index[:5], 'A']).all()
+ assert (padded.loc[padded.index[-5:], 'A'] ==
+ padded.loc[padded.index[-5], 'A']).all()
+
+ # mixed type
+ mf = self.mixed_frame
+ mf.loc[mf.index[5:20], 'foo'] = np.nan
+ mf.loc[mf.index[-10:], 'A'] = np.nan
+ result = self.mixed_frame.fillna(value=0)
+ result = self.mixed_frame.fillna(method='pad')
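+        # smoke tests only: ensure fillna runs on mixed-dtype frames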
+
+ pytest.raises(ValueError, self.tsframe.fillna)
+ pytest.raises(ValueError, self.tsframe.fillna, 5, method='ffill')
+
+ # mixed numeric (but no float16)
+ mf = self.mixed_float.reindex(columns=['A', 'B', 'D'])
+ mf.loc[mf.index[-10:], 'A'] = np.nan
+ result = mf.fillna(value=0)
+ _check_mixed_float(result, dtype=dict(C=None))
+
+ result = mf.fillna(method='pad')
+ _check_mixed_float(result, dtype=dict(C=None))
+
+ # empty frame (GH #2778)
+ df = DataFrame(columns=['x'])
+ for m in ['pad', 'backfill']:
+ df.x.fillna(method=m, inplace=True)
+ df.x.fillna(method=m)
+
+ # with different dtype (GH3386)
+ df = DataFrame([['a', 'a', np.nan, 'a'], [
+ 'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']])
+
+ result = df.fillna({2: 'foo'})
+ expected = DataFrame([['a', 'a', 'foo', 'a'],
+ ['b', 'b', 'foo', 'b'],
+ ['c', 'c', 'foo', 'c']])
+ assert_frame_equal(result, expected)
+
+ df.fillna({2: 'foo'}, inplace=True)
+ assert_frame_equal(df, expected)
+
+ # limit and value
+ df = DataFrame(np.random.randn(10, 3))
+ df.iloc[2:7, 0] = np.nan
+ df.iloc[3:5, 2] = np.nan
+
+ expected = df.copy()
+ expected.iloc[2, 0] = 999
+ expected.iloc[3, 2] = 999
+ result = df.fillna(999, limit=1)
+ assert_frame_equal(result, expected)
+
+ # with datelike
+ # GH 6344
+ df = DataFrame({
+ 'Date': [pd.NaT, Timestamp("2014-1-1")],
+ 'Date2': [Timestamp("2013-1-1"), pd.NaT]
+ })
+
+ expected = df.copy()
+ expected['Date'] = expected['Date'].fillna(
+ df.loc[df.index[0], 'Date2'])
+ result = df.fillna(value={'Date': df['Date2']})
+ assert_frame_equal(result, expected)
+
+ # with timezone
+ # GH 15855
+ df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
+ pd.NaT]})
+ exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
+ pd.Timestamp('2012-11-11 00:00:00+01:00')]})
+ assert_frame_equal(df.fillna(method='pad'), exp)
+
+ df = pd.DataFrame({'A': [pd.NaT,
+ pd.Timestamp('2012-11-11 00:00:00+01:00')]})
+ exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
+ pd.Timestamp('2012-11-11 00:00:00+01:00')]})
+ assert_frame_equal(df.fillna(method='bfill'), exp)
+
+ # with timezone in another column
+ # GH 15522
+ df = pd.DataFrame({'A': pd.date_range('20130101', periods=4,
+ tz='US/Eastern'),
+ 'B': [1, 2, np.nan, np.nan]})
+ result = df.fillna(method='pad')
+ expected = pd.DataFrame({'A': pd.date_range('20130101', periods=4,
+ tz='US/Eastern'),
+ 'B': [1., 2., 2., 2.]})
+ assert_frame_equal(result, expected)
+
+ def test_na_actions_categorical(self):
+
+ cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
+ vals = ["a", "b", np.nan, "d"]
+ df = DataFrame({"cats": cat, "vals": vals})
+ cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
+ vals2 = ["a", "b", "b", "d"]
+ df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
+ cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
+ vals3 = ["a", "b", np.nan]
+ df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
+ cat4 = Categorical([1, 2], categories=[1, 2, 3])
+ vals4 = ["a", "b"]
+ df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
+
+ # fillna
+ res = df.fillna(value={"cats": 3, "vals": "b"})
+ tm.assert_frame_equal(res, df_exp_fill)
+
+ with pytest.raises(ValueError, match=("fill value must "
+ "be in categories")):
+ df.fillna(value={"cats": 4, "vals": "c"})
+
+ res = df.fillna(method='pad')
+ tm.assert_frame_equal(res, df_exp_fill)
+
+ # dropna
+ res = df.dropna(subset=["cats"])
+ tm.assert_frame_equal(res, df_exp_drop_cats)
+
+ res = df.dropna()
+ tm.assert_frame_equal(res, df_exp_drop_all)
+
+ # make sure that fillna takes missing values into account
+ c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
+ df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
+
+ cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
+ df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
+
+ res = df.fillna("a")
+ tm.assert_frame_equal(res, df_exp)
+
+ def test_fillna_categorical_nan(self):
+ # GH 14021
+ # np.nan should always be a valid filler
+ cat = Categorical([np.nan, 2, np.nan])
+ val = Categorical([np.nan, np.nan, np.nan])
+ df = DataFrame({"cats": cat, "vals": val})
+ res = df.fillna(df.median())
+ v_exp = [np.nan, np.nan, np.nan]
+ df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
+ dtype='category')
+ tm.assert_frame_equal(res, df_exp)
+
+ result = df.cats.fillna(np.nan)
+ tm.assert_series_equal(result, df.cats)
+ result = df.vals.fillna(np.nan)
+ tm.assert_series_equal(result, df.vals)
+
+ idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
+ '2011-01-01 09:00', pd.NaT, pd.NaT])
+ df = DataFrame({'a': Categorical(idx)})
+ tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
+
+ idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
+ pd.NaT, pd.NaT], freq='M')
+ df = DataFrame({'a': Categorical(idx)})
+ tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
+
+ idx = pd.TimedeltaIndex(['1 days', '2 days',
+ '1 days', pd.NaT, pd.NaT])
+ df = DataFrame({'a': Categorical(idx)})
+ tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
+
+ def test_fillna_downcast(self):
+ # GH 15277
+ # infer int64 from float64
+ df = pd.DataFrame({'a': [1., np.nan]})
+ result = df.fillna(0, downcast='infer')
+ expected = pd.DataFrame({'a': [1, 0]})
+ assert_frame_equal(result, expected)
+
+ # infer int64 from float64 when fillna value is a dict
+ df = pd.DataFrame({'a': [1., np.nan]})
+ result = df.fillna({'a': 0}, downcast='infer')
+ expected = pd.DataFrame({'a': [1, 0]})
+ assert_frame_equal(result, expected)
+
+ def test_fillna_dtype_conversion(self):
+ # make sure that fillna on an empty frame works
+ df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
+ result = df.get_dtype_counts().sort_values()
+ expected = Series({'object': 5})
+ assert_series_equal(result, expected)
+
+        result = df.fillna(1)
+        expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
+        assert_frame_equal(result, expected)
+        result = result.get_dtype_counts().sort_values()
+        expected = Series({'int64': 5})
+        assert_series_equal(result, expected)
+
+ # empty block
+ df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
+ result = df.fillna('nan')
+ expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
+ assert_frame_equal(result, expected)
+
+ # equiv of replace
+ df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
+ for v in ['', 1, np.nan, 1.0]:
+ expected = df.replace(np.nan, v)
+ result = df.fillna(v)
+ assert_frame_equal(result, expected)
+
+ def test_fillna_datetime_columns(self):
+ # GH 7095
+ df = pd.DataFrame({'A': [-1, -2, np.nan],
+ 'B': date_range('20130101', periods=3),
+ 'C': ['foo', 'bar', None],
+ 'D': ['foo2', 'bar2', None]},
+ index=date_range('20130110', periods=3))
+ result = df.fillna('?')
+ expected = pd.DataFrame({'A': [-1, -2, '?'],
+ 'B': date_range('20130101', periods=3),
+ 'C': ['foo', 'bar', '?'],
+ 'D': ['foo2', 'bar2', '?']},
+ index=date_range('20130110', periods=3))
+ tm.assert_frame_equal(result, expected)
+
+ df = pd.DataFrame({'A': [-1, -2, np.nan],
+ 'B': [pd.Timestamp('2013-01-01'),
+ pd.Timestamp('2013-01-02'), pd.NaT],
+ 'C': ['foo', 'bar', None],
+ 'D': ['foo2', 'bar2', None]},
+ index=date_range('20130110', periods=3))
+ result = df.fillna('?')
+ expected = pd.DataFrame({'A': [-1, -2, '?'],
+ 'B': [pd.Timestamp('2013-01-01'),
+ pd.Timestamp('2013-01-02'), '?'],
+ 'C': ['foo', 'bar', '?'],
+ 'D': ['foo2', 'bar2', '?']},
+ index=pd.date_range('20130110', periods=3))
+ tm.assert_frame_equal(result, expected)
+
+ def test_ffill(self):
+        self.tsframe.loc[self.tsframe.index[:5], 'A'] = np.nan
+        self.tsframe.loc[self.tsframe.index[-5:], 'A'] = np.nan
+
+ assert_frame_equal(self.tsframe.ffill(),
+ self.tsframe.fillna(method='ffill'))
+
+ def test_bfill(self):
+        self.tsframe.loc[self.tsframe.index[:5], 'A'] = np.nan
+        self.tsframe.loc[self.tsframe.index[-5:], 'A'] = np.nan
+
+ assert_frame_equal(self.tsframe.bfill(),
+ self.tsframe.fillna(method='bfill'))
+
+ def test_frame_pad_backfill_limit(self):
+ index = np.arange(10)
+ df = DataFrame(np.random.randn(10, 4), index=index)
+
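+        # with limit=5, only the first five of the eight missing labels
+        # get padded; the rest stay NaN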
+ result = df[:2].reindex(index, method='pad', limit=5)
+
+ expected = df[:2].reindex(index).fillna(method='pad')
+ expected.values[-3:] = np.nan
+ tm.assert_frame_equal(result, expected)
+
+ result = df[-2:].reindex(index, method='backfill', limit=5)
+
+ expected = df[-2:].reindex(index).fillna(method='backfill')
+ expected.values[:3] = np.nan
+ tm.assert_frame_equal(result, expected)
+
+ def test_frame_fillna_limit(self):
+ index = np.arange(10)
+ df = DataFrame(np.random.randn(10, 4), index=index)
+
+ result = df[:2].reindex(index)
+ result = result.fillna(method='pad', limit=5)
+
+ expected = df[:2].reindex(index).fillna(method='pad')
+ expected.values[-3:] = np.nan
+ tm.assert_frame_equal(result, expected)
+
+ result = df[-2:].reindex(index)
+ result = result.fillna(method='backfill', limit=5)
+
+ expected = df[-2:].reindex(index).fillna(method='backfill')
+ expected.values[:3] = np.nan
+ tm.assert_frame_equal(result, expected)
+
+ def test_fillna_skip_certain_blocks(self):
+ # don't try to fill boolean, int blocks
+
+ df = DataFrame(np.random.randn(10, 4).astype(int))
+
+ # it works!
+ df.fillna(np.nan)
+
+ def test_fillna_inplace(self):
+ df = DataFrame(np.random.randn(10, 4))
+        df.iloc[:4, 1] = np.nan
+        df.iloc[-4:, 3] = np.nan
+
+ expected = df.fillna(value=0)
+ assert expected is not df
+
+ df.fillna(value=0, inplace=True)
+ tm.assert_frame_equal(df, expected)
+
+ expected = df.fillna(value={0: 0}, inplace=True)
+ assert expected is None
+
+        df.iloc[:4, 1] = np.nan
+        df.iloc[-4:, 3] = np.nan
+ expected = df.fillna(method='ffill')
+ assert expected is not df
+
+ df.fillna(method='ffill', inplace=True)
+ tm.assert_frame_equal(df, expected)
+
+ def test_fillna_dict_series(self):
+ df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
+ 'b': [1, 2, 3, np.nan, np.nan],
+ 'c': [np.nan, 1, 2, 3, 4]})
+
+ result = df.fillna({'a': 0, 'b': 5})
+
+ expected = df.copy()
+ expected['a'] = expected['a'].fillna(0)
+ expected['b'] = expected['b'].fillna(5)
+ assert_frame_equal(result, expected)
+
+ # it works
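+        # keys that are not columns (here 'd') are silently ignored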
+ result = df.fillna({'a': 0, 'b': 5, 'd': 7})
+
+ # Series treated same as dict
+ result = df.fillna(df.max())
+ expected = df.fillna(df.max().to_dict())
+ assert_frame_equal(result, expected)
+
+ # disable this for now
+ with pytest.raises(NotImplementedError, match='column by column'):
+ df.fillna(df.max(1), axis=1)
+
+ def test_fillna_dataframe(self):
+ # GH 8377
+ df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
+ 'b': [1, 2, 3, np.nan, np.nan],
+ 'c': [np.nan, 1, 2, 3, 4]},
+ index=list('VWXYZ'))
+
+ # df2 may have different index and columns
+ df2 = DataFrame({'a': [np.nan, 10, 20, 30, 40],
+ 'b': [50, 60, 70, 80, 90],
+ 'foo': ['bar'] * 5},
+ index=list('VWXuZ'))
+
+ result = df.fillna(df2)
+
+ # only those columns and indices which are shared get filled
+ expected = DataFrame({'a': [np.nan, 1, 2, np.nan, 40],
+ 'b': [1, 2, 3, np.nan, 90],
+ 'c': [np.nan, 1, 2, 3, 4]},
+ index=list('VWXYZ'))
+
+ assert_frame_equal(result, expected)
+
+ def test_fillna_columns(self):
+ df = DataFrame(np.random.randn(10, 10))
+ df.values[:, ::2] = np.nan
+
+ result = df.fillna(method='ffill', axis=1)
+ expected = df.T.fillna(method='pad').T
+ assert_frame_equal(result, expected)
+
+ df.insert(6, 'foo', 5)
+ result = df.fillna(method='ffill', axis=1)
+ expected = df.astype(float).fillna(method='ffill', axis=1)
+ assert_frame_equal(result, expected)
+
+ def test_fillna_invalid_method(self):
+ with pytest.raises(ValueError, match='ffil'):
+ self.frame.fillna(method='ffil')
+
+ def test_fillna_invalid_value(self):
+ # list
+ pytest.raises(TypeError, self.frame.fillna, [1, 2])
+ # tuple
+ pytest.raises(TypeError, self.frame.fillna, (1, 2))
+ # frame with series
+ pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame)
+
+ def test_fillna_col_reordering(self):
+ cols = ["COL." + str(i) for i in range(5, 0, -1)]
+ data = np.random.rand(20, 5)
+ df = DataFrame(index=lrange(20), columns=cols, data=data)
+ filled = df.fillna(method='ffill')
+ assert df.columns.tolist() == filled.columns.tolist()
+
+ def test_fill_corner(self):
+ mf = self.mixed_frame
+ mf.loc[mf.index[5:20], 'foo'] = np.nan
+ mf.loc[mf.index[-10:], 'A'] = np.nan
+
+ filled = self.mixed_frame.fillna(value=0)
+ assert (filled.loc[filled.index[5:20], 'foo'] == 0).all()
+ del self.mixed_frame['foo']
+
+ empty_float = self.frame.reindex(columns=[])
+
+ # TODO(wesm): unused?
+ result = empty_float.fillna(value=0) # noqa
+
+ def test_fill_value_when_combine_const(self):
+ # GH12723
+ dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
+ df = DataFrame({'foo': dat}, index=range(6))
+
+ exp = df.fillna(0).add(2)
+ res = df.add(2, fill_value=0)
+ assert_frame_equal(res, exp)
+
+
+class TestDataFrameInterpolate(TestData):
+
+ def test_interp_basic(self):
+ df = DataFrame({'A': [1, 2, np.nan, 4],
+ 'B': [1, 4, 9, np.nan],
+ 'C': [1, 2, 3, 5],
+ 'D': list('abcd')})
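+        # default linear interpolation fills interior NaNs and carries
+        # the last valid value forward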
+ expected = DataFrame({'A': [1., 2., 3., 4.],
+ 'B': [1., 4., 9., 9.],
+ 'C': [1, 2, 3, 5],
+ 'D': list('abcd')})
+ result = df.interpolate()
+ assert_frame_equal(result, expected)
+
+ result = df.set_index('C').interpolate()
+ expected = df.set_index('C')
+ expected.loc[3, 'A'] = 3
+ expected.loc[5, 'B'] = 9
+ assert_frame_equal(result, expected)
+
+ def test_interp_bad_method(self):
+ df = DataFrame({'A': [1, 2, np.nan, 4],
+ 'B': [1, 4, 9, np.nan],
+ 'C': [1, 2, 3, 5],
+ 'D': list('abcd')})
+ with pytest.raises(ValueError):
+ df.interpolate(method='not_a_method')
+
+ def test_interp_combo(self):
+ df = DataFrame({'A': [1., 2., np.nan, 4.],
+ 'B': [1, 4, 9, np.nan],
+ 'C': [1, 2, 3, 5],
+ 'D': list('abcd')})
+
+ result = df['A'].interpolate()
+ expected = Series([1., 2., 3., 4.], name='A')
+ assert_series_equal(result, expected)
+
+ result = df['A'].interpolate(downcast='infer')
+ expected = Series([1, 2, 3, 4], name='A')
+ assert_series_equal(result, expected)
+
+ def test_interp_nan_idx(self):
+ df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
+ df = df.set_index('A')
+ with pytest.raises(NotImplementedError):
+ df.interpolate(method='values')
+
+ @td.skip_if_no_scipy
+ def test_interp_various(self):
+ df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
+ 'C': [1, 2, 3, 5, 8, 13, 21]})
+ df = df.set_index('C')
+ expected = df.copy()
+ result = df.interpolate(method='polynomial', order=1)
+
+ expected.A.loc[3] = 2.66666667
+ expected.A.loc[13] = 5.76923076
+ assert_frame_equal(result, expected)
+
+ result = df.interpolate(method='cubic')
+ # GH #15662.
+ # new cubic and quadratic interpolation algorithms from scipy 0.19.0.
+ # previously `splmake` was used. See scipy/scipy#6710
+ if _is_scipy_ge_0190:
+ expected.A.loc[3] = 2.81547781
+ expected.A.loc[13] = 5.52964175
+ else:
+ expected.A.loc[3] = 2.81621174
+ expected.A.loc[13] = 5.64146581
+ assert_frame_equal(result, expected)
+
+ result = df.interpolate(method='nearest')
+ expected.A.loc[3] = 2
+ expected.A.loc[13] = 5
+ assert_frame_equal(result, expected, check_dtype=False)
+
+ result = df.interpolate(method='quadratic')
+ if _is_scipy_ge_0190:
+ expected.A.loc[3] = 2.82150771
+ expected.A.loc[13] = 6.12648668
+ else:
+ expected.A.loc[3] = 2.82533638
+ expected.A.loc[13] = 6.02817974
+ assert_frame_equal(result, expected)
+
+ result = df.interpolate(method='slinear')
+ expected.A.loc[3] = 2.66666667
+ expected.A.loc[13] = 5.76923077
+ assert_frame_equal(result, expected)
+
+ result = df.interpolate(method='zero')
+ expected.A.loc[3] = 2.
+ expected.A.loc[13] = 5
+ assert_frame_equal(result, expected, check_dtype=False)
+
+ @td.skip_if_no_scipy
+ def test_interp_alt_scipy(self):
+ df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
+ 'C': [1, 2, 3, 5, 8, 13, 21]})
+ result = df.interpolate(method='barycentric')
+ expected = df.copy()
+ expected.loc[2, 'A'] = 3
+ expected.loc[5, 'A'] = 6
+ assert_frame_equal(result, expected)
+
+ result = df.interpolate(method='barycentric', downcast='infer')
+ assert_frame_equal(result, expected.astype(np.int64))
+
+ result = df.interpolate(method='krogh')
+ expectedk = df.copy()
+ expectedk['A'] = expected['A']
+ assert_frame_equal(result, expectedk)
+
+ _skip_if_no_pchip()
+ import scipy
+ result = df.interpolate(method='pchip')
+ expected.loc[2, 'A'] = 3
+
+ if LooseVersion(scipy.__version__) >= LooseVersion('0.17.0'):
+ expected.loc[5, 'A'] = 6.0
+ else:
+ expected.loc[5, 'A'] = 6.125
+
+ assert_frame_equal(result, expected)
+
+ def test_interp_rowwise(self):
+ df = DataFrame({0: [1, 2, np.nan, 4],
+ 1: [2, 3, 4, np.nan],
+ 2: [np.nan, 4, 5, 6],
+ 3: [4, np.nan, 6, 7],
+ 4: [1, 2, 3, 4]})
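+        # axis=1 interpolates across columns within each row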
+ result = df.interpolate(axis=1)
+ expected = df.copy()
+ expected.loc[3, 1] = 5
+ expected.loc[0, 2] = 3
+ expected.loc[1, 3] = 3
+ expected[4] = expected[4].astype(np.float64)
+ assert_frame_equal(result, expected)
+
+ result = df.interpolate(axis=1, method='values')
+ assert_frame_equal(result, expected)
+
+ result = df.interpolate(axis=0)
+ expected = df.interpolate()
+ assert_frame_equal(result, expected)
+
+ def test_rowwise_alt(self):
+ df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64],
+ 1: [1, 2, 3, 4, 3, 2, 1, 0, -1]})
+ df.interpolate(axis=0)
+
+ @pytest.mark.parametrize("check_scipy", [
+ False, pytest.param(True, marks=td.skip_if_no_scipy)
+ ])
+ def test_interp_leading_nans(self, check_scipy):
+ df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
+ "B": [np.nan, -3, -3.5, np.nan, -4]})
+ result = df.interpolate()
+ expected = df.copy()
+        expected.loc[3, 'B'] = -3.75
+ assert_frame_equal(result, expected)
+
+ if check_scipy:
+ result = df.interpolate(method='polynomial', order=1)
+ assert_frame_equal(result, expected)
+
+ def test_interp_raise_on_only_mixed(self):
+ df = DataFrame({'A': [1, 2, np.nan, 4],
+ 'B': ['a', 'b', 'c', 'd'],
+ 'C': [np.nan, 2, 5, 7],
+ 'D': [np.nan, np.nan, 9, 9],
+ 'E': [1, 2, 3, 4]})
+ with pytest.raises(TypeError):
+ df.interpolate(axis=1)
+
+ def test_interp_raise_on_all_object_dtype(self):
+ # GH 22985
+ df = DataFrame({
+ 'A': [1, 2, 3],
+ 'B': [4, 5, 6]},
+ dtype='object')
+ msg = ("Cannot interpolate with all object-dtype columns "
+ "in the DataFrame. Try setting at least one "
+ "column to a numeric dtype.")
+ with pytest.raises(TypeError, match=msg):
+ df.interpolate()
+
+ def test_interp_inplace(self):
+ df = DataFrame({'a': [1., 2., np.nan, 4.]})
+ expected = DataFrame({'a': [1., 2., 3., 4.]})
+ result = df.copy()
+ result['a'].interpolate(inplace=True)
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result['a'].interpolate(inplace=True, downcast='infer')
+ assert_frame_equal(result, expected.astype('int64'))
+
+ def test_interp_inplace_row(self):
+ # GH 10395
+ result = DataFrame({'a': [1., 2., 3., 4.],
+ 'b': [np.nan, 2., 3., 4.],
+ 'c': [3, 2, 2, 2]})
+ expected = result.interpolate(method='linear', axis=1, inplace=False)
+ result.interpolate(method='linear', axis=1, inplace=True)
+ assert_frame_equal(result, expected)
+
+ def test_interp_ignore_all_good(self):
+ # GH
+ df = DataFrame({'A': [1, 2, np.nan, 4],
+ 'B': [1, 2, 3, 4],
+ 'C': [1., 2., np.nan, 4.],
+ 'D': [1., 2., 3., 4.]})
+ expected = DataFrame({'A': np.array(
+ [1, 2, 3, 4], dtype='float64'),
+ 'B': np.array(
+ [1, 2, 3, 4], dtype='int64'),
+ 'C': np.array(
+ [1., 2., 3, 4.], dtype='float64'),
+ 'D': np.array(
+ [1., 2., 3., 4.], dtype='float64')})
+
+ result = df.interpolate(downcast=None)
+ assert_frame_equal(result, expected)
+
+ # all good
+ result = df[['B', 'D']].interpolate(downcast=None)
+ assert_frame_equal(result, df[['B', 'D']])
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_mutate_columns.py b/contrib/python/pandas/py2/pandas/tests/frame/test_mutate_columns.py
new file mode 100644
index 00000000000..1f4da1bbb04
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_mutate_columns.py
@@ -0,0 +1,280 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY36, lrange, range
+
+from pandas import DataFrame, Index, MultiIndex, Series
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal
+
+# Column add, remove, delete.
+
+
+class TestDataFrameMutateColumns(TestData):
+
+ def test_assign(self):
+ df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+ original = df.copy()
+ result = df.assign(C=df.B / df.A)
+ expected = df.copy()
+ expected['C'] = [4, 2.5, 2]
+ assert_frame_equal(result, expected)
+
+ # lambda syntax
+ result = df.assign(C=lambda x: x.B / x.A)
+ assert_frame_equal(result, expected)
+
+ # original is unmodified
+ assert_frame_equal(df, original)
+
+ # Non-Series array-like
+ result = df.assign(C=[4, 2.5, 2])
+ assert_frame_equal(result, expected)
+ # original is unmodified
+ assert_frame_equal(df, original)
+
+ result = df.assign(B=df.B / df.A)
+ expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
+ assert_frame_equal(result, expected)
+
+ # overwrite
+ result = df.assign(A=df.A + df.B)
+ expected = df.copy()
+ expected['A'] = [5, 7, 9]
+ assert_frame_equal(result, expected)
+
+ # lambda
+ result = df.assign(A=lambda x: x.A + x.B)
+ assert_frame_equal(result, expected)
+
+ def test_assign_multiple(self):
+ df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B'])
+ result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
+ expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5],
+ [3, 6, 9, 3, 6]], columns=list('ABCDE'))
+ assert_frame_equal(result, expected)
+
+ def test_assign_order(self):
+ # GH 9818
+ df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
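+        # on Python 3.6+ keyword arguments keep their call order,
+        # so 'D' lands before 'C'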
+ result = df.assign(D=df.A + df.B, C=df.A - df.B)
+
+ if PY36:
+ expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]],
+ columns=list('ABDC'))
+ else:
+ expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
+ columns=list('ABCD'))
+ assert_frame_equal(result, expected)
+ result = df.assign(C=df.A - df.B, D=df.A + df.B)
+
+ expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
+ columns=list('ABCD'))
+
+ assert_frame_equal(result, expected)
+
+ def test_assign_bad(self):
+ df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+
+ # non-keyword argument
+ with pytest.raises(TypeError):
+ df.assign(lambda x: x.A)
+ with pytest.raises(AttributeError):
+ df.assign(C=df.A, D=df.A + df.C)
+
+    @pytest.mark.skipif(PY36, reason="Issue #14207: valid for "
+                                     "python 3.6 and above")
+ def test_assign_dependent_old_python(self):
+ df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+
+ # Key C does not exist at definition time of df
+ with pytest.raises(KeyError):
+ df.assign(C=lambda df: df.A,
+ D=lambda df: df['A'] + df['C'])
+ with pytest.raises(KeyError):
+ df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
+
+    @pytest.mark.skipif(not PY36, reason="Issue #14207: not valid for "
+                                         "python 3.5 and below")
+ def test_assign_dependent(self):
+ df = DataFrame({'A': [1, 2], 'B': [3, 4]})
+
+ result = df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
+ expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
+ columns=list('ABCD'))
+ assert_frame_equal(result, expected)
+
+ result = df.assign(C=lambda df: df.A,
+ D=lambda df: df['A'] + df['C'])
+ expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
+ columns=list('ABCD'))
+ assert_frame_equal(result, expected)
+
+    def test_insert_error_msgs(self):
+
+ # GH 7432
+        df = DataFrame({'foo': ['a', 'b', 'c'], 'bar': [1, 2, 3],
+                        'baz': ['d', 'e', 'f']}).set_index('foo')
+        s = DataFrame({'foo': ['a', 'b', 'c', 'a'],
+                       'fiz': ['g', 'h', 'i', 'j']}).set_index('foo')
+ msg = 'cannot reindex from a duplicate axis'
+ with pytest.raises(ValueError, match=msg):
+ df['newcol'] = s
+
+ # GH 4107, more descriptive error message
+ df = DataFrame(np.random.randint(0, 2, (4, 4)),
+ columns=['a', 'b', 'c', 'd'])
+
+ msg = 'incompatible index of inserted column with frame index'
+ with pytest.raises(TypeError, match=msg):
+ df['gr'] = df.groupby(['b', 'c']).count()
+
+ def test_insert_benchmark(self):
+ # from the vb_suite/frame_methods/frame_insert_columns
+ N = 10
+ K = 5
+ df = DataFrame(index=lrange(N))
+ new_col = np.random.randn(N)
+ for i in range(K):
+ df[i] = new_col
+ expected = DataFrame(np.repeat(new_col, K).reshape(N, K),
+ index=lrange(N))
+ assert_frame_equal(df, expected)
+
+ def test_insert(self):
+ df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
+ columns=['c', 'b', 'a'])
+
+ df.insert(0, 'foo', df['a'])
+ tm.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a']))
+ tm.assert_series_equal(df['a'], df['foo'], check_names=False)
+
+ df.insert(2, 'bar', df['c'])
+ tm.assert_index_equal(df.columns,
+ Index(['foo', 'c', 'bar', 'b', 'a']))
+ tm.assert_almost_equal(df['c'], df['bar'], check_names=False)
+
+ # diff dtype
+
+ # new item
+ df['x'] = df['a'].astype('float32')
+ result = Series(dict(float32=1, float64=5))
+ assert (df.get_dtype_counts().sort_index() == result).all()
+
+ # replacing current (in different block)
+ df['a'] = df['a'].astype('float32')
+ result = Series(dict(float32=2, float64=4))
+ assert (df.get_dtype_counts().sort_index() == result).all()
+
+ df['y'] = df['a'].astype('int32')
+ result = Series(dict(float32=2, float64=4, int32=1))
+ assert (df.get_dtype_counts().sort_index() == result).all()
+
+ with pytest.raises(ValueError, match='already exists'):
+ df.insert(1, 'a', df['b'])
+ pytest.raises(ValueError, df.insert, 1, 'c', df['b'])
+
+ df.columns.name = 'some_name'
+ # preserve columns name field
+ df.insert(0, 'baz', df['c'])
+ assert df.columns.name == 'some_name'
+
+ # GH 13522
+ df = DataFrame(index=['A', 'B', 'C'])
+ df['X'] = df.index
+ df['X'] = ['x', 'y', 'z']
+ exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
+ assert_frame_equal(df, exp)
+
+ def test_delitem(self):
+ del self.frame['A']
+ assert 'A' not in self.frame
+
+ def test_delitem_multiindex(self):
+ midx = MultiIndex.from_product([['A', 'B'], [1, 2]])
+ df = DataFrame(np.random.randn(4, 4), columns=midx)
+ assert len(df.columns) == 4
+ assert ('A', ) in df.columns
+ assert 'A' in df.columns
+
+ result = df['A']
+ assert isinstance(result, DataFrame)
+ del df['A']
+
+ assert len(df.columns) == 2
+
+ # A still in the levels, BUT get a KeyError if trying
+ # to delete
+ assert ('A', ) not in df.columns
+ with pytest.raises(KeyError):
+ del df[('A',)]
+
+ # behavior of dropped/deleted MultiIndex levels changed from
+ # GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
+ # levels which are dropped/deleted
+ assert 'A' not in df.columns
+ with pytest.raises(KeyError):
+ del df['A']
+
+ def test_pop(self):
+ self.frame.columns.name = 'baz'
+
+ self.frame.pop('A')
+ assert 'A' not in self.frame
+
+ self.frame['foo'] = 'bar'
+ self.frame.pop('foo')
+ assert 'foo' not in self.frame
+ assert self.frame.columns.name == 'baz'
+
+ # gh-10912: inplace ops cause caching issue
+ a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[
+ 'A', 'B', 'C'], index=['X', 'Y'])
+ b = a.pop('B')
+ b += 1
+
+ # original frame
+ expected = DataFrame([[1, 3], [4, 6]], columns=[
+ 'A', 'C'], index=['X', 'Y'])
+ tm.assert_frame_equal(a, expected)
+
+ # result
+ expected = Series([2, 5], index=['X', 'Y'], name='B') + 1
+ tm.assert_series_equal(b, expected)
+
+ def test_pop_non_unique_cols(self):
+ df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
+ df.columns = ["a", "b", "a"]
+
+ res = df.pop("a")
+ assert type(res) == DataFrame
+ assert len(res) == 2
+ assert len(df.columns) == 1
+ assert "b" in df.columns
+ assert "a" not in df.columns
+ assert len(df.index) == 2
+
+ def test_insert_column_bug_4032(self):
+
+ # GH4032, inserting a column and renaming causing errors
+ df = DataFrame({'b': [1.1, 2.2]})
+ df = df.rename(columns={})
+ df.insert(0, 'a', [1, 2])
+
+ result = df.rename(columns={})
+ str(result)
+ expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
+ assert_frame_equal(result, expected)
+ df.insert(0, 'c', [1.3, 2.3])
+
+ result = df.rename(columns={})
+ str(result)
+
+ expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
+ columns=['c', 'a', 'b'])
+ assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_nonunique_indexes.py b/contrib/python/pandas/py2/pandas/tests/frame/test_nonunique_indexes.py
new file mode 100644
index 00000000000..a5bed14cf06
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_nonunique_indexes.py
@@ -0,0 +1,477 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange, u
+
+import pandas as pd
+from pandas import DataFrame, MultiIndex, Series, date_range
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestDataFrameNonuniqueIndexes(TestData):
+
+ def test_column_dups_operations(self):
+
+ def check(result, expected=None):
+ if expected is not None:
+ assert_frame_equal(result, expected)
+ result.dtypes
+ str(result)
+
+ # assignment
+ # GH 3687
+ arr = np.random.randn(3, 2)
+ idx = lrange(2)
+ df = DataFrame(arr, columns=['A', 'A'])
+ df.columns = idx
+ expected = DataFrame(arr, columns=idx)
+ check(df, expected)
+
+ idx = date_range('20130101', periods=4, freq='Q-NOV')
+ df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
+ columns=['a', 'a', 'a', 'a'])
+ df.columns = idx
+ expected = DataFrame(
+ [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
+ check(df, expected)
+
+ # insert
+ df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
+ columns=['foo', 'bar', 'foo', 'hello'])
+ df['string'] = 'bah'
+ expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
+ [2, 1, 3, 5, 'bah']],
+ columns=['foo', 'bar', 'foo', 'hello', 'string'])
+ check(df, expected)
+ with pytest.raises(ValueError, match='Length of value'):
+ df.insert(0, 'AnotherColumn', range(len(df.index) - 1))
+
+ # insert same dtype
+ df['foo2'] = 3
+ expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
+ [2, 1, 3, 5, 'bah', 3]],
+ columns=['foo', 'bar', 'foo', 'hello',
+ 'string', 'foo2'])
+ check(df, expected)
+
+ # set (non-dup)
+ df['foo2'] = 4
+ expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
+ [2, 1, 3, 5, 'bah', 4]],
+ columns=['foo', 'bar', 'foo', 'hello',
+ 'string', 'foo2'])
+ check(df, expected)
+ df['foo2'] = 3
+
+ # delete (non dup)
+ del df['bar']
+ expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
+ [2, 3, 5, 'bah', 3]],
+ columns=['foo', 'foo', 'hello', 'string', 'foo2'])
+ check(df, expected)
+
+        # try to delete again (it's not consolidated)
+ del df['hello']
+ expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
+ [2, 3, 'bah', 3]],
+ columns=['foo', 'foo', 'string', 'foo2'])
+ check(df, expected)
+
+ # consolidate
+ df = df._consolidate()
+ expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
+ [2, 3, 'bah', 3]],
+ columns=['foo', 'foo', 'string', 'foo2'])
+ check(df, expected)
+
+ # insert
+ df.insert(2, 'new_col', 5.)
+ expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
+ [2, 3, 5., 'bah', 3]],
+ columns=['foo', 'foo', 'new_col', 'string',
+ 'foo2'])
+ check(df, expected)
+
+ # insert a dup
+ with pytest.raises(ValueError, match='cannot insert'):
+ df.insert(2, 'new_col', 4.)
+
+ df.insert(2, 'new_col', 4., allow_duplicates=True)
+ expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
+ [1, 2, 4., 5., 'bah', 3],
+ [2, 3, 4., 5., 'bah', 3]],
+ columns=['foo', 'foo', 'new_col',
+ 'new_col', 'string', 'foo2'])
+ check(df, expected)
+
+ # delete (dup)
+ del df['foo']
+ expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
+ [4., 5., 'bah', 3]],
+ columns=['new_col', 'new_col', 'string', 'foo2'])
+ assert_frame_equal(df, expected)
+
+ # dup across dtypes
+ df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
+ columns=['foo', 'bar', 'foo', 'hello'])
+ check(df)
+
+ df['foo2'] = 7.
+ expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
+ [2, 1, 3., 5, 7.]],
+ columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
+ check(df, expected)
+
+ result = df['foo']
+ expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
+ columns=['foo', 'foo'])
+ check(result, expected)
+
+ # multiple replacements
+ df['foo'] = 'string'
+ expected = DataFrame([['string', 1, 'string', 5, 7.],
+ ['string', 1, 'string', 5, 7.],
+ ['string', 1, 'string', 5, 7.]],
+ columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
+ check(df, expected)
+
+ del df['foo']
+ expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
+ 'bar', 'hello', 'foo2'])
+ check(df, expected)
+
+ # values
+ df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
+ result = df.values
+ expected = np.array([[1, 2.5], [3, 4.5]])
+ assert (result == expected).all().all()
+
+ # rename, GH 4403
+ df4 = DataFrame(
+ {'RT': [0.0454],
+ 'TClose': [22.02],
+ 'TExg': [0.0422]},
+ index=MultiIndex.from_tuples([(600809, 20130331)],
+ names=['STK_ID', 'RPT_Date']))
+
+ df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331],
+ 'STK_ID': [600809] * 3,
+ 'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
+ 'TClose': [38.05, 41.66, 30.01]},
+ index=MultiIndex.from_tuples(
+ [(600809, 20120930),
+ (600809, 20121231),
+ (600809, 20130331)],
+ names=['STK_ID', 'RPT_Date']))
+
+ k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
+ result = k.rename(
+ columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
+ str(result)
+ result.dtypes
+
+ expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
+ u('饡驦'), 30.01]],
+ columns=['RT', 'TClose', 'TExg',
+ 'RPT_Date', 'STK_ID', 'STK_Name',
+ 'QT_Close'])
+ .set_index(['STK_ID', 'RPT_Date'], drop=False))
+ assert_frame_equal(result, expected)
+
+ # reindex is invalid!
+ df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
+ columns=['bar', 'a', 'a'])
+ pytest.raises(ValueError, df.reindex, columns=['bar'])
+ pytest.raises(ValueError, df.reindex, columns=['bar', 'foo'])
+
+ # drop
+ df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
+ columns=['bar', 'a', 'a'])
+ result = df.drop(['a'], axis=1)
+ expected = DataFrame([[1], [1], [1]], columns=['bar'])
+ check(result, expected)
+ result = df.drop('a', axis=1)
+ check(result, expected)
+
+ # describe
+ df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
+ columns=['bar', 'a', 'a'], dtype='float64')
+ result = df.describe()
+ s = df.iloc[:, 0].describe()
+ expected = pd.concat([s, s, s], keys=df.columns, axis=1)
+ check(result, expected)
+
+ # check column dups with index equal and not equal to df's index
+ df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
+ columns=['A', 'B', 'A'])
+ for index in [df.index, pd.Index(list('edcba'))]:
+ this_df = df.copy()
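+            # the duplicate 'A' key collapses in the dict below;
+            # columns=['A', 'B', 'A'] restores the duplicate column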
+ expected_ser = pd.Series(index.values, index=this_df.index)
+ expected_df = DataFrame({'A': expected_ser,
+ 'B': this_df['B'],
+ 'A': expected_ser},
+ columns=['A', 'B', 'A'])
+ this_df['A'] = index
+ check(this_df, expected_df)
+
+ # operations
+ for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
+ df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
+ expected = getattr(df, op)(df)
+ expected.columns = ['A', 'A']
+ df.columns = ['A', 'A']
+ result = getattr(df, op)(df)
+ check(result, expected)
+
+ # multiple assignments that change dtypes
+ # the location indexer is a slice
+ # GH 6120
+ df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
+ expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])
+
+ df['that'] = 1.0
+ check(df, expected)
+
+ df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
+ expected = DataFrame(1, index=range(5), columns=['that', 'that'])
+
+ df['that'] = 1
+ check(df, expected)
+
+ def test_column_dups2(self):
+
+ # drop buggy GH 6240
+ df = DataFrame({'A': np.random.randn(5),
+ 'B': np.random.randn(5),
+ 'C': np.random.randn(5),
+ 'D': ['a', 'b', 'c', 'd', 'e']})
+
+ expected = df.take([0, 1, 1], axis=1)
+ df2 = df.take([2, 0, 1, 2, 1], axis=1)
+ result = df2.drop('C', axis=1)
+ assert_frame_equal(result, expected)
+
+ # dropna
+ df = DataFrame({'A': np.random.randn(5),
+ 'B': np.random.randn(5),
+ 'C': np.random.randn(5),
+ 'D': ['a', 'b', 'c', 'd', 'e']})
+ df.iloc[2, [0, 1, 2]] = np.nan
+ df.iloc[0, 0] = np.nan
+ df.iloc[1, 1] = np.nan
+ df.iloc[:, 3] = np.nan
+ expected = df.dropna(subset=['A', 'B', 'C'], how='all')
+ expected.columns = ['A', 'A', 'B', 'C']
+
+ df.columns = ['A', 'A', 'B', 'C']
+
+ result = df.dropna(subset=['A', 'C'], how='all')
+ assert_frame_equal(result, expected)
+
+ def test_column_dups_indexing(self):
+ def check(result, expected=None):
+ if expected is not None:
+ assert_frame_equal(result, expected)
+ result.dtypes
+ str(result)
+
+ # boolean indexing
+ # GH 4879
+ dups = ['A', 'A', 'C', 'D']
+ df = DataFrame(np.arange(12).reshape(3, 4), columns=[
+ 'A', 'B', 'C', 'D'], dtype='float64')
+ expected = df[df.C > 6]
+ expected.columns = dups
+ df = DataFrame(np.arange(12).reshape(3, 4),
+ columns=dups, dtype='float64')
+ result = df[df.C > 6]
+ check(result, expected)
+
+ # where
+ df = DataFrame(np.arange(12).reshape(3, 4), columns=[
+ 'A', 'B', 'C', 'D'], dtype='float64')
+ expected = df[df > 6]
+ expected.columns = dups
+ df = DataFrame(np.arange(12).reshape(3, 4),
+ columns=dups, dtype='float64')
+ result = df[df > 6]
+ check(result, expected)
+
+ # boolean with the duplicate raises
+ df = DataFrame(np.arange(12).reshape(3, 4),
+ columns=dups, dtype='float64')
+ pytest.raises(ValueError, lambda: df[df.A > 6])
+
+        # dup aligning operations should work
+ # GH 5185
+ df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
+ df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
+ expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
+ result = df1.sub(df2)
+ assert_frame_equal(result, expected)
+
+ # equality
+ df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
+ columns=['A', 'B'])
+ df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
+ columns=['A', 'A'])
+
+ # not-comparing like-labelled
+ pytest.raises(ValueError, lambda: df1 == df2)
+
+ df1r = df1.reindex_like(df2)
+ result = df1r == df2
+ expected = DataFrame([[False, True], [True, False], [False, False], [
+ True, False]], columns=['A', 'A'])
+ assert_frame_equal(result, expected)
+
+ # mixed column selection
+ # GH 5639
+ dfbool = DataFrame({'one': Series([True, True, False],
+ index=['a', 'b', 'c']),
+ 'two': Series([False, False, True, False],
+ index=['a', 'b', 'c', 'd']),
+ 'three': Series([False, True, True, True],
+ index=['a', 'b', 'c', 'd'])})
+ expected = pd.concat(
+ [dfbool['one'], dfbool['three'], dfbool['one']], axis=1)
+ result = dfbool[['one', 'three', 'one']]
+ check(result, expected)
+
+ # multi-axis dups
+ # GH 6121
+ df = DataFrame(np.arange(25.).reshape(5, 5),
+ index=['a', 'b', 'c', 'd', 'e'],
+ columns=['A', 'B', 'C', 'D', 'E'])
+ z = df[['A', 'C', 'A']].copy()
+ expected = z.loc[['a', 'c', 'a']]
+
+ df = DataFrame(np.arange(25.).reshape(5, 5),
+ index=['a', 'b', 'c', 'd', 'e'],
+ columns=['A', 'B', 'C', 'D', 'E'])
+ z = df[['A', 'C', 'A']]
+ result = z.loc[['a', 'c', 'a']]
+ check(result, expected)
+
+ def test_column_dups_indexing2(self):
+
+ # GH 8363
+ # datetime ops with a non-unique index
+ df = DataFrame({'A': np.arange(5, dtype='int64'),
+ 'B': np.arange(1, 6, dtype='int64')},
+ index=[2, 2, 3, 3, 4])
+ result = df.B - df.A
+ expected = Series(1, index=[2, 2, 3, 3, 4])
+ assert_series_equal(result, expected)
+
+ df = DataFrame({'A': date_range('20130101', periods=5),
+ 'B': date_range('20130101 09:00:00', periods=5)},
+ index=[2, 2, 3, 3, 4])
+ result = df.B - df.A
+ expected = Series(pd.Timedelta('9 hours'), index=[2, 2, 3, 3, 4])
+ assert_series_equal(result, expected)
+
+ def test_columns_with_dups(self):
+ # GH 3468 related
+
+ # basic
+ df = DataFrame([[1, 2]], columns=['a', 'a'])
+ df.columns = ['a', 'a.1']
+ str(df)
+ expected = DataFrame([[1, 2]], columns=['a', 'a.1'])
+ assert_frame_equal(df, expected)
+
+ df = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a'])
+ df.columns = ['b', 'a', 'a.1']
+ str(df)
+ expected = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a.1'])
+ assert_frame_equal(df, expected)
+
+ # with a dup index
+ df = DataFrame([[1, 2]], columns=['a', 'a'])
+ df.columns = ['b', 'b']
+ str(df)
+ expected = DataFrame([[1, 2]], columns=['b', 'b'])
+ assert_frame_equal(df, expected)
+
+ # multi-dtype
+ df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
+ columns=['a', 'a', 'b', 'b', 'd', 'c', 'c'])
+ df.columns = list('ABCDEFG')
+ str(df)
+ expected = DataFrame(
+ [[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('ABCDEFG'))
+ assert_frame_equal(df, expected)
+
+ # this is an error because we cannot disambiguate the dup columns
+        pytest.raises(Exception, lambda: DataFrame(
+            [[1, 2, 'foo', 'bar']], columns=['a', 'a', 'a', 'a']))
+
+ # dups across blocks
+ df_float = DataFrame(np.random.randn(10, 3), dtype='float64')
+ df_int = DataFrame(np.random.randn(10, 3), dtype='int64')
+ df_bool = DataFrame(True, index=df_float.index,
+ columns=df_float.columns)
+ df_object = DataFrame('foo', index=df_float.index,
+ columns=df_float.columns)
+ df_dt = DataFrame(pd.Timestamp('20010101'),
+ index=df_float.index,
+ columns=df_float.columns)
+ df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
+
+ assert len(df._data._blknos) == len(df.columns)
+ assert len(df._data._blklocs) == len(df.columns)
+
+ # testing iloc
+ for i in range(len(df.columns)):
+ df.iloc[:, i]
+
+ # dup columns across dtype GH 2079/2194
+ vals = [[1, -1, 2.], [2, -2, 3.]]
+ rs = DataFrame(vals, columns=['A', 'A', 'B'])
+ xp = DataFrame(vals)
+ xp.columns = ['A', 'A', 'B']
+ assert_frame_equal(rs, xp)
+
+ def test_values_duplicates(self):
+ df = DataFrame([[1, 2, 'a', 'b'],
+ [1, 2, 'a', 'b']],
+ columns=['one', 'one', 'two', 'two'])
+
+ result = df.values
+ expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']],
+ dtype=object)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_set_value_by_index(self):
+ # See gh-12344
+ df = DataFrame(np.arange(9).reshape(3, 3).T)
+ df.columns = list('AAA')
+ expected = df.iloc[:, 2]
+
+ df.iloc[:, 0] = 3
+ assert_series_equal(df.iloc[:, 2], expected)
+
+ df = DataFrame(np.arange(9).reshape(3, 3).T)
+ df.columns = [2, float(2), str(2)]
+ expected = df.iloc[:, 1]
+
+ df.iloc[:, 0] = 3
+ assert_series_equal(df.iloc[:, 1], expected)
+
+ def test_insert_with_columns_dups(self):
+ # GH 14291
+ df = pd.DataFrame()
+ df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True)
+ df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True)
+ df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True)
+ exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'],
+ ['c', 'f', 'i']], columns=['A', 'A', 'A'])
+ assert_frame_equal(df, exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_operators.py b/contrib/python/pandas/py2/pandas/tests/frame/test_operators.py
new file mode 100644
index 00000000000..e9521fa1506
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_operators.py
@@ -0,0 +1,802 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from decimal import Decimal
+import operator
+
+import numpy as np
+import pytest
+
+from pandas.compat import range
+
+import pandas as pd
+from pandas import DataFrame, MultiIndex, Series, compat
+import pandas.core.common as com
+from pandas.tests.frame.common import TestData, _check_mixed_float
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_frame_equal, assert_numpy_array_equal, assert_series_equal)
+
+
+class TestDataFrameUnaryOperators(object):
+ # __pos__, __neg__, __inv__
+
+ @pytest.mark.parametrize('df,expected', [
+ (pd.DataFrame({'a': [-1, 1]}), pd.DataFrame({'a': [1, -1]})),
+ (pd.DataFrame({'a': [False, True]}),
+ pd.DataFrame({'a': [True, False]})),
+ (pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}),
+ pd.DataFrame({'a': pd.Series(pd.to_timedelta([1, -1]))}))
+ ])
+ def test_neg_numeric(self, df, expected):
+ assert_frame_equal(-df, expected)
+ assert_series_equal(-df['a'], expected['a'])
+
+ @pytest.mark.parametrize('df, expected', [
+ (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)),
+ ([Decimal('1.0'), Decimal('2.0')], [Decimal('-1.0'), Decimal('-2.0')]),
+ ])
+ def test_neg_object(self, df, expected):
+ # GH#21380
+ df = pd.DataFrame({'a': df})
+ expected = pd.DataFrame({'a': expected})
+ assert_frame_equal(-df, expected)
+ assert_series_equal(-df['a'], expected['a'])
+
+ @pytest.mark.parametrize('df', [
+ pd.DataFrame({'a': ['a', 'b']}),
+ pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}),
+ ])
+ def test_neg_raises(self, df):
+ with pytest.raises(TypeError):
+ (- df)
+ with pytest.raises(TypeError):
+ (- df['a'])
+
+ def test_invert(self):
+ _seriesd = tm.getSeriesData()
+ df = pd.DataFrame(_seriesd)
+
+ assert_frame_equal(-(df < 0), ~(df < 0))
+
+ @pytest.mark.parametrize('df', [
+ pd.DataFrame({'a': [-1, 1]}),
+ pd.DataFrame({'a': [False, True]}),
+ pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}),
+ ])
+ def test_pos_numeric(self, df):
+ # GH#16073
+ assert_frame_equal(+df, df)
+ assert_series_equal(+df['a'], df['a'])
+
+ @pytest.mark.parametrize('df', [
+ # numpy changing behavior in the future
+ pytest.param(pd.DataFrame({'a': ['a', 'b']}),
+ marks=[pytest.mark.filterwarnings("ignore")]),
+ pd.DataFrame({'a': np.array([-1, 2], dtype=object)}),
+ pd.DataFrame({'a': [Decimal('-1.0'), Decimal('2.0')]}),
+ ])
+ def test_pos_object(self, df):
+ # GH#21380
+ assert_frame_equal(+df, df)
+ assert_series_equal(+df['a'], df['a'])
+
+ @pytest.mark.parametrize('df', [
+ pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}),
+ ])
+ def test_pos_raises(self, df):
+ with pytest.raises(TypeError):
+ (+ df)
+ with pytest.raises(TypeError):
+ (+ df['a'])
+
+
+class TestDataFrameLogicalOperators(object):
+ # &, |, ^
+
+ def test_logical_ops_empty_frame(self):
+ # GH#5808
+ # empty frames, non-mixed dtype
+ df = DataFrame(index=[1])
+
+ result = df & df
+ assert_frame_equal(result, df)
+
+ result = df | df
+ assert_frame_equal(result, df)
+
+ df2 = DataFrame(index=[1, 2])
+ result = df & df2
+ assert_frame_equal(result, df2)
+
+ dfa = DataFrame(index=[1], columns=['A'])
+
+ result = dfa & dfa
+ assert_frame_equal(result, dfa)
+
+ def test_logical_ops_bool_frame(self):
+ # GH#5808
+ df1a_bool = DataFrame(True, index=[1], columns=['A'])
+
+ result = df1a_bool & df1a_bool
+ assert_frame_equal(result, df1a_bool)
+
+ result = df1a_bool | df1a_bool
+ assert_frame_equal(result, df1a_bool)
+
+ def test_logical_ops_int_frame(self):
+ # GH#5808
+ df1a_int = DataFrame(1, index=[1], columns=['A'])
+ df1a_bool = DataFrame(True, index=[1], columns=['A'])
+
+ result = df1a_int | df1a_bool
+ assert_frame_equal(result, df1a_int)
+
+ def test_logical_ops_invalid(self):
+ # GH#5808
+
+ df1 = DataFrame(1.0, index=[1], columns=['A'])
+ df2 = DataFrame(True, index=[1], columns=['A'])
+ with pytest.raises(TypeError):
+ df1 | df2
+
+ df1 = DataFrame('foo', index=[1], columns=['A'])
+ df2 = DataFrame(True, index=[1], columns=['A'])
+ with pytest.raises(TypeError):
+ df1 | df2
+
+ def test_logical_operators(self):
+
+ def _check_bin_op(op):
+ result = op(df1, df2)
+ expected = DataFrame(op(df1.values, df2.values), index=df1.index,
+ columns=df1.columns)
+ assert result.values.dtype == np.bool_
+ assert_frame_equal(result, expected)
+
+ def _check_unary_op(op):
+ result = op(df1)
+ expected = DataFrame(op(df1.values), index=df1.index,
+ columns=df1.columns)
+ assert result.values.dtype == np.bool_
+ assert_frame_equal(result, expected)
+
+ df1 = {'a': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True},
+ 'b': {'a': False, 'b': True, 'c': False,
+ 'd': False, 'e': False},
+ 'c': {'a': False, 'b': False, 'c': True,
+ 'd': False, 'e': False},
+ 'd': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True},
+ 'e': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}}
+
+ df2 = {'a': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False},
+ 'b': {'a': False, 'b': True, 'c': False,
+ 'd': False, 'e': False},
+ 'c': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False},
+ 'd': {'a': False, 'b': False, 'c': False,
+ 'd': True, 'e': False},
+ 'e': {'a': False, 'b': False, 'c': False,
+ 'd': False, 'e': True}}
+
+ df1 = DataFrame(df1)
+ df2 = DataFrame(df2)
+
+ _check_bin_op(operator.and_)
+ _check_bin_op(operator.or_)
+ _check_bin_op(operator.xor)
+
+ _check_unary_op(operator.inv) # TODO: belongs elsewhere
+
+ def test_logical_with_nas(self):
+ d = DataFrame({'a': [np.nan, False], 'b': [True, True]})
+
+ # GH4947
+ # bool comparisons should return bool
+ result = d['a'] | d['b']
+ expected = Series([False, True])
+ assert_series_equal(result, expected)
+
+ # GH4604, automatic casting here
+ result = d['a'].fillna(False) | d['b']
+ expected = Series([True, True])
+ assert_series_equal(result, expected)
+
+ result = d['a'].fillna(False, downcast=False) | d['b']
+ expected = Series([True, True])
+ assert_series_equal(result, expected)
+
+
+class TestDataFrameOperators(TestData):
+
+ @pytest.mark.parametrize('op', [operator.add, operator.sub,
+ operator.mul, operator.truediv])
+ def test_operators_none_as_na(self, op):
+ df = DataFrame({"col1": [2, 5.0, 123, None],
+ "col2": [1, 2, 3, 4]}, dtype=object)
+
+ # since filling converts dtypes from object, changed expected to be
+ # object
+ filled = df.fillna(np.nan)
+ result = op(df, 3)
+ expected = op(filled, 3).astype(object)
+ expected[com.isna(expected)] = None
+ assert_frame_equal(result, expected)
+
+ result = op(df, df)
+ expected = op(filled, filled).astype(object)
+ expected[com.isna(expected)] = None
+ assert_frame_equal(result, expected)
+
+ result = op(df, df.fillna(7))
+ assert_frame_equal(result, expected)
+
+ result = op(df.fillna(7), df)
+ assert_frame_equal(result, expected, check_dtype=False)
+
+ @pytest.mark.parametrize('op,res', [('__eq__', False),
+ ('__ne__', True)])
+ # TODO: not sure what's correct here.
+ @pytest.mark.filterwarnings("ignore:elementwise:FutureWarning")
+ def test_logical_typeerror_with_non_valid(self, op, res):
+ # we are comparing floats vs a string
+ result = getattr(self.frame, op)('foo')
+ assert bool(result.all().all()) is res
+
+ def test_binary_ops_align(self):
+
+ # test aligning binary ops
+
+ # GH 6681
+ index = MultiIndex.from_product([list('abc'),
+ ['one', 'two', 'three'],
+ [1, 2, 3]],
+ names=['first', 'second', 'third'])
+
+ df = DataFrame(np.arange(27 * 3).reshape(27, 3),
+ index=index,
+ columns=['value1', 'value2', 'value3']).sort_index()
+
+ idx = pd.IndexSlice
+ for op in ['add', 'sub', 'mul', 'div', 'truediv']:
+ opa = getattr(operator, op, None)
+ if opa is None:
+ continue
+
+ x = Series([1.0, 10.0, 100.0], [1, 2, 3])
+ result = getattr(df, op)(x, level='third', axis=0)
+
+ expected = pd.concat([opa(df.loc[idx[:, :, i], :], v)
+ for i, v in x.iteritems()]).sort_index()
+ assert_frame_equal(result, expected)
+
+ x = Series([1.0, 10.0], ['two', 'three'])
+ result = getattr(df, op)(x, level='second', axis=0)
+
+ expected = (pd.concat([opa(df.loc[idx[:, i], :], v)
+ for i, v in x.iteritems()])
+ .reindex_like(df).sort_index())
+ assert_frame_equal(result, expected)
+
+ # GH9463 (alignment level of dataframe with series)
+
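+        # level-based ops align the Series with the matching level of
+        # the MultiIndex columns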
+ midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']])
+ df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx)
+ s = pd.Series({'a': 1, 'b': 2})
+
+ df2 = df.copy()
+ df2.columns.names = ['lvl0', 'lvl1']
+ s2 = s.copy()
+ s2.index.name = 'lvl1'
+
+ # different cases of integer/string level names:
+ res1 = df.mul(s, axis=1, level=1)
+ res2 = df.mul(s2, axis=1, level=1)
+ res3 = df2.mul(s, axis=1, level=1)
+ res4 = df2.mul(s2, axis=1, level=1)
+ res5 = df2.mul(s, axis=1, level='lvl1')
+ res6 = df2.mul(s2, axis=1, level='lvl1')
+
+ exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'),
+ columns=midx)
+
+ for res in [res1, res2]:
+ assert_frame_equal(res, exp)
+
+ exp.columns.names = ['lvl0', 'lvl1']
+ for res in [res3, res4, res5, res6]:
+ assert_frame_equal(res, exp)
+
+ def test_dti_tz_convert_to_utc(self):
+ base = pd.DatetimeIndex(['2011-01-01', '2011-01-02',
+ '2011-01-03'], tz='UTC')
+ idx1 = base.tz_convert('Asia/Tokyo')[:2]
+ idx2 = base.tz_convert('US/Eastern')[1:]
+
+ df1 = DataFrame({'A': [1, 2]}, index=idx1)
+ df2 = DataFrame({'A': [1, 1]}, index=idx2)
+ exp = DataFrame({'A': [np.nan, 3, np.nan]}, index=base)
+ assert_frame_equal(df1 + df2, exp)
+
+ def test_combineFrame(self):
+ frame_copy = self.frame.reindex(self.frame.index[::2])
+
+ del frame_copy['D']
+        frame_copy.loc[frame_copy.index[:5], 'C'] = np.nan
+
+ added = self.frame + frame_copy
+
+ indexer = added['A'].dropna().index
+ exp = (self.frame['A'] * 2).copy()
+
+ tm.assert_series_equal(added['A'].dropna(), exp.loc[indexer])
+
+ exp.loc[~exp.index.isin(indexer)] = np.nan
+ tm.assert_series_equal(added['A'], exp.loc[added['A'].index])
+
+ assert np.isnan(added['C'].reindex(frame_copy.index)[:5]).all()
+
+ assert np.isnan(added['D']).all()
+
+ self_added = self.frame + self.frame
+ tm.assert_index_equal(self_added.index, self.frame.index)
+
+ added_rev = frame_copy + self.frame
+ assert np.isnan(added['D']).all()
+ assert np.isnan(added_rev['D']).all()
+
+ # corner cases
+
+ # empty
+ plus_empty = self.frame + self.empty
+ assert np.isnan(plus_empty.values).all()
+
+ empty_plus = self.empty + self.frame
+ assert np.isnan(empty_plus.values).all()
+
+ empty_empty = self.empty + self.empty
+ assert empty_empty.empty
+
+ # out of order
+ reverse = self.frame.reindex(columns=self.frame.columns[::-1])
+
+ assert_frame_equal(reverse + self.frame, self.frame * 2)
+
+ # mix vs float64, upcast
+ added = self.frame + self.mixed_float
+ _check_mixed_float(added, dtype='float64')
+ added = self.mixed_float + self.frame
+ _check_mixed_float(added, dtype='float64')
+
+ # mix vs mix
+ added = self.mixed_float + self.mixed_float2
+ _check_mixed_float(added, dtype=dict(C=None))
+ added = self.mixed_float2 + self.mixed_float
+ _check_mixed_float(added, dtype=dict(C=None))
+
+ # with int
+ added = self.frame + self.mixed_int
+ _check_mixed_float(added, dtype='float64')
+
+ def test_combineSeries(self):
+
+ # Series
+ series = self.frame.xs(self.frame.index[0])
+
+ added = self.frame + series
+
+ for key, s in compat.iteritems(added):
+ assert_series_equal(s, self.frame[key] + series[key])
+
+ larger_series = series.to_dict()
+ larger_series['E'] = 1
+ larger_series = Series(larger_series)
+ larger_added = self.frame + larger_series
+
+ for key, s in compat.iteritems(self.frame):
+ assert_series_equal(larger_added[key], s + series[key])
+ assert 'E' in larger_added
+ assert np.isnan(larger_added['E']).all()
+
+ # no upcast needed
+ added = self.mixed_float + series
+ _check_mixed_float(added)
+
+ # vs mix (upcast) as needed
+ added = self.mixed_float + series.astype('float32')
+ _check_mixed_float(added, dtype=dict(C=None))
+ added = self.mixed_float + series.astype('float16')
+ _check_mixed_float(added, dtype=dict(C=None))
+
+        # vs int: these raise with numexpr, as we are adding an int64
+        # to a uint64
+
+ # added = self.mixed_int + (100*series).astype('int64')
+ # _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C =
+ # 'int64', D = 'int64'))
+ # added = self.mixed_int + (100*series).astype('int32')
+ # _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C =
+ # 'int32', D = 'int64'))
+
+ # TimeSeries
+ ts = self.tsframe['A']
+
+ # 10890
+ # we no longer allow auto timeseries broadcasting
+ # and require explicit broadcasting
+ added = self.tsframe.add(ts, axis='index')
+
+ for key, col in compat.iteritems(self.tsframe):
+ result = col + ts
+ assert_series_equal(added[key], result, check_names=False)
+ assert added[key].name == key
+ if col.name == ts.name:
+ assert result.name == 'A'
+ else:
+ assert result.name is None
+
+ smaller_frame = self.tsframe[:-5]
+ smaller_added = smaller_frame.add(ts, axis='index')
+
+ tm.assert_index_equal(smaller_added.index, self.tsframe.index)
+
+ smaller_ts = ts[:-5]
+ smaller_added2 = self.tsframe.add(smaller_ts, axis='index')
+ assert_frame_equal(smaller_added, smaller_added2)
+
+ # length 0, result is all-nan
+ result = self.tsframe.add(ts[:0], axis='index')
+ expected = DataFrame(np.nan, index=self.tsframe.index,
+ columns=self.tsframe.columns)
+ assert_frame_equal(result, expected)
+
+ # Frame is all-nan
+ result = self.tsframe[:0].add(ts, axis='index')
+ expected = DataFrame(np.nan, index=self.tsframe.index,
+ columns=self.tsframe.columns)
+ assert_frame_equal(result, expected)
+
+ # empty but with non-empty index
+ frame = self.tsframe[:1].reindex(columns=[])
+ result = frame.mul(ts, axis='index')
+ assert len(result) == len(ts)
+
+ def test_combineFunc(self):
+ result = self.frame * 2
+ tm.assert_numpy_array_equal(result.values, self.frame.values * 2)
+
+ # vs mix
+ result = self.mixed_float * 2
+ for c, s in compat.iteritems(result):
+ tm.assert_numpy_array_equal(
+ s.values, self.mixed_float[c].values * 2)
+ _check_mixed_float(result, dtype=dict(C=None))
+
+ result = self.empty * 2
+ assert result.index is self.empty.index
+ assert len(result.columns) == 0
+
+ def test_comparisons(self):
+ df1 = tm.makeTimeDataFrame()
+ df2 = tm.makeTimeDataFrame()
+
+ row = self.simple.xs('a')
+ ndim_5 = np.ones(df1.shape + (1, 1, 1))
+
+ def test_comp(func):
+ result = func(df1, df2)
+ tm.assert_numpy_array_equal(result.values,
+ func(df1.values, df2.values))
+
+ with pytest.raises(ValueError, match='dim must be <= 2'):
+ func(df1, ndim_5)
+
+ result2 = func(self.simple, row)
+ tm.assert_numpy_array_equal(result2.values,
+ func(self.simple.values, row.values))
+
+ result3 = func(self.frame, 0)
+ tm.assert_numpy_array_equal(result3.values,
+ func(self.frame.values, 0))
+
+ msg = 'Can only compare identically-labeled DataFrame'
+ with pytest.raises(ValueError, match=msg):
+ func(self.simple, self.simple[:2])
+
+ test_comp(operator.eq)
+ test_comp(operator.ne)
+ test_comp(operator.lt)
+ test_comp(operator.gt)
+ test_comp(operator.ge)
+ test_comp(operator.le)
+
+ def test_comparison_protected_from_errstate(self):
+ missing_df = tm.makeDataFrame()
+        missing_df.loc[missing_df.index[0], 'A'] = np.nan
+ with np.errstate(invalid='ignore'):
+ expected = missing_df.values < 0
+ with np.errstate(invalid='raise'):
+ result = (missing_df < 0).values
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_boolean_comparison(self):
+
+ # GH 4576
+ # boolean comparisons with a tuple/list give unexpected results
+ df = DataFrame(np.arange(6).reshape((3, 2)))
+ b = np.array([2, 2])
+ b_r = np.atleast_2d([2, 2])
+ b_c = b_r.T
+ lst = [2, 2, 2]
+ tup = tuple(lst)
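+        # length-2 1-D arrays broadcast against the (3, 2) frame;
+        # a length-3 list/tuple must raise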
+
+ # gt
+ expected = DataFrame([[False, False], [False, True], [True, True]])
+ result = df > b
+ assert_frame_equal(result, expected)
+
+ result = df.values > b
+ assert_numpy_array_equal(result, expected.values)
+
+ msg1d = 'Unable to coerce to Series, length must be 2: given 3'
+ msg2d = 'Unable to coerce to DataFrame, shape must be'
+ msg2db = 'operands could not be broadcast together with shapes'
+ with pytest.raises(ValueError, match=msg1d):
+ # wrong shape
+ df > lst
+
+ with pytest.raises(ValueError, match=msg1d):
+ # wrong shape
+ result = df > tup
+
+ # broadcasts like ndarray (GH#23000)
+ result = df > b_r
+ assert_frame_equal(result, expected)
+
+ result = df.values > b_r
+ assert_numpy_array_equal(result, expected.values)
+
+ with pytest.raises(ValueError, match=msg2d):
+ df > b_c
+
+ with pytest.raises(ValueError, match=msg2db):
+ df.values > b_c
+
+ # ==
+ expected = DataFrame([[False, False], [True, False], [False, False]])
+ result = df == b
+ assert_frame_equal(result, expected)
+
+ with pytest.raises(ValueError, match=msg1d):
+ result = df == lst
+
+ with pytest.raises(ValueError, match=msg1d):
+ result = df == tup
+
+ # broadcasts like ndarray (GH#23000)
+ result = df == b_r
+ assert_frame_equal(result, expected)
+
+ result = df.values == b_r
+ assert_numpy_array_equal(result, expected.values)
+
+ with pytest.raises(ValueError, match=msg2d):
+ df == b_c
+
+ assert df.values.shape != b_c.shape
+
+ # with alignment
+ df = DataFrame(np.arange(6).reshape((3, 2)),
+ columns=list('AB'), index=list('abc'))
+ expected.index = df.index
+ expected.columns = df.columns
+
+ with pytest.raises(ValueError, match=msg1d):
+ result = df == lst
+
+ with pytest.raises(ValueError, match=msg1d):
+ result = df == tup
+
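+    def test_boolean_comparison_broadcast_sketch(self):
+        # Editor's sketch, not part of upstream pandas: restates the
+        # GH#23000 rule above in isolation. A (1, 2) row vector
+        # broadcasts like an ndarray; a (3, 1) column vector is
+        # rejected rather than coerced.
+        df = DataFrame(np.arange(6).reshape((3, 2)))
+        row = np.atleast_2d([2, 2])
+        expected = DataFrame([[False, False], [False, True], [True, True]])
+        assert_frame_equal(df > row, expected)
+        with pytest.raises(ValueError,
+                           match='Unable to coerce to DataFrame'):
+            df > row.T
+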
+ def test_combine_generic(self):
+ df1 = self.frame
+ df2 = self.frame.loc[self.frame.index[:-5], ['A', 'B', 'C']]
+
+ combined = df1.combine(df2, np.add)
+ combined2 = df2.combine(df1, np.add)
+ assert combined['D'].isna().all()
+ assert combined2['D'].isna().all()
+
+ chunk = combined.loc[combined.index[:-5], ['A', 'B', 'C']]
+ chunk2 = combined2.loc[combined2.index[:-5], ['A', 'B', 'C']]
+
+ exp = self.frame.loc[self.frame.index[:-5],
+ ['A', 'B', 'C']].reindex_like(chunk) * 2
+ assert_frame_equal(chunk, exp)
+ assert_frame_equal(chunk2, exp)
+
+ def test_inplace_ops_alignment(self):
+
+ # inplace ops / ops alignment
+ # GH 8511
+
+ columns = list('abcdefg')
+ X_orig = DataFrame(np.arange(10 * len(columns))
+ .reshape(-1, len(columns)),
+ columns=columns, index=range(10))
+ Z = 100 * X_orig.iloc[:, 1:-1].copy()
+ block1 = list('bedcf')
+ subs = list('bcdef')
+
+ # add
+ X = X_orig.copy()
+ result1 = (X[block1] + Z).reindex(columns=subs)
+
+ X[block1] += Z
+ result2 = X.reindex(columns=subs)
+
+ X = X_orig.copy()
+ result3 = (X[block1] + Z[block1]).reindex(columns=subs)
+
+ X[block1] += Z[block1]
+ result4 = X.reindex(columns=subs)
+
+ assert_frame_equal(result1, result2)
+ assert_frame_equal(result1, result3)
+ assert_frame_equal(result1, result4)
+
+ # sub
+ X = X_orig.copy()
+ result1 = (X[block1] - Z).reindex(columns=subs)
+
+ X[block1] -= Z
+ result2 = X.reindex(columns=subs)
+
+ X = X_orig.copy()
+ result3 = (X[block1] - Z[block1]).reindex(columns=subs)
+
+ X[block1] -= Z[block1]
+ result4 = X.reindex(columns=subs)
+
+ assert_frame_equal(result1, result2)
+ assert_frame_equal(result1, result3)
+ assert_frame_equal(result1, result4)
+
+ def test_inplace_ops_identity(self):
+
+ # GH 5104
+ # make sure that we are actually changing the object
+ s_orig = Series([1, 2, 3])
+ df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))
+
+ # no dtype change
+ s = s_orig.copy()
+ s2 = s
+ s += 1
+ assert_series_equal(s, s2)
+ assert_series_equal(s_orig + 1, s)
+ assert s is s2
+ assert s._data is s2._data
+
+ df = df_orig.copy()
+ df2 = df
+ df += 1
+ assert_frame_equal(df, df2)
+ assert_frame_equal(df_orig + 1, df)
+ assert df is df2
+ assert df._data is df2._data
+
+ # dtype change
+ s = s_orig.copy()
+ s2 = s
+ s += 1.5
+ assert_series_equal(s, s2)
+ assert_series_equal(s_orig + 1.5, s)
+
+ df = df_orig.copy()
+ df2 = df
+ df += 1.5
+ assert_frame_equal(df, df2)
+ assert_frame_equal(df_orig + 1.5, df)
+ assert df is df2
+ assert df._data is df2._data
+
+ # mixed dtype
+ arr = np.random.randint(0, 10, size=5)
+ df_orig = DataFrame({'A': arr.copy(), 'B': 'foo'})
+ df = df_orig.copy()
+ df2 = df
+ df['A'] += 1
+ expected = DataFrame({'A': arr.copy() + 1, 'B': 'foo'})
+ assert_frame_equal(df, expected)
+ assert_frame_equal(df2, expected)
+ assert df._data is df2._data
+
+ df = df_orig.copy()
+ df2 = df
+ df['A'] += 1.5
+ expected = DataFrame({'A': arr.copy() + 1.5, 'B': 'foo'})
+ assert_frame_equal(df, expected)
+ assert_frame_equal(df2, expected)
+ assert df._data is df2._data
+
+ @pytest.mark.parametrize('op', ['add', 'and', 'div', 'floordiv', 'mod',
+ 'mul', 'or', 'pow', 'sub', 'truediv',
+ 'xor'])
+ def test_inplace_ops_identity2(self, op):
+
+        if compat.PY3 and op == 'div':
+            pytest.skip("'div' does not exist as an operator on PY3")
+
+ df = DataFrame({'a': [1., 2., 3.],
+ 'b': [1, 2, 3]})
+
+ operand = 2
+ if op in ('and', 'or', 'xor'):
+ # cannot use floats for boolean ops
+ df['a'] = [True, False, True]
+
+ df_copy = df.copy()
+ iop = '__i{}__'.format(op)
+ op = '__{}__'.format(op)
+
+        # inplace dunder must return the same object, with correct values
+        id_before = id(df)
+        df = getattr(df, iop)(operand)
+        expected = getattr(df_copy, op)(operand)
+        assert_frame_equal(df, expected)
+        assert id(df) == id_before
+
+ def test_alignment_non_pandas(self):
+ index = ['A', 'B', 'C']
+ columns = ['X', 'Y', 'Z']
+ df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns)
+
+ align = pd.core.ops._align_method_FRAME
+ for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64),
+ range(1, 4)]:
+
+ tm.assert_series_equal(align(df, val, 'index'),
+ Series([1, 2, 3], index=df.index))
+ tm.assert_series_equal(align(df, val, 'columns'),
+ Series([1, 2, 3], index=df.columns))
+
+ # length mismatch
+ msg = 'Unable to coerce to Series, length must be 3: given 2'
+ for val in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]:
+
+ with pytest.raises(ValueError, match=msg):
+ align(df, val, 'index')
+
+ with pytest.raises(ValueError, match=msg):
+ align(df, val, 'columns')
+
+ val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+ tm.assert_frame_equal(align(df, val, 'index'),
+ DataFrame(val, index=df.index,
+ columns=df.columns))
+ tm.assert_frame_equal(align(df, val, 'columns'),
+ DataFrame(val, index=df.index,
+ columns=df.columns))
+
+ # shape mismatch
+ msg = 'Unable to coerce to DataFrame, shape must be'
+ val = np.array([[1, 2, 3], [4, 5, 6]])
+ with pytest.raises(ValueError, match=msg):
+ align(df, val, 'index')
+
+ with pytest.raises(ValueError, match=msg):
+ align(df, val, 'columns')
+
+ val = np.zeros((3, 3, 3))
+ with pytest.raises(ValueError):
+ align(df, val, 'index')
+ with pytest.raises(ValueError):
+ align(df, val, 'columns')
+
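+    def test_alignment_non_pandas_flex_sketch(self):
+        # Editor's sketch (assumption: the private helper above backs
+        # the public flex-arithmetic API): a plain list is coerced to a
+        # Series along the requested axis before the operation.
+        df = pd.DataFrame(np.zeros((3, 3)), index=list('ABC'),
+                          columns=list('XYZ'))
+        result = df.add([1, 2, 3], axis='index')
+        expected = pd.DataFrame([[1.] * 3, [2.] * 3, [3.] * 3],
+                                index=list('ABC'), columns=list('XYZ'))
+        tm.assert_frame_equal(result, expected)
+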
+ def test_no_warning(self, all_arithmetic_operators):
+ df = pd.DataFrame({"A": [0., 0.], "B": [0., None]})
+ b = df['B']
+ with tm.assert_produces_warning(None):
+ getattr(df, all_arithmetic_operators)(b, 0)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_period.py b/contrib/python/pandas/py2/pandas/tests/frame/test_period.py
new file mode 100644
index 00000000000..8b37d4ff2cf
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_period.py
@@ -0,0 +1,147 @@
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+ DataFrame, DatetimeIndex, Index, PeriodIndex, Timedelta, date_range,
+ period_range, to_datetime)
+import pandas.util.testing as tm
+
+
+def _permute(obj):
+ return obj.take(np.random.permutation(len(obj)))
+
+
+class TestPeriodIndex(object):
+
+ def test_as_frame_columns(self):
+ rng = period_range('1/1/2000', periods=5)
+ df = DataFrame(np.random.randn(10, 5), columns=rng)
+
+ ts = df[rng[0]]
+ tm.assert_series_equal(ts, df.iloc[:, 0])
+
+ # GH # 1211
+ repr(df)
+
+ ts = df['1/1/2000']
+ tm.assert_series_equal(ts, df.iloc[:, 0])
+
+ def test_frame_setitem(self):
+ rng = period_range('1/1/2000', periods=5, name='index')
+ df = DataFrame(np.random.randn(5, 3), index=rng)
+
+ df['Index'] = rng
+ rs = Index(df['Index'])
+ tm.assert_index_equal(rs, rng, check_names=False)
+ assert rs.name == 'Index'
+ assert rng.name == 'index'
+
+ rs = df.reset_index().set_index('index')
+ assert isinstance(rs.index, PeriodIndex)
+ tm.assert_index_equal(rs.index, rng)
+
+ def test_frame_to_time_stamp(self):
+ K = 5
+ index = period_range(freq='A', start='1/1/2001', end='12/1/2009')
+ df = DataFrame(np.random.randn(len(index), K), index=index)
+ df['mix'] = 'a'
+
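+        # how='end' stamps each period at its last nanosecond, hence
+        # the "+ 1 day - 1 ns" adjustments to the expected indexes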
+ exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC')
+ exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns')
+ result = df.to_timestamp('D', 'end')
+ tm.assert_index_equal(result.index, exp_index)
+ tm.assert_numpy_array_equal(result.values, df.values)
+
+ exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN')
+ result = df.to_timestamp('D', 'start')
+ tm.assert_index_equal(result.index, exp_index)
+
+ def _get_with_delta(delta, freq='A-DEC'):
+ return date_range(to_datetime('1/1/2001') + delta,
+ to_datetime('12/31/2009') + delta, freq=freq)
+
+ delta = timedelta(hours=23)
+ result = df.to_timestamp('H', 'end')
+ exp_index = _get_with_delta(delta)
+ exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result.index, exp_index)
+
+ delta = timedelta(hours=23, minutes=59)
+ result = df.to_timestamp('T', 'end')
+ exp_index = _get_with_delta(delta)
+ exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result.index, exp_index)
+
+ result = df.to_timestamp('S', 'end')
+ delta = timedelta(hours=23, minutes=59, seconds=59)
+ exp_index = _get_with_delta(delta)
+ exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result.index, exp_index)
+
+ # columns
+ df = df.T
+
+ exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC')
+ exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns')
+ result = df.to_timestamp('D', 'end', axis=1)
+ tm.assert_index_equal(result.columns, exp_index)
+ tm.assert_numpy_array_equal(result.values, df.values)
+
+ exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN')
+ result = df.to_timestamp('D', 'start', axis=1)
+ tm.assert_index_equal(result.columns, exp_index)
+
+ delta = timedelta(hours=23)
+ result = df.to_timestamp('H', 'end', axis=1)
+ exp_index = _get_with_delta(delta)
+ exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result.columns, exp_index)
+
+ delta = timedelta(hours=23, minutes=59)
+ result = df.to_timestamp('T', 'end', axis=1)
+ exp_index = _get_with_delta(delta)
+ exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result.columns, exp_index)
+
+ result = df.to_timestamp('S', 'end', axis=1)
+ delta = timedelta(hours=23, minutes=59, seconds=59)
+ exp_index = _get_with_delta(delta)
+ exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result.columns, exp_index)
+
+ # invalid axis
+ with pytest.raises(ValueError, match='axis'):
+ df.to_timestamp(axis=2)
+
+ result1 = df.to_timestamp('5t', axis=1)
+ result2 = df.to_timestamp('t', axis=1)
+ expected = pd.date_range('2001-01-01', '2009-01-01', freq='AS')
+ assert isinstance(result1.columns, DatetimeIndex)
+ assert isinstance(result2.columns, DatetimeIndex)
+ tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8)
+ tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8)
+        # PeriodIndex.to_timestamp always uses 'infer'
+ assert result1.columns.freqstr == 'AS-JAN'
+ assert result2.columns.freqstr == 'AS-JAN'
+
+ def test_frame_index_to_string(self):
+ index = PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M')
+ frame = DataFrame(np.random.randn(3, 4), index=index)
+
+ # it works!
+ frame.to_string()
+
+ def test_align_frame(self):
+ rng = period_range('1/1/2000', '1/1/2010', freq='A')
+ ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
+
+ result = ts + ts[::2]
+ expected = ts + ts
+ expected.values[1::2] = np.nan
+ tm.assert_frame_equal(result, expected)
+
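+        # alignment is by label, so a permuted operand must give the
+        # same result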
+ result = ts + _permute(ts[::2])
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_quantile.py b/contrib/python/pandas/py2/pandas/tests/frame/test_quantile.py
new file mode 100644
index 00000000000..d1f1299a520
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_quantile.py
@@ -0,0 +1,384 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Series, Timestamp
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestDataFrameQuantile(TestData):
+
+ def test_quantile(self):
+ from numpy import percentile
+
+ q = self.tsframe.quantile(0.1, axis=0)
+ assert q['A'] == percentile(self.tsframe['A'], 10)
+ tm.assert_index_equal(q.index, self.tsframe.columns)
+
+ q = self.tsframe.quantile(0.9, axis=1)
+ assert (q['2000-01-17'] ==
+ percentile(self.tsframe.loc['2000-01-17'], 90))
+ tm.assert_index_equal(q.index, self.tsframe.index)
+
+ # test degenerate case
+ q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
+        assert np.isnan(q['x']) and np.isnan(q['y'])
+
+ # non-numeric exclusion
+ df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
+ rs = df.quantile(0.5)
+ xp = df.median().rename(0.5)
+ assert_series_equal(rs, xp)
+
+ # axis
+ df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
+ result = df.quantile(.5, axis=1)
+ expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
+ assert_series_equal(result, expected)
+
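+        # a list of quantiles returns a DataFrame: one row per
+        # quantile, columns labeled by the opposite axis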
+ result = df.quantile([.5, .75], axis=1)
+ expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75],
+ 3: [3.5, 3.75]}, index=[0.5, 0.75])
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ # We may want to break API in the future to change this
+ # so that we exclude non-numeric along the same axis
+ # See GH #7312
+ df = DataFrame([[1, 2, 3],
+ ['a', 'b', 4]])
+ result = df.quantile(.5, axis=1)
+ expected = Series([3., 4.], index=[0, 1], name=0.5)
+ assert_series_equal(result, expected)
+
+ def test_quantile_axis_mixed(self):
+
+ # mixed on axis=1
+ df = DataFrame({"A": [1, 2, 3],
+ "B": [2., 3., 4.],
+ "C": pd.date_range('20130101', periods=3),
+ "D": ['foo', 'bar', 'baz']})
+ result = df.quantile(.5, axis=1)
+ expected = Series([1.5, 2.5, 3.5], name=0.5)
+ assert_series_equal(result, expected)
+
+ # must raise
+ with pytest.raises(TypeError):
+ df.quantile(.5, axis=1, numeric_only=False)
+
+ def test_quantile_axis_parameter(self):
+ # GH 9543/9544
+
+ df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
+
+ result = df.quantile(.5, axis=0)
+
+ expected = Series([2., 3.], index=["A", "B"], name=0.5)
+ assert_series_equal(result, expected)
+
+ expected = df.quantile(.5, axis="index")
+ assert_series_equal(result, expected)
+
+ result = df.quantile(.5, axis=1)
+
+ expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
+ assert_series_equal(result, expected)
+
+ result = df.quantile(.5, axis="columns")
+ assert_series_equal(result, expected)
+
+        with pytest.raises(ValueError):
+            df.quantile(0.1, axis=-1)
+        with pytest.raises(ValueError):
+            df.quantile(0.1, axis="column")
+
+ def test_quantile_interpolation(self):
+ # see gh-10174
+ from numpy import percentile
+
+ # interpolation = linear (default case)
+ q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
+ assert q['A'] == percentile(self.tsframe['A'], 10)
+ q = self.intframe.quantile(0.1)
+ assert q['A'] == percentile(self.intframe['A'], 10)
+
+ # test with and without interpolation keyword
+ q1 = self.intframe.quantile(0.1)
+ assert q1['A'] == np.percentile(self.intframe['A'], 10)
+ tm.assert_series_equal(q, q1)
+
+ # interpolation method other than default linear
+ df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
+ result = df.quantile(.5, axis=1, interpolation='nearest')
+ expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
+ tm.assert_series_equal(result, expected)
+
+ # cross-check interpolation=nearest results in original dtype
+ exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5,
+ axis=0, interpolation='nearest')
+ expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64')
+ tm.assert_series_equal(result, expected)
+
+ # float
+ df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3])
+ result = df.quantile(.5, axis=1, interpolation='nearest')
+ expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5)
+ tm.assert_series_equal(result, expected)
+ exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5,
+ axis=0, interpolation='nearest')
+ expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64')
+ assert_series_equal(result, expected)
+
+ # axis
+ result = df.quantile([.5, .75], axis=1, interpolation='lower')
+ expected = DataFrame({1: [1., 1.], 2: [2., 2.],
+ 3: [3., 3.]}, index=[0.5, 0.75])
+ assert_frame_equal(result, expected)
+
+ # test degenerate case
+ df = DataFrame({'x': [], 'y': []})
+ q = df.quantile(0.1, axis=0, interpolation='higher')
+        assert np.isnan(q['x']) and np.isnan(q['y'])
+
+ # multi
+ df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
+ columns=['a', 'b', 'c'])
+ result = df.quantile([.25, .5], interpolation='midpoint')
+
+ # https://github.com/numpy/numpy/issues/7163
+ expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
+ index=[.25, .5], columns=['a', 'b', 'c'])
+ assert_frame_equal(result, expected)
+
+ def test_quantile_multi(self):
+ df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
+ columns=['a', 'b', 'c'])
+ result = df.quantile([.25, .5])
+ expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
+ index=[.25, .5], columns=['a', 'b', 'c'])
+ assert_frame_equal(result, expected)
+
+        # axis = 1
+        result = df.quantile([.25, .5], axis=1)
+        expected = DataFrame([[1., 2., 3.], [1., 2., 3.]],
+                             index=[.25, .5], columns=[0, 1, 2])
+        assert_frame_equal(result, expected)
+
+ # empty
+ result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
+ expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]},
+ index=[.1, .9])
+ assert_frame_equal(result, expected)
+
+ def test_quantile_datetime(self):
+ df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]})
+
+ # exclude datetime
+ result = df.quantile(.5)
+        expected = Series([2.5], index=['b'], name=0.5)
+        assert_series_equal(result, expected)
+
+ # datetime
+ result = df.quantile(.5, numeric_only=False)
+ expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5],
+ index=['a', 'b'],
+ name=0.5)
+ assert_series_equal(result, expected)
+
+ # datetime w/ multi
+ result = df.quantile([.5], numeric_only=False)
+ expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]],
+ index=[.5], columns=['a', 'b'])
+ assert_frame_equal(result, expected)
+
+ # axis = 1
+ df['c'] = pd.to_datetime(['2011', '2012'])
+ result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False)
+ expected = Series([Timestamp('2010-07-02 12:00:00'),
+ Timestamp('2011-07-02 12:00:00')],
+ index=[0, 1],
+ name=0.5)
+ assert_series_equal(result, expected)
+
+ result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False)
+ expected = DataFrame([[Timestamp('2010-07-02 12:00:00'),
+ Timestamp('2011-07-02 12:00:00')]],
+ index=[0.5], columns=[0, 1])
+ assert_frame_equal(result, expected)
+
+ # empty when numeric_only=True
+ # FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
+ # result = df[['a', 'c']].quantile(.5)
+ # result = df[['a', 'c']].quantile([.5])
+
+ def test_quantile_invalid(self):
+ msg = 'percentiles should all be in the interval \\[0, 1\\]'
+ for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
+ with pytest.raises(ValueError, match=msg):
+ self.tsframe.quantile(invalid)
+
+ def test_quantile_box(self):
+ df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-03')],
+ 'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.Timestamp('2011-01-03', tz='US/Eastern')],
+ 'C': [pd.Timedelta('1 days'),
+ pd.Timedelta('2 days'),
+ pd.Timedelta('3 days')]})
+
+ res = df.quantile(0.5, numeric_only=False)
+
+ exp = pd.Series([pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.Timedelta('2 days')],
+ name=0.5, index=['A', 'B', 'C'])
+ tm.assert_series_equal(res, exp)
+
+ res = df.quantile([0.5], numeric_only=False)
+ exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.Timedelta('2 days')]],
+ index=[0.5], columns=['A', 'B', 'C'])
+ tm.assert_frame_equal(res, exp)
+
+        # DatetimeBlocks may be consolidated, so NaT can appear at
+        # different locations within a block
+ df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
+ pd.NaT,
+ pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-03')],
+ 'a': [pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-01-02'),
+ pd.NaT,
+ pd.Timestamp('2011-01-03')],
+ 'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
+ pd.NaT,
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.Timestamp('2011-01-03', tz='US/Eastern')],
+ 'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.NaT,
+ pd.Timestamp('2011-01-03', tz='US/Eastern')],
+ 'C': [pd.Timedelta('1 days'),
+ pd.Timedelta('2 days'),
+ pd.Timedelta('3 days'),
+ pd.NaT],
+ 'c': [pd.NaT,
+ pd.Timedelta('1 days'),
+ pd.Timedelta('2 days'),
+ pd.Timedelta('3 days')]},
+ columns=list('AaBbCc'))
+
+ res = df.quantile(0.5, numeric_only=False)
+ exp = pd.Series([pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.Timedelta('2 days'),
+ pd.Timedelta('2 days')],
+ name=0.5, index=list('AaBbCc'))
+ tm.assert_series_equal(res, exp)
+
+ res = df.quantile([0.5], numeric_only=False)
+ exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.Timedelta('2 days'),
+ pd.Timedelta('2 days')]],
+ index=[0.5], columns=list('AaBbCc'))
+ tm.assert_frame_equal(res, exp)
+
+ def test_quantile_nan(self):
+
+ # GH 14357 - float block where some cols have missing values
+ df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
+ df.iloc[-1, 1] = np.nan
+
+ res = df.quantile(0.5)
+ exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5)
+ tm.assert_series_equal(res, exp)
+
+ res = df.quantile([0.5, 0.75])
+ exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75])
+ tm.assert_frame_equal(res, exp)
+
+ res = df.quantile(0.5, axis=1)
+ exp = Series(np.arange(1.0, 6.0), name=0.5)
+ tm.assert_series_equal(res, exp)
+
+ res = df.quantile([0.5, 0.75], axis=1)
+ exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
+ tm.assert_frame_equal(res, exp)
+
+ # full-nan column
+ df['b'] = np.nan
+
+ res = df.quantile(0.5)
+ exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5)
+ tm.assert_series_equal(res, exp)
+
+ res = df.quantile([0.5, 0.75])
+ exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]},
+ index=[0.5, 0.75])
+ tm.assert_frame_equal(res, exp)
+
+ def test_quantile_nat(self):
+
+ # full NaT column
+ df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]})
+
+ res = df.quantile(0.5, numeric_only=False)
+ exp = Series([pd.NaT], index=['a'], name=0.5)
+ tm.assert_series_equal(res, exp)
+
+ res = df.quantile([0.5], numeric_only=False)
+ exp = DataFrame({'a': [pd.NaT]}, index=[0.5])
+ tm.assert_frame_equal(res, exp)
+
+ # mixed non-null / full null column
+ df = DataFrame({'a': [pd.Timestamp('2012-01-01'),
+ pd.Timestamp('2012-01-02'),
+ pd.Timestamp('2012-01-03')],
+ 'b': [pd.NaT, pd.NaT, pd.NaT]})
+
+ res = df.quantile(0.5, numeric_only=False)
+ exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'],
+ name=0.5)
+ tm.assert_series_equal(res, exp)
+
+ res = df.quantile([0.5], numeric_only=False)
+ exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5],
+ columns=['a', 'b'])
+ tm.assert_frame_equal(res, exp)
+
+ def test_quantile_empty(self):
+
+ # floats
+ df = DataFrame(columns=['a', 'b'], dtype='float64')
+
+ res = df.quantile(0.5)
+ exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5)
+ tm.assert_series_equal(res, exp)
+
+ res = df.quantile([0.5])
+ exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5])
+ tm.assert_frame_equal(res, exp)
+
+ # FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
+ # res = df.quantile(0.5, axis=1)
+ # res = df.quantile([0.5], axis=1)
+
+ # ints
+ df = DataFrame(columns=['a', 'b'], dtype='int64')
+
+ # FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
+ # res = df.quantile(0.5)
+
+ # datetimes
+ df = DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
+
+ # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
+ # res = df.quantile(0.5, numeric_only=False)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_query_eval.py b/contrib/python/pandas/py2/pandas/tests/frame/test_query_eval.py
new file mode 100644
index 00000000000..9c4d306ea57
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_query_eval.py
@@ -0,0 +1,1040 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import operator
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO, lrange, range, zip
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series, date_range
+from pandas.core.computation.check import _NUMEXPR_INSTALLED
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_frame_equal, assert_series_equal, makeCustomDataframe as mkdf)
+
+PARSERS = 'python', 'pandas'
+ENGINES = 'python', pytest.param('numexpr', marks=td.skip_if_no_ne)
+
+
[email protected](params=PARSERS, ids=lambda x: x)
+def parser(request):
+ return request.param
+
+
[email protected](params=ENGINES, ids=lambda x: x)
+def engine(request):
+ return request.param
+
+
+def skip_if_no_pandas_parser(parser):
+ if parser != 'pandas':
+ pytest.skip("cannot evaluate with parser {0!r}".format(parser))
+
+
+class TestCompat(object):
+
+ def setup_method(self, method):
+ self.df = DataFrame({'A': [1, 2, 3]})
+ self.expected1 = self.df[self.df.A > 0]
+ self.expected2 = self.df.A + 1
+
+ def test_query_default(self):
+
+ # GH 12749
+ # this should always work, whether _NUMEXPR_INSTALLED or not
+ df = self.df
+ result = df.query('A>0')
+ assert_frame_equal(result, self.expected1)
+ result = df.eval('A+1')
+ assert_series_equal(result, self.expected2, check_names=False)
+
+ def test_query_None(self):
+
+ df = self.df
+ result = df.query('A>0', engine=None)
+ assert_frame_equal(result, self.expected1)
+ result = df.eval('A+1', engine=None)
+ assert_series_equal(result, self.expected2, check_names=False)
+
+ def test_query_python(self):
+
+ df = self.df
+ result = df.query('A>0', engine='python')
+ assert_frame_equal(result, self.expected1)
+ result = df.eval('A+1', engine='python')
+ assert_series_equal(result, self.expected2, check_names=False)
+
+ def test_query_numexpr(self):
+
+ df = self.df
+ if _NUMEXPR_INSTALLED:
+ result = df.query('A>0', engine='numexpr')
+ assert_frame_equal(result, self.expected1)
+ result = df.eval('A+1', engine='numexpr')
+ assert_series_equal(result, self.expected2, check_names=False)
+ else:
+            with pytest.raises(ImportError):
+                df.query('A>0', engine='numexpr')
+            with pytest.raises(ImportError):
+                df.eval('A+1', engine='numexpr')
+
+
+class TestDataFrameEval(TestData):
+
+ def test_ops(self):
+
+        # test ops and reversed ops in evaluation
+ # GH7198
+
+ # smaller hits python, larger hits numexpr
+ for n in [4, 4000]:
+
+ df = DataFrame(1, index=range(n), columns=list('abcd'))
+ df.iloc[0] = 2
+ m = df.mean()
+
+ for op_str, op, rop in [('+', '__add__', '__radd__'),
+ ('-', '__sub__', '__rsub__'),
+ ('*', '__mul__', '__rmul__'),
+ ('/', '__truediv__', '__rtruediv__')]:
+
+ base = (DataFrame(np.tile(m.values, n) # noqa
+ .reshape(n, -1),
+ columns=list('abcd')))
+
+ expected = eval("base{op}df".format(op=op_str))
+
+ # ops as strings
+ result = eval("m{op}df".format(op=op_str))
+ assert_frame_equal(result, expected)
+
+                # these are commutative
+                if op_str in ['+', '*']:
+                    result = getattr(df, op)(m)
+                    assert_frame_equal(result, expected)
+
+                # these are not
+                elif op_str in ['-', '/']:
+                    result = getattr(df, rop)(m)
+                    assert_frame_equal(result, expected)
+
+ # GH7192
+ df = DataFrame(dict(A=np.random.randn(25000)))
+ df.iloc[0:5] = np.nan
+ expected = (1 - np.isnan(df.iloc[0:25]))
+ result = (1 - np.isnan(df)).iloc[0:25]
+ assert_frame_equal(result, expected)
+
+ def test_query_non_str(self):
+ # GH 11485
+ df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'b']})
+
+ msg = "expr must be a string to be evaluated"
+ with pytest.raises(ValueError, match=msg):
+ df.query(lambda x: x.B == "b")
+
+ with pytest.raises(ValueError, match=msg):
+ df.query(111)
+
+ def test_query_empty_string(self):
+ # GH 13139
+ df = pd.DataFrame({'A': [1, 2, 3]})
+
+ msg = "expr cannot be an empty string"
+ with pytest.raises(ValueError, match=msg):
+ df.query('')
+
+ def test_eval_resolvers_as_list(self):
+ # GH 14095
+ df = DataFrame(np.random.randn(10, 2), columns=list('ab'))
+ dict1 = {'a': 1}
+ dict2 = {'b': 2}
+ assert (df.eval('a + b', resolvers=[dict1, dict2]) ==
+ dict1['a'] + dict2['b'])
+ assert (pd.eval('a + b', resolvers=[dict1, dict2]) ==
+ dict1['a'] + dict2['b'])
+
+
+class TestDataFrameQueryWithMultiIndex(object):
+
+ def test_query_with_named_multiindex(self, parser, engine):
+ skip_if_no_pandas_parser(parser)
+ a = np.random.choice(['red', 'green'], size=10)
+ b = np.random.choice(['eggs', 'ham'], size=10)
+ index = MultiIndex.from_arrays([a, b], names=['color', 'food'])
+ df = DataFrame(np.random.randn(10, 2), index=index)
+ ind = Series(df.index.get_level_values('color').values, index=index,
+ name='color')
+
+ # equality
+ res1 = df.query('color == "red"', parser=parser, engine=engine)
+ res2 = df.query('"red" == color', parser=parser, engine=engine)
+ exp = df[ind == 'red']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # inequality
+ res1 = df.query('color != "red"', parser=parser, engine=engine)
+ res2 = df.query('"red" != color', parser=parser, engine=engine)
+ exp = df[ind != 'red']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # list equality (really just set membership)
+ res1 = df.query('color == ["red"]', parser=parser, engine=engine)
+ res2 = df.query('["red"] == color', parser=parser, engine=engine)
+ exp = df[ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('color != ["red"]', parser=parser, engine=engine)
+ res2 = df.query('["red"] != color', parser=parser, engine=engine)
+ exp = df[~ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # in/not in ops
+ res1 = df.query('["red"] in color', parser=parser, engine=engine)
+ res2 = df.query('"red" in color', parser=parser, engine=engine)
+ exp = df[ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('["red"] not in color', parser=parser, engine=engine)
+ res2 = df.query('"red" not in color', parser=parser, engine=engine)
+ exp = df[~ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ def test_query_with_unnamed_multiindex(self, parser, engine):
+ skip_if_no_pandas_parser(parser)
+ a = np.random.choice(['red', 'green'], size=10)
+ b = np.random.choice(['eggs', 'ham'], size=10)
+ index = MultiIndex.from_arrays([a, b])
+ df = DataFrame(np.random.randn(10, 2), index=index)
+ ind = Series(df.index.get_level_values(0).values, index=index)
+
+ res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
+ res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine)
+ exp = df[ind == 'red']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # inequality
+ res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
+ res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine)
+ exp = df[ind != 'red']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # list equality (really just set membership)
+ res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine)
+ res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine)
+ exp = df[ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine)
+ res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine)
+ exp = df[~ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # in/not in ops
+ res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine)
+ res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine)
+ exp = df[ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('["red"] not in ilevel_0', parser=parser,
+ engine=engine)
+ res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine)
+ exp = df[~ind.isin(['red'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # ## LEVEL 1
+ ind = Series(df.index.get_level_values(1).values, index=index)
+ res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine)
+ res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine)
+ exp = df[ind == 'eggs']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # inequality
+ res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine)
+ res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine)
+ exp = df[ind != 'eggs']
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # list equality (really just set membership)
+ res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine)
+ res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine)
+ exp = df[ind.isin(['eggs'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine)
+ res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine)
+ exp = df[~ind.isin(['eggs'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ # in/not in ops
+ res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine)
+ res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine)
+ exp = df[ind.isin(['eggs'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ res1 = df.query('["eggs"] not in ilevel_1', parser=parser,
+ engine=engine)
+ res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine)
+ exp = df[~ind.isin(['eggs'])]
+ assert_frame_equal(res1, exp)
+ assert_frame_equal(res2, exp)
+
+ def test_query_with_partially_named_multiindex(self, parser, engine):
+ skip_if_no_pandas_parser(parser)
+ a = np.random.choice(['red', 'green'], size=10)
+ b = np.arange(10)
+ index = MultiIndex.from_arrays([a, b])
+ index.names = [None, 'rating']
+ df = DataFrame(np.random.randn(10, 2), index=index)
+ res = df.query('rating == 1', parser=parser, engine=engine)
+ ind = Series(df.index.get_level_values('rating').values, index=index,
+ name='rating')
+ exp = df[ind == 1]
+ assert_frame_equal(res, exp)
+
+ res = df.query('rating != 1', parser=parser, engine=engine)
+ ind = Series(df.index.get_level_values('rating').values, index=index,
+ name='rating')
+ exp = df[ind != 1]
+ assert_frame_equal(res, exp)
+
+ res = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
+ ind = Series(df.index.get_level_values(0).values, index=index)
+ exp = df[ind == "red"]
+ assert_frame_equal(res, exp)
+
+ res = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
+ ind = Series(df.index.get_level_values(0).values, index=index)
+ exp = df[ind != "red"]
+ assert_frame_equal(res, exp)
+
+ def test_query_multiindex_get_index_resolvers(self):
+ df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=['spam', 'eggs'])
+ resolvers = df._get_index_resolvers()
+
+ def to_series(mi, level):
+ level_values = mi.get_level_values(level)
+ s = level_values.to_series()
+ s.index = mi
+ return s
+
+ col_series = df.columns.to_series()
+ expected = {'index': df.index,
+ 'columns': col_series,
+ 'spam': to_series(df.index, 'spam'),
+ 'eggs': to_series(df.index, 'eggs'),
+ 'C0': col_series}
+ for k, v in resolvers.items():
+ if isinstance(v, Index):
+ assert v.is_(expected[k])
+ elif isinstance(v, Series):
+ assert_series_equal(v, expected[k])
+ else:
+ raise AssertionError("object must be a Series or Index")
+
+ @pytest.mark.filterwarnings("ignore::FutureWarning")
+ def test_raise_on_panel_with_multiindex(self, parser, engine):
+ p = tm.makePanel(7)
+ p.items = tm.makeCustomIndex(len(p.items), nlevels=2)
+ with pytest.raises(NotImplementedError):
+ pd.eval('p + 1', parser=parser, engine=engine)
+
+
+class TestDataFrameQueryNumExprPandas(object):
+
+ @classmethod
+ def setup_class(cls):
+ cls.engine = 'numexpr'
+ cls.parser = 'pandas'
+
+ @classmethod
+ def teardown_class(cls):
+ del cls.engine, cls.parser
+
+ def test_date_query_with_attribute_access(self):
+ engine, parser = self.engine, self.parser
+ skip_if_no_pandas_parser(parser)
+ df = DataFrame(np.random.randn(5, 3))
+ df['dates1'] = date_range('1/1/2012', periods=5)
+ df['dates2'] = date_range('1/1/2013', periods=5)
+ df['dates3'] = date_range('1/1/2014', periods=5)
+ res = df.query('@df.dates1 < 20130101 < @df.dates3', engine=engine,
+ parser=parser)
+ expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_date_query_no_attribute_access(self):
+ engine, parser = self.engine, self.parser
+ df = DataFrame(np.random.randn(5, 3))
+ df['dates1'] = date_range('1/1/2012', periods=5)
+ df['dates2'] = date_range('1/1/2013', periods=5)
+ df['dates3'] = date_range('1/1/2014', periods=5)
+ res = df.query('dates1 < 20130101 < dates3', engine=engine,
+ parser=parser)
+ expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_date_query_with_NaT(self):
+ engine, parser = self.engine, self.parser
+ n = 10
+ df = DataFrame(np.random.randn(n, 3))
+ df['dates1'] = date_range('1/1/2012', periods=n)
+ df['dates2'] = date_range('1/1/2013', periods=n)
+ df['dates3'] = date_range('1/1/2014', periods=n)
+ df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
+ df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT
+ res = df.query('dates1 < 20130101 < dates3', engine=engine,
+ parser=parser)
+ expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_date_index_query(self):
+ engine, parser = self.engine, self.parser
+ n = 10
+ df = DataFrame(np.random.randn(n, 3))
+ df['dates1'] = date_range('1/1/2012', periods=n)
+ df['dates3'] = date_range('1/1/2014', periods=n)
+ df.set_index('dates1', inplace=True, drop=True)
+ res = df.query('index < 20130101 < dates3', engine=engine,
+ parser=parser)
+ expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_date_index_query_with_NaT(self):
+ engine, parser = self.engine, self.parser
+ n = 10
+ df = DataFrame(np.random.randn(n, 3))
+ df['dates1'] = date_range('1/1/2012', periods=n)
+ df['dates3'] = date_range('1/1/2014', periods=n)
+ df.iloc[0, 0] = pd.NaT
+ df.set_index('dates1', inplace=True, drop=True)
+ res = df.query('index < 20130101 < dates3', engine=engine,
+ parser=parser)
+ expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_date_index_query_with_NaT_duplicates(self):
+ engine, parser = self.engine, self.parser
+ n = 10
+ d = {}
+ d['dates1'] = date_range('1/1/2012', periods=n)
+ d['dates3'] = date_range('1/1/2014', periods=n)
+ df = DataFrame(d)
+ df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
+ df.set_index('dates1', inplace=True, drop=True)
+ res = df.query('dates1 < 20130101 < dates3', engine=engine,
+ parser=parser)
+ expec = df[(df.index.to_series() < '20130101') &
+ ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_date_query_with_non_date(self):
+ engine, parser = self.engine, self.parser
+
+ n = 10
+ df = DataFrame({'dates': date_range('1/1/2012', periods=n),
+ 'nondate': np.arange(n)})
+
+ result = df.query('dates == nondate', parser=parser, engine=engine)
+ assert len(result) == 0
+
+ result = df.query('dates != nondate', parser=parser, engine=engine)
+ assert_frame_equal(result, df)
+
+ for op in ['<', '>', '<=', '>=']:
+ with pytest.raises(TypeError):
+ df.query('dates %s nondate' % op, parser=parser, engine=engine)
+
+ def test_query_syntax_error(self):
+ engine, parser = self.engine, self.parser
+ df = DataFrame({"i": lrange(10), "+": lrange(3, 13),
+ "r": lrange(4, 14)})
+ with pytest.raises(SyntaxError):
+ df.query('i - +', engine=engine, parser=parser)
+
+ def test_query_scope(self):
+ from pandas.core.computation.ops import UndefinedVariableError
+ engine, parser = self.engine, self.parser
+ skip_if_no_pandas_parser(parser)
+
+ df = DataFrame(np.random.randn(20, 2), columns=list('ab'))
+
+ a, b = 1, 2 # noqa
+ res = df.query('a > b', engine=engine, parser=parser)
+ expected = df[df.a > df.b]
+ assert_frame_equal(res, expected)
+
+ res = df.query('@a > b', engine=engine, parser=parser)
+ expected = df[a > df.b]
+ assert_frame_equal(res, expected)
+
+ # no local variable c
+ with pytest.raises(UndefinedVariableError):
+ df.query('@a > b > @c', engine=engine, parser=parser)
+
+ # no column named 'c'
+ with pytest.raises(UndefinedVariableError):
+ df.query('@a > b > c', engine=engine, parser=parser)
+
+ def test_query_doesnt_pickup_local(self):
+ from pandas.core.computation.ops import UndefinedVariableError
+
+ engine, parser = self.engine, self.parser
+ n = m = 10
+ df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc'))
+
+ # we don't pick up the local 'sin'
+ with pytest.raises(UndefinedVariableError):
+ df.query('sin > 5', engine=engine, parser=parser)
+
+ def test_query_builtin(self):
+ from pandas.core.computation.engines import NumExprClobberingError
+ engine, parser = self.engine, self.parser
+
+ n = m = 10
+ df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc'))
+
+ df.index.name = 'sin'
+ msg = 'Variables in expression.+'
+ with pytest.raises(NumExprClobberingError, match=msg):
+ df.query('sin > 5', engine=engine, parser=parser)
+
+ def test_query(self):
+ engine, parser = self.engine, self.parser
+ df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])
+
+ assert_frame_equal(df.query('a < b', engine=engine, parser=parser),
+ df[df.a < df.b])
+ assert_frame_equal(df.query('a + b > b * c', engine=engine,
+ parser=parser),
+ df[df.a + df.b > df.b * df.c])
+
+ def test_query_index_with_name(self):
+ engine, parser = self.engine, self.parser
+ df = DataFrame(np.random.randint(10, size=(10, 3)),
+ index=Index(range(10), name='blob'),
+ columns=['a', 'b', 'c'])
+ res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser)
+ expec = df[(df.index < 5) & (df.a < df.b)]
+ assert_frame_equal(res, expec)
+
+ res = df.query('blob < b', engine=engine, parser=parser)
+ expec = df[df.index < df.b]
+
+ assert_frame_equal(res, expec)
+
+ def test_query_index_without_name(self):
+ engine, parser = self.engine, self.parser
+ df = DataFrame(np.random.randint(10, size=(10, 3)),
+ index=range(10), columns=['a', 'b', 'c'])
+
+ # "index" should refer to the index
+ res = df.query('index < b', engine=engine, parser=parser)
+ expec = df[df.index < df.b]
+ assert_frame_equal(res, expec)
+
+ # test against a scalar
+ res = df.query('index < 5', engine=engine, parser=parser)
+ expec = df[df.index < 5]
+ assert_frame_equal(res, expec)
+
+ def test_nested_scope(self):
+ engine = self.engine
+ parser = self.parser
+
+ skip_if_no_pandas_parser(parser)
+
+ df = DataFrame(np.random.randn(5, 3))
+ df2 = DataFrame(np.random.randn(5, 3))
+ expected = df[(df > 0) & (df2 > 0)]
+
+ result = df.query('(@df > 0) & (@df2 > 0)', engine=engine,
+ parser=parser)
+ assert_frame_equal(result, expected)
+
+ result = pd.eval('df[df > 0 and df2 > 0]', engine=engine,
+ parser=parser)
+ assert_frame_equal(result, expected)
+
+ result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]',
+ engine=engine, parser=parser)
+ expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
+ assert_frame_equal(result, expected)
+
+ result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser)
+ expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)
+ assert_frame_equal(result, expected)
+
+ def test_nested_raises_on_local_self_reference(self):
+ from pandas.core.computation.ops import UndefinedVariableError
+
+ df = DataFrame(np.random.randn(5, 3))
+
+        # the frame is a local variable here, so a bare name is
+        # undefined and the '@' prefix is required
+ with pytest.raises(UndefinedVariableError):
+ df.query('df > 0', engine=self.engine, parser=self.parser)
+
+ def test_local_syntax(self):
+ skip_if_no_pandas_parser(self.parser)
+
+ engine, parser = self.engine, self.parser
+ df = DataFrame(np.random.randn(100, 10), columns=list('abcdefghij'))
+ b = 1
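+        # '@b' refers to the local variable above; a bare 'b' resolves
+        # to the column of the same name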
+ expect = df[df.a < b]
+ result = df.query('a < @b', engine=engine, parser=parser)
+ assert_frame_equal(result, expect)
+
+ expect = df[df.a < df.b]
+ result = df.query('a < b', engine=engine, parser=parser)
+ assert_frame_equal(result, expect)
+
+ def test_chained_cmp_and_in(self):
+ skip_if_no_pandas_parser(self.parser)
+ engine, parser = self.engine, self.parser
+ cols = list('abc')
+ df = DataFrame(np.random.randn(100, len(cols)), columns=cols)
+ res = df.query('a < b < c and a not in b not in c', engine=engine,
+ parser=parser)
+ ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) # noqa
+ expec = df[ind]
+ assert_frame_equal(res, expec)
+
+ def test_local_variable_with_in(self):
+ engine, parser = self.engine, self.parser
+ skip_if_no_pandas_parser(parser)
+ a = Series(np.random.randint(3, size=15), name='a')
+ b = Series(np.random.randint(10, size=15), name='b')
+ df = DataFrame({'a': a, 'b': b})
+
+ expected = df.loc[(df.b - 1).isin(a)]
+ result = df.query('b - 1 in a', engine=engine, parser=parser)
+ assert_frame_equal(expected, result)
+
+ b = Series(np.random.randint(10, size=15), name='b')
+ expected = df.loc[(b - 1).isin(a)]
+ result = df.query('@b - 1 in a', engine=engine, parser=parser)
+ assert_frame_equal(expected, result)
+
+ def test_at_inside_string(self):
+ engine, parser = self.engine, self.parser
+ skip_if_no_pandas_parser(parser)
+ c = 1 # noqa
+ df = DataFrame({'a': ['a', 'a', 'b', 'b', '@c', '@c']})
+ result = df.query('a == "@c"', engine=engine, parser=parser)
+ expected = df[df.a == "@c"]
+ assert_frame_equal(result, expected)
+
+ def test_query_undefined_local(self):
+ from pandas.core.computation.ops import UndefinedVariableError
+ engine, parser = self.engine, self.parser
+ skip_if_no_pandas_parser(parser)
+
+ df = DataFrame(np.random.rand(10, 2), columns=list('ab'))
+ msg = "local variable 'c' is not defined"
+
+ with pytest.raises(UndefinedVariableError, match=msg):
+ df.query('a == @c', engine=engine, parser=parser)
+
+ def test_index_resolvers_come_after_columns_with_the_same_name(self):
+ n = 1 # noqa
+ a = np.r_[20:101:20]
+
+ df = DataFrame({'index': a, 'b': np.random.randn(a.size)})
+ df.index.name = 'index'
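+        # a column named 'index' shadows the index resolver, so the
+        # query below must select on the column's values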
+ result = df.query('index > 5', engine=self.engine, parser=self.parser)
+ expected = df[df['index'] > 5]
+ assert_frame_equal(result, expected)
+
+ df = DataFrame({'index': a,
+ 'b': np.random.randn(a.size)})
+ result = df.query('ilevel_0 > 5', engine=self.engine,
+ parser=self.parser)
+ expected = df.loc[df.index[df.index > 5]]
+ assert_frame_equal(result, expected)
+
+ df = DataFrame({'a': a, 'b': np.random.randn(a.size)})
+ df.index.name = 'a'
+ result = df.query('a > 5', engine=self.engine, parser=self.parser)
+ expected = df[df.a > 5]
+ assert_frame_equal(result, expected)
+
+ result = df.query('index > 5', engine=self.engine, parser=self.parser)
+ expected = df.loc[df.index[df.index > 5]]
+ assert_frame_equal(result, expected)
+
+ def test_inf(self):
+ n = 10
+ df = DataFrame({'a': np.random.rand(n), 'b': np.random.rand(n)})
+        df.loc[::2, 'a'] = np.inf
+ ops = '==', '!='
+ d = dict(zip(ops, (operator.eq, operator.ne)))
+ for op, f in d.items():
+ q = 'a %s inf' % op
+ expected = df[f(df.a, np.inf)]
+ result = df.query(q, engine=self.engine, parser=self.parser)
+ assert_frame_equal(result, expected)
+
+
+class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestDataFrameQueryNumExprPython, cls).setup_class()
+ cls.engine = 'numexpr'
+ cls.parser = 'python'
+ cls.frame = TestData().frame
+
+ def test_date_query_no_attribute_access(self):
+ engine, parser = self.engine, self.parser
+ df = DataFrame(np.random.randn(5, 3))
+ df['dates1'] = date_range('1/1/2012', periods=5)
+ df['dates2'] = date_range('1/1/2013', periods=5)
+ df['dates3'] = date_range('1/1/2014', periods=5)
+ res = df.query('(dates1 < 20130101) & (20130101 < dates3)',
+ engine=engine, parser=parser)
+ expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_date_query_with_NaT(self):
+ engine, parser = self.engine, self.parser
+ n = 10
+ df = DataFrame(np.random.randn(n, 3))
+ df['dates1'] = date_range('1/1/2012', periods=n)
+ df['dates2'] = date_range('1/1/2013', periods=n)
+ df['dates3'] = date_range('1/1/2014', periods=n)
+ df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
+ df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT
+ res = df.query('(dates1 < 20130101) & (20130101 < dates3)',
+ engine=engine, parser=parser)
+ expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_date_index_query(self):
+ engine, parser = self.engine, self.parser
+ n = 10
+ df = DataFrame(np.random.randn(n, 3))
+ df['dates1'] = date_range('1/1/2012', periods=n)
+ df['dates3'] = date_range('1/1/2014', periods=n)
+ df.set_index('dates1', inplace=True, drop=True)
+ res = df.query('(index < 20130101) & (20130101 < dates3)',
+ engine=engine, parser=parser)
+ expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_date_index_query_with_NaT(self):
+ engine, parser = self.engine, self.parser
+ n = 10
+ df = DataFrame(np.random.randn(n, 3))
+ df['dates1'] = date_range('1/1/2012', periods=n)
+ df['dates3'] = date_range('1/1/2014', periods=n)
+ df.iloc[0, 0] = pd.NaT
+ df.set_index('dates1', inplace=True, drop=True)
+ res = df.query('(index < 20130101) & (20130101 < dates3)',
+ engine=engine, parser=parser)
+ expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+ assert_frame_equal(res, expec)
+
+ def test_date_index_query_with_NaT_duplicates(self):
+ engine, parser = self.engine, self.parser
+ n = 10
+ df = DataFrame(np.random.randn(n, 3))
+ df['dates1'] = date_range('1/1/2012', periods=n)
+ df['dates3'] = date_range('1/1/2014', periods=n)
+ df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
+ df.set_index('dates1', inplace=True, drop=True)
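+        # the 'python' parser does not implement chained comparisons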
+ with pytest.raises(NotImplementedError):
+ df.query('index < 20130101 < dates3', engine=engine, parser=parser)
+
+ def test_nested_scope(self):
+ from pandas.core.computation.ops import UndefinedVariableError
+ engine = self.engine
+ parser = self.parser
+ # smoke test
+ x = 1 # noqa
+ result = pd.eval('x + 1', engine=engine, parser=parser)
+ assert result == 2
+
+ df = DataFrame(np.random.randn(5, 3))
+ df2 = DataFrame(np.random.randn(5, 3))
+
+ # don't have the pandas parser
+ with pytest.raises(SyntaxError):
+ df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)
+
+ with pytest.raises(UndefinedVariableError):
+ df.query('(df>0) & (df2>0)', engine=engine, parser=parser)
+
+ expected = df[(df > 0) & (df2 > 0)]
+ result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine,
+ parser=parser)
+ assert_frame_equal(expected, result)
+
+ expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
+ result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]',
+ engine=engine, parser=parser)
+ assert_frame_equal(expected, result)
+
+
+class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestDataFrameQueryPythonPandas, cls).setup_class()
+ cls.engine = 'python'
+ cls.parser = 'pandas'
+ cls.frame = TestData().frame
+
+ def test_query_builtin(self):
+ engine, parser = self.engine, self.parser
+
+ n = m = 10
+ df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc'))
+
+ df.index.name = 'sin'
+ expected = df[df.index > 5]
+ result = df.query('sin > 5', engine=engine, parser=parser)
+ assert_frame_equal(expected, result)
+
+
+class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython):
+
+ @classmethod
+ def setup_class(cls):
+ super(TestDataFrameQueryPythonPython, cls).setup_class()
+ cls.engine = cls.parser = 'python'
+ cls.frame = TestData().frame
+
+ def test_query_builtin(self):
+ engine, parser = self.engine, self.parser
+
+ n = m = 10
+ df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc'))
+
+ df.index.name = 'sin'
+ expected = df[df.index > 5]
+ result = df.query('sin > 5', engine=engine, parser=parser)
+ assert_frame_equal(expected, result)
+
+
+class TestDataFrameQueryStrings(object):
+
+ def test_str_query_method(self, parser, engine):
+ df = DataFrame(np.random.randn(10, 1), columns=['b'])
+ df['strings'] = Series(list('aabbccddee'))
+ expect = df[df.strings == 'a']
+
+ if parser != 'pandas':
+ col = 'strings'
+ lst = '"a"'
+
+ lhs = [col] * 2 + [lst] * 2
+ rhs = lhs[::-1]
+
+ eq, ne = '==', '!='
+ ops = 2 * ([eq] + [ne])
+
+            for lhs_term, op, rhs_term in zip(lhs, ops, rhs):
+                ex = '{lhs} {op} {rhs}'.format(lhs=lhs_term, op=op,
+                                               rhs=rhs_term)
+                with pytest.raises(NotImplementedError):
+                    df.query(ex, engine=engine, parser=parser,
+                             local_dict={'strings': df.strings})
+ else:
+ res = df.query('"a" == strings', engine=engine, parser=parser)
+ assert_frame_equal(res, expect)
+
+ res = df.query('strings == "a"', engine=engine, parser=parser)
+ assert_frame_equal(res, expect)
+ assert_frame_equal(res, df[df.strings.isin(['a'])])
+
+ expect = df[df.strings != 'a']
+ res = df.query('strings != "a"', engine=engine, parser=parser)
+ assert_frame_equal(res, expect)
+
+ res = df.query('"a" != strings', engine=engine, parser=parser)
+ assert_frame_equal(res, expect)
+ assert_frame_equal(res, df[~df.strings.isin(['a'])])
+
+ def test_str_list_query_method(self, parser, engine):
+ df = DataFrame(np.random.randn(10, 1), columns=['b'])
+ df['strings'] = Series(list('aabbccddee'))
+ expect = df[df.strings.isin(['a', 'b'])]
+
+ if parser != 'pandas':
+ col = 'strings'
+ lst = '["a", "b"]'
+
+ lhs = [col] * 2 + [lst] * 2
+ rhs = lhs[::-1]
+
+ eq, ne = '==', '!='
+ ops = 2 * ([eq] + [ne])
+
+            for lhs_term, op, rhs_term in zip(lhs, ops, rhs):
+                ex = '{lhs} {op} {rhs}'.format(lhs=lhs_term, op=op,
+                                               rhs=rhs_term)
+ with pytest.raises(NotImplementedError):
+ df.query(ex, engine=engine, parser=parser)
+ else:
+ res = df.query('strings == ["a", "b"]', engine=engine,
+ parser=parser)
+ assert_frame_equal(res, expect)
+
+ res = df.query('["a", "b"] == strings', engine=engine,
+ parser=parser)
+ assert_frame_equal(res, expect)
+
+ expect = df[~df.strings.isin(['a', 'b'])]
+
+ res = df.query('strings != ["a", "b"]', engine=engine,
+ parser=parser)
+ assert_frame_equal(res, expect)
+
+ res = df.query('["a", "b"] != strings', engine=engine,
+ parser=parser)
+ assert_frame_equal(res, expect)
+
+ def test_query_with_string_columns(self, parser, engine):
+ df = DataFrame({'a': list('aaaabbbbcccc'),
+ 'b': list('aabbccddeeff'),
+ 'c': np.random.randint(5, size=12),
+ 'd': np.random.randint(9, size=12)})
+ if parser == 'pandas':
+ res = df.query('a in b', parser=parser, engine=engine)
+ expec = df[df.a.isin(df.b)]
+ assert_frame_equal(res, expec)
+
+ res = df.query('a in b and c < d', parser=parser, engine=engine)
+ expec = df[df.a.isin(df.b) & (df.c < df.d)]
+ assert_frame_equal(res, expec)
+ else:
+ with pytest.raises(NotImplementedError):
+ df.query('a in b', parser=parser, engine=engine)
+
+ with pytest.raises(NotImplementedError):
+ df.query('a in b and c < d', parser=parser, engine=engine)
+
+ def test_object_array_eq_ne(self, parser, engine):
+ df = DataFrame({'a': list('aaaabbbbcccc'),
+ 'b': list('aabbccddeeff'),
+ 'c': np.random.randint(5, size=12),
+ 'd': np.random.randint(9, size=12)})
+ res = df.query('a == b', parser=parser, engine=engine)
+ exp = df[df.a == df.b]
+ assert_frame_equal(res, exp)
+
+ res = df.query('a != b', parser=parser, engine=engine)
+ exp = df[df.a != df.b]
+ assert_frame_equal(res, exp)
+
+ def test_query_with_nested_strings(self, parser, engine):
+ skip_if_no_pandas_parser(parser)
+ raw = """id event timestamp
+ 1 "page 1 load" 1/1/2014 0:00:01
+ 1 "page 1 exit" 1/1/2014 0:00:31
+ 2 "page 2 load" 1/1/2014 0:01:01
+ 2 "page 2 exit" 1/1/2014 0:01:31
+ 3 "page 3 load" 1/1/2014 0:02:01
+ 3 "page 3 exit" 1/1/2014 0:02:31
+ 4 "page 1 load" 2/1/2014 1:00:01
+ 4 "page 1 exit" 2/1/2014 1:00:31
+ 5 "page 2 load" 2/1/2014 1:01:01
+ 5 "page 2 exit" 2/1/2014 1:01:31
+ 6 "page 3 load" 2/1/2014 1:02:01
+ 6 "page 3 exit" 2/1/2014 1:02:31
+ """
+ df = pd.read_csv(StringIO(raw), sep=r'\s{2,}', engine='python',
+ parse_dates=['timestamp'])
+ expected = df[df.event == '"page 1 load"']
+ res = df.query("""'"page 1 load"' in event""", parser=parser,
+ engine=engine)
+ assert_frame_equal(expected, res)
+
+ def test_query_with_nested_special_character(self, parser, engine):
+ skip_if_no_pandas_parser(parser)
+ df = DataFrame({'a': ['a', 'b', 'test & test'],
+ 'b': [1, 2, 3]})
+ res = df.query('a == "test & test"', parser=parser, engine=engine)
+ expec = df[df.a == 'test & test']
+ assert_frame_equal(res, expec)
+
+ def test_query_lex_compare_strings(self, parser, engine):
+ import operator as opr
+
+ a = Series(np.random.choice(list('abcde'), 20))
+ b = Series(np.arange(a.size))
+ df = DataFrame({'X': a, 'Y': b})
+
+ ops = {'<': opr.lt, '>': opr.gt, '<=': opr.le, '>=': opr.ge}
+
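+        # string comparisons in query are lexicographic, as in Python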
+ for op, func in ops.items():
+ res = df.query('X %s "d"' % op, engine=engine, parser=parser)
+ expected = df[func(df.X, 'd')]
+ assert_frame_equal(res, expected)
+
+ def test_query_single_element_booleans(self, parser, engine):
+ columns = 'bid', 'bidsize', 'ask', 'asksize'
+ data = np.random.randint(2, size=(1, len(columns))).astype(bool)
+ df = DataFrame(data, columns=columns)
+ res = df.query('bid & ask', engine=engine, parser=parser)
+ expected = df[df.bid & df.ask]
+ assert_frame_equal(res, expected)
+
+ def test_query_string_scalar_variable(self, parser, engine):
+ skip_if_no_pandas_parser(parser)
+ df = pd.DataFrame({'Symbol': ['BUD US', 'BUD US', 'IBM US', 'IBM US'],
+ 'Price': [109.70, 109.72, 183.30, 183.35]})
+ e = df[df.Symbol == 'BUD US']
+ symb = 'BUD US' # noqa
+ r = df.query('Symbol == @symb', parser=parser, engine=engine)
+ assert_frame_equal(e, r)
+
+
+class TestDataFrameEvalWithFrame(object):
+
+ def setup_method(self, method):
+ self.frame = DataFrame(np.random.randn(10, 3), columns=list('abc'))
+
+ def teardown_method(self, method):
+ del self.frame
+
+ def test_simple_expr(self, parser, engine):
+ res = self.frame.eval('a + b', engine=engine, parser=parser)
+ expect = self.frame.a + self.frame.b
+ assert_series_equal(res, expect)
+
+ def test_bool_arith_expr(self, parser, engine):
+ res = self.frame.eval('a[a < 1] + b', engine=engine, parser=parser)
+ expect = self.frame.a[self.frame.a < 1] + self.frame.b
+ assert_series_equal(res, expect)
+
+ @pytest.mark.parametrize('op', ['+', '-', '*', '/'])
+ def test_invalid_type_for_operator_raises(self, parser, engine, op):
+ df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
+ msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"
+
+ with pytest.raises(TypeError, match=msg):
+ df.eval('a {0} b'.format(op), engine=engine, parser=parser)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_rank.py b/contrib/python/pandas/py2/pandas/tests/frame/test_rank.py
new file mode 100644
index 00000000000..10c42e0d1a1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_rank.py
@@ -0,0 +1,318 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime, timedelta
+from distutils.version import LooseVersion
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Series
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal
+
+
+class TestRank(TestData):
+ s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
+ df = DataFrame({'A': s, 'B': s})
+
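+    # expected ranks of ``s`` under each tie-breaking method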
+ results = {
+ 'average': np.array([1.5, 5.5, 7.0, 3.5, np.nan,
+ 3.5, 1.5, 8.0, np.nan, 5.5]),
+ 'min': np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
+ 'max': np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
+ 'first': np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
+ 'dense': np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
+ }
+
+ @pytest.fixture(params=['average', 'min', 'max', 'first', 'dense'])
+ def method(self, request):
+ """
+ Fixture for trying all rank methods
+ """
+ return request.param
+
+ def test_rank(self):
+        rankdata = pytest.importorskip('scipy.stats').rankdata
+
+ self.frame['A'][::2] = np.nan
+ self.frame['B'][::3] = np.nan
+ self.frame['C'][::4] = np.nan
+ self.frame['D'][::5] = np.nan
+
+ ranks0 = self.frame.rank()
+ ranks1 = self.frame.rank(1)
+ mask = np.isnan(self.frame.values)
+
+ fvals = self.frame.fillna(np.inf).values
+
+ exp0 = np.apply_along_axis(rankdata, 0, fvals)
+ exp0[mask] = np.nan
+
+ exp1 = np.apply_along_axis(rankdata, 1, fvals)
+ exp1[mask] = np.nan
+
+ tm.assert_almost_equal(ranks0.values, exp0)
+ tm.assert_almost_equal(ranks1.values, exp1)
+
+ # integers
+ df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
+
+ result = df.rank()
+ exp = df.astype(float).rank()
+ tm.assert_frame_equal(result, exp)
+
+ result = df.rank(1)
+ exp = df.astype(float).rank(1)
+ tm.assert_frame_equal(result, exp)
+
+ def test_rank2(self):
+ df = DataFrame([[1, 3, 2], [1, 2, 3]])
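+        # pct=True divides each rank by the number of values along the axis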
+ expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
+ result = df.rank(1, pct=True)
+ tm.assert_frame_equal(result, expected)
+
+ df = DataFrame([[1, 3, 2], [1, 2, 3]])
+ expected = df.rank(0) / 2.0
+ result = df.rank(0, pct=True)
+ tm.assert_frame_equal(result, expected)
+
+ df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
+ expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
+ result = df.rank(1, numeric_only=False)
+ tm.assert_frame_equal(result, expected)
+
+ expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
+ result = df.rank(0, numeric_only=False)
+ tm.assert_frame_equal(result, expected)
+
+ df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
+ expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
+ result = df.rank(1, numeric_only=False)
+ tm.assert_frame_equal(result, expected)
+
+ expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
+ result = df.rank(0, numeric_only=False)
+ tm.assert_frame_equal(result, expected)
+
+        # datetimes mixed with NaN do not work without extensive workaround
+ data = [[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
+ [datetime(2000, 1, 2), datetime(2000, 1, 3),
+ datetime(2000, 1, 1)]]
+ df = DataFrame(data)
+
+ # check the rank
+ expected = DataFrame([[2., np.nan, 1.],
+ [2., 3., 1.]])
+ result = df.rank(1, numeric_only=False, ascending=True)
+ tm.assert_frame_equal(result, expected)
+
+ expected = DataFrame([[1., np.nan, 2.],
+ [2., 1., 3.]])
+ result = df.rank(1, numeric_only=False, ascending=False)
+ tm.assert_frame_equal(result, expected)
+
+ # mixed-type frames
+ self.mixed_frame['datetime'] = datetime.now()
+ self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)
+
+ result = self.mixed_frame.rank(1)
+ expected = self.mixed_frame.rank(1, numeric_only=True)
+ tm.assert_frame_equal(result, expected)
+
+ df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
+ 1e60, 1e80, 1e-30]})
+ exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]})
+ tm.assert_frame_equal(df.rank(), exp)
+
+ def test_rank_na_option(self):
+        rankdata = pytest.importorskip('scipy.stats').rankdata
+
+ self.frame['A'][::2] = np.nan
+ self.frame['B'][::3] = np.nan
+ self.frame['C'][::4] = np.nan
+ self.frame['D'][::5] = np.nan
+
+ # bottom
+ ranks0 = self.frame.rank(na_option='bottom')
+ ranks1 = self.frame.rank(1, na_option='bottom')
+
+ fvals = self.frame.fillna(np.inf).values
+
+ exp0 = np.apply_along_axis(rankdata, 0, fvals)
+ exp1 = np.apply_along_axis(rankdata, 1, fvals)
+
+ tm.assert_almost_equal(ranks0.values, exp0)
+ tm.assert_almost_equal(ranks1.values, exp1)
+
+ # top
+ ranks0 = self.frame.rank(na_option='top')
+ ranks1 = self.frame.rank(1, na_option='top')
+
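+        # emulate na_option='top' by filling NaNs with a value smaller than
+        # the column (or row) minimum before ranking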
+ fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
+ fval1 = self.frame.T
+ fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
+ fval1 = fval1.fillna(np.inf).values
+
+ exp0 = np.apply_along_axis(rankdata, 0, fval0)
+ exp1 = np.apply_along_axis(rankdata, 1, fval1)
+
+ tm.assert_almost_equal(ranks0.values, exp0)
+ tm.assert_almost_equal(ranks1.values, exp1)
+
+ # descending
+
+        # top
+ ranks0 = self.frame.rank(na_option='top', ascending=False)
+ ranks1 = self.frame.rank(1, na_option='top', ascending=False)
+
+ fvals = self.frame.fillna(np.inf).values
+
+ exp0 = np.apply_along_axis(rankdata, 0, -fvals)
+ exp1 = np.apply_along_axis(rankdata, 1, -fvals)
+
+ tm.assert_almost_equal(ranks0.values, exp0)
+ tm.assert_almost_equal(ranks1.values, exp1)
+
+ # descending
+
+        # bottom
+ ranks0 = self.frame.rank(na_option='bottom', ascending=False)
+ ranks1 = self.frame.rank(1, na_option='bottom', ascending=False)
+
+ fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
+ fval1 = self.frame.T
+ fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
+ fval1 = fval1.fillna(np.inf).values
+
+ exp0 = np.apply_along_axis(rankdata, 0, -fval0)
+ exp1 = np.apply_along_axis(rankdata, 1, -fval1)
+
+ tm.assert_numpy_array_equal(ranks0.values, exp0)
+ tm.assert_numpy_array_equal(ranks1.values, exp1)
+
+ # bad values throw error
+ msg = "na_option must be one of 'keep', 'top', or 'bottom'"
+
+ with pytest.raises(ValueError, match=msg):
+ self.frame.rank(na_option='bad', ascending=False)
+
+ # invalid type
+ with pytest.raises(ValueError, match=msg):
+ self.frame.rank(na_option=True, ascending=False)
+
+ def test_rank_axis(self):
+ # check if using axes' names gives the same result
+ df = DataFrame([[2, 1], [4, 3]])
+ tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index'))
+ tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns'))
+
+ def test_rank_methods_frame(self):
+        pytest.importorskip('scipy.special')
+        rankdata = pytest.importorskip('scipy.stats').rankdata
+ import scipy
+
+ xs = np.random.randint(0, 21, (100, 26))
+ xs = (xs - 10.0) / 10.0
+ cols = [chr(ord('z') - i) for i in range(xs.shape[1])]
+
+ for vals in [xs, xs + 1e6, xs * 1e-6]:
+ df = DataFrame(vals, columns=cols)
+
+ for ax in [0, 1]:
+ for m in ['average', 'min', 'max', 'first', 'dense']:
+ result = df.rank(axis=ax, method=m)
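+                    # scipy's rankdata calls pandas' 'first' method 'ordinal'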
+ sprank = np.apply_along_axis(
+ rankdata, ax, vals,
+ m if m != 'first' else 'ordinal')
+ sprank = sprank.astype(np.float64)
+ expected = DataFrame(sprank, columns=cols)
+
+ if (LooseVersion(scipy.__version__) >=
+ LooseVersion('0.17.0')):
+ expected = expected.astype('float64')
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
+ def test_rank_descending(self, method, dtype):
+
+ if 'i' in dtype:
+ df = self.df.dropna()
+ else:
+ df = self.df.astype(dtype)
+
+ res = df.rank(ascending=False)
+ expected = (df.max() - df).rank()
+ assert_frame_equal(res, expected)
+
+ if method == 'first' and dtype == 'O':
+ return
+
+ expected = (df.max() - df).rank(method=method)
+
+ if dtype != 'O':
+ res2 = df.rank(method=method, ascending=False,
+ numeric_only=True)
+ assert_frame_equal(res2, expected)
+
+ res3 = df.rank(method=method, ascending=False,
+ numeric_only=False)
+ assert_frame_equal(res3, expected)
+
+ @pytest.mark.parametrize('axis', [0, 1])
+ @pytest.mark.parametrize('dtype', [None, object])
+ def test_rank_2d_tie_methods(self, method, axis, dtype):
+ df = self.df
+
+ def _check2d(df, expected, method='average', axis=0):
+ exp_df = DataFrame({'A': expected, 'B': expected})
+
+ if axis == 1:
+ df = df.T
+ exp_df = exp_df.T
+
+ result = df.rank(method=method, axis=axis)
+ assert_frame_equal(result, exp_df)
+
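+        # method='first' is not implemented for object dtype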
+ disabled = {(object, 'first')}
+ if (dtype, method) in disabled:
+ return
+ frame = df if dtype is None else df.astype(dtype)
+ _check2d(frame, self.results[method], method=method, axis=axis)
+
+ @pytest.mark.parametrize(
+ "method,exp", [("dense",
+ [[1., 1., 1.],
+ [1., 0.5, 2. / 3],
+ [1., 0.5, 1. / 3]]),
+ ("min",
+ [[1. / 3, 1., 1.],
+ [1. / 3, 1. / 3, 2. / 3],
+ [1. / 3, 1. / 3, 1. / 3]]),
+ ("max",
+ [[1., 1., 1.],
+ [1., 2. / 3, 2. / 3],
+ [1., 2. / 3, 1. / 3]]),
+ ("average",
+ [[2. / 3, 1., 1.],
+ [2. / 3, 0.5, 2. / 3],
+ [2. / 3, 0.5, 1. / 3]]),
+ ("first",
+ [[1. / 3, 1., 1.],
+ [2. / 3, 1. / 3, 2. / 3],
+ [3. / 3, 2. / 3, 1. / 3]])])
+ def test_rank_pct_true(self, method, exp):
+ # see gh-15630.
+
+ df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
+ result = df.rank(method=method, pct=True)
+
+ expected = DataFrame(exp)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.single
+ def test_pct_max_many_rows(self):
+ # GH 18271
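+        # 2**24 + 1 rows exceeds the exactly-representable float32 integer
+        # range; the maximum pct rank should still be exactly 1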
+ df = DataFrame({'A': np.arange(2**24 + 1),
+ 'B': np.arange(2**24 + 1, 0, -1)})
+ result = df.rank(pct=True).max()
+ assert (result == 1).all()
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_replace.py b/contrib/python/pandas/py2/pandas/tests/frame/test_replace.py
new file mode 100644
index 00000000000..127a64da38b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_replace.py
@@ -0,0 +1,1111 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from datetime import datetime
+import re
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO, lrange, range, zip
+
+import pandas as pd
+from pandas import DataFrame, Index, Series, Timestamp, compat, date_range
+from pandas.tests.frame.common import TestData
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestDataFrameReplace(TestData):
+
+ def test_replace_inplace(self):
+ self.tsframe['A'][:5] = np.nan
+ self.tsframe['A'][-5:] = np.nan
+
+ tsframe = self.tsframe.copy()
+ tsframe.replace(np.nan, 0, inplace=True)
+ assert_frame_equal(tsframe, self.tsframe.fillna(0))
+
+ # mixed type
+ mf = self.mixed_frame
+ mf.iloc[5:20, mf.columns.get_loc('foo')] = np.nan
+ mf.iloc[-10:, mf.columns.get_loc('A')] = np.nan
+
+ result = self.mixed_frame.replace(np.nan, 0)
+ expected = self.mixed_frame.fillna(value=0)
+ assert_frame_equal(result, expected)
+
+ tsframe = self.tsframe.copy()
+ tsframe.replace([np.nan], [0], inplace=True)
+ assert_frame_equal(tsframe, self.tsframe.fillna(0))
+
+ def test_regex_replace_scalar(self):
+ obj = {'a': list('ab..'), 'b': list('efgh')}
+ dfobj = DataFrame(obj)
+ mix = {'a': lrange(4), 'b': list('ab..')}
+ dfmix = DataFrame(mix)
+
+ # simplest cases
+ # regex -> value
+ # obj frame
+ res = dfobj.replace(r'\s*\.\s*', np.nan, regex=True)
+ assert_frame_equal(dfobj, res.fillna('.'))
+
+ # mixed
+ res = dfmix.replace(r'\s*\.\s*', np.nan, regex=True)
+ assert_frame_equal(dfmix, res.fillna('.'))
+
+ # regex -> regex
+ # obj frame
+ res = dfobj.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True)
+ objc = obj.copy()
+ objc['a'] = ['a', 'b', '...', '...']
+ expec = DataFrame(objc)
+ assert_frame_equal(res, expec)
+
+ # with mixed
+ res = dfmix.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True)
+ mixc = mix.copy()
+ mixc['b'] = ['a', 'b', '...', '...']
+ expec = DataFrame(mixc)
+ assert_frame_equal(res, expec)
+
+ # everything with compiled regexs as well
+ res = dfobj.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True)
+ assert_frame_equal(dfobj, res.fillna('.'))
+
+ # mixed
+ res = dfmix.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True)
+ assert_frame_equal(dfmix, res.fillna('.'))
+
+ # regex -> regex
+ # obj frame
+ res = dfobj.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1')
+ objc = obj.copy()
+ objc['a'] = ['a', 'b', '...', '...']
+ expec = DataFrame(objc)
+ assert_frame_equal(res, expec)
+
+ # with mixed
+ res = dfmix.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1')
+ mixc = mix.copy()
+ mixc['b'] = ['a', 'b', '...', '...']
+ expec = DataFrame(mixc)
+ assert_frame_equal(res, expec)
+
+ res = dfmix.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1')
+ mixc = mix.copy()
+ mixc['b'] = ['a', 'b', '...', '...']
+ expec = DataFrame(mixc)
+ assert_frame_equal(res, expec)
+
+ res = dfmix.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1')
+ mixc = mix.copy()
+ mixc['b'] = ['a', 'b', '...', '...']
+ expec = DataFrame(mixc)
+ assert_frame_equal(res, expec)
+
+ def test_regex_replace_scalar_inplace(self):
+ obj = {'a': list('ab..'), 'b': list('efgh')}
+ dfobj = DataFrame(obj)
+ mix = {'a': lrange(4), 'b': list('ab..')}
+ dfmix = DataFrame(mix)
+
+ # simplest cases
+ # regex -> value
+ # obj frame
+ res = dfobj.copy()
+ res.replace(r'\s*\.\s*', np.nan, regex=True, inplace=True)
+ assert_frame_equal(dfobj, res.fillna('.'))
+
+ # mixed
+ res = dfmix.copy()
+ res.replace(r'\s*\.\s*', np.nan, regex=True, inplace=True)
+ assert_frame_equal(dfmix, res.fillna('.'))
+
+ # regex -> regex
+ # obj frame
+ res = dfobj.copy()
+ res.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True, inplace=True)
+ objc = obj.copy()
+ objc['a'] = ['a', 'b', '...', '...']
+ expec = DataFrame(objc)
+ assert_frame_equal(res, expec)
+
+ # with mixed
+ res = dfmix.copy()
+ res.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True, inplace=True)
+ mixc = mix.copy()
+ mixc['b'] = ['a', 'b', '...', '...']
+ expec = DataFrame(mixc)
+ assert_frame_equal(res, expec)
+
+ # everything with compiled regexs as well
+ res = dfobj.copy()
+ res.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True, inplace=True)
+ assert_frame_equal(dfobj, res.fillna('.'))
+
+ # mixed
+ res = dfmix.copy()
+ res.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True, inplace=True)
+ assert_frame_equal(dfmix, res.fillna('.'))
+
+ # regex -> regex
+ # obj frame
+ res = dfobj.copy()
+ res.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1', regex=True,
+ inplace=True)
+ objc = obj.copy()
+ objc['a'] = ['a', 'b', '...', '...']
+ expec = DataFrame(objc)
+ assert_frame_equal(res, expec)
+
+ # with mixed
+ res = dfmix.copy()
+ res.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1', regex=True,
+ inplace=True)
+ mixc = mix.copy()
+ mixc['b'] = ['a', 'b', '...', '...']
+ expec = DataFrame(mixc)
+ assert_frame_equal(res, expec)
+
+ res = dfobj.copy()
+ res.replace(regex=r'\s*\.\s*', value=np.nan, inplace=True)
+ assert_frame_equal(dfobj, res.fillna('.'))
+
+ # mixed
+ res = dfmix.copy()
+ res.replace(regex=r'\s*\.\s*', value=np.nan, inplace=True)
+ assert_frame_equal(dfmix, res.fillna('.'))
+
+ # regex -> regex
+ # obj frame
+ res = dfobj.copy()
+ res.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1', inplace=True)
+ objc = obj.copy()
+ objc['a'] = ['a', 'b', '...', '...']
+ expec = DataFrame(objc)
+ assert_frame_equal(res, expec)
+
+ # with mixed
+ res = dfmix.copy()
+ res.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1', inplace=True)
+ mixc = mix.copy()
+ mixc['b'] = ['a', 'b', '...', '...']
+ expec = DataFrame(mixc)
+ assert_frame_equal(res, expec)
+
+ # everything with compiled regexs as well
+ res = dfobj.copy()
+ res.replace(regex=re.compile(r'\s*\.\s*'), value=np.nan, inplace=True)
+ assert_frame_equal(dfobj, res.fillna('.'))
+
+ # mixed
+ res = dfmix.copy()
+ res.replace(regex=re.compile(r'\s*\.\s*'), value=np.nan, inplace=True)
+ assert_frame_equal(dfmix, res.fillna('.'))
+
+ # regex -> regex
+ # obj frame
+ res = dfobj.copy()
+ res.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1',
+ inplace=True)
+ objc = obj.copy()
+ objc['a'] = ['a', 'b', '...', '...']
+ expec = DataFrame(objc)
+ assert_frame_equal(res, expec)
+
+ # with mixed
+ res = dfmix.copy()
+ res.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1',
+ inplace=True)
+ mixc = mix.copy()
+ mixc['b'] = ['a', 'b', '...', '...']
+ expec = DataFrame(mixc)
+ assert_frame_equal(res, expec)
+
+ def test_regex_replace_list_obj(self):
+ obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')}
+ dfobj = DataFrame(obj)
+
+ # lists of regexes and values
+ # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
+ to_replace_res = [r'\s*\.\s*', r'e|f|g']
+ values = [np.nan, 'crap']
+ res = dfobj.replace(to_replace_res, values, regex=True)
+ expec = DataFrame({'a': ['a', 'b', np.nan, np.nan], 'b': ['crap'] * 3 +
+ ['h'], 'c': ['h', 'crap', 'l', 'o']})
+ assert_frame_equal(res, expec)
+
+ # list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
+ to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)']
+ values = [r'\1\1', r'\1_crap']
+ res = dfobj.replace(to_replace_res, values, regex=True)
+ expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['e_crap',
+ 'f_crap',
+ 'g_crap', 'h'],
+ 'c': ['h', 'e_crap', 'l', 'o']})
+
+ assert_frame_equal(res, expec)
+
+ # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
+ # or vN)]
+ to_replace_res = [r'\s*(\.)\s*', r'e']
+ values = [r'\1\1', r'crap']
+ res = dfobj.replace(to_replace_res, values, regex=True)
+ expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g',
+ 'h'],
+ 'c': ['h', 'crap', 'l', 'o']})
+ assert_frame_equal(res, expec)
+
+ to_replace_res = [r'\s*(\.)\s*', r'e']
+ values = [r'\1\1', r'crap']
+ res = dfobj.replace(value=values, regex=to_replace_res)
+ expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g',
+ 'h'],
+ 'c': ['h', 'crap', 'l', 'o']})
+ assert_frame_equal(res, expec)
+
+ def test_regex_replace_list_obj_inplace(self):
+ # same as above with inplace=True
+ # lists of regexes and values
+ obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')}
+ dfobj = DataFrame(obj)
+
+ # lists of regexes and values
+ # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
+ to_replace_res = [r'\s*\.\s*', r'e|f|g']
+ values = [np.nan, 'crap']
+ res = dfobj.copy()
+ res.replace(to_replace_res, values, inplace=True, regex=True)
+ expec = DataFrame({'a': ['a', 'b', np.nan, np.nan], 'b': ['crap'] * 3 +
+ ['h'], 'c': ['h', 'crap', 'l', 'o']})
+ assert_frame_equal(res, expec)
+
+ # list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
+ to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)']
+ values = [r'\1\1', r'\1_crap']
+ res = dfobj.copy()
+ res.replace(to_replace_res, values, inplace=True, regex=True)
+ expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['e_crap',
+ 'f_crap',
+ 'g_crap', 'h'],
+ 'c': ['h', 'e_crap', 'l', 'o']})
+
+ assert_frame_equal(res, expec)
+
+ # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
+ # or vN)]
+ to_replace_res = [r'\s*(\.)\s*', r'e']
+ values = [r'\1\1', r'crap']
+ res = dfobj.copy()
+ res.replace(to_replace_res, values, inplace=True, regex=True)
+ expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g',
+ 'h'],
+ 'c': ['h', 'crap', 'l', 'o']})
+ assert_frame_equal(res, expec)
+
+ to_replace_res = [r'\s*(\.)\s*', r'e']
+ values = [r'\1\1', r'crap']
+ res = dfobj.copy()
+ res.replace(value=values, regex=to_replace_res, inplace=True)
+ expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g',
+ 'h'],
+ 'c': ['h', 'crap', 'l', 'o']})
+ assert_frame_equal(res, expec)
+
+ def test_regex_replace_list_mixed(self):
+ # mixed frame to make sure this doesn't break things
+ mix = {'a': lrange(4), 'b': list('ab..')}
+ dfmix = DataFrame(mix)
+
+ # lists of regexes and values
+ # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
+ to_replace_res = [r'\s*\.\s*', r'a']
+ values = [np.nan, 'crap']
+ mix2 = {'a': lrange(4), 'b': list('ab..'), 'c': list('halo')}
+ dfmix2 = DataFrame(mix2)
+ res = dfmix2.replace(to_replace_res, values, regex=True)
+ expec = DataFrame({'a': mix2['a'], 'b': ['crap', 'b', np.nan, np.nan],
+ 'c': ['h', 'crap', 'l', 'o']})
+ assert_frame_equal(res, expec)
+
+ # list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
+ to_replace_res = [r'\s*(\.)\s*', r'(a|b)']
+ values = [r'\1\1', r'\1_crap']
+ res = dfmix.replace(to_replace_res, values, regex=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['a_crap', 'b_crap', '..',
+ '..']})
+
+ assert_frame_equal(res, expec)
+
+ # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
+ # or vN)]
+ to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)']
+ values = [r'\1\1', r'crap', r'\1_crap']
+ res = dfmix.replace(to_replace_res, values, regex=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']})
+ assert_frame_equal(res, expec)
+
+ to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)']
+ values = [r'\1\1', r'crap', r'\1_crap']
+ res = dfmix.replace(regex=to_replace_res, value=values)
+ expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']})
+ assert_frame_equal(res, expec)
+
+ def test_regex_replace_list_mixed_inplace(self):
+ mix = {'a': lrange(4), 'b': list('ab..')}
+ dfmix = DataFrame(mix)
+ # the same inplace
+ # lists of regexes and values
+ # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
+ to_replace_res = [r'\s*\.\s*', r'a']
+ values = [np.nan, 'crap']
+ res = dfmix.copy()
+ res.replace(to_replace_res, values, inplace=True, regex=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b', np.nan, np.nan]})
+ assert_frame_equal(res, expec)
+
+ # list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
+ to_replace_res = [r'\s*(\.)\s*', r'(a|b)']
+ values = [r'\1\1', r'\1_crap']
+ res = dfmix.copy()
+ res.replace(to_replace_res, values, inplace=True, regex=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['a_crap', 'b_crap', '..',
+ '..']})
+
+ assert_frame_equal(res, expec)
+
+ # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
+ # or vN)]
+ to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)']
+ values = [r'\1\1', r'crap', r'\1_crap']
+ res = dfmix.copy()
+ res.replace(to_replace_res, values, inplace=True, regex=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']})
+ assert_frame_equal(res, expec)
+
+ to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)']
+ values = [r'\1\1', r'crap', r'\1_crap']
+ res = dfmix.copy()
+ res.replace(regex=to_replace_res, value=values, inplace=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']})
+ assert_frame_equal(res, expec)
+
+ def test_regex_replace_dict_mixed(self):
+ mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
+ dfmix = DataFrame(mix)
+
+ # dicts
+ # single dict {re1: v1}, search the whole frame
+ # need test for this...
+
+ # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole
+ # frame
+ res = dfmix.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True)
+ res2 = dfmix.copy()
+ res2.replace({'b': r'\s*\.\s*'}, {'b': np.nan},
+ inplace=True, regex=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', np.nan, np.nan], 'c':
+ mix['c']})
+ assert_frame_equal(res, expec)
+ assert_frame_equal(res2, expec)
+
+ # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the
+ # whole frame
+ res = dfmix.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True)
+ res2 = dfmix.copy()
+ res2.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, inplace=True,
+ regex=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'], 'c':
+ mix['c']})
+ assert_frame_equal(res, expec)
+ assert_frame_equal(res2, expec)
+
+ res = dfmix.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'})
+ res2 = dfmix.copy()
+ res2.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'},
+ inplace=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'], 'c':
+ mix['c']})
+ assert_frame_equal(res, expec)
+ assert_frame_equal(res2, expec)
+
+ # scalar -> dict
+ # to_replace regex, {value: value}
+ expec = DataFrame({'a': mix['a'], 'b': [np.nan, 'b', '.', '.'], 'c':
+ mix['c']})
+ res = dfmix.replace('a', {'b': np.nan}, regex=True)
+ res2 = dfmix.copy()
+ res2.replace('a', {'b': np.nan}, regex=True, inplace=True)
+ assert_frame_equal(res, expec)
+ assert_frame_equal(res2, expec)
+
+ res = dfmix.replace('a', {'b': np.nan}, regex=True)
+ res2 = dfmix.copy()
+ res2.replace(regex='a', value={'b': np.nan}, inplace=True)
+ expec = DataFrame({'a': mix['a'], 'b': [np.nan, 'b', '.', '.'], 'c':
+ mix['c']})
+ assert_frame_equal(res, expec)
+ assert_frame_equal(res2, expec)
+
+ def test_regex_replace_dict_nested(self):
+ # nested dicts will not work until this is implemented for Series
+ mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
+ dfmix = DataFrame(mix)
+ res = dfmix.replace({'b': {r'\s*\.\s*': np.nan}}, regex=True)
+ res2 = dfmix.copy()
+ res4 = dfmix.copy()
+ res2.replace({'b': {r'\s*\.\s*': np.nan}}, inplace=True, regex=True)
+ res3 = dfmix.replace(regex={'b': {r'\s*\.\s*': np.nan}})
+ res4.replace(regex={'b': {r'\s*\.\s*': np.nan}}, inplace=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', np.nan, np.nan], 'c':
+ mix['c']})
+ assert_frame_equal(res, expec)
+ assert_frame_equal(res2, expec)
+ assert_frame_equal(res3, expec)
+ assert_frame_equal(res4, expec)
+
+ def test_regex_replace_dict_nested_non_first_character(self):
+ # GH 25259
+ df = pd.DataFrame({'first': ['abc', 'bca', 'cab']})
+ expected = pd.DataFrame({'first': ['.bc', 'bc.', 'c.b']})
+ result = df.replace({'a': '.'}, regex=True)
+ assert_frame_equal(result, expected)
+
+ def test_regex_replace_dict_nested_gh4115(self):
+ df = pd.DataFrame({'Type': ['Q', 'T', 'Q', 'Q', 'T'], 'tmp': 2})
+ expected = DataFrame({'Type': [0, 1, 0, 0, 1], 'tmp': 2})
+ result = df.replace({'Type': {'Q': 0, 'T': 1}})
+ assert_frame_equal(result, expected)
+
+ def test_regex_replace_list_to_scalar(self):
+ mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
+ df = DataFrame(mix)
+ expec = DataFrame({'a': mix['a'], 'b': np.array([np.nan] * 4),
+ 'c': [np.nan, np.nan, np.nan, 'd']})
+
+ res = df.replace([r'\s*\.\s*', 'a|b'], np.nan, regex=True)
+ res2 = df.copy()
+ res3 = df.copy()
+ res2.replace([r'\s*\.\s*', 'a|b'], np.nan, regex=True, inplace=True)
+ res3.replace(regex=[r'\s*\.\s*', 'a|b'], value=np.nan, inplace=True)
+ assert_frame_equal(res, expec)
+ assert_frame_equal(res2, expec)
+ assert_frame_equal(res3, expec)
+
+ def test_regex_replace_str_to_numeric(self):
+        # what happens when you replace a regex match with a numeric value?
+ mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
+ df = DataFrame(mix)
+ res = df.replace(r'\s*\.\s*', 0, regex=True)
+ res2 = df.copy()
+ res2.replace(r'\s*\.\s*', 0, inplace=True, regex=True)
+ res3 = df.copy()
+ res3.replace(regex=r'\s*\.\s*', value=0, inplace=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', 0, 0], 'c':
+ mix['c']})
+ assert_frame_equal(res, expec)
+ assert_frame_equal(res2, expec)
+ assert_frame_equal(res3, expec)
+
+ def test_regex_replace_regex_list_to_numeric(self):
+ mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
+ df = DataFrame(mix)
+ res = df.replace([r'\s*\.\s*', 'b'], 0, regex=True)
+ res2 = df.copy()
+ res2.replace([r'\s*\.\s*', 'b'], 0, regex=True, inplace=True)
+ res3 = df.copy()
+ res3.replace(regex=[r'\s*\.\s*', 'b'], value=0, inplace=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['a', 0, 0, 0], 'c': ['a', 0,
+ np.nan,
+ 'd']})
+ assert_frame_equal(res, expec)
+ assert_frame_equal(res2, expec)
+ assert_frame_equal(res3, expec)
+
+ def test_regex_replace_series_of_regexes(self):
+ mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
+ df = DataFrame(mix)
+ s1 = Series({'b': r'\s*\.\s*'})
+ s2 = Series({'b': np.nan})
+ res = df.replace(s1, s2, regex=True)
+ res2 = df.copy()
+ res2.replace(s1, s2, inplace=True, regex=True)
+ res3 = df.copy()
+ res3.replace(regex=s1, value=s2, inplace=True)
+ expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', np.nan, np.nan], 'c':
+ mix['c']})
+ assert_frame_equal(res, expec)
+ assert_frame_equal(res2, expec)
+ assert_frame_equal(res3, expec)
+
+ def test_regex_replace_numeric_to_object_conversion(self):
+ mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
+ df = DataFrame(mix)
+ expec = DataFrame({'a': ['a', 1, 2, 3], 'b': mix['b'], 'c': mix['c']})
+ res = df.replace(0, 'a')
+ assert_frame_equal(res, expec)
+ assert res.a.dtype == np.object_
+
+ @pytest.mark.parametrize('metachar', ['[]', '()', r'\d', r'\w', r'\s'])
+ def test_replace_regex_metachar(self, metachar):
+ df = DataFrame({'a': [metachar, 'else']})
+ result = df.replace({'a': {metachar: 'paren'}})
+ expected = DataFrame({'a': ['paren', 'else']})
+ assert_frame_equal(result, expected)
+
+ def test_replace(self):
+ self.tsframe['A'][:5] = np.nan
+ self.tsframe['A'][-5:] = np.nan
+
+ zero_filled = self.tsframe.replace(np.nan, -1e8)
+ assert_frame_equal(zero_filled, self.tsframe.fillna(-1e8))
+ assert_frame_equal(zero_filled.replace(-1e8, np.nan), self.tsframe)
+
+ self.tsframe['A'][:5] = np.nan
+ self.tsframe['A'][-5:] = np.nan
+ self.tsframe['B'][:5] = -1e8
+
+ # empty
+ df = DataFrame(index=['a', 'b'])
+ assert_frame_equal(df, df.replace(5, 7))
+
+ # GH 11698
+ # test for mixed data types.
+ df = pd.DataFrame([('-', pd.to_datetime('20150101')),
+ ('a', pd.to_datetime('20150102'))])
+ df1 = df.replace('-', np.nan)
+ expected_df = pd.DataFrame([(np.nan, pd.to_datetime('20150101')),
+ ('a', pd.to_datetime('20150102'))])
+ assert_frame_equal(df1, expected_df)
+
+ def test_replace_list(self):
+ obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')}
+ dfobj = DataFrame(obj)
+
+ # lists of regexes and values
+ # list of [v1, v2, ..., vN] -> [v1, v2, ..., vN]
+ to_replace_res = [r'.', r'e']
+ values = [np.nan, 'crap']
+ res = dfobj.replace(to_replace_res, values)
+ expec = DataFrame({'a': ['a', 'b', np.nan, np.nan],
+ 'b': ['crap', 'f', 'g', 'h'], 'c': ['h', 'crap',
+ 'l', 'o']})
+ assert_frame_equal(res, expec)
+
+ # list of [v1, v2, ..., vN] -> [v1, v2, .., vN]
+ to_replace_res = [r'.', r'f']
+ values = [r'..', r'crap']
+ res = dfobj.replace(to_replace_res, values)
+ expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['e', 'crap', 'g',
+ 'h'],
+ 'c': ['h', 'e', 'l', 'o']})
+
+ assert_frame_equal(res, expec)
+
+ def test_replace_with_empty_list(self):
+ # GH 21977
+ s = pd.Series([['a', 'b'], [], np.nan, [1]])
+ df = pd.DataFrame({'col': s})
+ expected = df
+ result = df.replace([], np.nan)
+ assert_frame_equal(result, expected)
+
+ # GH 19266
+ with pytest.raises(ValueError, match="cannot assign mismatch"):
+ df.replace({np.nan: []})
+ with pytest.raises(ValueError, match="cannot assign mismatch"):
+ df.replace({np.nan: ['dummy', 'alt']})
+
+ def test_replace_series_dict(self):
+ # from GH 3064
+ df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
+ result = df.replace(0, {'zero': 0.5, 'one': 1.0})
+ expected = DataFrame(
+ {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 2.0, 'b': 1.0}})
+ assert_frame_equal(result, expected)
+
+ result = df.replace(0, df.mean())
+ assert_frame_equal(result, expected)
+
+ # series to series/dict
+ df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
+ s = Series({'zero': 0.0, 'one': 2.0})
+ result = df.replace(s, {'zero': 0.5, 'one': 1.0})
+ expected = DataFrame(
+ {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 1.0, 'b': 0.0}})
+ assert_frame_equal(result, expected)
+
+ result = df.replace(s, df.mean())
+ assert_frame_equal(result, expected)
+
+ def test_replace_convert(self):
+ # gh 3907
+ df = DataFrame([['foo', 'bar', 'bah'], ['bar', 'foo', 'bah']])
+ m = {'foo': 1, 'bar': 2, 'bah': 3}
+ rep = df.replace(m)
+ expec = Series([np.int64] * 3)
+ res = rep.dtypes
+ assert_series_equal(expec, res)
+
+ def test_replace_mixed(self):
+ mf = self.mixed_frame
+ mf.iloc[5:20, mf.columns.get_loc('foo')] = np.nan
+ mf.iloc[-10:, mf.columns.get_loc('A')] = np.nan
+
+ result = self.mixed_frame.replace(np.nan, -18)
+ expected = self.mixed_frame.fillna(value=-18)
+ assert_frame_equal(result, expected)
+ assert_frame_equal(result.replace(-18, np.nan), self.mixed_frame)
+
+ result = self.mixed_frame.replace(np.nan, -1e8)
+ expected = self.mixed_frame.fillna(value=-1e8)
+ assert_frame_equal(result, expected)
+ assert_frame_equal(result.replace(-1e8, np.nan), self.mixed_frame)
+
+ # int block upcasting
+ df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
+ 'B': Series([0, 1], dtype='int64')})
+ expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
+ 'B': Series([0.5, 1], dtype='float64')})
+ result = df.replace(0, 0.5)
+ assert_frame_equal(result, expected)
+
+ df.replace(0, 0.5, inplace=True)
+ assert_frame_equal(df, expected)
+
+ # int block splitting
+ df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
+ 'B': Series([0, 1], dtype='int64'),
+ 'C': Series([1, 2], dtype='int64')})
+ expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
+ 'B': Series([0.5, 1], dtype='float64'),
+ 'C': Series([1, 2], dtype='int64')})
+ result = df.replace(0, 0.5)
+ assert_frame_equal(result, expected)
+
+ # to object block upcasting
+ df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
+ 'B': Series([0, 1], dtype='int64')})
+ expected = DataFrame({'A': Series([1, 'foo'], dtype='object'),
+ 'B': Series([0, 1], dtype='int64')})
+ result = df.replace(2, 'foo')
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame({'A': Series(['foo', 'bar'], dtype='object'),
+ 'B': Series([0, 'foo'], dtype='object')})
+ result = df.replace([1, 2], ['foo', 'bar'])
+ assert_frame_equal(result, expected)
+
+ # test case from
+ df = DataFrame({'A': Series([3, 0], dtype='int64'),
+ 'B': Series([0, 3], dtype='int64')})
+ result = df.replace(3, df.mean().to_dict())
+ expected = df.copy().astype('float64')
+ m = df.mean()
+ expected.iloc[0, 0] = m[0]
+ expected.iloc[1, 1] = m[1]
+ assert_frame_equal(result, expected)
+
+ def test_replace_simple_nested_dict(self):
+ df = DataFrame({'col': range(1, 5)})
+ expected = DataFrame({'col': ['a', 2, 3, 'b']})
+
+ result = df.replace({'col': {1: 'a', 4: 'b'}})
+ assert_frame_equal(expected, result)
+
+        # in this case, should be the same as the non-nested version
+ result = df.replace({1: 'a', 4: 'b'})
+ assert_frame_equal(expected, result)
+
+ def test_replace_simple_nested_dict_with_nonexistent_value(self):
+ df = DataFrame({'col': range(1, 5)})
+ expected = DataFrame({'col': ['a', 2, 3, 'b']})
+
+ result = df.replace({-1: '-', 1: 'a', 4: 'b'})
+ assert_frame_equal(expected, result)
+
+ result = df.replace({'col': {-1: '-', 1: 'a', 4: 'b'}})
+ assert_frame_equal(expected, result)
+
+ def test_replace_value_is_none(self):
+ orig_value = self.tsframe.iloc[0, 0]
+ orig2 = self.tsframe.iloc[1, 0]
+
+ self.tsframe.iloc[0, 0] = np.nan
+ self.tsframe.iloc[1, 0] = 1
+
+ result = self.tsframe.replace(to_replace={np.nan: 0})
+ expected = self.tsframe.T.replace(to_replace={np.nan: 0}).T
+ assert_frame_equal(result, expected)
+
+ result = self.tsframe.replace(to_replace={np.nan: 0, 1: -1e8})
+ tsframe = self.tsframe.copy()
+ tsframe.iloc[0, 0] = 0
+ tsframe.iloc[1, 0] = -1e8
+ expected = tsframe
+ assert_frame_equal(expected, result)
+ self.tsframe.iloc[0, 0] = orig_value
+ self.tsframe.iloc[1, 0] = orig2
+
+ def test_replace_for_new_dtypes(self):
+
+ # dtypes
+ tsframe = self.tsframe.copy().astype(np.float32)
+ tsframe['A'][:5] = np.nan
+ tsframe['A'][-5:] = np.nan
+
+ zero_filled = tsframe.replace(np.nan, -1e8)
+ assert_frame_equal(zero_filled, tsframe.fillna(-1e8))
+ assert_frame_equal(zero_filled.replace(-1e8, np.nan), tsframe)
+
+ tsframe['A'][:5] = np.nan
+ tsframe['A'][-5:] = np.nan
+ tsframe['B'][:5] = -1e8
+
+ b = tsframe['B']
+ b[b == -1e8] = np.nan
+ tsframe['B'] = b
+ result = tsframe.fillna(method='bfill')
+ assert_frame_equal(result, tsframe.fillna(method='bfill'))
+
+ @pytest.mark.parametrize('frame, to_replace, value, expected', [
+ (DataFrame({'ints': [1, 2, 3]}), 1, 0,
+ DataFrame({'ints': [0, 2, 3]})),
+ (DataFrame({'ints': [1, 2, 3]}, dtype=np.int32), 1, 0,
+ DataFrame({'ints': [0, 2, 3]}, dtype=np.int32)),
+ (DataFrame({'ints': [1, 2, 3]}, dtype=np.int16), 1, 0,
+ DataFrame({'ints': [0, 2, 3]}, dtype=np.int16)),
+ (DataFrame({'bools': [True, False, True]}), False, True,
+ DataFrame({'bools': [True, True, True]})),
+ (DataFrame({'complex': [1j, 2j, 3j]}), 1j, 0,
+ DataFrame({'complex': [0j, 2j, 3j]})),
+ (DataFrame({'datetime64': Index([datetime(2018, 5, 28),
+ datetime(2018, 7, 28),
+ datetime(2018, 5, 28)])}),
+ datetime(2018, 5, 28), datetime(2018, 7, 28),
+ DataFrame({'datetime64': Index([datetime(2018, 7, 28)] * 3)})),
+ # GH 20380
+ (DataFrame({'dt': [datetime(3017, 12, 20)], 'str': ['foo']}),
+ 'foo', 'bar',
+ DataFrame({'dt': [datetime(3017, 12, 20)], 'str': ['bar']})),
+ (DataFrame({'A': date_range('20130101', periods=3, tz='US/Eastern'),
+ 'B': [0, np.nan, 2]}),
+ Timestamp('20130102', tz='US/Eastern'),
+ Timestamp('20130104', tz='US/Eastern'),
+ DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
+ Timestamp('20130104', tz='US/Eastern'),
+ Timestamp('20130103', tz='US/Eastern')],
+ 'B': [0, np.nan, 2]}))
+ ])
+ def test_replace_dtypes(self, frame, to_replace, value, expected):
+ result = getattr(frame, 'replace')(to_replace, value)
+ assert_frame_equal(result, expected)
+
+ def test_replace_input_formats_listlike(self):
+ # both dicts
+ to_rep = {'A': np.nan, 'B': 0, 'C': ''}
+ values = {'A': 0, 'B': -1, 'C': 'missing'}
+ df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
+ 'C': ['', 'asdf', 'fd']})
+ filled = df.replace(to_rep, values)
+ expected = {k: v.replace(to_rep[k], values[k])
+ for k, v in compat.iteritems(df)}
+ assert_frame_equal(filled, DataFrame(expected))
+
+ result = df.replace([0, 2, 5], [5, 2, 0])
+ expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0],
+ 'C': ['', 'asdf', 'fd']})
+ assert_frame_equal(result, expected)
+
+ # scalar to dict
+ values = {'A': 0, 'B': -1, 'C': 'missing'}
+ df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
+ 'C': ['', 'asdf', 'fd']})
+ filled = df.replace(np.nan, values)
+ expected = {k: v.replace(np.nan, values[k])
+ for k, v in compat.iteritems(df)}
+ assert_frame_equal(filled, DataFrame(expected))
+
+ # list to list
+ to_rep = [np.nan, 0, '']
+ values = [-2, -1, 'missing']
+ result = df.replace(to_rep, values)
+ expected = df.copy()
+ for i in range(len(to_rep)):
+ expected.replace(to_rep[i], values[i], inplace=True)
+ assert_frame_equal(result, expected)
+
+ pytest.raises(ValueError, df.replace, to_rep, values[1:])
+
+ def test_replace_input_formats_scalar(self):
+ df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
+ 'C': ['', 'asdf', 'fd']})
+
+ # dict to scalar
+ to_rep = {'A': np.nan, 'B': 0, 'C': ''}
+ filled = df.replace(to_rep, 0)
+ expected = {k: v.replace(to_rep[k], 0)
+ for k, v in compat.iteritems(df)}
+ assert_frame_equal(filled, DataFrame(expected))
+
+ pytest.raises(TypeError, df.replace, to_rep, [np.nan, 0, ''])
+
+ # list to scalar
+ to_rep = [np.nan, 0, '']
+ result = df.replace(to_rep, -1)
+ expected = df.copy()
+ for i in range(len(to_rep)):
+ expected.replace(to_rep[i], -1, inplace=True)
+ assert_frame_equal(result, expected)
+
+ def test_replace_limit(self):
+ pass
+
+ def test_replace_dict_no_regex(self):
+ answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral', 3:
+ 'Disagree', 4: 'Strongly Disagree'})
+ weights = {'Agree': 4, 'Disagree': 2, 'Neutral': 3, 'Strongly Agree':
+ 5, 'Strongly Disagree': 1}
+ expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1})
+ result = answer.replace(weights)
+ assert_series_equal(result, expected)
+
+ def test_replace_series_no_regex(self):
+ answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral', 3:
+ 'Disagree', 4: 'Strongly Disagree'})
+ weights = Series({'Agree': 4, 'Disagree': 2, 'Neutral': 3,
+ 'Strongly Agree': 5, 'Strongly Disagree': 1})
+ expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1})
+ result = answer.replace(weights)
+ assert_series_equal(result, expected)
+
+ def test_replace_dict_tuple_list_ordering_remains_the_same(self):
+ df = DataFrame(dict(A=[np.nan, 1]))
+ res1 = df.replace(to_replace={np.nan: 0, 1: -1e8})
+ res2 = df.replace(to_replace=(1, np.nan), value=[-1e8, 0])
+ res3 = df.replace(to_replace=[1, np.nan], value=[-1e8, 0])
+
+ expected = DataFrame({'A': [0, -1e8]})
+ assert_frame_equal(res1, res2)
+ assert_frame_equal(res2, res3)
+ assert_frame_equal(res3, expected)
+
+ def test_replace_doesnt_replace_without_regex(self):
+ raw = """fol T_opp T_Dir T_Enh
+ 0 1 0 0 vo
+ 1 2 vr 0 0
+ 2 2 0 0 0
+ 3 3 0 bt 0"""
+ df = pd.read_csv(StringIO(raw), sep=r'\s+')
+ res = df.replace({r'\D': 1})
+ assert_frame_equal(df, res)
+
+ def test_replace_bool_with_string(self):
+ df = DataFrame({'a': [True, False], 'b': list('ab')})
+ result = df.replace(True, 'a')
+ expected = DataFrame({'a': ['a', False], 'b': df.b})
+ assert_frame_equal(result, expected)
+
+ def test_replace_pure_bool_with_string_no_op(self):
+ df = DataFrame(np.random.rand(2, 2) > 0.5)
+ result = df.replace('asdf', 'fdsa')
+ assert_frame_equal(df, result)
+
+ def test_replace_bool_with_bool(self):
+ df = DataFrame(np.random.rand(2, 2) > 0.5)
+ result = df.replace(False, True)
+ expected = DataFrame(np.ones((2, 2), dtype=bool))
+ assert_frame_equal(result, expected)
+
+ def test_replace_with_dict_with_bool_keys(self):
+ df = DataFrame({0: [True, False], 1: [False, True]})
+ with pytest.raises(TypeError, match='Cannot compare types .+'):
+ df.replace({'asdf': 'asdb', True: 'yes'})
+
+ def test_replace_truthy(self):
+ df = DataFrame({'a': [True, True]})
+ r = df.replace([np.inf, -np.inf], np.nan)
+ e = df
+ assert_frame_equal(r, e)
+
+ def test_replace_int_to_int_chain(self):
+ df = DataFrame({'a': lrange(1, 5)})
+ with pytest.raises(ValueError, match="Replacement not allowed .+"):
+ df.replace({'a': dict(zip(range(1, 5), range(2, 6)))})
+
+ def test_replace_str_to_str_chain(self):
+ a = np.arange(1, 5)
+ astr = a.astype(str)
+ bstr = np.arange(2, 6).astype(str)
+ df = DataFrame({'a': astr})
+ with pytest.raises(ValueError, match="Replacement not allowed .+"):
+ df.replace({'a': dict(zip(astr, bstr))})
+
+ def test_replace_swapping_bug(self):
+ df = pd.DataFrame({'a': [True, False, True]})
+ res = df.replace({'a': {True: 'Y', False: 'N'}})
+ expect = pd.DataFrame({'a': ['Y', 'N', 'Y']})
+ assert_frame_equal(res, expect)
+
+ df = pd.DataFrame({'a': [0, 1, 0]})
+ res = df.replace({'a': {0: 'Y', 1: 'N'}})
+ expect = pd.DataFrame({'a': ['Y', 'N', 'Y']})
+ assert_frame_equal(res, expect)
+
+ def test_replace_period(self):
+ d = {
+ 'fname': {
+ 'out_augmented_AUG_2011.json':
+ pd.Period(year=2011, month=8, freq='M'),
+ 'out_augmented_JAN_2011.json':
+ pd.Period(year=2011, month=1, freq='M'),
+ 'out_augmented_MAY_2012.json':
+ pd.Period(year=2012, month=5, freq='M'),
+ 'out_augmented_SUBSIDY_WEEK.json':
+ pd.Period(year=2011, month=4, freq='M'),
+ 'out_augmented_AUG_2012.json':
+ pd.Period(year=2012, month=8, freq='M'),
+ 'out_augmented_MAY_2011.json':
+ pd.Period(year=2011, month=5, freq='M'),
+ 'out_augmented_SEP_2013.json':
+ pd.Period(year=2013, month=9, freq='M')}}
+
+ df = pd.DataFrame(['out_augmented_AUG_2012.json',
+ 'out_augmented_SEP_2013.json',
+ 'out_augmented_SUBSIDY_WEEK.json',
+ 'out_augmented_MAY_2012.json',
+ 'out_augmented_MAY_2011.json',
+ 'out_augmented_AUG_2011.json',
+ 'out_augmented_JAN_2011.json'], columns=['fname'])
+ assert set(df.fname.values) == set(d['fname'].keys())
+ # We don't support converting object -> specialized EA in
+ # replace yet.
+ expected = DataFrame({'fname': [d['fname'][k]
+ for k in df.fname.values]},
+ dtype=object)
+ result = df.replace(d)
+ assert_frame_equal(result, expected)
+
+ def test_replace_datetime(self):
+ d = {'fname':
+ {'out_augmented_AUG_2011.json': pd.Timestamp('2011-08'),
+ 'out_augmented_JAN_2011.json': pd.Timestamp('2011-01'),
+ 'out_augmented_MAY_2012.json': pd.Timestamp('2012-05'),
+ 'out_augmented_SUBSIDY_WEEK.json': pd.Timestamp('2011-04'),
+ 'out_augmented_AUG_2012.json': pd.Timestamp('2012-08'),
+ 'out_augmented_MAY_2011.json': pd.Timestamp('2011-05'),
+ 'out_augmented_SEP_2013.json': pd.Timestamp('2013-09')}}
+
+ df = pd.DataFrame(['out_augmented_AUG_2012.json',
+ 'out_augmented_SEP_2013.json',
+ 'out_augmented_SUBSIDY_WEEK.json',
+ 'out_augmented_MAY_2012.json',
+ 'out_augmented_MAY_2011.json',
+ 'out_augmented_AUG_2011.json',
+ 'out_augmented_JAN_2011.json'], columns=['fname'])
+ assert set(df.fname.values) == set(d['fname'].keys())
+ expected = DataFrame({'fname': [d['fname'][k]
+ for k in df.fname.values]})
+ result = df.replace(d)
+ assert_frame_equal(result, expected)
+
+ def test_replace_datetimetz(self):
+
+ # GH 11326
+ # behaving poorly when presented with a datetime64[ns, tz]
+ df = DataFrame({'A': date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'B': [0, np.nan, 2]})
+ result = df.replace(np.nan, 1)
+ expected = DataFrame({'A': date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'B': Series([0, 1, 2], dtype='float64')})
+ assert_frame_equal(result, expected)
+
+ result = df.fillna(1)
+ assert_frame_equal(result, expected)
+
+ result = df.replace(0, np.nan)
+ expected = DataFrame({'A': date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'B': [np.nan, np.nan, 2]})
+ assert_frame_equal(result, expected)
+
+ result = df.replace(Timestamp('20130102', tz='US/Eastern'),
+ Timestamp('20130104', tz='US/Eastern'))
+ expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
+ Timestamp('20130104', tz='US/Eastern'),
+ Timestamp('20130103', tz='US/Eastern')],
+ 'B': [0, np.nan, 2]})
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.iloc[1, 0] = np.nan
+ result = result.replace(
+ {'A': pd.NaT}, Timestamp('20130104', tz='US/Eastern'))
+ assert_frame_equal(result, expected)
+
+        # replacing with a different timezone coerces to object dtype
+ result = df.copy()
+ result.iloc[1, 0] = np.nan
+ result = result.replace(
+ {'A': pd.NaT}, Timestamp('20130104', tz='US/Pacific'))
+ expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
+ Timestamp('20130104', tz='US/Pacific'),
+ Timestamp('20130103', tz='US/Eastern')],
+ 'B': [0, np.nan, 2]})
+ assert_frame_equal(result, expected)
+
+ result = df.copy()
+ result.iloc[1, 0] = np.nan
+ result = result.replace({'A': np.nan}, Timestamp('20130104'))
+ expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
+ Timestamp('20130104'),
+ Timestamp('20130103', tz='US/Eastern')],
+ 'B': [0, np.nan, 2]})
+ assert_frame_equal(result, expected)
+
+ def test_replace_with_empty_dictlike(self):
+ # GH 15289
+ mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
+ df = DataFrame(mix)
+ assert_frame_equal(df, df.replace({}))
+ assert_frame_equal(df, df.replace(Series([])))
+
+ assert_frame_equal(df, df.replace({'b': {}}))
+ assert_frame_equal(df, df.replace(Series({'b': {}})))
+
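+    # with value=None, matched entries are filled from neighboring rows:
+    # 'pad'/'ffill' copies from above, 'bfill' from below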
+ @pytest.mark.parametrize("to_replace, method, expected", [
+ (0, 'bfill', {'A': [1, 1, 2],
+ 'B': [5, np.nan, 7],
+ 'C': ['a', 'b', 'c']}),
+ (np.nan, 'bfill', {'A': [0, 1, 2],
+ 'B': [5.0, 7.0, 7.0],
+ 'C': ['a', 'b', 'c']}),
+ ('d', 'ffill', {'A': [0, 1, 2],
+ 'B': [5, np.nan, 7],
+ 'C': ['a', 'b', 'c']}),
+ ([0, 2], 'bfill', {'A': [1, 1, 2],
+ 'B': [5, np.nan, 7],
+ 'C': ['a', 'b', 'c']}),
+ ([1, 2], 'pad', {'A': [0, 0, 0],
+ 'B': [5, np.nan, 7],
+ 'C': ['a', 'b', 'c']}),
+ ((1, 2), 'bfill', {'A': [0, 2, 2],
+ 'B': [5, np.nan, 7],
+ 'C': ['a', 'b', 'c']}),
+ (['b', 'c'], 'ffill', {'A': [0, 1, 2],
+ 'B': [5, np.nan, 7],
+ 'C': ['a', 'a', 'a']}),
+ ])
+ def test_replace_method(self, to_replace, method, expected):
+ # GH 19632
+ df = DataFrame({'A': [0, 1, 2],
+ 'B': [5, np.nan, 7],
+ 'C': ['a', 'b', 'c']})
+
+ result = df.replace(to_replace=to_replace, value=None, method=method)
+ expected = DataFrame(expected)
+ assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_repr_info.py b/contrib/python/pandas/py2/pandas/tests/frame/test_repr_info.py
new file mode 100644
index 00000000000..4a7cb7f5089
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_repr_info.py
@@ -0,0 +1,523 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from datetime import datetime, timedelta
+import re
+import sys
+import textwrap
+
+import numpy as np
+import pytest
+
+from pandas.compat import PYPY, StringIO, lrange, u
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Series, compat, date_range, option_context,
+ period_range)
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+
+import pandas.io.formats.format as fmt
+
+# Segregated collection of methods that require the BlockManager internal data
+# structure
+
+
+class TestDataFrameReprInfoEtc(TestData):
+
+ def test_repr_empty(self):
+ # empty
+ foo = repr(self.empty) # noqa
+
+ # empty with index
+ frame = DataFrame(index=np.arange(1000))
+ foo = repr(frame) # noqa
+
+ def test_repr_mixed(self):
+ buf = StringIO()
+
+ # mixed
+ foo = repr(self.mixed_frame) # noqa
+ self.mixed_frame.info(verbose=False, buf=buf)
+
+ @pytest.mark.slow
+ def test_repr_mixed_big(self):
+ # big mixed
+ biggie = DataFrame({'A': np.random.randn(200),
+ 'B': tm.makeStringIndex(200)},
+ index=lrange(200))
+ biggie.loc[:20, 'A'] = np.nan
+ biggie.loc[:20, 'B'] = np.nan
+
+ foo = repr(biggie) # noqa
+
+ def test_repr(self):
+ buf = StringIO()
+
+ # small one
+ foo = repr(self.frame)
+ self.frame.info(verbose=False, buf=buf)
+
+ # even smaller
+ self.frame.reindex(columns=['A']).info(verbose=False, buf=buf)
+ self.frame.reindex(columns=['A', 'B']).info(verbose=False, buf=buf)
+
+ # exhausting cases in DataFrame.info
+
+ # columns but no index
+ no_index = DataFrame(columns=[0, 1, 3])
+ foo = repr(no_index) # noqa
+
+ # no columns or index
+ self.empty.info(buf=buf)
+
+ df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
+ assert "\t" not in repr(df)
+ assert "\r" not in repr(df)
+ assert "a\n" not in repr(df)
+
+ def test_repr_dimensions(self):
+ df = DataFrame([[1, 2, ], [3, 4]])
+ with option_context('display.show_dimensions', True):
+ assert "2 rows x 2 columns" in repr(df)
+
+ with option_context('display.show_dimensions', False):
+ assert "2 rows x 2 columns" not in repr(df)
+
+ with option_context('display.show_dimensions', 'truncate'):
+ assert "2 rows x 2 columns" not in repr(df)
+
+ @pytest.mark.slow
+ def test_repr_big(self):
+ # big one
+ biggie = DataFrame(np.zeros((200, 4)), columns=lrange(4),
+ index=lrange(200))
+ repr(biggie)
+
+ def test_repr_unsortable(self):
+ # columns are not sortable
+ import warnings
+ warn_filters = warnings.filters
+ warnings.filterwarnings('ignore',
+ category=FutureWarning,
+ module=".*format")
+
+ unsortable = DataFrame({'foo': [1] * 50,
+ datetime.today(): [1] * 50,
+ 'bar': ['bar'] * 50,
+ datetime.today() + timedelta(1): ['bar'] * 50},
+ index=np.arange(50))
+ repr(unsortable)
+
+ fmt.set_option('display.precision', 3, 'display.column_space', 10)
+ repr(self.frame)
+
+ fmt.set_option('display.max_rows', 10, 'display.max_columns', 2)
+ repr(self.frame)
+
+ fmt.set_option('display.max_rows', 1000, 'display.max_columns', 1000)
+ repr(self.frame)
+
+ tm.reset_display_options()
+
+ warnings.filters = warn_filters
+
+ def test_repr_unicode(self):
+ uval = u('\u03c3\u03c3\u03c3\u03c3')
+
+ # TODO(wesm): is this supposed to be used?
+ bval = uval.encode('utf-8') # noqa
+
+ df = DataFrame({'A': [uval, uval]})
+
+ result = repr(df)
+ ex_top = ' A'
+ assert result.split('\n')[0].rstrip() == ex_top
+
+ df = DataFrame({'A': [uval, uval]})
+ result = repr(df)
+ assert result.split('\n')[0].rstrip() == ex_top
+
+ def test_unicode_string_with_unicode(self):
+ df = DataFrame({'A': [u("\u05d0")]})
+
+ if compat.PY3:
+ str(df)
+ else:
+ compat.text_type(df)
+
+ def test_bytestring_with_unicode(self):
+ df = DataFrame({'A': [u("\u05d0")]})
+ if compat.PY3:
+ bytes(df)
+ else:
+ str(df)
+
+ def test_very_wide_info_repr(self):
+ df = DataFrame(np.random.randn(10, 20),
+ columns=tm.rands_array(10, 20))
+ repr(df)
+
+ def test_repr_column_name_unicode_truncation_bug(self):
+ # #1906
+ df = DataFrame({'Id': [7117434],
+ 'StringCol': ('Is it possible to modify drop plot code'
+ ' so that the output graph is displayed '
+ 'in iphone simulator, Is it possible to '
+ 'modify drop plot code so that the '
+ 'output graph is \xe2\x80\xa8displayed '
+ 'in iphone simulator.Now we are adding '
+ 'the CSV file externally. I want to Call'
+ ' the File through the code..')})
+
+ with option_context('display.max_columns', 20):
+ assert 'StringCol' in repr(df)
+
+ def test_latex_repr(self):
+ result = r"""\begin{tabular}{llll}
+\toprule
+{} & 0 & 1 & 2 \\
+\midrule
+0 & $\alpha$ & b & c \\
+1 & 1 & 2 & 3 \\
+\bottomrule
+\end{tabular}
+"""
+ with option_context("display.latex.escape", False,
+ 'display.latex.repr', True):
+ df = DataFrame([[r'$\alpha$', 'b', 'c'], [1, 2, 3]])
+ assert result == df._repr_latex_()
+
+ # GH 12182
+ assert df._repr_latex_() is None
+
+ def test_info(self):
+ io = StringIO()
+ self.frame.info(buf=io)
+ self.tsframe.info(buf=io)
+
+ frame = DataFrame(np.random.randn(5, 3))
+
+ frame.info()
+ frame.info(verbose=False)
+
+ def test_info_memory(self):
+ # https://github.com/pandas-dev/pandas/issues/21056
+ df = pd.DataFrame({'a': pd.Series([1, 2], dtype='i8')})
+ buf = StringIO()
+ df.info(buf=buf)
+ result = buf.getvalue()
+        mem_bytes = float(df.memory_usage().sum())
+
+ expected = textwrap.dedent("""\
+ <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 2 entries, 0 to 1
+ Data columns (total 1 columns):
+ a 2 non-null int64
+ dtypes: int64(1)
+ memory usage: {} bytes
+            """.format(mem_bytes))
+
+ assert result == expected
+
+ def test_info_wide(self):
+ from pandas import set_option, reset_option
+ io = StringIO()
+ df = DataFrame(np.random.randn(5, 101))
+ df.info(buf=io)
+
+ io = StringIO()
+ df.info(buf=io, max_cols=101)
+ rs = io.getvalue()
+ assert len(rs.splitlines()) > 100
+ xp = rs
+
+ set_option('display.max_info_columns', 101)
+ io = StringIO()
+ df.info(buf=io)
+        assert io.getvalue() == xp
+ reset_option('display.max_info_columns')
+
+ def test_info_duplicate_columns(self):
+ io = StringIO()
+
+ # it works!
+ frame = DataFrame(np.random.randn(1500, 4),
+ columns=['a', 'a', 'b', 'b'])
+ frame.info(buf=io)
+
+ def test_info_duplicate_columns_shows_correct_dtypes(self):
+ # GH11761
+ io = StringIO()
+
+ frame = DataFrame([[1, 2.0]],
+ columns=['a', 'a'])
+ frame.info(buf=io)
+ io.seek(0)
+ lines = io.readlines()
+ assert 'a 1 non-null int64\n' == lines[3]
+ assert 'a 1 non-null float64\n' == lines[4]
+
+ def test_info_shows_column_dtypes(self):
+ dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
+ 'complex128', 'object', 'bool']
+ data = {}
+ n = 10
+ for i, dtype in enumerate(dtypes):
+ data[i] = np.random.randint(2, size=n).astype(dtype)
+ df = DataFrame(data)
+ buf = StringIO()
+ df.info(buf=buf)
+ res = buf.getvalue()
+ for i, dtype in enumerate(dtypes):
+ name = '%d %d non-null %s' % (i, n, dtype)
+ assert name in res
+
+ def test_info_max_cols(self):
+ df = DataFrame(np.random.randn(10, 5))
+ for len_, verbose in [(5, None), (5, False), (10, True)]:
+ # For verbose always ^ setting ^ summarize ^ full output
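+ # with 5 columns > max_info_columns=4, verbose=None falls back to
+ # the summarized view, while verbose=True forces the full listing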
+ with option_context('max_info_columns', 4):
+ buf = StringIO()
+ df.info(buf=buf, verbose=verbose)
+ res = buf.getvalue()
+ assert len(res.strip().split('\n')) == len_
+
+ for len_, verbose in [(10, None), (5, False), (10, True)]:
+
+ # max_cols not exceeded
+ with option_context('max_info_columns', 5):
+ buf = StringIO()
+ df.info(buf=buf, verbose=verbose)
+ res = buf.getvalue()
+ assert len(res.strip().split('\n')) == len_
+
+ for len_, max_cols in [(10, 5), (5, 4)]:
+ # setting truncates
+ with option_context('max_info_columns', 4):
+ buf = StringIO()
+ df.info(buf=buf, max_cols=max_cols)
+ res = buf.getvalue()
+ assert len(res.strip().split('\n')) == len_
+
+ # setting wouldn't truncate
+ with option_context('max_info_columns', 5):
+ buf = StringIO()
+ df.info(buf=buf, max_cols=max_cols)
+ res = buf.getvalue()
+ assert len(res.strip().split('\n')) == len_
+
+ def test_info_memory_usage(self):
+ # Ensure memory usage is displayed, when requested, on the last line
+ dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
+ 'complex128', 'object', 'bool']
+ data = {}
+ n = 10
+ for i, dtype in enumerate(dtypes):
+ data[i] = np.random.randint(2, size=n).astype(dtype)
+ df = DataFrame(data)
+ buf = StringIO()
+
+ # display memory usage case
+ df.info(buf=buf, memory_usage=True)
+ res = buf.getvalue().splitlines()
+ assert "memory usage: " in res[-1]
+
+ # do not display memory usage case
+ df.info(buf=buf, memory_usage=False)
+ res = buf.getvalue().splitlines()
+ assert "memory usage: " not in res[-1]
+
+ df.info(buf=buf, memory_usage=True)
+ res = buf.getvalue().splitlines()
+
+ # memory usage is a lower bound, so print it as XYZ+ MB
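+ # (object columns only count pointer sizes unless deep=True is used)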
+ assert re.match(r"memory usage: [^+]+\+", res[-1])
+
+ df.iloc[:, :5].info(buf=buf, memory_usage=True)
+ res = buf.getvalue().splitlines()
+
+ # excluded column with object dtype, so estimate is accurate
+ assert not re.match(r"memory usage: [^+]+\+", res[-1])
+
+ # Test a DataFrame with duplicate columns
+ dtypes = ['int64', 'int64', 'int64', 'float64']
+ data = {}
+ n = 100
+ for i, dtype in enumerate(dtypes):
+ data[i] = np.random.randint(2, size=n).astype(dtype)
+ df = DataFrame(data)
+ df.columns = dtypes
+
+ df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
+ df_with_object_index.info(buf=buf, memory_usage=True)
+ res = buf.getvalue().splitlines()
+ assert re.match(r"memory usage: [^+]+\+", res[-1])
+
+ df_with_object_index.info(buf=buf, memory_usage='deep')
+ res = buf.getvalue().splitlines()
+ assert re.match(r"memory usage: [^+]+$", res[-1])
+
+ # Ensure df size is as expected
+ # (cols * rows * bytes) + index size
+ df_size = df.memory_usage().sum()
+ exp_size = len(dtypes) * n * 8 + df.index.nbytes
+ assert df_size == exp_size
+
+ # Ensure number of cols in memory_usage is the same as df
+ size_df = np.size(df.columns.values) + 1 # index=True; default
+ assert size_df == np.size(df.memory_usage())
+
+ # assert deep works only on object
+ assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()
+
+ # test for validity
+ DataFrame(1, index=['a'], columns=['A']
+ ).memory_usage(index=True)
+ DataFrame(1, index=['a'], columns=['A']
+ ).index.nbytes
+ df = DataFrame(
+ data=1,
+ index=pd.MultiIndex.from_product(
+ [['a'], range(1000)]),
+ columns=['A']
+ )
+ df.index.nbytes
+ df.memory_usage(index=True)
+ df.index.values.nbytes
+
+ mem = df.memory_usage(deep=True).sum()
+ assert mem > 0
+
+ @pytest.mark.skipif(PYPY,
+ reason="on PyPy deep=True doesn't change result")
+ def test_info_memory_usage_deep_not_pypy(self):
+ df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
+ assert (df_with_object_index.memory_usage(
+ index=True, deep=True).sum() >
+ df_with_object_index.memory_usage(
+ index=True).sum())
+
+ df_object = pd.DataFrame({'a': ['a']})
+ assert (df_object.memory_usage(deep=True).sum() >
+ df_object.memory_usage().sum())
+
+ @pytest.mark.skipif(not PYPY,
+ reason="on PyPy deep=True does not change result")
+ def test_info_memory_usage_deep_pypy(self):
+ df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
+ assert (df_with_object_index.memory_usage(
+ index=True, deep=True).sum() ==
+ df_with_object_index.memory_usage(
+ index=True).sum())
+
+ df_object = pd.DataFrame({'a': ['a']})
+ assert (df_object.memory_usage(deep=True).sum() ==
+ df_object.memory_usage().sum())
+
+ @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
+ def test_usage_via_getsizeof(self):
+ df = DataFrame(
+ data=1,
+ index=pd.MultiIndex.from_product(
+ [['a'], range(1000)]),
+ columns=['A']
+ )
+ mem = df.memory_usage(deep=True).sum()
+ # sys.getsizeof will call the .memory_usage with
+ # deep=True, and add on some GC overhead
+ diff = mem - sys.getsizeof(df)
+ assert abs(diff) < 100
+
+ def test_info_memory_usage_qualified(self):
+
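+ # an int64 index has exact memory usage, so no '+' qualifier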
+ buf = StringIO()
+ df = DataFrame(1, columns=list('ab'),
+ index=[1, 2, 3])
+ df.info(buf=buf)
+ assert '+' not in buf.getvalue()
+
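+ # an object-dtype index is only a lower bound, hence the '+' qualifier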
+ buf = StringIO()
+ df = DataFrame(1, columns=list('ab'),
+ index=list('ABC'))
+ df.info(buf=buf)
+ assert '+' in buf.getvalue()
+
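+ # a MultiIndex with only integer levels is exact as well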
+ buf = StringIO()
+ df = DataFrame(1, columns=list('ab'),
+ index=pd.MultiIndex.from_product(
+ [range(3), range(3)]))
+ df.info(buf=buf)
+ assert '+' not in buf.getvalue()
+
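+ # a MultiIndex containing strings is again only a lower bound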
+ buf = StringIO()
+ df = DataFrame(1, columns=list('ab'),
+ index=pd.MultiIndex.from_product(
+ [range(3), ['foo', 'bar']]))
+ df.info(buf=buf)
+ assert '+' in buf.getvalue()
+
+ def test_info_memory_usage_bug_on_multiindex(self):
+ # GH 14308
+ # memory usage introspection should not materialize .values
+
+ from string import ascii_uppercase as uppercase
+
+ def memory_usage(f):
+ return f.memory_usage(deep=True).sum()
+
+ N = 100
+ M = len(uppercase)
+ index = pd.MultiIndex.from_product([list(uppercase),
+ pd.date_range('20160101',
+ periods=N)],
+ names=['id', 'date'])
+ df = DataFrame({'value': np.random.randn(N * M)}, index=index)
+
+ unstacked = df.unstack('id')
+ assert df.values.nbytes == unstacked.values.nbytes
+ assert memory_usage(df) > memory_usage(unstacked)
+
+ # high upper bound
+ assert memory_usage(unstacked) - memory_usage(df) < 2000
+
+ def test_info_categorical(self):
+ # GH14298
+ idx = pd.CategoricalIndex(['a', 'b'])
+ df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
+
+ buf = StringIO()
+ df.info(buf=buf)
+
+ def test_info_categorical_column(self):
+
+ # make sure it works
+ n = 2500
+ df = DataFrame({'int64': np.random.randint(100, size=n)})
+ df['category'] = Series(np.array(list('abcdefghij')).take(
+ np.random.randint(0, 10, size=n))).astype('category')
+ df.isna()
+ buf = StringIO()
+ df.info(buf=buf)
+
+ df2 = df[df['category'] == 'd']
+ buf = compat.StringIO()
+ df2.info(buf=buf)
+
+ def test_repr_categorical_dates_periods(self):
+ # normal DataFrame
+ dt = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ p = period_range('2011-01', freq='M', periods=5)
+ df = DataFrame({'dt': dt, 'p': p})
+ exp = """ dt p
+0 2011-01-01 09:00:00-05:00 2011-01
+1 2011-01-01 10:00:00-05:00 2011-02
+2 2011-01-01 11:00:00-05:00 2011-03
+3 2011-01-01 12:00:00-05:00 2011-04
+4 2011-01-01 13:00:00-05:00 2011-05"""
+
+ df = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)})
+ assert repr(df) == exp
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_reshape.py b/contrib/python/pandas/py2/pandas/tests/frame/test_reshape.py
new file mode 100644
index 00000000000..28222a82945
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_reshape.py
@@ -0,0 +1,968 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from datetime import datetime
+import itertools
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+import pytest
+
+from pandas.compat import u
+
+import pandas as pd
+from pandas import (
+ DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range)
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestDataFrameReshape(TestData):
+
+ def test_pivot(self):
+ data = {
+ 'index': ['A', 'B', 'C', 'C', 'B', 'A'],
+ 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
+ 'values': [1., 2., 3., 3., 2., 1.]
+ }
+
+ frame = DataFrame(data)
+ pivoted = frame.pivot(
+ index='index', columns='columns', values='values')
+
+ expected = DataFrame({
+ 'One': {'A': 1., 'B': 2., 'C': 3.},
+ 'Two': {'A': 1., 'B': 2., 'C': 3.}
+ })
+
+ expected.index.name, expected.columns.name = 'index', 'columns'
+ tm.assert_frame_equal(pivoted, expected)
+
+ # name tracking
+ assert pivoted.index.name == 'index'
+ assert pivoted.columns.name == 'columns'
+
+ # don't specify values
+ pivoted = frame.pivot(index='index', columns='columns')
+ assert pivoted.index.name == 'index'
+ assert pivoted.columns.names == (None, 'columns')
+
+ with catch_warnings(record=True):
+ # pivot multiple columns
+ simplefilter("ignore", FutureWarning)
+ wp = tm.makePanel()
+ lp = wp.to_frame()
+ df = lp.reset_index()
+ tm.assert_frame_equal(df.pivot('major', 'minor'), lp.unstack())
+
+ def test_pivot_duplicates(self):
+ data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'],
+ 'b': ['one', 'two', 'one', 'one', 'two'],
+ 'c': [1., 2., 3., 3., 4.]})
+ with pytest.raises(ValueError, match='duplicate entries'):
+ data.pivot('a', 'b', 'c')
+
+ def test_pivot_empty(self):
+ df = DataFrame({}, columns=['a', 'b', 'c'])
+ result = df.pivot('a', 'b', 'c')
+ expected = DataFrame({})
+ tm.assert_frame_equal(result, expected, check_names=False)
+
+ def test_pivot_integer_bug(self):
+ df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")])
+
+ result = df.pivot(index=1, columns=0, values=2)
+ repr(result)
+ tm.assert_index_equal(result.columns, Index(['A', 'B'], name=0))
+
+ def test_pivot_index_none(self):
+ # gh-3962
+ data = {
+ 'index': ['A', 'B', 'C', 'C', 'B', 'A'],
+ 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
+ 'values': [1., 2., 3., 3., 2., 1.]
+ }
+
+ frame = DataFrame(data).set_index('index')
+ result = frame.pivot(columns='columns', values='values')
+ expected = DataFrame({
+ 'One': {'A': 1., 'B': 2., 'C': 3.},
+ 'Two': {'A': 1., 'B': 2., 'C': 3.}
+ })
+
+ expected.index.name, expected.columns.name = 'index', 'columns'
+ assert_frame_equal(result, expected)
+
+ # omit values
+ result = frame.pivot(columns='columns')
+
+ expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
+ ('values', 'Two')],
+ names=[None, 'columns'])
+ expected.index.name = 'index'
+ tm.assert_frame_equal(result, expected, check_names=False)
+ assert result.index.name == 'index'
+ assert result.columns.names == (None, 'columns')
+ expected.columns = expected.columns.droplevel(0)
+ result = frame.pivot(columns='columns', values='values')
+
+ expected.columns.name = 'columns'
+ tm.assert_frame_equal(result, expected)
+
+ def test_stack_unstack(self):
+ df = self.frame.copy()
+ df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)
+
+ stacked = df.stack()
+ stacked_df = DataFrame({'foo': stacked, 'bar': stacked})
+
+ unstacked = stacked.unstack()
+ unstacked_df = stacked_df.unstack()
+
+ assert_frame_equal(unstacked, df)
+ assert_frame_equal(unstacked_df['bar'], df)
+
+ unstacked_cols = stacked.unstack(0)
+ unstacked_cols_df = stacked_df.unstack(0)
+ assert_frame_equal(unstacked_cols.T, df)
+ assert_frame_equal(unstacked_cols_df['bar'].T, df)
+
+ def test_stack_mixed_level(self):
+ # GH 18310
+ levels = [range(3), [3, 'a', 'b'], [1, 2]]
+
+ # flat columns:
+ df = DataFrame(1, index=levels[0], columns=levels[1])
+ result = df.stack()
+ expected = Series(1, index=MultiIndex.from_product(levels[:2]))
+ assert_series_equal(result, expected)
+
+ # MultiIndex columns:
+ df = DataFrame(1, index=levels[0],
+ columns=MultiIndex.from_product(levels[1:]))
+ result = df.stack(1)
+ expected = DataFrame(1, index=MultiIndex.from_product([levels[0],
+ levels[2]]),
+ columns=levels[1])
+ assert_frame_equal(result, expected)
+
+ # as above, but the labels used in the level are of homogeneous type
+ result = df[['a', 'b']].stack(1)
+ expected = expected[['a', 'b']]
+ assert_frame_equal(result, expected)
+
+ def test_unstack_fill(self):
+
+ # GH #9746: fill_value keyword argument for Series
+ # and DataFrame unstack
+
+ # From a series
+ data = Series([1, 2, 4, 5], dtype=np.int16)
+ data.index = MultiIndex.from_tuples(
+ [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+ result = data.unstack(fill_value=-1)
+ expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
+ index=['x', 'y', 'z'], dtype=np.int16)
+ assert_frame_equal(result, expected)
+
+ # From a series with incorrect data type for fill_value
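+ # (the int16 data is upcast to float64 to accommodate the float fill value)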
+ result = data.unstack(fill_value=0.5)
+ expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
+ index=['x', 'y', 'z'], dtype=np.float)
+ assert_frame_equal(result, expected)
+
+ # GH #13971: fill_value when unstacking multiple levels:
+ df = DataFrame({'x': ['a', 'a', 'b'],
+ 'y': ['j', 'k', 'j'],
+ 'z': [0, 1, 2],
+ 'w': [0, 1, 2]}).set_index(['x', 'y', 'z'])
+ unstacked = df.unstack(['x', 'y'], fill_value=0)
+ key = ('w', 'b', 'j')
+ expected = unstacked[key]
+ result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
+ assert_series_equal(result, expected)
+
+ stacked = unstacked.stack(['x', 'y'])
+ stacked.index = stacked.index.reorder_levels(df.index.names)
+ # Workaround for GH #17886 (unnecessarily casts to float):
+ stacked = stacked.astype(np.int64)
+ result = stacked.loc[df.index]
+ assert_frame_equal(result, df)
+
+ # From a series
+ s = df['w']
+ result = s.unstack(['x', 'y'], fill_value=0)
+ expected = unstacked['w']
+ assert_frame_equal(result, expected)
+
+ def test_unstack_fill_frame(self):
+
+ # From a dataframe
+ rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
+ df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
+ df.index = MultiIndex.from_tuples(
+ [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+ result = df.unstack(fill_value=-1)
+
+ rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
+ expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
+ expected.columns = MultiIndex.from_tuples(
+ [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
+ assert_frame_equal(result, expected)
+
+ # From a mixed type dataframe
+ df['A'] = df['A'].astype(np.int16)
+ df['B'] = df['B'].astype(np.float64)
+
+ result = df.unstack(fill_value=-1)
+ expected['A'] = expected['A'].astype(np.int16)
+ expected['B'] = expected['B'].astype(np.float64)
+ assert_frame_equal(result, expected)
+
+ # From a dataframe with incorrect data type for fill_value
+ result = df.unstack(fill_value=0.5)
+
+ rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
+ expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
+ expected.columns = MultiIndex.from_tuples(
+ [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
+ assert_frame_equal(result, expected)
+
+ def test_unstack_fill_frame_datetime(self):
+
+ # Test unstacking with date times
+ dv = pd.date_range('2012-01-01', periods=4).values
+ data = Series(dv)
+ data.index = MultiIndex.from_tuples(
+ [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+ result = data.unstack()
+ expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
+ 'b': [dv[1], dv[2], pd.NaT]},
+ index=['x', 'y', 'z'])
+ assert_frame_equal(result, expected)
+
+ result = data.unstack(fill_value=dv[0])
+ expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
+ 'b': [dv[1], dv[2], dv[0]]},
+ index=['x', 'y', 'z'])
+ assert_frame_equal(result, expected)
+
+ def test_unstack_fill_frame_timedelta(self):
+
+ # Test unstacking with time deltas
+ td = [Timedelta(days=i) for i in range(4)]
+ data = Series(td)
+ data.index = MultiIndex.from_tuples(
+ [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+ result = data.unstack()
+ expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
+ 'b': [td[1], td[2], pd.NaT]},
+ index=['x', 'y', 'z'])
+ assert_frame_equal(result, expected)
+
+ result = data.unstack(fill_value=td[1])
+ expected = DataFrame({'a': [td[0], td[1], td[3]],
+ 'b': [td[1], td[2], td[1]]},
+ index=['x', 'y', 'z'])
+ assert_frame_equal(result, expected)
+
+ def test_unstack_fill_frame_period(self):
+
+ # Test unstacking with period
+ periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
+ Period('2012-04')]
+ data = Series(periods)
+ data.index = MultiIndex.from_tuples(
+ [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+ result = data.unstack()
+ expected = DataFrame({'a': [periods[0], None, periods[3]],
+ 'b': [periods[1], periods[2], None]},
+ index=['x', 'y', 'z'])
+ assert_frame_equal(result, expected)
+
+ result = data.unstack(fill_value=periods[1])
+ expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
+ 'b': [periods[1], periods[2], periods[1]]},
+ index=['x', 'y', 'z'])
+ assert_frame_equal(result, expected)
+
+ def test_unstack_fill_frame_categorical(self):
+
+ # Test unstacking with categorical
+ data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
+ data.index = pd.MultiIndex.from_tuples(
+ [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')],
+ )
+
+ # By default missing values will be NaN
+ result = data.unstack()
+ expected = DataFrame({'a': pd.Categorical(list('axa'),
+ categories=list('abc')),
+ 'b': pd.Categorical(list('bcx'),
+ categories=list('abc'))},
+ index=list('xyz'))
+ assert_frame_equal(result, expected)
+
+ # Fill with non-category results in a TypeError
+ msg = r"'fill_value' \('d'\) is not in"
+ with pytest.raises(TypeError, match=msg):
+ data.unstack(fill_value='d')
+
+ # Fill with category value replaces missing values as expected
+ result = data.unstack(fill_value='c')
+ expected = DataFrame({'a': pd.Categorical(list('aca'),
+ categories=list('abc')),
+ 'b': pd.Categorical(list('bcc'),
+ categories=list('abc'))},
+ index=list('xyz'))
+ assert_frame_equal(result, expected)
+
+ def test_unstack_preserve_dtypes(self):
+ # Checks fix for #11847
+ df = pd.DataFrame(dict(state=['IL', 'MI', 'NC'],
+ index=['a', 'b', 'c'],
+ some_categories=pd.Series(['a', 'b', 'c']
+ ).astype('category'),
+ A=np.random.rand(3),
+ B=1,
+ C='foo',
+ D=pd.Timestamp('20010102'),
+ E=pd.Series([1.0, 50.0, 100.0]
+ ).astype('float32'),
+ F=pd.Series([3.0, 4.0, 5.0]).astype('float64'),
+ G=False,
+ H=pd.Series([1, 200, 923442], dtype='int8')))
+
+ def unstack_and_compare(df, column_name):
+ unstacked1 = df.unstack([column_name])
+ unstacked2 = df.unstack(column_name)
+ assert_frame_equal(unstacked1, unstacked2)
+
+ df1 = df.set_index(['state', 'index'])
+ unstack_and_compare(df1, 'index')
+
+ df1 = df.set_index(['state', 'some_categories'])
+ unstack_and_compare(df1, 'some_categories')
+
+ df1 = df.set_index(['F', 'C'])
+ unstack_and_compare(df1, 'F')
+
+ df1 = df.set_index(['G', 'B', 'state'])
+ unstack_and_compare(df1, 'B')
+
+ df1 = df.set_index(['E', 'A'])
+ unstack_and_compare(df1, 'E')
+
+ df1 = df.set_index(['state', 'index'])
+ s = df1['A']
+ unstack_and_compare(s, 'index')
+
+ def test_stack_ints(self):
+ columns = MultiIndex.from_tuples(list(itertools.product(range(3),
+ repeat=3)))
+ df = DataFrame(np.random.randn(30, 27), columns=columns)
+
+ assert_frame_equal(df.stack(level=[1, 2]),
+ df.stack(level=1).stack(level=1))
+ assert_frame_equal(df.stack(level=[-2, -1]),
+ df.stack(level=1).stack(level=1))
+
+ df_named = df.copy()
+ df_named.columns.set_names(range(3), inplace=True)
+
+ assert_frame_equal(df_named.stack(level=[1, 2]),
+ df_named.stack(level=1).stack(level=1))
+
+ def test_stack_mixed_levels(self):
+ columns = MultiIndex.from_tuples(
+ [('A', 'cat', 'long'), ('B', 'cat', 'long'),
+ ('A', 'dog', 'short'), ('B', 'dog', 'short')],
+ names=['exp', 'animal', 'hair_length']
+ )
+ df = DataFrame(np.random.randn(4, 4), columns=columns)
+
+ animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
+ exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
+
+ # GH #8584: Need to check that stacking works when a number
+ # is passed that is both a level name and in the range of
+ # the level numbers
+ df2 = df.copy()
+ df2.columns.names = ['exp', 'animal', 1]
+ assert_frame_equal(df2.stack(level=['animal', 1]),
+ animal_hair_stacked, check_names=False)
+ assert_frame_equal(df2.stack(level=['exp', 1]),
+ exp_hair_stacked, check_names=False)
+
+ # When mixed types are passed and the ints are not level
+ # names, raise
+ pytest.raises(ValueError, df2.stack, level=['animal', 0])
+
+ # GH #8584: Having 0 in the level names could raise a
+ # strange error about lexsort depth
+ df3 = df.copy()
+ df3.columns.names = ['exp', 'animal', 0]
+ assert_frame_equal(df3.stack(level=['animal', 0]),
+ animal_hair_stacked, check_names=False)
+
+ def test_stack_int_level_names(self):
+ columns = MultiIndex.from_tuples(
+ [('A', 'cat', 'long'), ('B', 'cat', 'long'),
+ ('A', 'dog', 'short'), ('B', 'dog', 'short')],
+ names=['exp', 'animal', 'hair_length']
+ )
+ df = DataFrame(np.random.randn(4, 4), columns=columns)
+
+ exp_animal_stacked = df.stack(level=['exp', 'animal'])
+ animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
+ exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
+
+ df2 = df.copy()
+ df2.columns.names = [0, 1, 2]
+ assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked,
+ check_names=False)
+ assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked,
+ check_names=False)
+ assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked,
+ check_names=False)
+
+ # Out-of-order int column names
+ df3 = df.copy()
+ df3.columns.names = [2, 0, 1]
+ assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked,
+ check_names=False)
+ assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked,
+ check_names=False)
+ assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked,
+ check_names=False)
+
+ def test_unstack_bool(self):
+ df = DataFrame([False, False],
+ index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),
+ columns=['col'])
+ rs = df.unstack()
+ xp = DataFrame(np.array([[False, np.nan], [np.nan, False]],
+ dtype=object),
+ index=['a', 'b'],
+ columns=MultiIndex.from_arrays([['col', 'col'],
+ ['c', 'l']]))
+ assert_frame_equal(rs, xp)
+
+ def test_unstack_level_binding(self):
+ # GH9856
+ mi = pd.MultiIndex(
+ levels=[[u('foo'), u('bar')], [u('one'), u('two')],
+ [u('a'), u('b')]],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
+ names=[u('first'), u('second'), u('third')])
+ s = pd.Series(0, index=mi)
+ result = s.unstack([1, 2]).stack(0)
+
+ expected_mi = pd.MultiIndex(
+ levels=[['foo', 'bar'], ['one', 'two']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+ names=['first', 'second'])
+
+ expected = pd.DataFrame(np.array([[np.nan, 0],
+ [0, np.nan],
+ [np.nan, 0],
+ [0, np.nan]],
+ dtype=np.float64),
+ index=expected_mi,
+ columns=pd.Index(['a', 'b'], name='third'))
+
+ assert_frame_equal(result, expected)
+
+ def test_unstack_to_series(self):
+ # check reversibility
+ data = self.frame.unstack()
+
+ assert isinstance(data, Series)
+ undo = data.unstack().T
+ assert_frame_equal(undo, self.frame)
+
+ # check NA handling
+ data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]})
+ data.index = Index(['a', 'b', 'c'])
+ result = data.unstack()
+
+ midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']],
+ codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
+ expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)
+
+ assert_series_equal(result, expected)
+
+ # check composability of unstack
+ old_data = data.copy()
+ for _ in range(4):
+ data = data.unstack()
+ assert_frame_equal(old_data, data)
+
+ def test_unstack_dtypes(self):
+
+ # GH 2929
+ rows = [[1, 1, 3, 4],
+ [1, 2, 3, 4],
+ [2, 1, 3, 4],
+ [2, 2, 3, 4]]
+
+ df = DataFrame(rows, columns=list('ABCD'))
+ result = df.get_dtype_counts()
+ expected = Series({'int64': 4})
+ assert_series_equal(result, expected)
+
+ # single dtype
+ df2 = df.set_index(['A', 'B'])
+ df3 = df2.unstack('B')
+ result = df3.get_dtype_counts()
+ expected = Series({'int64': 4})
+ assert_series_equal(result, expected)
+
+ # mixed
+ df2 = df.set_index(['A', 'B'])
+ df2['C'] = 3.
+ df3 = df2.unstack('B')
+ result = df3.get_dtype_counts()
+ expected = Series({'int64': 2, 'float64': 2})
+ assert_series_equal(result, expected)
+
+ df2['D'] = 'foo'
+ df3 = df2.unstack('B')
+ result = df3.get_dtype_counts()
+ expected = Series({'float64': 2, 'object': 2})
+ assert_series_equal(result, expected)
+
+ # GH7405
+ for c, d in (np.zeros(5), np.zeros(5)), \
+ (np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')):
+
+ df = DataFrame({'A': ['a'] * 5, 'C': c, 'D': d,
+ 'B': pd.date_range('2012-01-01', periods=5)})
+
+ right = df.iloc[:3].copy(deep=True)
+
+ df = df.set_index(['A', 'B'])
+ df['D'] = df['D'].astype('int64')
+
+ left = df.iloc[:3].unstack(0)
+ right = right.set_index(['A', 'B']).unstack(0)
+ right[('D', 'a')] = right[('D', 'a')].astype('int64')
+
+ assert left.shape == (3, 2)
+ tm.assert_frame_equal(left, right)
+
+ def test_unstack_non_unique_index_names(self):
+ idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
+ names=['c1', 'c1'])
+ df = DataFrame([1, 2], index=idx)
+ with pytest.raises(ValueError):
+ df.unstack('c1')
+
+ with pytest.raises(ValueError):
+ df.T.stack('c1')
+
+ def test_unstack_unused_levels(self):
+ # GH 17845: unused codes in index make unstack() cast int to float
+ idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]
+ df = pd.DataFrame([[1, 0]] * 3, index=idx)
+
+ result = df.unstack()
+ exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']])
+ expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'],
+ columns=exp_col)
+ tm.assert_frame_equal(result, expected)
+ assert((result.columns.levels[1] == idx.levels[1]).all())
+
+ # Unused items on both levels
+ levels = [[0, 1, 7], [0, 1, 2, 3]]
+ codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
+ idx = pd.MultiIndex(levels, codes)
+ block = np.arange(4).reshape(2, 2)
+ df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx)
+ result = df.unstack()
+ expected = pd.DataFrame(np.concatenate([block * 2, block * 2 + 1],
+ axis=1),
+ columns=idx)
+ tm.assert_frame_equal(result, expected)
+ assert((result.columns.levels[1] == idx.levels[1]).all())
+
+ # With mixed dtype and NaN
+ levels = [['a', 2, 'c'], [1, 3, 5, 7]]
+ codes = [[0, -1, 1, 1], [0, 2, -1, 2]]
+ idx = pd.MultiIndex(levels, codes)
+ data = np.arange(8)
+ df = pd.DataFrame(data.reshape(4, 2), index=idx)
+
+ cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11],
+ [np.nan, 'a', 2], [np.nan, 5, 1]),
+ (1, [8, 11, 1, 4, 12, 15, 13, 16],
+ [np.nan, 5, 1], [np.nan, 'a', 2]))
+ for level, idces, col_level, idx_level in cases:
+ result = df.unstack(level=level)
+ exp_data = np.zeros(18) * np.nan
+ exp_data[idces] = data
+ cols = pd.MultiIndex.from_product([[0, 1], col_level])
+ expected = pd.DataFrame(exp_data.reshape(3, 6),
+ index=idx_level, columns=cols)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("cols", [['A', 'C'], slice(None)])
+ def test_unstack_unused_level(self, cols):
+ # GH 18562 : unused codes on the unstacked level
+ df = pd.DataFrame([[2010, 'a', 'I'],
+ [2011, 'b', 'II']],
+ columns=['A', 'B', 'C'])
+
+ ind = df.set_index(['A', 'B', 'C'], drop=False)
+ selection = ind.loc[(slice(None), slice(None), 'I'), cols]
+ result = selection.unstack()
+
+ expected = ind.iloc[[0]][cols]
+ expected.columns = MultiIndex.from_product([expected.columns, ['I']],
+ names=[None, 'C'])
+ expected.index = expected.index.droplevel('C')
+ tm.assert_frame_equal(result, expected)
+
+ def test_unstack_nan_index(self): # GH7466
+ cast = lambda val: '{0:1}'.format('' if val != val else val)
+
+ def verify(df):
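+ # each non-null cell (e.g. 'a.w') encodes its own row and column
+ # labels, so we can check that unstacking kept values aligned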
+ mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
+ rows, cols = df.notna().values.nonzero()
+ for i, j in zip(rows, cols):
+ left = sorted(df.iloc[i, j].split('.'))
+ right = mk_list(df.index[i]) + mk_list(df.columns[j])
+ right = sorted(list(map(cast, right)))
+ assert left == right
+
+ df = DataFrame({'jim': ['a', 'b', np.nan, 'd'],
+ 'joe': ['w', 'x', 'y', 'z'],
+ 'jolie': ['a.w', 'b.x', ' .y', 'd.z']})
+
+ left = df.set_index(['jim', 'joe']).unstack()['jolie']
+ right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
+ assert_frame_equal(left, right)
+
+ for idx in itertools.permutations(df.columns[:2]):
+ mi = df.set_index(list(idx))
+ for lev in range(2):
+ udf = mi.unstack(level=lev)
+ assert udf.notna().values.sum() == len(df)
+ verify(udf['jolie'])
+
+ df = DataFrame({'1st': ['d'] * 3 + [np.nan] * 5 + ['a'] * 2 +
+ ['c'] * 3 + ['e'] * 2 + ['b'] * 5,
+ '2nd': ['y'] * 2 + ['w'] * 3 + [np.nan] * 3 +
+ ['z'] * 4 + [np.nan] * 3 + ['x'] * 3 + [np.nan] * 2,
+ '3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59,
+ 50, 62, 59, 76, 52, 14, 53, 60, 51]})
+
+ df['4th'], df['5th'] = \
+ df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \
+ df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1)
+
+ for idx in itertools.permutations(['1st', '2nd', '3rd']):
+ mi = df.set_index(list(idx))
+ for lev in range(3):
+ udf = mi.unstack(level=lev)
+ assert udf.notna().values.sum() == 2 * len(df)
+ for col in ['4th', '5th']:
+ verify(udf[col])
+
+ # GH7403
+ df = pd.DataFrame(
+ {'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)})
+ df.iloc[3, 1] = np.NaN
+ left = df.set_index(['A', 'B']).unstack(0)
+
+ vals = [[3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan],
+ [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7]]
+ vals = list(map(list, zip(*vals)))
+ idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name='B')
+ cols = MultiIndex(levels=[['C'], ['a', 'b']],
+ codes=[[0, 0], [0, 1]],
+ names=[None, 'A'])
+
+ right = DataFrame(vals, columns=cols, index=idx)
+ assert_frame_equal(left, right)
+
+ df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
+ 'C': range(8)})
+ df.iloc[2, 1] = np.NaN
+ left = df.set_index(['A', 'B']).unstack(0)
+
+ vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]]
+ cols = MultiIndex(levels=[['C'], ['a', 'b']],
+ codes=[[0, 0], [0, 1]],
+ names=[None, 'A'])
+ idx = Index([np.nan, 0, 1, 2, 3], name='B')
+ right = DataFrame(vals, columns=cols, index=idx)
+ assert_frame_equal(left, right)
+
+ df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
+ 'C': range(8)})
+ df.iloc[3, 1] = np.NaN
+ left = df.set_index(['A', 'B']).unstack(0)
+
+ vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]]
+ cols = MultiIndex(levels=[['C'], ['a', 'b']],
+ codes=[[0, 0], [0, 1]],
+ names=[None, 'A'])
+ idx = Index([np.nan, 0, 1, 2, 3], name='B')
+ right = DataFrame(vals, columns=cols, index=idx)
+ assert_frame_equal(left, right)
+
+ # GH7401
+ df = pd.DataFrame({'A': list('aaaaabbbbb'),
+ 'B': (date_range('2012-01-01', periods=5)
+ .tolist() * 2),
+ 'C': np.arange(10)})
+
+ df.iloc[3, 1] = np.NaN
+ left = df.set_index(['A', 'B']).unstack()
+
+ vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]])
+ idx = Index(['a', 'b'], name='A')
+ cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)],
+ codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
+ names=[None, 'B'])
+
+ right = DataFrame(vals, columns=cols, index=idx)
+ assert_frame_equal(left, right)
+
+ # GH4862
+ vals = [['Hg', np.nan, np.nan, 680585148],
+ ['U', 0.0, np.nan, 680585148],
+ ['Pb', 7.07e-06, np.nan, 680585148],
+ ['Sn', 2.3614e-05, 0.0133, 680607017],
+ ['Ag', 0.0, 0.0133, 680607017],
+ ['Hg', -0.00015, 0.0133, 680607017]]
+ df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'],
+ index=[17263, 17264, 17265, 17266, 17267, 17268])
+
+ left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack()
+
+ vals = [[np.nan, np.nan, 7.07e-06, np.nan, 0.0],
+ [0.0, -0.00015, np.nan, 2.3614e-05, np.nan]]
+
+ idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]],
+ codes=[[0, 1], [-1, 0]],
+ names=['s_id', 'dosage'])
+
+ cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']],
+ codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
+ names=[None, 'agent'])
+
+ right = DataFrame(vals, columns=cols, index=idx)
+ assert_frame_equal(left, right)
+
+ left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent'])
+ assert_frame_equal(left.unstack(), right)
+
+ # GH9497 - multiple unstack with nulls
+ df = DataFrame({'1st': [1, 2, 1, 2, 1, 2],
+ '2nd': pd.date_range('2014-02-01', periods=6,
+ freq='D'),
+ 'jim': 100 + np.arange(6),
+ 'joe': (np.random.randn(6) * 10).round(2)})
+
+ df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
+ df.loc[1, '2nd'] = df.loc[3, '2nd'] = np.nan
+ df.loc[1, '3rd'] = df.loc[4, '3rd'] = np.nan
+
+ left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
+ assert left.notna().values.sum() == 2 * len(df)
+
+ for col in ['jim', 'joe']:
+ for _, r in df.iterrows():
+ key = r['1st'], (col, r['2nd'], r['3rd'])
+ assert r[col] == left.loc[key]
+
+ def test_stack_datetime_column_multiIndex(self):
+ # GH 8039
+ t = datetime(2014, 1, 1)
+ df = DataFrame(
+ [1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, 'A', 'B')]))
+ result = df.stack()
+
+ eidx = MultiIndex.from_product([(0, 1, 2, 3), ('B',)])
+ ecols = MultiIndex.from_tuples([(t, 'A')])
+ expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
+ assert_frame_equal(result, expected)
+
+ def test_stack_partial_multiIndex(self):
+ # GH 8844
+ def _test_stack_with_multiindex(multiindex):
+ df = DataFrame(np.arange(3 * len(multiindex))
+ .reshape(3, len(multiindex)),
+ columns=multiindex)
+ for level in (-1, 0, 1, [0, 1], [1, 0]):
+ result = df.stack(level=level, dropna=False)
+
+ if isinstance(level, int):
+ # Stacking a single level should not make any all-NaN rows,
+ # so df.stack(level=level, dropna=False) should be the same
+ # as df.stack(level=level, dropna=True).
+ expected = df.stack(level=level, dropna=True)
+ if isinstance(expected, Series):
+ assert_series_equal(result, expected)
+ else:
+ assert_frame_equal(result, expected)
+
+ df.columns = MultiIndex.from_tuples(df.columns.get_values(),
+ names=df.columns.names)
+ expected = df.stack(level=level, dropna=False)
+ if isinstance(expected, Series):
+ assert_series_equal(result, expected)
+ else:
+ assert_frame_equal(result, expected)
+
+ full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'),
+ ('A', 'y'),
+ ('C', 'x'), ('C', 'u')],
+ names=['Upper', 'Lower'])
+ for multiindex_columns in ([0, 1, 2, 3, 4],
+ [0, 1, 2, 3], [0, 1, 2, 4],
+ [0, 1, 2], [1, 2, 3], [2, 3, 4],
+ [0, 1], [0, 2], [0, 3],
+ [0], [2], [4]):
+ _test_stack_with_multiindex(full_multiindex[multiindex_columns])
+ if len(multiindex_columns) > 1:
+ multiindex_columns.reverse()
+ _test_stack_with_multiindex(
+ full_multiindex[multiindex_columns])
+
+ df = DataFrame(np.arange(6).reshape(2, 3),
+ columns=full_multiindex[[0, 1, 3]])
+ result = df.stack(dropna=False)
+ expected = DataFrame([[0, 2], [1, np.nan], [3, 5], [4, np.nan]],
+ index=MultiIndex(
+ levels=[[0, 1], ['u', 'x', 'y', 'z']],
+ codes=[[0, 0, 1, 1],
+ [1, 3, 1, 3]],
+ names=[None, 'Lower']),
+ columns=Index(['B', 'C'], name='Upper'),
+ dtype=df.dtypes[0])
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('ordered', [False, True])
+ @pytest.mark.parametrize('labels', [list("yxz"), list("yxy")])
+ def test_stack_preserve_categorical_dtype(self, ordered, labels):
+ # GH13854
+ cidx = pd.CategoricalIndex(labels, categories=list("xyz"),
+ ordered=ordered)
+ df = DataFrame([[10, 11, 12]], columns=cidx)
+ result = df.stack()
+
+ # `MultiIndex.from_product` preserves categorical dtype -
+ # it's tested elsewhere.
+ midx = pd.MultiIndex.from_product([df.index, cidx])
+ expected = Series([10, 11, 12], index=midx)
+
+ tm.assert_series_equal(result, expected)
+
+ def test_stack_preserve_categorical_dtype_values(self):
+ # GH-23077
+ cat = pd.Categorical(['a', 'a', 'b', 'c'])
+ df = pd.DataFrame({"A": cat, "B": cat})
+ result = df.stack()
+ index = pd.MultiIndex.from_product([[0, 1, 2, 3], ['A', 'B']])
+ expected = pd.Series(pd.Categorical(['a', 'a', 'a', 'a',
+ 'b', 'b', 'c', 'c']),
+ index=index)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('level', [0, 1])
+ def test_unstack_mixed_extension_types(self, level):
+ index = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 1)],
+ names=['a', 'b'])
+ df = pd.DataFrame({"A": pd.core.arrays.integer_array([0, 1, None]),
+ "B": pd.Categorical(['a', 'a', 'b'])}, index=index)
+
+ result = df.unstack(level=level)
+ expected = df.astype(object).unstack(level=level)
+
+ expected_dtypes = pd.Series([df.A.dtype] * 2 + [df.B.dtype] * 2,
+ index=result.columns)
+ tm.assert_series_equal(result.dtypes, expected_dtypes)
+ tm.assert_frame_equal(result.astype(object), expected)
+
+ @pytest.mark.parametrize("level", [0, 'baz'])
+ def test_unstack_swaplevel_sortlevel(self, level):
+ # GH 20994
+ mi = pd.MultiIndex.from_product([[0], ['d', 'c']],
+ names=['bar', 'baz'])
+ df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A'])
+ df.columns.name = 'foo'
+
+ expected = pd.DataFrame([
+ [3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([
+ ('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[
+ 'baz', 'foo']))
+ expected.index.name = 'bar'
+
+ result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_unstack_fill_frame_object():
+ # GH12815 Test unstacking with object.
+ data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')
+ data.index = pd.MultiIndex.from_tuples(
+ [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+ # By default missing values will be NaN
+ result = data.unstack()
+ expected = pd.DataFrame(
+ {'a': ['a', np.nan, 'a'], 'b': ['b', 'c', np.nan]},
+ index=list('xyz')
+ )
+ assert_frame_equal(result, expected)
+
+ # Fill with any value replaces missing values as expected
+ result = data.unstack(fill_value='d')
+ expected = pd.DataFrame(
+ {'a': ['a', 'd', 'a'], 'b': ['b', 'c', 'd']},
+ index=list('xyz')
+ )
+ assert_frame_equal(result, expected)
+
+
+def test_unstack_timezone_aware_values():
+ # GH 18338
+ df = pd.DataFrame({
+ 'timestamp': [
+ pd.Timestamp('2017-08-27 01:00:00.709949+0000', tz='UTC')],
+ 'a': ['a'],
+ 'b': ['b'],
+ 'c': ['c'],
+ }, columns=['timestamp', 'a', 'b', 'c'])
+ result = df.set_index(['a', 'b']).unstack()
+ expected = pd.DataFrame([[pd.Timestamp('2017-08-27 01:00:00.709949+0000',
+ tz='UTC'),
+ 'c']],
+ index=pd.Index(['a'], name='a'),
+ columns=pd.MultiIndex(
+ levels=[['timestamp', 'c'], ['b']],
+ codes=[[0, 1], [0, 0]],
+ names=[None, 'b']))
+ assert_frame_equal(result, expected)
+
+
+def test_stack_timezone_aware_values():
+ # GH 19420
+ ts = pd.date_range(freq="D", start="20180101", end="20180103",
+ tz="America/New_York")
+ df = pd.DataFrame({"A": ts}, index=["a", "b", "c"])
+ result = df.stack()
+ expected = pd.Series(ts,
+ index=pd.MultiIndex(levels=[['a', 'b', 'c'], ['A']],
+ codes=[[0, 1, 2], [0, 0, 0]]))
+ assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_sort_values_level_as_str.py b/contrib/python/pandas/py2/pandas/tests/frame/test_sort_values_level_as_str.py
new file mode 100644
index 00000000000..3dca82a229b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_sort_values_level_as_str.py
@@ -0,0 +1,96 @@
+import numpy as np
+import pytest
+
+from pandas.errors import PerformanceWarning
+
+from pandas import DataFrame
+from pandas.util import testing as tm
+from pandas.util.testing import assert_frame_equal
+
+
+@pytest.fixture
+def df_none():
+ return DataFrame({
+ 'outer': ['a', 'a', 'a', 'b', 'b', 'b'],
+ 'inner': [1, 2, 2, 2, 1, 1],
+ 'A': np.arange(6, 0, -1),
+ ('B', 5): ['one', 'one', 'two', 'two', 'one', 'one']})
+
+
+@pytest.fixture(params=[
+ ['outer'],
+ ['outer', 'inner']
+])
+def df_idx(request, df_none):
+ levels = request.param
+ return df_none.set_index(levels)
+
+
+@pytest.fixture(params=[
+ 'inner', # index level
+ ['outer'], # list of index level
+ 'A', # column
+ [('B', 5)], # list of column
+ ['inner', 'outer'], # two index levels
+ [('B', 5), 'outer'], # index level and column
+ ['A', ('B', 5)], # Two columns
+ ['inner', 'outer', 'A', ('B', 5)] # two index levels and two columns
+])
+def sort_names(request):
+ return request.param
+
+
+@pytest.fixture(params=[True, False])
+def ascending(request):
+ return request.param
+
+
+def test_sort_index_level_and_column_label(
+ df_none, df_idx, sort_names, ascending):
+
+ # GH 14353
+
+ # Get index levels from df_idx
+ levels = df_idx.index.names
+
+ # Compute expected by sorting on columns and then setting the index
+ expected = df_none.sort_values(by=sort_names,
+ ascending=ascending,
+ axis=0).set_index(levels)
+
+ # Compute result by sorting on a mix of columns and index levels
+ result = df_idx.sort_values(by=sort_names,
+ ascending=ascending,
+ axis=0)
+
+ assert_frame_equal(result, expected)
+
+
+def test_sort_column_level_and_index_label(
+ df_none, df_idx, sort_names, ascending):
+
+ # GH 14353
+
+ # Get levels from df_idx
+ levels = df_idx.index.names
+
+ # Compute expected by sorting on axis=0, setting index levels, and then
+ # transposing. For some cases this will result in a frame with
+ # multiple column levels
+ expected = df_none.sort_values(by=sort_names,
+ ascending=ascending,
+ axis=0).set_index(levels).T
+
+ # Compute result by transposing and sorting on axis=1.
+ result = df_idx.T.sort_values(by=sort_names,
+ ascending=ascending,
+ axis=1)
+
+ if len(levels) > 1:
+ # Accessing multi-level columns that are not lexsorted raises a
+ # performance warning
+ with tm.assert_produces_warning(PerformanceWarning,
+ check_stacklevel=False):
+ assert_frame_equal(result, expected)
+ else:
+ assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_sorting.py b/contrib/python/pandas/py2/pandas/tests/frame/test_sorting.py
new file mode 100644
index 00000000000..85e6373b384
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_sorting.py
@@ -0,0 +1,670 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import random
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, IntervalIndex, MultiIndex, NaT, Series, Timestamp,
+ date_range)
+from pandas.api.types import CategoricalDtype
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestDataFrameSorting(TestData):
+
+ def test_sort_values(self):
+ frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]],
+ index=[1, 2, 3], columns=list('ABC'))
+
+ # by column (axis=0)
+ sorted_df = frame.sort_values(by='A')
+ indexer = frame['A'].argsort().values
+ expected = frame.loc[frame.index[indexer]]
+ assert_frame_equal(sorted_df, expected)
+
+ sorted_df = frame.sort_values(by='A', ascending=False)
+ indexer = indexer[::-1]
+ expected = frame.loc[frame.index[indexer]]
+ assert_frame_equal(sorted_df, expected)
+
+ sorted_df = frame.sort_values(by='A', ascending=False)
+ assert_frame_equal(sorted_df, expected)
+
+ # GH4839
+ sorted_df = frame.sort_values(by=['A'], ascending=[False])
+ assert_frame_equal(sorted_df, expected)
+
+ # multiple bys
+ sorted_df = frame.sort_values(by=['B', 'C'])
+ expected = frame.loc[[2, 1, 3]]
+ assert_frame_equal(sorted_df, expected)
+
+ sorted_df = frame.sort_values(by=['B', 'C'], ascending=False)
+ assert_frame_equal(sorted_df, expected[::-1])
+
+ sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False])
+ assert_frame_equal(sorted_df, expected)
+
+ pytest.raises(ValueError, lambda: frame.sort_values(
+ by=['A', 'B'], axis=2, inplace=True))
+
+ # by row (axis=1): GH 10806
+ sorted_df = frame.sort_values(by=3, axis=1)
+ expected = frame
+ assert_frame_equal(sorted_df, expected)
+
+ sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
+ expected = frame.reindex(columns=['C', 'B', 'A'])
+ assert_frame_equal(sorted_df, expected)
+
+ sorted_df = frame.sort_values(by=[1, 2], axis='columns')
+ expected = frame.reindex(columns=['B', 'A', 'C'])
+ assert_frame_equal(sorted_df, expected)
+
+ sorted_df = frame.sort_values(by=[1, 3], axis=1,
+ ascending=[True, False])
+ assert_frame_equal(sorted_df, expected)
+
+ sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
+ expected = frame.reindex(columns=['C', 'B', 'A'])
+ assert_frame_equal(sorted_df, expected)
+
+ msg = r'Length of ascending \(5\) != length of by \(2\)'
+ with pytest.raises(ValueError, match=msg):
+ frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5)
+
+ def test_sort_values_inplace(self):
+ frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
+ columns=['A', 'B', 'C', 'D'])
+
+ sorted_df = frame.copy()
+ sorted_df.sort_values(by='A', inplace=True)
+ expected = frame.sort_values(by='A')
+ assert_frame_equal(sorted_df, expected)
+
+ sorted_df = frame.copy()
+ sorted_df.sort_values(by=1, axis=1, inplace=True)
+ expected = frame.sort_values(by=1, axis=1)
+ assert_frame_equal(sorted_df, expected)
+
+ sorted_df = frame.copy()
+ sorted_df.sort_values(by='A', ascending=False, inplace=True)
+ expected = frame.sort_values(by='A', ascending=False)
+ assert_frame_equal(sorted_df, expected)
+
+ sorted_df = frame.copy()
+ sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True)
+ expected = frame.sort_values(by=['A', 'B'], ascending=False)
+ assert_frame_equal(sorted_df, expected)
+
+ def test_sort_nan(self):
+ # GH3917
+ nan = np.nan
+ df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
+ 'B': [9, nan, 5, 2, 5, 4, 5]})
+
+ # sort one column only
+ expected = DataFrame(
+ {'A': [nan, 1, 1, 2, 4, 6, 8],
+ 'B': [5, 9, 2, nan, 5, 5, 4]},
+ index=[2, 0, 3, 1, 6, 4, 5])
+ sorted_df = df.sort_values(['A'], na_position='first')
+ assert_frame_equal(sorted_df, expected)
+
+ expected = DataFrame(
+ {'A': [nan, 8, 6, 4, 2, 1, 1],
+ 'B': [5, 4, 5, 5, nan, 9, 2]},
+ index=[2, 5, 4, 6, 1, 0, 3])
+ sorted_df = df.sort_values(['A'], na_position='first', ascending=False)
+ assert_frame_equal(sorted_df, expected)
+
+ expected = df.reindex(columns=['B', 'A'])
+ sorted_df = df.sort_values(by=1, axis=1, na_position='first')
+ assert_frame_equal(sorted_df, expected)
+
+ # na_position='last', order
+ expected = DataFrame(
+ {'A': [1, 1, 2, 4, 6, 8, nan],
+ 'B': [2, 9, nan, 5, 5, 4, 5]},
+ index=[3, 0, 1, 6, 4, 5, 2])
+ sorted_df = df.sort_values(['A', 'B'])
+ assert_frame_equal(sorted_df, expected)
+
+ # na_position='first', order
+ expected = DataFrame(
+ {'A': [nan, 1, 1, 2, 4, 6, 8],
+ 'B': [5, 2, 9, nan, 5, 5, 4]},
+ index=[2, 3, 0, 1, 6, 4, 5])
+ sorted_df = df.sort_values(['A', 'B'], na_position='first')
+ assert_frame_equal(sorted_df, expected)
+
+ # na_position='first', not order
+ expected = DataFrame(
+ {'A': [nan, 1, 1, 2, 4, 6, 8],
+ 'B': [5, 9, 2, nan, 5, 5, 4]},
+ index=[2, 0, 3, 1, 6, 4, 5])
+ sorted_df = df.sort_values(['A', 'B'], ascending=[
+ 1, 0], na_position='first')
+ assert_frame_equal(sorted_df, expected)
+
+ # na_position='last', not order
+ expected = DataFrame(
+ {'A': [8, 6, 4, 2, 1, 1, nan],
+ 'B': [4, 5, 5, nan, 2, 9, 5]},
+ index=[5, 4, 6, 1, 3, 0, 2])
+ sorted_df = df.sort_values(['A', 'B'], ascending=[
+ 0, 1], na_position='last')
+ assert_frame_equal(sorted_df, expected)
+
+ # Test DataFrame with nan label
+ df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
+ 'B': [9, nan, 5, 2, 5, 4, 5]},
+ index=[1, 2, 3, 4, 5, 6, nan])
+
+ # NaN label, ascending=True, na_position='last'
+ sorted_df = df.sort_index(
+ kind='quicksort', ascending=True, na_position='last')
+ expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
+ 'B': [9, nan, 5, 2, 5, 4, 5]},
+ index=[1, 2, 3, 4, 5, 6, nan])
+ assert_frame_equal(sorted_df, expected)
+
+ # NaN label, ascending=True, na_position='first'
+ sorted_df = df.sort_index(na_position='first')
+ expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8],
+ 'B': [5, 9, nan, 5, 2, 5, 4]},
+ index=[nan, 1, 2, 3, 4, 5, 6])
+ assert_frame_equal(sorted_df, expected)
+
+ # NaN label, ascending=False, na_position='last'
+ sorted_df = df.sort_index(kind='quicksort', ascending=False)
+ expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4],
+ 'B': [4, 5, 2, 5, nan, 9, 5]},
+ index=[6, 5, 4, 3, 2, 1, nan])
+ assert_frame_equal(sorted_df, expected)
+
+ # NaN label, ascending=False, na_position='first'
+ sorted_df = df.sort_index(
+ kind='quicksort', ascending=False, na_position='first')
+ expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1],
+ 'B': [5, 4, 5, 2, 5, nan, 9]},
+ index=[nan, 6, 5, 4, 3, 2, 1])
+ assert_frame_equal(sorted_df, expected)
+
+ def test_stable_descending_sort(self):
+ # GH #6399
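+ # mergesort is stable: rows with equal sort keys keep their
+ # original relative order, so the frame is unchanged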
+ df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']],
+ columns=['sort_col', 'order'])
+ sorted_df = df.sort_values(by='sort_col', kind='mergesort',
+ ascending=False)
+ assert_frame_equal(df, sorted_df)
+
+ def test_stable_descending_multicolumn_sort(self):
+ nan = np.nan
+ df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
+ 'B': [9, nan, 5, 2, 5, 4, 5]})
+ # test stable mergesort
+ expected = DataFrame(
+ {'A': [nan, 8, 6, 4, 2, 1, 1],
+ 'B': [5, 4, 5, 5, nan, 2, 9]},
+ index=[2, 5, 4, 6, 1, 3, 0])
+ sorted_df = df.sort_values(['A', 'B'], ascending=[0, 1],
+ na_position='first',
+ kind='mergesort')
+ assert_frame_equal(sorted_df, expected)
+
+ expected = DataFrame(
+ {'A': [nan, 8, 6, 4, 2, 1, 1],
+ 'B': [5, 4, 5, 5, nan, 9, 2]},
+ index=[2, 5, 4, 6, 1, 0, 3])
+ sorted_df = df.sort_values(['A', 'B'], ascending=[0, 0],
+ na_position='first',
+ kind='mergesort')
+ assert_frame_equal(sorted_df, expected)
+
+ def test_stable_categorial(self):
+ # GH 16793
+ df = DataFrame({
+ 'x': pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)
+ })
+ expected = df.copy()
+ sorted_df = df.sort_values('x', kind='mergesort')
+ assert_frame_equal(sorted_df, expected)
+
+ def test_sort_datetimes(self):
+
+ # GH 3461, argsort / lexsort differences for a datetime column
+ df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],
+ columns=['A'],
+ index=date_range('20130101', periods=9))
+ dts = [Timestamp(x)
+ for x in ['2004-02-11', '2004-01-21', '2004-01-26',
+ '2005-09-20', '2010-10-04', '2009-05-12',
+ '2008-11-12', '2010-09-28', '2010-09-28']]
+ df['B'] = dts[::2] + dts[1::2]
+ df['C'] = 2.
+ df['A1'] = 3.
+
+ df1 = df.sort_values(by='A')
+ df2 = df.sort_values(by=['A'])
+ assert_frame_equal(df1, df2)
+
+ df1 = df.sort_values(by='B')
+ df2 = df.sort_values(by=['B'])
+ assert_frame_equal(df1, df2)
+
+ df1 = df.sort_values(by='B')
+
+ df2 = df.sort_values(by=['C', 'B'])
+ assert_frame_equal(df1, df2)
+
+ def test_frame_column_inplace_sort_exception(self):
+ s = self.frame['A']
+ with pytest.raises(ValueError, match="This Series is a view"):
+ s.sort_values(inplace=True)
+
+ cp = s.copy()
+ cp.sort_values() # it works!
+
+ def test_sort_nat_values_in_int_column(self):
+
+ # GH 14922: "sorting with large float and multiple columns incorrect"
+
+ # The cause was that the int64 value of NaT was considered "na", which
+ # is only correct for datetime64 columns.
+
+ int_values = (2, int(NaT))
+ float_values = (2.0, -1.797693e308)
+
+ df = DataFrame(dict(int=int_values, float=float_values),
+ columns=["int", "float"])
+
+ df_reversed = DataFrame(dict(int=int_values[::-1],
+ float=float_values[::-1]),
+ columns=["int", "float"],
+ index=[1, 0])
+
+ # NaT is not a "na" for int64 columns, so na_position must not
+ # influence the result:
+ df_sorted = df.sort_values(["int", "float"], na_position="last")
+ assert_frame_equal(df_sorted, df_reversed)
+
+ df_sorted = df.sort_values(["int", "float"], na_position="first")
+ assert_frame_equal(df_sorted, df_reversed)
+
+ # reverse sorting order
+ df_sorted = df.sort_values(["int", "float"], ascending=False)
+ assert_frame_equal(df_sorted, df)
+
+ # and now check if NaT is still considered as "na" for datetime64
+ # columns:
+ df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT],
+ float=float_values), columns=["datetime", "float"])
+
+ df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")],
+ float=float_values[::-1]),
+ columns=["datetime", "float"],
+ index=[1, 0])
+
+ df_sorted = df.sort_values(["datetime", "float"], na_position="first")
+ assert_frame_equal(df_sorted, df_reversed)
+
+ df_sorted = df.sort_values(["datetime", "float"], na_position="last")
+ assert_frame_equal(df_sorted, df)
+
+ # Ascending should not affect the results.
+ df_sorted = df.sort_values(["datetime", "float"], ascending=False)
+ assert_frame_equal(df_sorted, df)
+
+ def test_sort_nat(self):
+
+ # GH 16836
+
+ d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01',
+ np.nan, '2016-01-01']]
+ d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01',
+ '2016-01-01', '2015-01-01']]
+ df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3])
+
+ d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01',
+ '2016-01-01', np.nan]]
+ d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01',
+ '2017-01-01', '2016-01-01']]
+ expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2])
+ sorted_df = df.sort_values(by=['a', 'b'], )
+ tm.assert_frame_equal(sorted_df, expected)
+
+
+class TestDataFrameSortIndexKinds(TestData):
+
+ def test_sort_index_multicolumn(self):
+ A = np.arange(5).repeat(20)
+ B = np.tile(np.arange(5), 20)
+ random.shuffle(A)
+ random.shuffle(B)
+ frame = DataFrame({'A': A, 'B': B,
+ 'C': np.random.randn(100)})
+
+ # use .sort_values #9816
+ with tm.assert_produces_warning(FutureWarning):
+ frame.sort_index(by=['A', 'B'])
+ result = frame.sort_values(by=['A', 'B'])
+ indexer = np.lexsort((frame['B'], frame['A']))
+ expected = frame.take(indexer)
+ assert_frame_equal(result, expected)
+
+ # use .sort_values #9816
+ with tm.assert_produces_warning(FutureWarning):
+ frame.sort_index(by=['A', 'B'], ascending=False)
+ result = frame.sort_values(by=['A', 'B'], ascending=False)
+ indexer = np.lexsort((frame['B'].rank(ascending=False),
+ frame['A'].rank(ascending=False)))
+ expected = frame.take(indexer)
+ assert_frame_equal(result, expected)
+
+ # use .sort_values #9816
+ with tm.assert_produces_warning(FutureWarning):
+ frame.sort_index(by=['B', 'A'])
+ result = frame.sort_values(by=['B', 'A'])
+ indexer = np.lexsort((frame['A'], frame['B']))
+ expected = frame.take(indexer)
+ assert_frame_equal(result, expected)
+
+ def test_sort_index_inplace(self):
+ frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
+ columns=['A', 'B', 'C', 'D'])
+
+ # axis=0
+ unordered = frame.loc[[3, 2, 4, 1]]
+ a_id = id(unordered['A'])
+ df = unordered.copy()
+ df.sort_index(inplace=True)
+ expected = frame
+ assert_frame_equal(df, expected)
+ assert a_id != id(df['A'])
+
+ df = unordered.copy()
+ df.sort_index(ascending=False, inplace=True)
+ expected = frame[::-1]
+ assert_frame_equal(df, expected)
+
+ # axis=1
+ unordered = frame.loc[:, ['D', 'B', 'C', 'A']]
+ df = unordered.copy()
+ df.sort_index(axis=1, inplace=True)
+ expected = frame
+ assert_frame_equal(df, expected)
+
+ df = unordered.copy()
+ df.sort_index(axis=1, ascending=False, inplace=True)
+ expected = frame.iloc[:, ::-1]
+ assert_frame_equal(df, expected)
+
+ def test_sort_index_different_sortorder(self):
+ A = np.arange(20).repeat(5)
+ B = np.tile(np.arange(5), 20)
+
+ indexer = np.random.permutation(100)
+ A = A.take(indexer)
+ B = B.take(indexer)
+
+ df = DataFrame({'A': A, 'B': B,
+ 'C': np.random.randn(100)})
+
+ # use .sort_values #9816
+ with tm.assert_produces_warning(FutureWarning):
+ df.sort_index(by=['A', 'B'], ascending=[1, 0])
+ result = df.sort_values(by=['A', 'B'], ascending=[1, 0])
+
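+ # np.lexsort treats the last key as primary: ascending A, then
+ # descending B (negated via max - B)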
+ ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
+ expected = df.take(ex_indexer)
+ assert_frame_equal(result, expected)
+
+ # test with multiindex, too
+ idf = df.set_index(['A', 'B'])
+
+ result = idf.sort_index(ascending=[1, 0])
+ expected = idf.take(ex_indexer)
+ assert_frame_equal(result, expected)
+
+ # also, Series!
+ result = idf['C'].sort_index(ascending=[1, 0])
+ assert_series_equal(result, expected['C'])
+
+ def test_sort_index_duplicates(self):
+
+ # with 9816, these are all translated to .sort_values
+
+ df = DataFrame([lrange(5, 9), lrange(4)],
+ columns=['a', 'a', 'b', 'b'])
+
+ with pytest.raises(ValueError, match='not unique'):
+ # use .sort_values #9816
+ with tm.assert_produces_warning(FutureWarning):
+ df.sort_index(by='a')
+ with pytest.raises(ValueError, match='not unique'):
+ df.sort_values(by='a')
+
+ with pytest.raises(ValueError, match='not unique'):
+ # use .sort_values #9816
+ with tm.assert_produces_warning(FutureWarning):
+ df.sort_index(by=['a'])
+ with pytest.raises(ValueError, match='not unique'):
+ df.sort_values(by=['a'])
+
+ with pytest.raises(ValueError, match='not unique'):
+ # use .sort_values #9816
+ with tm.assert_produces_warning(FutureWarning):
+ # multi-column 'by' is separate codepath
+ df.sort_index(by=['a', 'b'])
+ with pytest.raises(ValueError, match='not unique'):
+ # multi-column 'by' is separate codepath
+ df.sort_values(by=['a', 'b'])
+
+ # with multi-index
+ # GH4370
+ df = DataFrame(np.random.randn(4, 2),
+ columns=MultiIndex.from_tuples([('a', 0), ('a', 1)]))
+ with pytest.raises(ValueError, match='level'):
+ # use .sort_values #9816
+ with tm.assert_produces_warning(FutureWarning):
+ df.sort_index(by='a')
+ with pytest.raises(ValueError, match='level'):
+ df.sort_values(by='a')
+
+ # convert tuples to a list of tuples
+ # use .sort_values #9816
+ with tm.assert_produces_warning(FutureWarning):
+ df.sort_index(by=[('a', 1)])
+ expected = df.sort_values(by=[('a', 1)])
+
+ # use .sort_values #9816
+ with tm.assert_produces_warning(FutureWarning):
+ df.sort_index(by=('a', 1))
+ result = df.sort_values(by=('a', 1))
+ assert_frame_equal(result, expected)
+
+ def test_sort_index_level(self):
+ mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
+ df = DataFrame([[1, 2], [3, 4]], mi)
+ res = df.sort_index(level='A', sort_remaining=False)
+ assert_frame_equal(df, res)
+
+ res = df.sort_index(level=['A', 'B'], sort_remaining=False)
+ assert_frame_equal(df, res)
+
+ def test_sort_index_categorical_index(self):
+
+ df = (DataFrame({'A': np.arange(6, dtype='int64'),
+ 'B': Series(list('aabbca'))
+ .astype(CategoricalDtype(list('cab')))})
+ .set_index('B'))
+
+ result = df.sort_index()
+ expected = df.iloc[[4, 0, 1, 5, 2, 3]]
+ assert_frame_equal(result, expected)
+
+ result = df.sort_index(ascending=False)
+ expected = df.iloc[[3, 2, 5, 1, 0, 4]]
+ assert_frame_equal(result, expected)
+
+ def test_sort_index(self):
+ # GH13496
+
+ frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
+ columns=['A', 'B', 'C', 'D'])
+
+ # axis=0 : sort rows by index labels
+ unordered = frame.loc[[3, 2, 4, 1]]
+ result = unordered.sort_index(axis=0)
+ expected = frame
+ assert_frame_equal(result, expected)
+
+ result = unordered.sort_index(ascending=False)
+ expected = frame[::-1]
+ assert_frame_equal(result, expected)
+
+ # axis=1 : sort columns by column names
+ unordered = frame.iloc[:, [2, 1, 3, 0]]
+ result = unordered.sort_index(axis=1)
+ assert_frame_equal(result, frame)
+
+ result = unordered.sort_index(axis=1, ascending=False)
+ expected = frame.iloc[:, ::-1]
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("level", ['A', 0]) # GH 21052
+ def test_sort_index_multiindex(self, level):
+ # GH13496
+
+ # sort rows by specified level of multi-index
+ mi = MultiIndex.from_tuples([
+ [2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC'))
+ df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)
+
+ expected_mi = MultiIndex.from_tuples([
+ [1, 1, 1],
+ [2, 1, 2],
+ [2, 1, 3]], names=list('ABC'))
+ expected = pd.DataFrame([
+ [5, 6],
+ [3, 4],
+ [1, 2]], index=expected_mi)
+ result = df.sort_index(level=level)
+ assert_frame_equal(result, expected)
+
+ # sort_remaining=False
+ expected_mi = MultiIndex.from_tuples([
+ [1, 1, 1],
+ [2, 1, 3],
+ [2, 1, 2]], names=list('ABC'))
+ expected = pd.DataFrame([
+ [5, 6],
+ [1, 2],
+ [3, 4]], index=expected_mi)
+ result = df.sort_index(level=level, sort_remaining=False)
+ assert_frame_equal(result, expected)
+
+ def test_sort_index_intervalindex(self):
+ # this is a de-facto sort via unstack
+ # confirming that we sort in the order of the bins
+ y = Series(np.random.randn(100))
+ x1 = Series(np.sign(np.random.randn(100)))
+ x2 = pd.cut(Series(np.random.randn(100)),
+ bins=[-3, -0.5, 0, 0.5, 3])
+ model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])
+
+ result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
+ expected = IntervalIndex.from_tuples(
+ [(-3.0, -0.5), (-0.5, 0.0),
+ (0.0, 0.5), (0.5, 3.0)],
+ closed='right')
+ result = result.columns.levels[1].categories
+ tm.assert_index_equal(result, expected)
+
+ def test_sort_index_na_position_with_categories(self):
+ # GH 22556
+        # Position missing values correctly when the column is Categorical.
+ categories = ['A', 'B', 'C']
+ category_indices = [0, 2, 4]
+ list_of_nans = [np.nan, np.nan]
+ na_indices = [1, 3]
+ na_position_first = 'first'
+ na_position_last = 'last'
+ column_name = 'c'
+
+ reversed_categories = sorted(categories, reverse=True)
+ reversed_category_indices = sorted(category_indices, reverse=True)
+ reversed_na_indices = sorted(na_indices, reverse=True)
+
+ df = pd.DataFrame({
+ column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
+ categories=categories,
+ ordered=True)})
+ # sort ascending with na first
+ result = df.sort_values(by=column_name,
+ ascending=True,
+ na_position=na_position_first)
+ expected = DataFrame({
+ column_name: Categorical(list_of_nans + categories,
+ categories=categories,
+ ordered=True)
+ }, index=na_indices + category_indices)
+
+ assert_frame_equal(result, expected)
+
+ # sort ascending with na last
+ result = df.sort_values(by=column_name,
+ ascending=True,
+ na_position=na_position_last)
+ expected = DataFrame({
+ column_name: Categorical(categories + list_of_nans,
+ categories=categories,
+ ordered=True)
+ }, index=category_indices + na_indices)
+
+ assert_frame_equal(result, expected)
+
+ # sort descending with na first
+ result = df.sort_values(by=column_name,
+ ascending=False,
+ na_position=na_position_first)
+ expected = DataFrame({
+ column_name: Categorical(list_of_nans + reversed_categories,
+ categories=categories,
+ ordered=True)
+ }, index=reversed_na_indices + reversed_category_indices)
+
+ assert_frame_equal(result, expected)
+
+ # sort descending with na last
+ result = df.sort_values(by=column_name,
+ ascending=False,
+ na_position=na_position_last)
+ expected = DataFrame({
+ column_name: Categorical(reversed_categories + list_of_nans,
+ categories=categories,
+ ordered=True)
+ }, index=reversed_category_indices + reversed_na_indices)
+
+ assert_frame_equal(result, expected)
+
+ def test_sort_index_na_position_with_categories_raises(self):
+ df = pd.DataFrame({
+ 'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
+ categories=['A', 'B', 'C'],
+ ordered=True)})
+
+ with pytest.raises(ValueError):
+ df.sort_values(by='c',
+ ascending=False,
+ na_position='bad_position')
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_subclass.py b/contrib/python/pandas/py2/pandas/tests/frame/test_subclass.py
new file mode 100644
index 00000000000..4f0747c0d69
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_subclass.py
@@ -0,0 +1,573 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Panel, Series
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+
+
+class TestDataFrameSubclassing(TestData):
+
+ def test_frame_subclassing_and_slicing(self):
+ # Subclass frame and ensure it returns the right class on slicing it
+ # In reference to PR 9632
+
+ class CustomSeries(Series):
+
+ @property
+ def _constructor(self):
+ return CustomSeries
+
+ def custom_series_function(self):
+ return 'OK'
+
+ class CustomDataFrame(DataFrame):
+ """
+ Subclasses pandas DF, fills DF with simulation results, adds some
+ custom plotting functions.
+ """
+
+ def __init__(self, *args, **kw):
+ super(CustomDataFrame, self).__init__(*args, **kw)
+
+ @property
+ def _constructor(self):
+ return CustomDataFrame
+
+ _constructor_sliced = CustomSeries
+
+ def custom_frame_function(self):
+ return 'OK'
+
+ data = {'col1': range(10),
+ 'col2': range(10)}
+ cdf = CustomDataFrame(data)
+
+ # Did we get back our own DF class?
+ assert isinstance(cdf, CustomDataFrame)
+
+ # Do we get back our own Series class after selecting a column?
+ cdf_series = cdf.col1
+ assert isinstance(cdf_series, CustomSeries)
+ assert cdf_series.custom_series_function() == 'OK'
+
+ # Do we get back our own DF class after slicing row-wise?
+ cdf_rows = cdf[1:5]
+ assert isinstance(cdf_rows, CustomDataFrame)
+ assert cdf_rows.custom_frame_function() == 'OK'
+
+ # Make sure sliced part of multi-index frame is custom class
+ mcol = pd.MultiIndex.from_tuples([('A', 'A'), ('A', 'B')])
+ cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
+ assert isinstance(cdf_multi['A'], CustomDataFrame)
+
+ mcol = pd.MultiIndex.from_tuples([('A', ''), ('B', '')])
+ cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
+ assert isinstance(cdf_multi2['A'], CustomSeries)
+
+ def test_dataframe_metadata(self):
+ df = tm.SubclassedDataFrame({'X': [1, 2, 3], 'Y': [1, 2, 3]},
+ index=['a', 'b', 'c'])
+ df.testattr = 'XXX'
+
+ assert df.testattr == 'XXX'
+ assert df[['X']].testattr == 'XXX'
+ assert df.loc[['a', 'b'], :].testattr == 'XXX'
+ assert df.iloc[[0, 1], :].testattr == 'XXX'
+
+ # see gh-9776
+ assert df.iloc[0:1, :].testattr == 'XXX'
+
+ # see gh-10553
+ unpickled = tm.round_trip_pickle(df)
+ tm.assert_frame_equal(df, unpickled)
+ assert df._metadata == unpickled._metadata
+ assert df.testattr == unpickled.testattr
+
+ def test_indexing_sliced(self):
+ # GH 11559
+ df = tm.SubclassedDataFrame({'X': [1, 2, 3],
+ 'Y': [4, 5, 6],
+ 'Z': [7, 8, 9]},
+ index=['a', 'b', 'c'])
+ res = df.loc[:, 'X']
+ exp = tm.SubclassedSeries([1, 2, 3], index=list('abc'), name='X')
+ tm.assert_series_equal(res, exp)
+ assert isinstance(res, tm.SubclassedSeries)
+
+ res = df.iloc[:, 1]
+ exp = tm.SubclassedSeries([4, 5, 6], index=list('abc'), name='Y')
+ tm.assert_series_equal(res, exp)
+ assert isinstance(res, tm.SubclassedSeries)
+
+ res = df.loc[:, 'Z']
+ exp = tm.SubclassedSeries([7, 8, 9], index=list('abc'), name='Z')
+ tm.assert_series_equal(res, exp)
+ assert isinstance(res, tm.SubclassedSeries)
+
+ res = df.loc['a', :]
+ exp = tm.SubclassedSeries([1, 4, 7], index=list('XYZ'), name='a')
+ tm.assert_series_equal(res, exp)
+ assert isinstance(res, tm.SubclassedSeries)
+
+ res = df.iloc[1, :]
+ exp = tm.SubclassedSeries([2, 5, 8], index=list('XYZ'), name='b')
+ tm.assert_series_equal(res, exp)
+ assert isinstance(res, tm.SubclassedSeries)
+
+ res = df.loc['c', :]
+ exp = tm.SubclassedSeries([3, 6, 9], index=list('XYZ'), name='c')
+ tm.assert_series_equal(res, exp)
+ assert isinstance(res, tm.SubclassedSeries)
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_to_panel_expanddim(self):
+ # GH 9762
+
+ class SubclassedFrame(DataFrame):
+
+ @property
+ def _constructor_expanddim(self):
+ return SubclassedPanel
+
+ class SubclassedPanel(Panel):
+ pass
+
+ index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)])
+ df = SubclassedFrame({'X': [1, 2, 3], 'Y': [4, 5, 6]}, index=index)
+ result = df.to_panel()
+ assert isinstance(result, SubclassedPanel)
+ expected = SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]],
+ items=['X', 'Y'], major_axis=[0],
+ minor_axis=[0, 1, 2],
+ dtype='int64')
+ tm.assert_panel_equal(result, expected)
+
+ def test_subclass_attr_err_propagation(self):
+ # GH 11808
+ class A(DataFrame):
+
+ @property
+ def bar(self):
+ return self.i_dont_exist
+ with pytest.raises(AttributeError, match='.*i_dont_exist.*'):
+ A().bar
+
+ def test_subclass_align(self):
+ # GH 12983
+ df1 = tm.SubclassedDataFrame({'a': [1, 3, 5],
+ 'b': [1, 3, 5]}, index=list('ACE'))
+ df2 = tm.SubclassedDataFrame({'c': [1, 2, 4],
+ 'd': [1, 2, 4]}, index=list('ABD'))
+
+ res1, res2 = df1.align(df2, axis=0)
+ exp1 = tm.SubclassedDataFrame({'a': [1, np.nan, 3, np.nan, 5],
+ 'b': [1, np.nan, 3, np.nan, 5]},
+ index=list('ABCDE'))
+ exp2 = tm.SubclassedDataFrame({'c': [1, 2, np.nan, 4, np.nan],
+ 'd': [1, 2, np.nan, 4, np.nan]},
+ index=list('ABCDE'))
+ assert isinstance(res1, tm.SubclassedDataFrame)
+ tm.assert_frame_equal(res1, exp1)
+ assert isinstance(res2, tm.SubclassedDataFrame)
+ tm.assert_frame_equal(res2, exp2)
+
+ res1, res2 = df1.a.align(df2.c)
+ assert isinstance(res1, tm.SubclassedSeries)
+ tm.assert_series_equal(res1, exp1.a)
+ assert isinstance(res2, tm.SubclassedSeries)
+ tm.assert_series_equal(res2, exp2.c)
+
+ def test_subclass_align_combinations(self):
+ # GH 12983
+ df = tm.SubclassedDataFrame({'a': [1, 3, 5],
+ 'b': [1, 3, 5]}, index=list('ACE'))
+ s = tm.SubclassedSeries([1, 2, 4], index=list('ABD'), name='x')
+
+ # frame + series
+ res1, res2 = df.align(s, axis=0)
+ exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5],
+ 'b': [1, np.nan, 3, np.nan, 5]},
+ index=list('ABCDE'))
+        # the aligned Series keeps its name ('x')
+ exp2 = pd.Series([1, 2, np.nan, 4, np.nan],
+ index=list('ABCDE'), name='x')
+
+ assert isinstance(res1, tm.SubclassedDataFrame)
+ tm.assert_frame_equal(res1, exp1)
+ assert isinstance(res2, tm.SubclassedSeries)
+ tm.assert_series_equal(res2, exp2)
+
+ # series + frame
+ res1, res2 = s.align(df)
+ assert isinstance(res1, tm.SubclassedSeries)
+ tm.assert_series_equal(res1, exp2)
+ assert isinstance(res2, tm.SubclassedDataFrame)
+ tm.assert_frame_equal(res2, exp1)
+
+ def test_subclass_iterrows(self):
+ # GH 13977
+ df = tm.SubclassedDataFrame({'a': [1]})
+ for i, row in df.iterrows():
+ assert isinstance(row, tm.SubclassedSeries)
+ tm.assert_series_equal(row, df.loc[i])
+
+ def test_subclass_sparse_slice(self):
+ rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
+ ssdf = tm.SubclassedSparseDataFrame(rows)
+ ssdf.testattr = "testattr"
+
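+        # .loc[:2] is label-based and inclusive (rows 0-2); .iloc and [] are positional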
+ tm.assert_sp_frame_equal(ssdf.loc[:2],
+ tm.SubclassedSparseDataFrame(rows[:3]))
+ tm.assert_sp_frame_equal(ssdf.iloc[:2],
+ tm.SubclassedSparseDataFrame(rows[:2]))
+ tm.assert_sp_frame_equal(ssdf[:2],
+ tm.SubclassedSparseDataFrame(rows[:2]))
+ assert ssdf.loc[:2].testattr == "testattr"
+ assert ssdf.iloc[:2].testattr == "testattr"
+ assert ssdf[:2].testattr == "testattr"
+
+ tm.assert_sp_series_equal(ssdf.loc[1],
+ tm.SubclassedSparseSeries(rows[1]),
+ check_names=False,
+ check_kind=False)
+ tm.assert_sp_series_equal(ssdf.iloc[1],
+ tm.SubclassedSparseSeries(rows[1]),
+ check_names=False,
+ check_kind=False)
+
+ def test_subclass_sparse_transpose(self):
+ ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3],
+ [4, 5, 6]])
+ essdf = tm.SubclassedSparseDataFrame([[1, 4],
+ [2, 5],
+ [3, 6]])
+ tm.assert_sp_frame_equal(ossdf.T, essdf)
+
+ def test_subclass_stack(self):
+ # GH 15564
+ df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ index=['a', 'b', 'c'],
+ columns=['X', 'Y', 'Z'])
+
+ res = df.stack()
+ exp = tm.SubclassedSeries(
+ [1, 2, 3, 4, 5, 6, 7, 8, 9],
+ index=[list('aaabbbccc'), list('XYZXYZXYZ')])
+
+ tm.assert_series_equal(res, exp)
+
+ def test_subclass_stack_multi(self):
+ # GH 15564
+ df = tm.SubclassedDataFrame([
+ [10, 11, 12, 13],
+ [20, 21, 22, 23],
+ [30, 31, 32, 33],
+ [40, 41, 42, 43]],
+ index=MultiIndex.from_tuples(
+ list(zip(list('AABB'), list('cdcd'))),
+ names=['aaa', 'ccc']),
+ columns=MultiIndex.from_tuples(
+ list(zip(list('WWXX'), list('yzyz'))),
+ names=['www', 'yyy']))
+
+ exp = tm.SubclassedDataFrame([
+ [10, 12],
+ [11, 13],
+ [20, 22],
+ [21, 23],
+ [30, 32],
+ [31, 33],
+ [40, 42],
+ [41, 43]],
+ index=MultiIndex.from_tuples(list(zip(
+ list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
+ names=['aaa', 'ccc', 'yyy']),
+ columns=Index(['W', 'X'], name='www'))
+
+ res = df.stack()
+ tm.assert_frame_equal(res, exp)
+
+ res = df.stack('yyy')
+ tm.assert_frame_equal(res, exp)
+
+ exp = tm.SubclassedDataFrame([
+ [10, 11],
+ [12, 13],
+ [20, 21],
+ [22, 23],
+ [30, 31],
+ [32, 33],
+ [40, 41],
+ [42, 43]],
+ index=MultiIndex.from_tuples(list(zip(
+ list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
+ names=['aaa', 'ccc', 'www']),
+ columns=Index(['y', 'z'], name='yyy'))
+
+ res = df.stack('www')
+ tm.assert_frame_equal(res, exp)
+
+ def test_subclass_stack_multi_mixed(self):
+ # GH 15564
+ df = tm.SubclassedDataFrame([
+ [10, 11, 12.0, 13.0],
+ [20, 21, 22.0, 23.0],
+ [30, 31, 32.0, 33.0],
+ [40, 41, 42.0, 43.0]],
+ index=MultiIndex.from_tuples(
+ list(zip(list('AABB'), list('cdcd'))),
+ names=['aaa', 'ccc']),
+ columns=MultiIndex.from_tuples(
+ list(zip(list('WWXX'), list('yzyz'))),
+ names=['www', 'yyy']))
+
+ exp = tm.SubclassedDataFrame([
+ [10, 12.0],
+ [11, 13.0],
+ [20, 22.0],
+ [21, 23.0],
+ [30, 32.0],
+ [31, 33.0],
+ [40, 42.0],
+ [41, 43.0]],
+ index=MultiIndex.from_tuples(list(zip(
+ list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
+ names=['aaa', 'ccc', 'yyy']),
+ columns=Index(['W', 'X'], name='www'))
+
+ res = df.stack()
+ tm.assert_frame_equal(res, exp)
+
+ res = df.stack('yyy')
+ tm.assert_frame_equal(res, exp)
+
+ exp = tm.SubclassedDataFrame([
+ [10.0, 11.0],
+ [12.0, 13.0],
+ [20.0, 21.0],
+ [22.0, 23.0],
+ [30.0, 31.0],
+ [32.0, 33.0],
+ [40.0, 41.0],
+ [42.0, 43.0]],
+ index=MultiIndex.from_tuples(list(zip(
+ list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
+ names=['aaa', 'ccc', 'www']),
+ columns=Index(['y', 'z'], name='yyy'))
+
+ res = df.stack('www')
+ tm.assert_frame_equal(res, exp)
+
+ def test_subclass_unstack(self):
+ # GH 15564
+ df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ index=['a', 'b', 'c'],
+ columns=['X', 'Y', 'Z'])
+
+ res = df.unstack()
+ exp = tm.SubclassedSeries(
+ [1, 4, 7, 2, 5, 8, 3, 6, 9],
+ index=[list('XXXYYYZZZ'), list('abcabcabc')])
+
+ tm.assert_series_equal(res, exp)
+
+ def test_subclass_unstack_multi(self):
+ # GH 15564
+ df = tm.SubclassedDataFrame([
+ [10, 11, 12, 13],
+ [20, 21, 22, 23],
+ [30, 31, 32, 33],
+ [40, 41, 42, 43]],
+ index=MultiIndex.from_tuples(
+ list(zip(list('AABB'), list('cdcd'))),
+ names=['aaa', 'ccc']),
+ columns=MultiIndex.from_tuples(
+ list(zip(list('WWXX'), list('yzyz'))),
+ names=['www', 'yyy']))
+
+ exp = tm.SubclassedDataFrame([
+ [10, 20, 11, 21, 12, 22, 13, 23],
+ [30, 40, 31, 41, 32, 42, 33, 43]],
+ index=Index(['A', 'B'], name='aaa'),
+ columns=MultiIndex.from_tuples(list(zip(
+ list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))),
+ names=['www', 'yyy', 'ccc']))
+
+ res = df.unstack()
+ tm.assert_frame_equal(res, exp)
+
+ res = df.unstack('ccc')
+ tm.assert_frame_equal(res, exp)
+
+ exp = tm.SubclassedDataFrame([
+ [10, 30, 11, 31, 12, 32, 13, 33],
+ [20, 40, 21, 41, 22, 42, 23, 43]],
+ index=Index(['c', 'd'], name='ccc'),
+ columns=MultiIndex.from_tuples(list(zip(
+ list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))),
+ names=['www', 'yyy', 'aaa']))
+
+ res = df.unstack('aaa')
+ tm.assert_frame_equal(res, exp)
+
+ def test_subclass_unstack_multi_mixed(self):
+ # GH 15564
+ df = tm.SubclassedDataFrame([
+ [10, 11, 12.0, 13.0],
+ [20, 21, 22.0, 23.0],
+ [30, 31, 32.0, 33.0],
+ [40, 41, 42.0, 43.0]],
+ index=MultiIndex.from_tuples(
+ list(zip(list('AABB'), list('cdcd'))),
+ names=['aaa', 'ccc']),
+ columns=MultiIndex.from_tuples(
+ list(zip(list('WWXX'), list('yzyz'))),
+ names=['www', 'yyy']))
+
+ exp = tm.SubclassedDataFrame([
+ [10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0],
+ [30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0]],
+ index=Index(['A', 'B'], name='aaa'),
+ columns=MultiIndex.from_tuples(list(zip(
+ list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))),
+ names=['www', 'yyy', 'ccc']))
+
+ res = df.unstack()
+ tm.assert_frame_equal(res, exp)
+
+ res = df.unstack('ccc')
+ tm.assert_frame_equal(res, exp)
+
+ exp = tm.SubclassedDataFrame([
+ [10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0],
+ [20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0]],
+ index=Index(['c', 'd'], name='ccc'),
+ columns=MultiIndex.from_tuples(list(zip(
+ list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))),
+ names=['www', 'yyy', 'aaa']))
+
+ res = df.unstack('aaa')
+ tm.assert_frame_equal(res, exp)
+
+ def test_subclass_pivot(self):
+ # GH 15564
+ df = tm.SubclassedDataFrame({
+ 'index': ['A', 'B', 'C', 'C', 'B', 'A'],
+ 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
+ 'values': [1., 2., 3., 3., 2., 1.]})
+
+ pivoted = df.pivot(
+ index='index', columns='columns', values='values')
+
+ expected = tm.SubclassedDataFrame({
+ 'One': {'A': 1., 'B': 2., 'C': 3.},
+ 'Two': {'A': 1., 'B': 2., 'C': 3.}})
+
+ expected.index.name, expected.columns.name = 'index', 'columns'
+
+ tm.assert_frame_equal(pivoted, expected)
+
+ def test_subclassed_melt(self):
+ # GH 15564
+ cheese = tm.SubclassedDataFrame({
+ 'first': ['John', 'Mary'],
+ 'last': ['Doe', 'Bo'],
+ 'height': [5.5, 6.0],
+ 'weight': [130, 150]})
+
+ melted = pd.melt(cheese, id_vars=['first', 'last'])
+
+ expected = tm.SubclassedDataFrame([
+ ['John', 'Doe', 'height', 5.5],
+ ['Mary', 'Bo', 'height', 6.0],
+ ['John', 'Doe', 'weight', 130],
+ ['Mary', 'Bo', 'weight', 150]],
+ columns=['first', 'last', 'variable', 'value'])
+
+ tm.assert_frame_equal(melted, expected)
+
+ def test_subclassed_wide_to_long(self):
+ # GH 9762
+
+ np.random.seed(123)
+ x = np.random.randn(3)
+ df = tm.SubclassedDataFrame({
+ "A1970": {0: "a", 1: "b", 2: "c"},
+ "A1980": {0: "d", 1: "e", 2: "f"},
+ "B1970": {0: 2.5, 1: 1.2, 2: .7},
+ "B1980": {0: 3.2, 1: 1.3, 2: .1},
+ "X": dict(zip(range(3), x))})
+
+ df["id"] = df.index
+ exp_data = {"X": x.tolist() + x.tolist(),
+ "A": ['a', 'b', 'c', 'd', 'e', 'f'],
+ "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+ "year": [1970, 1970, 1970, 1980, 1980, 1980],
+ "id": [0, 1, 2, 0, 1, 2]}
+ expected = tm.SubclassedDataFrame(exp_data)
+ expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
+ long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year")
+
+ tm.assert_frame_equal(long_frame, expected)
+
+ def test_subclassed_apply(self):
+ # GH 19822
+
+ def check_row_subclass(row):
+ assert isinstance(row, tm.SubclassedSeries)
+
+        def stretch(row):
+ if row["variable"] == "height":
+ row["value"] += 0.5
+ return row
+
+ df = tm.SubclassedDataFrame([
+ ['John', 'Doe', 'height', 5.5],
+ ['Mary', 'Bo', 'height', 6.0],
+ ['John', 'Doe', 'weight', 130],
+ ['Mary', 'Bo', 'weight', 150]],
+ columns=['first', 'last', 'variable', 'value'])
+
+ df.apply(lambda x: check_row_subclass(x))
+ df.apply(lambda x: check_row_subclass(x), axis=1)
+
+ expected = tm.SubclassedDataFrame([
+ ['John', 'Doe', 'height', 6.0],
+ ['Mary', 'Bo', 'height', 6.5],
+ ['John', 'Doe', 'weight', 130],
+ ['Mary', 'Bo', 'weight', 150]],
+ columns=['first', 'last', 'variable', 'value'])
+
+        result = df.apply(lambda x: stretch(x), axis=1)
+ assert isinstance(result, tm.SubclassedDataFrame)
+ tm.assert_frame_equal(result, expected)
+
+ expected = tm.SubclassedDataFrame([
+ [1, 2, 3],
+ [1, 2, 3],
+ [1, 2, 3],
+ [1, 2, 3]])
+
+ result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1)
+ assert isinstance(result, tm.SubclassedDataFrame)
+ tm.assert_frame_equal(result, expected)
+
+ result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
+ assert isinstance(result, tm.SubclassedDataFrame)
+ tm.assert_frame_equal(result, expected)
+
+ expected = tm.SubclassedSeries([
+ [1, 2, 3],
+ [1, 2, 3],
+ [1, 2, 3],
+ [1, 2, 3]])
+
+ result = df.apply(lambda x: [1, 2, 3], axis=1)
+ assert not isinstance(result, tm.SubclassedDataFrame)
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_timeseries.py b/contrib/python/pandas/py2/pandas/tests/frame/test_timeseries.py
new file mode 100644
index 00000000000..bc37317f728
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_timeseries.py
@@ -0,0 +1,899 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+from datetime import datetime, time
+
+import numpy as np
+import pytest
+
+from pandas.compat import product
+
+import pandas as pd
+from pandas import (
+ DataFrame, DatetimeIndex, Index, MultiIndex, Series, Timestamp, date_range,
+ period_range, to_datetime)
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_frame_equal, assert_index_equal, assert_series_equal)
+
+import pandas.tseries.offsets as offsets
+
+
+@pytest.fixture(params=product([True, False], [True, False]))
+def close_open_fixture(request):
+ return request.param
+
+
+class TestDataFrameTimeSeriesMethods(TestData):
+
+ def test_diff(self):
+ the_diff = self.tsframe.diff(1)
+
+ assert_series_equal(the_diff['A'],
+ self.tsframe['A'] - self.tsframe['A'].shift(1))
+
+ # int dtype
+ a = 10000000000000000
+ b = a + 1
+ s = Series([a, b])
+
+ rs = DataFrame({'s': s}).diff()
+ assert rs.s[1] == 1
+
+ # mixed numeric
+ tf = self.tsframe.astype('float32')
+ the_diff = tf.diff(1)
+ assert_series_equal(the_diff['A'],
+ tf['A'] - tf['A'].shift(1))
+
+ # issue 10907
+ df = pd.DataFrame({'y': pd.Series([2]), 'z': pd.Series([3])})
+ df.insert(0, 'x', 1)
+ result = df.diff(axis=1)
+ expected = pd.DataFrame({'x': np.nan, 'y': pd.Series(
+ 1), 'z': pd.Series(1)}).astype('float64')
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('tz', [None, 'UTC'])
+ def test_diff_datetime_axis0(self, tz):
+ # GH 18578
+ df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
+ 1: date_range('2010', freq='D', periods=2, tz=tz)})
+
+ result = df.diff(axis=0)
+ expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']),
+ 1: pd.TimedeltaIndex(['NaT', '1 days'])})
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('tz', [None, 'UTC'])
+ def test_diff_datetime_axis1(self, tz):
+ # GH 18578
+ df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
+ 1: date_range('2010', freq='D', periods=2, tz=tz)})
+ if tz is None:
+ result = df.diff(axis=1)
+ expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']),
+ 1: pd.TimedeltaIndex(['0 days',
+ '0 days'])})
+ assert_frame_equal(result, expected)
+ else:
+ with pytest.raises(NotImplementedError):
+ result = df.diff(axis=1)
+
+ def test_diff_timedelta(self):
+ # GH 4533
+ df = DataFrame(dict(time=[Timestamp('20130101 9:01'),
+ Timestamp('20130101 9:02')],
+ value=[1.0, 2.0]))
+
+ res = df.diff()
+ exp = DataFrame([[pd.NaT, np.nan],
+ [pd.Timedelta('00:01:00'), 1]],
+ columns=['time', 'value'])
+ assert_frame_equal(res, exp)
+
+ def test_diff_mixed_dtype(self):
+ df = DataFrame(np.random.randn(5, 3))
+ df['A'] = np.array([1, 2, 3, 4, 5], dtype=object)
+
+ result = df.diff()
+ assert result[0].dtype == np.float64
+
+ def test_diff_neg_n(self):
+ rs = self.tsframe.diff(-1)
+ xp = self.tsframe - self.tsframe.shift(-1)
+ assert_frame_equal(rs, xp)
+
+ def test_diff_float_n(self):
+ rs = self.tsframe.diff(1.)
+ xp = self.tsframe.diff(1)
+ assert_frame_equal(rs, xp)
+
+ def test_diff_axis(self):
+ # GH 9727
+ df = DataFrame([[1., 2.], [3., 4.]])
+ assert_frame_equal(df.diff(axis=1), DataFrame(
+ [[np.nan, 1.], [np.nan, 1.]]))
+ assert_frame_equal(df.diff(axis=0), DataFrame(
+ [[np.nan, np.nan], [2., 2.]]))
+
+ def test_pct_change(self):
+ rs = self.tsframe.pct_change(fill_method=None)
+ assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1)
+
+ rs = self.tsframe.pct_change(2)
+ filled = self.tsframe.fillna(method='pad')
+ assert_frame_equal(rs, filled / filled.shift(2) - 1)
+
+ rs = self.tsframe.pct_change(fill_method='bfill', limit=1)
+ filled = self.tsframe.fillna(method='bfill', limit=1)
+ assert_frame_equal(rs, filled / filled.shift(1) - 1)
+
+ rs = self.tsframe.pct_change(freq='5D')
+ filled = self.tsframe.fillna(method='pad')
+ assert_frame_equal(rs,
+ (filled / filled.shift(freq='5D') - 1)
+ .reindex_like(filled))
+
+ def test_pct_change_shift_over_nas(self):
+ s = Series([1., 1.5, np.nan, 2.5, 3.])
+
+ df = DataFrame({'a': s, 'b': s})
+
+ chg = df.pct_change()
+ expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2])
+ edf = DataFrame({'a': expected, 'b': expected})
+ assert_frame_equal(chg, edf)
+
+ @pytest.mark.parametrize("freq, periods, fill_method, limit",
+ [('5B', 5, None, None),
+ ('3B', 3, None, None),
+ ('3B', 3, 'bfill', None),
+ ('7B', 7, 'pad', 1),
+ ('7B', 7, 'bfill', 3),
+ ('14B', 14, None, None)])
+ def test_pct_change_periods_freq(self, freq, periods, fill_method, limit):
+ # GH 7292
+ rs_freq = self.tsframe.pct_change(freq=freq,
+ fill_method=fill_method,
+ limit=limit)
+ rs_periods = self.tsframe.pct_change(periods,
+ fill_method=fill_method,
+ limit=limit)
+ assert_frame_equal(rs_freq, rs_periods)
+
+ empty_ts = DataFrame(index=self.tsframe.index,
+ columns=self.tsframe.columns)
+ rs_freq = empty_ts.pct_change(freq=freq,
+ fill_method=fill_method,
+ limit=limit)
+ rs_periods = empty_ts.pct_change(periods,
+ fill_method=fill_method,
+ limit=limit)
+ assert_frame_equal(rs_freq, rs_periods)
+
+ def test_frame_ctor_datetime64_column(self):
+ rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
+ dates = np.asarray(rng)
+
+ df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates})
+ assert np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))
+
+ def test_frame_append_datetime64_column(self):
+ rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
+ df = DataFrame(index=np.arange(len(rng)))
+
+ df['A'] = rng
+ assert np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))
+
+ def test_frame_datetime64_pre1900_repr(self):
+ df = DataFrame({'year': date_range('1/1/1700', periods=50,
+ freq='A-DEC')})
+ # it works!
+ repr(df)
+
+ def test_frame_append_datetime64_col_other_units(self):
+ n = 100
+
+ units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y']
+
+ ns_dtype = np.dtype('M8[ns]')
+
+ for unit in units:
+ dtype = np.dtype('M8[%s]' % unit)
+ vals = np.arange(n, dtype=np.int64).view(dtype)
+
+ df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
+ df[unit] = vals
+
+ ex_vals = to_datetime(vals.astype('O')).values
+
+ assert df[unit].dtype == ns_dtype
+ assert (df[unit].values == ex_vals).all()
+
+ # Test insertion into existing datetime64 column
+ df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
+ df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype)
+
+ for unit in units:
+ dtype = np.dtype('M8[%s]' % unit)
+ vals = np.arange(n, dtype=np.int64).view(dtype)
+
+ tmp = df.copy()
+
+ tmp['dates'] = vals
+ ex_vals = to_datetime(vals.astype('O')).values
+
+ assert (tmp['dates'].values == ex_vals).all()
+
+ def test_shift(self):
+ # naive shift
+ shiftedFrame = self.tsframe.shift(5)
+ tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
+
+ shiftedSeries = self.tsframe['A'].shift(5)
+ assert_series_equal(shiftedFrame['A'], shiftedSeries)
+
+ shiftedFrame = self.tsframe.shift(-5)
+ tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
+
+ shiftedSeries = self.tsframe['A'].shift(-5)
+ assert_series_equal(shiftedFrame['A'], shiftedSeries)
+
+ # shift by 0
+ unshifted = self.tsframe.shift(0)
+ assert_frame_equal(unshifted, self.tsframe)
+
+ # shift by DateOffset
+ shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay())
+ assert len(shiftedFrame) == len(self.tsframe)
+
+ shiftedFrame2 = self.tsframe.shift(5, freq='B')
+ assert_frame_equal(shiftedFrame, shiftedFrame2)
+
+ d = self.tsframe.index[0]
+ shifted_d = d + offsets.BDay(5)
+ assert_series_equal(self.tsframe.xs(d),
+ shiftedFrame.xs(shifted_d), check_names=False)
+
+ # shift int frame
+ int_shifted = self.intframe.shift(1) # noqa
+
+ # Shifting with PeriodIndex
+ ps = tm.makePeriodFrame()
+ shifted = ps.shift(1)
+ unshifted = shifted.shift(-1)
+ tm.assert_index_equal(shifted.index, ps.index)
+ tm.assert_index_equal(unshifted.index, ps.index)
+ tm.assert_numpy_array_equal(unshifted.iloc[:, 0].dropna().values,
+ ps.iloc[:-1, 0].values)
+
+ shifted2 = ps.shift(1, 'B')
+ shifted3 = ps.shift(1, offsets.BDay())
+ assert_frame_equal(shifted2, shifted3)
+ assert_frame_equal(ps, shifted2.shift(-1, 'B'))
+
+ msg = 'does not match PeriodIndex freq'
+ with pytest.raises(ValueError, match=msg):
+ ps.shift(freq='D')
+
+ # shift other axis
+ # GH 6371
+ df = DataFrame(np.random.rand(10, 5))
+ expected = pd.concat([DataFrame(np.nan, index=df.index,
+ columns=[0]),
+ df.iloc[:, 0:-1]],
+ ignore_index=True, axis=1)
+ result = df.shift(1, axis=1)
+ assert_frame_equal(result, expected)
+
+ # shift named axis
+ df = DataFrame(np.random.rand(10, 5))
+ expected = pd.concat([DataFrame(np.nan, index=df.index,
+ columns=[0]),
+ df.iloc[:, 0:-1]],
+ ignore_index=True, axis=1)
+ result = df.shift(1, axis='columns')
+ assert_frame_equal(result, expected)
+
+ def test_shift_bool(self):
+ df = DataFrame({'high': [True, False],
+ 'low': [False, False]})
+ rs = df.shift(1)
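+        # the shifted-in NaN forces both bool columns to object dtype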
+ xp = DataFrame(np.array([[np.nan, np.nan],
+ [True, False]], dtype=object),
+ columns=['high', 'low'])
+ assert_frame_equal(rs, xp)
+
+ def test_shift_categorical(self):
+ # GH 9416
+ s1 = pd.Series(['a', 'b', 'c'], dtype='category')
+ s2 = pd.Series(['A', 'B', 'C'], dtype='category')
+ df = DataFrame({'one': s1, 'two': s2})
+ rs = df.shift(1)
+ xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)})
+ assert_frame_equal(rs, xp)
+
+ def test_shift_fill_value(self):
+ # GH #24128
+ df = DataFrame([1, 2, 3, 4, 5],
+ index=date_range('1/1/2000', periods=5, freq='H'))
+ exp = DataFrame([0, 1, 2, 3, 4],
+ index=date_range('1/1/2000', periods=5, freq='H'))
+ result = df.shift(1, fill_value=0)
+ assert_frame_equal(result, exp)
+
+ exp = DataFrame([0, 0, 1, 2, 3],
+ index=date_range('1/1/2000', periods=5, freq='H'))
+ result = df.shift(2, fill_value=0)
+ assert_frame_equal(result, exp)
+
+ def test_shift_empty(self):
+ # Regression test for #8019
+ df = DataFrame({'foo': []})
+ rs = df.shift(-1)
+
+ assert_frame_equal(df, rs)
+
+ def test_shift_duplicate_columns(self):
+ # GH 9092; verify that position-based shifting works
+ # in the presence of duplicate columns
+ column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]]
+ data = np.random.randn(20, 5)
+
+ shifted = []
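+        # shift column s by s + 1 positions, then restore integer labels for comparison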
+ for columns in column_lists:
+ df = pd.DataFrame(data.copy(), columns=columns)
+ for s in range(5):
+ df.iloc[:, s] = df.iloc[:, s].shift(s + 1)
+ df.columns = range(5)
+ shifted.append(df)
+
+ # sanity check the base case
+ nulls = shifted[0].isna().sum()
+ assert_series_equal(nulls, Series(range(1, 6), dtype='int64'))
+
+ # check all answers are the same
+ assert_frame_equal(shifted[0], shifted[1])
+ assert_frame_equal(shifted[0], shifted[2])
+
+ def test_tshift(self):
+ # PeriodIndex
+ ps = tm.makePeriodFrame()
+ shifted = ps.tshift(1)
+ unshifted = shifted.tshift(-1)
+
+ assert_frame_equal(unshifted, ps)
+
+ shifted2 = ps.tshift(freq='B')
+ assert_frame_equal(shifted, shifted2)
+
+ shifted3 = ps.tshift(freq=offsets.BDay())
+ assert_frame_equal(shifted, shifted3)
+
+ with pytest.raises(ValueError, match='does not match'):
+ ps.tshift(freq='M')
+
+ # DatetimeIndex
+ shifted = self.tsframe.tshift(1)
+ unshifted = shifted.tshift(-1)
+
+ assert_frame_equal(self.tsframe, unshifted)
+
+ shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq)
+ assert_frame_equal(shifted, shifted2)
+
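+        # rebuilding with a plain Index drops freq, forcing tshift to infer it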
+ inferred_ts = DataFrame(self.tsframe.values,
+ Index(np.asarray(self.tsframe.index)),
+ columns=self.tsframe.columns)
+ shifted = inferred_ts.tshift(1)
+ unshifted = shifted.tshift(-1)
+ assert_frame_equal(shifted, self.tsframe.tshift(1))
+ assert_frame_equal(unshifted, inferred_ts)
+
+ no_freq = self.tsframe.iloc[[0, 5, 7], :]
+ pytest.raises(ValueError, no_freq.tshift)
+
+ def test_truncate(self):
+ ts = self.tsframe[::3]
+
+ start, end = self.tsframe.index[3], self.tsframe.index[6]
+
+ start_missing = self.tsframe.index[2]
+ end_missing = self.tsframe.index[7]
+
+ # neither specified
+ truncated = ts.truncate()
+ assert_frame_equal(truncated, ts)
+
+ # both specified
+ expected = ts[1:3]
+
+ truncated = ts.truncate(start, end)
+ assert_frame_equal(truncated, expected)
+
+ truncated = ts.truncate(start_missing, end_missing)
+ assert_frame_equal(truncated, expected)
+
+ # start specified
+ expected = ts[1:]
+
+ truncated = ts.truncate(before=start)
+ assert_frame_equal(truncated, expected)
+
+ truncated = ts.truncate(before=start_missing)
+ assert_frame_equal(truncated, expected)
+
+ # end specified
+ expected = ts[:3]
+
+ truncated = ts.truncate(after=end)
+ assert_frame_equal(truncated, expected)
+
+ truncated = ts.truncate(after=end_missing)
+ assert_frame_equal(truncated, expected)
+
+ pytest.raises(ValueError, ts.truncate,
+ before=ts.index[-1] - ts.index.freq,
+ after=ts.index[0] + ts.index.freq)
+
+ def test_truncate_copy(self):
+ index = self.tsframe.index
+ truncated = self.tsframe.truncate(index[5], index[10])
+ truncated.values[:] = 5.
+ assert not (self.tsframe.values[5:11] == 5).any()
+
+ def test_truncate_nonsortedindex(self):
+ # GH 17935
+
+ df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e']},
+ index=[5, 3, 2, 9, 0])
+ msg = 'truncate requires a sorted index'
+ with pytest.raises(ValueError, match=msg):
+ df.truncate(before=3, after=9)
+
+ rng = pd.date_range('2011-01-01', '2012-01-01', freq='W')
+ ts = pd.DataFrame({'A': np.random.randn(len(rng)),
+ 'B': np.random.randn(len(rng))},
+ index=rng)
+ msg = 'truncate requires a sorted index'
+ with pytest.raises(ValueError, match=msg):
+ ts.sort_values('A', ascending=False).truncate(before='2011-11',
+ after='2011-12')
+
+ df = pd.DataFrame({3: np.random.randn(5),
+ 20: np.random.randn(5),
+ 2: np.random.randn(5),
+ 0: np.random.randn(5)},
+ columns=[3, 20, 2, 0])
+ msg = 'truncate requires a sorted index'
+ with pytest.raises(ValueError, match=msg):
+ df.truncate(before=2, after=20, axis=1)
+
+ def test_asfreq(self):
+ offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd())
+ rule_monthly = self.tsframe.asfreq('BM')
+
+ tm.assert_almost_equal(offset_monthly['A'], rule_monthly['A'])
+
+        filled = rule_monthly.asfreq('B', method='pad')  # noqa
+        # TODO: actually check that the forward-fill worked.
+
+ # test does not blow up on length-0 DataFrame
+ zero_length = self.tsframe.reindex([])
+ result = zero_length.asfreq('BM')
+ assert result is not zero_length
+
+ def test_asfreq_datetimeindex(self):
+ df = DataFrame({'A': [1, 2, 3]},
+ index=[datetime(2011, 11, 1), datetime(2011, 11, 2),
+ datetime(2011, 11, 3)])
+ df = df.asfreq('B')
+ assert isinstance(df.index, DatetimeIndex)
+
+ ts = df['A'].asfreq('B')
+ assert isinstance(ts.index, DatetimeIndex)
+
+ def test_asfreq_fillvalue(self):
+ # test for fill value during upsampling, related to issue 3715
+
+ # setup
+ rng = pd.date_range('1/1/2016', periods=10, freq='2S')
+ ts = pd.Series(np.arange(len(rng)), index=rng)
+ df = pd.DataFrame({'one': ts})
+
+ # insert pre-existing missing value
+ df.loc['2016-01-01 00:00:08', 'one'] = None
+
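+        # fill_value fills only the gaps created by upsampling; pre-existing NaNs remain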
+ actual_df = df.asfreq(freq='1S', fill_value=9.0)
+ expected_df = df.asfreq(freq='1S').fillna(9.0)
+ expected_df.loc['2016-01-01 00:00:08', 'one'] = None
+ assert_frame_equal(expected_df, actual_df)
+
+ expected_series = ts.asfreq(freq='1S').fillna(9.0)
+ actual_series = ts.asfreq(freq='1S', fill_value=9.0)
+ assert_series_equal(expected_series, actual_series)
+
+ @pytest.mark.parametrize("data,idx,expected_first,expected_last", [
+ ({'A': [1, 2, 3]}, [1, 1, 2], 1, 2),
+ ({'A': [1, 2, 3]}, [1, 2, 2], 1, 2),
+ ({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'),
+ ({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2),
+ ({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2),
+ ({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)])
+ def test_first_last_valid(self, data, idx,
+ expected_first, expected_last):
+ N = len(self.frame.index)
+ mat = np.random.randn(N)
+ mat[:5] = np.nan
+ mat[-5:] = np.nan
+
+ frame = DataFrame({'foo': mat}, index=self.frame.index)
+ index = frame.first_valid_index()
+
+ assert index == frame.index[5]
+
+ index = frame.last_valid_index()
+ assert index == frame.index[-6]
+
+ # GH12800
+ empty = DataFrame()
+ assert empty.last_valid_index() is None
+ assert empty.first_valid_index() is None
+
+ # GH17400: no valid entries
+ frame[:] = np.nan
+ assert frame.last_valid_index() is None
+ assert frame.first_valid_index() is None
+
+        # GH20499: freq is preserved even when there are holes
+ frame.index = date_range("20110101", periods=N, freq="B")
+ frame.iloc[1] = 1
+ frame.iloc[-2] = 1
+ assert frame.first_valid_index() == frame.index[1]
+ assert frame.last_valid_index() == frame.index[-2]
+ assert frame.first_valid_index().freq == frame.index.freq
+ assert frame.last_valid_index().freq == frame.index.freq
+
+ # GH 21441
+ df = DataFrame(data, index=idx)
+ assert expected_first == df.first_valid_index()
+ assert expected_last == df.last_valid_index()
+
+ def test_first_subset(self):
+ ts = tm.makeTimeDataFrame(freq='12h')
+ result = ts.first('10d')
+ assert len(result) == 20
+
+ ts = tm.makeTimeDataFrame(freq='D')
+ result = ts.first('10d')
+ assert len(result) == 10
+
+ result = ts.first('3M')
+ expected = ts[:'3/31/2000']
+ assert_frame_equal(result, expected)
+
+ result = ts.first('21D')
+ expected = ts[:21]
+ assert_frame_equal(result, expected)
+
+ result = ts[:0].first('3M')
+ assert_frame_equal(result, ts[:0])
+
+ def test_first_raises(self):
+ # GH20725
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+ with pytest.raises(TypeError): # index is not a DatetimeIndex
+ df.first('1D')
+
+ def test_last_subset(self):
+ ts = tm.makeTimeDataFrame(freq='12h')
+ result = ts.last('10d')
+ assert len(result) == 20
+
+ ts = tm.makeTimeDataFrame(nper=30, freq='D')
+ result = ts.last('10d')
+ assert len(result) == 10
+
+ result = ts.last('21D')
+ expected = ts['2000-01-10':]
+ assert_frame_equal(result, expected)
+
+ result = ts.last('21D')
+ expected = ts[-21:]
+ assert_frame_equal(result, expected)
+
+ result = ts[:0].last('3M')
+ assert_frame_equal(result, ts[:0])
+
+ def test_last_raises(self):
+ # GH20725
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+ with pytest.raises(TypeError): # index is not a DatetimeIndex
+ df.last('1D')
+
+ def test_at_time(self):
+ rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+ ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
+ rs = ts.at_time(rng[1])
+ assert (rs.index.hour == rng[1].hour).all()
+ assert (rs.index.minute == rng[1].minute).all()
+ assert (rs.index.second == rng[1].second).all()
+
+ result = ts.at_time('9:30')
+ expected = ts.at_time(time(9, 30))
+ assert_frame_equal(result, expected)
+
+ result = ts.loc[time(9, 30)]
+ expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)]
+
+ assert_frame_equal(result, expected)
+
+ # midnight, everything
+ rng = date_range('1/1/2000', '1/31/2000')
+ ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
+
+ result = ts.at_time(time(0, 0))
+ assert_frame_equal(result, ts)
+
+ # time doesn't exist
+ rng = date_range('1/1/2012', freq='23Min', periods=384)
+ ts = DataFrame(np.random.randn(len(rng), 2), rng)
+ rs = ts.at_time('16:00')
+ assert len(rs) == 0
+
+ def test_at_time_raises(self):
+ # GH20725
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+ with pytest.raises(TypeError): # index is not a DatetimeIndex
+ df.at_time('00:00')
+
+ @pytest.mark.parametrize('axis', ['index', 'columns', 0, 1])
+ def test_at_time_axis(self, axis):
+ # issue 8839
+ rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+ ts = DataFrame(np.random.randn(len(rng), len(rng)))
+ ts.index, ts.columns = rng, rng
+
+ indices = rng[(rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)]
+
+ if axis in ['index', 0]:
+ expected = ts.loc[indices, :]
+ elif axis in ['columns', 1]:
+ expected = ts.loc[:, indices]
+
+ result = ts.at_time('9:30', axis=axis)
+ assert_frame_equal(result, expected)
+
+ def test_between_time(self, close_open_fixture):
+ rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+ ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
+ stime = time(0, 0)
+ etime = time(1, 0)
+ inc_start, inc_end = close_open_fixture
+
+ filtered = ts.between_time(stime, etime, inc_start, inc_end)
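+        # 00:00-01:00 at 5min freq: 13 stamps per day over four days, plus 1/5 00:00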
+ exp_len = 13 * 4 + 1
+ if not inc_start:
+ exp_len -= 5
+ if not inc_end:
+ exp_len -= 4
+
+ assert len(filtered) == exp_len
+ for rs in filtered.index:
+ t = rs.time()
+ if inc_start:
+ assert t >= stime
+ else:
+ assert t > stime
+
+ if inc_end:
+ assert t <= etime
+ else:
+ assert t < etime
+
+ result = ts.between_time('00:00', '01:00')
+ expected = ts.between_time(stime, etime)
+ assert_frame_equal(result, expected)
+
+ # across midnight
+ rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+ ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
+ stime = time(22, 0)
+ etime = time(9, 0)
+
+ filtered = ts.between_time(stime, etime, inc_start, inc_end)
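+        # 22:00-09:00 wraps midnight: 12 * 11 + 1 = 133 stamps per day, plus 1/5 00:00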
+ exp_len = (12 * 11 + 1) * 4 + 1
+ if not inc_start:
+ exp_len -= 4
+ if not inc_end:
+ exp_len -= 4
+
+ assert len(filtered) == exp_len
+ for rs in filtered.index:
+ t = rs.time()
+ if inc_start:
+ assert (t >= stime) or (t <= etime)
+ else:
+ assert (t > stime) or (t <= etime)
+
+ if inc_end:
+ assert (t <= etime) or (t >= stime)
+ else:
+ assert (t < etime) or (t >= stime)
+
+ def test_between_time_raises(self):
+ # GH20725
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+ with pytest.raises(TypeError): # index is not a DatetimeIndex
+ df.between_time(start_time='00:00', end_time='12:00')
+
+ def test_between_time_axis(self, axis):
+ # issue 8839
+ rng = date_range('1/1/2000', periods=100, freq='10min')
+ ts = DataFrame(np.random.randn(len(rng), len(rng)))
+ stime, etime = ('08:00:00', '09:00:00')
+ exp_len = 7
+
+ if axis in ['index', 0]:
+ ts.index = rng
+ assert len(ts.between_time(stime, etime)) == exp_len
+ assert len(ts.between_time(stime, etime, axis=0)) == exp_len
+
+ if axis in ['columns', 1]:
+ ts.columns = rng
+ selected = ts.between_time(stime, etime, axis=1).columns
+ assert len(selected) == exp_len
+
+ def test_between_time_axis_raises(self, axis):
+ # issue 8839
+ rng = date_range('1/1/2000', periods=100, freq='10min')
+ mask = np.arange(0, len(rng))
+ rand_data = np.random.randn(len(rng), len(rng))
+ ts = DataFrame(rand_data, index=rng, columns=rng)
+ stime, etime = ('08:00:00', '09:00:00')
+
+ if axis in ['columns', 1]:
+ ts.index = mask
+ pytest.raises(TypeError, ts.between_time, stime, etime)
+ pytest.raises(TypeError, ts.between_time, stime, etime, axis=0)
+
+ if axis in ['index', 0]:
+ ts.columns = mask
+ pytest.raises(TypeError, ts.between_time, stime, etime, axis=1)
+
+ def test_operation_on_NaT(self):
+ # Both NaT and Timestamp are in DataFrame.
+ df = pd.DataFrame({'foo': [pd.NaT, pd.NaT,
+ pd.Timestamp('2012-05-01')]})
+
+ res = df.min()
+ exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
+ tm.assert_series_equal(res, exp)
+
+ res = df.max()
+ exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
+ tm.assert_series_equal(res, exp)
+
+ # GH12941, only NaTs are in DataFrame.
+ df = pd.DataFrame({'foo': [pd.NaT, pd.NaT]})
+
+ res = df.min()
+ exp = pd.Series([pd.NaT], index=["foo"])
+ tm.assert_series_equal(res, exp)
+
+ res = df.max()
+ exp = pd.Series([pd.NaT], index=["foo"])
+ tm.assert_series_equal(res, exp)
+
+ def test_datetime_assignment_with_NaT_and_diff_time_units(self):
+ # GH 7492
+ data_ns = np.array([1, 'nat'], dtype='datetime64[ns]')
+ result = pd.Series(data_ns).to_frame()
+ result['new'] = data_ns
+ expected = pd.DataFrame({0: [1, None],
+ 'new': [1, None]}, dtype='datetime64[ns]')
+ tm.assert_frame_equal(result, expected)
+ # OutOfBoundsDatetime error shouldn't occur
+ data_s = np.array([1, 'nat'], dtype='datetime64[s]')
+ result['new'] = data_s
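+        # the [s] values are upcast to [ns], so 1 second becomes 1e9 nanoseconds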
+ expected = pd.DataFrame({0: [1, None],
+ 'new': [1e9, None]}, dtype='datetime64[ns]')
+ tm.assert_frame_equal(result, expected)
+
+ def test_frame_to_period(self):
+ K = 5
+
+ dr = date_range('1/1/2000', '1/1/2001')
+ pr = period_range('1/1/2000', '1/1/2001')
+ df = DataFrame(np.random.randn(len(dr), K), index=dr)
+ df['mix'] = 'a'
+
+ pts = df.to_period()
+ exp = df.copy()
+ exp.index = pr
+ assert_frame_equal(pts, exp)
+
+ pts = df.to_period('M')
+ tm.assert_index_equal(pts.index, exp.index.asfreq('M'))
+
+ df = df.T
+ pts = df.to_period(axis=1)
+ exp = df.copy()
+ exp.columns = pr
+ assert_frame_equal(pts, exp)
+
+ pts = df.to_period('M', axis=1)
+ tm.assert_index_equal(pts.columns, exp.columns.asfreq('M'))
+
+ pytest.raises(ValueError, df.to_period, axis=2)
+
+ @pytest.mark.parametrize("fn", ['tz_localize', 'tz_convert'])
+ def test_tz_convert_and_localize(self, fn):
+ l0 = date_range('20140701', periods=5, freq='D')
+ l1 = date_range('20140701', periods=5, freq='D')
+
+ int_idx = Index(range(5))
+
+ if fn == 'tz_convert':
+ l0 = l0.tz_localize('UTC')
+ l1 = l1.tz_localize('UTC')
+
+ for idx in [l0, l1]:
+
+ l0_expected = getattr(idx, fn)('US/Pacific')
+ l1_expected = getattr(idx, fn)('US/Pacific')
+
+ df1 = DataFrame(np.ones(5), index=l0)
+ df1 = getattr(df1, fn)('US/Pacific')
+ assert_index_equal(df1.index, l0_expected)
+
+ # MultiIndex
+ # GH7846
+ df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1]))
+
+ df3 = getattr(df2, fn)('US/Pacific', level=0)
+ assert not df3.index.levels[0].equals(l0)
+ assert_index_equal(df3.index.levels[0], l0_expected)
+ assert_index_equal(df3.index.levels[1], l1)
+ assert not df3.index.levels[1].equals(l1_expected)
+
+ df3 = getattr(df2, fn)('US/Pacific', level=1)
+ assert_index_equal(df3.index.levels[0], l0)
+ assert not df3.index.levels[0].equals(l0_expected)
+ assert_index_equal(df3.index.levels[1], l1_expected)
+ assert not df3.index.levels[1].equals(l1)
+
+ df4 = DataFrame(np.ones(5),
+ MultiIndex.from_arrays([int_idx, l0]))
+
+ # TODO: untested
+ df5 = getattr(df4, fn)('US/Pacific', level=1) # noqa
+
+ assert_index_equal(df3.index.levels[0], l0)
+ assert not df3.index.levels[0].equals(l0_expected)
+ assert_index_equal(df3.index.levels[1], l1_expected)
+ assert not df3.index.levels[1].equals(l1)
+
+ # Bad Inputs
+
+ # Not DatetimeIndex / PeriodIndex
+ with pytest.raises(TypeError, match='DatetimeIndex'):
+ df = DataFrame(index=int_idx)
+ df = getattr(df, fn)('US/Pacific')
+
+ # Not DatetimeIndex / PeriodIndex
+ with pytest.raises(TypeError, match='DatetimeIndex'):
+ df = DataFrame(np.ones(5),
+ MultiIndex.from_arrays([int_idx, l0]))
+ df = getattr(df, fn)('US/Pacific', level=0)
+
+ # Invalid level
+ with pytest.raises(ValueError, match='not valid'):
+ df = DataFrame(index=l0)
+ df = getattr(df, fn)('US/Pacific', level=1)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_timezones.py b/contrib/python/pandas/py2/pandas/tests/frame/test_timezones.py
new file mode 100644
index 00000000000..fd6587c73b8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_timezones.py
@@ -0,0 +1,198 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for DataFrame timezone-related methods
+"""
+from datetime import datetime
+
+import numpy as np
+import pytest
+import pytz
+
+from pandas.compat import lrange
+
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+
+import pandas as pd
+from pandas import DataFrame, Series
+from pandas.core.indexes.datetimes import date_range
+import pandas.util.testing as tm
+
+
+class TestDataFrameTimezones(object):
+
+ def test_frame_values_with_tz(self):
+ tz = "US/Central"
+ df = DataFrame({"A": date_range('2000', periods=4, tz=tz)})
+ result = df.values
+ expected = np.array([
+ [pd.Timestamp('2000-01-01', tz=tz)],
+ [pd.Timestamp('2000-01-02', tz=tz)],
+ [pd.Timestamp('2000-01-03', tz=tz)],
+ [pd.Timestamp('2000-01-04', tz=tz)],
+ ])
+ tm.assert_numpy_array_equal(result, expected)
+
+        # two columns, homogeneous
+
+ df = df.assign(B=df.A)
+ result = df.values
+ expected = np.concatenate([expected, expected], axis=1)
+ tm.assert_numpy_array_equal(result, expected)
+
+        # three columns, heterogeneous
+ est = "US/Eastern"
+ df = df.assign(C=df.A.dt.tz_convert(est))
+
+ new = np.array([
+ [pd.Timestamp('2000-01-01T01:00:00', tz=est)],
+ [pd.Timestamp('2000-01-02T01:00:00', tz=est)],
+ [pd.Timestamp('2000-01-03T01:00:00', tz=est)],
+ [pd.Timestamp('2000-01-04T01:00:00', tz=est)],
+ ])
+ expected = np.concatenate([expected, new], axis=1)
+ result = df.values
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_frame_from_records_utc(self):
+ rec = {'datum': 1.5,
+ 'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)}
+
+ # it works
+ DataFrame.from_records([rec], index='begin_time')
+
+ def test_frame_tz_localize(self):
+ rng = date_range('1/1/2011', periods=100, freq='H')
+
+ df = DataFrame({'a': 1}, index=rng)
+ result = df.tz_localize('utc')
+ expected = DataFrame({'a': 1}, rng.tz_localize('UTC'))
+ assert result.index.tz.zone == 'UTC'
+ tm.assert_frame_equal(result, expected)
+
+ df = df.T
+ result = df.tz_localize('utc', axis=1)
+ assert result.columns.tz.zone == 'UTC'
+ tm.assert_frame_equal(result, expected.T)
+
+ def test_frame_tz_convert(self):
+ rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern')
+
+ df = DataFrame({'a': 1}, index=rng)
+ result = df.tz_convert('Europe/Berlin')
+ expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin'))
+ assert result.index.tz.zone == 'Europe/Berlin'
+ tm.assert_frame_equal(result, expected)
+
+ df = df.T
+ result = df.tz_convert('Europe/Berlin', axis=1)
+ assert result.columns.tz.zone == 'Europe/Berlin'
+ tm.assert_frame_equal(result, expected.T)
+
+ def test_frame_join_tzaware(self):
+ test1 = DataFrame(np.zeros((6, 3)),
+ index=date_range("2012-11-15 00:00:00", periods=6,
+ freq="100L", tz="US/Central"))
+ test2 = DataFrame(np.zeros((3, 3)),
+ index=date_range("2012-11-15 00:00:00", periods=3,
+ freq="250L", tz="US/Central"),
+ columns=lrange(3, 6))
+
+ result = test1.join(test2, how='outer')
+ ex_index = test1.index.union(test2.index)
+
+ tm.assert_index_equal(result.index, ex_index)
+ assert result.index.tz.zone == 'US/Central'
+
+ def test_frame_add_tz_mismatch_converts_to_utc(self):
+ rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern')
+ df = DataFrame(np.random.randn(len(rng)), index=rng, columns=['a'])
+
+ df_moscow = df.tz_convert('Europe/Moscow')
+ result = df + df_moscow
+ assert result.index.tz is pytz.utc
+
+ result = df_moscow + df
+ assert result.index.tz is pytz.utc
+
+ def test_frame_align_aware(self):
+ idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern')
+ idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern')
+ df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
+ df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
+ new1, new2 = df1.align(df2)
+ assert df1.index.tz == new1.index.tz
+ assert df2.index.tz == new2.index.tz
+
+ # different timezones convert to UTC
+
+ # frame with frame
+ df1_central = df1.tz_convert('US/Central')
+ new1, new2 = df1.align(df1_central)
+ assert new1.index.tz == pytz.UTC
+ assert new2.index.tz == pytz.UTC
+
+ # frame with Series
+ new1, new2 = df1.align(df1_central[0], axis=0)
+ assert new1.index.tz == pytz.UTC
+ assert new2.index.tz == pytz.UTC
+
+        # Series with frame
+        new1, new2 = df1[0].align(df1_central, axis=0)
+ assert new1.index.tz == pytz.UTC
+ assert new2.index.tz == pytz.UTC
+
+ @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_frame_no_datetime64_dtype(self, tz):
+ # after GH#7822
+ # these retain the timezones on dict construction
+ dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
+ dr_tz = dr.tz_localize(tz)
+ df = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr)
+ tz_expected = DatetimeTZDtype('ns', dr_tz.tzinfo)
+ assert df['B'].dtype == tz_expected
+
+ # GH#2810 (with timezones)
+ datetimes_naive = [ts.to_pydatetime() for ts in dr]
+ datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
+ df = DataFrame({'dr': dr,
+ 'dr_tz': dr_tz,
+ 'datetimes_naive': datetimes_naive,
+ 'datetimes_with_tz': datetimes_with_tz})
+ result = df.get_dtype_counts().sort_index()
+ expected = Series({'datetime64[ns]': 2,
+ str(tz_expected): 2}).sort_index()
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_frame_reset_index(self, tz):
+ dr = date_range('2012-06-02', periods=10, tz=tz)
+ df = DataFrame(np.random.randn(len(dr)), dr)
+ roundtripped = df.reset_index().set_index('index')
+ xp = df.index.tz
+ rs = roundtripped.index.tz
+ assert xp == rs
+
+ @pytest.mark.parametrize('tz', [None, 'America/New_York'])
+ def test_boolean_compare_transpose_tzindex_with_dst(self, tz):
+ # GH 19970
+ idx = date_range('20161101', '20161130', freq='4H', tz=tz)
+ df = DataFrame({'a': range(len(idx)), 'b': range(len(idx))},
+ index=idx)
+ result = df.T == df.T
+ expected = DataFrame(True, index=list('ab'), columns=idx)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('copy', [True, False])
+ @pytest.mark.parametrize('method, tz', [
+ ['tz_localize', None],
+ ['tz_convert', 'Europe/Berlin']
+ ])
+ def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz):
+ # GH 6326
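+        # neither copy=True nor copy=False may mutate the original frame in place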
+ result = DataFrame(np.arange(0, 5),
+ index=date_range('20131027', periods=5,
+ freq='1H', tz=tz))
+ getattr(result, method)('UTC', copy=copy)
+ expected = DataFrame(np.arange(0, 5),
+ index=date_range('20131027', periods=5,
+ freq='1H', tz=tz))
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_to_csv.py b/contrib/python/pandas/py2/pandas/tests/frame/test_to_csv.py
new file mode 100644
index 00000000000..42bfa75a010
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_to_csv.py
@@ -0,0 +1,1234 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+import csv
+import os
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO, lmap, lrange, range, u
+from pandas.errors import ParserError
+
+import pandas as pd
+from pandas import (
+ DataFrame, Index, MultiIndex, Series, Timestamp, compat, date_range,
+ read_csv, to_datetime)
+import pandas.core.common as com
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal, ensure_clean,
+ makeCustomDataframe as mkdf)
+
+from pandas.io.common import _get_handle
+
+MIXED_FLOAT_DTYPES = ['float16', 'float32', 'float64']
+MIXED_INT_DTYPES = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16',
+ 'int32', 'int64']
+
+
+class TestDataFrameToCSV(TestData):
+
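+    # helper: read back with index_col=0 / parse_dates=True, the defaults
+    # these roundtrip tests assume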
+ def read_csv(self, path, **kwargs):
+ params = dict(index_col=0, parse_dates=True)
+ params.update(**kwargs)
+
+ return pd.read_csv(path, **params)
+
+ def test_from_csv_deprecation(self):
+ # see gh-17812
+ with ensure_clean('__tmp_from_csv_deprecation__') as path:
+ self.tsframe.to_csv(path)
+
+ with tm.assert_produces_warning(FutureWarning):
+ depr_recons = DataFrame.from_csv(path)
+ assert_frame_equal(self.tsframe, depr_recons)
+
+ def test_to_csv_from_csv1(self):
+
+ with ensure_clean('__tmp_to_csv_from_csv1__') as path:
+ self.frame['A'][:5] = np.nan
+
+ self.frame.to_csv(path)
+ self.frame.to_csv(path, columns=['A', 'B'])
+ self.frame.to_csv(path, header=False)
+ self.frame.to_csv(path, index=False)
+
+ # test roundtrip
+ self.tsframe.to_csv(path)
+ recons = self.read_csv(path)
+ assert_frame_equal(self.tsframe, recons)
+
+ self.tsframe.to_csv(path, index_label='index')
+ recons = self.read_csv(path, index_col=None)
+
+            assert len(recons.columns) == len(self.tsframe.columns) + 1
+
+ # no index
+ self.tsframe.to_csv(path, index=False)
+ recons = self.read_csv(path, index_col=None)
+ assert_almost_equal(self.tsframe.values, recons.values)
+
+ # corner case
+ dm = DataFrame({'s1': Series(lrange(3), lrange(3)),
+ 's2': Series(lrange(2), lrange(2))})
+ dm.to_csv(path)
+
+ recons = self.read_csv(path)
+ assert_frame_equal(dm, recons)
+
+ def test_to_csv_from_csv2(self):
+
+ with ensure_clean('__tmp_to_csv_from_csv2__') as path:
+
+ # duplicate index
+ df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'],
+ columns=['x', 'y', 'z'])
+ df.to_csv(path)
+ result = self.read_csv(path)
+ assert_frame_equal(result, df)
+
+ midx = MultiIndex.from_tuples(
+ [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
+ df = DataFrame(np.random.randn(3, 3), index=midx,
+ columns=['x', 'y', 'z'])
+
+ df.to_csv(path)
+ result = self.read_csv(path, index_col=[0, 1, 2],
+ parse_dates=False)
+ assert_frame_equal(result, df, check_names=False)
+
+ # column aliases
+ col_aliases = Index(['AA', 'X', 'Y', 'Z'])
+ self.frame2.to_csv(path, header=col_aliases)
+
+ rs = self.read_csv(path)
+ xp = self.frame2.copy()
+ xp.columns = col_aliases
+ assert_frame_equal(xp, rs)
+
+            with pytest.raises(ValueError):
+                self.frame2.to_csv(path, header=['AA', 'X'])
+
+ def test_to_csv_from_csv3(self):
+
+ with ensure_clean('__tmp_to_csv_from_csv3__') as path:
+ df1 = DataFrame(np.random.randn(3, 1))
+ df2 = DataFrame(np.random.randn(3, 1))
+
+ df1.to_csv(path)
+ df2.to_csv(path, mode='a', header=False)
+ xp = pd.concat([df1, df2])
+ rs = pd.read_csv(path, index_col=0)
+ rs.columns = lmap(int, rs.columns)
+ xp.columns = lmap(int, xp.columns)
+ assert_frame_equal(xp, rs)
+
+ def test_to_csv_from_csv4(self):
+
+ with ensure_clean('__tmp_to_csv_from_csv4__') as path:
+ # GH 10833 (TimedeltaIndex formatting)
+ dt = pd.Timedelta(seconds=1)
+ df = pd.DataFrame({'dt_data': [i * dt for i in range(3)]},
+ index=pd.Index([i * dt for i in range(3)],
+ name='dt_index'))
+ df.to_csv(path)
+
+ result = pd.read_csv(path, index_col='dt_index')
+ result.index = pd.to_timedelta(result.index)
+ # TODO: remove renaming when GH 10875 is solved
+ result.index = result.index.rename('dt_index')
+ result['dt_data'] = pd.to_timedelta(result['dt_data'])
+
+ assert_frame_equal(df, result, check_index_type=True)
+
+ def test_to_csv_from_csv5(self):
+
+ # tz, 8260
+ with ensure_clean('__tmp_to_csv_from_csv5__') as path:
+
+ self.tzframe.to_csv(path)
+ result = pd.read_csv(path, index_col=0, parse_dates=['A'])
+
+ converter = lambda c: to_datetime(result[c]).dt.tz_convert(
+ 'UTC').dt.tz_convert(self.tzframe[c].dt.tz)
+ result['B'] = converter('B')
+ result['C'] = converter('C')
+ assert_frame_equal(result, self.tzframe)
+
+ def test_to_csv_cols_reordering(self):
+ # GH3454
+
+ chunksize = 5
+ N = int(chunksize * 2.5)
+
+ df = mkdf(N, 3)
+ cs = df.columns
+ cols = [cs[2], cs[0]]
+
+ with ensure_clean() as path:
+ df.to_csv(path, columns=cols, chunksize=chunksize)
+ rs_c = pd.read_csv(path, index_col=0)
+
+ assert_frame_equal(df[cols], rs_c, check_names=False)
+
+ def test_to_csv_new_dupe_cols(self):
+
+ def _check_df(df, cols=None):
+ with ensure_clean() as path:
+ df.to_csv(path, columns=cols, chunksize=chunksize)
+ rs_c = pd.read_csv(path, index_col=0)
+
+ # we wrote them in a different order
+ # so compare them in that order
+ if cols is not None:
+
+ if df.columns.is_unique:
+ rs_c.columns = cols
+ else:
+ indexer, missing = df.columns.get_indexer_non_unique(
+ cols)
+ rs_c.columns = df.columns.take(indexer)
+
+ for c in cols:
+ obj_df = df[c]
+ obj_rs = rs_c[c]
+ if isinstance(obj_df, Series):
+ assert_series_equal(obj_df, obj_rs)
+ else:
+ assert_frame_equal(
+ obj_df, obj_rs, check_names=False)
+
+ # wrote in the same order
+ else:
+ rs_c.columns = df.columns
+ assert_frame_equal(df, rs_c, check_names=False)
+
+ chunksize = 5
+ N = int(chunksize * 2.5)
+
+ # dupe cols
+ df = mkdf(N, 3)
+ df.columns = ['a', 'a', 'b']
+ _check_df(df, None)
+
+ # dupe cols with selection
+ cols = ['b', 'a']
+ _check_df(df, cols)
+
+ @pytest.mark.slow
+ def test_to_csv_dtnat(self):
+ # GH3437
+ from pandas import NaT
+
+ def make_dtnat_arr(n, nnat=None):
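+            # build a list of datetimes with roughly `nnat` NaT values
+            # scattered at random positions (10% of n by default)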
+ if nnat is None:
+ nnat = int(n * 0.1) # 10%
+ s = list(date_range('2000', freq='5min', periods=n))
+ if nnat:
+ for i in np.random.randint(0, len(s), nnat):
+ s[i] = NaT
+ i = np.random.randint(100)
+ s[-i] = NaT
+ s[i] = NaT
+ return s
+
+ chunksize = 1000
+ s1 = make_dtnat_arr(chunksize + 5)
+ s2 = make_dtnat_arr(chunksize + 5, 0)
+
+ with ensure_clean('1.csv') as pth:
+ df = DataFrame(dict(a=s1, b=s2))
+ df.to_csv(pth, chunksize=chunksize)
+
+ recons = self.read_csv(pth)._convert(datetime=True,
+ coerce=True)
+ assert_frame_equal(df, recons, check_names=False,
+ check_less_precise=True)
+
+ @pytest.mark.slow
+ def test_to_csv_moar(self):
+
+ def _do_test(df, r_dtype=None, c_dtype=None,
+ rnlvl=None, cnlvl=None, dupe_col=False):
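+            # dtype codes (cf. type_map below): 'i' int, 'f' float, 's' str,
+            # 'u' unicode, 'dt' datetime, 'p' period; rnlvl/cnlvl give the
+            # number of index/column levels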
+
+ kwargs = dict(parse_dates=False)
+ if cnlvl:
+ if rnlvl is not None:
+ kwargs['index_col'] = lrange(rnlvl)
+ kwargs['header'] = lrange(cnlvl)
+
+ with ensure_clean('__tmp_to_csv_moar__') as path:
+ df.to_csv(path, encoding='utf8',
+ chunksize=chunksize)
+ recons = self.read_csv(path, **kwargs)
+ else:
+ kwargs['header'] = 0
+
+ with ensure_clean('__tmp_to_csv_moar__') as path:
+ df.to_csv(path, encoding='utf8', chunksize=chunksize)
+ recons = self.read_csv(path, **kwargs)
+
+ def _to_uni(x):
+ if not isinstance(x, compat.text_type):
+ return x.decode('utf8')
+ return x
+ if dupe_col:
+                # read_csv disambiguates the columns by labeling them
+                # dupe.1, dupe.2, etc. Monkey-patch the columns back.
+ recons.columns = df.columns
+ if rnlvl and not cnlvl:
+ delta_lvl = [recons.iloc[
+ :, i].values for i in range(rnlvl - 1)]
+ ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
+ recons.index = ix
+ recons = recons.iloc[:, rnlvl - 1:]
+
+ type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')
+ if r_dtype:
+ if r_dtype == 'u': # unicode
+ r_dtype = 'O'
+ recons.index = np.array(lmap(_to_uni, recons.index),
+ dtype=r_dtype)
+ df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype)
+                elif r_dtype == 'dt':  # datetime
+ r_dtype = 'O'
+ recons.index = np.array(lmap(Timestamp, recons.index),
+ dtype=r_dtype)
+ df.index = np.array(
+ lmap(Timestamp, df.index), dtype=r_dtype)
+ elif r_dtype == 'p':
+ r_dtype = 'O'
+ recons.index = np.array(
+ list(map(Timestamp, to_datetime(recons.index))),
+ dtype=r_dtype)
+ df.index = np.array(
+ list(map(Timestamp, df.index.to_timestamp())),
+ dtype=r_dtype)
+ else:
+ r_dtype = type_map.get(r_dtype)
+ recons.index = np.array(recons.index, dtype=r_dtype)
+ df.index = np.array(df.index, dtype=r_dtype)
+ if c_dtype:
+ if c_dtype == 'u':
+ c_dtype = 'O'
+ recons.columns = np.array(lmap(_to_uni, recons.columns),
+ dtype=c_dtype)
+ df.columns = np.array(
+ lmap(_to_uni, df.columns), dtype=c_dtype)
+ elif c_dtype == 'dt':
+ c_dtype = 'O'
+ recons.columns = np.array(lmap(Timestamp, recons.columns),
+ dtype=c_dtype)
+ df.columns = np.array(
+ lmap(Timestamp, df.columns), dtype=c_dtype)
+ elif c_dtype == 'p':
+ c_dtype = 'O'
+ recons.columns = np.array(
+ lmap(Timestamp, to_datetime(recons.columns)),
+ dtype=c_dtype)
+ df.columns = np.array(
+ lmap(Timestamp, df.columns.to_timestamp()),
+ dtype=c_dtype)
+ else:
+ c_dtype = type_map.get(c_dtype)
+ recons.columns = np.array(recons.columns, dtype=c_dtype)
+ df.columns = np.array(df.columns, dtype=c_dtype)
+
+ assert_frame_equal(df, recons, check_names=False,
+ check_less_precise=True)
+
+ N = 100
+ chunksize = 1000
+
+ for ncols in [4]:
+            base = int(chunksize // ncols or 1)
+ for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2,
+ 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
+ base - 1, base, base + 1]:
+ _do_test(mkdf(nrows, ncols, r_idx_type='dt',
+ c_idx_type='s'), 'dt', 's')
+
+ for r_idx_type, c_idx_type in [('i', 'i'), ('s', 's'), ('u', 'dt'),
+ ('p', 'p')]:
+ for ncols in [1, 2, 3, 4]:
+                base = int(chunksize // ncols or 1)
+ for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2,
+ 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
+ base - 1, base, base + 1]:
+ _do_test(mkdf(nrows, ncols, r_idx_type=r_idx_type,
+ c_idx_type=c_idx_type),
+ r_idx_type, c_idx_type)
+
+ for ncols in [1, 2, 3, 4]:
+            base = int(chunksize // ncols or 1)
+ for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2,
+ 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
+ base - 1, base, base + 1]:
+ _do_test(mkdf(nrows, ncols))
+
+ for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
+ df = mkdf(nrows, 3)
+ cols = list(df.columns)
+ cols[:2] = ["dupe", "dupe"]
+ cols[-2:] = ["dupe", "dupe"]
+ ix = list(df.index)
+ ix[:2] = ["rdupe", "rdupe"]
+ ix[-2:] = ["rdupe", "rdupe"]
+ df.index = ix
+ df.columns = cols
+ _do_test(df, dupe_col=True)
+
+ _do_test(DataFrame(index=lrange(10)))
+ _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2)
+ for ncols in [2, 3, 4]:
+ base = int(chunksize // ncols)
+ for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2,
+ 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
+ base - 1, base, base + 1]:
+ _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), rnlvl=2)
+ _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), cnlvl=2)
+ _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2),
+ rnlvl=2, cnlvl=2)
+
+ def test_to_csv_from_csv_w_some_infs(self):
+
+ # test roundtrip with inf, -inf, nan, as full columns and mix
+ self.frame['G'] = np.nan
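+        # column 'H': a random ~50/50 mix of inf and nan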
+ f = lambda x: [np.inf, np.nan][np.random.rand() < .5]
+ self.frame['H'] = self.frame.index.map(f)
+
+ with ensure_clean() as path:
+ self.frame.to_csv(path)
+ recons = self.read_csv(path)
+
+ # TODO to_csv drops column name
+ assert_frame_equal(self.frame, recons, check_names=False)
+ assert_frame_equal(np.isinf(self.frame),
+ np.isinf(recons), check_names=False)
+
+ def test_to_csv_from_csv_w_all_infs(self):
+
+ # test roundtrip with inf, -inf, nan, as full columns and mix
+ self.frame['E'] = np.inf
+ self.frame['F'] = -np.inf
+
+ with ensure_clean() as path:
+ self.frame.to_csv(path)
+ recons = self.read_csv(path)
+
+ # TODO to_csv drops column name
+ assert_frame_equal(self.frame, recons, check_names=False)
+ assert_frame_equal(np.isinf(self.frame),
+ np.isinf(recons), check_names=False)
+
+ def test_to_csv_no_index(self):
+ # GH 3624, after appending columns, to_csv fails
+ with ensure_clean('__tmp_to_csv_no_index__') as path:
+ df = DataFrame({'c1': [1, 2, 3], 'c2': [4, 5, 6]})
+ df.to_csv(path, index=False)
+ result = read_csv(path)
+ assert_frame_equal(df, result)
+ df['c3'] = Series([7, 8, 9], dtype='int64')
+ df.to_csv(path, index=False)
+ result = read_csv(path)
+ assert_frame_equal(df, result)
+
+ def test_to_csv_with_mix_columns(self):
+        # gh-11637: incorrect output when a mix of integer and string
+        # column names is passed as the columns parameter to to_csv
+
+ df = DataFrame({0: ['a', 'b', 'c'],
+ 1: ['aa', 'bb', 'cc']})
+ df['test'] = 'txt'
+ assert df.to_csv() == df.to_csv(columns=[0, 1, 'test'])
+
+ def test_to_csv_headers(self):
+ # GH6186, the presence or absence of `index` incorrectly
+ # causes to_csv to have different header semantics.
+ from_df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
+ to_df = DataFrame([[1, 2], [3, 4]], columns=['X', 'Y'])
+ with ensure_clean('__tmp_to_csv_headers__') as path:
+ from_df.to_csv(path, header=['X', 'Y'])
+ recons = self.read_csv(path)
+
+ assert_frame_equal(to_df, recons)
+
+ from_df.to_csv(path, index=False, header=['X', 'Y'])
+ recons = self.read_csv(path)
+
+ recons.reset_index(inplace=True)
+ assert_frame_equal(to_df, recons)
+
+ def test_to_csv_multiindex(self):
+
+ frame = self.frame
+ old_index = frame.index
+ arrays = np.arange(len(old_index) * 2).reshape(2, -1)
+ new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
+ frame.index = new_index
+
+ with ensure_clean('__tmp_to_csv_multiindex__') as path:
+
+ frame.to_csv(path, header=False)
+ frame.to_csv(path, columns=['A', 'B'])
+
+ # round trip
+ frame.to_csv(path)
+
+ df = self.read_csv(path, index_col=[0, 1],
+ parse_dates=False)
+
+ # TODO to_csv drops column name
+ assert_frame_equal(frame, df, check_names=False)
+ assert frame.index.names == df.index.names
+
+ # needed if setUp becomes a class method
+ self.frame.index = old_index
+
+ # try multiindex with dates
+ tsframe = self.tsframe
+ old_index = tsframe.index
+ new_index = [old_index, np.arange(len(old_index))]
+ tsframe.index = MultiIndex.from_arrays(new_index)
+
+ tsframe.to_csv(path, index_label=['time', 'foo'])
+ recons = self.read_csv(path, index_col=[0, 1])
+
+ # TODO to_csv drops column name
+ assert_frame_equal(tsframe, recons, check_names=False)
+
+ # do not load index
+ tsframe.to_csv(path)
+ recons = self.read_csv(path, index_col=None)
+ assert len(recons.columns) == len(tsframe.columns) + 2
+
+ # no index
+ tsframe.to_csv(path, index=False)
+ recons = self.read_csv(path, index_col=None)
+ assert_almost_equal(recons.values, self.tsframe.values)
+
+ # needed if setUp becomes class method
+ self.tsframe.index = old_index
+
+ with ensure_clean('__tmp_to_csv_multiindex__') as path:
+ # GH3571, GH1651, GH3141
+
+ def _make_frame(names=None):
+ if names is True:
+ names = ['first', 'second']
+ return DataFrame(np.random.randint(0, 10, size=(3, 3)),
+ columns=MultiIndex.from_tuples(
+ [('bah', 'foo'),
+ ('bah', 'bar'),
+ ('ban', 'baz')], names=names),
+ dtype='int64')
+
+ # column & index are multi-index
+ df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
+ df.to_csv(path)
+ result = read_csv(path, header=[0, 1, 2, 3],
+ index_col=[0, 1])
+ assert_frame_equal(df, result)
+
+ # column is mi
+ df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
+ df.to_csv(path)
+ result = read_csv(
+ path, header=[0, 1, 2, 3], index_col=0)
+ assert_frame_equal(df, result)
+
+ # dup column names?
+ df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
+ df.to_csv(path)
+ result = read_csv(path, header=[0, 1, 2, 3],
+ index_col=[0, 1, 2])
+ assert_frame_equal(df, result)
+
+ # writing with no index
+ df = _make_frame()
+ df.to_csv(path, index=False)
+ result = read_csv(path, header=[0, 1])
+ assert_frame_equal(df, result)
+
+ # we lose the names here
+ df = _make_frame(True)
+ df.to_csv(path, index=False)
+ result = read_csv(path, header=[0, 1])
+ assert com._all_none(*result.columns.names)
+ result.columns.names = df.columns.names
+ assert_frame_equal(df, result)
+
+ # tupleize_cols=True and index=False
+ df = _make_frame(True)
+ with tm.assert_produces_warning(FutureWarning):
+ df.to_csv(path, tupleize_cols=True, index=False)
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = read_csv(path, header=0,
+ tupleize_cols=True,
+ index_col=None)
+ result.columns = df.columns
+ assert_frame_equal(df, result)
+
+ # whatsnew example
+ df = _make_frame()
+ df.to_csv(path)
+ result = read_csv(path, header=[0, 1],
+ index_col=[0])
+ assert_frame_equal(df, result)
+
+ df = _make_frame(True)
+ df.to_csv(path)
+ result = read_csv(path, header=[0, 1],
+ index_col=[0])
+ assert_frame_equal(df, result)
+
+ # column & index are multi-index (compatibility)
+ df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
+ with tm.assert_produces_warning(FutureWarning):
+ df.to_csv(path, tupleize_cols=True)
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = read_csv(path, header=0, index_col=[0, 1],
+ tupleize_cols=True)
+ result.columns = df.columns
+ assert_frame_equal(df, result)
+
+ # invalid options
+ df = _make_frame(True)
+ df.to_csv(path)
+
+ for i in [6, 7]:
+ msg = 'len of {i}, but only 5 lines in file'.format(i=i)
+ with pytest.raises(ParserError, match=msg):
+ read_csv(path, header=lrange(i), index_col=0)
+
+ # write with cols
+ msg = 'cannot specify cols with a MultiIndex'
+ with pytest.raises(TypeError, match=msg):
+ df.to_csv(path, columns=['foo', 'bar'])
+
+ with ensure_clean('__tmp_to_csv_multiindex__') as path:
+ # empty
+ tsframe[:0].to_csv(path)
+ recons = self.read_csv(path)
+
+ exp = tsframe[:0]
+ exp.index = []
+
+ tm.assert_index_equal(recons.columns, exp.columns)
+ assert len(recons) == 0
+
+ def test_to_csv_float32_nanrep(self):
+ df = DataFrame(np.random.randn(1, 4).astype(np.float32))
+ df[1] = np.nan
+
+ with ensure_clean('__tmp_to_csv_float32_nanrep__.csv') as path:
+ df.to_csv(path, na_rep=999)
+
+ with open(path) as f:
+ lines = f.readlines()
+ assert lines[1].split(',')[2] == '999'
+
+ def test_to_csv_withcommas(self):
+
+ # Commas inside fields should be correctly escaped when saving as CSV.
+ df = DataFrame({'A': [1, 2, 3], 'B': ['5,6', '7,8', '9,0']})
+
+ with ensure_clean('__tmp_to_csv_withcommas__.csv') as path:
+ df.to_csv(path)
+ df2 = self.read_csv(path)
+ assert_frame_equal(df2, df)
+
+ def test_to_csv_mixed(self):
+
+ def create_cols(name):
+ return ["%s%03d" % (name, i) for i in range(5)]
+
+ df_float = DataFrame(np.random.randn(
+ 100, 5), dtype='float64', columns=create_cols('float'))
+ df_int = DataFrame(np.random.randn(100, 5),
+ dtype='int64', columns=create_cols('int'))
+ df_bool = DataFrame(True, index=df_float.index,
+ columns=create_cols('bool'))
+ df_object = DataFrame('foo', index=df_float.index,
+ columns=create_cols('object'))
+ df_dt = DataFrame(Timestamp('20010101'),
+ index=df_float.index, columns=create_cols('date'))
+
+ # add in some nans
+ df_float.loc[30:50, 1:3] = np.nan
+
+        # commented out: inserting NaNs here trips a bug in read_csv
+        # df_dt.loc[30:50, 1:3] = np.nan
+
+ df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
+
+ # dtype
+ dtypes = dict()
+ for n, dtype in [('float', np.float64), ('int', np.int64),
+ ('bool', np.bool), ('object', np.object)]:
+ for c in create_cols(n):
+ dtypes[c] = dtype
+
+ with ensure_clean() as filename:
+ df.to_csv(filename)
+ rs = read_csv(filename, index_col=0, dtype=dtypes,
+ parse_dates=create_cols('date'))
+ assert_frame_equal(rs, df)
+
+ def test_to_csv_dups_cols(self):
+
+ df = DataFrame(np.random.randn(1000, 30), columns=lrange(
+ 15) + lrange(15), dtype='float64')
+
+ with ensure_clean() as filename:
+ df.to_csv(filename) # single dtype, fine
+ result = read_csv(filename, index_col=0)
+ result.columns = df.columns
+ assert_frame_equal(result, df)
+
+ df_float = DataFrame(np.random.randn(1000, 3), dtype='float64')
+ df_int = DataFrame(np.random.randn(1000, 3), dtype='int64')
+ df_bool = DataFrame(True, index=df_float.index, columns=lrange(3))
+ df_object = DataFrame('foo', index=df_float.index, columns=lrange(3))
+ df_dt = DataFrame(Timestamp('20010101'),
+ index=df_float.index, columns=lrange(3))
+ df = pd.concat([df_float, df_int, df_bool, df_object,
+ df_dt], axis=1, ignore_index=True)
+
+ cols = []
+ for i in range(5):
+ cols.extend([0, 1, 2])
+ df.columns = cols
+
+ with ensure_clean() as filename:
+ df.to_csv(filename)
+ result = read_csv(filename, index_col=0)
+
+ # date cols
+ for i in ['0.4', '1.4', '2.4']:
+ result[i] = to_datetime(result[i])
+
+ result.columns = df.columns
+ assert_frame_equal(result, df)
+
+ # GH3457
+
+ N = 10
+ df = mkdf(N, 3)
+ df.columns = ['a', 'a', 'b']
+
+ with ensure_clean() as filename:
+ df.to_csv(filename)
+
+ # read_csv will rename the dups columns
+ result = read_csv(filename, index_col=0)
+ result = result.rename(columns={'a.1': 'a'})
+ assert_frame_equal(result, df)
+
+ def test_to_csv_chunking(self):
+
+ aa = DataFrame({'A': lrange(100000)})
+ aa['B'] = aa.A + 1.0
+ aa['C'] = aa.A + 2.0
+ aa['D'] = aa.A + 3.0
+
+ for chunksize in [10000, 50000, 100000]:
+ with ensure_clean() as filename:
+ aa.to_csv(filename, chunksize=chunksize)
+ rs = read_csv(filename, index_col=0)
+ assert_frame_equal(rs, aa)
+
+ @pytest.mark.slow
+ def test_to_csv_wide_frame_formatting(self):
+ # Issue #8621
+ df = DataFrame(np.random.randn(1, 100010), columns=None, index=None)
+ with ensure_clean() as filename:
+ df.to_csv(filename, header=False, index=False)
+ rs = read_csv(filename, header=None)
+ assert_frame_equal(rs, df)
+
+ def test_to_csv_bug(self):
+ f1 = StringIO('a,1.0\nb,2.0')
+ df = self.read_csv(f1, header=None)
+ newdf = DataFrame({'t': df[df.columns[0]]})
+
+ with ensure_clean() as path:
+ newdf.to_csv(path)
+
+ recons = read_csv(path, index_col=0)
+ # don't check_names as t != 1
+ assert_frame_equal(recons, newdf, check_names=False)
+
+ def test_to_csv_unicode(self):
+
+ df = DataFrame({u('c/\u03c3'): [1, 2, 3]})
+ with ensure_clean() as path:
+
+ df.to_csv(path, encoding='UTF-8')
+ df2 = read_csv(path, index_col=0, encoding='UTF-8')
+ assert_frame_equal(df, df2)
+
+ df.to_csv(path, encoding='UTF-8', index=False)
+ df2 = read_csv(path, index_col=None, encoding='UTF-8')
+ assert_frame_equal(df, df2)
+
+ def test_to_csv_unicode_index_col(self):
+ buf = StringIO('')
+ df = DataFrame(
+ [[u("\u05d0"), "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]],
+ columns=[u("\u05d0"),
+ u("\u05d1"), u("\u05d2"), u("\u05d3")],
+ index=[u("\u05d0"), u("\u05d1")])
+
+ df.to_csv(buf, encoding='UTF-8')
+ buf.seek(0)
+
+ df2 = read_csv(buf, index_col=0, encoding='UTF-8')
+ assert_frame_equal(df, df2)
+
+ def test_to_csv_stringio(self):
+ buf = StringIO()
+ self.frame.to_csv(buf)
+ buf.seek(0)
+ recons = read_csv(buf, index_col=0)
+ # TODO to_csv drops column name
+ assert_frame_equal(recons, self.frame, check_names=False)
+
+ def test_to_csv_float_format(self):
+
+ df = DataFrame([[0.123456, 0.234567, 0.567567],
+ [12.32112, 123123.2, 321321.2]],
+ index=['A', 'B'], columns=['X', 'Y', 'Z'])
+
+ with ensure_clean() as filename:
+
+ df.to_csv(filename, float_format='%.2f')
+
+ rs = read_csv(filename, index_col=0)
+ xp = DataFrame([[0.12, 0.23, 0.57],
+ [12.32, 123123.20, 321321.20]],
+ index=['A', 'B'], columns=['X', 'Y', 'Z'])
+ assert_frame_equal(rs, xp)
+
+ def test_to_csv_unicodewriter_quoting(self):
+ df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})
+
+ buf = StringIO()
+ df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC,
+ encoding='utf-8')
+
+ result = buf.getvalue()
+ expected_rows = ['"A","B"',
+ '1,"foo"',
+ '2,"bar"',
+ '3,"baz"']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert result == expected
+
+ def test_to_csv_quote_none(self):
+ # GH4328
+ df = DataFrame({'A': ['hello', '{"hello"}']})
+ for encoding in (None, 'utf-8'):
+ buf = StringIO()
+ df.to_csv(buf, quoting=csv.QUOTE_NONE,
+ encoding=encoding, index=False)
+
+ result = buf.getvalue()
+ expected_rows = ['A',
+ 'hello',
+ '{"hello"}']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert result == expected
+
+ def test_to_csv_index_no_leading_comma(self):
+ df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
+ index=['one', 'two', 'three'])
+
+ buf = StringIO()
+ df.to_csv(buf, index_label=False)
+
+ expected_rows = ['A,B',
+ 'one,1,4',
+ 'two,2,5',
+ 'three,3,6']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert buf.getvalue() == expected
+
+ def test_to_csv_line_terminators(self):
+ # see gh-20353
+ df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
+ index=['one', 'two', 'three'])
+
+ with ensure_clean() as path:
+ # case 1: CRLF as line terminator
+ df.to_csv(path, line_terminator='\r\n')
+ expected = b',A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n'
+
+ with open(path, mode='rb') as f:
+ assert f.read() == expected
+
+ with ensure_clean() as path:
+ # case 2: LF as line terminator
+ df.to_csv(path, line_terminator='\n')
+ expected = b',A,B\none,1,4\ntwo,2,5\nthree,3,6\n'
+
+ with open(path, mode='rb') as f:
+ assert f.read() == expected
+
+ with ensure_clean() as path:
+            # case 3: the default line terminator (os.linesep) (gh-21406)
+ df.to_csv(path)
+ os_linesep = os.linesep.encode('utf-8')
+ expected = (b',A,B' + os_linesep + b'one,1,4' + os_linesep +
+ b'two,2,5' + os_linesep + b'three,3,6' + os_linesep)
+
+ with open(path, mode='rb') as f:
+ assert f.read() == expected
+
+ def test_to_csv_from_csv_categorical(self):
+
+        # Writing a Series/DataFrame with categoricals should produce
+        # the same CSV output as the equivalent object-dtype data.
+ s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
+ s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
+ res = StringIO()
+
+ s.to_csv(res, header=False)
+ exp = StringIO()
+
+ s2.to_csv(exp, header=False)
+ assert res.getvalue() == exp.getvalue()
+
+ df = DataFrame({"s": s})
+ df2 = DataFrame({"s": s2})
+
+ res = StringIO()
+ df.to_csv(res)
+
+ exp = StringIO()
+ df2.to_csv(exp)
+
+ assert res.getvalue() == exp.getvalue()
+
+ def test_to_csv_path_is_none(self):
+ # GH 8215
+        # Make sure we return a string for consistency with
+        # Series.to_csv()
+ csv_str = self.frame.to_csv(path_or_buf=None)
+ assert isinstance(csv_str, str)
+ recons = pd.read_csv(StringIO(csv_str), index_col=0)
+ assert_frame_equal(self.frame, recons)
+
+ @pytest.mark.parametrize('df,encoding', [
+ (DataFrame([[0.123456, 0.234567, 0.567567],
+ [12.32112, 123123.2, 321321.2]],
+ index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
+ # GH 21241, 21118
+ (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
+ (DataFrame(5 * [[123, u"你好", u"世界"]],
+ columns=['X', 'Y', 'Z']), 'gb2312'),
+ (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
+ columns=['X', 'Y', 'Z']), 'cp737')
+ ])
+ def test_to_csv_compression(self, df, encoding, compression):
+
+ with ensure_clean() as filename:
+
+ df.to_csv(filename, compression=compression, encoding=encoding)
+ # test the round trip - to_csv -> read_csv
+ result = read_csv(filename, compression=compression,
+ index_col=0, encoding=encoding)
+ assert_frame_equal(df, result)
+
+ # test the round trip using file handle - to_csv -> read_csv
+ f, _handles = _get_handle(filename, 'w', compression=compression,
+ encoding=encoding)
+ with f:
+ df.to_csv(f, encoding=encoding)
+ result = pd.read_csv(filename, compression=compression,
+ encoding=encoding, index_col=0, squeeze=True)
+ assert_frame_equal(df, result)
+
+ # explicitly make sure file is compressed
+ with tm.decompress_file(filename, compression) as fh:
+ text = fh.read().decode(encoding or 'utf8')
+ for col in df.columns:
+ assert col in text
+
+ with tm.decompress_file(filename, compression) as fh:
+ assert_frame_equal(df, read_csv(fh,
+ index_col=0,
+ encoding=encoding))
+
+ def test_to_csv_date_format(self):
+ with ensure_clean('__tmp_to_csv_date_format__') as path:
+ dt_index = self.tsframe.index
+ datetime_frame = DataFrame(
+ {'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index)
+ datetime_frame.to_csv(path, date_format='%Y%m%d')
+
+ # Check that the data was put in the specified format
+ test = read_csv(path, index_col=0)
+
+ datetime_frame_int = datetime_frame.applymap(
+ lambda x: int(x.strftime('%Y%m%d')))
+ datetime_frame_int.index = datetime_frame_int.index.map(
+ lambda x: int(x.strftime('%Y%m%d')))
+
+ assert_frame_equal(test, datetime_frame_int)
+
+ datetime_frame.to_csv(path, date_format='%Y-%m-%d')
+
+ # Check that the data was put in the specified format
+ test = read_csv(path, index_col=0)
+ datetime_frame_str = datetime_frame.applymap(
+ lambda x: x.strftime('%Y-%m-%d'))
+ datetime_frame_str.index = datetime_frame_str.index.map(
+ lambda x: x.strftime('%Y-%m-%d'))
+
+ assert_frame_equal(test, datetime_frame_str)
+
+ # Check that columns get converted
+ datetime_frame_columns = datetime_frame.T
+ datetime_frame_columns.to_csv(path, date_format='%Y%m%d')
+
+ test = read_csv(path, index_col=0)
+
+ datetime_frame_columns = datetime_frame_columns.applymap(
+ lambda x: int(x.strftime('%Y%m%d')))
+ # Columns don't get converted to ints by read_csv
+ datetime_frame_columns.columns = (
+ datetime_frame_columns.columns
+ .map(lambda x: x.strftime('%Y%m%d')))
+
+ assert_frame_equal(test, datetime_frame_columns)
+
+ # test NaTs
+ nat_index = to_datetime(
+ ['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000'])
+ nat_frame = DataFrame({'A': nat_index}, index=nat_index)
+ nat_frame.to_csv(path, date_format='%Y-%m-%d')
+
+ test = read_csv(path, parse_dates=[0, 1], index_col=0)
+
+ assert_frame_equal(test, nat_frame)
+
+ def test_to_csv_with_dst_transitions(self):
+
+ with ensure_clean('csv_date_format_with_dst') as path:
+ # make sure we are not failing on transitions
+ times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00",
+ tz="Europe/London",
+ freq="H",
+ ambiguous='infer')
+
+ for i in [times, times + pd.Timedelta('10s')]:
+ time_range = np.array(range(len(i)), dtype='int64')
+ df = DataFrame({'A': time_range}, index=i)
+ df.to_csv(path, index=True)
+ # we have to reconvert the index as we
+ # don't parse the tz's
+ result = read_csv(path, index_col=0)
+ result.index = to_datetime(result.index, utc=True).tz_convert(
+ 'Europe/London')
+ assert_frame_equal(result, df)
+
+ # GH11619
+ idx = pd.date_range('2015-01-01', '2015-12-31',
+ freq='H', tz='Europe/Paris')
+ df = DataFrame({'values': 1, 'idx': idx},
+ index=idx)
+ with ensure_clean('csv_date_format_with_dst') as path:
+ df.to_csv(path, index=True)
+ result = read_csv(path, index_col=0)
+ result.index = to_datetime(result.index, utc=True).tz_convert(
+ 'Europe/Paris')
+ result['idx'] = to_datetime(result['idx'], utc=True).astype(
+ 'datetime64[ns, Europe/Paris]')
+ assert_frame_equal(result, df)
+
+            # check that astype(str) on tz-aware values does not raise
+            df.astype(str)
+
+ with ensure_clean('csv_date_format_with_dst') as path:
+ df.to_pickle(path)
+ result = pd.read_pickle(path)
+ assert_frame_equal(result, df)
+
+ def test_to_csv_quoting(self):
+ df = DataFrame({
+ 'c_bool': [True, False],
+ 'c_float': [1.0, 3.2],
+ 'c_int': [42, np.nan],
+ 'c_string': ['a', 'b,c'],
+ })
+
+ expected_rows = [',c_bool,c_float,c_int,c_string',
+ '0,True,1.0,42.0,a',
+ '1,False,3.2,,"b,c"']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
+ result = df.to_csv()
+ assert result == expected
+
+ result = df.to_csv(quoting=None)
+ assert result == expected
+
+ expected_rows = [',c_bool,c_float,c_int,c_string',
+ '0,True,1.0,42.0,a',
+ '1,False,3.2,,"b,c"']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
+ result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
+ assert result == expected
+
+ expected_rows = ['"","c_bool","c_float","c_int","c_string"',
+ '"0","True","1.0","42.0","a"',
+ '"1","False","3.2","","b,c"']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
+ result = df.to_csv(quoting=csv.QUOTE_ALL)
+ assert result == expected
+
+ # see gh-12922, gh-13259: make sure changes to
+ # the formatters do not break this behaviour
+ expected_rows = ['"","c_bool","c_float","c_int","c_string"',
+ '0,True,1.0,42.0,"a"',
+ '1,False,3.2,"","b,c"']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
+ assert result == expected
+
+ msg = "need to escape, but no escapechar set"
+ with pytest.raises(csv.Error, match=msg):
+ df.to_csv(quoting=csv.QUOTE_NONE)
+
+ with pytest.raises(csv.Error, match=msg):
+ df.to_csv(quoting=csv.QUOTE_NONE, escapechar=None)
+
+ expected_rows = [',c_bool,c_float,c_int,c_string',
+ '0,True,1.0,42.0,a',
+ '1,False,3.2,,b!,c']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ result = df.to_csv(quoting=csv.QUOTE_NONE,
+ escapechar='!')
+ assert result == expected
+
+ expected_rows = [',c_bool,c_ffloat,c_int,c_string',
+ '0,True,1.0,42.0,a',
+ '1,False,3.2,,bf,c']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ result = df.to_csv(quoting=csv.QUOTE_NONE,
+ escapechar='f')
+ assert result == expected
+
+        # see gh-3503: embedded Windows line terminators ("\r\n") must be
+        # quoted so the value roundtrips when an encoding is given
+ text_rows = ['a,b,c',
+ '1,"test \r\n",3']
+ text = tm.convert_rows_list_to_csv_str(text_rows)
+ df = pd.read_csv(StringIO(text))
+
+ buf = StringIO()
+ df.to_csv(buf, encoding='utf-8', index=False)
+ assert buf.getvalue() == text
+
+ # xref gh-7791: make sure the quoting parameter is passed through
+ # with multi-indexes
+ df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
+ df = df.set_index(['a', 'b'])
+
+ expected_rows = ['"a","b","c"',
+ '"1","3","5"',
+ '"2","4","6"']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
+
+ def test_period_index_date_overflow(self):
+ # see gh-15982
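+        # year 3005 lies beyond the datetime64[ns]/Timestamp bound (~2262),
+        # so formatting must not round-trip through Timestamp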
+
+ dates = ["1990-01-01", "2000-01-01", "3005-01-01"]
+ index = pd.PeriodIndex(dates, freq="D")
+
+ df = pd.DataFrame([4, 5, 6], index=index)
+ result = df.to_csv()
+
+ expected_rows = [',0',
+ '1990-01-01,4',
+ '2000-01-01,5',
+ '3005-01-01,6']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert result == expected
+
+ date_format = "%m-%d-%Y"
+ result = df.to_csv(date_format=date_format)
+
+ expected_rows = [',0',
+ '01-01-1990,4',
+ '01-01-2000,5',
+ '01-01-3005,6']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert result == expected
+
+ # Overflow with pd.NaT
+ dates = ["1990-01-01", pd.NaT, "3005-01-01"]
+ index = pd.PeriodIndex(dates, freq="D")
+
+ df = pd.DataFrame([4, 5, 6], index=index)
+ result = df.to_csv()
+
+ expected_rows = [',0',
+ '1990-01-01,4',
+ ',5',
+ '3005-01-01,6']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert result == expected
+
+ def test_multi_index_header(self):
+ # see gh-5539
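+        # a flat list passed as `header` replaces the MultiIndex labels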
+ columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2),
+ ("b", 1), ("b", 2)])
+ df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
+ df.columns = columns
+
+ header = ["a", "b", "c", "d"]
+ result = df.to_csv(header=header)
+
+ expected_rows = [',a,b,c,d',
+ '0,1,2,3,4',
+ '1,5,6,7,8']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert result == expected
+
+ def test_gz_lineend(self):
+ # GH 25311
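+        # writing through gzip must not translate '\n' into '\r\n'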
+ df = pd.DataFrame({'a': [1, 2]})
+ expected_rows = ['a', '1', '2']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ with ensure_clean('__test_gz_lineend.csv.gz') as path:
+ df.to_csv(path, index=False)
+ with tm.decompress_file(path, compression='gzip') as f:
+ result = f.read().decode('utf-8')
+
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/frame/test_validate.py b/contrib/python/pandas/py2/pandas/tests/frame/test_validate.py
new file mode 100644
index 00000000000..6513c332c67
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/frame/test_validate.py
@@ -0,0 +1,32 @@
+import pytest
+
+from pandas.core.frame import DataFrame
+
+
[email protected]
+def dataframe():
+ return DataFrame({'a': [1, 2], 'b': [3, 4]})
+
+
+class TestDataFrameValidate(object):
+ """Tests for error handling related to data types of method arguments."""
+
+ @pytest.mark.parametrize("func", ["query", "eval", "set_index",
+ "reset_index", "dropna",
+ "drop_duplicates", "sort_values"])
+ @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0])
+ def test_validate_bool_args(self, dataframe, func, inplace):
+ msg = "For argument \"inplace\" expected type bool"
+ kwargs = dict(inplace=inplace)
+
+ if func == "query":
+ kwargs["expr"] = "a > b"
+ elif func == "eval":
+ kwargs["expr"] = "a + b"
+ elif func == "set_index":
+ kwargs["keys"] = ["a"]
+ elif func == "sort_values":
+ kwargs["by"] = ["a"]
+
+ with pytest.raises(ValueError, match=msg):
+ getattr(dataframe, func)(**kwargs)
diff --git a/contrib/python/pandas/py2/pandas/tests/generic/__init__.py b/contrib/python/pandas/py2/pandas/tests/generic/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/generic/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/generic/test_frame.py b/contrib/python/pandas/py2/pandas/tests/generic/test_frame.py
new file mode 100644
index 00000000000..25440702a33
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/generic/test_frame.py
@@ -0,0 +1,271 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=E1101,W0612
+
+from copy import deepcopy
+from distutils.version import LooseVersion
+from operator import methodcaller
+
+import numpy as np
+import pytest
+
+from pandas.compat import range
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import DataFrame, MultiIndex, Series, date_range
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal)
+
+from .test_generic import Generic
+
+try:
+ import xarray
+ _XARRAY_INSTALLED = True
+except ImportError:
+ _XARRAY_INSTALLED = False
+
+
+class TestDataFrame(Generic):
+ _typ = DataFrame
+ _comparator = lambda self, x, y: assert_frame_equal(x, y)
+
+ def test_rename_mi(self):
+ df = DataFrame([
+ 11, 21, 31
+ ], index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]))
+ df.rename(str.lower)
+
+ def test_set_axis_name(self):
+ df = pd.DataFrame([[1, 2], [3, 4]])
+ funcs = ['_set_axis_name', 'rename_axis']
+ for func in funcs:
+ result = methodcaller(func, 'foo')(df)
+ assert df.index.name is None
+ assert result.index.name == 'foo'
+
+ result = methodcaller(func, 'cols', axis=1)(df)
+ assert df.columns.name is None
+ assert result.columns.name == 'cols'
+
+ def test_set_axis_name_mi(self):
+ df = DataFrame(
+ np.empty((3, 3)),
+ index=MultiIndex.from_tuples([("A", x) for x in list('aBc')]),
+ columns=MultiIndex.from_tuples([('C', x) for x in list('xyz')])
+ )
+
+ level_names = ['L1', 'L2']
+ funcs = ['_set_axis_name', 'rename_axis']
+ for func in funcs:
+ result = methodcaller(func, level_names)(df)
+ assert result.index.names == level_names
+ assert result.columns.names == [None, None]
+
+ result = methodcaller(func, level_names, axis=1)(df)
+ assert result.columns.names == ["L1", "L2"]
+ assert result.index.names == [None, None]
+
+ def test_nonzero_single_element(self):
+
+ # allow single item via bool method
+ df = DataFrame([[True]])
+ assert df.bool()
+
+ df = DataFrame([[False]])
+ assert not df.bool()
+
+ df = DataFrame([[False, False]])
+ with pytest.raises(ValueError):
+ df.bool()
+ with pytest.raises(ValueError):
+ bool(df)
+
+ def test_get_numeric_data_preserve_dtype(self):
+
+ # get the numeric data
+ o = DataFrame({'A': [1, '2', 3.]})
+ result = o._get_numeric_data()
+ expected = DataFrame(index=[0, 1, 2], dtype=object)
+ self._compare(result, expected)
+
+ def test_metadata_propagation_indiv(self):
+
+ # groupby
+ df = DataFrame(
+ {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
+ 'C': np.random.randn(8),
+ 'D': np.random.randn(8)})
+ result = df.groupby('A').sum()
+ self.check_metadata(df, result)
+
+ # resample
+ df = DataFrame(np.random.randn(1000, 2),
+ index=date_range('20130101', periods=1000, freq='s'))
+ result = df.resample('1T')
+ self.check_metadata(df, result)
+
+ # merging with override
+ # GH 6923
+ _metadata = DataFrame._metadata
+ _finalize = DataFrame.__finalize__
+
+ np.random.seed(10)
+ df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['a', 'b'])
+ df2 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['c', 'd'])
+ DataFrame._metadata = ['filename']
+ df1.filename = 'fname1.csv'
+ df2.filename = 'fname2.csv'
+
+ def finalize(self, other, method=None, **kwargs):
+
+ for name in self._metadata:
+ if method == 'merge':
+ left, right = other.left, other.right
+ value = getattr(left, name, '') + '|' + getattr(right,
+ name, '')
+ object.__setattr__(self, name, value)
+ else:
+ object.__setattr__(self, name, getattr(other, name, ''))
+
+ return self
+
+ DataFrame.__finalize__ = finalize
+ result = df1.merge(df2, left_on=['a'], right_on=['c'], how='inner')
+ assert result.filename == 'fname1.csv|fname2.csv'
+
+ # concat
+ # GH 6927
+ DataFrame._metadata = ['filename']
+ df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=list('ab'))
+ df1.filename = 'foo'
+
+ def finalize(self, other, method=None, **kwargs):
+ for name in self._metadata:
+ if method == 'concat':
+ value = '+'.join([getattr(
+ o, name) for o in other.objs if getattr(o, name, None)
+ ])
+ object.__setattr__(self, name, value)
+ else:
+ object.__setattr__(self, name, getattr(other, name, None))
+
+ return self
+
+ DataFrame.__finalize__ = finalize
+
+ result = pd.concat([df1, df1])
+ assert result.filename == 'foo+foo'
+
+ # reset
+ DataFrame._metadata = _metadata
+ DataFrame.__finalize__ = _finalize
+
+ def test_set_attribute(self):
+ # Test for consistent setattr behavior when an attribute and a column
+ # have the same name (Issue #8994)
+ df = DataFrame({'x': [1, 2, 3]})
+
+ df.y = 2
+ df['y'] = [2, 4, 6]
+ df.y = 5
+
+ assert df.y == 5
+ assert_series_equal(df['y'], Series([2, 4, 6], name='y'))
+
+ @pytest.mark.skipif(not _XARRAY_INSTALLED or _XARRAY_INSTALLED and
+ LooseVersion(xarray.__version__) <
+ LooseVersion('0.10.0'),
+ reason='xarray >= 0.10.0 required')
+ @pytest.mark.parametrize(
+ "index", ['FloatIndex', 'IntIndex',
+ 'StringIndex', 'UnicodeIndex',
+ 'DateIndex', 'PeriodIndex',
+ 'CategoricalIndex', 'TimedeltaIndex'])
+ def test_to_xarray_index_types(self, index):
+ from xarray import Dataset
+
+ index = getattr(tm, 'make{}'.format(index))
+ df = DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.Categorical(list('abc')),
+ 'g': pd.date_range('20130101', periods=3),
+ 'h': pd.date_range('20130101',
+ periods=3,
+ tz='US/Eastern')}
+ )
+
+ df.index = index(3)
+ df.index.name = 'foo'
+ df.columns.name = 'bar'
+ result = df.to_xarray()
+ assert result.dims['foo'] == 3
+ assert len(result.coords) == 1
+ assert len(result.data_vars) == 8
+ assert_almost_equal(list(result.coords.keys()), ['foo'])
+ assert isinstance(result, Dataset)
+
+ # idempotency
+ # categoricals are not preserved
+ # datetimes w/tz are not preserved
+ # column names are lost
+ expected = df.copy()
+ expected['f'] = expected['f'].astype(object)
+ expected['h'] = expected['h'].astype('datetime64[ns]')
+ expected.columns.name = None
+ assert_frame_equal(result.to_dataframe(), expected,
+ check_index_type=False, check_categorical=False)
+
+ @td.skip_if_no('xarray', min_version='0.7.0')
+ def test_to_xarray(self):
+ from xarray import Dataset
+
+ df = DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.Categorical(list('abc')),
+ 'g': pd.date_range('20130101', periods=3),
+ 'h': pd.date_range('20130101',
+ periods=3,
+ tz='US/Eastern')}
+ )
+
+ df.index.name = 'foo'
+ result = df[0:0].to_xarray()
+ assert result.dims['foo'] == 0
+ assert isinstance(result, Dataset)
+
+ # available in 0.7.1
+ # MultiIndex
+ df.index = pd.MultiIndex.from_product([['a'], range(3)],
+ names=['one', 'two'])
+ result = df.to_xarray()
+ assert result.dims['one'] == 1
+ assert result.dims['two'] == 3
+ assert len(result.coords) == 2
+ assert len(result.data_vars) == 8
+ assert_almost_equal(list(result.coords.keys()), ['one', 'two'])
+ assert isinstance(result, Dataset)
+
+ result = result.to_dataframe()
+ expected = df.copy()
+ expected['f'] = expected['f'].astype(object)
+ expected['h'] = expected['h'].astype('datetime64[ns]')
+ expected.columns.name = None
+ assert_frame_equal(result,
+ expected,
+ check_index_type=False)
+
+ def test_deepcopy_empty(self):
+ # This test covers empty frame copying with non-empty column sets
+ # as reported in issue GH15370
+ empty_frame = DataFrame(data=[], index=[], columns=['A'])
+ empty_frame_copy = deepcopy(empty_frame)
+
+ self._compare(empty_frame_copy, empty_frame)
diff --git a/contrib/python/pandas/py2/pandas/tests/generic/test_generic.py b/contrib/python/pandas/py2/pandas/tests/generic/test_generic.py
new file mode 100644
index 00000000000..7183fea85a0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/generic/test_generic.py
@@ -0,0 +1,1029 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=E1101,W0612
+
+from copy import copy, deepcopy
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY3, range, zip
+
+from pandas.core.dtypes.common import is_scalar
+
+import pandas as pd
+from pandas import DataFrame, MultiIndex, Panel, Series, date_range
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_frame_equal, assert_panel_equal, assert_series_equal)
+
+import pandas.io.formats.printing as printing
+
+# ----------------------------------------------------------------------
+# Generic types test cases
+
+
+class Generic(object):
+
+ @property
+ def _ndim(self):
+ return self._typ._AXIS_LEN
+
+ def _axes(self):
+ """ return the axes for my object typ """
+ return self._typ._AXIS_ORDERS
+
+ def _construct(self, shape, value=None, dtype=None, **kwargs):
+ """ construct an object for the given shape
+ if value is specified use that if its a scalar
+ if value is an array, repeat it as needed """
+
+ if isinstance(shape, int):
+ shape = tuple([shape] * self._ndim)
+ if value is not None:
+ if is_scalar(value):
+ if value == 'empty':
+ arr = None
+
+ # remove the info axis
+ kwargs.pop(self._typ._info_axis_name, None)
+ else:
+ arr = np.empty(shape, dtype=dtype)
+ arr.fill(value)
+ else:
+ fshape = np.prod(shape)
+ arr = value.ravel()
+ new_shape = fshape / arr.shape[0]
+ if fshape % arr.shape[0] != 0:
+ raise Exception("invalid value passed in _construct")
+
+ arr = np.repeat(arr, new_shape).reshape(shape)
+ else:
+ arr = np.random.randn(*shape)
+ return self._typ(arr, dtype=dtype, **kwargs)
+
+ def _compare(self, result, expected):
+ self._comparator(result, expected)
+
+ def test_rename(self):
+
+ # single axis
+ idx = list('ABCD')
+ # relabeling values passed into self.rename
+ args = [
+ str.lower,
+ {x: x.lower() for x in idx},
+ Series({x: x.lower() for x in idx}),
+ ]
+
+ for axis in self._axes():
+ kwargs = {axis: idx}
+ obj = self._construct(4, **kwargs)
+
+ for arg in args:
+ # rename a single axis
+ result = obj.rename(**{axis: arg})
+ expected = obj.copy()
+ setattr(expected, axis, list('abcd'))
+ self._compare(result, expected)
+
+ # multiple axes at once
+
+ def test_get_numeric_data(self):
+
+ n = 4
+ kwargs = {self._typ._AXIS_NAMES[i]: list(range(n))
+ for i in range(self._ndim)}
+
+ # get the numeric data
+ o = self._construct(n, **kwargs)
+ result = o._get_numeric_data()
+ self._compare(result, o)
+
+ # non-inclusion
+ result = o._get_bool_data()
+ expected = self._construct(n, value='empty', **kwargs)
+ self._compare(result, expected)
+
+ # get the bool data
+ arr = np.array([True, True, False, True])
+ o = self._construct(n, value=arr, **kwargs)
+ result = o._get_numeric_data()
+ self._compare(result, o)
+
+        # _get_numeric_data includes _get_bool_data, so can't test for
+ # non-inclusion
+
+ def test_get_default(self):
+
+ # GH 7725
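+        # Series.get returns the stored value for labels that are present
+        # and the supplied default only for missing labels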
+ d0 = "a", "b", "c", "d"
+ d1 = np.arange(4, dtype='int64')
+ others = "e", 10
+
+ for data, index in ((d0, d1), (d1, d0)):
+ s = Series(data, index=index)
+ for i, d in zip(index, data):
+ assert s.get(i) == d
+ assert s.get(i, d) == d
+ assert s.get(i, "z") == d
+ for other in others:
+ assert s.get(other, "z") == "z"
+ assert s.get(other, other) == other
+
+ def test_nonzero(self):
+
+ # GH 4633
+ # look at the boolean/nonzero behavior for objects
+ obj = self._construct(shape=4)
+ pytest.raises(ValueError, lambda: bool(obj == 0))
+ pytest.raises(ValueError, lambda: bool(obj == 1))
+ pytest.raises(ValueError, lambda: bool(obj))
+
+ obj = self._construct(shape=4, value=1)
+ pytest.raises(ValueError, lambda: bool(obj == 0))
+ pytest.raises(ValueError, lambda: bool(obj == 1))
+ pytest.raises(ValueError, lambda: bool(obj))
+
+ obj = self._construct(shape=4, value=np.nan)
+ pytest.raises(ValueError, lambda: bool(obj == 0))
+ pytest.raises(ValueError, lambda: bool(obj == 1))
+ pytest.raises(ValueError, lambda: bool(obj))
+
+ # empty
+ obj = self._construct(shape=0)
+ pytest.raises(ValueError, lambda: bool(obj))
+
+ # invalid behaviors
+
+ obj1 = self._construct(shape=4, value=1)
+ obj2 = self._construct(shape=4, value=1)
+
+ def f():
+ if obj1:
+ printing.pprint_thing("this works and shouldn't")
+
+ pytest.raises(ValueError, f)
+ pytest.raises(ValueError, lambda: obj1 and obj2)
+ pytest.raises(ValueError, lambda: obj1 or obj2)
+ pytest.raises(ValueError, lambda: not obj1)
+
+ def test_downcast(self):
+ # test close downcasting
+
+ o = self._construct(shape=4, value=9, dtype=np.int64)
+ result = o.copy()
+ result._data = o._data.downcast(dtypes='infer')
+ self._compare(result, o)
+
+ o = self._construct(shape=4, value=9.)
+ expected = o.astype(np.int64)
+ result = o.copy()
+ result._data = o._data.downcast(dtypes='infer')
+ self._compare(result, expected)
+
+ o = self._construct(shape=4, value=9.5)
+ result = o.copy()
+ result._data = o._data.downcast(dtypes='infer')
+ self._compare(result, o)
+
+        # values within rounding tolerance of an integer are downcast
+ o = self._construct(shape=4, value=9.000000000005)
+ result = o.copy()
+ result._data = o._data.downcast(dtypes='infer')
+ expected = o.astype(np.int64)
+ self._compare(result, expected)
+
+ def test_constructor_compound_dtypes(self):
+ # see gh-5191
+ # Compound dtypes should raise NotImplementedError.
+
+ def f(dtype):
+ return self._construct(shape=3, value=1, dtype=dtype)
+
+ pytest.raises(NotImplementedError, f, [("A", "datetime64[h]"),
+ ("B", "str"),
+ ("C", "int32")])
+
+ # these work (though results may be unexpected)
+ f('int64')
+ f('float64')
+ f('M8[ns]')
+
+ def check_metadata(self, x, y=None):
+ for m in x._metadata:
+ v = getattr(x, m, None)
+ if y is None:
+ assert v is None
+ else:
+ assert v == getattr(y, m, None)
+
+ def test_metadata_propagation(self):
+ # check that the metadata matches up on the resulting ops
+
+ o = self._construct(shape=3)
+ o.name = 'foo'
+ o2 = self._construct(shape=3)
+ o2.name = 'bar'
+
+        # TODO
+        # Once Panel can do non-trivial combine operations
+        # (currently there is a raise in the Panel arith_ops to prevent
+        # this, though it actually does work)
+        # we can remove all of these try/except blocks on the actual operations
+
+ # ----------
+ # preserving
+ # ----------
+
+ # simple ops with scalars
+ for op in ['__add__', '__sub__', '__truediv__', '__mul__']:
+ result = getattr(o, op)(1)
+ self.check_metadata(o, result)
+
+ # ops with like
+ for op in ['__add__', '__sub__', '__truediv__', '__mul__']:
+ try:
+ result = getattr(o, op)(o)
+ self.check_metadata(o, result)
+ except (ValueError, AttributeError):
+ pass
+
+ # simple boolean
+ for op in ['__eq__', '__le__', '__ge__']:
+ v1 = getattr(o, op)(o)
+ self.check_metadata(o, v1)
+
+ try:
+ self.check_metadata(o, v1 & v1)
+            except ValueError:
+ pass
+
+ try:
+ self.check_metadata(o, v1 | v1)
+            except ValueError:
+ pass
+
+ # combine_first
+ try:
+ result = o.combine_first(o2)
+ self.check_metadata(o, result)
+        except AttributeError:
+ pass
+
+ # ---------------------------
+ # non-preserving (by default)
+ # ---------------------------
+
+ # add non-like
+ try:
+ result = o + o2
+ self.check_metadata(result)
+ except (ValueError, AttributeError):
+ pass
+
+ # simple boolean
+ for op in ['__eq__', '__le__', '__ge__']:
+
+ # this is a name matching op
+ v1 = getattr(o, op)(o)
+
+ v2 = getattr(o, op)(o2)
+ self.check_metadata(v2)
+
+ try:
+ self.check_metadata(v1 & v2)
+            except ValueError:
+ pass
+
+ try:
+ self.check_metadata(v1 | v2)
+            except ValueError:
+ pass
+
+ def test_head_tail(self):
+ # GH5370
+
+ o = self._construct(shape=10)
+
+ # check all index types
+ for index in [tm.makeFloatIndex, tm.makeIntIndex, tm.makeStringIndex,
+ tm.makeUnicodeIndex, tm.makeDateIndex,
+ tm.makePeriodIndex]:
+ axis = o._get_axis_name(0)
+ setattr(o, axis, index(len(getattr(o, axis))))
+
+            # Panel and higher-dim objects may not implement head/tail
+            try:
+                o.head()
+            except NotImplementedError:
+ pytest.skip('not implemented on {0}'.format(
+ o.__class__.__name__))
+
+ self._compare(o.head(), o.iloc[:5])
+ self._compare(o.tail(), o.iloc[-5:])
+
+ # 0-len
+ self._compare(o.head(0), o.iloc[0:0])
+ self._compare(o.tail(0), o.iloc[0:0])
+
+ # bounded
+ self._compare(o.head(len(o) + 1), o)
+ self._compare(o.tail(len(o) + 1), o)
+
+            # negative n: all but the last/first |n| rows
+ self._compare(o.head(-3), o.head(7))
+ self._compare(o.tail(-3), o.tail(7))
+
+ def test_sample(self):
+ # Fixes issue: 2419
+
+ o = self._construct(shape=10)
+
+ ###
+ # Check behavior of random_state argument
+ ###
+
+        # Check for stability when receiving a seed or random state -- run 10
+        # times.
+ for test in range(10):
+ seed = np.random.randint(0, 100)
+ self._compare(
+ o.sample(n=4, random_state=seed), o.sample(n=4,
+ random_state=seed))
+ self._compare(
+ o.sample(frac=0.7, random_state=seed), o.sample(
+ frac=0.7, random_state=seed))
+
+ self._compare(
+ o.sample(n=4, random_state=np.random.RandomState(test)),
+ o.sample(n=4, random_state=np.random.RandomState(test)))
+
+ self._compare(
+ o.sample(frac=0.7, random_state=np.random.RandomState(test)),
+ o.sample(frac=0.7, random_state=np.random.RandomState(test)))
+
+ os1, os2 = [], []
+ for _ in range(2):
+ np.random.seed(test)
+ os1.append(o.sample(n=4))
+ os2.append(o.sample(frac=0.7))
+ self._compare(*os1)
+ self._compare(*os2)
+
+ # Check for error when random_state argument invalid.
+ with pytest.raises(ValueError):
+ o.sample(random_state='astring!')
+
+ ###
+        # Check behavior of `frac` and `n`
+        ###
+
+        # Giving both frac and n raises an error
+ with pytest.raises(ValueError):
+ o.sample(n=3, frac=0.3)
+
+ # Check that raises right error for negative lengths
+ with pytest.raises(ValueError):
+ o.sample(n=-3)
+ with pytest.raises(ValueError):
+ o.sample(frac=-0.3)
+
+ # Make sure float values of `n` give error
+ with pytest.raises(ValueError):
+ o.sample(n=3.2)
+
+ # Check lengths are right
+        assert len(o.sample(n=4)) == 4
+        assert len(o.sample(frac=0.34)) == 3
+        assert len(o.sample(frac=0.36)) == 4
+
+ ###
+ # Check weights
+ ###
+
+ # Weight length must be right
+ with pytest.raises(ValueError):
+ o.sample(n=3, weights=[0, 1])
+
+ with pytest.raises(ValueError):
+ bad_weights = [0.5] * 11
+ o.sample(n=3, weights=bad_weights)
+
+ with pytest.raises(ValueError):
+ bad_weight_series = Series([0, 0, 0.2])
+ o.sample(n=4, weights=bad_weight_series)
+
+ # Check won't accept negative weights
+ with pytest.raises(ValueError):
+ bad_weights = [-0.1] * 10
+ o.sample(n=3, weights=bad_weights)
+
+ # Check inf and -inf throw errors:
+ with pytest.raises(ValueError):
+ weights_with_inf = [0.1] * 10
+ weights_with_inf[0] = np.inf
+ o.sample(n=3, weights=weights_with_inf)
+
+ with pytest.raises(ValueError):
+ weights_with_ninf = [0.1] * 10
+ weights_with_ninf[0] = -np.inf
+ o.sample(n=3, weights=weights_with_ninf)
+
+ # All zeros raises errors
+ zero_weights = [0] * 10
+ with pytest.raises(ValueError):
+ o.sample(n=3, weights=zero_weights)
+
+ # All missing weights
+ nan_weights = [np.nan] * 10
+ with pytest.raises(ValueError):
+ o.sample(n=3, weights=nan_weights)
+
+ # Check np.nan are replaced by zeros.
+ weights_with_nan = [np.nan] * 10
+ weights_with_nan[5] = 0.5
+ self._compare(
+ o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6])
+
+        # Check that None weights are also replaced by zeros.
+ weights_with_None = [None] * 10
+ weights_with_None[5] = 0.5
+ self._compare(
+ o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6])
+
+ def test_size_compat(self):
+ # GH8846
+ # size property should be defined
+
+ o = self._construct(shape=10)
+ assert o.size == np.prod(o.shape)
+ assert o.size == 10 ** len(o.axes)
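+        # (each axis has length 10 here, hence size == 10 ** len(o.axes))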
+
+ def test_split_compat(self):
+ # xref GH8846
+ o = self._construct(shape=10)
+ assert len(np.array_split(o, 5)) == 5
+ assert len(np.array_split(o, 2)) == 2
+
+ def test_unexpected_keyword(self): # GH8597
+ df = DataFrame(np.random.randn(5, 2), columns=['jim', 'joe'])
+ ca = pd.Categorical([0, 0, 2, 2, 3, np.nan])
+ ts = df['joe'].copy()
+ ts[2] = np.nan
+
+ with pytest.raises(TypeError, match='unexpected keyword'):
+ df.drop('joe', axis=1, in_place=True)
+
+ with pytest.raises(TypeError, match='unexpected keyword'):
+ df.reindex([1, 0], inplace=True)
+
+ with pytest.raises(TypeError, match='unexpected keyword'):
+ ca.fillna(0, inplace=True)
+
+ with pytest.raises(TypeError, match='unexpected keyword'):
+ ts.fillna(0, in_place=True)
+
+ # See gh-12301
+ def test_stat_unexpected_keyword(self):
+ obj = self._construct(5)
+ starwars = 'Star Wars'
+ errmsg = 'unexpected keyword'
+
+ with pytest.raises(TypeError, match=errmsg):
+ obj.max(epic=starwars) # stat_function
+ with pytest.raises(TypeError, match=errmsg):
+ obj.var(epic=starwars) # stat_function_ddof
+ with pytest.raises(TypeError, match=errmsg):
+ obj.sum(epic=starwars) # cum_function
+ with pytest.raises(TypeError, match=errmsg):
+ obj.any(epic=starwars) # logical_function
+
+ def test_api_compat(self):
+
+ # GH 12021
+ # compat for __name__, __qualname__
+
+ obj = self._construct(5)
+ for func in ['sum', 'cumsum', 'any', 'var']:
+ f = getattr(obj, func)
+ assert f.__name__ == func
+ if PY3:
+ assert f.__qualname__.endswith(func)
+
+ def test_stat_non_defaults_args(self):
+ obj = self._construct(5)
+ out = np.array([0])
+ errmsg = "the 'out' parameter is not supported"
+
+ with pytest.raises(ValueError, match=errmsg):
+ obj.max(out=out) # stat_function
+ with pytest.raises(ValueError, match=errmsg):
+ obj.var(out=out) # stat_function_ddof
+ with pytest.raises(ValueError, match=errmsg):
+ obj.sum(out=out) # cum_function
+ with pytest.raises(ValueError, match=errmsg):
+ obj.any(out=out) # logical_function
+
+ def test_truncate_out_of_bounds(self):
+ # GH11382
+
+ # small
+ shape = [int(2e3)] + ([1] * (self._ndim - 1))
+ small = self._construct(shape, dtype='int8', value=1)
+ self._compare(small.truncate(), small)
+ self._compare(small.truncate(before=0, after=3e3), small)
+ self._compare(small.truncate(before=-1, after=2e3), small)
+
+ # big
+ shape = [int(2e6)] + ([1] * (self._ndim - 1))
+ big = self._construct(shape, dtype='int8', value=1)
+ self._compare(big.truncate(), big)
+ self._compare(big.truncate(before=0, after=3e6), big)
+ self._compare(big.truncate(before=-1, after=2e6), big)
+
+ def test_validate_bool_args(self):
+ df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
+ invalid_values = [1, "True", [1, 2, 3], 5.0]
+
+ for value in invalid_values:
+ with pytest.raises(ValueError):
+ super(DataFrame, df).rename_axis(mapper={'a': 'x', 'b': 'y'},
+ axis=1, inplace=value)
+
+ with pytest.raises(ValueError):
+ super(DataFrame, df).drop('a', axis=1, inplace=value)
+
+ with pytest.raises(ValueError):
+ super(DataFrame, df).sort_index(inplace=value)
+
+ with pytest.raises(ValueError):
+ super(DataFrame, df)._consolidate(inplace=value)
+
+ with pytest.raises(ValueError):
+ super(DataFrame, df).fillna(value=0, inplace=value)
+
+ with pytest.raises(ValueError):
+ super(DataFrame, df).replace(to_replace=1, value=7,
+ inplace=value)
+
+ with pytest.raises(ValueError):
+ super(DataFrame, df).interpolate(inplace=value)
+
+ with pytest.raises(ValueError):
+ super(DataFrame, df)._where(cond=df.a > 2, inplace=value)
+
+ with pytest.raises(ValueError):
+ super(DataFrame, df).mask(cond=df.a > 2, inplace=value)
+
+ def test_copy_and_deepcopy(self):
+ # GH 15444
+ for shape in [0, 1, 2]:
+ obj = self._construct(shape)
+ for func in [copy,
+ deepcopy,
+ lambda x: x.copy(deep=False),
+ lambda x: x.copy(deep=True)]:
+ obj_copy = func(obj)
+ assert obj_copy is not obj
+ self._compare(obj_copy, obj)
+
+ @pytest.mark.parametrize("periods,fill_method,limit,exp", [
+ (1, "ffill", None, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, 0]),
+ (1, "ffill", 1, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, np.nan]),
+ (1, "bfill", None, [np.nan, 0, 0, 1, 1, 1.5, np.nan, np.nan]),
+ (1, "bfill", 1, [np.nan, np.nan, 0, 1, 1, 1.5, np.nan, np.nan]),
+ (-1, "ffill", None, [np.nan, np.nan, -.5, -.5, -.6, 0, 0, np.nan]),
+ (-1, "ffill", 1, [np.nan, np.nan, -.5, -.5, -.6, 0, np.nan, np.nan]),
+ (-1, "bfill", None, [0, 0, -.5, -.5, -.6, np.nan, np.nan, np.nan]),
+ (-1, "bfill", 1, [np.nan, 0, -.5, -.5, -.6, np.nan, np.nan, np.nan])
+ ])
+ def test_pct_change(self, periods, fill_method, limit, exp):
+ vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan]
+ obj = self._typ(vals)
+ func = getattr(obj, 'pct_change')
+ res = func(periods=periods, fill_method=fill_method, limit=limit)
+ if type(obj) is DataFrame:
+ tm.assert_frame_equal(res, DataFrame(exp))
+ else:
+ tm.assert_series_equal(res, Series(exp))
+
+
+class TestNDFrame(object):
+ # tests that don't fit elsewhere
+
+    def test_sample(self):
+ # Fixes issue: 2419
+ # additional specific object based tests
+
+        # A few DataFrame tests with degenerate weights.
+ easy_weight_list = [0] * 10
+ easy_weight_list[5] = 1
+
+ df = pd.DataFrame({'col1': range(10, 20),
+ 'col2': range(20, 30),
+ 'colString': ['a'] * 10,
+ 'easyweights': easy_weight_list})
+ sample1 = df.sample(n=1, weights='easyweights')
+ assert_frame_equal(sample1, df.iloc[5:6])
+
+        # Ensure a proper error if a string is given as weights for a
+        # Series, a Panel, or a DataFrame with axis=1.
+ s = Series(range(10))
+ with pytest.raises(ValueError):
+ s.sample(n=3, weights='weight_column')
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ panel = Panel(items=[0, 1, 2], major_axis=[2, 3, 4],
+ minor_axis=[3, 4, 5])
+ with pytest.raises(ValueError):
+ panel.sample(n=1, weights='weight_column')
+
+ with pytest.raises(ValueError):
+ df.sample(n=1, weights='weight_column', axis=1)
+
+        # Check that an unknown weights column name raises KeyError
+ with pytest.raises(KeyError):
+ df.sample(n=3, weights='not_a_real_column_name')
+
+        # Check that weights that don't sum to one are re-normalized.
+ weights_less_than_1 = [0] * 10
+ weights_less_than_1[0] = 0.5
+ tm.assert_frame_equal(
+ df.sample(n=1, weights=weights_less_than_1), df.iloc[:1])
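+        # (weights need only be non-negative and finite; .sample
+        # normalizes them to sum to 1, so [0.5, 0, ..., 0] picks row 0
+        # with probability 1)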
+
+        ###
+        # Test axis argument
+        ###
+
+        df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10})
+        second_column_weight = [0, 1]
+        assert_frame_equal(
+            df.sample(n=1, axis=1, weights=second_column_weight),
+            df[['col2']])
+
+        # Different axis arg types
+        assert_frame_equal(df.sample(n=1, axis='columns',
+                                     weights=second_column_weight),
+                           df[['col2']])
+
+        weight = [0] * 10
+        weight[5] = 0.5
+        assert_frame_equal(df.sample(n=1, axis='rows', weights=weight),
+                           df.iloc[5:6])
+        assert_frame_equal(df.sample(n=1, axis='index', weights=weight),
+                           df.iloc[5:6])
+
+        # Check out of range axis values
+        with pytest.raises(ValueError):
+            df.sample(n=1, axis=2)
+
+        with pytest.raises(ValueError):
+            df.sample(n=1, axis='not_a_name')
+
+        with pytest.raises(ValueError):
+            s = pd.Series(range(10))
+            s.sample(n=1, axis=1)
+
+        # Test that weight length is validated against the correct axis
+        with pytest.raises(ValueError):
+            df.sample(n=1, axis=1, weights=[0.5] * 10)
+
+        # Check weights with axis = 1
+        easy_weight_list = [0] * 3
+        easy_weight_list[2] = 1
+
+        df = pd.DataFrame({'col1': range(10, 20),
+                           'col2': range(20, 30),
+                           'colString': ['a'] * 10})
+        sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
+        assert_frame_equal(sample1, df[['colString']])
+
+        # Test default axes
+        with catch_warnings(record=True):
+            simplefilter("ignore", FutureWarning)
+            p = Panel(items=['a', 'b', 'c'], major_axis=[2, 4, 6],
+                      minor_axis=[1, 3, 5])
+            assert_panel_equal(
+                p.sample(n=3, random_state=42),
+                p.sample(n=3, axis=1, random_state=42))
+            assert_frame_equal(
+                df.sample(n=3, random_state=42),
+                df.sample(n=3, axis=0, random_state=42))
+
+        # Test that function aligns weights with frame
+        df = DataFrame(
+            {'col1': [5, 6, 7],
+             'col2': ['a', 'b', 'c'], }, index=[9, 5, 3])
+        s = Series([1, 0, 0], index=[3, 5, 9])
+        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s))
+
+        # Weights have index values that are dropped because they are
+        # not in the sampled DataFrame
+        s2 = Series([0.001, 0, 10000], index=[3, 5, 10])
+        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2))
+
+        # Weights have missing values that are filled with zeros
+        s3 = Series([0.01, 0], index=[3, 5])
+        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3))
+
+        # No overlap in weight and sampled DataFrame indices
+        s4 = Series([1, 0], index=[1, 2])
+        with pytest.raises(ValueError):
+            df.sample(1, weights=s4)
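+        # (weights passed as a Series are aligned to the sampled axis by
+        # index label: extra labels are dropped, absent labels get zero
+        # weight, and a weights index with no overlap at all raises)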
+
+ def test_squeeze(self):
+ # noop
+ for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
+ tm.makeObjectSeries()]:
+ tm.assert_series_equal(s.squeeze(), s)
+ for df in [tm.makeTimeDataFrame()]:
+ tm.assert_frame_equal(df.squeeze(), df)
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ for p in [tm.makePanel()]:
+ tm.assert_panel_equal(p.squeeze(), p)
+
+ # squeezing
+ df = tm.makeTimeDataFrame().reindex(columns=['A'])
+ tm.assert_series_equal(df.squeeze(), df['A'])
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ p = tm.makePanel().reindex(items=['ItemA'])
+ tm.assert_frame_equal(p.squeeze(), p['ItemA'])
+
+ p = tm.makePanel().reindex(items=['ItemA'], minor_axis=['A'])
+ tm.assert_series_equal(p.squeeze(), p.loc['ItemA', :, 'A'])
+
+ # don't fail with 0 length dimensions GH11229 & GH8999
+ empty_series = Series([], name='five')
+ empty_frame = DataFrame([empty_series])
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ empty_panel = Panel({'six': empty_frame})
+
+        for higher_dim in [empty_series, empty_frame, empty_panel]:
+            tm.assert_series_equal(empty_series, higher_dim.squeeze())
+
+ # axis argument
+ df = tm.makeTimeDataFrame(nper=1).iloc[:, :1]
+ assert df.shape == (1, 1)
+ tm.assert_series_equal(df.squeeze(axis=0), df.iloc[0])
+ tm.assert_series_equal(df.squeeze(axis='index'), df.iloc[0])
+ tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0])
+ tm.assert_series_equal(df.squeeze(axis='columns'), df.iloc[:, 0])
+ assert df.squeeze() == df.iloc[0, 0]
+ pytest.raises(ValueError, df.squeeze, axis=2)
+ pytest.raises(ValueError, df.squeeze, axis='x')
+
+ df = tm.makeTimeDataFrame(3)
+ tm.assert_frame_equal(df.squeeze(axis=0), df)
+
+ def test_numpy_squeeze(self):
+ s = tm.makeFloatSeries()
+ tm.assert_series_equal(np.squeeze(s), s)
+
+ df = tm.makeTimeDataFrame().reindex(columns=['A'])
+ tm.assert_series_equal(np.squeeze(df), df['A'])
+
+ def test_transpose(self):
+ msg = (r"transpose\(\) got multiple values for "
+ r"keyword argument 'axes'")
+ for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
+ tm.makeObjectSeries()]:
+ # calls implementation in pandas/core/base.py
+ tm.assert_series_equal(s.transpose(), s)
+ for df in [tm.makeTimeDataFrame()]:
+ tm.assert_frame_equal(df.transpose().transpose(), df)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ for p in [tm.makePanel()]:
+ tm.assert_panel_equal(p.transpose(2, 0, 1)
+ .transpose(1, 2, 0), p)
+ with pytest.raises(TypeError, match=msg):
+ p.transpose(2, 0, 1, axes=(2, 0, 1))
+
+ def test_numpy_transpose(self):
+ msg = "the 'axes' parameter is not supported"
+
+ s = tm.makeFloatSeries()
+ tm.assert_series_equal(np.transpose(s), s)
+
+ with pytest.raises(ValueError, match=msg):
+ np.transpose(s, axes=1)
+
+ df = tm.makeTimeDataFrame()
+ tm.assert_frame_equal(np.transpose(np.transpose(df)), df)
+
+ with pytest.raises(ValueError, match=msg):
+ np.transpose(df, axes=1)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ p = tm.makePanel()
+ tm.assert_panel_equal(np.transpose(
+ np.transpose(p, axes=(2, 0, 1)),
+ axes=(1, 2, 0)), p)
+
+ def test_take(self):
+ indices = [1, 5, -2, 6, 3, -1]
+ for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
+ tm.makeObjectSeries()]:
+ out = s.take(indices)
+ expected = Series(data=s.values.take(indices),
+ index=s.index.take(indices), dtype=s.dtype)
+ tm.assert_series_equal(out, expected)
+ for df in [tm.makeTimeDataFrame()]:
+ out = df.take(indices)
+ expected = DataFrame(data=df.values.take(indices, axis=0),
+ index=df.index.take(indices),
+ columns=df.columns)
+ tm.assert_frame_equal(out, expected)
+
+ indices = [-3, 2, 0, 1]
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ for p in [tm.makePanel()]:
+ out = p.take(indices)
+ expected = Panel(data=p.values.take(indices, axis=0),
+ items=p.items.take(indices),
+ major_axis=p.major_axis,
+ minor_axis=p.minor_axis)
+ tm.assert_panel_equal(out, expected)
+
+ def test_take_invalid_kwargs(self):
+ indices = [-3, 2, 0, 1]
+ s = tm.makeFloatSeries()
+ df = tm.makeTimeDataFrame()
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ p = tm.makePanel()
+
+ for obj in (s, df, p):
+ msg = r"take\(\) got an unexpected keyword argument 'foo'"
+ with pytest.raises(TypeError, match=msg):
+ obj.take(indices, foo=2)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ obj.take(indices, out=indices)
+
+ msg = "the 'mode' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ obj.take(indices, mode='clip')
+
+ def test_equals(self):
+ s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
+ s2 = s1.copy()
+ assert s1.equals(s2)
+
+ s1[1] = 99
+ assert not s1.equals(s2)
+
+ # NaNs compare as equal
+ s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
+ s2 = s1.copy()
+ assert s1.equals(s2)
+
+ s2[0] = 9.9
+ assert not s1.equals(s2)
+
+ idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
+ s1 = Series([1, 2, np.nan], index=idx)
+ s2 = s1.copy()
+ assert s1.equals(s2)
+
+ # Add object dtype column with nans
+ index = np.random.random(10)
+ df1 = DataFrame(
+            np.random.random(10), index=index, columns=['floats'])
+        df1['text'] = ('the sky is so blue. '
+                       'we could use more chocolate.').split()
+ df1['start'] = date_range('2000-1-1', periods=10, freq='T')
+ df1['end'] = date_range('2000-1-1', periods=10, freq='D')
+ df1['diff'] = df1['end'] - df1['start']
+ df1['bool'] = (np.arange(10) % 3 == 0)
+ df1.loc[::2] = np.nan
+ df2 = df1.copy()
+ assert df1['text'].equals(df2['text'])
+ assert df1['start'].equals(df2['start'])
+ assert df1['end'].equals(df2['end'])
+ assert df1['diff'].equals(df2['diff'])
+ assert df1['bool'].equals(df2['bool'])
+ assert df1.equals(df2)
+ assert not df1.equals(object)
+
+ # different dtype
+ different = df1.copy()
+ different['floats'] = different['floats'].astype('float32')
+ assert not df1.equals(different)
+
+ # different index
+ different_index = -index
+ different = df2.set_index(different_index)
+ assert not df1.equals(different)
+
+ # different columns
+ different = df2.copy()
+ different.columns = df2.columns[::-1]
+ assert not df1.equals(different)
+
+ # DatetimeIndex
+ index = pd.date_range('2000-1-1', periods=10, freq='T')
+ df1 = df1.set_index(index)
+ df2 = df1.copy()
+ assert df1.equals(df2)
+
+ # MultiIndex
+ df3 = df1.set_index(['text'], append=True)
+ df2 = df1.set_index(['text'], append=True)
+ assert df3.equals(df2)
+
+ df2 = df1.set_index(['floats'], append=True)
+ assert not df3.equals(df2)
+
+ # NaN in index
+ df3 = df1.set_index(['floats'], append=True)
+ df2 = df1.set_index(['floats'], append=True)
+ assert df3.equals(df2)
+
+ # GH 8437
+ a = pd.Series([False, np.nan])
+ b = pd.Series([False, np.nan])
+ c = pd.Series(index=range(2))
+ d = pd.Series(index=range(2))
+ e = pd.Series(index=range(2))
+ f = pd.Series(index=range(2))
+ c[:-1] = d[:-1] = e[0] = f[0] = False
+ assert a.equals(a)
+ assert a.equals(b)
+ assert a.equals(c)
+ assert a.equals(d)
+ assert a.equals(e)
+ assert e.equals(f)
+
+ def test_describe_raises(self):
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ with pytest.raises(NotImplementedError):
+ tm.makePanel().describe()
+
+ def test_pipe(self):
+ df = DataFrame({'A': [1, 2, 3]})
+ f = lambda x, y: x ** y
+ result = df.pipe(f, 2)
+ expected = DataFrame({'A': [1, 4, 9]})
+ assert_frame_equal(result, expected)
+
+ result = df.A.pipe(f, 2)
+ assert_series_equal(result, expected.A)
+
+ def test_pipe_tuple(self):
+ df = DataFrame({'A': [1, 2, 3]})
+ f = lambda x, y: y
+ result = df.pipe((f, 'y'), 0)
+ assert_frame_equal(result, df)
+
+ result = df.A.pipe((f, 'y'), 0)
+ assert_series_equal(result, df.A)
+
+ def test_pipe_tuple_error(self):
+ df = DataFrame({"A": [1, 2, 3]})
+ f = lambda x, y: y
+ with pytest.raises(ValueError):
+ df.pipe((f, 'y'), x=1, y=0)
+
+ with pytest.raises(ValueError):
+ df.A.pipe((f, 'y'), x=1, y=0)
+
+ def test_pipe_panel(self):
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ wp = Panel({'r1': DataFrame({"A": [1, 2, 3]})})
+ f = lambda x, y: x + y
+ result = wp.pipe(f, 2)
+ expected = wp + 2
+ assert_panel_equal(result, expected)
+
+ result = wp.pipe((f, 'y'), x=1)
+ expected = wp + 1
+ assert_panel_equal(result, expected)
+
+ with pytest.raises(ValueError):
+ wp.pipe((f, 'y'), x=1, y=1)
+
+ @pytest.mark.parametrize('box', [pd.Series, pd.DataFrame])
+ def test_axis_classmethods(self, box):
+ obj = box()
+ values = (list(box._AXIS_NAMES.keys()) +
+ list(box._AXIS_NUMBERS.keys()) +
+ list(box._AXIS_ALIASES.keys()))
+ for v in values:
+ assert obj._get_axis_number(v) == box._get_axis_number(v)
+ assert obj._get_axis_name(v) == box._get_axis_name(v)
+ assert obj._get_block_manager_axis(v) == \
+ box._get_block_manager_axis(v)
diff --git a/contrib/python/pandas/py2/pandas/tests/generic/test_label_or_level_utils.py b/contrib/python/pandas/py2/pandas/tests/generic/test_label_or_level_utils.py
new file mode 100644
index 00000000000..91c58e01f0c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/generic/test_label_or_level_utils.py
@@ -0,0 +1,406 @@
+import pytest
+
+from pandas.core.dtypes.missing import array_equivalent
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+# Fixtures
+# ========
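+# Terminology used below: a "label" is a column (for axis=0) or an index
+# entry (for axis=1), while a "level" is a level of that axis' own
+# (Multi)Index; e.g. df.set_index('L1') turns the label 'L1' into an
+# index level.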
[email protected]
+def df():
+ """DataFrame with columns 'L1', 'L2', and 'L3' """
+ return pd.DataFrame({'L1': [1, 2, 3],
+ 'L2': [11, 12, 13],
+ 'L3': ['A', 'B', 'C']})
+
+
[email protected](params=[[], ['L1'], ['L1', 'L2'], ['L1', 'L2', 'L3']])
+def df_levels(request, df):
+ """DataFrame with columns or index levels 'L1', 'L2', and 'L3' """
+ levels = request.param
+
+ if levels:
+ df = df.set_index(levels)
+
+ return df
+
+
[email protected]
+def df_ambig(df):
+ """DataFrame with levels 'L1' and 'L2' and labels 'L1' and 'L3' """
+ df = df.set_index(['L1', 'L2'])
+
+ df['L1'] = df['L3']
+
+ return df
+
+
[email protected]
+def df_duplabels(df):
+ """DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2' """
+ df = df.set_index(['L1'])
+ df = pd.concat([df, df['L2']], axis=1)
+
+ return df
+
+
[email protected]
+def panel():
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ return pd.Panel()
+
+
+# Test is label/level reference
+# =============================
+def get_labels_levels(df_levels):
+ expected_labels = list(df_levels.columns)
+ expected_levels = [name for name in df_levels.index.names
+ if name is not None]
+ return expected_labels, expected_levels
+
+
+def assert_label_reference(frame, labels, axis):
+ for label in labels:
+ assert frame._is_label_reference(label, axis=axis)
+ assert not frame._is_level_reference(label, axis=axis)
+ assert frame._is_label_or_level_reference(label, axis=axis)
+
+
+def assert_level_reference(frame, levels, axis):
+ for level in levels:
+ assert frame._is_level_reference(level, axis=axis)
+ assert not frame._is_label_reference(level, axis=axis)
+ assert frame._is_label_or_level_reference(level, axis=axis)
+
+
+# DataFrame
+# ---------
+def test_is_level_or_label_reference_df_simple(df_levels, axis):
+
+ # Compute expected labels and levels
+ expected_labels, expected_levels = get_labels_levels(df_levels)
+
+ # Transpose frame if axis == 1
+ if axis in {1, 'columns'}:
+ df_levels = df_levels.T
+
+ # Perform checks
+ assert_level_reference(df_levels, expected_levels, axis=axis)
+ assert_label_reference(df_levels, expected_labels, axis=axis)
+
+
+def test_is_level_reference_df_ambig(df_ambig, axis):
+
+ # Transpose frame if axis == 1
+ if axis in {1, 'columns'}:
+ df_ambig = df_ambig.T
+
+ # df has both an on-axis level and off-axis label named L1
+ # Therefore L1 should reference the label, not the level
+ assert_label_reference(df_ambig, ['L1'], axis=axis)
+
+ # df has an on-axis level named L2 and it is not ambiguous
+    # Therefore L2 is a level reference
+ assert_level_reference(df_ambig, ['L2'], axis=axis)
+
+    # df has a column named L3, and it is not a level reference
+ assert_label_reference(df_ambig, ['L3'], axis=axis)
+
+
+# Series
+# ------
+def test_is_level_reference_series_simple_axis0(df):
+
+ # Make series with L1 as index
+ s = df.set_index('L1').L2
+ assert_level_reference(s, ['L1'], axis=0)
+ assert not s._is_level_reference('L2')
+
+ # Make series with L1 and L2 as index
+ s = df.set_index(['L1', 'L2']).L3
+ assert_level_reference(s, ['L1', 'L2'], axis=0)
+ assert not s._is_level_reference('L3')
+
+
+def test_is_level_reference_series_axis1_error(df):
+
+ # Make series with L1 as index
+ s = df.set_index('L1').L2
+
+ with pytest.raises(ValueError, match="No axis named 1"):
+ s._is_level_reference('L1', axis=1)
+
+
+# Panel
+# -----
+def test_is_level_reference_panel_error(panel):
+ msg = ("_is_level_reference is not implemented for {type}"
+ .format(type=type(panel)))
+
+ with pytest.raises(NotImplementedError, match=msg):
+ panel._is_level_reference('L1', axis=0)
+
+
+def test_is_label_reference_panel_error(panel):
+ msg = ("_is_label_reference is not implemented for {type}"
+ .format(type=type(panel)))
+
+ with pytest.raises(NotImplementedError, match=msg):
+ panel._is_label_reference('L1', axis=0)
+
+
+def test_is_label_or_level_reference_panel_error(panel):
+ msg = ("_is_label_or_level_reference is not implemented for {type}"
+ .format(type=type(panel)))
+
+ with pytest.raises(NotImplementedError, match=msg):
+ panel._is_label_or_level_reference('L1', axis=0)
+
+
+# Test _check_label_or_level_ambiguity_df
+# =======================================
+
+# DataFrame
+# ---------
+def test_check_label_or_level_ambiguity_df(df_ambig, axis):
+
+ # Transpose frame if axis == 1
+ if axis in {1, "columns"}:
+ df_ambig = df_ambig.T
+
+ if axis in {0, "index"}:
+ msg = "'L1' is both an index level and a column label"
+ else:
+ msg = "'L1' is both a column level and an index label"
+
+ # df_ambig has both an on-axis level and off-axis label named L1
+ # Therefore, L1 is ambiguous.
+ with pytest.raises(ValueError, match=msg):
+ df_ambig._check_label_or_level_ambiguity("L1", axis=axis)
+
+    # df_ambig has an on-axis level named L2, and it is not ambiguous.
+ df_ambig._check_label_or_level_ambiguity("L2", axis=axis)
+
+ # df_ambig has an off-axis label named L3, and it is not ambiguous
+ assert not df_ambig._check_label_or_level_ambiguity("L3", axis=axis)
+
+
+# Series
+# ------
+def test_check_label_or_level_ambiguity_series(df):
+
+ # A series has no columns and therefore references are never ambiguous
+
+ # Make series with L1 as index
+ s = df.set_index("L1").L2
+ s._check_label_or_level_ambiguity("L1", axis=0)
+ s._check_label_or_level_ambiguity("L2", axis=0)
+
+ # Make series with L1 and L2 as index
+ s = df.set_index(["L1", "L2"]).L3
+ s._check_label_or_level_ambiguity("L1", axis=0)
+ s._check_label_or_level_ambiguity("L2", axis=0)
+ s._check_label_or_level_ambiguity("L3", axis=0)
+
+
+def test_check_label_or_level_ambiguity_series_axis1_error(df):
+
+ # Make series with L1 as index
+ s = df.set_index('L1').L2
+
+ with pytest.raises(ValueError, match="No axis named 1"):
+ s._check_label_or_level_ambiguity('L1', axis=1)
+
+
+# Panel
+# -----
+def test_check_label_or_level_ambiguity_panel_error(panel):
+ msg = ("_check_label_or_level_ambiguity is not implemented for {type}"
+ .format(type=type(panel)))
+
+ with pytest.raises(NotImplementedError, match=msg):
+ panel._check_label_or_level_ambiguity("L1", axis=0)
+
+
+# Test _get_label_or_level_values
+# ===============================
+def assert_label_values(frame, labels, axis):
+ for label in labels:
+ if axis in {0, 'index'}:
+ expected = frame[label]._values
+ else:
+ expected = frame.loc[label]._values
+
+ result = frame._get_label_or_level_values(label, axis=axis)
+ assert array_equivalent(expected, result)
+
+
+def assert_level_values(frame, levels, axis):
+ for level in levels:
+ if axis in {0, "index"}:
+ expected = frame.index.get_level_values(level=level)._values
+ else:
+ expected = frame.columns.get_level_values(level=level)._values
+
+ result = frame._get_label_or_level_values(level, axis=axis)
+ assert array_equivalent(expected, result)
+
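+# (these two helpers mirror the two lookup paths: a label resolves to a
+# column/row of the frame itself, while a level resolves through
+# Index.get_level_values on the relevant axis)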
+
+# DataFrame
+# ---------
+def test_get_label_or_level_values_df_simple(df_levels, axis):
+
+ # Compute expected labels and levels
+ expected_labels, expected_levels = get_labels_levels(df_levels)
+
+ # Transpose frame if axis == 1
+ if axis in {1, 'columns'}:
+ df_levels = df_levels.T
+
+ # Perform checks
+ assert_label_values(df_levels, expected_labels, axis=axis)
+ assert_level_values(df_levels, expected_levels, axis=axis)
+
+
+def test_get_label_or_level_values_df_ambig(df_ambig, axis):
+
+ # Transpose frame if axis == 1
+ if axis in {1, 'columns'}:
+ df_ambig = df_ambig.T
+
+ # df has an on-axis level named L2, and it is not ambiguous.
+ assert_level_values(df_ambig, ['L2'], axis=axis)
+
+ # df has an off-axis label named L3, and it is not ambiguous.
+ assert_label_values(df_ambig, ['L3'], axis=axis)
+
+
+def test_get_label_or_level_values_df_duplabels(df_duplabels, axis):
+
+ # Transpose frame if axis == 1
+ if axis in {1, 'columns'}:
+ df_duplabels = df_duplabels.T
+
+ # df has unambiguous level 'L1'
+ assert_level_values(df_duplabels, ['L1'], axis=axis)
+
+ # df has unique label 'L3'
+ assert_label_values(df_duplabels, ['L3'], axis=axis)
+
+ # df has duplicate labels 'L2'
+ if axis in {0, 'index'}:
+ expected_msg = "The column label 'L2' is not unique"
+ else:
+ expected_msg = "The index label 'L2' is not unique"
+
+ with pytest.raises(ValueError, match=expected_msg):
+ assert_label_values(df_duplabels, ['L2'], axis=axis)
+
+
+# Series
+# ------
+def test_get_label_or_level_values_series_axis0(df):
+
+ # Make series with L1 as index
+ s = df.set_index('L1').L2
+ assert_level_values(s, ['L1'], axis=0)
+
+ # Make series with L1 and L2 as index
+ s = df.set_index(['L1', 'L2']).L3
+ assert_level_values(s, ['L1', 'L2'], axis=0)
+
+
+def test_get_label_or_level_values_series_axis1_error(df):
+
+ # Make series with L1 as index
+ s = df.set_index('L1').L2
+
+ with pytest.raises(ValueError, match="No axis named 1"):
+ s._get_label_or_level_values('L1', axis=1)
+
+
+# Panel
+# -----
+def test_get_label_or_level_values_panel_error(panel):
+ msg = ("_get_label_or_level_values is not implemented for {type}"
+ .format(type=type(panel)))
+
+ with pytest.raises(NotImplementedError, match=msg):
+ panel._get_label_or_level_values('L1', axis=0)
+
+
+# Test _drop_labels_or_levels
+# ===========================
+def assert_labels_dropped(frame, labels, axis):
+ for label in labels:
+ df_dropped = frame._drop_labels_or_levels(label, axis=axis)
+
+ if axis in {0, 'index'}:
+ assert label in frame.columns
+ assert label not in df_dropped.columns
+ else:
+ assert label in frame.index
+ assert label not in df_dropped.index
+
+
+def assert_levels_dropped(frame, levels, axis):
+ for level in levels:
+ df_dropped = frame._drop_labels_or_levels(level, axis=axis)
+
+ if axis in {0, 'index'}:
+ assert level in frame.index.names
+ assert level not in df_dropped.index.names
+ else:
+ assert level in frame.columns.names
+ assert level not in df_dropped.columns.names
+
+
+# DataFrame
+# ---------
+def test_drop_labels_or_levels_df(df_levels, axis):
+
+ # Compute expected labels and levels
+ expected_labels, expected_levels = get_labels_levels(df_levels)
+
+ # Transpose frame if axis == 1
+ if axis in {1, 'columns'}:
+ df_levels = df_levels.T
+
+ # Perform checks
+ assert_labels_dropped(df_levels, expected_labels, axis=axis)
+ assert_levels_dropped(df_levels, expected_levels, axis=axis)
+
+ with pytest.raises(ValueError, match="not valid labels or levels"):
+ df_levels._drop_labels_or_levels('L4', axis=axis)
+
+
+# Series
+# ------
+def test_drop_labels_or_levels_series(df):
+
+ # Make series with L1 as index
+ s = df.set_index('L1').L2
+ assert_levels_dropped(s, ['L1'], axis=0)
+
+ with pytest.raises(ValueError, match="not valid labels or levels"):
+ s._drop_labels_or_levels('L4', axis=0)
+
+ # Make series with L1 and L2 as index
+ s = df.set_index(['L1', 'L2']).L3
+ assert_levels_dropped(s, ['L1', 'L2'], axis=0)
+
+ with pytest.raises(ValueError, match="not valid labels or levels"):
+ s._drop_labels_or_levels('L4', axis=0)
+
+
+# Panel
+# -----
+def test_drop_labels_or_levels_panel_error(panel):
+ msg = ("_drop_labels_or_levels is not implemented for {type}"
+ .format(type=type(panel)))
+
+ with pytest.raises(NotImplementedError, match=msg):
+ panel._drop_labels_or_levels('L1', axis=0)
diff --git a/contrib/python/pandas/py2/pandas/tests/generic/test_panel.py b/contrib/python/pandas/py2/pandas/tests/generic/test_panel.py
new file mode 100644
index 00000000000..8b090d95195
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/generic/test_panel.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=E1101,W0612
+
+from warnings import catch_warnings, simplefilter
+
+import pandas.util._test_decorators as td
+
+from pandas import Panel
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal, assert_panel_equal
+
+from .test_generic import Generic
+
+
+class TestPanel(Generic):
+ _typ = Panel
+ _comparator = lambda self, x, y: assert_panel_equal(x, y, by_blocks=True)
+
+ @td.skip_if_no('xarray', min_version='0.7.0')
+ def test_to_xarray(self):
+ from xarray import DataArray
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ p = tm.makePanel()
+
+ result = p.to_xarray()
+ assert isinstance(result, DataArray)
+ assert len(result.coords) == 3
+ assert_almost_equal(list(result.coords.keys()),
+ ['items', 'major_axis', 'minor_axis'])
+ assert len(result.dims) == 3
+
+ # idempotency
+ assert_panel_equal(result.to_pandas(), p)
+
+
+# run all the tests, but wrap each in a warning catcher
+for t in ['test_rename', 'test_get_numeric_data',
+ 'test_get_default', 'test_nonzero',
+ 'test_downcast', 'test_constructor_compound_dtypes',
+ 'test_head_tail',
+ 'test_size_compat', 'test_split_compat',
+ 'test_unexpected_keyword',
+ 'test_stat_unexpected_keyword', 'test_api_compat',
+ 'test_stat_non_defaults_args',
+ 'test_truncate_out_of_bounds',
+ 'test_metadata_propagation', 'test_copy_and_deepcopy',
+ 'test_pct_change', 'test_sample']:
+
+    def f(t=t):
+        # bind the current test name at definition time; a closure over
+        # the loop variable would late-bind and every generated tester
+        # would end up running the last name in the loop
+        def tester(self):
+            f = getattr(super(TestPanel, self), t)
+            with catch_warnings(record=True):
+                simplefilter("ignore", FutureWarning)
+                f()
+        return tester
+
+    setattr(TestPanel, t, f())
diff --git a/contrib/python/pandas/py2/pandas/tests/generic/test_series.py b/contrib/python/pandas/py2/pandas/tests/generic/test_series.py
new file mode 100644
index 00000000000..10430ebde82
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/generic/test_series.py
@@ -0,0 +1,247 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=E1101,W0612
+
+from distutils.version import LooseVersion
+from operator import methodcaller
+
+import numpy as np
+import pytest
+
+from pandas.compat import range
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import MultiIndex, Series, date_range
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal, assert_series_equal
+
+from .test_generic import Generic
+
+try:
+ import xarray
+ _XARRAY_INSTALLED = True
+except ImportError:
+ _XARRAY_INSTALLED = False
+
+
+class TestSeries(Generic):
+ _typ = Series
+ _comparator = lambda self, x, y: assert_series_equal(x, y)
+
+ def setup_method(self):
+ self.ts = tm.makeTimeSeries() # Was at top level in test_series
+ self.ts.name = 'ts'
+
+ self.series = tm.makeStringSeries()
+ self.series.name = 'series'
+
+ def test_rename_mi(self):
+ s = Series([11, 21, 31],
+ index=MultiIndex.from_tuples(
+ [("A", x) for x in ["a", "B", "c"]]))
+ s.rename(str.lower)
+
+ def test_set_axis_name(self):
+ s = Series([1, 2, 3], index=['a', 'b', 'c'])
+ funcs = ['rename_axis', '_set_axis_name']
+ name = 'foo'
+ for func in funcs:
+ result = methodcaller(func, name)(s)
+ assert s.index.name is None
+ assert result.index.name == name
+
+ def test_set_axis_name_mi(self):
+ s = Series([11, 21, 31], index=MultiIndex.from_tuples(
+ [("A", x) for x in ["a", "B", "c"]],
+ names=['l1', 'l2'])
+ )
+ funcs = ['rename_axis', '_set_axis_name']
+ for func in funcs:
+ result = methodcaller(func, ['L1', 'L2'])(s)
+ assert s.index.name is None
+ assert s.index.names == ['l1', 'l2']
+ assert result.index.name is None
+            assert result.index.names == ['L1', 'L2']
+
+ def test_set_axis_name_raises(self):
+ s = pd.Series([1])
+ with pytest.raises(ValueError):
+ s._set_axis_name(name='a', axis=1)
+
+ def test_get_numeric_data_preserve_dtype(self):
+
+ # get the numeric data
+ o = Series([1, 2, 3])
+ result = o._get_numeric_data()
+ self._compare(result, o)
+
+ o = Series([1, '2', 3.])
+ result = o._get_numeric_data()
+ expected = Series([], dtype=object, index=pd.Index([], dtype=object))
+ self._compare(result, expected)
+
+ o = Series([True, False, True])
+ result = o._get_numeric_data()
+ self._compare(result, o)
+
+ o = Series([True, False, True])
+ result = o._get_bool_data()
+ self._compare(result, o)
+
+ o = Series(date_range('20130101', periods=3))
+ result = o._get_numeric_data()
+ expected = Series([], dtype='M8[ns]', index=pd.Index([], dtype=object))
+ self._compare(result, expected)
+
+ def test_nonzero_single_element(self):
+
+ # allow single item via bool method
+ s = Series([True])
+ assert s.bool()
+
+ s = Series([False])
+ assert not s.bool()
+
+        # single-item Series, NaN or otherwise, still raise on bool()
+ for s in [Series([np.nan]), Series([pd.NaT]), Series([True]),
+ Series([False])]:
+ pytest.raises(ValueError, lambda: bool(s))
+
+ for s in [Series([np.nan]), Series([pd.NaT])]:
+ pytest.raises(ValueError, lambda: s.bool())
+
+        # multiple bools are still an error
+ for s in [Series([True, True]), Series([False, False])]:
+ pytest.raises(ValueError, lambda: bool(s))
+ pytest.raises(ValueError, lambda: s.bool())
+
+        # single non-bool values are an error
+ for s in [Series([1]), Series([0]), Series(['a']), Series([0.0])]:
+ pytest.raises(ValueError, lambda: bool(s))
+ pytest.raises(ValueError, lambda: s.bool())
+
+ def test_metadata_propagation_indiv(self):
+ # check that the metadata matches up on the resulting ops
+
+ o = Series(range(3), range(3))
+ o.name = 'foo'
+ o2 = Series(range(3), range(3))
+ o2.name = 'bar'
+
+ result = o.T
+ self.check_metadata(o, result)
+
+ # resample
+ ts = Series(np.random.rand(1000),
+ index=date_range('20130101', periods=1000, freq='s'),
+ name='foo')
+ result = ts.resample('1T').mean()
+ self.check_metadata(ts, result)
+
+ result = ts.resample('1T').min()
+ self.check_metadata(ts, result)
+
+ result = ts.resample('1T').apply(lambda x: x.sum())
+ self.check_metadata(ts, result)
+
+ _metadata = Series._metadata
+ _finalize = Series.__finalize__
+ Series._metadata = ['name', 'filename']
+ o.filename = 'foo'
+ o2.filename = 'bar'
+
+ def finalize(self, other, method=None, **kwargs):
+ for name in self._metadata:
+ if method == 'concat' and name == 'filename':
+                    value = '+'.join([getattr(o, name)
+                                      for o in other.objs
+                                      if getattr(o, name, None)])
+ object.__setattr__(self, name, value)
+ else:
+ object.__setattr__(self, name, getattr(other, name, None))
+
+ return self
+
+ Series.__finalize__ = finalize
+
+ result = pd.concat([o, o2])
+ assert result.filename == 'foo+bar'
+ assert result.name is None
+
+ # reset
+ Series._metadata = _metadata
+ Series.__finalize__ = _finalize
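+        # (__finalize__ is the hook pandas calls to propagate _metadata
+        # from inputs to results; the override above is only a local
+        # sketch of merging metadata for 'concat', hence the reset)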
+
+ @pytest.mark.skipif(not _XARRAY_INSTALLED or _XARRAY_INSTALLED and
+ LooseVersion(xarray.__version__) <
+ LooseVersion('0.10.0'),
+ reason='xarray >= 0.10.0 required')
+ @pytest.mark.parametrize(
+ "index",
+ ['FloatIndex', 'IntIndex',
+ 'StringIndex', 'UnicodeIndex',
+ 'DateIndex', 'PeriodIndex',
+ 'TimedeltaIndex', 'CategoricalIndex'])
+ def test_to_xarray_index_types(self, index):
+ from xarray import DataArray
+
+ index = getattr(tm, 'make{}'.format(index))
+ s = Series(range(6), index=index(6))
+ s.index.name = 'foo'
+ result = s.to_xarray()
+ repr(result)
+ assert len(result) == 6
+ assert len(result.coords) == 1
+ assert_almost_equal(list(result.coords.keys()), ['foo'])
+ assert isinstance(result, DataArray)
+
+ # idempotency
+ assert_series_equal(result.to_series(), s,
+ check_index_type=False,
+ check_categorical=True)
+
+ @td.skip_if_no('xarray', min_version='0.7.0')
+ def test_to_xarray(self):
+ from xarray import DataArray
+
+ s = Series([])
+ s.index.name = 'foo'
+ result = s.to_xarray()
+ assert len(result) == 0
+ assert len(result.coords) == 1
+ assert_almost_equal(list(result.coords.keys()), ['foo'])
+ assert isinstance(result, DataArray)
+
+ s = Series(range(6))
+ s.index.name = 'foo'
+ s.index = pd.MultiIndex.from_product([['a', 'b'], range(3)],
+ names=['one', 'two'])
+ result = s.to_xarray()
+ assert len(result) == 2
+ assert_almost_equal(list(result.coords.keys()), ['one', 'two'])
+ assert isinstance(result, DataArray)
+ assert_series_equal(result.to_series(), s)
+
+ def test_valid_deprecated(self):
+ # GH18800
+ with tm.assert_produces_warning(FutureWarning):
+ pd.Series([]).valid()
+
+ @pytest.mark.parametrize("s", [
+ Series([np.arange(5)]),
+ pd.date_range('1/1/2011', periods=24, freq='H'),
+ pd.Series(range(5), index=pd.date_range("2017", periods=5))
+ ])
+ @pytest.mark.parametrize("shift_size", [0, 1, 2])
+ def test_shift_always_copy(self, s, shift_size):
+ # GH22397
+ assert s.shift(shift_size) is not s
+
+ @pytest.mark.parametrize("move_by_freq", [
+ pd.Timedelta('1D'),
+ pd.Timedelta('1M'),
+ ])
+ def test_datetime_shift_always_copy(self, move_by_freq):
+ # GH22397
+ s = pd.Series(range(5), index=pd.date_range("2017", periods=5))
+ assert s.shift(freq=move_by_freq) is not s
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/__init__.py b/contrib/python/pandas/py2/pandas/tests/groupby/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/__init__.py b/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_aggregate.py b/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_aggregate.py
new file mode 100644
index 00000000000..62ec0555f90
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -0,0 +1,289 @@
+# -*- coding: utf-8 -*-
+
+"""
+test .agg behavior / note that .apply is tested generally in test_groupby.py
+"""
+
+import numpy as np
+import pytest
+
+from pandas.compat import OrderedDict
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series, concat
+from pandas.core.base import SpecificationError
+from pandas.core.groupby.grouper import Grouping
+import pandas.util.testing as tm
+
+
+def test_agg_regression1(tsframe):
+ grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_must_agg(df):
+ grouped = df.groupby('A')['C']
+
+ msg = "Must produce aggregated value"
+ with pytest.raises(Exception, match=msg):
+ grouped.agg(lambda x: x.describe())
+ with pytest.raises(Exception, match=msg):
+ grouped.agg(lambda x: x.index[:2])
+
+
+def test_agg_ser_multi_key(df):
+ # TODO(wesm): unused
+ ser = df.C # noqa
+
+ f = lambda x: x.sum()
+ results = df.C.groupby([df.A, df.B]).aggregate(f)
+ expected = df.groupby(['A', 'B']).sum()['C']
+ tm.assert_series_equal(results, expected)
+
+
+def test_groupby_aggregation_mixed_dtype():
+
+ # GH 6212
+ expected = DataFrame({
+ 'v1': [5, 5, 7, np.nan, 3, 3, 4, 1],
+ 'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]},
+ index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99),
+ ('big', 'damp'),
+ ('blue', 'dry'),
+ ('red', 'red'), ('red', 'wet')],
+ names=['by1', 'by2']))
+
+ df = DataFrame({
+ 'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
+ 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
+ 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan,
+ 12],
+ 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99,
+ np.nan, np.nan]
+ })
+
+ g = df.groupby(['by1', 'by2'])
+ result = g[['v1', 'v2']].mean()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_apply_corner(ts, tsframe):
+ # nothing to group, all NA
+ grouped = ts.groupby(ts * np.nan)
+ assert ts.dtype == np.float64
+
+ # groupby float64 values results in Float64Index
+ exp = Series([], dtype=np.float64,
+ index=pd.Index([], dtype=np.float64))
+ tm.assert_series_equal(grouped.sum(), exp)
+ tm.assert_series_equal(grouped.agg(np.sum), exp)
+ tm.assert_series_equal(grouped.apply(np.sum), exp,
+ check_index_type=False)
+
+ # DataFrame
+ grouped = tsframe.groupby(tsframe['A'] * np.nan)
+ exp_df = DataFrame(columns=tsframe.columns, dtype=float,
+ index=pd.Index([], dtype=np.float64))
+ tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False)
+ tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
+ tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0],
+ check_names=False)
+
+
+def test_agg_grouping_is_list_tuple(ts):
+ df = tm.makeTimeDataFrame()
+
+ grouped = df.groupby(lambda x: x.year)
+ grouper = grouped.grouper.groupings[0].grouper
+ grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper))
+
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ tm.assert_frame_equal(result, expected)
+
+ grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper))
+
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_python_multiindex(mframe):
+ grouped = mframe.groupby(['A', 'B'])
+
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]('groupbyfunc', [
+ lambda x: x.weekday(),
+ [lambda x: x.month, lambda x: x.weekday()],
+])
+def test_aggregate_str_func(tsframe, groupbyfunc):
+ grouped = tsframe.groupby(groupbyfunc)
+
+ # single series
+ result = grouped['A'].agg('std')
+ expected = grouped['A'].std()
+ tm.assert_series_equal(result, expected)
+
+ # group frame by function name
+ result = grouped.aggregate('var')
+ expected = grouped.var()
+ tm.assert_frame_equal(result, expected)
+
+ # group frame by function dict
+ result = grouped.agg(OrderedDict([['A', 'var'],
+ ['B', 'std'],
+ ['C', 'mean'],
+ ['D', 'sem']]))
+ expected = DataFrame(OrderedDict([['A', grouped['A'].var()],
+ ['B', grouped['B'].std()],
+ ['C', grouped['C'].mean()],
+ ['D', grouped['D'].sem()]]))
+ tm.assert_frame_equal(result, expected)
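+    # (the OrderedDict maps column -> aggregation function; plain dicts
+    # are unordered on Python 2, which is presumably why OrderedDict is
+    # used here to keep the result columns deterministic)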
+
+
+def test_aggregate_item_by_item(df):
+ grouped = df.groupby('A')
+
+ aggfun = lambda ser: ser.size
+ result = grouped.agg(aggfun)
+ foo = (df.A == 'foo').sum()
+ bar = (df.A == 'bar').sum()
+ K = len(result.columns)
+
+ # GH5782
+    # odd comparisons can result here, so cast to make them easy
+ exp = pd.Series(np.array([foo] * K), index=list('BCD'),
+ dtype=np.float64, name='foo')
+ tm.assert_series_equal(result.xs('foo'), exp)
+
+ exp = pd.Series(np.array([bar] * K), index=list('BCD'),
+ dtype=np.float64, name='bar')
+ tm.assert_almost_equal(result.xs('bar'), exp)
+
+ def aggfun(ser):
+ return ser.size
+
+ result = DataFrame().groupby(df.A).agg(aggfun)
+ assert isinstance(result, DataFrame)
+ assert len(result) == 0
+
+
+def test_wrap_agg_out(three_group):
+ grouped = three_group.groupby(['A', 'B'])
+
+ def func(ser):
+ if ser.dtype == np.object:
+ raise TypeError
+ else:
+ return ser.sum()
+
+ result = grouped.aggregate(func)
+ exp_grouped = three_group.loc[:, three_group.columns != 'C']
+ expected = exp_grouped.groupby(['A', 'B']).aggregate(func)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_multiple_functions_maintain_order(df):
+ # GH #610
+ funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)]
+ result = df.groupby('A')['C'].agg(funcs)
+ exp_cols = Index(['mean', 'max', 'min'])
+
+ tm.assert_index_equal(result.columns, exp_cols)
+
+
+def test_multiple_functions_tuples_and_non_tuples(df):
+ # #1359
+ funcs = [('foo', 'mean'), 'std']
+ ex_funcs = [('foo', 'mean'), ('std', 'std')]
+
+ result = df.groupby('A')['C'].agg(funcs)
+ expected = df.groupby('A')['C'].agg(ex_funcs)
+ tm.assert_frame_equal(result, expected)
+
+ result = df.groupby('A').agg(funcs)
+ expected = df.groupby('A').agg(ex_funcs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_multiple_functions_too_many_lambdas(df):
+ grouped = df.groupby('A')
+ funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]
+
+ msg = 'Function names must be unique, found multiple named <lambda>'
+ with pytest.raises(SpecificationError, match=msg):
+ grouped.agg(funcs)
+
+
+def test_more_flexible_frame_multi_function(df):
+ grouped = df.groupby('A')
+
+ exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]]))
+ exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]]))
+
+ expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
+ expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)
+
+ d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]])
+ result = grouped.aggregate(d)
+
+ tm.assert_frame_equal(result, expected)
+
+    # be careful: a single function mixed with a list of functions
+ result = grouped.aggregate(OrderedDict([['C', np.mean],
+ ['D', [np.mean, np.std]]]))
+ expected = grouped.aggregate(OrderedDict([['C', np.mean],
+ ['D', [np.mean, np.std]]]))
+ tm.assert_frame_equal(result, expected)
+
+ def foo(x):
+ return np.mean(x)
+
+ def bar(x):
+ return np.std(x, ddof=1)
+
+ # this uses column selection & renaming
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ d = OrderedDict([['C', np.mean],
+ ['D', OrderedDict([['foo', np.mean],
+ ['bar', np.std]])]])
+ result = grouped.aggregate(d)
+
+ d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]])
+ expected = grouped.aggregate(d)
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_multi_function_flexible_mix(df):
+ # GH #1268
+ grouped = df.groupby('A')
+
+ # Expected
+ d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
+ ['D', {'sum': 'sum'}]])
+ # this uses column selection & renaming
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ expected = grouped.aggregate(d)
+
+ # Test 1
+ d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
+ ['D', 'sum']])
+ # this uses column selection & renaming
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = grouped.aggregate(d)
+ tm.assert_frame_equal(result, expected)
+
+ # Test 2
+ d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
+ ['D', ['sum']]])
+ # this uses column selection & renaming
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = grouped.aggregate(d)
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_cython.py b/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_cython.py
new file mode 100644
index 00000000000..ad3974d5e2f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_cython.py
@@ -0,0 +1,218 @@
+# -*- coding: utf-8 -*-
+
+"""
+test cython .agg behavior
+"""
+
+from __future__ import print_function
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+ DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range)
+from pandas.core.groupby.groupby import DataError
+import pandas.util.testing as tm
+
+
[email protected]('op_name', [
+ 'count',
+ 'sum',
+ 'std',
+ 'var',
+ 'sem',
+ 'mean',
+ pytest.param('median',
+ # ignore mean of empty slice
+ # and all-NaN
+ marks=[pytest.mark.filterwarnings(
+ "ignore::RuntimeWarning"
+ )]),
+ 'prod',
+ 'min',
+ 'max',
+])
+def test_cythonized_aggers(op_name):
+ data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., np.nan, np.nan],
+ 'B': ['A', 'B'] * 6,
+ 'C': np.random.randn(12)}
+ df = DataFrame(data)
+ df.loc[2:10:2, 'C'] = np.nan
+
+ op = lambda x: getattr(x, op_name)()
+
+ # single column
+ grouped = df.drop(['B'], axis=1).groupby('A')
+ exp = {cat: op(group['C']) for cat, group in grouped}
+ exp = DataFrame({'C': exp})
+ exp.index.name = 'A'
+ result = op(grouped)
+ tm.assert_frame_equal(result, exp)
+
+ # multiple columns
+ grouped = df.groupby(['A', 'B'])
+ expd = {}
+ for (cat1, cat2), group in grouped:
+ expd.setdefault(cat1, {})[cat2] = op(group['C'])
+ exp = DataFrame(expd).T.stack(dropna=False)
+ exp.index.names = ['A', 'B']
+ exp.name = 'C'
+
+ result = op(grouped)['C']
+ if op_name in ['sum', 'prod']:
+ tm.assert_series_equal(result, exp)
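+    # (only sum/prod are compared above; the remaining ops presumably
+    # diverge from this naive per-group expectation, e.g. in NaN
+    # handling)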
+
+
+def test_cython_agg_boolean():
+ frame = DataFrame({'a': np.random.randint(0, 5, 50),
+ 'b': np.random.randint(0, 2, 50).astype('bool')})
+ result = frame.groupby('a')['b'].mean()
+ expected = frame.groupby('a')['b'].agg(np.mean)
+
+ tm.assert_series_equal(result, expected)
+
+
+def test_cython_agg_nothing_to_agg():
+ frame = DataFrame({'a': np.random.randint(0, 5, 50),
+ 'b': ['foo', 'bar'] * 25})
+ msg = "No numeric types to aggregate"
+
+ with pytest.raises(DataError, match=msg):
+ frame.groupby('a')['b'].mean()
+
+ frame = DataFrame({'a': np.random.randint(0, 5, 50),
+ 'b': ['foo', 'bar'] * 25})
+ with pytest.raises(DataError, match=msg):
+ frame[['b']].groupby(frame['a']).mean()
+
+
+def test_cython_agg_nothing_to_agg_with_dates():
+ frame = DataFrame({'a': np.random.randint(0, 5, 50),
+ 'b': ['foo', 'bar'] * 25,
+ 'dates': pd.date_range('now', periods=50, freq='T')})
+ msg = "No numeric types to aggregate"
+ with pytest.raises(DataError, match=msg):
+ frame.groupby('b').dates.mean()
+
+
+def test_cython_agg_frame_columns():
+ # #2113
+ df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})
+
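+    # (the identical calls appear to be repeated on purpose, exercising
+    # repeated grouping of the same frame along the column axis)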
+ df.groupby(level=0, axis='columns').mean()
+ df.groupby(level=0, axis='columns').mean()
+ df.groupby(level=0, axis='columns').mean()
+ df.groupby(level=0, axis='columns').mean()
+
+
+def test_cython_agg_return_dict():
+ # GH 16741
+ df = DataFrame(
+ {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
+ 'C': np.random.randn(8),
+ 'D': np.random.randn(8)})
+
+ ts = df.groupby('A')['B'].agg(lambda x: x.value_counts().to_dict())
+ expected = Series([{'two': 1, 'one': 1, 'three': 1},
+ {'two': 2, 'one': 2, 'three': 1}],
+ index=Index(['bar', 'foo'], name='A'),
+ name='B')
+ tm.assert_series_equal(ts, expected)
+
+
+def test_cython_fail_agg():
+ dr = bdate_range('1/1/2000', periods=50)
+ ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr)
+
+ grouped = ts.groupby(lambda x: x.month)
+ summed = grouped.sum()
+ expected = grouped.agg(np.sum)
+ tm.assert_series_equal(summed, expected)
+
+
[email protected]('op, targop', [
+ ('mean', np.mean),
+ ('median', np.median),
+ ('var', np.var),
+ ('add', np.sum),
+ ('prod', np.prod),
+ ('min', np.min),
+ ('max', np.max),
+ ('first', lambda x: x.iloc[0]),
+ ('last', lambda x: x.iloc[-1]),
+])
+def test__cython_agg_general(op, targop):
+ df = DataFrame(np.random.randn(1000))
+ labels = np.random.randint(0, 50, size=1000).astype(float)
+
+ result = df.groupby(labels)._cython_agg_general(op)
+ expected = df.groupby(labels).agg(targop)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]('op, targop', [
+ ('mean', np.mean),
+ ('median', lambda x: np.median(x) if len(x) > 0 else np.nan),
+ ('var', lambda x: np.var(x, ddof=1)),
+ ('min', np.min),
+ ('max', np.max), ]
+)
+def test_cython_agg_empty_buckets(op, targop, observed):
+ df = pd.DataFrame([11, 12, 13])
+ grps = range(0, 55, 5)
+
+    # call _cython_agg_general directly, instead of via the user API,
+    # which sets different values for min_count
+ g = df.groupby(pd.cut(df[0], grps), observed=observed)
+ result = g._cython_agg_general(op)
+
+ g = df.groupby(pd.cut(df[0], grps), observed=observed)
+ expected = g.agg(lambda x: targop(x))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_cython_agg_empty_buckets_nanops(observed):
+ # GH-18869 can't call nanops on empty groups, so hardcode expected
+ # for these
+ df = pd.DataFrame([11, 12, 13], columns=['a'])
+ grps = range(0, 25, 5)
+ # add / sum
+ result = df.groupby(pd.cut(df['a'], grps),
+ observed=observed)._cython_agg_general('add')
+ intervals = pd.interval_range(0, 20, freq=5)
+ expected = pd.DataFrame(
+ {"a": [0, 0, 36, 0]},
+ index=pd.CategoricalIndex(intervals, name='a', ordered=True))
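+    # (36 == 11 + 12 + 13; all three values fall into the (10, 15] bucket)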
+ if observed:
+ expected = expected[expected.a != 0]
+
+ tm.assert_frame_equal(result, expected)
+
+ # prod
+ result = df.groupby(pd.cut(df['a'], grps),
+ observed=observed)._cython_agg_general('prod')
+ expected = pd.DataFrame(
+ {"a": [1, 1, 1716, 1]},
+ index=pd.CategoricalIndex(intervals, name='a', ordered=True))
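+    # (1716 == 11 * 12 * 13, again all in the (10, 15] bucket)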
+ if observed:
+ expected = expected[expected.a != 1]
+
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]('op', ['first', 'last', 'max', 'min'])
[email protected]('data', [Timestamp('2016-10-14 21:00:44.557'),
+                                  Timedelta('17088 days 21:00:44.557'), ])
+def test_cython_with_timestamp_and_nat(op, data):
+ # https://github.com/pandas-dev/pandas/issues/19526
+ df = DataFrame({'a': [0, 1], 'b': [data, NaT]})
+ index = Index([0, 1], name='a')
+
+ # We will group by a and test the cython aggregations
+ expected = DataFrame({'b': [data, NaT]}, index=index)
+
+ result = df.groupby('a').aggregate(op)
+ tm.assert_frame_equal(expected, result)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_other.py b/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_other.py
new file mode 100644
index 00000000000..b5214b11bdd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/aggregate/test_other.py
@@ -0,0 +1,514 @@
+# -*- coding: utf-8 -*-
+
+"""
+test all other .agg behavior
+"""
+
+from __future__ import print_function
+
+from collections import OrderedDict
+import datetime as dt
+from functools import partial
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+ DataFrame, Index, MultiIndex, PeriodIndex, Series, date_range,
+ period_range)
+from pandas.core.groupby.groupby import SpecificationError
+import pandas.util.testing as tm
+
+from pandas.io.formats.printing import pprint_thing
+
+
+def test_agg_api():
+ # GH 6337
+ # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
+ # different api for agg when passed custom function with mixed frame
+
+ df = DataFrame({'data1': np.random.randn(5),
+ 'data2': np.random.randn(5),
+ 'key1': ['a', 'a', 'b', 'b', 'a'],
+ 'key2': ['one', 'two', 'one', 'two', 'one']})
+ grouped = df.groupby('key1')
+
+ def peak_to_peak(arr):
+ return arr.max() - arr.min()
+
+ expected = grouped.agg([peak_to_peak])
+ expected.columns = ['data1', 'data2']
+ result = grouped.agg(peak_to_peak)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_datetimes_mixed():
+ data = [[1, '2012-01-01', 1.0],
+ [2, '2012-01-02', 2.0],
+ [3, None, 3.0]]
+
+ df1 = DataFrame({'key': [x[0] for x in data],
+ 'date': [x[1] for x in data],
+ 'value': [x[2] for x in data]})
+
+ data = [[row[0],
+ (dt.datetime.strptime(row[1], '%Y-%m-%d').date()
+ if row[1] else None),
+ row[2]]
+ for row in data]
+
+ df2 = DataFrame({'key': [x[0] for x in data],
+ 'date': [x[1] for x in data],
+ 'value': [x[2] for x in data]})
+
+ df1['weights'] = df1['value'] / df1['value'].sum()
+ gb1 = df1.groupby('date').aggregate(np.sum)
+
+ df2['weights'] = df1['value'] / df1['value'].sum()
+ gb2 = df2.groupby('date').aggregate(np.sum)
+
+    assert len(gb1) == len(gb2)
+
+
+def test_agg_period_index():
+ prng = period_range('2012-1-1', freq='M', periods=3)
+ df = DataFrame(np.random.randn(3, 2), index=prng)
+ rs = df.groupby(level=0).sum()
+ assert isinstance(rs.index, PeriodIndex)
+
+ # GH 3579
+ index = period_range(start='1999-01', periods=5, freq='M')
+ s1 = Series(np.random.rand(len(index)), index=index)
+ s2 = Series(np.random.rand(len(index)), index=index)
+ series = [('s1', s1), ('s2', s2)]
+ df = DataFrame.from_dict(OrderedDict(series))
+ grouped = df.groupby(df.index.month)
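+ # smoke test: materializing the groups should not raise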
+ list(grouped)
+
+
+def test_agg_dict_parameter_cast_result_dtypes():
+ # GH 12821
+
+ df = DataFrame({'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
+ 'time': date_range('1/1/2011', periods=8, freq='H')})
+ df.loc[[0, 1, 2, 5], 'time'] = None
+
+ # test for `first` function
+ exp = df.loc[[0, 3, 4, 6]].set_index('class')
+ grouped = df.groupby('class')
+ tm.assert_frame_equal(grouped.first(), exp)
+ tm.assert_frame_equal(grouped.agg('first'), exp)
+ tm.assert_frame_equal(grouped.agg({'time': 'first'}), exp)
+ tm.assert_series_equal(grouped.time.first(), exp['time'])
+ tm.assert_series_equal(grouped.time.agg('first'), exp['time'])
+
+ # test for `last` function
+ exp = df.loc[[0, 3, 4, 7]].set_index('class')
+ grouped = df.groupby('class')
+ tm.assert_frame_equal(grouped.last(), exp)
+ tm.assert_frame_equal(grouped.agg('last'), exp)
+ tm.assert_frame_equal(grouped.agg({'time': 'last'}), exp)
+ tm.assert_series_equal(grouped.time.last(), exp['time'])
+ tm.assert_series_equal(grouped.time.agg('last'), exp['time'])
+
+ # count
+ exp = pd.Series([2, 2, 2, 2],
+ index=Index(list('ABCD'), name='class'),
+ name='time')
+ tm.assert_series_equal(grouped.time.agg(len), exp)
+ tm.assert_series_equal(grouped.time.size(), exp)
+
+ exp = pd.Series([0, 1, 1, 2],
+ index=Index(list('ABCD'), name='class'),
+ name='time')
+ tm.assert_series_equal(grouped.time.count(), exp)
+
+
+def test_agg_cast_results_dtypes():
+ # similar to GH12821
+ # xref #11444
+ u = [dt.datetime(2015, x + 1, 1) for x in range(12)]
+ v = list('aaabbbbbbccd')
+ df = pd.DataFrame({'X': v, 'Y': u})
+
+ result = df.groupby('X')['Y'].agg(len)
+ expected = df.groupby('X')['Y'].count()
+ tm.assert_series_equal(result, expected)
+
+
+def test_aggregate_float64_no_int64():
+ # see gh-11199
+ df = DataFrame({"a": [1, 2, 3, 4, 5],
+ "b": [1, 2, 2, 4, 5],
+ "c": [1, 2, 3, 4, 5]})
+
+ expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
+ expected.index.name = "b"
+
+ result = df.groupby("b")[["a"]].mean()
+ tm.assert_frame_equal(result, expected)
+
+ expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]},
+ index=[1, 2, 4, 5])
+ expected.index.name = "b"
+
+ result = df.groupby("b")[["a", "c"]].mean()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_aggregate_api_consistency():
+ # GH 9052
+ # make sure that the aggregates via dict
+ # are consistent
+ df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'two',
+ 'two', 'two', 'one', 'two'],
+ 'C': np.random.randn(8) + 1.0,
+ 'D': np.arange(8)})
+
+ grouped = df.groupby(['A', 'B'])
+ c_mean = grouped['C'].mean()
+ c_sum = grouped['C'].sum()
+ d_mean = grouped['D'].mean()
+ d_sum = grouped['D'].sum()
+
+ result = grouped['D'].agg(['sum', 'mean'])
+ expected = pd.concat([d_sum, d_mean], axis=1)
+ expected.columns = ['sum', 'mean']
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ result = grouped.agg([np.sum, np.mean])
+ expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
+ expected.columns = MultiIndex.from_product([['C', 'D'],
+ ['sum', 'mean']])
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ result = grouped[['D', 'C']].agg([np.sum, np.mean])
+ expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
+ expected.columns = MultiIndex.from_product([['D', 'C'],
+ ['sum', 'mean']])
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ result = grouped.agg({'C': 'mean', 'D': 'sum'})
+ expected = pd.concat([d_sum, c_mean], axis=1)
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ result = grouped.agg({'C': ['mean', 'sum'],
+ 'D': ['mean', 'sum']})
+ expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
+ expected.columns = MultiIndex.from_product([['C', 'D'],
+ ['mean', 'sum']])
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = grouped[['D', 'C']].agg({'r': np.sum,
+ 'r2': np.mean})
+ expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1)
+ expected.columns = MultiIndex.from_product([['r', 'r2'],
+ ['D', 'C']])
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+
+def test_agg_dict_renaming_deprecation():
+ # 15931
+ df = pd.DataFrame({'A': [1, 1, 1, 2, 2],
+ 'B': range(5),
+ 'C': range(5)})
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False) as w:
+ df.groupby('A').agg({'B': {'foo': ['sum', 'max']},
+ 'C': {'bar': ['count', 'min']}})
+ assert "using a dict with renaming" in str(w[0].message)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ df.groupby('A')[['B', 'C']].agg({'ma': 'max'})
+
+ with tm.assert_produces_warning(FutureWarning) as w:
+ df.groupby('A').B.agg({'foo': 'count'})
+ assert "using a dict on a Series for aggregation" in str(w[0].message)
+
+
+def test_agg_compat():
+ # GH 12334
+ df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'two',
+ 'two', 'two', 'one', 'two'],
+ 'C': np.random.randn(8) + 1.0,
+ 'D': np.arange(8)})
+
+ g = df.groupby(['A', 'B'])
+
+ expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
+ expected.columns = MultiIndex.from_tuples([('C', 'sum'),
+ ('C', 'std')])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = g['D'].agg({'C': ['sum', 'std']})
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
+ expected.columns = ['C', 'D']
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = g['D'].agg({'C': 'sum', 'D': 'std'})
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+
+def test_agg_nested_dicts():
+ # API change for disallowing these types of nested dicts
+ df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'two',
+ 'two', 'two', 'one', 'two'],
+ 'C': np.random.randn(8) + 1.0,
+ 'D': np.arange(8)})
+
+ g = df.groupby(['A', 'B'])
+
+ msg = r'cannot perform renaming for r[1-2] with a nested dictionary'
+ with pytest.raises(SpecificationError, match=msg):
+ g.aggregate({'r1': {'C': ['mean', 'sum']},
+ 'r2': {'D': ['mean', 'sum']}})
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = g.agg({'C': {'ra': ['mean', 'std']},
+ 'D': {'rb': ['mean', 'std']}})
+ expected = pd.concat([g['C'].mean(), g['C'].std(),
+ g['D'].mean(), g['D'].std()],
+ axis=1)
+ expected.columns = pd.MultiIndex.from_tuples(
+ [('ra', 'mean'), ('ra', 'std'),
+ ('rb', 'mean'), ('rb', 'std')])
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ # same name as the original column
+ # GH9052
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ expected = g['D'].agg({'result1': np.sum, 'result2': np.mean})
+ expected = expected.rename(columns={'result1': 'D'})
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = g['D'].agg({'D': np.sum, 'result2': np.mean})
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+
+def test_agg_item_by_item_raise_typeerror():
+ df = DataFrame(np.random.randint(10, size=(20, 10)))
+
+ def raiseException(df):
+ pprint_thing('----------------------------------------')
+ pprint_thing(df.to_string())
+ raise TypeError('test')
+
+ with pytest.raises(TypeError, match='test'):
+ df.groupby(0).agg(raiseException)
+
+
+def test_series_agg_multikey():
+ ts = tm.makeTimeSeries()
+ grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
+
+ result = grouped.agg(np.sum)
+ expected = grouped.sum()
+ tm.assert_series_equal(result, expected)
+
+
+def test_series_agg_multi_pure_python():
+ data = DataFrame(
+ {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
+ 'foo', 'foo', 'foo'],
+ 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
+ 'two', 'two', 'one'],
+ 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
+ 'dull', 'shiny', 'shiny', 'shiny'],
+ 'D': np.random.randn(11),
+ 'E': np.random.randn(11),
+ 'F': np.random.randn(11)})
+
+ def bad(x):
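+ # each chunk's values should arrive as a view whose .base is set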
+ assert (len(x.values.base) > 0)
+ return 'foo'
+
+ result = data.groupby(['A', 'B']).agg(bad)
+ expected = data.groupby(['A', 'B']).agg(lambda x: 'foo')
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_consistency():
+ # agg with a list argument ([]) and a bare callable () should agree
+ # GH 6715
+ def P1(a):
+ try:
+ return np.percentile(a.dropna(), q=1)
+ except Exception:
+ return np.nan
+
+ df = DataFrame({'col1': [1, 2, 3, 4],
+ 'col2': [10, 25, 26, 31],
+ 'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10),
+ dt.date(2013, 2, 11), dt.date(2013, 2, 11)]})
+
+ g = df.groupby('date')
+
+ expected = g.agg([P1])
+ expected.columns = expected.columns.levels[0]
+
+ result = g.agg(P1)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_callables():
+ # GH 7929
+ df = DataFrame({'foo': [1, 2], 'bar': [3, 4]}).astype(np.int64)
+
+ class fn_class(object):
+
+ def __call__(self, x):
+ return sum(x)
+
+ equiv_callables = [sum,
+ np.sum,
+ lambda x: sum(x),
+ lambda x: x.sum(),
+ partial(sum),
+ fn_class(), ]
+
+ expected = df.groupby("foo").agg(sum)
+ for ecall in equiv_callables:
+ result = df.groupby('foo').agg(ecall)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_over_numpy_arrays():
+ # GH 3788
+ df = pd.DataFrame([[1, np.array([10, 20, 30])],
+ [1, np.array([40, 50, 60])],
+ [2, np.array([20, 30, 40])]],
+ columns=['category', 'arraydata'])
+ result = df.groupby('category').agg(sum)
+
+ expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
+ expected_index = pd.Index([1, 2], name='category')
+ expected_column = ['arraydata']
+ expected = pd.DataFrame(expected_data,
+ index=expected_index,
+ columns=expected_column)
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_timezone_round_trip():
+ # GH 15426
+ ts = pd.Timestamp("2016-01-01 12:00:00", tz='US/Pacific')
+ df = pd.DataFrame({'a': 1,
+ 'b': [ts + dt.timedelta(minutes=nn)
+ for nn in range(10)]})
+
+ result1 = df.groupby('a')['b'].agg(np.min).iloc[0]
+ result2 = df.groupby('a')['b'].agg(lambda x: np.min(x)).iloc[0]
+ result3 = df.groupby('a')['b'].min().iloc[0]
+
+ assert result1 == ts
+ assert result2 == ts
+ assert result3 == ts
+
+ dates = [pd.Timestamp("2016-01-0%d 12:00:00" % i, tz='US/Pacific')
+ for i in range(1, 5)]
+ df = pd.DataFrame({'A': ['a', 'b'] * 2, 'B': dates})
+ grouped = df.groupby('A')
+
+ ts = df['B'].iloc[0]
+ assert ts == grouped.nth(0)['B'].iloc[0]
+ assert ts == grouped.head(1)['B'].iloc[0]
+ assert ts == grouped.first()['B'].iloc[0]
+ assert ts == grouped.apply(lambda x: x.iloc[0])[0]
+
+ ts = df['B'].iloc[2]
+ assert ts == grouped.last()['B'].iloc[0]
+ assert ts == grouped.apply(lambda x: x.iloc[-1])[0]
+
+
+def test_sum_uint64_overflow():
+ # see gh-14758
+ # Convert to uint64 and don't overflow
+ df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
+ df = df + 9223372036854775807
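+ # 9223372036854775807 == 2**63 - 1, so every value now exceeds int64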
+
+ index = pd.Index([9223372036854775808,
+ 9223372036854775810,
+ 9223372036854775812],
+ dtype=np.uint64)
+ expected = pd.DataFrame({1: [9223372036854775809,
+ 9223372036854775811,
+ 9223372036854775813]},
+ index=index)
+
+ expected.index.name = 0
+ result = df.groupby(0).sum()
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("structure, expected", [
+ (tuple, pd.DataFrame({'C': {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
+ (list, pd.DataFrame({'C': {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
+ (lambda x: tuple(x), pd.DataFrame({'C': {(1, 1): (1, 1, 1),
+ (3, 4): (3, 4, 4)}})),
+ (lambda x: list(x), pd.DataFrame({'C': {(1, 1): [1, 1, 1],
+ (3, 4): [3, 4, 4]}}))
+])
+def test_agg_structs_dataframe(structure, expected):
+ df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3],
+ 'B': [1, 1, 1, 4, 4, 4],
+ 'C': [1, 1, 1, 3, 4, 4]})
+
+ result = df.groupby(['A', 'B']).aggregate(structure)
+ expected.index.names = ['A', 'B']
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("structure, expected", [
+ (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name='C')),
+ (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name='C')),
+ (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)],
+ index=[1, 3], name='C')),
+ (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]],
+ index=[1, 3], name='C'))
+])
+def test_agg_structs_series(structure, expected):
+ # Issue #18079
+ df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3],
+ 'B': [1, 1, 1, 4, 4, 4],
+ 'C': [1, 1, 1, 3, 4, 4]})
+
+ result = df.groupby('A')['C'].aggregate(structure)
+ expected.index.name = 'A'
+ tm.assert_series_equal(result, expected)
+
+
+def test_agg_category_nansum(observed):
+ categories = ['a', 'b', 'c']
+ df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
+ categories=categories),
+ 'B': [1, 2, 3]})
+ result = df.groupby("A", observed=observed).B.agg(np.nansum)
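+ # np.nansum over the empty, unobserved category 'c' gives 0.0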
+ expected = pd.Series([3, 3, 0],
+ index=pd.CategoricalIndex(['a', 'b', 'c'],
+ categories=categories,
+ name='A'),
+ name='B')
+ if observed:
+ expected = expected[expected != 0]
+ tm.assert_series_equal(result, expected)
+
+
+def test_agg_list_like_func():
+ # GH 18473
+ df = pd.DataFrame({'A': [str(x) for x in range(3)],
+ 'B': [str(x) for x in range(3)]})
+ grouped = df.groupby('A', as_index=False, sort=False)
+ result = grouped.agg({'B': lambda x: list(x)})
+ expected = pd.DataFrame({'A': [str(x) for x in range(3)],
+ 'B': [[str(x)] for x in range(3)]})
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/conftest.py b/contrib/python/pandas/py2/pandas/tests/groupby/conftest.py
new file mode 100644
index 00000000000..cb4fe511651
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/conftest.py
@@ -0,0 +1,78 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, MultiIndex
+from pandas.util import testing as tm
+
+
[email protected]
+def mframe():
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
+ 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ return DataFrame(np.random.randn(10, 3), index=index,
+ columns=['A', 'B', 'C'])
+
+
[email protected]
+def df():
+ return DataFrame(
+ {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
+ 'C': np.random.randn(8),
+ 'D': np.random.randn(8)})
+
+
[email protected]
+def ts():
+ return tm.makeTimeSeries()
+
+
[email protected]
+def seriesd():
+ return tm.getSeriesData()
+
+
[email protected]
+def tsd():
+ return tm.getTimeSeriesData()
+
+
[email protected]
+def frame(seriesd):
+ return DataFrame(seriesd)
+
+
[email protected]
+def tsframe(tsd):
+ return DataFrame(tsd)
+
+
[email protected]
+def df_mixed_floats():
+ return DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'three',
+ 'two', 'two', 'one', 'three'],
+ 'C': np.random.randn(8),
+ 'D': np.array(
+ np.random.randn(8), dtype='float32')})
+
+
[email protected]
+def three_group():
+ return DataFrame({'A': ['foo', 'foo', 'foo',
+ 'foo', 'bar', 'bar',
+ 'bar', 'bar',
+ 'foo', 'foo', 'foo'],
+ 'B': ['one', 'one', 'one',
+ 'two', 'one', 'one', 'one', 'two',
+ 'two', 'two', 'one'],
+ 'C': ['dull', 'dull', 'shiny',
+ 'dull', 'dull', 'shiny', 'shiny',
+ 'dull', 'shiny', 'shiny', 'shiny'],
+ 'D': np.random.randn(11),
+ 'E': np.random.randn(11),
+ 'F': np.random.randn(11)})
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_apply.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_apply.py
new file mode 100644
index 00000000000..659d1a9cf98
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_apply.py
@@ -0,0 +1,542 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series, bdate_range, compat
+from pandas.util import testing as tm
+
+
+def test_apply_issues():
+ # GH 5788
+
+ s = """2011.05.16,00:00,1.40893
+2011.05.16,01:00,1.40760
+2011.05.16,02:00,1.40750
+2011.05.16,03:00,1.40649
+2011.05.17,02:00,1.40893
+2011.05.17,03:00,1.40760
+2011.05.17,04:00,1.40750
+2011.05.17,05:00,1.40649
+2011.05.18,02:00,1.40893
+2011.05.18,03:00,1.40760
+2011.05.18,04:00,1.40750
+2011.05.18,05:00,1.40649"""
+
+ df = pd.read_csv(
+ compat.StringIO(s), header=None, names=['date', 'time', 'value'],
+ parse_dates=[['date', 'time']])
+ df = df.set_index('date_time')
+
+ expected = df.groupby(df.index.date).idxmax()
+ result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
+ tm.assert_frame_equal(result, expected)
+
+ # GH 5789
+ # don't auto coerce dates
+ df = pd.read_csv(
+ compat.StringIO(s), header=None, names=['date', 'time', 'value'])
+ exp_idx = pd.Index(
+ ['2011.05.16', '2011.05.17', '2011.05.18'
+ ], dtype=object, name='date')
+ expected = Series(['00:00', '02:00', '02:00'], index=exp_idx)
+ result = df.groupby('date').apply(
+ lambda x: x['time'][x['value'].idxmax()])
+ tm.assert_series_equal(result, expected)
+
+
+def test_apply_trivial():
+ # GH 20066
+ # trivial apply: ignore input and return a constant dataframe.
+ df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
+ 'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
+ columns=['key', 'data'])
+ expected = pd.concat([df.iloc[1:], df.iloc[1:]],
+ axis=1, keys=['float64', 'object'])
+ result = df.groupby([str(x) for x in df.dtypes],
+ axis=1).apply(lambda x: df.iloc[1:])
+
+ tm.assert_frame_equal(result, expected)
+
+
[email protected](reason="GH#20066; function passed into apply "
+ "returns a DataFrame with the same index "
+ "as the one used to create the GroupBy object.")
+def test_apply_trivial_fail():
+ # GH 20066
+ # trivial apply fails if the constant dataframe has the same index
+ # as the one used to create the GroupBy object.
+ df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
+ 'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
+ columns=['key', 'data'])
+ expected = pd.concat([df, df],
+ axis=1, keys=['float64', 'object'])
+ result = df.groupby([str(x) for x in df.dtypes],
+ axis=1).apply(lambda x: df)
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_fast_apply():
+ # make sure that fast apply is correctly called
+ # rather than raising any kind of error
+ # otherwise the python path will be called,
+ # which slows things down
+ N = 1000
+ labels = np.random.randint(0, 2000, size=N)
+ labels2 = np.random.randint(0, 3, size=N)
+ df = DataFrame({'key': labels,
+ 'key2': labels2,
+ 'value1': np.random.randn(N),
+ 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})
+
+ def f(g):
+ return 1
+
+ g = df.groupby(['key', 'key2'])
+
+ grouper = g.grouper
+
+ splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
+ group_keys = grouper._get_group_keys()
+
+ values, mutated = splitter.fast_apply(f, group_keys)
+ assert not mutated
+
+
+def test_apply_with_mixed_dtype():
+ # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
+ df = DataFrame({'foo1': np.random.randn(6),
+ 'foo2': ['one', 'two', 'two', 'three', 'one', 'two']})
+ result = df.apply(lambda x: x, axis=1)
+ tm.assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts())
+
+ # GH 3610 incorrect dtype conversion with as_index=False
+ df = DataFrame({"c1": [1, 2, 6, 6, 8]})
+ df["c2"] = df.c1 / 2.0
+ result1 = df.groupby("c2").mean().reset_index().c2
+ result2 = df.groupby("c2", as_index=False).mean().c2
+ tm.assert_series_equal(result1, result2)
+
+
+def test_groupby_as_index_apply(df):
+ # GH #4648 and #3417
+ df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
+ 'user_id': [1, 2, 1, 1, 3, 1],
+ 'time': range(6)})
+
+ g_as = df.groupby('user_id', as_index=True)
+ g_not_as = df.groupby('user_id', as_index=False)
+
+ res_as = g_as.head(2).index
+ res_not_as = g_not_as.head(2).index
+ exp = Index([0, 1, 2, 4])
+ tm.assert_index_equal(res_as, exp)
+ tm.assert_index_equal(res_not_as, exp)
+
+ res_as_apply = g_as.apply(lambda x: x.head(2)).index
+ res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
+
+ # apply doesn't maintain the original ordering
+ # changed in GH5610: as_index=False returns a MultiIndex here
+ exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (
+ 2, 4)])
+ tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
+ exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None])
+
+ tm.assert_index_equal(res_as_apply, exp_as_apply)
+ tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)
+
+ ind = Index(list('abcde'))
+ df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
+ res = df.groupby(0, as_index=False).apply(lambda x: x).index
+ tm.assert_index_equal(res, ind)
+
+
+def test_apply_concat_preserve_names(three_group):
+ grouped = three_group.groupby(['A', 'B'])
+
+ def desc(group):
+ result = group.describe()
+ result.index.name = 'stat'
+ return result
+
+ def desc2(group):
+ result = group.describe()
+ result.index.name = 'stat'
+ result = result[:len(group)]
+ # weirdo
+ return result
+
+ def desc3(group):
+ result = group.describe()
+
+ # names are different
+ result.index.name = 'stat_%d' % len(group)
+
+ result = result[:len(group)]
+ # weirdo
+ return result
+
+ result = grouped.apply(desc)
+ assert result.index.names == ('A', 'B', 'stat')
+
+ result2 = grouped.apply(desc2)
+ assert result2.index.names == ('A', 'B', 'stat')
+
+ result3 = grouped.apply(desc3)
+ assert result3.index.names == ('A', 'B', None)
+
+
+def test_apply_series_to_frame():
+ def f(piece):
+ with np.errstate(invalid='ignore'):
+ logged = np.log(piece)
+ return DataFrame({'value': piece,
+ 'demeaned': piece - piece.mean(),
+ 'logged': logged})
+
+ dr = bdate_range('1/1/2000', periods=100)
+ ts = Series(np.random.randn(100), index=dr)
+
+ grouped = ts.groupby(lambda x: x.month)
+ result = grouped.apply(f)
+
+ assert isinstance(result, DataFrame)
+ tm.assert_index_equal(result.index, ts.index)
+
+
+def test_apply_series_yield_constant(df):
+ result = df.groupby(['A', 'B'])['C'].apply(len)
+ assert result.index.names[:2] == ('A', 'B')
+
+
+def test_apply_frame_yield_constant(df):
+ # GH13568
+ result = df.groupby(['A', 'B']).apply(len)
+ assert isinstance(result, Series)
+ assert result.name is None
+
+ result = df.groupby(['A', 'B'])[['C', 'D']].apply(len)
+ assert isinstance(result, Series)
+ assert result.name is None
+
+
+def test_apply_frame_to_series(df):
+ grouped = df.groupby(['A', 'B'])
+ result = grouped.apply(len)
+ expected = grouped.count()['C']
+ tm.assert_index_equal(result.index, expected.index)
+ tm.assert_numpy_array_equal(result.values, expected.values)
+
+
+def test_apply_frame_concat_series():
+ def trans(group):
+ return group.groupby('B')['C'].sum().sort_values()[:2]
+
+ def trans2(group):
+ grouped = group.groupby(df.reindex(group.index)['B'])
+ return grouped.sum().sort_values()[:2]
+
+ df = DataFrame({'A': np.random.randint(0, 5, 1000),
+ 'B': np.random.randint(0, 5, 1000),
+ 'C': np.random.randn(1000)})
+
+ result = df.groupby('A').apply(trans)
+ exp = df.groupby('A')['C'].apply(trans2)
+ tm.assert_series_equal(result, exp, check_names=False)
+ assert result.name == 'C'
+
+
+def test_apply_transform(ts):
+ grouped = ts.groupby(lambda x: x.month)
+ result = grouped.apply(lambda x: x * 2)
+ expected = grouped.transform(lambda x: x * 2)
+ tm.assert_series_equal(result, expected)
+
+
+def test_apply_multikey_corner(tsframe):
+ grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
+
+ def f(group):
+ return group.sort_values('A')[-5:]
+
+ result = grouped.apply(f)
+ for key, group in grouped:
+ tm.assert_frame_equal(result.loc[key], f(group))
+
+
+def test_apply_chunk_view():
+ # Low-level tinkering could be unsafe; make sure it is not
+ df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+ 'value': compat.lrange(9)})
+
+ result = df.groupby('key', group_keys=False).apply(lambda x: x[:2])
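+ # the first two rows of each of the three groups survive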
+ expected = df.take([0, 1, 3, 4, 6, 7])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_apply_no_name_column_conflict():
+ df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
+ 'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
+ 'value': compat.lrange(10)[::-1]})
+
+ # it works! #2605
+ grouped = df.groupby(['name', 'name2'])
+ grouped.apply(lambda x: x.sort_values('value', inplace=True))
+
+
+def test_apply_typecast_fail():
+ df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
+ 'c': np.tile(
+ ['a', 'b', 'c'], 2),
+ 'v': np.arange(1., 7.)})
+
+ def f(group):
+ v = group['v']
+ group['v2'] = (v - v.min()) / (v.max() - v.min())
+ return group
+
+ result = df.groupby('d').apply(f)
+
+ expected = df.copy()
+ expected['v2'] = np.tile([0., 0.5, 1], 2)
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_apply_multiindex_fail():
+ index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
+ ])
+ df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
+ 'c': np.tile(['a', 'b', 'c'], 2),
+ 'v': np.arange(1., 7.)}, index=index)
+
+ def f(group):
+ v = group['v']
+ group['v2'] = (v - v.min()) / (v.max() - v.min())
+ return group
+
+ result = df.groupby('d').apply(f)
+
+ expected = df.copy()
+ expected['v2'] = np.tile([0., 0.5, 1], 2)
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_apply_corner(tsframe):
+ result = tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2)
+ expected = tsframe * 2
+ tm.assert_frame_equal(result, expected)
+
+
+def test_apply_without_copy():
+ # GH 5545
+ # returning a non-copy in an applied function fails
+
+ data = DataFrame({'id_field': [100, 100, 200, 300],
+ 'category': ['a', 'b', 'c', 'c'],
+ 'value': [1, 2, 3, 4]})
+
+ def filt1(x):
+ if x.shape[0] == 1:
+ return x.copy()
+ else:
+ return x[x.category == 'c']
+
+ def filt2(x):
+ if x.shape[0] == 1:
+ return x
+ else:
+ return x[x.category == 'c']
+
+ expected = data.groupby('id_field').apply(filt1)
+ result = data.groupby('id_field').apply(filt2)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_apply_corner_cases():
+ # #535, can't use sliding iterator
+
+ N = 1000
+ labels = np.random.randint(0, 100, size=N)
+ df = DataFrame({'key': labels,
+ 'value1': np.random.randn(N),
+ 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})
+
+ grouped = df.groupby('key')
+
+ def f(g):
+ g['value3'] = g['value1'] * 2
+ return g
+
+ result = grouped.apply(f)
+ assert 'value3' in result
+
+
+def test_apply_numeric_coercion_when_datetime():
+ # In the past, group-by/apply operations have been over-eager
+ # in converting dtypes to numeric, in the presence of datetime
+ # columns. Various GH issues were filed, the reproductions
+ # for which are here.
+
+ # GH 15670
+ df = pd.DataFrame({'Number': [1, 2],
+ 'Date': ["2017-03-02"] * 2,
+ 'Str': ["foo", "inf"]})
+ expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+ df.Date = pd.to_datetime(df.Date)
+ result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+ tm.assert_series_equal(result['Str'], expected['Str'])
+
+ # GH 15421
+ df = pd.DataFrame({'A': [10, 20, 30],
+ 'B': ['foo', '3', '4'],
+ 'T': [pd.Timestamp("12:31:22")] * 3})
+
+ def get_B(g):
+ return g.iloc[0][['B']]
+ result = df.groupby('A').apply(get_B)['B']
+ expected = df.B
+ expected.index = df.A
+ tm.assert_series_equal(result, expected)
+
+ # GH 14423
+ def predictions(tool):
+ out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object)
+ if 'step1' in list(tool.State):
+ out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0])
+ if 'step2' in list(tool.State):
+ out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0])
+ out['useTime'] = str(
+ tool[tool.State == 'step2'].oTime.values[0])
+ return out
+ df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'],
+ 'State': ['step1', 'step2', 'step1', 'step2'],
+ 'oTime': ['', '2016-09-19 05:24:33',
+ '', '2016-09-19 23:59:04'],
+ 'Machine': ['23', '36L', '36R', '36R']})
+ df2 = df1.copy()
+ df2.oTime = pd.to_datetime(df2.oTime)
+ expected = df1.groupby('Key').apply(predictions).p1
+ result = df2.groupby('Key').apply(predictions).p1
+ tm.assert_series_equal(expected, result)
+
+
+def test_time_field_bug():
+ # Test a fix for the following error related to GH issue 11324: when
+ # non-key fields in a group-by dataframe contained time-based fields
+ # that were not returned by the apply function, an exception would be
+ # raised.
+
+ df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]})
+
+ def func_with_no_date(batch):
+ return pd.Series({'c': 2})
+
+ def func_with_date(batch):
+ return pd.Series({'b': datetime(2015, 1, 1), 'c': 2})
+
+ dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date)
+ dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1])
+ dfg_no_conversion_expected.index.name = 'a'
+
+ dfg_conversion = df.groupby(by=['a']).apply(func_with_date)
+ dfg_conversion_expected = pd.DataFrame(
+ {'b': datetime(2015, 1, 1),
+ 'c': 2}, index=[1])
+ dfg_conversion_expected.index.name = 'a'
+
+ tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
+ tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
+
+
+def test_gb_apply_list_of_unequal_len_arrays():
+
+ # GH1738
+ df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a',
+ 'b', 'b', 'b'],
+ 'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd',
+ 'd', 'd', 'e'],
+ 'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
+ 'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]})
+ df = df.set_index(['group1', 'group2'])
+ df_grouped = df.groupby(level=['group1', 'group2'], sort=True)
+
+ def noddy(value, weight):
+ out = np.array(value * weight).repeat(3)
+ return out
+
+ # the kernel function returns arrays of unequal length
+ # pandas sniffs the first one, sees it's an array and not
+ # a list, and assumes the rest are of equal length
+ # and so tries a vstack
+
+ # don't die
+ df_grouped.apply(lambda x: noddy(x.value, x.weight))
+
+
+def test_groupby_apply_all_none():
+ # Make sure there are no errors if the apply function returns all
+ # None values. Issue 9684.
+ test_df = DataFrame({'groups': [0, 0, 1, 1],
+ 'random_vars': [8, 7, 4, 5]})
+
+ def test_func(x):
+ pass
+
+ result = test_df.groupby('groups').apply(test_func)
+ expected = DataFrame()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_apply_none_first():
+ # GH 12824. Tests apply when the first group returns None.
+ test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]})
+ test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]})
+
+ def test_func(x):
+ if x.shape[0] < 2:
+ return None
+ return x.iloc[[0, -1]]
+
+ result1 = test_df1.groupby('groups').apply(test_func)
+ result2 = test_df2.groupby('groups').apply(test_func)
+ index1 = MultiIndex.from_arrays([[1, 1], [0, 2]],
+ names=['groups', None])
+ index2 = MultiIndex.from_arrays([[2, 2], [1, 3]],
+ names=['groups', None])
+ expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]},
+ index=index1)
+ expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]},
+ index=index2)
+ tm.assert_frame_equal(result1, expected1)
+ tm.assert_frame_equal(result2, expected2)
+
+
+def test_groupby_apply_return_empty_chunk():
+ # GH 22221: apply filter which returns some empty groups
+ df = pd.DataFrame(dict(value=[0, 1], group=['filled', 'empty']))
+ groups = df.groupby('group')
+ result = groups.apply(lambda group: group[group.value != 1]['value'])
+ expected = pd.Series([0], name='value',
+ index=MultiIndex.from_product([['empty', 'filled'],
+ [0]],
+ names=['group', None]
+ ).drop('empty'))
+ tm.assert_series_equal(result, expected)
+
+
+def test_apply_with_mixed_types():
+ # gh-20949
+ df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1, 2, 3], 'C': [4, 6, 5]})
+ g = df.groupby('A')
+
+ result = g.transform(lambda x: x / x.sum())
+ expected = pd.DataFrame({'B': [1 / 3., 2 / 3., 1], 'C': [0.4, 0.6, 1.0]})
+ tm.assert_frame_equal(result, expected)
+
+ result = g.apply(lambda x: x / x.sum())
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_bin_groupby.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_bin_groupby.py
new file mode 100644
index 00000000000..d7ea9bdf920
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_bin_groupby.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+from numpy import nan
+import pytest
+
+from pandas._libs import groupby, lib, reduction
+
+from pandas.core.dtypes.common import ensure_int64
+
+from pandas import Index, isna
+from pandas.core.groupby.ops import generate_bins_generic
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal
+
+
+def test_series_grouper():
+ from pandas import Series
+ obj = Series(np.random.randn(10))
+ dummy = obj[:0]
+
+ labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)
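+ # rows labelled -1 are excluded from the result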
+
+ grouper = reduction.SeriesGrouper(obj, np.mean, labels, 2, dummy)
+ result, counts = grouper.get_result()
+
+ expected = np.array([obj[3:6].mean(), obj[6:].mean()])
+ assert_almost_equal(result, expected)
+
+ exp_counts = np.array([3, 4], dtype=np.int64)
+ assert_almost_equal(counts, exp_counts)
+
+
+def test_series_bin_grouper():
+ from pandas import Series
+ obj = Series(np.random.randn(10))
+ dummy = obj[:0]
+
+ bins = np.array([3, 6])
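+ # bin edges [3, 6] split the series into [0:3), [3:6) and [6:10)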
+
+ grouper = reduction.SeriesBinGrouper(obj, np.mean, bins, dummy)
+ result, counts = grouper.get_result()
+
+ expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()])
+ assert_almost_equal(result, expected)
+
+ exp_counts = np.array([3, 3, 4], dtype=np.int64)
+ assert_almost_equal(counts, exp_counts)
+
+
+class TestBinGroupers(object):
+
+ def setup_method(self, method):
+ self.obj = np.random.randn(10, 1)
+ self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64)
+ self.bins = np.array([3, 6], dtype=np.int64)
+
+ def test_generate_bins(self):
+ values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
+ binner = np.array([0, 3, 6, 9], dtype=np.int64)
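+ # each result entry is the end position (cumulative count) of the
+ # values falling in that bin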
+
+ for func in [lib.generate_bins_dt64, generate_bins_generic]:
+ bins = func(values, binner, closed='left')
+ assert ((bins == np.array([2, 5, 6])).all())
+
+ bins = func(values, binner, closed='right')
+ assert ((bins == np.array([3, 6, 6])).all())
+
+ for func in [lib.generate_bins_dt64, generate_bins_generic]:
+ values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
+ binner = np.array([0, 3, 6], dtype=np.int64)
+
+ bins = func(values, binner, closed='right')
+ assert ((bins == np.array([3, 6])).all())
+
+ msg = "Invalid length for values or for binner"
+ with pytest.raises(ValueError, match=msg):
+ generate_bins_generic(values, [], 'right')
+ with pytest.raises(ValueError, match=msg):
+ generate_bins_generic(values[:0], binner, 'right')
+
+ msg = "Values falls before first bin"
+ with pytest.raises(ValueError, match=msg):
+ generate_bins_generic(values, [4], 'right')
+ msg = "Values falls after last bin"
+ with pytest.raises(ValueError, match=msg):
+ generate_bins_generic(values, [-3, -1], 'right')
+
+
+def test_group_ohlc():
+ def _check(dtype):
+ obj = np.array(np.random.randn(20), dtype=dtype)
+
+ bins = np.array([6, 12, 20])
+ out = np.zeros((3, 4), dtype)
+ counts = np.zeros(len(out), dtype=np.int64)
+ labels = ensure_int64(np.repeat(np.arange(3),
+ np.diff(np.r_[0, bins])))
+
+ func = getattr(groupby, 'group_ohlc_%s' % dtype)
+ func(out, counts, obj[:, None], labels)
+
+ def _ohlc(group):
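+ # reference implementation: open, high, low, close for one bin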
+ if isna(group).all():
+ return np.repeat(nan, 4)
+ return [group[0], group.max(), group.min(), group[-1]]
+
+ expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]),
+ _ohlc(obj[12:])])
+
+ assert_almost_equal(out, expected)
+ tm.assert_numpy_array_equal(counts,
+ np.array([6, 6, 8], dtype=np.int64))
+
+ obj[:6] = nan
+ func(out, counts, obj[:, None], labels)
+ expected[0] = nan
+ assert_almost_equal(out, expected)
+
+ _check('float32')
+ _check('float64')
+
+
+class TestMoments(object):
+ pass
+
+
+class TestReducer(object):
+
+ def test_int_index(self):
+ from pandas.core.series import Series
+
+ arr = np.random.randn(100, 4)
+ result = reduction.reduce(arr, np.sum, labels=Index(np.arange(4)))
+ expected = arr.sum(0)
+ assert_almost_equal(result, expected)
+
+ result = reduction.reduce(arr, np.sum, axis=1,
+ labels=Index(np.arange(100)))
+ expected = arr.sum(1)
+ assert_almost_equal(result, expected)
+
+ dummy = Series(0., index=np.arange(100))
+ result = reduction.reduce(arr, np.sum, dummy=dummy,
+ labels=Index(np.arange(4)))
+ expected = arr.sum(0)
+ assert_almost_equal(result, expected)
+
+ dummy = Series(0., index=np.arange(4))
+ result = reduction.reduce(arr, np.sum, axis=1, dummy=dummy,
+ labels=Index(np.arange(100)))
+ expected = arr.sum(1)
+ assert_almost_equal(result, expected)
+
+ result = reduction.reduce(arr, np.sum, axis=1, dummy=dummy,
+ labels=Index(np.arange(100)))
+ assert_almost_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_categorical.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_categorical.py
new file mode 100644
index 00000000000..e118135ccc7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_categorical.py
@@ -0,0 +1,936 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY37
+
+import pandas as pd
+from pandas import (
+ Categorical, CategoricalIndex, DataFrame, Index, MultiIndex, Series, qcut)
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_equal, assert_frame_equal, assert_series_equal)
+
+
+def cartesian_product_for_groupers(result, args, names):
+ """ Reindex to a cartesian production for the groupers,
+ preserving the nature (Categorical) of each grouper """
+
+ def f(a):
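+ # expand a categorical grouper to one entry per category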
+ if isinstance(a, (CategoricalIndex, Categorical)):
+ categories = a.categories
+ a = Categorical.from_codes(np.arange(len(categories)),
+ categories=categories,
+ ordered=a.ordered)
+ return a
+
+ index = pd.MultiIndex.from_product(map(f, args), names=names)
+ return result.reindex(index).sort_index()
+
+
+def test_apply_use_categorical_name(df):
+ cats = qcut(df.C, 4)
+
+ def get_stats(group):
+ return {'min': group.min(),
+ 'max': group.max(),
+ 'count': group.count(),
+ 'mean': group.mean()}
+
+ result = df.groupby(cats, observed=False).D.apply(get_stats)
+ assert result.index.names[0] == 'C'
+
+
+def test_basic():
+
+ cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+ categories=["a", "b", "c", "d"], ordered=True)
+ data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
+
+ exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
+ expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
+ result = data.groupby("b", observed=False).mean()
+ tm.assert_frame_equal(result, expected)
+
+ cat1 = Categorical(["a", "a", "b", "b"],
+ categories=["a", "b", "z"], ordered=True)
+ cat2 = Categorical(["c", "d", "c", "d"],
+ categories=["c", "d", "y"], ordered=True)
+ df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+
+ # single grouper
+ gb = df.groupby("A", observed=False)
+ exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
+ expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
+ result = gb.sum()
+ tm.assert_frame_equal(result, expected)
+
+ # GH 8623
+ x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
+ [1, 'John P. Doe']],
+ columns=['person_id', 'person_name'])
+ x['person_name'] = Categorical(x.person_name)
+
+ g = x.groupby(['person_id'], observed=False)
+ result = g.transform(lambda x: x)
+ tm.assert_frame_equal(result, x[['person_name']])
+
+ result = x.drop_duplicates('person_name')
+ expected = x.iloc[[0, 1]]
+ tm.assert_frame_equal(result, expected)
+
+ def f(x):
+ return x.drop_duplicates('person_name').iloc[0]
+
+ result = g.apply(f)
+ expected = x.iloc[[0, 1]].copy()
+ expected.index = Index([1, 2], name='person_id')
+ expected['person_name'] = expected['person_name'].astype('object')
+ tm.assert_frame_equal(result, expected)
+
+ # GH 9921
+ # Monotonic
+ df = DataFrame({"a": [5, 15, 25]})
+ c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
+
+ result = df.a.groupby(c, observed=False).transform(sum)
+ tm.assert_series_equal(result, df['a'])
+
+ tm.assert_series_equal(
+ df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
+ df['a'])
+ tm.assert_frame_equal(
+ df.groupby(c, observed=False).transform(sum),
+ df[['a']])
+ tm.assert_frame_equal(
+ df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
+ df[['a']])
+
+ # Filter
+ tm.assert_series_equal(
+ df.a.groupby(c, observed=False).filter(np.all),
+ df['a'])
+ tm.assert_frame_equal(
+ df.groupby(c, observed=False).filter(np.all),
+ df)
+
+ # Non-monotonic
+ df = DataFrame({"a": [5, 15, 25, -5]})
+ c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
+
+ result = df.a.groupby(c, observed=False).transform(sum)
+ tm.assert_series_equal(result, df['a'])
+
+ tm.assert_series_equal(
+ df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
+ df['a'])
+ tm.assert_frame_equal(
+ df.groupby(c, observed=False).transform(sum),
+ df[['a']])
+ tm.assert_frame_equal(
+ df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
+ df[['a']])
+
+ # GH 9603
+ df = DataFrame({'a': [1, 0, 0, 0]})
+ c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
+ result = df.groupby(c, observed=False).apply(len)
+
+ exp_index = CategoricalIndex(
+ c.values.categories, ordered=c.values.ordered)
+ expected = Series([1, 0, 0, 0], index=exp_index)
+ expected.index.name = 'a'
+ tm.assert_series_equal(result, expected)
+
+ # more basic
+ levels = ['foo', 'bar', 'baz', 'qux']
+ codes = np.random.randint(0, 4, size=100)
+
+ cats = Categorical.from_codes(codes, levels, ordered=True)
+
+ data = DataFrame(np.random.randn(100, 4))
+
+ result = data.groupby(cats, observed=False).mean()
+
+ expected = data.groupby(np.asarray(cats), observed=False).mean()
+ exp_idx = CategoricalIndex(levels, categories=cats.categories,
+ ordered=True)
+ expected = expected.reindex(exp_idx)
+
+ assert_frame_equal(result, expected)
+
+ grouped = data.groupby(cats, observed=False)
+ desc_result = grouped.describe()
+
+ idx = cats.codes.argsort()
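+ # sort the rows by category code so both describe() results align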
+ ord_labels = np.asarray(cats).take(idx)
+ ord_data = data.take(idx)
+
+ exp_cats = Categorical(ord_labels, ordered=True,
+ categories=['foo', 'bar', 'baz', 'qux'])
+ expected = ord_data.groupby(
+ exp_cats, sort=False, observed=False).describe()
+ assert_frame_equal(desc_result, expected)
+
+ # GH 10460
+ expc = Categorical.from_codes(np.arange(4).repeat(8),
+ levels, ordered=True)
+ exp = CategoricalIndex(expc)
+ tm.assert_index_equal((desc_result.stack().index
+ .get_level_values(0)), exp)
+ exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
+ '75%', 'max'] * 4)
+ tm.assert_index_equal((desc_result.stack().index
+ .get_level_values(1)), exp)
+
+
+def test_level_get_group(observed):
+ # GH15155
+ df = DataFrame(data=np.arange(2, 22, 2),
+ index=MultiIndex(
+ levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
+ codes=[[0] * 5 + [1] * 5, range(10)],
+ names=["Index1", "Index2"]))
+ g = df.groupby(level=["Index1"], observed=observed)
+
+ # expected should equal test.loc[["a"]]
+ # GH15166
+ expected = DataFrame(data=np.arange(2, 12, 2),
+ index=pd.MultiIndex(levels=[pd.CategoricalIndex(
+ ["a", "b"]), range(5)],
+ codes=[[0] * 5, range(5)],
+ names=["Index1", "Index2"]))
+ result = g.get_group('a')
+
+ assert_frame_equal(result, expected)
+
+
[email protected](PY37, reason="flaky on 3.7, xref gh-21636", strict=False)
[email protected]('ordered', [True, False])
+def test_apply(ordered):
+ # GH 10138
+
+ dense = Categorical(list('abc'), ordered=ordered)
+
+ # 'b' is in the categories but not in the list
+ missing = Categorical(
+ list('aaa'), categories=['a', 'b'], ordered=ordered)
+ values = np.arange(len(dense))
+ df = DataFrame({'missing': missing,
+ 'dense': dense,
+ 'values': values})
+ grouped = df.groupby(['missing', 'dense'], observed=True)
+
+ # missing category 'b' should still exist in the output index
+ idx = MultiIndex.from_arrays(
+ [missing, dense], names=['missing', 'dense'])
+ expected = DataFrame([0, 1, 2.],
+ index=idx,
+ columns=['values'])
+
+ result = grouped.apply(lambda x: np.mean(x))
+ assert_frame_equal(result, expected)
+
+ # we coerce back to ints
+ expected = expected.astype('int')
+ result = grouped.mean()
+ assert_frame_equal(result, expected)
+
+ result = grouped.agg(np.mean)
+ assert_frame_equal(result, expected)
+
+ # but for transform we should still get back the original index
+ idx = MultiIndex.from_arrays([missing, dense],
+ names=['missing', 'dense'])
+ expected = Series(1, index=idx)
+ result = grouped.apply(lambda x: 1)
+ assert_series_equal(result, expected)
+
+
+def test_observed(observed):
+ # multiple groupers, don't re-expand the output space
+ # of the grouper
+ # gh-14942 (implement)
+ # gh-10132 (back-compat)
+ # gh-8138 (back-compat)
+ # gh-8869
+
+ cat1 = Categorical(["a", "a", "b", "b"],
+ categories=["a", "b", "z"], ordered=True)
+ cat2 = Categorical(["c", "d", "c", "d"],
+ categories=["c", "d", "y"], ordered=True)
+ df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+ df['C'] = ['foo', 'bar'] * 2
+
+ # multiple groupers with a non-cat
+ gb = df.groupby(['A', 'B', 'C'], observed=observed)
+ exp_index = pd.MultiIndex.from_arrays(
+ [cat1, cat2, ['foo', 'bar'] * 2],
+ names=['A', 'B', 'C'])
+ expected = DataFrame({'values': Series(
+ [1, 2, 3, 4], index=exp_index)}).sort_index()
+ result = gb.sum()
+ if not observed:
+ expected = cartesian_product_for_groupers(
+ expected,
+ [cat1, cat2, ['foo', 'bar']],
+ list('ABC'))
+
+ tm.assert_frame_equal(result, expected)
+
+ gb = df.groupby(['A', 'B'], observed=observed)
+ exp_index = pd.MultiIndex.from_arrays(
+ [cat1, cat2],
+ names=['A', 'B'])
+ expected = DataFrame({'values': [1, 2, 3, 4]},
+ index=exp_index)
+ result = gb.sum()
+ if not observed:
+ expected = cartesian_product_for_groupers(
+ expected,
+ [cat1, cat2],
+ list('AB'))
+
+ tm.assert_frame_equal(result, expected)
+
+ # https://github.com/pandas-dev/pandas/issues/8138
+ d = {'cat':
+ pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
+ ordered=True),
+ 'ints': [1, 1, 2, 2],
+ 'val': [10, 20, 30, 40]}
+ df = pd.DataFrame(d)
+
+ # Grouping on a single column
+ groups_single_key = df.groupby("cat", observed=observed)
+ result = groups_single_key.mean()
+
+ exp_index = pd.CategoricalIndex(list('ab'), name="cat",
+ categories=list('abc'),
+ ordered=True)
+ expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]},
+ index=exp_index)
+ if not observed:
+ index = pd.CategoricalIndex(list('abc'), name="cat",
+ categories=list('abc'),
+ ordered=True)
+ expected = expected.reindex(index)
+
+ tm.assert_frame_equal(result, expected)
+
+ # Grouping on two columns
+ groups_double_key = df.groupby(["cat", "ints"], observed=observed)
+ result = groups_double_key.agg('mean')
+ expected = DataFrame(
+ {"val": [10, 30, 20, 40],
+ "cat": pd.Categorical(['a', 'a', 'b', 'b'],
+ categories=['a', 'b', 'c'],
+ ordered=True),
+ "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"])
+ if not observed:
+ expected = cartesian_product_for_groupers(
+ expected,
+ [df.cat.values, [1, 2]],
+ ['cat', 'ints'])
+
+ tm.assert_frame_equal(result, expected)
+
+ # GH 10132
+ for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
+ c, i = key
+ result = groups_double_key.get_group(key)
+ expected = df[(df.cat == c) & (df.ints == i)]
+ assert_frame_equal(result, expected)
+
+ # gh-8869
+ # with as_index
+ d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70],
+ 'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']}
+ df = pd.DataFrame(d)
+ cat = pd.cut(df['foo'], np.linspace(0, 10, 3))
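+ # np.linspace(0, 10, 3) gives edges [0, 5, 10], i.e. two bins
+ # (0, 5] and (5, 10]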
+ df['range'] = cat
+ groups = df.groupby(['range', 'baz'], as_index=False, observed=observed)
+ result = groups.agg('mean')
+
+ groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed)
+ expected = groups2.agg('mean').reset_index()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_observed_codes_remap(observed):
+ d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
+ df = pd.DataFrame(d)
+ values = pd.cut(df['C1'], [1, 2, 3, 6])
+ values.name = "cat"
+ groups_double_key = df.groupby([values, 'C2'], observed=observed)
+
+ idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]],
+ names=["cat", "C2"])
+ expected = DataFrame({"C1": [3, 3, 4, 5],
+ "C3": [10, 100, 200, 34]}, index=idx)
+ if not observed:
+ expected = cartesian_product_for_groupers(
+ expected,
+ [values.values, [1, 2, 3, 4]],
+ ['cat', 'C2'])
+
+ result = groups_double_key.agg('mean')
+ tm.assert_frame_equal(result, expected)
+
+
+def test_observed_perf():
+ # we create a cartesian product, so this is
+ # non-performant if we don't use observed values
+ # gh-14942
+ df = DataFrame({
+ 'cat': np.random.randint(0, 255, size=30000),
+ 'int_id': np.random.randint(0, 255, size=30000),
+ 'other_id': np.random.randint(0, 10000, size=30000),
+ 'foo': 0})
+ df['cat'] = df.cat.astype(str).astype('category')
+
+ grouped = df.groupby(['cat', 'int_id', 'other_id'], observed=True)
+ result = grouped.count()
+ assert result.index.levels[0].nunique() == df.cat.nunique()
+ assert result.index.levels[1].nunique() == df.int_id.nunique()
+ assert result.index.levels[2].nunique() == df.other_id.nunique()
+
+
+def test_observed_groups(observed):
+ # gh-20583
+ # test that we have the appropriate groups
+
+ cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c'])
+ df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]})
+ g = df.groupby('cat', observed=observed)
+
+ result = g.groups
+ if observed:
+ expected = {'a': Index([0, 2], dtype='int64'),
+ 'c': Index([1], dtype='int64')}
+ else:
+ expected = {'a': Index([0, 2], dtype='int64'),
+ 'b': Index([], dtype='int64'),
+ 'c': Index([1], dtype='int64')}
+
+ tm.assert_dict_equal(result, expected)
+
+
+def test_observed_groups_with_nan(observed):
+ # GH 24740
+ df = pd.DataFrame({'cat': pd.Categorical(['a', np.nan, 'a'],
+ categories=['a', 'b', 'd']),
+ 'vals': [1, 2, 3]})
+ g = df.groupby('cat', observed=observed)
+ result = g.groups
+ if observed:
+ expected = {'a': Index([0, 2], dtype='int64')}
+ else:
+ expected = {'a': Index([0, 2], dtype='int64'),
+ 'b': Index([], dtype='int64'),
+ 'd': Index([], dtype='int64')}
+ tm.assert_dict_equal(result, expected)
+
+
+def test_dataframe_categorical_with_nan(observed):
+ # GH 21151
+ s1 = pd.Categorical([np.nan, 'a', np.nan, 'a'],
+ categories=['a', 'b', 'c'])
+ s2 = pd.Series([1, 2, 3, 4])
+ df = pd.DataFrame({'s1': s1, 's2': s2})
+ result = df.groupby('s1', observed=observed).first().reset_index()
+ if observed:
+ expected = DataFrame({'s1': pd.Categorical(['a'],
+ categories=['a', 'b', 'c']), 's2': [2]})
+ else:
+ expected = DataFrame({'s1': pd.Categorical(['a', 'b', 'c'],
+ categories=['a', 'b', 'c']),
+ 's2': [2, np.nan, np.nan]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_datetime():
+ # GH9049: ensure backward compatibility
+ levels = pd.date_range('2014-01-01', periods=4)
+ codes = np.random.randint(0, 4, size=100)
+
+ cats = Categorical.from_codes(codes, levels, ordered=True)
+
+ data = DataFrame(np.random.randn(100, 4))
+ result = data.groupby(cats, observed=False).mean()
+
+ expected = data.groupby(np.asarray(cats), observed=False).mean()
+ expected = expected.reindex(levels)
+ expected.index = CategoricalIndex(expected.index,
+ categories=expected.index,
+ ordered=True)
+
+ assert_frame_equal(result, expected)
+
+ grouped = data.groupby(cats, observed=False)
+ desc_result = grouped.describe()
+
+ idx = cats.codes.argsort()
+ ord_labels = cats.take_nd(idx)
+ ord_data = data.take(idx)
+ expected = ord_data.groupby(ord_labels, observed=False).describe()
+ assert_frame_equal(desc_result, expected)
+ tm.assert_index_equal(desc_result.index, expected.index)
+ tm.assert_index_equal(
+ desc_result.index.get_level_values(0),
+ expected.index.get_level_values(0))
+
+ # GH 10460
+ expc = Categorical.from_codes(
+ np.arange(4).repeat(8), levels, ordered=True)
+ exp = CategoricalIndex(expc)
+ tm.assert_index_equal((desc_result.stack().index
+ .get_level_values(0)), exp)
+ exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
+ '75%', 'max'] * 4)
+ tm.assert_index_equal((desc_result.stack().index
+ .get_level_values(1)), exp)
+
+
+def test_categorical_index():
+
+ s = np.random.RandomState(12345)
+ levels = ['foo', 'bar', 'baz', 'qux']
+ codes = s.randint(0, 4, size=20)
+ cats = Categorical.from_codes(codes, levels, ordered=True)
+ df = DataFrame(
+ np.repeat(
+ np.arange(20), 4).reshape(-1, 4), columns=list('abcd'))
+ df['cats'] = cats
+
+ # with a cat index
+ result = df.set_index('cats').groupby(level=0, observed=False).sum()
+ expected = df[list('abcd')].groupby(cats.codes, observed=False).sum()
+ expected.index = CategoricalIndex(
+ Categorical.from_codes(
+ [0, 1, 2, 3], levels, ordered=True), name='cats')
+ assert_frame_equal(result, expected)
+
+ # with a cat column, should produce a cat index
+ result = df.groupby('cats', observed=False).sum()
+ expected = df[list('abcd')].groupby(cats.codes, observed=False).sum()
+ expected.index = CategoricalIndex(
+ Categorical.from_codes(
+ [0, 1, 2, 3], levels, ordered=True), name='cats')
+ assert_frame_equal(result, expected)
+
+
+def test_describe_categorical_columns():
+ # GH 11558
+ cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'],
+ categories=['foo', 'bar', 'baz', 'qux'],
+ ordered=True)
+ df = DataFrame(np.random.randn(20, 4), columns=cats)
+ result = df.groupby([1, 2, 3, 4] * 5).describe()
+
+ tm.assert_index_equal(result.stack().columns, cats)
+ tm.assert_categorical_equal(result.stack().columns.values, cats.values)
+
+
+def test_unstack_categorical():
+ # GH11558 (example is taken from the original issue)
+ df = pd.DataFrame({'a': range(10),
+ 'medium': ['A', 'B'] * 5,
+ 'artist': list('XYXXY') * 2})
+ df['medium'] = df['medium'].astype('category')
+
+ gcat = df.groupby(
+ ['artist', 'medium'], observed=False)['a'].count().unstack()
+ result = gcat.describe()
+
+ exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False,
+ name='medium')
+ tm.assert_index_equal(result.columns, exp_columns)
+ tm.assert_categorical_equal(result.columns.values, exp_columns.values)
+
+ result = gcat['A'] + gcat['B']
+ expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist'))
+ tm.assert_series_equal(result, expected)
+
+
+def test_bins_unequal_len():
+ # GH3011
+ series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
+ bins = pd.cut(series.dropna().values, 4)
+
+ # len(bins) != len(series) here
+ with pytest.raises(ValueError):
+ series.groupby(bins).mean()
+
+
+def test_as_index():
+ # GH13204
+ df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
+ 'A': [10, 11, 11],
+ 'B': [101, 102, 103]})
+ result = df.groupby(['cat', 'A'], as_index=False, observed=True).sum()
+ expected = DataFrame(
+ {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
+ 'A': [10, 11],
+ 'B': [101, 205]},
+ columns=['cat', 'A', 'B'])
+ tm.assert_frame_equal(result, expected)
+
+ # function grouper
+ f = lambda r: df.loc[r, 'A']
+ result = df.groupby(['cat', f], as_index=False, observed=True).sum()
+ expected = DataFrame(
+ {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
+ 'A': [10, 22],
+ 'B': [101, 205]},
+ columns=['cat', 'A', 'B'])
+ tm.assert_frame_equal(result, expected)
+
+ # another not in-axis grouper (conflicting names in index)
+ s = Series(['a', 'b', 'b'], name='cat')
+ result = df.groupby(['cat', s], as_index=False, observed=True).sum()
+ tm.assert_frame_equal(result, expected)
+
+ # is original index dropped?
+ group_columns = ['cat', 'A']
+ expected = DataFrame(
+ {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
+ 'A': [10, 11],
+ 'B': [101, 205]},
+ columns=['cat', 'A', 'B'])
+
+ for name in [None, 'X', 'B']:
+ df.index = Index(list("abc"), name=name)
+ result = df.groupby(group_columns, as_index=False, observed=True).sum()
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_preserve_categories():
+ # GH-13179
+ categories = list('abc')
+
+ # ordered=True
+ df = DataFrame({'A': pd.Categorical(list('ba'),
+ categories=categories,
+ ordered=True)})
+ index = pd.CategoricalIndex(categories, categories, ordered=True)
+ tm.assert_index_equal(
+ df.groupby('A', sort=True, observed=False).first().index, index)
+ tm.assert_index_equal(
+ df.groupby('A', sort=False, observed=False).first().index, index)
+
+ # ordered=False
+ df = DataFrame({'A': pd.Categorical(list('ba'),
+ categories=categories,
+ ordered=False)})
+ sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
+ nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
+ ordered=False)
+ tm.assert_index_equal(
+ df.groupby('A', sort=True, observed=False).first().index,
+ sort_index)
+ tm.assert_index_equal(
+ df.groupby('A', sort=False, observed=False).first().index,
+ nosort_index)
+
+
+def test_preserve_categorical_dtype():
+ # GH13743, GH13854
+ df = DataFrame({'A': [1, 2, 1, 1, 2],
+ 'B': [10, 16, 22, 28, 34],
+ 'C1': Categorical(list("abaab"),
+ categories=list("bac"),
+ ordered=False),
+ 'C2': Categorical(list("abaab"),
+ categories=list("bac"),
+ ordered=True)})
+ # single grouper
+ exp_full = DataFrame({'A': [2.0, 1.0, np.nan],
+ 'B': [25.0, 20.0, np.nan],
+ 'C1': Categorical(list("bac"),
+ categories=list("bac"),
+ ordered=False),
+ 'C2': Categorical(list("bac"),
+ categories=list("bac"),
+ ordered=True)})
+ for col in ['C1', 'C2']:
+ result1 = df.groupby(by=col, as_index=False, observed=False).mean()
+ result2 = df.groupby(
+ by=col, as_index=True, observed=False).mean().reset_index()
+ expected = exp_full.reindex(columns=result1.columns)
+ tm.assert_frame_equal(result1, expected)
+ tm.assert_frame_equal(result2, expected)
+
+
+def test_categorical_no_compress():
+ data = Series(np.random.randn(9))
+
+ codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
+ cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)
+
+ result = data.groupby(cats, observed=False).mean()
+ exp = data.groupby(codes, observed=False).mean()
+
+ exp.index = CategoricalIndex(exp.index, categories=cats.categories,
+ ordered=cats.ordered)
+ assert_series_equal(result, exp)
+
+ codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
+ cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)
+
+ result = data.groupby(cats, observed=False).mean()
+ exp = data.groupby(codes, observed=False).mean().reindex(cats.categories)
+ exp.index = CategoricalIndex(exp.index, categories=cats.categories,
+ ordered=cats.ordered)
+ assert_series_equal(result, exp)
+
+ cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+ categories=["a", "b", "c", "d"], ordered=True)
+ data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
+
+ result = data.groupby("b", observed=False).mean()
+ result = result["a"].values
+ exp = np.array([1, 2, 4, np.nan])
+ tm.assert_numpy_array_equal(result, exp)
+
+
+def test_sort():
+
+ # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby # noqa: flake8
+ # This should result in a properly sorted Series so that the plot
+ # has a sorted x axis
+ # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
+
+ df = DataFrame({'value': np.random.randint(0, 10000, 100)})
+ labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
+ cat_labels = Categorical(labels, labels)
+
+ df = df.sort_values(by=['value'], ascending=True)
+ df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
+ right=False, labels=cat_labels)
+
+ res = df.groupby(['value_group'], observed=False)['value_group'].count()
+ exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
+ exp.index = CategoricalIndex(exp.index, name=exp.index.name)
+ tm.assert_series_equal(res, exp)
+
+
+def test_sort2():
+ # GH 8868: DataFrame groupby sort was being ignored
+ df = DataFrame([['(7.5, 10]', 10, 10],
+ ['(7.5, 10]', 8, 20],
+ ['(2.5, 5]', 5, 30],
+ ['(5, 7.5]', 6, 40],
+ ['(2.5, 5]', 4, 50],
+ ['(0, 2.5]', 1, 60],
+ ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar'])
+ df['range'] = Categorical(df['range'], ordered=True)
+ index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+ '(7.5, 10]'], name='range', ordered=True)
+ expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+ columns=['foo', 'bar'], index=index)
+
+ col = 'range'
+ result_sort = df.groupby(col, sort=True, observed=False).first()
+ assert_frame_equal(result_sort, expected_sort)
+
+ # when the categorical is ordered, groups follow the category order
+ expected_sort = result_sort
+ result_sort = df.groupby(col, sort=False, observed=False).first()
+ assert_frame_equal(result_sort, expected_sort)
+
+ df['range'] = Categorical(df['range'], ordered=False)
+ index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+ '(7.5, 10]'], name='range')
+ expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+ columns=['foo', 'bar'], index=index)
+
+ index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
+ '(0, 2.5]'],
+ categories=['(7.5, 10]', '(2.5, 5]',
+ '(5, 7.5]', '(0, 2.5]'],
+ name='range')
+ expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+ index=index, columns=['foo', 'bar'])
+
+ col = 'range'
+
+ # this is an unordered categorical, but we allow this
+ result_sort = df.groupby(col, sort=True, observed=False).first()
+ assert_frame_equal(result_sort, expected_sort)
+
+ result_nosort = df.groupby(col, sort=False, observed=False).first()
+ assert_frame_equal(result_nosort, expected_nosort)
+
+
+def test_sort_datetimelike():
+ # GH10505
+
+ # use the same data as test_groupby_sort_categorical, where the
+ # categories correspond to datetime.month
+ df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
+ datetime(2011, 2, 1), datetime(2011, 5, 1),
+ datetime(2011, 2, 1), datetime(2011, 1, 1),
+ datetime(2011, 5, 1)],
+ 'foo': [10, 8, 5, 6, 4, 1, 7],
+ 'bar': [10, 20, 30, 40, 50, 60, 70]},
+ columns=['dt', 'foo', 'bar'])
+
+ # ordered=True
+ df['dt'] = Categorical(df['dt'], ordered=True)
+ index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
+ datetime(2011, 5, 1), datetime(2011, 7, 1)]
+ result_sort = DataFrame(
+ [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
+ result_sort.index = CategoricalIndex(index, name='dt', ordered=True)
+
+ index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
+ datetime(2011, 5, 1), datetime(2011, 1, 1)]
+ result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+ columns=['foo', 'bar'])
+ result_nosort.index = CategoricalIndex(index, categories=index,
+ name='dt', ordered=True)
+
+ col = 'dt'
+ assert_frame_equal(
+ result_sort, df.groupby(col, sort=True, observed=False).first())
+
+ # when the categorical is ordered, groups follow the category order
+ assert_frame_equal(
+ result_sort, df.groupby(col, sort=False, observed=False).first())
+
+ # ordered = False
+ df['dt'] = Categorical(df['dt'], ordered=False)
+ index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
+ datetime(2011, 5, 1), datetime(2011, 7, 1)]
+ result_sort = DataFrame(
+ [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
+ result_sort.index = CategoricalIndex(index, name='dt')
+
+ index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
+ datetime(2011, 5, 1), datetime(2011, 1, 1)]
+ result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+ columns=['foo', 'bar'])
+ result_nosort.index = CategoricalIndex(index, categories=index,
+ name='dt')
+
+ col = 'dt'
+ assert_frame_equal(
+ result_sort, df.groupby(col, sort=True, observed=False).first())
+ assert_frame_equal(
+ result_nosort, df.groupby(col, sort=False, observed=False).first())
+
+
+def test_empty_sum():
+ # https://github.com/pandas-dev/pandas/issues/18678
+ df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
+ categories=['a', 'b', 'c']),
+ 'B': [1, 2, 1]})
+ expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
+
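+ # min_count is the minimum number of valid values a group needs
+ # for a non-NA result; the unobserved category 'c' has zero valid
+ # values, so it becomes NaN once min_count >= 1.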
+ # 0 by default
+ result = df.groupby("A", observed=False).B.sum()
+ expected = pd.Series([3, 1, 0], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+ # min_count=0
+ result = df.groupby("A", observed=False).B.sum(min_count=0)
+ expected = pd.Series([3, 1, 0], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+ # min_count=1
+ result = df.groupby("A", observed=False).B.sum(min_count=1)
+ expected = pd.Series([3, 1, np.nan], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+ # min_count>1
+ result = df.groupby("A", observed=False).B.sum(min_count=2)
+ expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+
+def test_empty_prod():
+ # https://github.com/pandas-dev/pandas/issues/18678
+ df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
+ categories=['a', 'b', 'c']),
+ 'B': [1, 2, 1]})
+
+ expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
+
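+ # prod's identity is 1, so with the default min_count=0 the
+ # unobserved category yields 1 rather than 0.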
+ # 1 by default
+ result = df.groupby("A", observed=False).B.prod()
+ expected = pd.Series([2, 1, 1], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+ # min_count=0
+ result = df.groupby("A", observed=False).B.prod(min_count=0)
+ expected = pd.Series([2, 1, 1], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+ # min_count=1
+ result = df.groupby("A", observed=False).B.prod(min_count=1)
+ expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+
+def test_groupby_multiindex_categorical_datetime():
+ # https://github.com/pandas-dev/pandas/issues/21390
+
+ df = pd.DataFrame({
+ 'key1': pd.Categorical(list('abcbabcba')),
+ 'key2': pd.Categorical(
+ list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3),
+ 'values': np.arange(9),
+ })
+ result = df.groupby(['key1', 'key2']).mean()
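+ # grouping on two categoricals produces the full cartesian product
+ # of their categories; the unobserved ('c', 00:01) pair becomes NaN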
+
+ idx = pd.MultiIndex.from_product(
+ [pd.Categorical(['a', 'b', 'c']),
+ pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))],
+ names=['key1', 'key2'])
+ expected = pd.DataFrame(
+ {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
+ assert_frame_equal(result, expected)
+
+
[email protected]("as_index, expected", [
+ (True, pd.Series(
+ index=pd.MultiIndex.from_arrays(
+ [pd.Series([1, 1, 2], dtype='category'),
+ [1, 2, 2]], names=['a', 'b']
+ ),
+ data=[1, 2, 3], name='x'
+ )),
+ (False, pd.DataFrame({
+ 'a': pd.Series([1, 1, 2], dtype='category'),
+ 'b': [1, 2, 2],
+ 'x': [1, 2, 3]
+ }))
+])
+def test_groupby_agg_observed_true_single_column(as_index, expected):
+ # GH-23970
+ df = pd.DataFrame({
+ 'a': pd.Series([1, 1, 2], dtype='category'),
+ 'b': [1, 2, 2],
+ 'x': [1, 2, 3]
+ })
+
+ result = df.groupby(
+ ['a', 'b'], as_index=as_index, observed=True)['x'].sum()
+
+ assert_equal(result, expected)
+
+
[email protected]('fill_value', [None, np.nan, pd.NaT])
+def test_shift(fill_value):
+ ct = pd.Categorical(['a', 'b', 'c', 'd'],
+ categories=['a', 'b', 'c', 'd'], ordered=False)
+ expected = pd.Categorical([None, 'a', 'b', 'c'],
+ categories=['a', 'b', 'c', 'd'], ordered=False)
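+ # None, np.nan and pd.NaT are all treated as "missing" by
+ # Categorical, so every fill_value yields the same shifted result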
+ res = ct.shift(1, fill_value=fill_value)
+ assert_equal(res, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_counting.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_counting.py
new file mode 100644
index 00000000000..1438de5b7e3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_counting.py
@@ -0,0 +1,224 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+
+import numpy as np
+import pytest
+
+from pandas.compat import product as cart_product, range
+
+from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestCounting(object):
+
+ def test_cumcount(self):
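+ # cumcount numbers the rows within each group in order of
+ # appearance: the 'a' rows get 0, 1, 2, 3 and the 'b' row gets 0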
+ df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
+ g = df.groupby('A')
+ sg = g.A
+
+ expected = Series([0, 1, 2, 0, 3])
+
+ assert_series_equal(expected, g.cumcount())
+ assert_series_equal(expected, sg.cumcount())
+
+ def test_cumcount_empty(self):
+ ge = DataFrame().groupby(level=0)
+ se = Series().groupby(level=0)
+
+ # edge case, as this is usually considered float
+ e = Series(dtype='int64')
+
+ assert_series_equal(e, ge.cumcount())
+ assert_series_equal(e, se.cumcount())
+
+ def test_cumcount_dupe_index(self):
+ df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
+ index=[0] * 5)
+ g = df.groupby('A')
+ sg = g.A
+
+ expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
+
+ assert_series_equal(expected, g.cumcount())
+ assert_series_equal(expected, sg.cumcount())
+
+ def test_cumcount_mi(self):
+ mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
+ df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
+ index=mi)
+ g = df.groupby('A')
+ sg = g.A
+
+ expected = Series([0, 1, 2, 0, 3], index=mi)
+
+ assert_series_equal(expected, g.cumcount())
+ assert_series_equal(expected, sg.cumcount())
+
+ def test_cumcount_groupby_not_col(self):
+ df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
+ index=[0] * 5)
+ g = df.groupby([0, 0, 0, 1, 0])
+ sg = g.A
+
+ expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
+
+ assert_series_equal(expected, g.cumcount())
+ assert_series_equal(expected, sg.cumcount())
+
+ def test_ngroup(self):
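+ # ngroup gives each row the integer id of its group; with the
+ # default sort=True, groups are numbered in sorted key order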
+ df = DataFrame({'A': list('aaaba')})
+ g = df.groupby('A')
+ sg = g.A
+
+ expected = Series([0, 0, 0, 1, 0])
+
+ assert_series_equal(expected, g.ngroup())
+ assert_series_equal(expected, sg.ngroup())
+
+ def test_ngroup_distinct(self):
+ df = DataFrame({'A': list('abcde')})
+ g = df.groupby('A')
+ sg = g.A
+
+ expected = Series(range(5), dtype='int64')
+
+ assert_series_equal(expected, g.ngroup())
+ assert_series_equal(expected, sg.ngroup())
+
+ def test_ngroup_one_group(self):
+ df = DataFrame({'A': [0] * 5})
+ g = df.groupby('A')
+ sg = g.A
+
+ expected = Series([0] * 5)
+
+ assert_series_equal(expected, g.ngroup())
+ assert_series_equal(expected, sg.ngroup())
+
+ def test_ngroup_empty(self):
+ ge = DataFrame().groupby(level=0)
+ se = Series().groupby(level=0)
+
+ # edge case, as this is usually considered float
+ e = Series(dtype='int64')
+
+ assert_series_equal(e, ge.ngroup())
+ assert_series_equal(e, se.ngroup())
+
+ def test_ngroup_series_matches_frame(self):
+ df = DataFrame({'A': list('aaaba')})
+ s = Series(list('aaaba'))
+
+ assert_series_equal(df.groupby(s).ngroup(),
+ s.groupby(s).ngroup())
+
+ def test_ngroup_dupe_index(self):
+ df = DataFrame({'A': list('aaaba')}, index=[0] * 5)
+ g = df.groupby('A')
+ sg = g.A
+
+ expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
+
+ assert_series_equal(expected, g.ngroup())
+ assert_series_equal(expected, sg.ngroup())
+
+ def test_ngroup_mi(self):
+ mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
+ df = DataFrame({'A': list('aaaba')}, index=mi)
+ g = df.groupby('A')
+ sg = g.A
+ expected = Series([0, 0, 0, 1, 0], index=mi)
+
+ assert_series_equal(expected, g.ngroup())
+ assert_series_equal(expected, sg.ngroup())
+
+ def test_ngroup_groupby_not_col(self):
+ df = DataFrame({'A': list('aaaba')}, index=[0] * 5)
+ g = df.groupby([0, 0, 0, 1, 0])
+ sg = g.A
+
+ expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
+
+ assert_series_equal(expected, g.ngroup())
+ assert_series_equal(expected, sg.ngroup())
+
+ def test_ngroup_descending(self):
+ df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A'])
+ g = df.groupby(['A'])
+
+ ascending = Series([0, 0, 1, 0, 1])
+ descending = Series([1, 1, 0, 1, 0])
+
+ assert_series_equal(descending, (g.ngroups - 1) - ascending)
+ assert_series_equal(ascending, g.ngroup(ascending=True))
+ assert_series_equal(descending, g.ngroup(ascending=False))
+
+ def test_ngroup_matches_cumcount(self):
+ # verify one manually-worked out case works
+ df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'],
+ ['a', 'x'], ['b', 'y']], columns=['A', 'X'])
+ g = df.groupby(['A', 'X'])
+ g_ngroup = g.ngroup()
+ g_cumcount = g.cumcount()
+ expected_ngroup = Series([0, 1, 2, 0, 3])
+ expected_cumcount = Series([0, 0, 0, 1, 0])
+
+ assert_series_equal(g_ngroup, expected_ngroup)
+ assert_series_equal(g_cumcount, expected_cumcount)
+
+ def test_ngroup_cumcount_pair(self):
+ # brute force comparison for all small series
+ for p in cart_product(range(3), repeat=4):
+ df = DataFrame({'a': p})
+ g = df.groupby(['a'])
+
+ order = sorted(set(p))
+ ngroupd = [order.index(val) for val in p]
+ cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
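+ # expected ngroup: position of each value among the sorted
+ # distinct values; expected cumcount: prior occurrences of it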
+
+ assert_series_equal(g.ngroup(), Series(ngroupd))
+ assert_series_equal(g.cumcount(), Series(cumcounted))
+
+ def test_ngroup_respects_groupby_order(self):
+ np.random.seed(0)
+ df = DataFrame({'a': np.random.choice(list('abcdef'), 100)})
+ for sort_flag in (False, True):
+ g = df.groupby(['a'], sort=sort_flag)
+ df['group_id'] = -1
+ df['group_index'] = -1
+
+ for i, (_, group) in enumerate(g):
+ df.loc[group.index, 'group_id'] = i
+ for j, ind in enumerate(group.index):
+ df.loc[ind, 'group_index'] = j
+
+ assert_series_equal(Series(df['group_id'].values),
+ g.ngroup())
+ assert_series_equal(Series(df['group_index'].values),
+ g.cumcount())
+
+ @pytest.mark.parametrize('datetimelike', [
+ [Timestamp('2016-05-%02d 20:09:25+00:00' % i) for i in range(1, 4)],
+ [Timestamp('2016-05-%02d 20:09:25' % i) for i in range(1, 4)],
+ [Timedelta(x, unit="h") for x in range(1, 4)],
+ [Period(freq="2W", year=2017, month=x) for x in range(1, 4)]])
+ def test_count_with_datetimelike(self, datetimelike):
+ # test for #13393, where DataFrameGroupBy.count() fails
+ # when counting a datetimelike column.
+
+ df = DataFrame({'x': ['a', 'a', 'b'], 'y': datetimelike})
+ res = df.groupby('x').count()
+ expected = DataFrame({'y': [2, 1]}, index=['a', 'b'])
+ expected.index.name = "x"
+ assert_frame_equal(expected, res)
+
+ def test_count_with_only_nans_in_first_group(self):
+ # GH21956
+ df = DataFrame({'A': [np.nan, np.nan], 'B': ['a', 'b'], 'C': [1, 2]})
+ result = df.groupby(['A', 'B']).C.count()
+ mi = MultiIndex(levels=[[], ['a', 'b']],
+ codes=[[], []],
+ names=['A', 'B'])
+ expected = Series([], index=mi, dtype=np.int64, name='C')
+ assert_series_equal(result, expected, check_index_type=False)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_filters.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_filters.py
new file mode 100644
index 00000000000..4d386db735b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_filters.py
@@ -0,0 +1,588 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Series, Timestamp
+import pandas.util.testing as tm
+
+
+def test_filter_series():
+ s = pd.Series([1, 3, 20, 5, 22, 24, 7])
+ expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6])
+ expected_even = pd.Series([20, 22, 24], index=[2, 4, 5])
+ grouper = s.apply(lambda x: x % 2)
+ grouped = s.groupby(grouper)
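+ # filter keeps whole groups for which the predicate is True;
+ # dropna=False preserves the original shape and masks the rows
+ # of dropped groups with NaN instead of removing them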
+ tm.assert_series_equal(
+ grouped.filter(lambda x: x.mean() < 10), expected_odd)
+ tm.assert_series_equal(
+ grouped.filter(lambda x: x.mean() > 10), expected_even)
+ # Test dropna=False.
+ tm.assert_series_equal(
+ grouped.filter(lambda x: x.mean() < 10, dropna=False),
+ expected_odd.reindex(s.index))
+ tm.assert_series_equal(
+ grouped.filter(lambda x: x.mean() > 10, dropna=False),
+ expected_even.reindex(s.index))
+
+
+def test_filter_single_column_df():
+ df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7])
+ expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
+ expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5])
+ grouper = df[0].apply(lambda x: x % 2)
+ grouped = df.groupby(grouper)
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x.mean() < 10), expected_odd)
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x.mean() > 10), expected_even)
+ # Test dropna=False.
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x.mean() < 10, dropna=False),
+ expected_odd.reindex(df.index))
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x.mean() > 10, dropna=False),
+ expected_even.reindex(df.index))
+
+
+def test_filter_multi_column_df():
+ df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]})
+ grouper = df['A'].apply(lambda x: x % 2)
+ grouped = df.groupby(grouper)
+ expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2])
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10),
+ expected)
+
+
+def test_filter_mixed_df():
+ df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
+ grouper = df['A'].apply(lambda x: x % 2)
+ grouped = df.groupby(grouper)
+ expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2])
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x['A'].sum() > 10), expected)
+
+
+def test_filter_out_all_groups():
+ s = pd.Series([1, 3, 20, 5, 22, 24, 7])
+ grouper = s.apply(lambda x: x % 2)
+ grouped = s.groupby(grouper)
+ tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])
+ df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
+ grouper = df['A'].apply(lambda x: x % 2)
+ grouped = df.groupby(grouper)
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x['A'].sum() > 1000), df.loc[[]])
+
+
+def test_filter_out_no_groups():
+ s = pd.Series([1, 3, 20, 5, 22, 24, 7])
+ grouper = s.apply(lambda x: x % 2)
+ grouped = s.groupby(grouper)
+ filtered = grouped.filter(lambda x: x.mean() > 0)
+ tm.assert_series_equal(filtered, s)
+ df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
+ grouper = df['A'].apply(lambda x: x % 2)
+ grouped = df.groupby(grouper)
+ filtered = grouped.filter(lambda x: x['A'].mean() > 0)
+ tm.assert_frame_equal(filtered, df)
+
+
+def test_filter_out_all_groups_in_df():
+ # GH12768
+ df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]})
+ res = df.groupby('a')
+ res = res.filter(lambda x: x['b'].sum() > 5, dropna=False)
+ expected = pd.DataFrame({'a': [np.nan] * 3, 'b': [np.nan] * 3})
+ tm.assert_frame_equal(expected, res)
+
+ df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]})
+ res = df.groupby('a')
+ res = res.filter(lambda x: x['b'].sum() > 5, dropna=True)
+ expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64")
+ tm.assert_frame_equal(expected, res)
+
+
+def test_filter_condition_raises():
+ def raise_if_sum_is_zero(x):
+ if x.sum() == 0:
+ raise ValueError
+ else:
+ return x.sum() > 0
+
+ s = pd.Series([-1, 0, 1, 2])
+ grouper = s.apply(lambda x: x % 2)
+ grouped = s.groupby(grouper)
+ msg = "the filter must return a boolean result"
+ with pytest.raises(TypeError, match=msg):
+ grouped.filter(raise_if_sum_is_zero)
+
+
+def test_filter_with_axis_in_groupby():
+ # issue 11041
+ index = pd.MultiIndex.from_product([range(10), [0, 1]])
+ data = pd.DataFrame(
+ np.arange(100).reshape(-1, 20), columns=index, dtype='int64')
+ result = data.groupby(level=0,
+ axis=1).filter(lambda x: x.iloc[0, 0] > 10)
+ expected = data.iloc[:, 12:20]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_filter_bad_shapes():
+ df = DataFrame({'A': np.arange(8),
+ 'B': list('aabbbbcc'),
+ 'C': np.arange(8)})
+ s = df['B']
+ g_df = df.groupby('B')
+ g_s = s.groupby(s)
+
+ f = lambda x: x
+ msg = "filter function returned a DataFrame, but expected a scalar bool"
+ with pytest.raises(TypeError, match=msg):
+ g_df.filter(f)
+ msg = "the filter must return a boolean result"
+ with pytest.raises(TypeError, match=msg):
+ g_s.filter(f)
+
+ f = lambda x: x == 1
+ msg = "filter function returned a DataFrame, but expected a scalar bool"
+ with pytest.raises(TypeError, match=msg):
+ g_df.filter(f)
+ msg = "the filter must return a boolean result"
+ with pytest.raises(TypeError, match=msg):
+ g_s.filter(f)
+
+ f = lambda x: np.outer(x, x)
+ msg = "can't multiply sequence by non-int of type 'str'"
+ with pytest.raises(TypeError, match=msg):
+ g_df.filter(f)
+ msg = "the filter must return a boolean result"
+ with pytest.raises(TypeError, match=msg):
+ g_s.filter(f)
+
+
+def test_filter_nan_is_false():
+ df = DataFrame({'A': np.arange(8),
+ 'B': list('aabbbbcc'),
+ 'C': np.arange(8)})
+ s = df['B']
+ g_df = df.groupby(df['B'])
+ g_s = s.groupby(s)
+
+ f = lambda x: np.nan
+ tm.assert_frame_equal(g_df.filter(f), df.loc[[]])
+ tm.assert_series_equal(g_s.filter(f), s[[]])
+
+
+def test_filter_against_workaround():
+ np.random.seed(0)
+ # Series of ints
+ s = Series(np.random.randint(0, 100, 1000))
+ grouper = s.apply(lambda x: np.round(x, -1))
+ grouped = s.groupby(grouper)
+ f = lambda x: x.mean() > 10
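+ # the pre-.filter() workaround: broadcast the predicate over the
+ # rows with transform and index into s with the boolean mask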
+
+ old_way = s[grouped.transform(f).astype('bool')]
+ new_way = grouped.filter(f)
+ tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
+
+ # Series of floats
+ s = 100 * Series(np.random.random(1000))
+ grouper = s.apply(lambda x: np.round(x, -1))
+ grouped = s.groupby(grouper)
+ f = lambda x: x.mean() > 10
+ old_way = s[grouped.transform(f).astype('bool')]
+ new_way = grouped.filter(f)
+ tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
+
+ # Set up DataFrame of ints, floats, strings.
+ from string import ascii_lowercase
+ letters = np.array(list(ascii_lowercase))
+ N = 1000
+ random_letters = letters.take(np.random.randint(0, 26, N))
+ df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
+ 'floats': N / 10 * Series(np.random.random(N)),
+ 'letters': Series(random_letters)})
+
+ # Group by ints; filter on floats.
+ grouped = df.groupby('ints')
+ old_way = df[grouped.floats.
+ transform(lambda x: x.mean() > N / 20).astype('bool')]
+ new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
+ tm.assert_frame_equal(new_way, old_way)
+
+ # Group by floats (rounded); filter on strings.
+ grouper = df.floats.apply(lambda x: np.round(x, -1))
+ grouped = df.groupby(grouper)
+ old_way = df[grouped.letters.
+ transform(lambda x: len(x) < N / 10).astype('bool')]
+ new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
+ tm.assert_frame_equal(new_way, old_way)
+
+ # Group by strings; filter on ints.
+ grouped = df.groupby('letters')
+ old_way = df[grouped.ints.
+ transform(lambda x: x.mean() > N / 20).astype('bool')]
+ new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
+ tm.assert_frame_equal(new_way, old_way)
+
+
+def test_filter_using_len():
+ # BUG GH4447
+ df = DataFrame({'A': np.arange(8),
+ 'B': list('aabbbbcc'),
+ 'C': np.arange(8)})
+ grouped = df.groupby('B')
+ actual = grouped.filter(lambda x: len(x) > 2)
+ expected = DataFrame(
+ {'A': np.arange(2, 6),
+ 'B': list('bbbb'),
+ 'C': np.arange(2, 6)}, index=np.arange(2, 6))
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped.filter(lambda x: len(x) > 4)
+ expected = df.loc[[]]
+ tm.assert_frame_equal(actual, expected)
+
+ # Series have always worked properly, but we'll test anyway.
+ s = df['B']
+ grouped = s.groupby(s)
+ actual = grouped.filter(lambda x: len(x) > 2)
+ expected = Series(4 * ['b'], index=np.arange(2, 6), name='B')
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped.filter(lambda x: len(x) > 4)
+ expected = s[[]]
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_maintains_ordering():
+ # Simple case: index is sequential. #4621
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]})
+ s = df['pid']
+ grouped = df.groupby('tag')
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = df.iloc[[1, 2, 4, 7]]
+ tm.assert_frame_equal(actual, expected)
+
+ grouped = s.groupby(df['tag'])
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = s.iloc[[1, 2, 4, 7]]
+ tm.assert_series_equal(actual, expected)
+
+ # Now index is sequentially decreasing.
+ df.index = np.arange(len(df) - 1, -1, -1)
+ s = df['pid']
+ grouped = df.groupby('tag')
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = df.iloc[[1, 2, 4, 7]]
+ tm.assert_frame_equal(actual, expected)
+
+ grouped = s.groupby(df['tag'])
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = s.iloc[[1, 2, 4, 7]]
+ tm.assert_series_equal(actual, expected)
+
+ # Index is shuffled.
+ SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
+ df.index = df.index[SHUFFLED]
+ s = df['pid']
+ grouped = df.groupby('tag')
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = df.iloc[[1, 2, 4, 7]]
+ tm.assert_frame_equal(actual, expected)
+
+ grouped = s.groupby(df['tag'])
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = s.iloc[[1, 2, 4, 7]]
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_multiple_timestamp():
+ # GH 10114
+ df = DataFrame({'A': np.arange(5, dtype='int64'),
+ 'B': ['foo', 'bar', 'foo', 'bar', 'bar'],
+ 'C': Timestamp('20130101')})
+
+ grouped = df.groupby(['B', 'C'])
+
+ result = grouped['A'].filter(lambda x: True)
+ tm.assert_series_equal(df['A'], result)
+
+ result = grouped['A'].transform(len)
+ expected = Series([2, 3, 2, 3, 3], name='A')
+ tm.assert_series_equal(result, expected)
+
+ result = grouped.filter(lambda x: True)
+ tm.assert_frame_equal(df, result)
+
+ result = grouped.transform('sum')
+ expected = DataFrame({'A': [2, 8, 2, 8, 8]})
+ tm.assert_frame_equal(result, expected)
+
+ result = grouped.transform(len)
+ expected = DataFrame({'A': [2, 3, 2, 3, 3]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_filter_and_transform_with_non_unique_int_index():
+ # GH4620
+ index = [1, 1, 1, 2, 1, 1, 0, 1]
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
+ grouped_df = df.groupby('tag')
+ ser = df['pid']
+ grouped_ser = ser.groupby(df['tag'])
+ expected_indexes = [1, 2, 4, 7]
+
+ # Filter DataFrame
+ actual = grouped_df.filter(lambda x: len(x) > 1)
+ expected = df.iloc[expected_indexes]
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+ expected = df.copy()
+ expected.iloc[[0, 3, 5, 6]] = np.nan
+ tm.assert_frame_equal(actual, expected)
+
+ # Filter Series
+ actual = grouped_ser.filter(lambda x: len(x) > 1)
+ expected = ser.take(expected_indexes)
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+ NA = np.nan
+ expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
+ # ^ made manually because this can get confusing!
+ tm.assert_series_equal(actual, expected)
+
+ # Transform Series
+ actual = grouped_ser.transform(len)
+ expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
+ tm.assert_series_equal(actual, expected)
+
+ # Transform (a column from) DataFrameGroupBy
+ actual = grouped_df.pid.transform(len)
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_multiple_non_unique_int_index():
+ # GH4620
+ index = [1, 1, 1, 2, 0, 0, 0, 1]
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
+ grouped_df = df.groupby('tag')
+ ser = df['pid']
+ grouped_ser = ser.groupby(df['tag'])
+ expected_indexes = [1, 2, 4, 7]
+
+ # Filter DataFrame
+ actual = grouped_df.filter(lambda x: len(x) > 1)
+ expected = df.iloc[expected_indexes]
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+ expected = df.copy()
+ expected.iloc[[0, 3, 5, 6]] = np.nan
+ tm.assert_frame_equal(actual, expected)
+
+ # Filter Series
+ actual = grouped_ser.filter(lambda x: len(x) > 1)
+ expected = ser.take(expected_indexes)
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+ NA = np.nan
+ expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
+ # ^ made manually because this can get confusing!
+ tm.assert_series_equal(actual, expected)
+
+ # Transform Series
+ actual = grouped_ser.transform(len)
+ expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
+ tm.assert_series_equal(actual, expected)
+
+ # Transform (a column from) DataFrameGroupBy
+ actual = grouped_df.pid.transform(len)
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_non_unique_float_index():
+ # GH4620
+ index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
+ grouped_df = df.groupby('tag')
+ ser = df['pid']
+ grouped_ser = ser.groupby(df['tag'])
+ expected_indexes = [1, 2, 4, 7]
+
+ # Filter DataFrame
+ actual = grouped_df.filter(lambda x: len(x) > 1)
+ expected = df.iloc[expected_indexes]
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+ expected = df.copy()
+ expected.iloc[[0, 3, 5, 6]] = np.nan
+ tm.assert_frame_equal(actual, expected)
+
+ # Filter Series
+ actual = grouped_ser.filter(lambda x: len(x) > 1)
+ expected = ser.take(expected_indexes)
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+ NA = np.nan
+ expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
+ # ^ made manually because this can get confusing!
+ tm.assert_series_equal(actual, expected)
+
+ # Transform Series
+ actual = grouped_ser.transform(len)
+ expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
+ tm.assert_series_equal(actual, expected)
+
+ # Transform (a column from) DataFrameGroupBy
+ actual = grouped_df.pid.transform(len)
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_non_unique_timestamp_index():
+ # GH4620
+ t0 = Timestamp('2013-09-30 00:05:00')
+ t1 = Timestamp('2013-10-30 00:05:00')
+ t2 = Timestamp('2013-11-30 00:05:00')
+ index = [t1, t1, t1, t2, t1, t1, t0, t1]
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
+ grouped_df = df.groupby('tag')
+ ser = df['pid']
+ grouped_ser = ser.groupby(df['tag'])
+ expected_indexes = [1, 2, 4, 7]
+
+ # Filter DataFrame
+ actual = grouped_df.filter(lambda x: len(x) > 1)
+ expected = df.iloc[expected_indexes]
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+ expected = df.copy()
+ expected.iloc[[0, 3, 5, 6]] = np.nan
+ tm.assert_frame_equal(actual, expected)
+
+ # Filter Series
+ actual = grouped_ser.filter(lambda x: len(x) > 1)
+ expected = ser.take(expected_indexes)
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+ NA = np.nan
+ expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
+ # ^ made manually because this can get confusing!
+ tm.assert_series_equal(actual, expected)
+
+ # Transform Series
+ actual = grouped_ser.transform(len)
+ expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
+ tm.assert_series_equal(actual, expected)
+
+ # Transform (a column from) DataFrameGroupBy
+ actual = grouped_df.pid.transform(len)
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_non_unique_string_index():
+ # GH4620
+ index = list('bbbcbbab')
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
+ grouped_df = df.groupby('tag')
+ ser = df['pid']
+ grouped_ser = ser.groupby(df['tag'])
+ expected_indexes = [1, 2, 4, 7]
+
+ # Filter DataFrame
+ actual = grouped_df.filter(lambda x: len(x) > 1)
+ expected = df.iloc[expected_indexes]
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+ expected = df.copy()
+ expected.iloc[[0, 3, 5, 6]] = np.nan
+ tm.assert_frame_equal(actual, expected)
+
+ # Filter Series
+ actual = grouped_ser.filter(lambda x: len(x) > 1)
+ expected = ser.take(expected_indexes)
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+ NA = np.nan
+ expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
+ # ^ made manually because this can get confusing!
+ tm.assert_series_equal(actual, expected)
+
+ # Transform Series
+ actual = grouped_ser.transform(len)
+ expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
+ tm.assert_series_equal(actual, expected)
+
+ # Transform (a column from) DataFrameGroupBy
+ actual = grouped_df.pid.transform(len)
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_has_access_to_grouped_cols():
+ df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B'])
+ g = df.groupby('A')
+ # previously didn't have access to col A #????
+ filt = g.filter(lambda x: x['A'].sum() == 2)
+ tm.assert_frame_equal(filt, df.iloc[[0, 1]])
+
+
+def test_filter_enforces_scalarness():
+ df = pd.DataFrame([
+ ['best', 'a', 'x'],
+ ['worst', 'b', 'y'],
+ ['best', 'c', 'x'],
+ ['best', 'd', 'y'],
+ ['worst', 'd', 'y'],
+ ['worst', 'd', 'y'],
+ ['best', 'd', 'z'],
+ ], columns=['a', 'b', 'c'])
+ with pytest.raises(TypeError, match='filter function returned a.*'):
+ df.groupby('c').filter(lambda g: g['a'] == 'best')
+
+
+def test_filter_non_bool_raises():
+ df = pd.DataFrame([
+ ['best', 'a', 1],
+ ['worst', 'b', 1],
+ ['best', 'c', 1],
+ ['best', 'd', 1],
+ ['worst', 'd', 1],
+ ['worst', 'd', 1],
+ ['best', 'd', 1],
+ ], columns=['a', 'b', 'c'])
+ with pytest.raises(TypeError, match='filter function returned a.*'):
+ df.groupby('a').filter(lambda g: g.c.mean())
+
+
+def test_filter_dropna_with_empty_groups():
+ # GH 10780
+ data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3))
+ grouped = data.groupby(level=0)
+ result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
+ expected_false = pd.Series([np.nan] * 9,
+ index=np.repeat([1, 2, 3], 3))
+ tm.assert_series_equal(result_false, expected_false)
+
+ result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
+ expected_true = pd.Series(index=pd.Index([], dtype=int))
+ tm.assert_series_equal(result_true, expected_true)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_function.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_function.py
new file mode 100644
index 00000000000..a884a37840f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_function.py
@@ -0,0 +1,1143 @@
+from string import ascii_lowercase
+
+import numpy as np
+import pytest
+
+from pandas.compat import product as cart_product
+from pandas.errors import UnsupportedFunctionCall
+
+import pandas as pd
+from pandas import (
+ DataFrame, Index, MultiIndex, Series, Timestamp, compat, date_range, isna)
+import pandas.core.nanops as nanops
+from pandas.util import testing as tm
+
+
[email protected]("agg_func", ['any', 'all'])
[email protected]("skipna", [True, False])
[email protected]("vals", [
+ ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''],
+ [1, 2, 3], [1, 0, 0], [0, 0, 0],
+ [1., 2., 3.], [1., 0., 0.], [0., 0., 0.],
+ [True, True, True], [True, False, False], [False, False, False],
+ [np.nan, np.nan, np.nan]
+])
+def test_groupby_bool_aggs(agg_func, skipna, vals):
+ df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2})
+
+ # Figure out expectation using Python builtin
+ exp = getattr(compat.builtins, agg_func)(vals)
+
+ # edge case for missing data with skipna and 'any'
+ if skipna and all(isna(vals)) and agg_func == 'any':
+ exp = False
+
+ exp_df = DataFrame([exp] * 2, columns=['val'], index=Index(
+ ['a', 'b'], name='key'))
+ result = getattr(df.groupby('key'), agg_func)(skipna=skipna)
+ tm.assert_frame_equal(result, exp_df)
+
+
+def test_max_min_non_numeric():
+ # #2700
+ aa = DataFrame({'nn': [11, 11, 22, 22],
+ 'ii': [1, 2, 3, 4],
+ 'ss': 4 * ['mama']})
+
+ result = aa.groupby('nn').max()
+ assert 'ss' in result
+
+ result = aa.groupby('nn').max(numeric_only=False)
+ assert 'ss' in result
+
+ result = aa.groupby('nn').min()
+ assert 'ss' in result
+
+ result = aa.groupby('nn').min(numeric_only=False)
+ assert 'ss' in result
+
+
+def test_intercept_builtin_sum():
+ s = Series([1., 2., np.nan, 3.])
+ grouped = s.groupby([0, 1, 2, 2])
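+ # the Python builtin sum is intercepted and dispatched to the
+ # groupby sum implementation, so NaN is skipped exactly as in
+ # grouped.sum() rather than propagated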
+
+ result = grouped.agg(compat.builtins.sum)
+ result2 = grouped.apply(compat.builtins.sum)
+ expected = grouped.sum()
+ tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result2, expected)
+
+
[email protected]("f", [max, min, sum])
[email protected]("keys", [
+ "jim", # Single key
+ ["jim", "joe"] # Multi-key
+])
+def test_builtins_apply(keys, f):
+ # see gh-8155
+ df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)),
+ columns=["jim", "joe"])
+ df["jolie"] = np.random.randn(1000)
+
+ fname = f.__name__
+ result = df.groupby(keys).apply(f)
+ ngroups = len(df.drop_duplicates(subset=keys))
+
+ assert_msg = ("invalid frame shape: {} "
+ "(expected ({}, 3))".format(result.shape, ngroups))
+ assert result.shape == (ngroups, 3), assert_msg
+
+ tm.assert_frame_equal(result, # numpy's equivalent function
+ df.groupby(keys).apply(getattr(np, fname)))
+
+ if f != sum:
+ expected = df.groupby(keys).agg(fname).reset_index()
+ expected.set_index(keys, inplace=True, drop=False)
+ tm.assert_frame_equal(result, expected, check_dtype=False)
+
+ tm.assert_series_equal(getattr(result, fname)(),
+ getattr(df, fname)())
+
+
+def test_arg_passthru():
+ # make sure that we are passing thru kwargs
+ # to our agg functions
+
+ # GH3668
+ # GH5724
+ df = pd.DataFrame(
+ {'group': [1, 1, 2],
+ 'int': [1, 2, 3],
+ 'float': [4., 5., 6.],
+ 'string': list('abc'),
+ 'category_string': pd.Series(list('abc')).astype('category'),
+ 'category_int': [7, 8, 9],
+ 'datetime': pd.date_range('20130101', periods=3),
+ 'datetimetz': pd.date_range('20130101',
+ periods=3,
+ tz='US/Eastern'),
+ 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')},
+ columns=['group', 'int', 'float', 'string',
+ 'category_string', 'category_int',
+ 'datetime', 'datetimetz',
+ 'timedelta'])
+
+ expected_columns_numeric = Index(['int', 'float', 'category_int'])
+
+ # mean / median
+ expected = pd.DataFrame(
+ {'category_int': [7.5, 9],
+ 'float': [4.5, 6.],
+ 'timedelta': [pd.Timedelta('1.5s'),
+ pd.Timedelta('3s')],
+ 'int': [1.5, 3],
+ 'datetime': [pd.Timestamp('2013-01-01 12:00:00'),
+ pd.Timestamp('2013-01-03 00:00:00')],
+ 'datetimetz': [
+ pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'),
+ pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]},
+ index=Index([1, 2], name='group'),
+ columns=['int', 'float', 'category_int',
+ 'datetime', 'datetimetz', 'timedelta'])
+ for attr in ['mean', 'median']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+ result = f(numeric_only=False)
+ tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+ # TODO: min, max *should* handle
+ # categorical (ordered) dtype
+ expected_columns = Index(['int', 'float', 'string',
+ 'category_int',
+ 'datetime', 'datetimetz',
+ 'timedelta'])
+ for attr in ['min', 'max']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ expected_columns = Index(['int', 'float', 'string',
+ 'category_string', 'category_int',
+ 'datetime', 'datetimetz',
+ 'timedelta'])
+ for attr in ['first', 'last']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ expected_columns = Index(['int', 'float', 'string',
+ 'category_int', 'timedelta'])
+ for attr in ['sum']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ expected_columns = Index(['int', 'float', 'category_int'])
+ for attr in ['prod', 'cumprod']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ # like min, max, but don't include strings
+ expected_columns = Index(['int', 'float',
+ 'category_int',
+ 'datetime', 'datetimetz',
+ 'timedelta'])
+ for attr in ['cummin', 'cummax']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ # GH 15561: numeric_only=False set by default like min/max
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ expected_columns = Index(['int', 'float', 'category_int',
+ 'timedelta'])
+ for attr in ['cumsum']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+
+def test_non_cython_api():
+
+ # GH5610
+ # non-cython calls should not include the grouper
+
+ df = DataFrame(
+ [[1, 2, 'foo'],
+ [1, np.nan, 'bar'],
+ [3, np.nan, 'baz']],
+ columns=['A', 'B', 'C'])
+ g = df.groupby('A')
+ gni = df.groupby('A', as_index=False)
+
+ # mad
+ expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3])
+ expected.index.name = 'A'
+ result = g.mad()
+ tm.assert_frame_equal(result, expected)
+
+ expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'],
+ index=[0, 1])
+ result = gni.mad()
+ tm.assert_frame_equal(result, expected)
+
+ # describe
+ expected_index = pd.Index([1, 3], name='A')
+ expected_col = pd.MultiIndex(levels=[['B'],
+ ['count', 'mean', 'std', 'min',
+ '25%', '50%', '75%', 'max']],
+ codes=[[0] * 8, list(range(8))])
+ expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
+ [0.0, np.nan, np.nan, np.nan, np.nan, np.nan,
+ np.nan, np.nan]],
+ index=expected_index,
+ columns=expected_col)
+ result = g.describe()
+ tm.assert_frame_equal(result, expected)
+
+ expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
+ df[df.A == 3].describe().unstack().to_frame().T])
+ expected.index = pd.Index([0, 1])
+ result = gni.describe()
+ tm.assert_frame_equal(result, expected)
+
+ # any
+ expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'],
+ index=[1, 3])
+ expected.index.name = 'A'
+ result = g.any()
+ tm.assert_frame_equal(result, expected)
+
+ # idxmax
+ expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3])
+ expected.index.name = 'A'
+ result = g.idxmax()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_cython_api2():
+
+ # this takes the fast apply path
+
+ # cumsum (GH5614)
+ df = DataFrame(
+ [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]
+ ], columns=['A', 'B', 'C'])
+ expected = DataFrame(
+ [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C'])
+ result = df.groupby('A').cumsum()
+ tm.assert_frame_equal(result, expected)
+
+ # GH 5755 - cumsum is a transformer and should ignore as_index
+ result = df.groupby('A', as_index=False).cumsum()
+ tm.assert_frame_equal(result, expected)
+
+ # GH 13994
+ result = df.groupby('A').cumsum(axis=1)
+ expected = df.cumsum(axis=1)
+ tm.assert_frame_equal(result, expected)
+ result = df.groupby('A').cumprod(axis=1)
+ expected = df.cumprod(axis=1)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_cython_median():
+ df = DataFrame(np.random.randn(1000))
+ df.values[::2] = np.nan
+
+ labels = np.random.randint(0, 50, size=1000).astype(float)
+ labels[::17] = np.nan
+
+ result = df.groupby(labels).median()
+ exp = df.groupby(labels).agg(nanops.nanmedian)
+ tm.assert_frame_equal(result, exp)
+
+ df = DataFrame(np.random.randn(1000, 5))
+ rs = df.groupby(labels).agg(np.median)
+ xp = df.groupby(labels).median()
+ tm.assert_frame_equal(rs, xp)
+
+
+def test_median_empty_bins(observed):
+ df = pd.DataFrame(np.random.randint(0, 44, 500))
+
+ grps = range(0, 55, 5)
+ bins = pd.cut(df[0], grps)
+
+ result = df.groupby(bins, observed=observed).median()
+ expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("dtype", [
+ 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'])
[email protected]("method,data", [
+ ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+ ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+ ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+ ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+ ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
+ 'args': [1]}),
+ ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
+ 'out_type': 'int64'})
+])
+def test_groupby_non_arithmetic_agg_types(dtype, method, data):
+ # GH9311, GH6620
+ df = pd.DataFrame(
+ [{'a': 1, 'b': 1},
+ {'a': 1, 'b': 2},
+ {'a': 2, 'b': 3},
+ {'a': 2, 'b': 4}])
+
+ df['b'] = df.b.astype(dtype)
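+ # these aggregations should preserve the input dtype (GH9311);
+ # count is the exception and always returns int64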
+
+ if 'args' not in data:
+ data['args'] = []
+
+ if 'out_type' in data:
+ out_type = data['out_type']
+ else:
+ out_type = dtype
+
+ exp = data['df']
+ df_out = pd.DataFrame(exp)
+
+ df_out['b'] = df_out.b.astype(out_type)
+ df_out.set_index('a', inplace=True)
+
+ grpd = df.groupby('a')
+ t = getattr(grpd, method)(*data['args'])
+ tm.assert_frame_equal(t, df_out)
+
+
+ (Timestamp("2011-01-15 12:50:28.502376"),
+ Timestamp("2011-01-20 12:50:28.593448")),
+ (24650000000000001, 24650000000000002)
+])
+def test_groupby_non_arithmetic_agg_int_like_precision(i):
+ # see gh-6620, gh-9311
+ df = pd.DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}])
+
+ grp_exp = {"first": {"expected": i[0]},
+ "last": {"expected": i[1]},
+ "min": {"expected": i[0]},
+ "max": {"expected": i[1]},
+ "nth": {"expected": i[1],
+ "args": [1]},
+ "count": {"expected": 2}}
+
+ for method, data in compat.iteritems(grp_exp):
+ if "args" not in data:
+ data["args"] = []
+
+ grouped = df.groupby("a")
+ res = getattr(grouped, method)(*data["args"])
+
+ assert res.iloc[0].b == data["expected"]
+
+
+def test_fill_consistency():
+
+ # GH9221
+ # pass thru keyword arguments to the generated wrapper
+ # are set if the passed kw is None (only)
+ df = DataFrame(index=pd.MultiIndex.from_product(
+ [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]),
+ columns=Index(
+ ['1', '2'], name='id'))
+ df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan,
+ np.nan, 22, np.nan]
+ df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan,
+ np.nan, 44, np.nan]
+
+ expected = df.groupby(level=0, axis=0).fillna(method='ffill')
+ result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T
+ tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_cumprod():
+ # GH 4095
+ df = pd.DataFrame({'key': ['b'] * 10, 'value': 2})
+
+ actual = df.groupby('key')['value'].cumprod()
+ expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
+ expected.name = 'value'
+ tm.assert_series_equal(actual, expected)
+
+ df = pd.DataFrame({'key': ['b'] * 100, 'value': 2})
+ actual = df.groupby('key')['value'].cumprod()
+ # if overflows, groupby product casts to float
+ # while numpy passes back invalid values
+ df['value'] = df['value'].astype(float)
+ expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
+ expected.name = 'value'
+ tm.assert_series_equal(actual, expected)
+
+
+def test_ops_general():
+ ops = [('mean', np.mean),
+ ('median', np.median),
+ ('std', np.std),
+ ('var', np.var),
+ ('sum', np.sum),
+ ('prod', np.prod),
+ ('min', np.min),
+ ('max', np.max),
+ ('first', lambda x: x.iloc[0]),
+ ('last', lambda x: x.iloc[-1]),
+ ('count', np.size), ]
+ try:
+ from scipy.stats import sem
+ except ImportError:
+ pass
+ else:
+ ops.append(('sem', sem))
+ df = DataFrame(np.random.randn(1000))
+ labels = np.random.randint(0, 50, size=1000).astype(float)
+
+ for op, targop in ops:
+ result = getattr(df.groupby(labels), op)().astype(float)
+ expected = df.groupby(labels).agg(targop)
+ try:
+ tm.assert_frame_equal(result, expected)
+ except BaseException as exc:
+ exc.args += ('operation: %s' % op, )
+ raise
+
+
+def test_max_nan_bug():
+ raw = """,Date,app,File
+-04-23,2013-04-23 00:00:00,,log080001.log
+-05-06,2013-05-06 00:00:00,,log.log
+-05-07,2013-05-07 00:00:00,OE,xlsx"""
+
+ df = pd.read_csv(compat.StringIO(raw), parse_dates=[0])
+ gb = df.groupby('Date')
+ r = gb[['File']].max()
+ e = gb['File'].max().to_frame()
+ tm.assert_frame_equal(r, e)
+ assert not r['File'].isna().any()
+
+
+def test_nlargest():
+ a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
+ b = Series(list('a' * 5 + 'b' * 5))
+ gb = a.groupby(b)
+ r = gb.nlargest(3)
+ e = Series([
+ 7, 5, 3, 10, 9, 6
+ ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]]))
+ tm.assert_series_equal(r, e)
+
+ a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
+ gb = a.groupby(b)
+ e = Series([
+ 3, 2, 1, 3, 3, 2
+ ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]]))
+ tm.assert_series_equal(gb.nlargest(3, keep='last'), e)
+
+
+def test_nsmallest():
+ a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
+ b = Series(list('a' * 5 + 'b' * 5))
+ gb = a.groupby(b)
+ r = gb.nsmallest(3)
+ e = Series([
+ 1, 2, 3, 0, 4, 6
+ ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]]))
+ tm.assert_series_equal(r, e)
+
+ a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
+ gb = a.groupby(b)
+ e = Series([
+ 0, 1, 1, 0, 1, 2
+ ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]]))
+ tm.assert_series_equal(gb.nsmallest(3, keep='last'), e)
+
+
[email protected]("func", [
+ 'mean', 'var', 'std', 'cumprod', 'cumsum'
+])
+def test_numpy_compat(func):
+ # see gh-12811
+ df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]})
+ g = df.groupby('A')
+
+ msg = "numpy operations are not valid with groupby"
+
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(g, func)(1, 2, 3)
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(g, func)(foo=1)
+
+
+def test_cummin_cummax():
+ # GH 15048
+ num_types = [np.int32, np.int64, np.float32, np.float64]
+ num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min,
+ np.finfo(np.float32).min, np.finfo(np.float64).min]
+ num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max,
+ np.finfo(np.float32).max, np.finfo(np.float64).max]
+ base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2],
+ 'B': [3, 4, 3, 2, 2, 3, 2, 1]})
+ expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
+ expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
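+ # cummin/cummax are computed independently within each group of
+ # 'A', restarting at the start of each new group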
+
+ for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
+ df = base_df.astype(dtype)
+
+ # cummin
+ expected = pd.DataFrame({'B': expected_mins}).astype(dtype)
+ result = df.groupby('A').cummin()
+ tm.assert_frame_equal(result, expected)
+ result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+ tm.assert_frame_equal(result, expected)
+
+ # Test cummin w/ min value for dtype
+ df.loc[[2, 6], 'B'] = min_val
+ expected.loc[[2, 3, 6, 7], 'B'] = min_val
+ result = df.groupby('A').cummin()
+ tm.assert_frame_equal(result, expected)
+ expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+ tm.assert_frame_equal(result, expected)
+
+ # cummax
+ expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
+ result = df.groupby('A').cummax()
+ tm.assert_frame_equal(result, expected)
+ result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+ tm.assert_frame_equal(result, expected)
+
+ # Test cummax w/ max value for dtype
+ df.loc[[2, 6], 'B'] = max_val
+ expected.loc[[2, 3, 6, 7], 'B'] = max_val
+ result = df.groupby('A').cummax()
+ tm.assert_frame_equal(result, expected)
+ expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+ tm.assert_frame_equal(result, expected)
+
+ # Test nan in some values
+ base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
+ expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2,
+ np.nan, 3, np.nan, 1]})
+ result = base_df.groupby('A').cummin()
+ tm.assert_frame_equal(result, expected)
+ expected = (base_df.groupby('A')
+ .B
+ .apply(lambda x: x.cummin())
+ .to_frame())
+ tm.assert_frame_equal(result, expected)
+
+ expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4,
+ np.nan, 3, np.nan, 3]})
+ result = base_df.groupby('A').cummax()
+ tm.assert_frame_equal(result, expected)
+ expected = (base_df.groupby('A')
+ .B
+ .apply(lambda x: x.cummax())
+ .to_frame())
+ tm.assert_frame_equal(result, expected)
+
+ # Test nan in entire column
+ base_df['B'] = np.nan
+ expected = pd.DataFrame({'B': [np.nan] * 8})
+ result = base_df.groupby('A').cummin()
+ tm.assert_frame_equal(expected, result)
+ result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+ tm.assert_frame_equal(expected, result)
+ result = base_df.groupby('A').cummax()
+ tm.assert_frame_equal(expected, result)
+ result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+ tm.assert_frame_equal(expected, result)
+
+ # GH 15561
+ df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001'])))
+ expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b')
+ for method in ['cummax', 'cummin']:
+ result = getattr(df.groupby('a')['b'], method)()
+ tm.assert_series_equal(expected, result)
+
+ # GH 15635
+ df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
+ result = df.groupby('a').b.cummax()
+ expected = pd.Series([2, 1, 2], name='b')
+ tm.assert_series_equal(result, expected)
+
+ df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
+ result = df.groupby('a').b.cummin()
+ expected = pd.Series([1, 2, 1], name='b')
+ tm.assert_series_equal(result, expected)
+
+
[email protected]('in_vals, out_vals', [
+
+ # Basics: strictly increasing (T), strictly decreasing (F),
+ # abs val increasing (F), non-strictly increasing (T)
+ ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1],
+ [True, False, False, True]),
+
+ # Test with inf vals
+ ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
+ [True, False, True, False]),
+
+ # Test with nan vals; should always be False
+ ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
+ [False, False, False, False]),
+])
+def test_is_monotonic_increasing(in_vals, out_vals):
+ # GH 17015
+ source_dict = {
+ 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
+ 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'],
+ 'C': in_vals}
+ df = pd.DataFrame(source_dict)
+ result = df.groupby('B').C.is_monotonic_increasing
+ index = Index(list('abcd'), name='B')
+ expected = pd.Series(index=index, data=out_vals, name='C')
+ tm.assert_series_equal(result, expected)
+
+    # Also check that the result equals manually applying
+    # x.is_monotonic_increasing to each group.
+ expected = (
+ df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing))
+ tm.assert_series_equal(result, expected)
+
+
[email protected]('in_vals, out_vals', [
+ # Basics: strictly decreasing (T), strictly increasing (F),
+    # abs val decreasing (F), non-strictly decreasing (T)
+ ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1],
+ [True, False, False, True]),
+
+ # Test with inf vals
+ ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
+ [True, True, False, True]),
+
+ # Test with nan vals; should always be False
+ ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
+ [False, False, False, False]),
+])
+def test_is_monotonic_decreasing(in_vals, out_vals):
+ # GH 17015
+ source_dict = {
+ 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
+ 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'],
+ 'C': in_vals}
+
+ df = pd.DataFrame(source_dict)
+ result = df.groupby('B').C.is_monotonic_decreasing
+ index = Index(list('abcd'), name='B')
+ expected = pd.Series(index=index, data=out_vals, name='C')
+ tm.assert_series_equal(result, expected)
+
+
+# describe
+# --------------------------------
+
+def test_apply_describe_bug(mframe):
+ grouped = mframe.groupby(level='first')
+ grouped.describe() # it works!
+
+
+def test_series_describe_multikey():
+ ts = tm.makeTimeSeries()
+ grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
+ result = grouped.describe()
+ tm.assert_series_equal(result['mean'], grouped.mean(),
+ check_names=False)
+ tm.assert_series_equal(result['std'], grouped.std(), check_names=False)
+ tm.assert_series_equal(result['min'], grouped.min(), check_names=False)
+
+
+def test_series_describe_single():
+ ts = tm.makeTimeSeries()
+ grouped = ts.groupby(lambda x: x.month)
+ result = grouped.apply(lambda x: x.describe())
+ expected = grouped.describe().stack()
+ tm.assert_series_equal(result, expected)
+
+
+def test_series_index_name(df):
+ grouped = df.loc[:, ['C']].groupby(df['A'])
+ result = grouped.agg(lambda x: x.mean())
+ assert result.index.name == 'A'
+
+
+def test_frame_describe_multikey(tsframe):
+ grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
+ result = grouped.describe()
+ desc_groups = []
+ for col in tsframe:
+ group = grouped[col].describe()
+ # GH 17464 - Remove duplicate MultiIndex levels
+ group_col = pd.MultiIndex(
+ levels=[[col], group.columns],
+ codes=[[0] * len(group.columns), range(len(group.columns))])
+ group = pd.DataFrame(group.values,
+ columns=group_col,
+ index=group.index)
+ desc_groups.append(group)
+ expected = pd.concat(desc_groups, axis=1)
+ tm.assert_frame_equal(result, expected)
+
+ groupedT = tsframe.groupby({'A': 0, 'B': 0,
+ 'C': 1, 'D': 1}, axis=1)
+ result = groupedT.describe()
+ expected = tsframe.describe().T
+ expected.index = pd.MultiIndex(
+ levels=[[0, 1], expected.index],
+ codes=[[0, 0, 1, 1], range(len(expected.index))])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_frame_describe_tupleindex():
+
+ # GH 14848 - regression from 0.19.0 to 0.19.1
+ df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
+ 'y': [10, 20, 30, 40, 50] * 3,
+ 'z': [100, 200, 300, 400, 500] * 3})
+ df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
+ df2 = df1.rename(columns={'k': 'key'})
+ msg = "Names should be list-like for a MultiIndex"
+ with pytest.raises(ValueError, match=msg):
+ df1.groupby('k').describe()
+ with pytest.raises(ValueError, match=msg):
+ df2.groupby('key').describe()
+
+
+def test_frame_describe_unstacked_format():
+ # GH 4792
+ prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
+ pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
+ pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
+ volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
+ pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
+ pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
+ df = pd.DataFrame({'PRICE': prices,
+ 'VOLUME': volumes})
+ result = df.groupby('PRICE').VOLUME.describe()
+ data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
+ df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
+ expected = pd.DataFrame(data,
+ index=pd.Index([24990, 25499], name='PRICE'),
+ columns=['count', 'mean', 'std', 'min',
+ '25%', '50%', '75%', 'max'])
+ tm.assert_frame_equal(result, expected)
+
+
+# nunique
+# --------------------------------
+
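+# check_nunique compares the grouped nunique against applying
+# Series.nunique per group, across sizes, key layouts and null patterns.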
[email protected]('n', 10 ** np.arange(2, 6))
[email protected]('m', [10, 100, 1000])
[email protected]('sort', [False, True])
[email protected]('dropna', [False, True])
+def test_series_groupby_nunique(n, m, sort, dropna):
+
+ def check_nunique(df, keys, as_index=True):
+ gr = df.groupby(keys, as_index=as_index, sort=sort)
+ left = gr['julie'].nunique(dropna=dropna)
+
+ gr = df.groupby(keys, as_index=as_index, sort=sort)
+ right = gr['julie'].apply(Series.nunique, dropna=dropna)
+ if not as_index:
+ right = right.reset_index(drop=True)
+
+ tm.assert_series_equal(left, right, check_names=False)
+
+ days = date_range('2015-08-23', periods=10)
+
+ frame = DataFrame({'jim': np.random.choice(list(ascii_lowercase), n),
+ 'joe': np.random.choice(days, n),
+ 'julie': np.random.randint(0, m, n)})
+
+ check_nunique(frame, ['jim'])
+ check_nunique(frame, ['jim', 'joe'])
+
+ frame.loc[1::17, 'jim'] = None
+ frame.loc[3::37, 'joe'] = None
+ frame.loc[7::19, 'julie'] = None
+ frame.loc[8::19, 'julie'] = None
+ frame.loc[9::19, 'julie'] = None
+
+ check_nunique(frame, ['jim'])
+ check_nunique(frame, ['jim', 'joe'])
+ check_nunique(frame, ['jim'], as_index=False)
+ check_nunique(frame, ['jim', 'joe'], as_index=False)
+
+
+def test_nunique():
+ df = DataFrame({
+ 'A': list('abbacc'),
+ 'B': list('abxacc'),
+ 'C': list('abbacx'),
+ })
+
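+    # each key group holds a single 'A' value, so A is always 1; group
+    # 'b' has two distinct B values ('b', 'x') and group 'c' two
+    # distinct C values ('c', 'x')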
+ expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
+ result = df.groupby('A', as_index=False).nunique()
+ tm.assert_frame_equal(result, expected)
+
+ # as_index
+ expected.index = list('abc')
+ expected.index.name = 'A'
+ result = df.groupby('A').nunique()
+ tm.assert_frame_equal(result, expected)
+
+ # with na
+ result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
+ tm.assert_frame_equal(result, expected)
+
+ # dropna
+ expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
+ index=list('abc'))
+ expected.index.name = 'A'
+ result = df.replace({'x': None}).groupby('A').nunique()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_nunique_with_object():
+ # GH 11077
+ data = pd.DataFrame(
+ [[100, 1, 'Alice'],
+ [200, 2, 'Bob'],
+ [300, 3, 'Charlie'],
+ [-400, 4, 'Dan'],
+ [500, 5, 'Edith']],
+ columns=['amount', 'id', 'name']
+ )
+
+ result = data.groupby(['id', 'amount'])['name'].nunique()
+ index = MultiIndex.from_arrays([data.id, data.amount])
+ expected = pd.Series([1] * 5, name='name', index=index)
+ tm.assert_series_equal(result, expected)
+
+
+def test_nunique_with_empty_series():
+ # GH 12553
+ data = pd.Series(name='name')
+ result = data.groupby(level=0).nunique()
+ expected = pd.Series(name='name', dtype='int64')
+ tm.assert_series_equal(result, expected)
+
+
+def test_nunique_with_timegrouper():
+ # GH 13453
+ test = pd.DataFrame({
+ 'time': [Timestamp('2016-06-28 09:35:35'),
+ Timestamp('2016-06-28 16:09:30'),
+ Timestamp('2016-06-28 16:46:28')],
+ 'data': ['1', '2', '3']}).set_index('time')
+ result = test.groupby(pd.Grouper(freq='h'))['data'].nunique()
+ expected = test.groupby(
+ pd.Grouper(freq='h')
+ )['data'].apply(pd.Series.nunique)
+ tm.assert_series_equal(result, expected)
+
+
+# count
+# --------------------------------
+
+def test_groupby_timedelta_cython_count():
+ df = DataFrame({'g': list('ab' * 2),
+ 'delt': np.arange(4).astype('timedelta64[ns]')})
+    expected = Series([2, 2], index=pd.Index(['a', 'b'], name='g'),
+                      name='delt')
+ result = df.groupby('g').delt.count()
+ tm.assert_series_equal(expected, result)
+
+
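+# Randomized mixed-dtype frame with injected NaNs: groupby count()
+# should match applying DataFrame.count per group.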
+def test_count():
+ n = 1 << 15
+ dr = date_range('2015-08-30', periods=n // 10, freq='T')
+
+ df = DataFrame({
+ '1st': np.random.choice(
+ list(ascii_lowercase), n),
+ '2nd': np.random.randint(0, 5, n),
+ '3rd': np.random.randn(n).round(3),
+ '4th': np.random.randint(-10, 10, n),
+ '5th': np.random.choice(dr, n),
+ '6th': np.random.randn(n).round(3),
+ '7th': np.random.randn(n).round(3),
+ '8th': np.random.choice(dr, n) - np.random.choice(dr, 1),
+ '9th': np.random.choice(
+ list(ascii_lowercase), n)
+ })
+
+ for col in df.columns.drop(['1st', '2nd', '4th']):
+ df.loc[np.random.choice(n, n // 10), col] = np.nan
+
+ df['9th'] = df['9th'].astype('category')
+
+ for key in '1st', '2nd', ['1st', '2nd']:
+ left = df.groupby(key).count()
+ right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
+ tm.assert_frame_equal(left, right)
+
+ # GH5610
+ # count counts non-nulls
+ df = pd.DataFrame([[1, 2, 'foo'],
+ [1, np.nan, 'bar'],
+ [3, np.nan, np.nan]],
+ columns=['A', 'B', 'C'])
+
+ count_as = df.groupby('A').count()
+ count_not_as = df.groupby('A', as_index=False).count()
+
+ expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'],
+ index=[1, 3])
+ expected.index.name = 'A'
+ tm.assert_frame_equal(count_not_as, expected.reset_index())
+ tm.assert_frame_equal(count_as, expected)
+
+ count_B = df.groupby('A')['B'].count()
+ tm.assert_series_equal(count_B, expected['B'])
+
+
+def test_count_object():
+ df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3})
+ result = df.groupby('c').a.count()
+    expected = pd.Series([3, 3], index=pd.Index([2, 3], name='c'),
+                         name='a')
+ tm.assert_series_equal(result, expected)
+
+ df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3,
+ 'c': [2] * 3 + [3] * 3})
+ result = df.groupby('c').a.count()
+    expected = pd.Series([1, 3], index=pd.Index([2, 3], name='c'),
+                         name='a')
+ tm.assert_series_equal(result, expected)
+
+
+def test_count_cross_type():
+ # GH8169
+    vals = np.hstack((np.random.randint(0, 5, (100, 2)),
+                      np.random.randint(0, 2, (100, 2))))
+
+ df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd'])
+ df[df == 2] = np.nan
+ expected = df.groupby(['c', 'd']).count()
+
+ for t in ['float32', 'object']:
+ df['a'] = df['a'].astype(t)
+ df['b'] = df['b'].astype(t)
+ result = df.groupby(['c', 'd']).count()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_lower_int_prec_count():
+    df = DataFrame({'a': np.array([0, 1, 2, 100], np.int8),
+                    'b': np.array([1, 2, 3, 6], np.uint32),
+                    'c': np.array([4, 5, 6, 8], np.int16),
+                    'grp': list('ab' * 2)})
+    result = df.groupby('grp').count()
+    expected = DataFrame({'a': [2, 2], 'b': [2, 2], 'c': [2, 2]},
+                         index=pd.Index(list('ab'), name='grp'))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_count_uses_size_on_exception():
+ class RaisingObjectException(Exception):
+ pass
+
+ class RaisingObject(object):
+
+ def __init__(self, msg='I will raise inside Cython'):
+ super(RaisingObject, self).__init__()
+ self.msg = msg
+
+ def __eq__(self, other):
+            # called from within the Cython count path; raising here
+            # checks that count falls back to size
+ raise RaisingObjectException(self.msg)
+
+ df = DataFrame({'a': [RaisingObject() for _ in range(4)],
+ 'grp': list('ab' * 2)})
+ result = df.groupby('grp').count()
+ expected = DataFrame({'a': [2, 2]}, index=pd.Index(
+ list('ab'), name='grp'))
+ tm.assert_frame_equal(result, expected)
+
+
+# size
+# --------------------------------
+
+def test_size(df):
+ grouped = df.groupby(['A', 'B'])
+ result = grouped.size()
+ for key, group in grouped:
+ assert result[key] == len(group)
+
+ grouped = df.groupby('A')
+ result = grouped.size()
+ for key, group in grouped:
+ assert result[key] == len(group)
+
+ grouped = df.groupby('B')
+ result = grouped.size()
+ for key, group in grouped:
+ assert result[key] == len(group)
+
+ df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
+ for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
+ left = df.groupby(key, sort=sort).size()
+ right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
+ tm.assert_series_equal(left, right, check_names=False)
+
+ # GH11699
+ df = DataFrame([], columns=['A', 'B'])
+ out = Series([], dtype='int64', index=Index([], name='A'))
+ tm.assert_series_equal(df.groupby('A').size(), out)
+
+
+# pipe
+# --------------------------------
+
+def test_pipe():
+ # Test the pipe method of DataFrameGroupBy.
+ # Issue #17871
+
+ random_state = np.random.RandomState(1234567890)
+
+ df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B': random_state.randn(8),
+ 'C': random_state.randn(8)})
+
+ def f(dfgb):
+ return dfgb.B.max() - dfgb.C.min().min()
+
+ def square(srs):
+ return srs ** 2
+
+ # Note that the transformations are
+ # GroupBy -> Series
+ # Series -> Series
+ # This then chains the GroupBy.pipe and the
+ # NDFrame.pipe methods
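+    # i.e. result should equal square(f(df.groupby('A')))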
+ result = df.groupby('A').pipe(f).pipe(square)
+
+ index = Index([u'bar', u'foo'], dtype='object', name=u'A')
+ expected = pd.Series([8.99110003361, 8.17516964785], name='B',
+ index=index)
+
+ tm.assert_series_equal(expected, result)
+
+
+def test_pipe_args():
+ # Test passing args to the pipe method of DataFrameGroupBy.
+ # Issue #17871
+
+ df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'],
+ 'x': [1.0, 2.0, 3.0, 2.0, 5.0],
+ 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]})
+
+ def f(dfgb, arg1):
+ return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
+ .groupby(dfgb.grouper))
+
+ def g(dfgb, arg2):
+ return dfgb.sum() / dfgb.sum().sum() + arg2
+
+ def h(df, arg3):
+ return df.x + df.y - arg3
+
+ result = (df
+ .groupby('group')
+ .pipe(f, 0)
+ .pipe(g, 10)
+ .pipe(h, 100))
+
+    # Verify the chained result against hard-coded expected values
+ index = pd.Index(['A', 'B', 'C'], name='group')
+ expected = pd.Series([-79.5160891089, -78.4839108911, -80],
+ index=index)
+
+ tm.assert_series_equal(expected, result)
+
+ # test SeriesGroupby.pipe
+ ser = pd.Series([1, 1, 2, 2, 3, 3])
+ result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())
+
+ expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3]))
+
+ tm.assert_series_equal(result, expected)
+
+
+def test_groupby_mean_no_overflow():
+    # Regression test for GH 22487
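+    # the last 'connections' value exceeds the int64 maximum (~9.2e18),
+    # so a naive int64 accumulation of the group sum would overflow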
+ df = pd.DataFrame({
+ "user": ["A", "A", "A", "A", "A"],
+ "connections": [4970, 4749, 4719, 4704, 18446744073699999744]
+ })
+ assert df.groupby('user')['connections'].mean()['A'] == 3689348814740003840
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_groupby.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_groupby.py
new file mode 100644
index 00000000000..98c917a6eca
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_groupby.py
@@ -0,0 +1,1746 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+
+from collections import defaultdict
+from datetime import datetime
+from decimal import Decimal
+
+import numpy as np
+import pytest
+
+from pandas.compat import (
+ OrderedDict, StringIO, lmap, lrange, lzip, map, range, zip)
+from pandas.errors import PerformanceWarning
+
+import pandas as pd
+from pandas import (
+ DataFrame, Index, MultiIndex, Panel, Series, Timestamp, compat, date_range,
+ read_csv)
+import pandas.core.common as com
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal)
+
+
+def test_repr():
+ # GH18203
+ result = repr(pd.Grouper(key='A', level='B'))
+ expected = "Grouper(key='A', level='B', axis=0, sort=False)"
+ assert result == expected
+
+
[email protected]('dtype', ['int64', 'int32', 'float64', 'float32'])
+def test_basic(dtype):
+
+ data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
+
+ index = np.arange(9)
+ np.random.shuffle(index)
+ data = data.reindex(index)
+
+ grouped = data.groupby(lambda x: x // 3)
+
+ for k, v in grouped:
+ assert len(v) == 3
+
+ agged = grouped.aggregate(np.mean)
+ assert agged[1] == 1
+
+ assert_series_equal(agged, grouped.agg(np.mean)) # shorthand
+ assert_series_equal(agged, grouped.mean())
+ assert_series_equal(grouped.agg(np.sum), grouped.sum())
+
+ expected = grouped.apply(lambda x: x * x.sum())
+ transformed = grouped.transform(lambda x: x * x.sum())
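+    # label 7 falls in the x // 3 == 2 group, whose values are all 2
+    # and sum to 6, so the transform gives 2 * 6 == 12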
+ assert transformed[7] == 12
+ assert_series_equal(transformed, expected)
+
+ value_grouped = data.groupby(data)
+ assert_series_equal(value_grouped.aggregate(np.mean), agged,
+ check_index_type=False)
+
+ # complex agg
+ agged = grouped.aggregate([np.mean, np.std])
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ agged = grouped.aggregate({'one': np.mean, 'two': np.std})
+
+ group_constants = {0: 10, 1: 20, 2: 30}
+ agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
+ assert agged[1] == 21
+
+ # corner cases
+ msg = "Must produce aggregated value"
+ # exception raised is type Exception
+ with pytest.raises(Exception, match=msg):
+ grouped.aggregate(lambda x: x * 2)
+
+
+def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
+ key = mframe.index.codes[0]
+ grouped = mframe.groupby(key)
+ result = grouped.sum()
+
+ expected = mframe.groupby(key.astype('O')).sum()
+ assert_frame_equal(result, expected)
+
+ # GH 3911, mixed frame non-conversion
+ df = df_mixed_floats.copy()
+ df['value'] = lrange(len(df))
+
+ def max_value(group):
+ return group.loc[group['value'].idxmax()]
+
+ applied = df.groupby('A').apply(max_value)
+ result = applied.get_dtype_counts().sort_values()
+ expected = Series({'float64': 2,
+ 'int64': 1,
+ 'object': 2}).sort_values()
+ assert_series_equal(result, expected)
+
+
+def test_groupby_return_type():
+
+ # GH2893, return a reduced type
+ df1 = DataFrame(
+ [{"val1": 1, "val2": 20},
+ {"val1": 1, "val2": 19},
+ {"val1": 2, "val2": 27},
+ {"val1": 2, "val2": 12}
+ ])
+
+ def func(dataf):
+ return dataf["val2"] - dataf["val2"].mean()
+
+ result = df1.groupby("val1", squeeze=True).apply(func)
+ assert isinstance(result, Series)
+
+ df2 = DataFrame(
+ [{"val1": 1, "val2": 20},
+ {"val1": 1, "val2": 19},
+ {"val1": 1, "val2": 27},
+ {"val1": 1, "val2": 12}
+ ])
+
+ def func(dataf):
+ return dataf["val2"] - dataf["val2"].mean()
+
+ result = df2.groupby("val1", squeeze=True).apply(func)
+ assert isinstance(result, Series)
+
+ # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
+ df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
+ result = df.groupby('X', squeeze=False).count()
+ assert isinstance(result, DataFrame)
+
+ # GH5592
+    # inconsistent return type
+    df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
+                           'Pony', 'Pony'],
+                        B=Series(np.arange(7), dtype='int64'),
+                        C=date_range('20130101', periods=7)))
+
+ def f(grp):
+ return grp.iloc[0]
+
+ expected = df.groupby('A').first()[['B']]
+ result = df.groupby('A').apply(f)[['B']]
+ assert_frame_equal(result, expected)
+
+ def f(grp):
+ if grp.name == 'Tiger':
+ return None
+ return grp.iloc[0]
+
+ result = df.groupby('A').apply(f)[['B']]
+ e = expected.copy()
+ e.loc['Tiger'] = np.nan
+ assert_frame_equal(result, e)
+
+ def f(grp):
+ if grp.name == 'Pony':
+ return None
+ return grp.iloc[0]
+
+ result = df.groupby('A').apply(f)[['B']]
+ e = expected.copy()
+ e.loc['Pony'] = np.nan
+ assert_frame_equal(result, e)
+
+ # 5592 revisited, with datetimes
+ def f(grp):
+ if grp.name == 'Pony':
+ return None
+ return grp.iloc[0]
+
+ result = df.groupby('A').apply(f)[['C']]
+ e = df.groupby('A').first()[['C']]
+ e.loc['Pony'] = pd.NaT
+ assert_frame_equal(result, e)
+
+ # scalar outputs
+ def f(grp):
+ if grp.name == 'Pony':
+ return None
+ return grp.iloc[0].loc['C']
+
+ result = df.groupby('A').apply(f)
+ e = df.groupby('A').first()['C'].copy()
+ e.loc['Pony'] = np.nan
+ e.name = None
+ assert_series_equal(result, e)
+
+
+def test_pass_args_kwargs(ts, tsframe):
+
+ def f(x, q=None, axis=0):
+ return np.percentile(x, q, axis=axis)
+
+ g = lambda x: np.percentile(x, 80, axis=0)
+
+ # Series
+ ts_grouped = ts.groupby(lambda x: x.month)
+ agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
+ apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
+ trans_result = ts_grouped.transform(np.percentile, 80, axis=0)
+
+ agg_expected = ts_grouped.quantile(.8)
+ trans_expected = ts_grouped.transform(g)
+
+ assert_series_equal(apply_result, agg_expected)
+ assert_series_equal(agg_result, agg_expected, check_names=False)
+ assert_series_equal(trans_result, trans_expected)
+
+ agg_result = ts_grouped.agg(f, q=80)
+ apply_result = ts_grouped.apply(f, q=80)
+ trans_result = ts_grouped.transform(f, q=80)
+ assert_series_equal(agg_result, agg_expected)
+ assert_series_equal(apply_result, agg_expected)
+ assert_series_equal(trans_result, trans_expected)
+
+ # DataFrame
+ df_grouped = tsframe.groupby(lambda x: x.month)
+ agg_result = df_grouped.agg(np.percentile, 80, axis=0)
+ apply_result = df_grouped.apply(DataFrame.quantile, .8)
+ expected = df_grouped.quantile(.8)
+ assert_frame_equal(apply_result, expected)
+ assert_frame_equal(agg_result, expected, check_names=False)
+
+ agg_result = df_grouped.agg(f, q=80)
+ apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
+ assert_frame_equal(agg_result, expected, check_names=False)
+ assert_frame_equal(apply_result, expected)
+
+
+def test_len():
+ df = tm.makeTimeDataFrame()
+ grouped = df.groupby([lambda x: x.year, lambda x: x.month,
+ lambda x: x.day])
+ assert len(grouped) == len(df)
+
+ grouped = df.groupby([lambda x: x.year, lambda x: x.month])
+ expected = len({(x.year, x.month) for x in df.index})
+ assert len(grouped) == expected
+
+ # issue 11016
+ df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
+ assert len(df.groupby(('a'))) == 0
+ assert len(df.groupby(('b'))) == 3
+ assert len(df.groupby(['a', 'b'])) == 3
+
+
+def test_basic_regression():
+ # regression
+ T = [1.0 * x for x in lrange(1, 10) * 10][:1095]
+ result = Series(T, lrange(0, len(T)))
+
+ groupings = np.random.random((1100, ))
+ groupings = Series(groupings, lrange(0, len(groupings))) * 10.
+
+ grouped = result.groupby(groupings)
+ grouped.mean()
+
+
[email protected]('dtype', ['float64', 'float32', 'int64',
+ 'int32', 'int16', 'int8'])
+def test_with_na_groups(dtype):
+ index = Index(np.arange(10))
+ values = Series(np.ones(10), index, dtype=dtype)
+ labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan,
+ 'bar', 'bar', np.nan, 'foo'], index=index)
+
+ # this SHOULD be an int
+ grouped = values.groupby(labels)
+ agged = grouped.agg(len)
+ expected = Series([4, 2], index=['bar', 'foo'])
+
+ assert_series_equal(agged, expected, check_dtype=False)
+
+ # assert issubclass(agged.dtype.type, np.integer)
+
+ # explicitly return a float from my function
+ def f(x):
+ return float(len(x))
+
+ agged = grouped.agg(f)
+ expected = Series([4, 2], index=['bar', 'foo'])
+
+ assert_series_equal(agged, expected, check_dtype=False)
+ assert issubclass(agged.dtype.type, np.dtype(dtype).type)
+
+
+def test_indices_concatenation_order():
+
+ # GH 2808
+
+ def f1(x):
+ y = x[(x.b % 2) == 1] ** 2
+ if y.empty:
+ multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
+ names=['b', 'c'])
+ res = DataFrame(None, columns=['a'], index=multiindex)
+ return res
+ else:
+ y = y.set_index(['b', 'c'])
+ return y
+
+ def f2(x):
+ y = x[(x.b % 2) == 1] ** 2
+ if y.empty:
+ return DataFrame()
+ else:
+ y = y.set_index(['b', 'c'])
+ return y
+
+ def f3(x):
+ y = x[(x.b % 2) == 1] ** 2
+ if y.empty:
+ multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
+ names=['foo', 'bar'])
+ res = DataFrame(None, columns=['a', 'b'], index=multiindex)
+ return res
+ else:
+ return y
+
+ df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})
+
+ df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})
+
+ # correct result
+ result1 = df.groupby('a').apply(f1)
+ result2 = df2.groupby('a').apply(f1)
+ assert_frame_equal(result1, result2)
+
+ # should fail (not the same number of levels)
+ msg = "Cannot concat indices that do not have the same number of levels"
+ with pytest.raises(AssertionError, match=msg):
+ df.groupby('a').apply(f2)
+ with pytest.raises(AssertionError, match=msg):
+ df2.groupby('a').apply(f2)
+
+ # should fail (incorrect shape)
+ with pytest.raises(AssertionError, match=msg):
+ df.groupby('a').apply(f3)
+ with pytest.raises(AssertionError, match=msg):
+ df2.groupby('a').apply(f3)
+
+
+def test_attr_wrapper(ts):
+ grouped = ts.groupby(lambda x: x.weekday())
+
+ result = grouped.std()
+ expected = grouped.agg(lambda x: np.std(x, ddof=1))
+ assert_series_equal(result, expected)
+
+ # this is pretty cool
+ result = grouped.describe()
+ expected = {name: gp.describe() for name, gp in grouped}
+ expected = DataFrame(expected).T
+ assert_frame_equal(result, expected)
+
+ # get attribute
+ result = grouped.dtype
+ expected = grouped.agg(lambda x: x.dtype)
+
+ # make sure raises error
+ msg = "'SeriesGroupBy' object has no attribute 'foo'"
+ with pytest.raises(AttributeError, match=msg):
+ getattr(grouped, 'foo')
+
+
+def test_frame_groupby(tsframe):
+ grouped = tsframe.groupby(lambda x: x.weekday())
+
+ # aggregate
+ aggregated = grouped.aggregate(np.mean)
+ assert len(aggregated) == 5
+ assert len(aggregated.columns) == 4
+
+ # by string
+ tscopy = tsframe.copy()
+ tscopy['weekday'] = [x.weekday() for x in tscopy.index]
+ stragged = tscopy.groupby('weekday').aggregate(np.mean)
+ assert_frame_equal(stragged, aggregated, check_names=False)
+
+ # transform
+ grouped = tsframe.head(30).groupby(lambda x: x.weekday())
+ transformed = grouped.transform(lambda x: x - x.mean())
+ assert len(transformed) == 30
+ assert len(transformed.columns) == 4
+
+ # transform propagate
+ transformed = grouped.transform(lambda x: x.mean())
+ for name, group in grouped:
+ mean = group.mean()
+ for idx in group.index:
+ tm.assert_series_equal(transformed.xs(idx), mean,
+ check_names=False)
+
+ # iterate
+ for weekday, group in grouped:
+ assert group.index[0].weekday() == weekday
+
+ # groups / group_indices
+ groups = grouped.groups
+ indices = grouped.indices
+
+ for k, v in compat.iteritems(groups):
+ samething = tsframe.index.take(indices[k])
+ assert (samething == v).all()
+
+
+def test_frame_groupby_columns(tsframe):
+ mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
+ grouped = tsframe.groupby(mapping, axis=1)
+
+ # aggregate
+ aggregated = grouped.aggregate(np.mean)
+ assert len(aggregated) == len(tsframe)
+ assert len(aggregated.columns) == 2
+
+ # transform
+ tf = lambda x: x - x.mean()
+ groupedT = tsframe.T.groupby(mapping, axis=0)
+ assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))
+
+ # iterate
+ for k, v in grouped:
+ assert len(v.columns) == 2
+
+
+def test_frame_set_name_single(df):
+ grouped = df.groupby('A')
+
+ result = grouped.mean()
+ assert result.index.name == 'A'
+
+ result = df.groupby('A', as_index=False).mean()
+ assert result.index.name != 'A'
+
+ result = grouped.agg(np.mean)
+ assert result.index.name == 'A'
+
+ result = grouped.agg({'C': np.mean, 'D': np.std})
+ assert result.index.name == 'A'
+
+ result = grouped['C'].mean()
+ assert result.index.name == 'A'
+ result = grouped['C'].agg(np.mean)
+ assert result.index.name == 'A'
+ result = grouped['C'].agg([np.mean, np.std])
+ assert result.index.name == 'A'
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
+ assert result.index.name == 'A'
+
+
+def test_multi_func(df):
+ col1 = df['A']
+ col2 = df['B']
+
+ grouped = df.groupby([col1.get, col2.get])
+ agged = grouped.mean()
+ expected = df.groupby(['A', 'B']).mean()
+
+ # TODO groupby get drops names
+ assert_frame_equal(agged.loc[:, ['C', 'D']],
+ expected.loc[:, ['C', 'D']],
+ check_names=False)
+
+ # some "groups" with no data
+ df = DataFrame({'v1': np.random.randn(6),
+ 'v2': np.random.randn(6),
+ 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
+ 'k2': np.array(['1', '1', '1', '2', '2', '2'])},
+ index=['one', 'two', 'three', 'four', 'five', 'six'])
+ # only verify that it works for now
+ grouped = df.groupby(['k1', 'k2'])
+ grouped.agg(np.sum)
+
+
+def test_multi_key_multiple_functions(df):
+ grouped = df.groupby(['A', 'B'])['C']
+
+ agged = grouped.agg([np.mean, np.std])
+ expected = DataFrame({'mean': grouped.agg(np.mean),
+ 'std': grouped.agg(np.std)})
+ assert_frame_equal(agged, expected)
+
+
+def test_frame_multi_key_function_list():
+ data = DataFrame(
+ {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
+ 'foo', 'foo', 'foo'],
+ 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
+ 'two', 'two', 'one'],
+ 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
+ 'dull', 'shiny', 'shiny', 'shiny'],
+ 'D': np.random.randn(11),
+ 'E': np.random.randn(11),
+ 'F': np.random.randn(11)})
+
+ grouped = data.groupby(['A', 'B'])
+ funcs = [np.mean, np.std]
+ agged = grouped.agg(funcs)
+ expected = pd.concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs),
+ grouped['F'].agg(funcs)],
+ keys=['D', 'E', 'F'], axis=1)
+ assert (isinstance(agged.index, MultiIndex))
+ assert (isinstance(expected.index, MultiIndex))
+ assert_frame_equal(agged, expected)
+
+
[email protected]('op', [lambda x: x.sum(), lambda x: x.mean()])
[email protected]("ignore:\\nPanel:FutureWarning")
+def test_groupby_multiple_columns(df, op):
+ data = df
+ grouped = data.groupby(['A', 'B'])
+
+ result1 = op(grouped)
+
+ expected = defaultdict(dict)
+ for n1, gp1 in data.groupby('A'):
+ for n2, gp2 in gp1.groupby('B'):
+ expected[n1][n2] = op(gp2.loc[:, ['C', 'D']])
+ expected = {k: DataFrame(v)
+ for k, v in compat.iteritems(expected)}
+ expected = Panel.fromDict(expected).swapaxes(0, 1)
+ expected.major_axis.name, expected.minor_axis.name = 'A', 'B'
+
+ # a little bit crude
+ for col in ['C', 'D']:
+ result_col = op(grouped[col])
+ exp = expected[col]
+ pivoted = result1[col].unstack()
+ pivoted2 = result_col.unstack()
+ assert_frame_equal(pivoted.reindex_like(exp), exp)
+ assert_frame_equal(pivoted2.reindex_like(exp), exp)
+
+ # test single series works the same
+ result = data['C'].groupby([data['A'], data['B']]).mean()
+ expected = data.groupby(['A', 'B']).mean()['C']
+
+ assert_series_equal(result, expected)
+
+
+def test_groupby_as_index_agg(df):
+ grouped = df.groupby('A', as_index=False)
+
+ # single-key
+
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ assert_frame_equal(result, expected)
+
+ result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
+ expected2 = grouped.mean()
+ expected2['D'] = grouped.sum()['D']
+ assert_frame_equal(result2, expected2)
+
+ grouped = df.groupby('A', as_index=True)
+ expected3 = grouped['C'].sum()
+ expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result3 = grouped['C'].agg({'Q': np.sum})
+ assert_frame_equal(result3, expected3)
+
+ # multi-key
+
+ grouped = df.groupby(['A', 'B'], as_index=False)
+
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ assert_frame_equal(result, expected)
+
+ result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
+ expected2 = grouped.mean()
+ expected2['D'] = grouped.sum()['D']
+ assert_frame_equal(result2, expected2)
+
+ expected3 = grouped['C'].sum()
+ expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
+ result3 = grouped['C'].agg({'Q': np.sum})
+ assert_frame_equal(result3, expected3)
+
+ # GH7115 & GH8112 & GH8582
+ df = DataFrame(np.random.randint(0, 100, (50, 3)),
+ columns=['jim', 'joe', 'jolie'])
+ ts = Series(np.random.randint(5, 10, 50), name='jim')
+
+ gr = df.groupby(ts)
+ gr.nth(0) # invokes set_selection_from_grouper internally
+ assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))
+
+ for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
+ gr = df.groupby(ts, as_index=False)
+ left = getattr(gr, attr)()
+
+ gr = df.groupby(ts.values, as_index=True)
+ right = getattr(gr, attr)().reset_index(drop=True)
+
+ assert_frame_equal(left, right)
+
+
+def test_as_index_series_return_frame(df):
+ grouped = df.groupby('A', as_index=False)
+ grouped2 = df.groupby(['A', 'B'], as_index=False)
+
+ result = grouped['C'].agg(np.sum)
+ expected = grouped.agg(np.sum).loc[:, ['A', 'C']]
+ assert isinstance(result, DataFrame)
+ assert_frame_equal(result, expected)
+
+ result2 = grouped2['C'].agg(np.sum)
+ expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']]
+ assert isinstance(result2, DataFrame)
+ assert_frame_equal(result2, expected2)
+
+ result = grouped['C'].sum()
+ expected = grouped.sum().loc[:, ['A', 'C']]
+ assert isinstance(result, DataFrame)
+ assert_frame_equal(result, expected)
+
+ result2 = grouped2['C'].sum()
+ expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']]
+ assert isinstance(result2, DataFrame)
+ assert_frame_equal(result2, expected2)
+
+
+def test_as_index_series_column_slice_raises(df):
+ # GH15072
+ grouped = df.groupby('A', as_index=False)
+ msg = r"Column\(s\) C already selected"
+
+ with pytest.raises(IndexError, match=msg):
+ grouped['C'].__getitem__('D')
+
+
+def test_groupby_as_index_cython(df):
+ data = df
+
+ # single-key
+ grouped = data.groupby('A', as_index=False)
+ result = grouped.mean()
+ expected = data.groupby(['A']).mean()
+ expected.insert(0, 'A', expected.index)
+ expected.index = np.arange(len(expected))
+ assert_frame_equal(result, expected)
+
+ # multi-key
+ grouped = data.groupby(['A', 'B'], as_index=False)
+ result = grouped.mean()
+ expected = data.groupby(['A', 'B']).mean()
+
+ arrays = lzip(*expected.index.values)
+ expected.insert(0, 'A', arrays[0])
+ expected.insert(1, 'B', arrays[1])
+ expected.index = np.arange(len(expected))
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_as_index_series_scalar(df):
+ grouped = df.groupby(['A', 'B'], as_index=False)
+
+ # GH #421
+
+ result = grouped['C'].agg(len)
+ expected = grouped.agg(len).loc[:, ['A', 'B', 'C']]
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_as_index_corner(df, ts):
+ msg = "as_index=False only valid with DataFrame"
+ with pytest.raises(TypeError, match=msg):
+ ts.groupby(lambda x: x.weekday(), as_index=False)
+
+ msg = "as_index=False only valid for axis=0"
+ with pytest.raises(ValueError, match=msg):
+ df.groupby(lambda x: x.lower(), as_index=False, axis=1)
+
+
+def test_groupby_multiple_key(df):
+ df = tm.makeTimeDataFrame()
+ grouped = df.groupby([lambda x: x.year, lambda x: x.month,
+ lambda x: x.day])
+ agged = grouped.sum()
+ assert_almost_equal(df.values, agged.values)
+
+ grouped = df.T.groupby([lambda x: x.year,
+ lambda x: x.month,
+ lambda x: x.day], axis=1)
+
+ agged = grouped.agg(lambda x: x.sum())
+ tm.assert_index_equal(agged.index, df.columns)
+ assert_almost_equal(df.T.values, agged.values)
+
+ agged = grouped.agg(lambda x: x.sum())
+ assert_almost_equal(df.T.values, agged.values)
+
+
+def test_groupby_multi_corner(df):
+ # test that having an all-NA column doesn't mess you up
+ df = df.copy()
+ df['bad'] = np.nan
+ agged = df.groupby(['A', 'B']).mean()
+
+ expected = df.groupby(['A', 'B']).mean()
+ expected['bad'] = np.nan
+
+ assert_frame_equal(agged, expected)
+
+
+def test_omit_nuisance(df):
+ grouped = df.groupby('A')
+
+ result = grouped.mean()
+ expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean()
+ assert_frame_equal(result, expected)
+
+ agged = grouped.agg(np.mean)
+ exp = grouped.mean()
+ assert_frame_equal(agged, exp)
+
+ df = df.loc[:, ['A', 'C', 'D']]
+ df['E'] = datetime.now()
+ grouped = df.groupby('A')
+ result = grouped.agg(np.sum)
+ expected = grouped.sum()
+ assert_frame_equal(result, expected)
+
+ # won't work with axis = 1
+ grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1)
+ msg = (r'\("unsupported operand type\(s\) for \+: '
+ "'Timestamp' and 'float'\""
+ r", u?'occurred at index 0'\)")
+ with pytest.raises(TypeError, match=msg):
+ grouped.agg(lambda x: x.sum(0, numeric_only=False))
+
+
+def test_omit_nuisance_python_multiple(three_group):
+ grouped = three_group.groupby(['A', 'B'])
+
+ agged = grouped.agg(np.mean)
+ exp = grouped.mean()
+ assert_frame_equal(agged, exp)
+
+
+def test_empty_groups_corner(mframe):
+ # handle empty groups
+ df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
+ 'k2': np.array(['1', '1', '1', '2', '2', '2']),
+ 'k3': ['foo', 'bar'] * 3,
+ 'v1': np.random.randn(6),
+ 'v2': np.random.randn(6)})
+
+ grouped = df.groupby(['k1', 'k2'])
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ assert_frame_equal(result, expected)
+
+ grouped = mframe[3:5].groupby(level=0)
+ agged = grouped.apply(lambda x: x.mean())
+ agged_A = grouped['A'].apply(np.mean)
+ assert_series_equal(agged['A'], agged_A)
+ assert agged.index.name == 'first'
+
+
+def test_nonsense_func():
+ df = DataFrame([0])
+ msg = r"unsupported operand type\(s\) for \+: '(int|long)' and 'str'"
+ with pytest.raises(TypeError, match=msg):
+ df.groupby(lambda x: x + 'foo')
+
+
+def test_wrap_aggregated_output_multindex(mframe):
+ df = mframe.T
+ df['baz', 'two'] = 'peekaboo'
+
+ keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
+ agged = df.groupby(keys).agg(np.mean)
+ assert isinstance(agged.columns, MultiIndex)
+
+ def aggfun(ser):
+ if ser.name == ('foo', 'one'):
+ raise TypeError
+ else:
+ return ser.sum()
+
+ agged2 = df.groupby(keys).aggregate(aggfun)
+ assert len(agged2.columns) + 1 == len(df.columns)
+
+
+def test_groupby_level_apply(mframe):
+
+ result = mframe.groupby(level=0).count()
+ assert result.index.name == 'first'
+ result = mframe.groupby(level=1).count()
+ assert result.index.name == 'second'
+
+ result = mframe['A'].groupby(level=0).count()
+ assert result.index.name == 'first'
+
+
+def test_groupby_level_mapper(mframe):
+ deleveled = mframe.reset_index()
+
+ mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1}
+ mapper1 = {'one': 0, 'two': 0, 'three': 1}
+
+ result0 = mframe.groupby(mapper0, level=0).sum()
+ result1 = mframe.groupby(mapper1, level=1).sum()
+
+ mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']])
+ mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']])
+ expected0 = mframe.groupby(mapped_level0).sum()
+ expected1 = mframe.groupby(mapped_level1).sum()
+ expected0.index.name, expected1.index.name = 'first', 'second'
+
+ assert_frame_equal(result0, expected0)
+ assert_frame_equal(result1, expected1)
+
+
+def test_groupby_level_nonmulti():
+ # GH 1313, GH 13901
+ s = Series([1, 2, 3, 10, 4, 5, 20, 6],
+ Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo'))
+ expected = Series([11, 22, 3, 4, 5, 6],
+ Index(range(1, 7), name='foo'))
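+    # labels 1 and 2 each appear twice, summing to 11 and 22; the
+    # remaining labels are singletons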
+
+ result = s.groupby(level=0).sum()
+ tm.assert_series_equal(result, expected)
+ result = s.groupby(level=[0]).sum()
+ tm.assert_series_equal(result, expected)
+ result = s.groupby(level=-1).sum()
+ tm.assert_series_equal(result, expected)
+ result = s.groupby(level=[-1]).sum()
+ tm.assert_series_equal(result, expected)
+
+ msg = "level > 0 or level < -1 only valid with MultiIndex"
+ with pytest.raises(ValueError, match=msg):
+ s.groupby(level=1)
+ with pytest.raises(ValueError, match=msg):
+ s.groupby(level=-2)
+ msg = "No group keys passed!"
+ with pytest.raises(ValueError, match=msg):
+ s.groupby(level=[])
+ msg = "multiple levels only valid with MultiIndex"
+ with pytest.raises(ValueError, match=msg):
+ s.groupby(level=[0, 0])
+ with pytest.raises(ValueError, match=msg):
+ s.groupby(level=[0, 1])
+ msg = "level > 0 or level < -1 only valid with MultiIndex"
+ with pytest.raises(ValueError, match=msg):
+ s.groupby(level=[1])
+
+
+def test_groupby_complex():
+ # GH 12902
+ a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
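+    # index 0 groups 0 and 1+2j -> 1+2j; index 1 groups 2+4j and
+    # 3+6j -> 5+10j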
+ expected = Series((1 + 2j, 5 + 10j))
+
+ result = a.groupby(level=0).sum()
+ assert_series_equal(result, expected)
+
+ result = a.sum(level=0)
+ assert_series_equal(result, expected)
+
+
+def test_mutate_groups():
+
+ # GH3380
+
+ df = DataFrame({
+ 'cat1': ['a'] * 8 + ['b'] * 6,
+ 'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 +
+ ['d'] * 2 + ['e'] * 2,
+ 'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)),
+ 'val': np.random.randint(100, size=14),
+ })
+
+ def f_copy(x):
+ x = x.copy()
+ x['rank'] = x.val.rank(method='min')
+ return x.groupby('cat2')['rank'].min()
+
+ def f_no_copy(x):
+ x['rank'] = x.val.rank(method='min')
+ return x.groupby('cat2')['rank'].min()
+
+ grpby_copy = df.groupby('cat1').apply(f_copy)
+ grpby_no_copy = df.groupby('cat1').apply(f_no_copy)
+ assert_series_equal(grpby_copy, grpby_no_copy)
+
+
+def test_no_mutate_but_looks_like():
+
+ # GH 8467
+    # the first shows the mutation indicator
+    # the second does not, but should yield the same results
+ df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)})
+
+ result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key)
+ result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key)
+ assert_series_equal(result1, result2)
+
+
+def test_groupby_series_indexed_differently():
+ s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7],
+ index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g']))
+ s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0],
+ index=Index(['a', 'b', 'd', 'f', 'g', 'h']))
+
+ grouped = s1.groupby(s2)
+ agged = grouped.mean()
+ exp = s1.groupby(s2.reindex(s1.index).get).mean()
+ assert_series_equal(agged, exp)
+
+
+def test_groupby_with_hier_columns():
+ tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux',
+ 'qux'], ['one', 'two', 'one', 'two', 'one', 'two',
+ 'one', 'two']]))
+ index = MultiIndex.from_tuples(tuples)
+    columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
+                                      ('B', 'cat'), ('A', 'dog')])
+ df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)
+
+ result = df.groupby(level=0).mean()
+ tm.assert_index_equal(result.columns, columns)
+
+ result = df.groupby(level=0, axis=1).mean()
+ tm.assert_index_equal(result.index, df.index)
+
+ result = df.groupby(level=0).agg(np.mean)
+ tm.assert_index_equal(result.columns, columns)
+
+ result = df.groupby(level=0).apply(lambda x: x.mean())
+ tm.assert_index_equal(result.columns, columns)
+
+ result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
+ tm.assert_index_equal(result.columns, Index(['A', 'B']))
+ tm.assert_index_equal(result.index, df.index)
+
+ # add a nuisance column
+ sorted_columns, _ = columns.sortlevel(0)
+ df['A', 'foo'] = 'bar'
+ result = df.groupby(level=0).mean()
+ tm.assert_index_equal(result.columns, df.columns[:-1])
+
+
+def test_grouping_ndarray(df):
+ grouped = df.groupby(df['A'].values)
+
+ result = grouped.sum()
+ expected = df.groupby('A').sum()
+    # Note: no names when grouping by value
+    assert_frame_equal(result, expected, check_names=False)
+
+
+def test_groupby_wrong_multi_labels():
+ data = """index,foo,bar,baz,spam,data
+0,foo1,bar1,baz1,spam2,20
+1,foo1,bar2,baz1,spam3,30
+2,foo2,bar2,baz1,spam2,40
+3,foo1,bar1,baz2,spam1,50
+4,foo3,bar1,baz2,spam1,60"""
+
+ data = read_csv(StringIO(data), index_col=0)
+
+ grouped = data.groupby(['foo', 'bar', 'baz', 'spam'])
+
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_series_with_name(df):
+ result = df.groupby(df['A']).mean()
+ result2 = df.groupby(df['A'], as_index=False).mean()
+ assert result.index.name == 'A'
+ assert 'A' in result2
+
+ result = df.groupby([df['A'], df['B']]).mean()
+ result2 = df.groupby([df['A'], df['B']],
+ as_index=False).mean()
+ assert result.index.names == ('A', 'B')
+ assert 'A' in result2
+ assert 'B' in result2
+
+
+def test_seriesgroupby_name_attr(df):
+ # GH 6265
+ result = df.groupby('A')['C']
+ assert result.count().name == 'C'
+ assert result.mean().name == 'C'
+
+ testFunc = lambda x: np.sum(x) * 2
+ assert result.agg(testFunc).name == 'C'
+
+
+def test_consistency_name():
+ # GH 12363
+
+ df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'two',
+ 'two', 'two', 'one', 'two'],
+ 'C': np.random.randn(8) + 1.0,
+ 'D': np.arange(8)})
+
+ expected = df.groupby(['A']).B.count()
+ result = df.B.groupby(df.A).count()
+ assert_series_equal(result, expected)
+
+
+def test_groupby_name_propagation(df):
+ # GH 6124
+ def summarize(df, name=None):
+ return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name)
+
+ def summarize_random_name(df):
+ # Provide a different name for each Series. In this case, groupby
+ # should not attempt to propagate the Series name since they are
+ # inconsistent.
+ return Series({
+ 'count': 1,
+ 'mean': 2,
+ 'omissions': 3,
+ }, name=df.iloc[0]['A'])
+
+ metrics = df.groupby('A').apply(summarize)
+ assert metrics.columns.name is None
+ metrics = df.groupby('A').apply(summarize, 'metrics')
+ assert metrics.columns.name == 'metrics'
+ metrics = df.groupby('A').apply(summarize_random_name)
+ assert metrics.columns.name is None
+
+
+def test_groupby_nonstring_columns():
+ df = DataFrame([np.arange(10) for x in range(10)])
+ grouped = df.groupby(0)
+ result = grouped.mean()
+ expected = df.groupby(df[0]).mean()
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_mixed_type_columns():
+ # GH 13432, unorderable types in py3
+ df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0])
+ expected = DataFrame([[1, 2]], columns=['B', 0],
+ index=Index([0], name='A'))
+
+ result = df.groupby('A').first()
+ tm.assert_frame_equal(result, expected)
+
+ result = df.groupby('A').sum()
+ tm.assert_frame_equal(result, expected)
+
+
+# TODO: Ensure warning isn't emitted in the first place
[email protected]("ignore:Mean of:RuntimeWarning")
+def test_cython_grouper_series_bug_noncontig():
+ arr = np.empty((100, 100))
+ arr.fill(np.nan)
+ obj = Series(arr[:, 0], index=lrange(100))
+ inds = np.tile(lrange(10), 10)
+
+ result = obj.groupby(inds).agg(Series.median)
+ assert result.isna().all()
+
+
+def test_series_grouper_noncontig_index():
+ index = Index(tm.rands_array(10, 100))
+
+ values = Series(np.random.randn(50), index=index[::2])
+ labels = np.random.randint(0, 5, 50)
+
+ # it works!
+ grouped = values.groupby(labels)
+
+ # accessing the index elements causes segfault
+ f = lambda x: len(set(map(id, x.index)))
+ grouped.agg(f)
+
+
+def test_convert_objects_leave_decimal_alone():
+
+ s = Series(lrange(5))
+ labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')
+
+ def convert_fast(x):
+ return Decimal(str(x.mean()))
+
+ def convert_force_pure(x):
+ # base will be length 0
+ assert (len(x.values.base) > 0)
+ return Decimal(str(x.mean()))
+
+ grouped = s.groupby(labels)
+
+ result = grouped.agg(convert_fast)
+ assert result.dtype == np.object_
+ assert isinstance(result[0], Decimal)
+
+ result = grouped.agg(convert_force_pure)
+ assert result.dtype == np.object_
+ assert isinstance(result[0], Decimal)
+
+
+def test_groupby_dtype_inference_empty():
+ # GH 6733
+ df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')})
+ assert df['x'].dtype == np.float64
+
+ result = df.groupby('x').first()
+ exp_index = Index([], name='x', dtype=np.float64)
+ expected = DataFrame({'range': Series(
+ [], index=exp_index, dtype='int64')})
+ assert_frame_equal(result, expected, by_blocks=True)
+
+
+def test_groupby_list_infer_array_like(df):
+ result = df.groupby(list(df['A'])).mean()
+ expected = df.groupby(df['A']).mean()
+ assert_frame_equal(result, expected, check_names=False)
+
+ with pytest.raises(KeyError, match=r"^'foo'$"):
+ df.groupby(list(df['A'][:-1]))
+
+ # pathological case of ambiguity
+ df = DataFrame({'foo': [0, 1],
+ 'bar': [3, 4],
+ 'val': np.random.randn(2)})
+
+ result = df.groupby(['foo', 'bar']).mean()
+    expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
+    assert_frame_equal(result, expected)
+
+
+def test_groupby_keys_same_size_as_index():
+ # GH 11185
+ freq = 's'
+ index = pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'),
+ periods=2, freq=freq)
+    df = pd.DataFrame([['A', 10], ['B', 15]],
+                      columns=['metric', 'values'], index=index)
+ result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean()
+ expected = df.set_index([df.index, 'metric'])
+
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_one_row():
+ # GH 11741
+ msg = r"^'Z'$"
+ df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD'))
+ with pytest.raises(KeyError, match=msg):
+ df1.groupby('Z')
+ df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD'))
+ with pytest.raises(KeyError, match=msg):
+ df2.groupby('Z')
+
+
+def test_groupby_nat_exclude():
+ # GH 6992
+    df = pd.DataFrame(
+        {'values': np.random.randn(8),
+         'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan,
+                pd.Timestamp('2013-02-01'), np.nan,
+                pd.Timestamp('2013-02-01'), np.nan,
+                pd.Timestamp('2013-01-01')],
+         'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']})
+ grouped = df.groupby('dt')
+
+ expected = [pd.Index([1, 7]), pd.Index([3, 5])]
+ keys = sorted(grouped.groups.keys())
+ assert len(keys) == 2
+ for k, e in zip(keys, expected):
+ # grouped.groups keys are np.datetime64 with system tz
+ # not to be affected by tz, only compare values
+ tm.assert_index_equal(grouped.groups[k], e)
+
+ # confirm obj is not filtered
+ tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
+ assert grouped.ngroups == 2
+
+ expected = {
+ Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64),
+ Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64)
+ }
+
+ for k in grouped.indices:
+ tm.assert_numpy_array_equal(grouped.indices[k], expected[k])
+
+ tm.assert_frame_equal(
+ grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
+ tm.assert_frame_equal(
+ grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])
+
+ with pytest.raises(KeyError, match=r"^NaT$"):
+ grouped.get_group(pd.NaT)
+
+ nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
+ 'nat': [pd.NaT, pd.NaT, pd.NaT]})
+ assert nan_df['nan'].dtype == 'float64'
+ assert nan_df['nat'].dtype == 'datetime64[ns]'
+
+ for key in ['nan', 'nat']:
+ grouped = nan_df.groupby(key)
+ assert grouped.groups == {}
+ assert grouped.ngroups == 0
+ assert grouped.indices == {}
+ with pytest.raises(KeyError, match=r"^nan$"):
+ grouped.get_group(np.nan)
+ with pytest.raises(KeyError, match=r"^NaT$"):
+ grouped.get_group(pd.NaT)
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+def test_sparse_friendly(df):
+ sdf = df[['C', 'D']].to_sparse()
+ panel = tm.makePanel()
+ tm.add_nans(panel)
+
+ def _check_work(gp):
+ gp.mean()
+ gp.agg(np.mean)
+ dict(iter(gp))
+
+ # it works!
+ _check_work(sdf.groupby(lambda x: x // 2))
+ _check_work(sdf['C'].groupby(lambda x: x // 2))
+ _check_work(sdf.groupby(df['A']))
+
+ # do this someday
+ # _check_work(panel.groupby(lambda x: x.month, axis=1))
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+def test_panel_groupby():
+ panel = tm.makePanel()
+ tm.add_nans(panel)
+ grouped = panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1},
+ axis='items')
+ agged = grouped.mean()
+ agged2 = grouped.agg(lambda x: x.mean('items'))
+
+ tm.assert_panel_equal(agged, agged2)
+
+ tm.assert_index_equal(agged.items, Index([0, 1]))
+
+ grouped = panel.groupby(lambda x: x.month, axis='major')
+ agged = grouped.mean()
+
+ exp = Index(sorted(list(set(panel.major_axis.month))))
+ tm.assert_index_equal(agged.major_axis, exp)
+
+ grouped = panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
+ axis='minor')
+ agged = grouped.mean()
+ tm.assert_index_equal(agged.minor_axis, Index([0, 1]))
+
+
+def test_groupby_2d_malformed():
+ d = DataFrame(index=lrange(2))
+ d['group'] = ['g1', 'g2']
+ d['zeros'] = [0, 0]
+ d['ones'] = [1, 1]
+ d['label'] = ['l1', 'l2']
+ tmp = d.groupby(['group']).mean()
+ res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
+ tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones']))
+ tm.assert_numpy_array_equal(tmp.values, res_values)
+
+
+def test_int32_overflow():
+    B = np.concatenate((np.arange(10000), np.arange(10000),
+                        np.arange(5000)))
+ A = np.arange(25000)
+ df = DataFrame({'A': A,
+ 'B': B,
+ 'C': A,
+ 'D': B,
+ 'E': np.random.randn(25000)})
+
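+    # the product of the four key cardinalities (25000 * 10000 * 25000
+    # * 10000) far exceeds 2**31, so an int32 group index would overflow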
+ left = df.groupby(['A', 'B', 'C', 'D']).sum()
+ right = df.groupby(['D', 'C', 'B', 'A']).sum()
+ assert len(left) == len(right)
+
+
+def test_groupby_sort_multi():
+ df = DataFrame({'a': ['foo', 'bar', 'baz'],
+ 'b': [3, 2, 1],
+ 'c': [0, 1, 2],
+ 'd': np.random.randn(3)})
+
+ tups = lmap(tuple, df[['a', 'b', 'c']].values)
+ tups = com.asarray_tuplesafe(tups)
+ result = df.groupby(['a', 'b', 'c'], sort=True).sum()
+ tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])
+
+ tups = lmap(tuple, df[['c', 'a', 'b']].values)
+ tups = com.asarray_tuplesafe(tups)
+ result = df.groupby(['c', 'a', 'b'], sort=True).sum()
+ tm.assert_numpy_array_equal(result.index.values, tups)
+
+ tups = lmap(tuple, df[['b', 'c', 'a']].values)
+ tups = com.asarray_tuplesafe(tups)
+ result = df.groupby(['b', 'c', 'a'], sort=True).sum()
+ tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])
+
+ df = DataFrame({'a': [0, 1, 2, 0, 1, 2],
+ 'b': [0, 0, 0, 1, 1, 1],
+ 'd': np.random.randn(6)})
+ grouped = df.groupby(['a', 'b'])['d']
+ result = grouped.sum()
+
+ def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
+ tups = lmap(tuple, df[keys].values)
+ tups = com.asarray_tuplesafe(tups)
+ expected = f(df.groupby(tups)[field])
+ for k, v in compat.iteritems(expected):
+ assert (result[k] == v)
+
+ _check_groupby(df, result, ['a', 'b'], 'd')
+
+
+def test_dont_clobber_name_column():
+ df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
+ 'name': ['foo', 'bar', 'baz'] * 2})
+
+ result = df.groupby('key').apply(lambda x: x)
+ assert_frame_equal(result, df)
+
+
+def test_skip_group_keys():
+
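+    # with group_keys=False the group labels are not prepended to the
+    # result index, so concatenating the per-group pieces matches directly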
+ tsf = tm.makeTimeDataFrame()
+
+ grouped = tsf.groupby(lambda x: x.month, group_keys=False)
+ result = grouped.apply(lambda x: x.sort_values(by='A')[:3])
+
+ pieces = [group.sort_values(by='A')[:3] for key, group in grouped]
+
+ expected = pd.concat(pieces)
+ assert_frame_equal(result, expected)
+
+ grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False)
+ result = grouped.apply(lambda x: x.sort_values()[:3])
+
+ pieces = [group.sort_values()[:3] for key, group in grouped]
+
+ expected = pd.concat(pieces)
+ assert_series_equal(result, expected)
+
+
+def test_no_nonsense_name(frame):
+ # GH #995
+ s = frame['C'].copy()
+ s.name = None
+
+ result = s.groupby(frame['A']).agg(np.sum)
+ assert result.name is None
+
+
+def test_multifunc_sum_bug():
+ # GH #1065
+ x = DataFrame(np.arange(9).reshape(3, 3))
+ x['test'] = 0
+ x['fl'] = [1.3, 1.5, 1.6]
+
+ grouped = x.groupby('test')
+ result = grouped.agg({'fl': 'sum', 2: 'size'})
+ assert result['fl'].dtype == np.float64
+
+
+def test_handle_dict_return_value(df):
+ def f(group):
+ return {'max': group.max(), 'min': group.min()}
+
+ def g(group):
+ return Series({'max': group.max(), 'min': group.min()})
+
+ result = df.groupby('A')['C'].apply(f)
+ expected = df.groupby('A')['C'].apply(g)
+
+ assert isinstance(result, Series)
+ assert_series_equal(result, expected)
+
+
[email protected]('grouper', ['A', ['A', 'B']])
+def test_set_group_name(df, grouper):
+ def f(group):
+ assert group.name is not None
+ return group
+
+ def freduce(group):
+ assert group.name is not None
+ return group.sum()
+
+ def foo(x):
+ return freduce(x)
+
+ grouped = df.groupby(grouper)
+
+ # make sure all these work
+ grouped.apply(f)
+ grouped.aggregate(freduce)
+ grouped.aggregate({'C': freduce, 'D': freduce})
+ grouped.transform(f)
+
+ grouped['C'].apply(f)
+ grouped['C'].aggregate(freduce)
+ grouped['C'].aggregate([freduce, foo])
+ grouped['C'].transform(f)
+
+
+def test_group_name_available_in_inference_pass():
+ # gh-15062
+ df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
+
+ names = []
+
+ def f(group):
+ names.append(group.name)
+ return group.copy()
+
+ df.groupby('a', sort=False, group_keys=False).apply(f)
+ # we expect 2 zeros because we call ``f`` once to see if a faster route
+ # can be used.
+ expected_names = [0, 0, 1, 2]
+ assert names == expected_names
+
+
+def test_no_dummy_key_names(df):
+ # see gh-1291
+ result = df.groupby(df['A'].values).sum()
+ assert result.index.name is None
+
+ result = df.groupby([df['A'].values, df['B'].values]).sum()
+ assert result.index.names == (None, None)
+
+
+def test_groupby_sort_multiindex_series():
+ # series multiindex groupby sort argument was not being passed through
+ # _compress_group_index
+ # GH 9444
+ index = MultiIndex(levels=[[1, 2], [1, 2]],
+ codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
+ names=['a', 'b'])
+ mseries = Series([0, 1, 2, 3, 4, 5], index=index)
+ index = MultiIndex(levels=[[1, 2], [1, 2]],
+ codes=[[0, 0, 1], [1, 0, 0]], names=['a', 'b'])
+ mseries_result = Series([0, 2, 4], index=index)
+
+ result = mseries.groupby(level=['a', 'b'], sort=False).first()
+ assert_series_equal(result, mseries_result)
+ result = mseries.groupby(level=['a', 'b'], sort=True).first()
+ assert_series_equal(result, mseries_result.sort_index())
+
+
+def test_groupby_reindex_inside_function():
+
+ periods = 1000
+ ind = date_range(start='2012/1/1', freq='5min', periods=periods)
+ df = DataFrame({'high': np.arange(
+ periods), 'low': np.arange(periods)}, index=ind)
+
+ def agg_before(hour, func, fix=False):
+ """
+ Run an aggregate func on the subset of data.
+ """
+
+ def _func(data):
+ d = data.loc[data.index.map(
+ lambda x: x.hour < 11)].dropna()
+ if fix:
+ data[data.index[0]]
+ if len(d) == 0:
+ return None
+ return func(d)
+
+ return _func
+
+ def afunc(data):
+ d = data.select(lambda x: x.hour < 11).dropna()
+ return np.max(d)
+
+ grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
+ closure_bad = grouped.agg({'high': agg_before(11, np.max)})
+ closure_good = grouped.agg({'high': agg_before(11, np.max, True)})
+
+ assert_frame_equal(closure_bad, closure_good)
+
+
+def test_groupby_multiindex_missing_pair():
+ # GH9049
+ df = DataFrame({'group1': ['a', 'a', 'a', 'b'],
+ 'group2': ['c', 'c', 'd', 'c'],
+ 'value': [1, 1, 1, 5]})
+ df = df.set_index(['group1', 'group2'])
+ df_grouped = df.groupby(level=['group1', 'group2'], sort=True)
+
+ res = df_grouped.agg('sum')
+ idx = MultiIndex.from_tuples(
+ [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2'])
+ exp = DataFrame([[2], [1], [5]], index=idx, columns=['value'])
+
+ tm.assert_frame_equal(res, exp)
+
+
+def test_groupby_multiindex_not_lexsorted():
+ # GH 11640
+
+ # define the lexsorted version
+ lexsorted_mi = MultiIndex.from_tuples(
+ [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
+ lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
+ assert lexsorted_df.columns.is_lexsorted()
+
+ # define the non-lexsorted version
+ not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
+ data=[[1, 'b1', 'c1', 3],
+ [1, 'b2', 'c2', 4]])
+ not_lexsorted_df = not_lexsorted_df.pivot_table(
+ index='a', columns=['b', 'c'], values='d')
+ not_lexsorted_df = not_lexsorted_df.reset_index()
+ assert not not_lexsorted_df.columns.is_lexsorted()
+
+ # compare the results
+ tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)
+
+ expected = lexsorted_df.groupby('a').mean()
+ with tm.assert_produces_warning(PerformanceWarning):
+ result = not_lexsorted_df.groupby('a').mean()
+ tm.assert_frame_equal(expected, result)
+
+ # a transforming function should work regardless of sort
+ # GH 14776
+ df = DataFrame({'x': ['a', 'a', 'b', 'a'],
+ 'y': [1, 1, 2, 2],
+ 'z': [1, 2, 3, 4]}).set_index(['x', 'y'])
+ assert not df.index.is_lexsorted()
+
+ for level in [0, 1, [0, 1]]:
+ for sort in [False, True]:
+ result = df.groupby(level=level, sort=sort).apply(
+ DataFrame.drop_duplicates)
+ expected = df
+ tm.assert_frame_equal(expected, result)
+
+ result = df.sort_index().groupby(level=level, sort=sort).apply(
+ DataFrame.drop_duplicates)
+ expected = df.sort_index()
+ tm.assert_frame_equal(expected, result)
+
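+# Background for the warning above (sketch): label-based operations on a
+# MultiIndex whose codes are not lexsorted fall back to a slower path, which
+# pandas surfaces as a PerformanceWarning; ``columns.is_lexsorted()`` is the
+# cheap check used here to tell the two layouts apart.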
+
+def test_index_label_overlaps_location():
+    # checking we don't have any label/location confusion in the
+    # wake of GH5375
+ df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])
+ g = df.groupby(list('ababb'))
+ actual = g.filter(lambda x: len(x) > 2)
+ expected = df.iloc[[1, 3, 4]]
+ assert_frame_equal(actual, expected)
+
+ ser = df[0]
+ g = ser.groupby(list('ababb'))
+ actual = g.filter(lambda x: len(x) > 2)
+ expected = ser.take([1, 3, 4])
+ assert_series_equal(actual, expected)
+
+ # ... and again, with a generic Index of floats
+ df.index = df.index.astype(float)
+ g = df.groupby(list('ababb'))
+ actual = g.filter(lambda x: len(x) > 2)
+ expected = df.iloc[[1, 3, 4]]
+ assert_frame_equal(actual, expected)
+
+ ser = df[0]
+ g = ser.groupby(list('ababb'))
+ actual = g.filter(lambda x: len(x) > 2)
+ expected = ser.take([1, 3, 4])
+ assert_series_equal(actual, expected)
+
+
+def test_transform_doesnt_clobber_ints():
+ # GH 7972
+ n = 6
+ x = np.arange(n)
+ df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x})
+ df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x})
+
+ gb = df.groupby('a')
+ result = gb.transform('mean')
+
+ gb2 = df2.groupby('a')
+ expected = gb2.transform('mean')
+ tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings',
+ ['ints', 'floats'],
+ ['ints', 'strings']])
+@pytest.mark.parametrize('group_column', ['int_groups', 'string_groups',
+ ['int_groups', 'string_groups']])
+def test_groupby_preserves_sort(sort_column, group_column):
+ # Test to ensure that groupby always preserves sort order of original
+ # object. Issue #8588 and #9651
+
+ df = DataFrame(
+ {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3],
+ 'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'],
+ 'ints': [8, 7, 4, 5, 2, 9, 1, 1],
+ 'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
+ 'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']})
+
+ # Try sorting on different types and with different group types
+
+ df = df.sort_values(by=sort_column)
+ g = df.groupby(group_column)
+
+ def test_sort(x):
+ assert_frame_equal(x, x.sort_values(by=sort_column))
+ g.apply(test_sort)
+
+
+def test_group_shift_with_null_key():
+ # This test is designed to replicate the segfault in issue #13813.
+ n_rows = 1200
+
+ # Generate a moderately large dataframe with occasional missing
+ # values in column `B`, and then group by [`A`, `B`]. This should
+ # force `-1` in `labels` array of `g.grouper.group_info` exactly
+ # at those places, where the group-by key is partially missing.
+ df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i)
+ for i in range(n_rows)], dtype=float,
+ columns=["A", "B", "Z"], index=None)
+ g = df.groupby(["A", "B"])
+
+ expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12
+ else np.nan)
+ for i in range(n_rows)], dtype=float,
+ columns=["Z"], index=None)
+ result = g.shift(-1)
+
+ assert_frame_equal(result, expected)
+
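+# Sketch of the label layout described above (illustrative only):
+#
+# d = DataFrame({'A': [1.0, np.nan], 'Z': [0.0, 1.0]})
+# labels, _, _ = d.groupby(['A', 'Z']).grouper.group_info
+# # labels -> array([0, -1]): the row whose key is partially missing is
+# # tagged -1 and excluded from every group, so shift() leaves NaN there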
+
+def test_group_shift_with_fill_value():
+ # GH #24128
+ n_rows = 24
+ df = DataFrame([(i % 12, i % 3, i)
+ for i in range(n_rows)], dtype=float,
+ columns=["A", "B", "Z"], index=None)
+ g = df.groupby(["A", "B"])
+
+ expected = DataFrame([(i + 12 if i < n_rows - 12
+ else 0)
+ for i in range(n_rows)], dtype=float,
+ columns=["Z"], index=None)
+ result = g.shift(-1, fill_value=0)[["Z"]]
+
+ assert_frame_equal(result, expected)
+
+
+def test_pivot_table_values_key_error():
+ # This test is designed to replicate the error in issue #14938
+ df = pd.DataFrame({'eventDate':
+ pd.date_range(pd.datetime.today(),
+ periods=20, freq='M').tolist(),
+ 'thename': range(0, 20)})
+
+ df['year'] = df.set_index('eventDate').index.year
+ df['month'] = df.set_index('eventDate').index.month
+
+ with pytest.raises(KeyError, match="'badname'"):
+ df.reset_index().pivot_table(index='year', columns='month',
+ values='badname', aggfunc='count')
+
+
+def test_empty_dataframe_groupby():
+ # GH8093
+ df = DataFrame(columns=['A', 'B', 'C'])
+
+ result = df.groupby('A').sum()
+ expected = DataFrame(columns=['B', 'C'], dtype=np.float64)
+ expected.index.name = 'A'
+
+ assert_frame_equal(result, expected)
+
+
+def test_tuple_warns():
+ # https://github.com/pandas-dev/pandas/issues/18314
+ df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
+ 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]})
+ with tm.assert_produces_warning(FutureWarning) as w:
+ df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean()
+
+ assert "Interpreting tuple 'by' as a list" in str(w[0].message)
+
+ with tm.assert_produces_warning(None):
+ df.groupby(('a', 'b')).c.mean()
+
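+# What the two calls above pin down (sketch): a tuple passed as ``by`` is
+# kept as a single key when it is an actual column label -- ('a', 'b') is a
+# column of ``df`` but not of the ``df[['a', 'b', 'c']]`` subset -- and is
+# otherwise expanded like a list of keys, with a FutureWarning.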
+
+def test_tuple_warns_unhashable():
+ # https://github.com/pandas-dev/pandas/issues/18314
+ business_dates = date_range(start='4/1/2014', end='6/30/2014',
+ freq='B')
+ df = DataFrame(1, index=business_dates, columns=['a', 'b'])
+
+ with tm.assert_produces_warning(FutureWarning) as w:
+ df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])
+
+ assert "Interpreting tuple 'by' as a list" in str(w[0].message)
+
+
+def test_tuple_correct_keyerror():
+ # https://github.com/pandas-dev/pandas/issues/18798
+ df = pd.DataFrame(1, index=range(3),
+ columns=pd.MultiIndex.from_product([[1, 2],
+ [3, 4]]))
+ with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
+ df.groupby((7, 8)).mean()
+
+
+def test_groupby_agg_ohlc_non_first():
+ # GH 21716
+ df = pd.DataFrame([[1], [1]], columns=['foo'],
+ index=pd.date_range('2018-01-01', periods=2, freq='D'))
+
+ expected = pd.DataFrame([
+ [1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1]
+ ], columns=pd.MultiIndex.from_tuples((
+ ('foo', 'ohlc', 'open'), ('foo', 'ohlc', 'high'),
+ ('foo', 'ohlc', 'low'), ('foo', 'ohlc', 'close'),
+ ('foo', 'sum', 'foo'))), index=pd.date_range(
+ '2018-01-01', periods=2, freq='D'))
+
+ result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc'])
+
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_grouping.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_grouping.py
new file mode 100644
index 00000000000..a509a7cb57c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_grouping.py
@@ -0,0 +1,838 @@
+# -*- coding: utf-8 -*-
+
+""" test where we are determining what we are grouping, or getting groups """
+
+import numpy as np
+import pytest
+
+from pandas.compat import long, lrange
+
+import pandas as pd
+from pandas import (
+ CategoricalIndex, DataFrame, Index, MultiIndex, Series, Timestamp, compat,
+ date_range)
+from pandas.core.groupby.grouper import Grouping
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_panel_equal,
+ assert_series_equal)
+
+# selection
+# --------------------------------
+
+
+class TestSelection(object):
+
+ def test_select_bad_cols(self):
+ df = DataFrame([[1, 2]], columns=['A', 'B'])
+ g = df.groupby('A')
+ with pytest.raises(KeyError, match='"Columns not found: \'C\'"'):
+ g[['C']]
+
+ with pytest.raises(KeyError, match='^[^A]+$'):
+ # A should not be referenced as a bad column...
+ # will have to rethink regex if you change message!
+ g[['A', 'C']]
+
+ def test_groupby_duplicated_column_errormsg(self):
+ # GH7511
+ df = DataFrame(columns=['A', 'B', 'A', 'C'],
+ data=[range(4), range(2, 6), range(0, 8, 2)])
+
+ msg = "Grouper for 'A' not 1-dimensional"
+ with pytest.raises(ValueError, match=msg):
+ df.groupby('A')
+ with pytest.raises(ValueError, match=msg):
+ df.groupby(['A', 'B'])
+
+ grouped = df.groupby('B')
+ c = grouped.count()
+ assert c.columns.nlevels == 1
+ assert c.columns.size == 3
+
+ def test_column_select_via_attr(self, df):
+ result = df.groupby('A').C.sum()
+ expected = df.groupby('A')['C'].sum()
+ assert_series_equal(result, expected)
+
+ df['mean'] = 1.5
+ result = df.groupby('A').mean()
+ expected = df.groupby('A').agg(np.mean)
+ assert_frame_equal(result, expected)
+
+ def test_getitem_list_of_columns(self):
+ df = DataFrame(
+ {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
+ 'C': np.random.randn(8),
+ 'D': np.random.randn(8),
+ 'E': np.random.randn(8)})
+
+ result = df.groupby('A')[['C', 'D']].mean()
+ result2 = df.groupby('A')['C', 'D'].mean()
+ result3 = df.groupby('A')[df.columns[2:4]].mean()
+
+ expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean()
+
+ assert_frame_equal(result, expected)
+ assert_frame_equal(result2, expected)
+ assert_frame_equal(result3, expected)
+
+ def test_getitem_numeric_column_names(self):
+ # GH #13731
+ df = DataFrame({0: list('abcd') * 2,
+ 2: np.random.randn(8),
+ 4: np.random.randn(8),
+ 6: np.random.randn(8)})
+ result = df.groupby(0)[df.columns[1:3]].mean()
+ result2 = df.groupby(0)[2, 4].mean()
+ result3 = df.groupby(0)[[2, 4]].mean()
+
+ expected = df.loc[:, [0, 2, 4]].groupby(0).mean()
+
+ assert_frame_equal(result, expected)
+ assert_frame_equal(result2, expected)
+ assert_frame_equal(result3, expected)
+
+
+# grouping
+# --------------------------------
+
+class TestGrouping():
+
+ def test_grouper_index_types(self):
+ # related GH5375
+ # groupby misbehaving when using a Floatlike index
+ df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
+ for index in [tm.makeFloatIndex, tm.makeStringIndex,
+ tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex,
+ tm.makePeriodIndex]:
+
+ df.index = index(len(df))
+ df.groupby(list('abcde')).apply(lambda x: x)
+
+ df.index = list(reversed(df.index.tolist()))
+ df.groupby(list('abcde')).apply(lambda x: x)
+
+ def test_grouper_multilevel_freq(self):
+
+ # GH 7885
+ # with level and freq specified in a pd.Grouper
+ from datetime import date, timedelta
+ d0 = date.today() - timedelta(days=14)
+ dates = date_range(d0, date.today())
+ date_index = pd.MultiIndex.from_product(
+ [dates, dates], names=['foo', 'bar'])
+ df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)
+
+ # Check string level
+ expected = df.reset_index().groupby([pd.Grouper(
+ key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum()
+ # reset index changes columns dtype to object
+ expected.columns = pd.Index([0], dtype='int64')
+
+ result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper(
+ level='bar', freq='W')]).sum()
+ assert_frame_equal(result, expected)
+
+ # Check integer level
+ result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper(
+ level=1, freq='W')]).sum()
+ assert_frame_equal(result, expected)
+
+ def test_grouper_creation_bug(self):
+
+ # GH 8795
+ df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
+ g = df.groupby('A')
+ expected = g.sum()
+
+ g = df.groupby(pd.Grouper(key='A'))
+ result = g.sum()
+ assert_frame_equal(result, expected)
+
+ result = g.apply(lambda x: x.sum())
+ assert_frame_equal(result, expected)
+
+ g = df.groupby(pd.Grouper(key='A', axis=0))
+ result = g.sum()
+ assert_frame_equal(result, expected)
+
+ # GH14334
+ # pd.Grouper(key=...) may be passed in a list
+ df = DataFrame({'A': [0, 0, 0, 1, 1, 1],
+ 'B': [1, 1, 2, 2, 3, 3],
+ 'C': [1, 2, 3, 4, 5, 6]})
+ # Group by single column
+ expected = df.groupby('A').sum()
+ g = df.groupby([pd.Grouper(key='A')])
+ result = g.sum()
+ assert_frame_equal(result, expected)
+
+ # Group by two columns
+ # using a combination of strings and Grouper objects
+ expected = df.groupby(['A', 'B']).sum()
+
+ # Group with two Grouper objects
+ g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')])
+ result = g.sum()
+ assert_frame_equal(result, expected)
+
+ # Group with a string and a Grouper object
+ g = df.groupby(['A', pd.Grouper(key='B')])
+ result = g.sum()
+ assert_frame_equal(result, expected)
+
+ # Group with a Grouper object and a string
+ g = df.groupby([pd.Grouper(key='A'), 'B'])
+ result = g.sum()
+ assert_frame_equal(result, expected)
+
+ # GH8866
+ s = Series(np.arange(8, dtype='int64'),
+ index=pd.MultiIndex.from_product(
+ [list('ab'), range(2),
+ date_range('20130101', periods=2)],
+ names=['one', 'two', 'three']))
+ result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
+ expected = Series([28], index=Index(
+ [Timestamp('2013-01-31')], freq='M', name='three'))
+ assert_series_equal(result, expected)
+
+ # just specifying a level breaks
+ result = s.groupby(pd.Grouper(level='one')).sum()
+ expected = s.groupby(level='one').sum()
+ assert_series_equal(result, expected)
+
+ def test_grouper_column_and_index(self):
+ # GH 14327
+
+ # Grouping a multi-index frame by a column and an index level should
+ # be equivalent to resetting the index and grouping by two columns
+ idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
+ ('b', 1), ('b', 2), ('b', 3)])
+ idx.names = ['outer', 'inner']
+ df_multi = pd.DataFrame({"A": np.arange(6),
+ 'B': ['one', 'one', 'two',
+ 'two', 'one', 'one']},
+ index=idx)
+ result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean()
+ expected = df_multi.reset_index().groupby(['B', 'inner']).mean()
+ assert_frame_equal(result, expected)
+
+ # Test the reverse grouping order
+ result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean()
+ expected = df_multi.reset_index().groupby(['inner', 'B']).mean()
+ assert_frame_equal(result, expected)
+
+ # Grouping a single-index frame by a column and the index should
+ # be equivalent to resetting the index and grouping by two columns
+ df_single = df_multi.reset_index('outer')
+ result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean()
+ expected = df_single.reset_index().groupby(['B', 'inner']).mean()
+ assert_frame_equal(result, expected)
+
+ # Test the reverse grouping order
+ result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean()
+ expected = df_single.reset_index().groupby(['inner', 'B']).mean()
+ assert_frame_equal(result, expected)
+
+ def test_groupby_levels_and_columns(self):
+ # GH9344, GH9049
+ idx_names = ['x', 'y']
+ idx = pd.MultiIndex.from_tuples(
+ [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
+ df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)
+
+ by_levels = df.groupby(level=idx_names).mean()
+ # reset_index changes columns dtype to object
+ by_columns = df.reset_index().groupby(idx_names).mean()
+
+ tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)
+
+ by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
+ tm.assert_frame_equal(by_levels, by_columns)
+
+ def test_groupby_categorical_index_and_columns(self, observed):
+ # GH18432
+ columns = ['A', 'B', 'A', 'B']
+ categories = ['B', 'A']
+ data = np.ones((5, 4), int)
+ cat_columns = CategoricalIndex(columns,
+ categories=categories,
+ ordered=True)
+ df = DataFrame(data=data, columns=cat_columns)
+ result = df.groupby(axis=1, level=0, observed=observed).sum()
+ expected_data = 2 * np.ones((5, 2), int)
+
+ if observed:
+        # when observed=False the result is reindexed to the full
+        # category order ('B', 'A'); when observed=True there is no
+        # reindex, so the expected columns differ only in their order
+ expected_columns = CategoricalIndex(['A', 'B'],
+ categories=categories,
+ ordered=True)
+ else:
+ expected_columns = CategoricalIndex(categories,
+ categories=categories,
+ ordered=True)
+ expected = DataFrame(data=expected_data, columns=expected_columns)
+ assert_frame_equal(result, expected)
+
+ # test transposed version
+ df = DataFrame(data.T, index=cat_columns)
+ result = df.groupby(axis=0, level=0, observed=observed).sum()
+ expected = DataFrame(data=expected_data.T, index=expected_columns)
+ assert_frame_equal(result, expected)
+
+ def test_grouper_getting_correct_binner(self):
+
+ # GH 10063
+ # using a non-time-based grouper and a time-based grouper
+ # and specifying levels
+ df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product(
+ [list('ab'), date_range('20130101', periods=80)], names=['one',
+ 'two']))
+ result = df.groupby([pd.Grouper(level='one'), pd.Grouper(
+ level='two', freq='M')]).sum()
+ expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]},
+ index=MultiIndex.from_product(
+ [list('ab'),
+ date_range('20130101', freq='M', periods=3)],
+ names=['one', 'two']))
+ assert_frame_equal(result, expected)
+
+ def test_grouper_iter(self, df):
+ assert sorted(df.groupby('A').grouper) == ['bar', 'foo']
+
+ def test_empty_groups(self, df):
+ # see gh-1048
+ with pytest.raises(ValueError, match="No group keys passed!"):
+ df.groupby([])
+
+ def test_groupby_grouper(self, df):
+ grouped = df.groupby('A')
+
+ result = df.groupby(grouped.grouper).mean()
+ expected = grouped.mean()
+ tm.assert_frame_equal(result, expected)
+
+ def test_groupby_dict_mapping(self):
+ # GH #679
+ from pandas import Series
+ s = Series({'T1': 5})
+ result = s.groupby({'T1': 'T2'}).agg(sum)
+ expected = s.groupby(['T2']).agg(sum)
+ assert_series_equal(result, expected)
+
+ s = Series([1., 2., 3., 4.], index=list('abcd'))
+ mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}
+
+ result = s.groupby(mapping).mean()
+ result2 = s.groupby(mapping).agg(np.mean)
+ expected = s.groupby([0, 0, 1, 1]).mean()
+ expected2 = s.groupby([0, 0, 1, 1]).mean()
+ assert_series_equal(result, expected)
+ assert_series_equal(result, result2)
+ assert_series_equal(result, expected2)
+
+ def test_groupby_grouper_f_sanity_checked(self):
+ dates = date_range('01-Jan-2013', periods=12, freq='MS')
+ ts = Series(np.random.randn(12), index=dates)
+
+ # GH3035
+ # index.map is used to apply grouper to the index
+ # if it fails on the elements, map tries it on the entire index as
+ # a sequence. That can yield invalid results that cause trouble
+ # down the line.
+        # the surprise comes from using key[0:6] rather than str(key)[0:6]
+ # when the elements are Timestamp.
+ # the result is Index[0:6], very confusing.
+
+ msg = r"Grouper result violates len\(labels\) == len\(data\)"
+ with pytest.raises(AssertionError, match=msg):
+ ts.groupby(lambda key: key[0:6])
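+        # rough illustration of the fallback (not executed here):
+        # ``Timestamp('2013-01-01')[0:6]`` raises, so ``Index.map`` retries
+        # the function on the index as a whole -- ``dates[0:6]`` -- returning
+        # six labels for twelve rows, which trips the
+        # len(labels) == len(data) assertion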
+
+ def test_grouping_error_on_multidim_input(self, df):
+ msg = ("Grouper for '<class 'pandas.core.frame.DataFrame'>'"
+ " not 1-dimensional")
+ with pytest.raises(ValueError, match=msg):
+ Grouping(df.index, df[['A', 'A']])
+
+ def test_multiindex_passthru(self):
+
+ # GH 7997
+ # regression from 0.14.1
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+ df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])
+
+ result = df.groupby(axis=1, level=[0, 1]).first()
+ assert_frame_equal(result, df)
+
+ def test_multiindex_negative_level(self, mframe):
+ # GH 13901
+ result = mframe.groupby(level=-1).sum()
+ expected = mframe.groupby(level='second').sum()
+ assert_frame_equal(result, expected)
+
+ result = mframe.groupby(level=-2).sum()
+ expected = mframe.groupby(level='first').sum()
+ assert_frame_equal(result, expected)
+
+ result = mframe.groupby(level=[-2, -1]).sum()
+ expected = mframe
+ assert_frame_equal(result, expected)
+
+ result = mframe.groupby(level=[-1, 'first']).sum()
+ expected = mframe.groupby(level=['second', 'first']).sum()
+ assert_frame_equal(result, expected)
+
+ def test_multifunc_select_col_integer_cols(self, df):
+ df.columns = np.arange(len(df.columns))
+
+ # it works!
+ df.groupby(1, as_index=False)[2].agg({'Q': np.mean})
+
+ def test_multiindex_columns_empty_level(self):
+ lst = [['count', 'values'], ['to filter', '']]
+ midx = MultiIndex.from_tuples(lst)
+
+ df = DataFrame([[long(1), 'A']], columns=midx)
+
+ grouped = df.groupby('to filter').groups
+ assert grouped['A'] == [0]
+
+ grouped = df.groupby([('to filter', '')]).groups
+ assert grouped['A'] == [0]
+
+ df = DataFrame([[long(1), 'A'], [long(2), 'B']], columns=midx)
+
+ expected = df.groupby('to filter').groups
+ result = df.groupby([('to filter', '')]).groups
+ assert result == expected
+
+ df = DataFrame([[long(1), 'A'], [long(2), 'A']], columns=midx)
+
+ expected = df.groupby('to filter').groups
+ result = df.groupby([('to filter', '')]).groups
+ tm.assert_dict_equal(result, expected)
+
+ def test_groupby_multiindex_tuple(self):
+ # GH 17979
+ df = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
+ columns=pd.MultiIndex.from_arrays(
+ [['a', 'b', 'b', 'c'],
+ [1, 1, 2, 2]]))
+ expected = df.groupby([('b', 1)]).groups
+ result = df.groupby(('b', 1)).groups
+ tm.assert_dict_equal(expected, result)
+
+ df2 = pd.DataFrame(df.values,
+ columns=pd.MultiIndex.from_arrays(
+ [['a', 'b', 'b', 'c'],
+ ['d', 'd', 'e', 'e']]))
+ expected = df2.groupby([('b', 'd')]).groups
+ result = df.groupby(('b', 1)).groups
+ tm.assert_dict_equal(expected, result)
+
+ df3 = pd.DataFrame(df.values,
+ columns=[('a', 'd'), ('b', 'd'), ('b', 'e'), 'c'])
+ expected = df3.groupby([('b', 'd')]).groups
+ result = df.groupby(('b', 1)).groups
+ tm.assert_dict_equal(expected, result)
+
+ @pytest.mark.parametrize('sort', [True, False])
+ def test_groupby_level(self, sort, mframe, df):
+ # GH 17537
+ frame = mframe
+ deleveled = frame.reset_index()
+
+ result0 = frame.groupby(level=0, sort=sort).sum()
+ result1 = frame.groupby(level=1, sort=sort).sum()
+
+ expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum()
+ expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum()
+
+ expected0.index.name = 'first'
+ expected1.index.name = 'second'
+
+ assert result0.index.name == 'first'
+ assert result1.index.name == 'second'
+
+ assert_frame_equal(result0, expected0)
+ assert_frame_equal(result1, expected1)
+ assert result0.index.name == frame.index.names[0]
+ assert result1.index.name == frame.index.names[1]
+
+ # groupby level name
+ result0 = frame.groupby(level='first', sort=sort).sum()
+ result1 = frame.groupby(level='second', sort=sort).sum()
+ assert_frame_equal(result0, expected0)
+ assert_frame_equal(result1, expected1)
+
+ # axis=1
+
+ result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
+ result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
+ assert_frame_equal(result0, expected0.T)
+ assert_frame_equal(result1, expected1.T)
+
+ # raise exception for non-MultiIndex
+ msg = "level > 0 or level < -1 only valid with MultiIndex"
+ with pytest.raises(ValueError, match=msg):
+ df.groupby(level=1)
+
+ def test_groupby_level_index_names(self):
+ # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
+ df = DataFrame({'exp': ['A'] * 3 + ['B'] * 3,
+ 'var1': lrange(6), }).set_index('exp')
+ df.groupby(level='exp')
+ msg = "level name foo is not the name of the index"
+ with pytest.raises(ValueError, match=msg):
+ df.groupby(level='foo')
+
+ @pytest.mark.parametrize('sort', [True, False])
+ def test_groupby_level_with_nas(self, sort):
+ # GH 17537
+ index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
+ codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1,
+ 2, 3]])
+
+ # factorizing doesn't confuse things
+ s = Series(np.arange(8.), index=index)
+ result = s.groupby(level=0, sort=sort).sum()
+ expected = Series([6., 22.], index=[0, 1])
+ assert_series_equal(result, expected)
+
+ index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
+ codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0,
+ 1, 2, 3]])
+
+ # factorizing doesn't confuse things
+ s = Series(np.arange(8.), index=index)
+ result = s.groupby(level=0, sort=sort).sum()
+ expected = Series([6., 18.], index=[0.0, 1.0])
+ assert_series_equal(result, expected)
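+        # arithmetic check for the second case (sketch): codes
+        # [1, 1, 1, 1, -1, 0, 0, 0] map rows 0-3 to level value 0
+        # (0+1+2+3 == 6) and rows 5-7 to level value 1 (5+6+7 == 18); row 4
+        # carries the -1 code, so its value 4.0 is dropped from both groups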
+
+ def test_groupby_args(self, mframe):
+ # PR8618 and issue 8015
+ frame = mframe
+
+ msg = "You have to supply one of 'by' and 'level'"
+ with pytest.raises(TypeError, match=msg):
+ frame.groupby()
+
+ msg = "You have to supply one of 'by' and 'level'"
+ with pytest.raises(TypeError, match=msg):
+ frame.groupby(by=None, level=None)
+
+ @pytest.mark.parametrize('sort,labels', [
+ [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
+ [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]]
+ ])
+ def test_level_preserve_order(self, sort, labels, mframe):
+ # GH 17537
+ grouped = mframe.groupby(level=0, sort=sort)
+ exp_labels = np.array(labels, np.intp)
+ assert_almost_equal(grouped.grouper.labels[0], exp_labels)
+
+ def test_grouping_labels(self, mframe):
+ grouped = mframe.groupby(mframe.index.get_level_values(0))
+ exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
+ assert_almost_equal(grouped.grouper.labels[0], exp_labels)
+
+ def test_list_grouper_with_nat(self):
+ # GH 14715
+ df = pd.DataFrame({'date': pd.date_range('1/1/2011',
+ periods=365, freq='D')})
+ df.iloc[-1] = pd.NaT
+ grouper = pd.Grouper(key='date', freq='AS')
+
+ # Grouper in a list grouping
+ result = df.groupby([grouper])
+ expected = {pd.Timestamp('2011-01-01'): pd.Index(list(range(364)))}
+ tm.assert_dict_equal(result.groups, expected)
+
+ # Test case without a list
+ result = df.groupby(grouper)
+ expected = {pd.Timestamp('2011-01-01'): 365}
+ tm.assert_dict_equal(result.groups, expected)
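+        # note the asymmetry pinned down above (sketch): wrapped in a list,
+        # the Grouper drops the NaT row and reports an Index of 364
+        # locations; bare, it takes the resample-style path whose ``groups``
+        # maps the bin label to a plain end position of 365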
+
+
+# get_group
+# --------------------------------
+
+class TestGetGroup():
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_get_group(self):
+ wp = tm.makePanel()
+ grouped = wp.groupby(lambda x: x.month, axis='major')
+
+ gp = grouped.get_group(1)
+ expected = wp.reindex(
+ major=[x for x in wp.major_axis if x.month == 1])
+ assert_panel_equal(gp, expected)
+
+ # GH 5267
+ # be datelike friendly
+ df = DataFrame({'DATE': pd.to_datetime(
+ ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013',
+ '11-Oct-2013', '11-Oct-2013']),
+ 'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'],
+ 'VAL': [1, 2, 3, 4, 5, 6]})
+
+ g = df.groupby('DATE')
+ key = list(g.groups)[0]
+ result1 = g.get_group(key)
+ result2 = g.get_group(Timestamp(key).to_pydatetime())
+ result3 = g.get_group(str(Timestamp(key)))
+ assert_frame_equal(result1, result2)
+ assert_frame_equal(result1, result3)
+
+ g = df.groupby(['DATE', 'label'])
+
+ key = list(g.groups)[0]
+ result1 = g.get_group(key)
+ result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1]))
+ result3 = g.get_group((str(Timestamp(key[0])), key[1]))
+ assert_frame_equal(result1, result2)
+ assert_frame_equal(result1, result3)
+
+ # must pass a same-length tuple with multiple keys
+ msg = "must supply a tuple to get_group with multiple grouping keys"
+ with pytest.raises(ValueError, match=msg):
+ g.get_group('foo')
+ with pytest.raises(ValueError, match=msg):
+ g.get_group(('foo'))
+ msg = ("must supply a same-length tuple to get_group with multiple"
+ " grouping keys")
+ with pytest.raises(ValueError, match=msg):
+ g.get_group(('foo', 'bar', 'baz'))
+
+ def test_get_group_empty_bins(self, observed):
+
+ d = pd.DataFrame([3, 1, 7, 6])
+ bins = [0, 5, 10, 15]
+ g = d.groupby(pd.cut(d[0], bins), observed=observed)
+
+        # TODO: should probably allow a str form of an Interval as well,
+        # IOW '(0, 5]'
+ result = g.get_group(pd.Interval(0, 5))
+ expected = DataFrame([3, 1], index=[0, 1])
+ assert_frame_equal(result, expected)
+
+ msg = r"Interval\(10, 15, closed='right'\)"
+ with pytest.raises(KeyError, match=msg):
+ g.get_group(pd.Interval(10, 15))
+
+ def test_get_group_grouped_by_tuple(self):
+ # GH 8121
+ df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T
+ gr = df.groupby('ids')
+ expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2])
+ result = gr.get_group((1, ))
+ assert_frame_equal(result, expected)
+
+ dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01',
+ '2010-01-02'])
+ df = DataFrame({'ids': [(x, ) for x in dt]})
+ gr = df.groupby('ids')
+ result = gr.get_group(('2010-01-01', ))
+ expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2])
+ assert_frame_equal(result, expected)
+
+ def test_groupby_with_empty(self):
+ index = pd.DatetimeIndex(())
+ data = ()
+ series = pd.Series(data, index)
+ grouper = pd.Grouper(freq='D')
+ grouped = series.groupby(grouper)
+ assert next(iter(grouped), None) is None
+
+ def test_groupby_with_single_column(self):
+ df = pd.DataFrame({'a': list('abssbab')})
+ tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]])
+ # GH 13530
+ exp = pd.DataFrame([], index=pd.Index(['a', 'b', 's'], name='a'))
+ tm.assert_frame_equal(df.groupby('a').count(), exp)
+ tm.assert_frame_equal(df.groupby('a').sum(), exp)
+ tm.assert_frame_equal(df.groupby('a').nth(1), exp)
+
+ def test_gb_key_len_equal_axis_len(self):
+ # GH16843
+ # test ensures that index and column keys are recognized correctly
+ # when number of keys equals axis length of groupby
+ df = pd.DataFrame([['foo', 'bar', 'B', 1],
+ ['foo', 'bar', 'B', 2],
+ ['foo', 'baz', 'C', 3]],
+ columns=['first', 'second', 'third', 'one'])
+ df = df.set_index(['first', 'second'])
+ df = df.groupby(['first', 'second', 'third']).size()
+ assert df.loc[('foo', 'bar', 'B')] == 2
+ assert df.loc[('foo', 'baz', 'C')] == 1
+
+
+# groups & iteration
+# --------------------------------
+
+class TestIteration():
+
+ def test_groups(self, df):
+ grouped = df.groupby(['A'])
+ groups = grouped.groups
+ assert groups is grouped.groups # caching works
+
+ for k, v in compat.iteritems(grouped.groups):
+ assert (df.loc[v]['A'] == k).all()
+
+ grouped = df.groupby(['A', 'B'])
+ groups = grouped.groups
+ assert groups is grouped.groups # caching works
+
+ for k, v in compat.iteritems(grouped.groups):
+ assert (df.loc[v]['A'] == k[0]).all()
+ assert (df.loc[v]['B'] == k[1]).all()
+
+ def test_grouping_is_iterable(self, tsframe):
+ # this code path isn't used anywhere else
+ # not sure it's useful
+ grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year])
+
+ # test it works
+ for g in grouped.grouper.groupings[0]:
+ pass
+
+ def test_multi_iter(self):
+ s = Series(np.arange(6))
+ k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
+ k2 = np.array(['1', '2', '1', '2', '1', '2'])
+
+ grouped = s.groupby([k1, k2])
+
+ iterated = list(grouped)
+ expected = [('a', '1', s[[0, 2]]), ('a', '2', s[[1]]),
+ ('b', '1', s[[4]]), ('b', '2', s[[3, 5]])]
+ for i, ((one, two), three) in enumerate(iterated):
+ e1, e2, e3 = expected[i]
+ assert e1 == one
+ assert e2 == two
+ assert_series_equal(three, e3)
+
+ def test_multi_iter_frame(self, three_group):
+ k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
+ k2 = np.array(['1', '2', '1', '2', '1', '2'])
+ df = DataFrame({'v1': np.random.randn(6),
+ 'v2': np.random.randn(6),
+ 'k1': k1, 'k2': k2},
+ index=['one', 'two', 'three', 'four', 'five', 'six'])
+
+ grouped = df.groupby(['k1', 'k2'])
+
+ # things get sorted!
+ iterated = list(grouped)
+ idx = df.index
+ expected = [('a', '1', df.loc[idx[[4]]]),
+ ('a', '2', df.loc[idx[[3, 5]]]),
+ ('b', '1', df.loc[idx[[0, 2]]]),
+ ('b', '2', df.loc[idx[[1]]])]
+ for i, ((one, two), three) in enumerate(iterated):
+ e1, e2, e3 = expected[i]
+ assert e1 == one
+ assert e2 == two
+ assert_frame_equal(three, e3)
+
+ # don't iterate through groups with no data
+ df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
+ df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
+ grouped = df.groupby(['k1', 'k2'])
+ groups = {key: gp for key, gp in grouped}
+ assert len(groups) == 2
+
+ # axis = 1
+ three_levels = three_group.groupby(['A', 'B', 'C']).mean()
+ grouped = three_levels.T.groupby(axis=1, level=(1, 2))
+ for key, group in grouped:
+ pass
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_multi_iter_panel(self):
+ wp = tm.makePanel()
+ grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()],
+ axis=1)
+
+ for (month, wd), group in grouped:
+ exp_axis = [x
+ for x in wp.major_axis
+ if x.month == month and x.weekday() == wd]
+ expected = wp.reindex(major=exp_axis)
+ assert_panel_equal(group, expected)
+
+ def test_dictify(self, df):
+ dict(iter(df.groupby('A')))
+ dict(iter(df.groupby(['A', 'B'])))
+ dict(iter(df['C'].groupby(df['A'])))
+ dict(iter(df['C'].groupby([df['A'], df['B']])))
+ dict(iter(df.groupby('A')['C']))
+ dict(iter(df.groupby(['A', 'B'])['C']))
+
+ def test_groupby_with_small_elem(self):
+ # GH 8542
+ # length=2
+ df = pd.DataFrame({'event': ['start', 'start'],
+ 'change': [1234, 5678]},
+ index=pd.DatetimeIndex(['2014-09-10', '2013-10-10']))
+ grouped = df.groupby([pd.Grouper(freq='M'), 'event'])
+ assert len(grouped.groups) == 2
+ assert grouped.ngroups == 2
+ assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups
+ assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups
+
+ res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
+ tm.assert_frame_equal(res, df.iloc[[0], :])
+ res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
+ tm.assert_frame_equal(res, df.iloc[[1], :])
+
+ df = pd.DataFrame({'event': ['start', 'start', 'start'],
+ 'change': [1234, 5678, 9123]},
+ index=pd.DatetimeIndex(['2014-09-10', '2013-10-10',
+ '2014-09-15']))
+ grouped = df.groupby([pd.Grouper(freq='M'), 'event'])
+ assert len(grouped.groups) == 2
+ assert grouped.ngroups == 2
+ assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups
+ assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups
+
+ res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
+ tm.assert_frame_equal(res, df.iloc[[0, 2], :])
+ res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
+ tm.assert_frame_equal(res, df.iloc[[1], :])
+
+ # length=3
+ df = pd.DataFrame({'event': ['start', 'start', 'start'],
+ 'change': [1234, 5678, 9123]},
+ index=pd.DatetimeIndex(['2014-09-10', '2013-10-10',
+ '2014-08-05']))
+ grouped = df.groupby([pd.Grouper(freq='M'), 'event'])
+ assert len(grouped.groups) == 3
+ assert grouped.ngroups == 3
+ assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups
+ assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups
+ assert (pd.Timestamp('2014-08-31'), 'start') in grouped.groups
+
+ res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
+ tm.assert_frame_equal(res, df.iloc[[0], :])
+ res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
+ tm.assert_frame_equal(res, df.iloc[[1], :])
+ res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start'))
+ tm.assert_frame_equal(res, df.iloc[[2], :])
+
+ def test_grouping_string_repr(self):
+ # GH 13394
+ mi = MultiIndex.from_arrays([list("AAB"), list("aba")])
+ df = DataFrame([[1, 2, 3]], columns=mi)
+ gr = df.groupby(df[('A', 'a')])
+
+ result = gr.grouper.groupings[0].__repr__()
+ expected = "Grouping(('A', 'a'))"
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_index_as_string.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_index_as_string.py
new file mode 100644
index 00000000000..141381f8430
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_index_as_string.py
@@ -0,0 +1,68 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+@pytest.fixture(params=[['inner'], ['inner', 'outer']])
+def frame(request):
+ levels = request.param
+ df = pd.DataFrame({'outer': ['a', 'a', 'a', 'b', 'b', 'b'],
+ 'inner': [1, 2, 3, 1, 2, 3],
+ 'A': np.arange(6),
+ 'B': ['one', 'one', 'two', 'two', 'one', 'one']})
+ if levels:
+ df = df.set_index(levels)
+
+ return df
+
+
+@pytest.fixture()
+def series():
+ df = pd.DataFrame({'outer': ['a', 'a', 'a', 'b', 'b', 'b'],
+ 'inner': [1, 2, 3, 1, 2, 3],
+ 'A': np.arange(6),
+ 'B': ['one', 'one', 'two', 'two', 'one', 'one']})
+ s = df.set_index(['outer', 'inner', 'B'])['A']
+
+ return s
+
+
+@pytest.mark.parametrize('key_strs,groupers', [
+ ('inner', # Index name
+ pd.Grouper(level='inner')
+ ),
+ (['inner'], # List of index name
+ [pd.Grouper(level='inner')]
+ ),
+ (['B', 'inner'], # Column and index
+ ['B', pd.Grouper(level='inner')]
+ ),
+ (['inner', 'B'], # Index and column
+ [pd.Grouper(level='inner'), 'B'])])
+def test_grouper_index_level_as_string(frame, key_strs, groupers):
+ result = frame.groupby(key_strs).mean()
+ expected = frame.groupby(groupers).mean()
+ assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize('levels', [
+ 'inner', 'outer', 'B',
+ ['inner'], ['outer'], ['B'],
+ ['inner', 'outer'], ['outer', 'inner'],
+ ['inner', 'outer', 'B'], ['B', 'outer', 'inner']
+])
+def test_grouper_index_level_as_string_series(series, levels):
+
+ # Compute expected result
+ if isinstance(levels, list):
+ groupers = [pd.Grouper(level=lv) for lv in levels]
+ else:
+ groupers = pd.Grouper(level=levels)
+
+ expected = series.groupby(groupers).mean()
+
+ # Compute and check result
+ result = series.groupby(levels).mean()
+ assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_nth.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_nth.py
new file mode 100644
index 00000000000..255d9a8acf2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_nth.py
@@ -0,0 +1,416 @@
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna
+from pandas.util.testing import (
+ assert_frame_equal, assert_produces_warning, assert_series_equal)
+
+
+def test_first_last_nth(df):
+ # tests for first / last / nth
+ grouped = df.groupby('A')
+ first = grouped.first()
+ expected = df.loc[[1, 0], ['B', 'C', 'D']]
+ expected.index = Index(['bar', 'foo'], name='A')
+ expected = expected.sort_index()
+ assert_frame_equal(first, expected)
+
+ nth = grouped.nth(0)
+ assert_frame_equal(nth, expected)
+
+ last = grouped.last()
+ expected = df.loc[[5, 7], ['B', 'C', 'D']]
+ expected.index = Index(['bar', 'foo'], name='A')
+ assert_frame_equal(last, expected)
+
+ nth = grouped.nth(-1)
+ assert_frame_equal(nth, expected)
+
+ nth = grouped.nth(1)
+ expected = df.loc[[2, 3], ['B', 'C', 'D']].copy()
+ expected.index = Index(['foo', 'bar'], name='A')
+ expected = expected.sort_index()
+ assert_frame_equal(nth, expected)
+
+ # it works!
+ grouped['B'].first()
+ grouped['B'].last()
+ grouped['B'].nth(0)
+
+ df.loc[df['A'] == 'foo', 'B'] = np.nan
+ assert isna(grouped['B'].first()['foo'])
+ assert isna(grouped['B'].last()['foo'])
+ assert isna(grouped['B'].nth(0)['foo'])
+
+ # v0.14.0 whatsnew
+ df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+ g = df.groupby('A')
+ result = g.first()
+ expected = df.iloc[[1, 2]].set_index('A')
+ assert_frame_equal(result, expected)
+
+ expected = df.iloc[[1, 2]].set_index('A')
+ result = g.nth(0, dropna='any')
+ assert_frame_equal(result, expected)
+
+
+def test_first_last_nth_dtypes(df_mixed_floats):
+
+ df = df_mixed_floats.copy()
+ df['E'] = True
+ df['F'] = 1
+
+ # tests for first / last / nth
+ grouped = df.groupby('A')
+ first = grouped.first()
+ expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']]
+ expected.index = Index(['bar', 'foo'], name='A')
+ expected = expected.sort_index()
+ assert_frame_equal(first, expected)
+
+ last = grouped.last()
+ expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']]
+ expected.index = Index(['bar', 'foo'], name='A')
+ expected = expected.sort_index()
+ assert_frame_equal(last, expected)
+
+ nth = grouped.nth(1)
+ expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']]
+ expected.index = Index(['bar', 'foo'], name='A')
+ expected = expected.sort_index()
+ assert_frame_equal(nth, expected)
+
+ # GH 2763, first/last shifting dtypes
+ idx = lrange(10)
+ idx.append(9)
+ s = Series(data=lrange(11), index=idx, name='IntCol')
+ assert s.dtype == 'int64'
+ f = s.groupby(level=0).first()
+ assert f.dtype == 'int64'
+
+
+def test_nth():
+ df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+ g = df.groupby('A')
+
+ assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
+ assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
+ assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
+ assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
+ assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
+ assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
+ assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
+ assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
+ assert_frame_equal(g[['B']].nth(0),
+ df.loc[[0, 2], ['A', 'B']].set_index('A'))
+
+ exp = df.set_index('A')
+ assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
+ assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])
+
+ exp['B'] = np.nan
+ assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
+ assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])
+
+ # out of bounds, regression from 0.13.1
+ # GH 6621
+ df = DataFrame({'color': {0: 'green',
+ 1: 'green',
+ 2: 'red',
+ 3: 'red',
+ 4: 'red'},
+ 'food': {0: 'ham',
+ 1: 'eggs',
+ 2: 'eggs',
+ 3: 'ham',
+ 4: 'pork'},
+ 'two': {0: 1.5456590000000001,
+ 1: -0.070345000000000005,
+ 2: -2.4004539999999999,
+ 3: 0.46206000000000003,
+ 4: 0.52350799999999997},
+ 'one': {0: 0.56573799999999996,
+ 1: -0.9742360000000001,
+ 2: 1.033801,
+ 3: -0.78543499999999999,
+ 4: 0.70422799999999997}}).set_index(['color',
+ 'food'])
+
+ result = df.groupby(level=0, as_index=False).nth(2)
+ expected = df.iloc[[-1]]
+ assert_frame_equal(result, expected)
+
+ result = df.groupby(level=0, as_index=False).nth(3)
+ expected = df.loc[[]]
+ assert_frame_equal(result, expected)
+
+ # GH 7559
+ # from the vbench
+ df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
+ s = df[1]
+ g = df[0]
+ expected = s.groupby(g).first()
+ expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
+ assert_series_equal(expected2, expected, check_names=False)
+ assert expected.name == 1
+ assert expected2.name == 1
+
+ # validate first
+ v = s[g == 1].iloc[0]
+ assert expected.iloc[0] == v
+ assert expected2.iloc[0] == v
+
+    # this is NOT the same as .first (since sort=True is the default!):
+    # it keeps the order of appearance in the series, not the sorted
+    # group order
+    # related GH 7287
+ expected = s.groupby(g, sort=False).first()
+ result = s.groupby(g, sort=False).nth(0, dropna='all')
+ assert_series_equal(result, expected)
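+    # sketch of the distinction (not asserted): with keys [2, 1, 2],
+    # s.groupby(g).first() is indexed [1, 2] (sorted group order), while
+    # s.groupby(g, sort=False).nth(0, dropna='all') is indexed [2, 1]
+    # (order of first appearance)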
+
+ # doc example
+ df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+ g = df.groupby('A')
+ # PR 17493, related to issue 11038
+ # test Series.nth with True for dropna produces FutureWarning
+ with assert_produces_warning(FutureWarning):
+ result = g.B.nth(0, dropna=True)
+ expected = g.B.first()
+ assert_series_equal(result, expected)
+
+ # test multiple nth values
+ df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
+ columns=['A', 'B'])
+ g = df.groupby('A')
+
+ assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
+ assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
+ assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
+ assert_frame_equal(
+ g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
+ assert_frame_equal(
+ g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
+ assert_frame_equal(
+ g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
+ assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
+ assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))
+
+ business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
+ freq='B')
+ df = DataFrame(1, index=business_dates, columns=['a', 'b'])
+ # get the first, fourth and last two business days for each month
+ key = [df.index.year, df.index.month]
+ result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
+ expected_dates = pd.to_datetime(
+ ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
+ '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
+ '2014/6/27', '2014/6/30'])
+ expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
+ assert_frame_equal(result, expected)
+
+
+def test_nth_multi_index(three_group):
+ # PR 9090, related to issue 8979
+ # test nth on MultiIndex, should match .first()
+ grouped = three_group.groupby(['A', 'B'])
+ result = grouped.nth(0)
+ expected = grouped.first()
+ assert_frame_equal(result, expected)
+
+
[email protected]('data, expected_first, expected_last', [
+ ({'id': ['A'],
+ 'time': Timestamp('2012-02-01 14:00:00',
+ tz='US/Central'),
+ 'foo': [1]},
+ {'id': ['A'],
+ 'time': Timestamp('2012-02-01 14:00:00',
+ tz='US/Central'),
+ 'foo': [1]},
+ {'id': ['A'],
+ 'time': Timestamp('2012-02-01 14:00:00',
+ tz='US/Central'),
+ 'foo': [1]}),
+ ({'id': ['A', 'B', 'A'],
+ 'time': [Timestamp('2012-01-01 13:00:00',
+ tz='America/New_York'),
+ Timestamp('2012-02-01 14:00:00',
+ tz='US/Central'),
+ Timestamp('2012-03-01 12:00:00',
+ tz='Europe/London')],
+ 'foo': [1, 2, 3]},
+ {'id': ['A', 'B'],
+ 'time': [Timestamp('2012-01-01 13:00:00',
+ tz='America/New_York'),
+ Timestamp('2012-02-01 14:00:00',
+ tz='US/Central')],
+ 'foo': [1, 2]},
+ {'id': ['A', 'B'],
+ 'time': [Timestamp('2012-03-01 12:00:00',
+ tz='Europe/London'),
+ Timestamp('2012-02-01 14:00:00',
+ tz='US/Central')],
+ 'foo': [3, 2]})
+])
+def test_first_last_tz(data, expected_first, expected_last):
+ # GH15884
+ # Test that the timezone is retained when calling first
+ # or last on groupby with as_index=False
+
+ df = DataFrame(data)
+
+ result = df.groupby('id', as_index=False).first()
+ expected = DataFrame(expected_first)
+ cols = ['id', 'time', 'foo']
+ assert_frame_equal(result[cols], expected[cols])
+
+ result = df.groupby('id', as_index=False)['time'].first()
+ assert_frame_equal(result, expected[['id', 'time']])
+
+ result = df.groupby('id', as_index=False).last()
+ expected = DataFrame(expected_last)
+ cols = ['id', 'time', 'foo']
+ assert_frame_equal(result[cols], expected[cols])
+
+ result = df.groupby('id', as_index=False)['time'].last()
+ assert_frame_equal(result, expected[['id', 'time']])
+
+
+def test_nth_multi_index_as_expected():
+ # PR 9090, related to issue 8979
+ # test nth on MultiIndex
+ three_group = DataFrame(
+ {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
+ 'foo', 'foo', 'foo'],
+ 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
+ 'two', 'two', 'one'],
+ 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
+ 'dull', 'shiny', 'shiny', 'shiny']})
+ grouped = three_group.groupby(['A', 'B'])
+ result = grouped.nth(0)
+ expected = DataFrame(
+ {'C': ['dull', 'dull', 'dull', 'dull']},
+ index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'],
+ ['one', 'two', 'one', 'two']],
+ names=['A', 'B']))
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_head_tail():
+ df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
+ g_as = df.groupby('A', as_index=True)
+ g_not_as = df.groupby('A', as_index=False)
+
+    # as_index=False, much easier
+ assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
+ assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))
+
+ empty_not_as = DataFrame(columns=df.columns,
+ index=pd.Index([], dtype=df.index.dtype))
+ empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype)
+ empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype)
+ assert_frame_equal(empty_not_as, g_not_as.head(0))
+ assert_frame_equal(empty_not_as, g_not_as.tail(0))
+ assert_frame_equal(empty_not_as, g_not_as.head(-1))
+ assert_frame_equal(empty_not_as, g_not_as.tail(-1))
+
+ assert_frame_equal(df, g_not_as.head(7)) # contains all
+ assert_frame_equal(df, g_not_as.tail(7))
+
+    # as_index=True (used to be different)
+ df_as = df
+
+ assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
+ assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))
+
+ empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
+ empty_as['A'] = empty_not_as['A'].astype(df.A.dtype)
+ empty_as['B'] = empty_not_as['B'].astype(df.B.dtype)
+ assert_frame_equal(empty_as, g_as.head(0))
+ assert_frame_equal(empty_as, g_as.tail(0))
+ assert_frame_equal(empty_as, g_as.head(-1))
+ assert_frame_equal(empty_as, g_as.tail(-1))
+
+ assert_frame_equal(df_as, g_as.head(7)) # contains all
+ assert_frame_equal(df_as, g_as.tail(7))
+
+ # test with selection
+ assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []])
+ assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
+ assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
+ assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]])
+
+ assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []])
+ assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
+ assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
+ assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]])
+
+
+def test_group_selection_cache():
+ # GH 12839 nth, head, and tail should return same result consistently
+ df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
+ expected = df.iloc[[0, 2]].set_index('A')
+
+ g = df.groupby('A')
+ result1 = g.head(n=2)
+ result2 = g.nth(0)
+ assert_frame_equal(result1, df)
+ assert_frame_equal(result2, expected)
+
+ g = df.groupby('A')
+ result1 = g.tail(n=2)
+ result2 = g.nth(0)
+ assert_frame_equal(result1, df)
+ assert_frame_equal(result2, expected)
+
+ g = df.groupby('A')
+ result1 = g.nth(0)
+ result2 = g.head(n=2)
+ assert_frame_equal(result1, expected)
+ assert_frame_equal(result2, df)
+
+ g = df.groupby('A')
+ result1 = g.nth(0)
+ result2 = g.tail(n=2)
+ assert_frame_equal(result1, expected)
+ assert_frame_equal(result2, df)
+
+
+def test_nth_empty():
+ # GH 16064
+ df = DataFrame(index=[0], columns=['a', 'b', 'c'])
+ result = df.groupby('a').nth(10)
+ expected = DataFrame(index=Index([], name='a'), columns=['b', 'c'])
+ assert_frame_equal(result, expected)
+
+ result = df.groupby(['a', 'b']).nth(10)
+ expected = DataFrame(index=MultiIndex([[], []], [[], []],
+ names=['a', 'b']),
+ columns=['c'])
+ assert_frame_equal(result, expected)
+
+
+def test_nth_column_order():
+ # GH 20760
+ # Check that nth preserves column order
+ df = DataFrame([[1, 'b', 100],
+ [1, 'a', 50],
+ [1, 'a', np.nan],
+ [2, 'c', 200],
+ [2, 'd', 150]],
+ columns=['A', 'C', 'B'])
+ result = df.groupby('A').nth(0)
+ expected = DataFrame([['b', 100.0],
+ ['c', 200.0]],
+ columns=['C', 'B'],
+ index=Index([1, 2], name='A'))
+ assert_frame_equal(result, expected)
+
+ result = df.groupby('A').nth(-1, dropna='any')
+ expected = DataFrame([['a', 50.0],
+ ['d', 150.0]],
+ columns=['C', 'B'],
+ index=Index([1, 2], name='A'))
+ assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_rank.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_rank.py
new file mode 100644
index 00000000000..9b0396bb530
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_rank.py
@@ -0,0 +1,306 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Series, concat
+from pandas.util import testing as tm
+
+
+def test_rank_apply():
+ lev1 = tm.rands_array(10, 100)
+ lev2 = tm.rands_array(10, 130)
+ lab1 = np.random.randint(0, 100, size=500)
+ lab2 = np.random.randint(0, 130, size=500)
+
+ df = DataFrame({'value': np.random.randn(500),
+ 'key1': lev1.take(lab1),
+ 'key2': lev2.take(lab2)})
+
+ result = df.groupby(['key1', 'key2']).value.rank()
+
+ expected = [piece.value.rank()
+ for key, piece in df.groupby(['key1', 'key2'])]
+ expected = concat(expected, axis=0)
+ expected = expected.reindex(result.index)
+ tm.assert_series_equal(result, expected)
+
+ result = df.groupby(['key1', 'key2']).value.rank(pct=True)
+
+ expected = [piece.value.rank(pct=True)
+ for key, piece in df.groupby(['key1', 'key2'])]
+ expected = concat(expected, axis=0)
+ expected = expected.reindex(result.index)
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("grps", [
+    ['qux'], ['qux', 'quux']])
+@pytest.mark.parametrize("vals", [
+    [2, 2, 8, 2, 6],
+    [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'),
+     pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
+     pd.Timestamp('2018-01-06')]])
+@pytest.mark.parametrize("ties_method,ascending,pct,exp", [
+ ('average', True, False, [2., 2., 5., 2., 4.]),
+ ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
+ ('average', False, False, [4., 4., 1., 4., 2.]),
+ ('average', False, True, [.8, .8, .2, .8, .4]),
+ ('min', True, False, [1., 1., 5., 1., 4.]),
+ ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
+ ('min', False, False, [3., 3., 1., 3., 2.]),
+ ('min', False, True, [.6, .6, .2, .6, .4]),
+ ('max', True, False, [3., 3., 5., 3., 4.]),
+ ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
+ ('max', False, False, [5., 5., 1., 5., 2.]),
+ ('max', False, True, [1., 1., .2, 1., .4]),
+ ('first', True, False, [1., 2., 5., 3., 4.]),
+ ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
+ ('first', False, False, [3., 4., 1., 5., 2.]),
+ ('first', False, True, [.6, .8, .2, 1., .4]),
+ ('dense', True, False, [1., 1., 3., 1., 2.]),
+ ('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]),
+ ('dense', False, False, [3., 3., 1., 3., 2.]),
+ ('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. / 3.]),
+])
+def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
+ key = np.repeat(grps, len(vals))
+ vals = vals * len(grps)
+ df = DataFrame({'key': key, 'val': vals})
+ result = df.groupby('key').rank(method=ties_method,
+ ascending=ascending, pct=pct)
+
+ exp_df = DataFrame(exp * len(grps), columns=['val'])
+ tm.assert_frame_equal(result, exp_df)
+
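+# Worked example for the first 'average' row above (sketch): in
+# vals == [2, 2, 8, 2, 6] the three ties at 2 occupy ranks 1-3 and share
+# their mean 2.0, 6 takes rank 4 and 8 rank 5, giving [2., 2., 5., 2., 4.];
+# the pct variants divide these ranks by the group size 5.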
+
+@pytest.mark.parametrize("grps", [
+    ['qux'], ['qux', 'quux']])
+@pytest.mark.parametrize("vals", [
+    [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf],
+])
+@pytest.mark.parametrize("ties_method,ascending,na_option,exp", [
+ ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
+ ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]),
+ ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]),
+ ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
+ ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]),
+ ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]),
+ ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]),
+ ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]),
+ ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]),
+ ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]),
+ ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]),
+ ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]),
+ ('max', True, 'keep', [2., 2., np.nan, 3., np.nan, 5., 5.]),
+ ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]),
+ ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]),
+ ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]),
+ ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]),
+ ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]),
+ ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]),
+ ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]),
+ ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]),
+ ('first', False, 'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]),
+ ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]),
+ ('first', False, 'bottom', [4., 5., 6., 3., 7., 1., 2.]),
+ ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]),
+ ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]),
+ ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]),
+ ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]),
+ ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]),
+ ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.])
+])
+def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
+ # GH 20561
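+ # -inf/inf sort below/above every finite value, while NaN placement
+ # is driven entirely by na_option ('keep', 'top', 'bottom')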
+ key = np.repeat(grps, len(vals))
+ vals = vals * len(grps)
+ df = DataFrame({'key': key, 'val': vals})
+ result = df.groupby('key').rank(method=ties_method,
+ ascending=ascending,
+ na_option=na_option)
+ exp_df = DataFrame(exp * len(grps), columns=['val'])
+ tm.assert_frame_equal(result, exp_df)
+
+
[email protected]("grps", [
+ ['qux'], ['qux', 'quux']])
[email protected]("vals", [
+ [2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
+ [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
+ pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
+ pd.Timestamp('2018-01-06'), np.nan, np.nan]
+])
[email protected]("ties_method,ascending,na_option,pct,exp", [
+ ('average', True, 'keep', False,
+ [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]),
+ ('average', True, 'keep', True,
+ [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]),
+ ('average', False, 'keep', False,
+ [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]),
+ ('average', False, 'keep', True,
+ [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]),
+ ('min', True, 'keep', False,
+ [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]),
+ ('min', True, 'keep', True,
+ [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
+ ('min', False, 'keep', False,
+ [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
+ ('min', False, 'keep', True,
+ [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
+ ('max', True, 'keep', False,
+ [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]),
+ ('max', True, 'keep', True,
+ [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
+ ('max', False, 'keep', False,
+ [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]),
+ ('max', False, 'keep', True,
+ [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
+ ('first', True, 'keep', False,
+ [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]),
+ ('first', True, 'keep', True,
+ [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
+ ('first', False, 'keep', False,
+ [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]),
+ ('first', False, 'keep', True,
+ [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
+ ('dense', True, 'keep', False,
+ [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]),
+ ('dense', True, 'keep', True,
+ [1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]),
+ ('dense', False, 'keep', False,
+ [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
+ ('dense', False, 'keep', True,
+ [3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. / 3., np.nan, np.nan]),
+ ('average', True, 'bottom', False, [2., 2., 7., 5., 2., 4., 7., 7.]),
+ ('average', True, 'bottom', True,
+ [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]),
+ ('average', False, 'bottom', False, [4., 4., 7., 1., 4., 2., 7., 7.]),
+ ('average', False, 'bottom', True,
+ [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]),
+ ('min', True, 'bottom', False, [1., 1., 6., 5., 1., 4., 6., 6.]),
+ ('min', True, 'bottom', True,
+ [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]),
+ ('min', False, 'bottom', False, [3., 3., 6., 1., 3., 2., 6., 6.]),
+ ('min', False, 'bottom', True,
+ [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]),
+ ('max', True, 'bottom', False, [3., 3., 8., 5., 3., 4., 8., 8.]),
+ ('max', True, 'bottom', True,
+ [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]),
+ ('max', False, 'bottom', False, [5., 5., 8., 1., 5., 2., 8., 8.]),
+ ('max', False, 'bottom', True,
+ [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]),
+ ('first', True, 'bottom', False, [1., 2., 6., 5., 3., 4., 7., 8.]),
+ ('first', True, 'bottom', True,
+ [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]),
+ ('first', False, 'bottom', False, [3., 4., 6., 1., 5., 2., 7., 8.]),
+ ('first', False, 'bottom', True,
+ [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]),
+ ('dense', True, 'bottom', False, [1., 1., 4., 3., 1., 2., 4., 4.]),
+ ('dense', True, 'bottom', True,
+ [0.25, 0.25, 1., 0.75, 0.25, 0.5, 1., 1.]),
+ ('dense', False, 'bottom', False, [3., 3., 4., 1., 3., 2., 4., 4.]),
+ ('dense', False, 'bottom', True,
+ [0.75, 0.75, 1., 0.25, 0.75, 0.5, 1., 1.])
+])
+def test_rank_args_missing(grps, vals, ties_method, ascending,
+ na_option, pct, exp):
+ key = np.repeat(grps, len(vals))
+ vals = vals * len(grps)
+ df = DataFrame({'key': key, 'val': vals})
+ result = df.groupby('key').rank(method=ties_method,
+ ascending=ascending,
+ na_option=na_option, pct=pct)
+
+ exp_df = DataFrame(exp * len(grps), columns=['val'])
+ tm.assert_frame_equal(result, exp_df)
+
+
[email protected]("pct,exp", [
+ (False, [3., 3., 3., 3., 3.]),
+ (True, [.6, .6, .6, .6, .6])])
+def test_rank_resets_each_group(pct, exp):
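+ # each group holds five identical values: the average rank is 3.0
+ # (0.6 with pct=True), and ranks restart in the second group rather
+ # than continuing from the first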
+ df = DataFrame(
+ {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'],
+ 'val': [1] * 10}
+ )
+ result = df.groupby('key').rank(pct=pct)
+ exp_df = DataFrame(exp * 2, columns=['val'])
+ tm.assert_frame_equal(result, exp_df)
+
+
+def test_rank_avg_even_vals():
+ df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4})
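+ # a four-way tie under the default method='average': each row gets
+ # the mean of ranks 1..4, i.e. 2.5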
+ result = df.groupby('key').rank()
+ exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val'])
+ tm.assert_frame_equal(result, exp_df)
+
+
[email protected]("ties_method", [
+ 'average', 'min', 'max', 'first', 'dense'])
[email protected]("ascending", [True, False])
[email protected]("na_option", ["keep", "top", "bottom"])
[email protected]("pct", [True, False])
[email protected]("vals", [
+ ['bar', 'bar', 'foo', 'bar', 'baz'],
+ ['bar', np.nan, 'foo', np.nan, 'baz']
+])
+def test_rank_object_raises(ties_method, ascending, na_option,
+ pct, vals):
+ df = DataFrame({'key': ['foo'] * 5, 'val': vals})
+
+ with pytest.raises(TypeError, match="not callable"):
+ df.groupby('key').rank(method=ties_method,
+ ascending=ascending,
+ na_option=na_option, pct=pct)
+
+
[email protected]("na_option", [True, "bad", 1])
[email protected]("ties_method", [
+ 'average', 'min', 'max', 'first', 'dense'])
[email protected]("ascending", [True, False])
[email protected]("pct", [True, False])
[email protected]("vals", [
+ ['bar', 'bar', 'foo', 'bar', 'baz'],
+ ['bar', np.nan, 'foo', np.nan, 'baz'],
+ [1, np.nan, 2, np.nan, 3]
+])
+def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
+ df = DataFrame({'key': ['foo'] * 5, 'val': vals})
+ msg = "na_option must be one of 'keep', 'top', or 'bottom'"
+
+ with pytest.raises(ValueError, match=msg):
+ df.groupby('key').rank(method=ties_method,
+ ascending=ascending,
+ na_option=na_option, pct=pct)
+
+
+def test_rank_empty_group():
+ # see gh-22519
+ column = "A"
+ df = DataFrame({
+ "A": [0, 1, 0],
+ "B": [1., np.nan, 2.]
+ })
+
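+ # group 1 contains only NaN, so there is nothing to rank there; the
+ # pct normalization must not divide by a zero count of valid rows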
+ result = df.groupby(column).B.rank(pct=True)
+ expected = Series([0.5, np.nan, 1.0], name="B")
+ tm.assert_series_equal(result, expected)
+
+ result = df.groupby(column).rank(pct=True)
+ expected = DataFrame({"B": [0.5, np.nan, 1.0]})
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("input_key,input_value,output_value", [
+ ([1, 2], [1, 1], [1.0, 1.0]),
+ ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
+ ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
+ ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan])
+])
+def test_rank_zero_div(input_key, input_value, output_value):
+ # GH 23666
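+ # with method='dense' and pct=True, the denominator is the number of
+ # distinct non-null values in the group; an all-NaN group must yield
+ # NaN instead of raising ZeroDivisionError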
+ df = DataFrame({"A": input_key, "B": input_value})
+
+ result = df.groupby("A").rank(method="dense", pct=True)
+ expected = DataFrame({"B": output_value})
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_timegrouper.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_timegrouper.py
new file mode 100644
index 00000000000..a2f2c1392b2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_timegrouper.py
@@ -0,0 +1,652 @@
+""" test with the TimeGrouper / grouping with datetimes """
+
+from datetime import datetime
+
+import numpy as np
+from numpy import nan
+import pytest
+import pytz
+
+from pandas.compat import StringIO
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range
+from pandas.core.groupby.ops import BinGrouper
+from pandas.util import testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestGroupBy(object):
+
+ def test_groupby_with_timegrouper(self):
+ # GH 4161
+ # TimeGrouper requires a sorted index
+ # also verifies that the resultant index has the correct name
+ df_original = DataFrame({
+ 'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
+ 'Quantity': [18, 3, 5, 1, 9, 3],
+ 'Date': [
+ datetime(2013, 9, 1, 13, 0),
+ datetime(2013, 9, 1, 13, 5),
+ datetime(2013, 10, 1, 20, 0),
+ datetime(2013, 10, 3, 10, 0),
+ datetime(2013, 12, 2, 12, 0),
+ datetime(2013, 9, 2, 14, 0),
+ ]
+ })
+
+ # GH 6908 change target column's order
+ df_reordered = df_original.sort_values(by='Quantity')
+
+ for df in [df_original, df_reordered]:
+ df = df.set_index(['Date'])
+
+ expected = DataFrame(
+ {'Quantity': 0},
+ index=date_range('20130901',
+ '20131205', freq='5D',
+ name='Date', closed='left'))
+ expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64')
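+ # only three of the 5-day bins contain data: 2013-09-01 (18+3+3=24),
+ # 2013-10-01 (5+1=6) and 2013-11-30 (9); every other bin sums to 0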
+
+ result1 = df.resample('5D').sum()
+ assert_frame_equal(result1, expected)
+
+ df_sorted = df.sort_index()
+ result2 = df_sorted.groupby(pd.Grouper(freq='5D')).sum()
+ assert_frame_equal(result2, expected)
+
+ result3 = df.groupby(pd.Grouper(freq='5D')).sum()
+ assert_frame_equal(result3, expected)
+
+ @pytest.mark.parametrize("should_sort", [True, False])
+ def test_groupby_with_timegrouper_methods(self, should_sort):
+ # GH 3881
+ # make sure the timegrouper API conforms
+
+ df = pd.DataFrame({
+ 'Branch': 'A A A A A B'.split(),
+ 'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
+ 'Quantity': [1, 3, 5, 8, 9, 3],
+ 'Date': [
+ datetime(2013, 1, 1, 13, 0),
+ datetime(2013, 1, 1, 13, 5),
+ datetime(2013, 10, 1, 20, 0),
+ datetime(2013, 10, 2, 10, 0),
+ datetime(2013, 12, 2, 12, 0),
+ datetime(2013, 12, 2, 14, 0),
+ ]
+ })
+
+ if should_sort:
+ df = df.sort_values(by='Quantity', ascending=False)
+
+ df = df.set_index('Date', drop=False)
+ g = df.groupby(pd.Grouper(freq='6M'))
+ assert g.group_keys
+
+ assert isinstance(g.grouper, BinGrouper)
+ groups = g.groups
+ assert isinstance(groups, dict)
+ assert len(groups) == 3
+
+ def test_timegrouper_with_reg_groups(self):
+
+ # GH 3794
+ # allow combination of timegrouper/regular groups
+
+ df_original = DataFrame({
+ 'Branch': 'A A A A A A A B'.split(),
+ 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
+ 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
+ 'Date': [
+ datetime(2013, 1, 1, 13, 0),
+ datetime(2013, 1, 1, 13, 5),
+ datetime(2013, 10, 1, 20, 0),
+ datetime(2013, 10, 2, 10, 0),
+ datetime(2013, 10, 1, 20, 0),
+ datetime(2013, 10, 2, 10, 0),
+ datetime(2013, 12, 2, 12, 0),
+ datetime(2013, 12, 2, 14, 0),
+ ]
+ }).set_index('Date')
+
+ df_sorted = df_original.sort_values(by='Quantity', ascending=False)
+
+ for df in [df_original, df_sorted]:
+ expected = DataFrame({
+ 'Buyer': 'Carl Joe Mark'.split(),
+ 'Quantity': [10, 18, 3],
+ 'Date': [
+ datetime(2013, 12, 31, 0, 0),
+ datetime(2013, 12, 31, 0, 0),
+ datetime(2013, 12, 31, 0, 0),
+ ]
+ }).set_index(['Date', 'Buyer'])
+
+ result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame({
+ 'Buyer': 'Carl Mark Carl Joe'.split(),
+ 'Quantity': [1, 3, 9, 18],
+ 'Date': [
+ datetime(2013, 1, 1, 0, 0),
+ datetime(2013, 1, 1, 0, 0),
+ datetime(2013, 7, 1, 0, 0),
+ datetime(2013, 7, 1, 0, 0),
+ ]
+ }).set_index(['Date', 'Buyer'])
+ result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
+ assert_frame_equal(result, expected)
+
+ df_original = DataFrame({
+ 'Branch': 'A A A A A A A B'.split(),
+ 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
+ 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
+ 'Date': [
+ datetime(2013, 10, 1, 13, 0),
+ datetime(2013, 10, 1, 13, 5),
+ datetime(2013, 10, 1, 20, 0),
+ datetime(2013, 10, 2, 10, 0),
+ datetime(2013, 10, 1, 20, 0),
+ datetime(2013, 10, 2, 10, 0),
+ datetime(2013, 10, 2, 12, 0),
+ datetime(2013, 10, 2, 14, 0),
+ ]
+ }).set_index('Date')
+
+ df_sorted = df_original.sort_values(by='Quantity', ascending=False)
+ for df in [df_original, df_sorted]:
+
+ expected = DataFrame({
+ 'Buyer': 'Carl Joe Mark Carl Joe'.split(),
+ 'Quantity': [6, 8, 3, 4, 10],
+ 'Date': [
+ datetime(2013, 10, 1, 0, 0),
+ datetime(2013, 10, 1, 0, 0),
+ datetime(2013, 10, 1, 0, 0),
+ datetime(2013, 10, 2, 0, 0),
+ datetime(2013, 10, 2, 0, 0),
+ ]
+ }).set_index(['Date', 'Buyer'])
+
+ result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
+ assert_frame_equal(result, expected)
+
+ result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
+ expected = DataFrame({
+ 'Buyer': 'Carl Joe Mark'.split(),
+ 'Quantity': [10, 18, 3],
+ 'Date': [
+ datetime(2013, 10, 31, 0, 0),
+ datetime(2013, 10, 31, 0, 0),
+ datetime(2013, 10, 31, 0, 0),
+ ]
+ }).set_index(['Date', 'Buyer'])
+ assert_frame_equal(result, expected)
+
+ # passing the name
+ df = df.reset_index()
+ result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
+ ]).sum()
+ assert_frame_equal(result, expected)
+
+ with pytest.raises(KeyError):
+ df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum()
+
+ # passing the level
+ df = df.set_index('Date')
+ result = df.groupby([pd.Grouper(freq='1M', level='Date'), 'Buyer'
+ ]).sum()
+ assert_frame_equal(result, expected)
+ result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum(
+ )
+ assert_frame_equal(result, expected)
+
+ with pytest.raises(ValueError):
+ df.groupby([pd.Grouper(freq='1M', level='foo'),
+ 'Buyer']).sum()
+
+ # multi names
+ df = df.copy()
+ df['Date'] = df.index + pd.offsets.MonthEnd(2)
+ result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
+ ]).sum()
+ expected = DataFrame({
+ 'Buyer': 'Carl Joe Mark'.split(),
+ 'Quantity': [10, 18, 3],
+ 'Date': [
+ datetime(2013, 11, 30, 0, 0),
+ datetime(2013, 11, 30, 0, 0),
+ datetime(2013, 11, 30, 0, 0),
+ ]
+ }).set_index(['Date', 'Buyer'])
+ assert_frame_equal(result, expected)
+
+ # error as we have both a level and a name!
+ with pytest.raises(ValueError):
+ df.groupby([pd.Grouper(freq='1M', key='Date',
+ level='Date'), 'Buyer']).sum()
+
+ # single groupers
+ expected = DataFrame({'Quantity': [31],
+ 'Date': [datetime(2013, 10, 31, 0, 0)
+ ]}).set_index('Date')
+ result = df.groupby(pd.Grouper(freq='1M')).sum()
+ assert_frame_equal(result, expected)
+
+ result = df.groupby([pd.Grouper(freq='1M')]).sum()
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame({'Quantity': [31],
+ 'Date': [datetime(2013, 11, 30, 0, 0)
+ ]}).set_index('Date')
+ result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
+ assert_frame_equal(result, expected)
+
+ result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('freq', ['D', 'M', 'A', 'Q-APR'])
+ def test_timegrouper_with_reg_groups_freq(self, freq):
+ # GH 6764 multiple grouping with/without sort
+ df = DataFrame({
+ 'date': pd.to_datetime([
+ '20121002', '20121007', '20130130', '20130202', '20130305',
+ '20121002', '20121207', '20130130', '20130202', '20130305',
+ '20130202', '20130305'
+ ]),
+ 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
+ 'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
+ 359, 801],
+ 'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
+ }).set_index('date')
+
+ expected = (
+ df.groupby('user_id')['whole_cost']
+ .resample(freq)
+ .sum(min_count=1) # XXX
+ .dropna()
+ .reorder_levels(['date', 'user_id'])
+ .sort_index()
+ .astype('int64')
+ )
+ expected.name = 'whole_cost'
+
+ result1 = df.sort_index().groupby([pd.Grouper(freq=freq),
+ 'user_id'])['whole_cost'].sum()
+ assert_series_equal(result1, expected)
+
+ result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[
+ 'whole_cost'].sum()
+ assert_series_equal(result2, expected)
+
+ def test_timegrouper_get_group(self):
+ # GH 6914
+
+ df_original = DataFrame({
+ 'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(),
+ 'Quantity': [18, 3, 5, 1, 9, 3],
+ 'Date': [datetime(2013, 9, 1, 13, 0),
+ datetime(2013, 9, 1, 13, 5),
+ datetime(2013, 10, 1, 20, 0),
+ datetime(2013, 10, 3, 10, 0),
+ datetime(2013, 12, 2, 12, 0),
+ datetime(2013, 9, 2, 14, 0), ]
+ })
+ df_reordered = df_original.sort_values(by='Quantity')
+
+ # single grouping
+ expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
+ df_original.iloc[[4]]]
+ dt_list = ['2013-09-30', '2013-10-31', '2013-12-31']
+
+ for df in [df_original, df_reordered]:
+ grouped = df.groupby(pd.Grouper(freq='M', key='Date'))
+ for t, expected in zip(dt_list, expected_list):
+ dt = pd.Timestamp(t)
+ result = grouped.get_group(dt)
+ assert_frame_equal(result, expected)
+
+ # multiple grouping
+ expected_list = [df_original.iloc[[1]], df_original.iloc[[3]],
+ df_original.iloc[[4]]]
+ g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'),
+ ('Joe', '2013-12-31')]
+
+ for df in [df_original, df_reordered]:
+ grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')])
+ for (b, t), expected in zip(g_list, expected_list):
+ dt = pd.Timestamp(t)
+ result = grouped.get_group((b, dt))
+ assert_frame_equal(result, expected)
+
+ # with index
+ df_original = df_original.set_index('Date')
+ df_reordered = df_original.sort_values(by='Quantity')
+
+ expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
+ df_original.iloc[[4]]]
+
+ for df in [df_original, df_reordered]:
+ grouped = df.groupby(pd.Grouper(freq='M'))
+ for t, expected in zip(dt_list, expected_list):
+ dt = pd.Timestamp(t)
+ result = grouped.get_group(dt)
+ assert_frame_equal(result, expected)
+
+ def test_timegrouper_apply_return_type_series(self):
+ # Using `apply` with the `TimeGrouper` should give the
+ # same return type as an `apply` with a `Grouper`.
+ # Issue #11742
+ df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
+ 'value': [10, 13]})
+ df_dt = df.copy()
+ df_dt['date'] = pd.to_datetime(df_dt['date'])
+
+ def sumfunc_series(x):
+ return pd.Series([x['value'].sum()], ('sum',))
+
+ expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_series)
+ result = (df_dt.groupby(pd.Grouper(freq='M', key='date'))
+ .apply(sumfunc_series))
+ assert_frame_equal(result.reset_index(drop=True),
+ expected.reset_index(drop=True))
+
+ def test_timegrouper_apply_return_type_value(self):
+ # Using `apply` with the `TimeGrouper` should give the
+ # same return type as an `apply` with a `Grouper`.
+ # Issue #11742
+ df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
+ 'value': [10, 13]})
+ df_dt = df.copy()
+ df_dt['date'] = pd.to_datetime(df_dt['date'])
+
+ def sumfunc_value(x):
+ return x.value.sum()
+
+ expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value)
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date'))
+ .apply(sumfunc_value))
+ assert_series_equal(result.reset_index(drop=True),
+ expected.reset_index(drop=True))
+
+ def test_groupby_groups_datetimeindex(self):
+ # GH#1430
+ periods = 1000
+ ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods)
+ df = DataFrame({'high': np.arange(periods),
+ 'low': np.arange(periods)}, index=ind)
+ grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
+
+ # it works!
+ groups = grouped.groups
+ assert isinstance(list(groups.keys())[0], datetime)
+
+ # GH#11442
+ index = pd.date_range('2015/01/01', periods=5, name='date')
+ df = pd.DataFrame({'A': [5, 6, 7, 8, 9],
+ 'B': [1, 2, 3, 4, 5]}, index=index)
+ result = df.groupby(level='date').groups
+ dates = ['2015-01-05', '2015-01-04', '2015-01-03',
+ '2015-01-02', '2015-01-01']
+ expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date')
+ for date in dates}
+ tm.assert_dict_equal(result, expected)
+
+ grouped = df.groupby(level='date')
+ for date in dates:
+ result = grouped.get_group(date)
+ data = [[df.loc[date, 'A'], df.loc[date, 'B']]]
+ expected_index = pd.DatetimeIndex([date], name='date')
+ expected = pd.DataFrame(data,
+ columns=list('AB'),
+ index=expected_index)
+ tm.assert_frame_equal(result, expected)
+
+ def test_groupby_groups_datetimeindex_tz(self):
+ # GH 3950
+ dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00', '2011-07-19 07:00:00',
+ '2011-07-19 08:00:00', '2011-07-19 09:00:00']
+ df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
+ 'datetime': dates,
+ 'value1': np.arange(6, dtype='int64'),
+ 'value2': [1, 2] * 3})
+ df['datetime'] = df['datetime'].apply(
+ lambda d: Timestamp(d, tz='US/Pacific'))
+
+ exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00',
+ '2011-07-19 07:00:00',
+ '2011-07-19 08:00:00',
+ '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00',
+ '2011-07-19 09:00:00'],
+ tz='US/Pacific', name='datetime')
+ exp_idx2 = Index(['a', 'b'] * 3, name='label')
+ exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
+ expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5],
+ 'value2': [1, 2, 2, 1, 1, 2]},
+ index=exp_idx, columns=['value1', 'value2'])
+
+ result = df.groupby(['datetime', 'label']).sum()
+ assert_frame_equal(result, expected)
+
+ # by level
+ didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo')
+ df = DataFrame({'value1': np.arange(6, dtype='int64'),
+ 'value2': [1, 2, 3, 1, 2, 3]},
+ index=didx)
+
+ exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00',
+ '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00'], tz='Asia/Tokyo')
+ expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
+ index=exp_idx, columns=['value1', 'value2'])
+
+ result = df.groupby(level=0).sum()
+ assert_frame_equal(result, expected)
+
+ def test_frame_datetime64_handling_groupby(self):
+ # it works!
+ df = DataFrame([(3, np.datetime64('2012-07-03')),
+ (3, np.datetime64('2012-07-04'))],
+ columns=['a', 'date'])
+ result = df.groupby('a').first()
+ assert result['date'][3] == Timestamp('2012-07-03')
+
+ def test_groupby_multi_timezone(self):
+
+ # combining multiple / different timezones yields UTC
+
+ data = """0,2000-01-28 16:47:00,America/Chicago
+1,2000-01-29 16:48:00,America/Chicago
+2,2000-01-30 16:49:00,America/Los_Angeles
+3,2000-01-31 16:50:00,America/Chicago
+4,2000-01-01 16:50:00,America/New_York"""
+
+ df = pd.read_csv(StringIO(data), header=None,
+ names=['value', 'date', 'tz'])
+ result = df.groupby('tz').date.apply(
+ lambda x: pd.to_datetime(x).dt.tz_localize(x.name))
+
+ expected = Series([Timestamp('2000-01-28 16:47:00-0600',
+ tz='America/Chicago'),
+ Timestamp('2000-01-29 16:48:00-0600',
+ tz='America/Chicago'),
+ Timestamp('2000-01-30 16:49:00-0800',
+ tz='America/Los_Angeles'),
+ Timestamp('2000-01-31 16:50:00-0600',
+ tz='America/Chicago'),
+ Timestamp('2000-01-01 16:50:00-0500',
+ tz='America/New_York')],
+ name='date',
+ dtype=object)
+ assert_series_equal(result, expected)
+
+ tz = 'America/Chicago'
+ res_values = df.groupby('tz').date.get_group(tz)
+ result = pd.to_datetime(res_values).dt.tz_localize(tz)
+ exp_values = Series(['2000-01-28 16:47:00', '2000-01-29 16:48:00',
+ '2000-01-31 16:50:00'],
+ index=[0, 1, 3], name='date')
+ expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
+ assert_series_equal(result, expected)
+
+ def test_groupby_groups_periods(self):
+ dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00', '2011-07-19 07:00:00',
+ '2011-07-19 08:00:00', '2011-07-19 09:00:00']
+ df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
+ 'period': [pd.Period(d, freq='H') for d in dates],
+ 'value1': np.arange(6, dtype='int64'),
+ 'value2': [1, 2] * 3})
+
+ exp_idx1 = pd.PeriodIndex(['2011-07-19 07:00:00',
+ '2011-07-19 07:00:00',
+ '2011-07-19 08:00:00',
+ '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00',
+ '2011-07-19 09:00:00'],
+ freq='H', name='period')
+ exp_idx2 = Index(['a', 'b'] * 3, name='label')
+ exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
+ expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5],
+ 'value2': [1, 2, 2, 1, 1, 2]},
+ index=exp_idx, columns=['value1', 'value2'])
+
+ result = df.groupby(['period', 'label']).sum()
+ assert_frame_equal(result, expected)
+
+ # by level
+ didx = pd.PeriodIndex(dates, freq='H')
+ df = DataFrame({'value1': np.arange(6, dtype='int64'),
+ 'value2': [1, 2, 3, 1, 2, 3]},
+ index=didx)
+
+ exp_idx = pd.PeriodIndex(['2011-07-19 07:00:00',
+ '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00'], freq='H')
+ expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
+ index=exp_idx, columns=['value1', 'value2'])
+
+ result = df.groupby(level=0).sum()
+ assert_frame_equal(result, expected)
+
+ def test_groupby_first_datetime64(self):
+ df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
+ df[1] = df[1].view('M8[ns]')
+
+ assert issubclass(df[1].dtype.type, np.datetime64)
+
+ result = df.groupby(level=0).first()
+ got_dt = result[1].dtype
+ assert issubclass(got_dt.type, np.datetime64)
+
+ result = df[1].groupby(level=0).first()
+ got_dt = result.dtype
+ assert issubclass(got_dt.type, np.datetime64)
+
+ def test_groupby_max_datetime64(self):
+ # GH 5869
+ # datetimelike dtype conversion from int
+ df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
+ expected = df.groupby('A')['A'].apply(lambda x: x.max())
+ result = df.groupby('A')['A'].max()
+ assert_series_equal(result, expected)
+
+ def test_groupby_datetime64_32_bit(self):
+ # GH 6410 / numpy 4328
+ # 32-bit under 1.9-dev indexing issue
+
+ df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2})
+ result = df.groupby("A")["B"].transform(min)
+ expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B')
+ assert_series_equal(result, expected)
+
+ def test_groupby_with_timezone_selection(self):
+ # GH 11616
+ # Test that column selection returns output in correct timezone.
+ np.random.seed(42)
+ df = pd.DataFrame({
+ 'factor': np.random.randint(0, 3, size=60),
+ 'time': pd.date_range('01/01/2000 00:00', periods=60,
+ freq='s', tz='UTC')
+ })
+ df1 = df.groupby('factor').max()['time']
+ df2 = df.groupby('factor')['time'].max()
+ tm.assert_series_equal(df1, df2)
+
+ def test_timezone_info(self):
+ # see gh-11682: Timezone info lost when broadcasting
+ # scalar datetime to DataFrame
+
+ df = pd.DataFrame({'a': [1], 'b': [datetime.now(pytz.utc)]})
+ assert df['b'][0].tzinfo == pytz.utc
+ df = pd.DataFrame({'a': [1, 2, 3]})
+ df['b'] = datetime.now(pytz.utc)
+ assert df['b'][0].tzinfo == pytz.utc
+
+ def test_datetime_count(self):
+ df = DataFrame({'a': [1, 2, 3] * 2,
+ 'dates': pd.date_range('now', periods=6, freq='T')})
+ result = df.groupby('a').dates.count()
+ expected = Series([
+ 2, 2, 2
+ ], index=Index([1, 2, 3], name='a'), name='dates')
+ tm.assert_series_equal(result, expected)
+
+ def test_first_last_max_min_on_time_data(self):
+ # GH 10295
+ # Verify that NaT is not in the result of max, min, first and last on
+ # Dataframe with datetime or timedelta values.
+ from datetime import timedelta as td
+ df_test = DataFrame(
+ {'dt': [nan, '2015-07-24 10:10', '2015-07-25 11:11',
+ '2015-07-23 12:12', nan],
+ 'td': [nan, td(days=1), td(days=2), td(days=3), nan]})
+ df_test.dt = pd.to_datetime(df_test.dt)
+ df_test['group'] = 'A'
+ df_ref = df_test[df_test.dt.notna()]
+
+ grouped_test = df_test.groupby('group')
+ grouped_ref = df_ref.groupby('group')
+
+ assert_frame_equal(grouped_ref.max(), grouped_test.max())
+ assert_frame_equal(grouped_ref.min(), grouped_test.min())
+ assert_frame_equal(grouped_ref.first(), grouped_test.first())
+ assert_frame_equal(grouped_ref.last(), grouped_test.last())
+
+ def test_nunique_with_timegrouper_and_nat(self):
+ # GH 17575
+ test = pd.DataFrame({
+ 'time': [Timestamp('2016-06-28 09:35:35'),
+ pd.NaT,
+ Timestamp('2016-06-28 16:46:28')],
+ 'data': ['1', '2', '3']})
+
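+ # the NaT row falls into no hourly bin, so nunique over the grouper
+ # must match the same computation with the NaT rows dropped up front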
+ grouper = pd.Grouper(key='time', freq='h')
+ result = test.groupby(grouper)['data'].nunique()
+ expected = test[test.time.notnull()].groupby(grouper)['data'].nunique()
+ tm.assert_series_equal(result, expected)
+
+ def test_scalar_call_versus_list_call(self):
+ # Issue: 17530
+ data_frame = {
+ 'location': ['shanghai', 'beijing', 'shanghai'],
+ 'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15',
+ '2017-08-11 22:23:15'],
+ dtype='datetime64[ns]'),
+ 'value': [1, 2, 3]
+ }
+ data_frame = pd.DataFrame(data_frame).set_index('time')
+ grouper = pd.Grouper(freq='D')
+
+ grouped = data_frame.groupby(grouper)
+ result = grouped.count()
+ grouped = data_frame.groupby([grouper])
+ expected = grouped.count()
+
+ assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_transform.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_transform.py
new file mode 100644
index 00000000000..b645073fcf7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_transform.py
@@ -0,0 +1,847 @@
+""" test with the .transform """
+
+import numpy as np
+import pytest
+
+from pandas._libs import groupby
+from pandas.compat import StringIO
+
+from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype
+
+import pandas as pd
+from pandas import DataFrame, MultiIndex, Series, Timestamp, concat, date_range
+from pandas.core.config import option_context
+from pandas.core.groupby.groupby import DataError
+from pandas.util import testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+def assert_fp_equal(a, b):
+ assert (np.abs(a - b) < 1e-12).all()
+
+
+def test_transform():
+ data = Series(np.arange(9) // 3, index=np.arange(9))
+
+ index = np.arange(9)
+ np.random.shuffle(index)
+ data = data.reindex(index)
+
+ grouped = data.groupby(lambda x: x // 3)
+
+ transformed = grouped.transform(lambda x: x * x.sum())
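+ # index 7 falls in the x // 3 == 2 group {6, 7, 8}, whose values are
+ # all 2, so the transform yields 2 * (2 + 2 + 2) == 12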
+ assert transformed[7] == 12
+
+ # GH 8046
+ # make sure that we preserve the input order
+
+ df = DataFrame(
+ np.arange(6, dtype='int64').reshape(
+ 3, 2), columns=["a", "b"], index=[0, 2, 1])
+ key = [0, 0, 1]
+ expected = df.sort_index().groupby(key).transform(
+ lambda x: x - x.mean()).groupby(key).mean()
+ result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(
+ key).mean()
+ assert_frame_equal(result, expected)
+
+ def demean(arr):
+ return arr - arr.mean()
+
+ people = DataFrame(np.random.randn(5, 5),
+ columns=['a', 'b', 'c', 'd', 'e'],
+ index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
+ key = ['one', 'two', 'one', 'two', 'one']
+ result = people.groupby(key).transform(demean).groupby(key).mean()
+ expected = people.groupby(key).apply(demean).groupby(key).mean()
+ assert_frame_equal(result, expected)
+
+ # GH 8430
+ df = tm.makeTimeDataFrame()
+ g = df.groupby(pd.Grouper(freq='M'))
+ g.transform(lambda x: x - 1)
+
+ # GH 9700
+ df = DataFrame({'a': range(5, 10), 'b': range(5)})
+ result = df.groupby('a').transform(max)
+ expected = DataFrame({'b': range(5)})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_transform_fast():
+
+ df = DataFrame({'id': np.arange(100000) / 3,
+ 'val': np.random.randn(100000)})
+
+ grp = df.groupby('id')['val']
+
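+ # broadcast each group's mean over its rows; np.repeat lines up here
+ # because 'id' is monotonic, so group order matches row order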
+ values = np.repeat(grp.mean().values,
+ ensure_platform_int(grp.count().values))
+ expected = pd.Series(values, index=df.index, name='val')
+
+ result = grp.transform(np.mean)
+ assert_series_equal(result, expected)
+
+ result = grp.transform('mean')
+ assert_series_equal(result, expected)
+
+ # GH 12737
+ df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
+ 'd': pd.date_range('2014-1-1', '2014-1-4'),
+ 'i': [1, 2, 3, 4]},
+ columns=['grouping', 'f', 'i', 'd'])
+ result = df.groupby('grouping').transform('first')
+
+ dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
+ pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
+ expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
+ 'd': dates,
+ 'i': [1, 2, 2, 4]},
+ columns=['f', 'i', 'd'])
+ assert_frame_equal(result, expected)
+
+ # selection
+ result = df.groupby('grouping')[['f', 'i']].transform('first')
+ expected = expected[['f', 'i']]
+ assert_frame_equal(result, expected)
+
+ # dup columns
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
+ result = df.groupby('g').transform('first')
+ expected = df.drop('g', axis=1)
+ assert_frame_equal(result, expected)
+
+
+def test_transform_broadcast(tsframe, ts):
+ grouped = ts.groupby(lambda x: x.month)
+ result = grouped.transform(np.mean)
+
+ tm.assert_index_equal(result.index, ts.index)
+ for _, gp in grouped:
+ assert_fp_equal(result.reindex(gp.index), gp.mean())
+
+ grouped = tsframe.groupby(lambda x: x.month)
+ result = grouped.transform(np.mean)
+ tm.assert_index_equal(result.index, tsframe.index)
+ for _, gp in grouped:
+ agged = gp.mean()
+ res = result.reindex(gp.index)
+ for col in tsframe:
+ assert_fp_equal(res[col], agged[col])
+
+ # group columns
+ grouped = tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
+ axis=1)
+ result = grouped.transform(np.mean)
+ tm.assert_index_equal(result.index, tsframe.index)
+ tm.assert_index_equal(result.columns, tsframe.columns)
+ for _, gp in grouped:
+ agged = gp.mean(1)
+ res = result.reindex(columns=gp.columns)
+ for idx in gp.index:
+ assert_fp_equal(res.xs(idx), agged[idx])
+
+
+def test_transform_axis(tsframe):
+
+ # make sure that we are setting the axes
+ # correctly when on axis=0 or 1
+ # in the presence of a non-monotonic indexer
+ # GH12713
+
+ base = tsframe.iloc[0:5]
+ r = len(base.index)
+ c = len(base.columns)
+ tso = DataFrame(np.random.randn(r, c),
+ index=base.index,
+ columns=base.columns,
+ dtype='float64')
+ # monotonic
+ ts = tso
+ grouped = ts.groupby(lambda x: x.weekday())
+ result = ts - grouped.transform('mean')
+ expected = grouped.apply(lambda x: x - x.mean())
+ assert_frame_equal(result, expected)
+
+ ts = ts.T
+ grouped = ts.groupby(lambda x: x.weekday(), axis=1)
+ result = ts - grouped.transform('mean')
+ expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
+ assert_frame_equal(result, expected)
+
+ # non-monotonic
+ ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
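+ # swapping the first two rows forces the transform result to be
+ # realigned to the input order rather than the sorted group order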
+ grouped = ts.groupby(lambda x: x.weekday())
+ result = ts - grouped.transform('mean')
+ expected = grouped.apply(lambda x: x - x.mean())
+ assert_frame_equal(result, expected)
+
+ ts = ts.T
+ grouped = ts.groupby(lambda x: x.weekday(), axis=1)
+ result = ts - grouped.transform('mean')
+ expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
+ assert_frame_equal(result, expected)
+
+
+def test_transform_dtype():
+ # GH 9807
+ # Check transform dtype output is preserved
+ df = DataFrame([[1, 3], [2, 3]])
+ result = df.groupby(1).transform('mean')
+ expected = DataFrame([[1.5], [1.5]])
+ assert_frame_equal(result, expected)
+
+
+def test_transform_bug():
+ # GH 5712
+ # transforming on a datetime column
+ df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
+ result = df.groupby('A')['B'].transform(
+ lambda x: x.rank(ascending=False))
+ expected = Series(np.arange(5, 0, step=-1), name='B')
+ assert_series_equal(result, expected)
+
+
+def test_transform_numeric_to_boolean():
+ # GH 16875
+ # inconsistency in transforming boolean values
+ expected = pd.Series([True, True], name='A')
+
+ df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]})
+ result = df.groupby('B').A.transform(lambda x: True)
+ assert_series_equal(result, expected)
+
+ df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]})
+ result = df.groupby('B').A.transform(lambda x: True)
+ assert_series_equal(result, expected)
+
+
+def test_transform_datetime_to_timedelta():
+ # GH 15429
+ # transforming a datetime to timedelta
+ df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
+ expected = pd.Series([
+ Timestamp('20130101') - Timestamp('20130101')] * 5, name='A')
+
+ # this does date math without changing result type in transform
+ base_time = df['A'][0]
+ result = df.groupby('A')['A'].transform(
+ lambda x: x.max() - x.min() + base_time) - base_time
+ assert_series_equal(result, expected)
+
+ # this does date math and causes the transform to return timedelta
+ result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min())
+ assert_series_equal(result, expected)
+
+
+def test_transform_datetime_to_numeric():
+ # GH 10972
+ # convert dt to float
+ df = DataFrame({
+ 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
+ result = df.groupby('a').b.transform(
+ lambda x: x.dt.dayofweek - x.dt.dayofweek.mean())
+
+ expected = Series([-0.5, 0.5], name='b')
+ assert_series_equal(result, expected)
+
+ # convert dt to int
+ df = DataFrame({
+ 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
+ result = df.groupby('a').b.transform(
+ lambda x: x.dt.dayofweek - x.dt.dayofweek.min())
+
+ expected = Series([0, 1], name='b')
+ assert_series_equal(result, expected)
+
+
+def test_transform_casting():
+ # 13046
+ data = """
+ idx A ID3 DATETIME
+ 0 B-028 b76cd912ff "2014-10-08 13:43:27"
+ 1 B-054 4a57ed0b02 "2014-10-08 14:26:19"
+ 2 B-076 1a682034f8 "2014-10-08 14:29:01"
+ 3 B-023 b76cd912ff "2014-10-08 18:39:34"
+ 4 B-023 f88g8d7sds "2014-10-08 18:40:18"
+ 5 B-033 b76cd912ff "2014-10-08 18:44:30"
+ 6 B-032 b76cd912ff "2014-10-08 18:46:00"
+ 7 B-037 b76cd912ff "2014-10-08 18:52:15"
+ 8 B-046 db959faf02 "2014-10-08 18:59:59"
+ 9 B-053 b76cd912ff "2014-10-08 19:17:48"
+ 10 B-065 b76cd912ff "2014-10-08 19:21:38"
+ """
+ df = pd.read_csv(StringIO(data), sep=r'\s+',
+ index_col=[0], parse_dates=['DATETIME'])
+
+ result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff())
+ assert is_timedelta64_dtype(result.dtype)
+
+ result = df[['ID3', 'DATETIME']].groupby('ID3').transform(
+ lambda x: x.diff())
+ assert is_timedelta64_dtype(result.DATETIME.dtype)
+
+
+def test_transform_multiple(ts):
+ grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
+
+ grouped.transform(lambda x: x * 2)
+ grouped.transform(np.mean)
+
+
+def test_dispatch_transform(tsframe):
+ df = tsframe[::5].reindex(tsframe.index)
+
+ grouped = df.groupby(lambda x: x.month)
+
+ filled = grouped.fillna(method='pad')
+ fillit = lambda x: x.fillna(method='pad')
+ expected = df.groupby(lambda x: x.month).transform(fillit)
+ assert_frame_equal(filled, expected)
+
+
+def test_transform_select_columns(df):
+ f = lambda x: x.mean()
+ result = df.groupby('A')['C', 'D'].transform(f)
+
+ selection = df[['C', 'D']]
+ expected = selection.groupby(df['A']).transform(f)
+
+ assert_frame_equal(result, expected)
+
+
+def test_transform_exclude_nuisance(df):
+
+ # this also tests orderings in transform between
+ # series/frame to make sure it's consistent
+ expected = {}
+ grouped = df.groupby('A')
+ expected['C'] = grouped['C'].transform(np.mean)
+ expected['D'] = grouped['D'].transform(np.mean)
+ expected = DataFrame(expected)
+ result = df.groupby('A').transform(np.mean)
+
+ assert_frame_equal(result, expected)
+
+
+def test_transform_function_aliases(df):
+ result = df.groupby('A').transform('mean')
+ expected = df.groupby('A').transform(np.mean)
+ assert_frame_equal(result, expected)
+
+ result = df.groupby('A')['C'].transform('mean')
+ expected = df.groupby('A')['C'].transform(np.mean)
+ assert_series_equal(result, expected)
+
+
+def test_series_fast_transform_date():
+ # GH 13191
+ df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
+ 'd': pd.date_range('2014-1-1', '2014-1-4')})
+ result = df.groupby('grouping')['d'].transform('first')
+ dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'),
+ pd.Timestamp('2014-1-4')]
+ expected = pd.Series(dates, name='d')
+ assert_series_equal(result, expected)
+
+
+def test_transform_length():
+ # GH 9697
+ df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
+ expected = pd.Series([3.0] * 4)
+
+ def nsum(x):
+ return np.nansum(x)
+
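+ # every spelling of sum must broadcast one value per input row
+ # (length 4); the NaN in group 2 is skipped, so both groups sum to 3.0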
+ results = [df.groupby('col1').transform(sum)['col2'],
+ df.groupby('col1')['col2'].transform(sum),
+ df.groupby('col1').transform(nsum)['col2'],
+ df.groupby('col1')['col2'].transform(nsum)]
+ for result in results:
+ assert_series_equal(result, expected, check_names=False)
+
+
+def test_transform_coercion():
+
+ # 14457
+ # when transforming, make sure we do not coerce the result
+ # via assignment
+ df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1]))
+ g = df.groupby('A')
+
+ expected = g.transform(np.mean)
+ result = g.transform(lambda x: np.mean(x))
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_transform_with_int():
+
+ # GH 3740, make sure that we upcast on item-by-item transform when needed
+
+ # floats
+ df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'),
+ C=Series(
+ [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo'))
+ with np.errstate(all='ignore'):
+ result = df.groupby('A').transform(
+ lambda x: (x - x.mean()) / x.std())
+ expected = DataFrame(dict(B=np.nan, C=Series(
+ [-1, 0, 1, -1, 0, 1], dtype='float64')))
+ assert_frame_equal(result, expected)
+
+ # int case
+ df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1,
+ C=[1, 2, 3, 1, 2, 3], D='foo'))
+ with np.errstate(all='ignore'):
+ result = df.groupby('A').transform(
+ lambda x: (x - x.mean()) / x.std())
+ expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1]))
+ assert_frame_equal(result, expected)
+
+ # int that needs float conversion
+ s = Series([2, 3, 4, 10, 5, -1])
+ df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo'))
+ with np.errstate(all='ignore'):
+ result = df.groupby('A').transform(
+ lambda x: (x - x.mean()) / x.std())
+
+ s1 = s.iloc[0:3]
+ s1 = (s1 - s1.mean()) / s1.std()
+ s2 = s.iloc[3:6]
+ s2 = (s2 - s2.mean()) / s2.std()
+ expected = DataFrame(dict(B=np.nan, C=concat([s1, s2])))
+ assert_frame_equal(result, expected)
+
+ # int downcasting
+ result = df.groupby('A').transform(lambda x: x * 2 / 2)
+ expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1]))
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_transform_with_nan_group():
+ # GH 9941
+ df = pd.DataFrame({'a': range(10),
+ 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]})
+ result = df.groupby(df.b)['a'].transform(max)
+ expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.],
+ name='a')
+ assert_series_equal(result, expected)
+
+
+def test_transform_mixed_type():
+ index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
+ ])
+ df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
+ 'c': np.tile(['a', 'b', 'c'], 2),
+ 'v': np.arange(1., 7.)}, index=index)
+
+ def f(group):
+ group['g'] = group['d'] * 2
+ return group[:1]
+
+ grouped = df.groupby('c')
+ result = grouped.apply(f)
+
+ assert result['d'].dtype == np.float64
+
+ # this is by definition a mutating operation!
+ with option_context('mode.chained_assignment', None):
+ for key, group in grouped:
+ res = f(group)
+ assert_frame_equal(res, result.loc[key])
+
+
+def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
+ """
+ Check a group transform that executes a cumulative function.
+
+ Parameters
+ ----------
+ pd_op : callable
+ The pandas cumulative function.
+ np_op : callable
+ The analogous one in NumPy.
+ dtype : type
+ The specified dtype of the data.
+ """
+
+ is_datetimelike = False
+
+ data = np.array([[1], [2], [3], [4]], dtype=dtype)
+ ans = np.zeros_like(data)
+
+ labels = np.array([0, 0, 0, 0], dtype=np.int64)
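+ # all rows share label 0, so the grouped cumulative op must reduce
+ # to the plain NumPy cumulative function over the whole column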
+ pd_op(ans, data, labels, is_datetimelike)
+
+ tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
+ check_dtype=False)
+
+
+def test_cython_group_transform_cumsum(any_real_dtype):
+ # see gh-4095
+ dtype = np.dtype(any_real_dtype).type
+ pd_op, np_op = groupby.group_cumsum, np.cumsum
+ _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
+
+
+def test_cython_group_transform_cumprod():
+ # see gh-4095
+ dtype = np.float64
+ pd_op, np_op = groupby.group_cumprod_float64, np.cumproduct
+ _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
+
+
+def test_cython_group_transform_algos():
+ # see gh-4095
+ is_datetimelike = False
+
+ # with nans
+ labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
+
+ data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
+ actual = np.zeros_like(data)
+ actual.fill(np.nan)
+ groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
+ expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
+ tm.assert_numpy_array_equal(actual[:, 0], expected)
+
+ actual = np.zeros_like(data)
+ actual.fill(np.nan)
+ groupby.group_cumsum(actual, data, labels, is_datetimelike)
+ expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
+ tm.assert_numpy_array_equal(actual[:, 0], expected)
+
+ # timedelta
+ is_datetimelike = True
+ data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
+ actual = np.zeros_like(data, dtype='int64')
+ groupby.group_cumsum(actual, data.view('int64'), labels,
+ is_datetimelike)
+ expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
+ 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
+ np.timedelta64(5, 'ns')])
+ tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected)
+
+
+ "op, args, targop",
+ [('cumprod', (), lambda x: x.cumprod()),
+ ('cumsum', (), lambda x: x.cumsum()),
+ ('shift', (-1, ), lambda x: x.shift(-1)),
+ ('shift', (1, ), lambda x: x.shift())])
+def test_cython_transform_series(op, args, targop):
+ # GH 4095
+ s = Series(np.random.randn(1000))
+ s_missing = s.copy()
+ s_missing.iloc[2:10] = np.nan
+ labels = np.random.randint(0, 50, size=1000).astype(float)
+
+ # series
+ for data in [s, s_missing]:
+ # print(data.head())
+ expected = data.groupby(labels).transform(targop)
+
+ tm.assert_series_equal(
+ expected,
+ data.groupby(labels).transform(op, *args))
+ tm.assert_series_equal(expected, getattr(
+ data.groupby(labels), op)(*args))
+
+
[email protected]("op", ['cumprod', 'cumsum'])
[email protected]("skipna", [False, True])
[email protected]('input, exp', [
+ # When everything is NaN
+ ({'key': ['b'] * 10, 'value': np.nan},
+ pd.Series([np.nan] * 10, name='value')),
+ # When there is a single NaN
+ ({'key': ['b'] * 10 + ['a'] * 2,
+ 'value': [3] * 3 + [np.nan] + [3] * 8},
+ {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
+ ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729.,
+ 2187., 6561., 19683., 3.0, 9.0],
+ ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
+ ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18.,
+ 21., 24., 27., 3.0, 6.0]})])
+def test_groupby_cum_skipna(op, skipna, input, exp):
+ df = pd.DataFrame(input)
+ result = df.groupby('key')['value'].transform(op, skipna=skipna)
+ if isinstance(exp, dict):
+ expected = exp[(op, skipna)]
+ else:
+ expected = exp
+ expected = pd.Series(expected, name='value')
+ tm.assert_series_equal(expected, result)
+
+
+ "op, args, targop",
+ [('cumprod', (), lambda x: x.cumprod()),
+ ('cumsum', (), lambda x: x.cumsum()),
+ ('shift', (-1, ), lambda x: x.shift(-1)),
+ ('shift', (1, ), lambda x: x.shift())])
+def test_cython_transform_frame(op, args, targop):
+ s = Series(np.random.randn(1000))
+ s_missing = s.copy()
+ s_missing.iloc[2:10] = np.nan
+ labels = np.random.randint(0, 50, size=1000).astype(float)
+ strings = list('qwertyuiopasdfghjklz')
+ strings_missing = strings[:]
+ strings_missing[5] = np.nan
+ df = DataFrame({'float': s,
+ 'float_missing': s_missing,
+ 'int': [1, 1, 1, 1, 2] * 200,
+ 'datetime': pd.date_range('1990-1-1', periods=1000),
+ 'timedelta': pd.timedelta_range(1, freq='s',
+ periods=1000),
+ 'string': strings * 50,
+ 'string_missing': strings_missing * 50},
+ columns=['float', 'float_missing', 'int', 'datetime',
+ 'timedelta', 'string', 'string_missing'])
+ df['cat'] = df['string'].astype('category')
+
+ df2 = df.copy()
+ df2.index = pd.MultiIndex.from_product([range(100), range(10)])
+
+ # DataFrame - Single and MultiIndex,
+ # group by values, index level, columns
+ for df in [df, df2]:
+ for gb_target in [dict(by=labels), dict(level=0), dict(by='string')
+ ]: # dict(by='string_missing')]:
+ # dict(by=['int','string'])]:
+
+ gb = df.groupby(**gb_target)
+ # whitelisted methods set the selection before applying
+ # a bit of a hack to make sure the cythonized shift
+ # is equivalent to pre 0.17.1 behavior
+ if op == 'shift':
+ gb._set_group_selection()
+
+ if op != 'shift' and 'int' not in gb_target:
+ # numeric apply fastpath promotes dtype so have
+ # to apply separately and concat
+ i = gb[['int']].apply(targop)
+ f = gb[['float', 'float_missing']].apply(targop)
+ expected = pd.concat([f, i], axis=1)
+ else:
+ expected = gb.apply(targop)
+
+ expected = expected.sort_index(axis=1)
+ tm.assert_frame_equal(expected,
+ gb.transform(op, *args).sort_index(
+ axis=1))
+ tm.assert_frame_equal(
+ expected,
+ getattr(gb, op)(*args).sort_index(axis=1))
+ # individual columns
+ for c in df:
+ if c not in ['float', 'int', 'float_missing'
+ ] and op != 'shift':
+ msg = "No numeric types to aggregate"
+ with pytest.raises(DataError, match=msg):
+ gb[c].transform(op)
+ with pytest.raises(DataError, match=msg):
+ getattr(gb[c], op)()
+ else:
+ expected = gb[c].apply(targop)
+ expected.name = c
+ tm.assert_series_equal(expected,
+ gb[c].transform(op, *args))
+ tm.assert_series_equal(expected,
+ getattr(gb[c], op)(*args))
+
+
+def test_transform_with_non_scalar_group():
+ # GH 10165
+ cols = pd.MultiIndex.from_tuples([
+ ('syn', 'A'), ('mis', 'A'), ('non', 'A'),
+ ('syn', 'C'), ('mis', 'C'), ('non', 'C'),
+ ('syn', 'T'), ('mis', 'T'), ('non', 'T'),
+ ('syn', 'G'), ('mis', 'G'), ('non', 'G')])
+ df = pd.DataFrame(np.random.randint(1, 10, (4, 12)),
+ columns=cols,
+ index=['A', 'C', 'G', 'T'])
+
+ msg = 'transform must return a scalar value for each group.*'
+ with pytest.raises(ValueError, match=msg):
+ df.groupby(axis=1, level=1).transform(
+ lambda z: z.div(z.sum(axis=1), axis=0))
+
+
[email protected]('cols,exp,comp_func', [
+ ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal),
+ (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}),
+ tm.assert_frame_equal)
+])
[email protected]('agg_func', [
+ 'count', 'rank', 'size'])
+def test_transform_numeric_ret(cols, exp, comp_func, agg_func):
+ if agg_func == 'size' and isinstance(cols, list):
+ pytest.xfail("'size' transformation not supported with "
+ "NDFrameGroupy")
+
+ # GH 19200
+ df = pd.DataFrame(
+ {'a': pd.date_range('2018-01-01', periods=3),
+ 'b': range(3),
+ 'c': range(7, 10)})
+
+ result = df.groupby('b')[cols].transform(agg_func)
+
+ if agg_func == 'rank':
+ exp = exp.astype('float')
+
+ comp_func(result, exp)
+
+
[email protected]("mix_groupings", [True, False])
[email protected]("as_series", [True, False])
[email protected]("val1,val2", [
+ ('foo', 'bar'), (1, 2), (1., 2.)])
[email protected]("fill_method,limit,exp_vals", [
+ ("ffill", None,
+ [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']),
+ ("ffill", 1,
+ [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]),
+ ("bfill", None,
+ ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]),
+ ("bfill", 1,
+ [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan])
+])
+def test_group_fill_methods(mix_groupings, as_series, val1, val2,
+ fill_method, limit, exp_vals):
+ vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]
+ _exp_vals = list(exp_vals)
+ # Overwrite placeholder values
+ for index, exp_val in enumerate(_exp_vals):
+ if exp_val == 'val1':
+ _exp_vals[index] = val1
+ elif exp_val == 'val2':
+ _exp_vals[index] = val2
+
+ # Need to modify values and expectations depending on the
+ # Series / DataFrame that we ultimately want to generate
+ if mix_groupings: # ['a', 'b', 'a, 'b', ...]
+ keys = ['a', 'b'] * len(vals)
+
+ def interweave(list_obj):
+ temp = list()
+ for x in list_obj:
+ temp.extend([x, x])
+
+ return temp
+
+ _exp_vals = interweave(_exp_vals)
+ vals = interweave(vals)
+ else: # ['a', 'a', 'a', ... 'b', 'b', 'b']
+ keys = ['a'] * len(vals) + ['b'] * len(vals)
+ _exp_vals = _exp_vals * 2
+ vals = vals * 2
+
+ df = DataFrame({'key': keys, 'val': vals})
+ if as_series:
+ result = getattr(
+ df.groupby('key')['val'], fill_method)(limit=limit)
+ exp = Series(_exp_vals, name='val')
+ assert_series_equal(result, exp)
+ else:
+ result = getattr(df.groupby('key'), fill_method)(limit=limit)
+ exp = DataFrame({'key': keys, 'val': _exp_vals})
+ assert_frame_equal(result, exp)
+
+
[email protected]("fill_method", ['ffill', 'bfill'])
+def test_pad_stable_sorting(fill_method):
+ # GH 21207
+ x = [0] * 20
+ y = [np.nan] * 10 + [1] * 10
+
+ if fill_method == 'bfill':
+ y = y[::-1]
+
+ df = pd.DataFrame({'x': x, 'y': y})
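+ # the NaN block sits at the edge being filled from, so a correct
+ # stable group fill leaves the frame completely unchanged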
+ expected = df.copy()
+
+ result = getattr(df.groupby('x'), fill_method)()
+
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("test_series", [True, False])
[email protected]('freq', [
+ None,
+ pytest.param('D', marks=pytest.mark.xfail(
+ reason='GH#23918 before method uses freq in vectorized approach'))])
[email protected]("periods,fill_method,limit", [
+ (1, 'ffill', None), (1, 'ffill', 1),
+ (1, 'bfill', None), (1, 'bfill', 1),
+ (-1, 'ffill', None), (-1, 'ffill', 1),
+ (-1, 'bfill', None), (-1, 'bfill', 1),
+])
+def test_pct_change(test_series, freq, periods, fill_method, limit):
+ # GH 21200, 21621
+ vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4]
+ keys = ['a', 'b']
+ key_v = np.repeat(keys, len(vals))
+ df = DataFrame({'key': key_v, 'vals': vals * 2})
+
+ df_g = getattr(df.groupby('key'), fill_method)(limit=limit)
+ grp = df_g.groupby('key')
+
+ expected = grp['vals'].obj / grp['vals'].shift(periods) - 1
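+ # pct_change is fill-then-shift: compare the filled series against
+ # its group-wise shift rather than against the raw values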
+
+ if test_series:
+ result = df.groupby('key')['vals'].pct_change(
+ periods=periods, fill_method=fill_method, limit=limit, freq=freq)
+ tm.assert_series_equal(result, expected)
+ else:
+ result = df.groupby('key').pct_change(
+ periods=periods, fill_method=fill_method, limit=limit, freq=freq)
+ tm.assert_frame_equal(result, expected.to_frame('vals'))
+
+
[email protected]("func", [np.any, np.all])
+def test_any_all_np_func(func):
+ # GH 20653
+ df = pd.DataFrame([['foo', True],
+ [np.nan, True],
+ ['foo', True]], columns=['key', 'val'])
+
+ exp = pd.Series([True, np.nan, True], name='val')
+
+ res = df.groupby('key')['val'].transform(func)
+ tm.assert_series_equal(res, exp)
+
+
+def test_groupby_transform_rename():
+ # https://github.com/pandas-dev/pandas/issues/23461
+ def demean_rename(x):
+ result = x - x.mean()
+
+ if isinstance(x, pd.Series):
+ return result
+
+ result = result.rename(
+ columns={c: '{}_demeaned'.format(c) for c in result.columns})
+
+ return result
+
+ df = pd.DataFrame({'group': list('ababa'),
+ 'value': [1, 1, 1, 2, 2]})
+ expected = pd.DataFrame({'value': [-1. / 3, -0.5, -1. / 3, 0.5, 2. / 3]})
+
+ result = df.groupby('group').transform(demean_rename)
+ tm.assert_frame_equal(result, expected)
+ result_single = df.groupby('group').value.transform(demean_rename)
+ tm.assert_series_equal(result_single, expected['value'])
+
+
[email protected]('func', [min, max, np.min, np.max, 'first', 'last'])
+def test_groupby_transform_timezone_column(func):
+ # GH 24198
+ ts = pd.to_datetime('now', utc=True).tz_convert('Asia/Singapore')
+ result = pd.DataFrame({'end_time': [ts], 'id': [1]})
+ result['max_end_time'] = result.groupby('id').end_time.transform(func)
+ expected = pd.DataFrame([[ts, 1, ts]], columns=['end_time', 'id',
+ 'max_end_time'])
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_value_counts.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_value_counts.py
new file mode 100644
index 00000000000..2b5f87aa59a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_value_counts.py
@@ -0,0 +1,76 @@
+"""
+These tests systematically exercise all of the args to value_counts
+with different size combinations, to ensure stability of the sorting
+and proper parameter handling.
+"""
+
+from itertools import product
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, MultiIndex, Series, date_range
+from pandas.util import testing as tm
+
+
+# our starting frame
+def seed_df(seed_nans, n, m):
+ np.random.seed(1234)
+ days = date_range('2015-08-24', periods=10)
+
+ frame = DataFrame({
+ '1st': np.random.choice(
+ list('abcd'), n),
+ '2nd': np.random.choice(days, n),
+ '3rd': np.random.randint(1, m + 1, n)
+ })
+
+ if seed_nans:
+ frame.loc[1::11, '1st'] = np.nan
+ frame.loc[3::17, '2nd'] = np.nan
+ frame.loc[7::19, '3rd'] = np.nan
+ frame.loc[8::19, '3rd'] = np.nan
+ frame.loc[9::19, '3rd'] = np.nan
+
+ return frame
+
+
+# create input df, keys, and the bins
+binned = []
+ids = []
+for seed_nans in [True, False]:
+ for n, m in product((100, 1000), (5, 20)):
+
+ df = seed_df(seed_nans, n, m)
+ bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2)
+ keys = '1st', '2nd', ['1st', '2nd']
+ for k, b in product(keys, bins):
+ binned.append((df, k, b, n, m))
+ ids.append("{}-{}-{}".format(k, n, m))
+
+
[email protected]("df, keys, bins, n, m", binned, ids=ids)
+def test_series_groupby_value_counts(df, keys, bins, n, m):
+
+ def rebuild_index(df):
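+        # rebuild the MultiIndex from plain arrays so the two results
+        # compare equal regardless of how their levels were constructed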
+ arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
+ df.index = MultiIndex.from_arrays(arr, names=df.index.names)
+ return df
+
+ for isort, normalize, sort, ascending, dropna \
+ in product((False, True), repeat=5):
+
+ kwargs = dict(normalize=normalize, sort=sort,
+ ascending=ascending, dropna=dropna, bins=bins)
+
+ gr = df.groupby(keys, sort=isort)
+ left = gr['3rd'].value_counts(**kwargs)
+
+ gr = df.groupby(keys, sort=isort)
+ right = gr['3rd'].apply(Series.value_counts, **kwargs)
+ right.index.names = right.index.names[:-1] + ['3rd']
+
+ # have to sort on index because of unstable sort on values
+ left, right = map(rebuild_index, (left, right)) # xref GH9212
+ tm.assert_series_equal(left.sort_index(), right.sort_index())
diff --git a/contrib/python/pandas/py2/pandas/tests/groupby/test_whitelist.py b/contrib/python/pandas/py2/pandas/tests/groupby/test_whitelist.py
new file mode 100644
index 00000000000..b7302b3911e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/groupby/test_whitelist.py
@@ -0,0 +1,297 @@
+"""
+Test methods relating to generic function evaluation,
+i.e. the so-called white/black lists.
+"""
+
+from string import ascii_lowercase
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Index, MultiIndex, Series, compat, date_range
+from pandas.util import testing as tm
+
+AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
+ 'mad', 'std', 'var', 'sem']
+AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad']
+
+df_whitelist = [
+ 'quantile',
+ 'fillna',
+ 'mad',
+ 'take',
+ 'idxmax',
+ 'idxmin',
+ 'tshift',
+ 'skew',
+ 'plot',
+ 'hist',
+ 'dtypes',
+ 'corrwith',
+ 'corr',
+ 'cov',
+ 'diff',
+]
+
+
+@pytest.fixture(params=df_whitelist)
+def df_whitelist_fixture(request):
+ return request.param
+
+
+s_whitelist = [
+ 'quantile',
+ 'fillna',
+ 'mad',
+ 'take',
+ 'idxmax',
+ 'idxmin',
+ 'tshift',
+ 'skew',
+ 'plot',
+ 'hist',
+ 'dtype',
+ 'corr',
+ 'cov',
+ 'diff',
+ 'unique',
+ 'nlargest',
+ 'nsmallest',
+ 'is_monotonic_increasing',
+ 'is_monotonic_decreasing',
+]
+
+
+@pytest.fixture(params=s_whitelist)
+def s_whitelist_fixture(request):
+ return request.param
+
+
+@pytest.fixture
+def mframe():
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
+ 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ return DataFrame(np.random.randn(10, 3), index=index,
+ columns=['A', 'B', 'C'])
+
+
+@pytest.fixture
+def df():
+ return DataFrame(
+ {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
+ 'C': np.random.randn(8),
+ 'D': np.random.randn(8)})
+
+
+@pytest.fixture
+def df_letters():
+ letters = np.array(list(ascii_lowercase))
+ N = 10
+ random_letters = letters.take(np.random.randint(0, 26, N))
+ df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),
+ 'letters': Series(random_letters)})
+ return df
+
+
[email protected]("whitelist", [df_whitelist, s_whitelist])
+def test_groupby_whitelist(df_letters, whitelist):
+ df = df_letters
+ if whitelist == df_whitelist:
+ # dataframe
+ obj = df_letters
+ else:
+ obj = df_letters['floats']
+
+ gb = obj.groupby(df.letters)
+
+ assert set(whitelist) == set(gb._apply_whitelist)
+
+
+def check_whitelist(obj, df, m):
+ # check the obj for a particular whitelist m
+
+ gb = obj.groupby(df.letters)
+
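+    # look the attribute up on the class to inspect the underlying
+    # function/descriptor rather than a bound method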
+ f = getattr(type(gb), m)
+
+ # name
+ try:
+ n = f.__name__
+ except AttributeError:
+ return
+ assert n == m
+
+ # qualname
+ if compat.PY3:
+ try:
+ n = f.__qualname__
+ except AttributeError:
+ return
+ assert n.endswith(m)
+
+
+def test_groupby_series_whitelist(df_letters, s_whitelist_fixture):
+ m = s_whitelist_fixture
+ df = df_letters
+ check_whitelist(df.letters, df, m)
+
+
+def test_groupby_frame_whitelist(df_letters, df_whitelist_fixture):
+ m = df_whitelist_fixture
+ df = df_letters
+ check_whitelist(df, df, m)
+
+
+@pytest.fixture
+def raw_frame():
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
+ 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ raw_frame = DataFrame(np.random.randn(10, 3), index=index,
+ columns=Index(['A', 'B', 'C'], name='exp'))
+ raw_frame.iloc[1, [1, 2]] = np.nan
+ raw_frame.iloc[7, [0, 1]] = np.nan
+ return raw_frame
+
+
+@pytest.mark.parametrize('op', AGG_FUNCTIONS)
+@pytest.mark.parametrize('level', [0, 1])
+@pytest.mark.parametrize('axis', [0, 1])
+@pytest.mark.parametrize('skipna', [True, False])
+@pytest.mark.parametrize('sort', [True, False])
+def test_regression_whitelist_methods(
+ raw_frame, op, level,
+ axis, skipna, sort):
+ # GH6944
+ # GH 17537
+ # explicitly test the whitelist methods
+
+ if axis == 0:
+ frame = raw_frame
+ else:
+ frame = raw_frame.T
+
+ if op in AGG_FUNCTIONS_WITH_SKIPNA:
+ grouped = frame.groupby(level=level, axis=axis, sort=sort)
+ result = getattr(grouped, op)(skipna=skipna)
+ expected = getattr(frame, op)(level=level, axis=axis,
+ skipna=skipna)
+ if sort:
+ expected = expected.sort_index(axis=axis, level=level)
+ tm.assert_frame_equal(result, expected)
+ else:
+ grouped = frame.groupby(level=level, axis=axis, sort=sort)
+ result = getattr(grouped, op)()
+ expected = getattr(frame, op)(level=level, axis=axis)
+ if sort:
+ expected = expected.sort_index(axis=axis, level=level)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_blacklist(df_letters):
+ df = df_letters
+ s = df_letters.floats
+
+ blacklist = [
+ 'eval', 'query', 'abs', 'where',
+ 'mask', 'align', 'groupby', 'clip', 'astype',
+ 'at', 'combine', 'consolidate', 'convert_objects',
+ ]
+ to_methods = [method for method in dir(df) if method.startswith('to_')]
+
+ blacklist.extend(to_methods)
+
+ # e.g., to_csv
+ defined_but_not_allowed = ("(?:^Cannot.+{0!r}.+{1!r}.+try using the "
+ "'apply' method$)")
+
+ # e.g., query, eval
+ not_defined = "(?:^{1!r} object has no attribute {0!r}$)"
+ fmt = defined_but_not_allowed + '|' + not_defined
+ for bl in blacklist:
+ for obj in (df, s):
+ gb = obj.groupby(df.letters)
+ msg = fmt.format(bl, type(gb).__name__)
+ with pytest.raises(AttributeError, match=msg):
+ getattr(gb, bl)
+
+
+def test_tab_completion(mframe):
+ grp = mframe.groupby(level='second')
+ results = {v for v in dir(grp) if not v.startswith('_')}
+ expected = {
+ 'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter',
+ 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
+ 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot',
+ 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
+ 'nunique', 'head', 'describe', 'cummax', 'quantile',
+ 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
+ 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew',
+ 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
+ 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
+ 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe',
+ }
+ assert results == expected
+
+
+def test_groupby_function_rename(mframe):
+ grp = mframe.groupby(level='second')
+ for name in ['sum', 'prod', 'min', 'max', 'first', 'last']:
+ f = getattr(grp, name)
+ assert f.__name__ == name
+
+
+def test_groupby_selection_with_methods(df):
+ # some methods which require DatetimeIndex
+ rng = date_range('2014', periods=len(df))
+ df.index = rng
+
+ g = df.groupby(['A'])[['C']]
+ g_exp = df[['C']].groupby(df['A'])
+ # TODO check groupby with > 1 col ?
+
+ # methods which are called as .foo()
+ methods = ['count',
+ 'corr',
+ 'cummax',
+ 'cummin',
+ 'cumprod',
+ 'describe',
+ 'rank',
+ 'quantile',
+ 'diff',
+ 'shift',
+ 'all',
+ 'any',
+ 'idxmin',
+ 'idxmax',
+ 'ffill',
+ 'bfill',
+ 'pct_change',
+ 'tshift']
+
+ for m in methods:
+ res = getattr(g, m)()
+ exp = getattr(g_exp, m)()
+
+ # should always be frames!
+ tm.assert_frame_equal(res, exp)
+
+ # methods which aren't just .foo()
+ tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
+ tm.assert_frame_equal(g.dtypes, g_exp.dtypes)
+ tm.assert_frame_equal(g.apply(lambda x: x.sum()),
+ g_exp.apply(lambda x: x.sum()))
+
+ tm.assert_frame_equal(g.resample('D').mean(), g_exp.resample('D').mean())
+ tm.assert_frame_equal(g.resample('D').ohlc(),
+ g_exp.resample('D').ohlc())
+
+ tm.assert_frame_equal(g.filter(lambda x: len(x) == 3),
+ g_exp.filter(lambda x: len(x) == 3))
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/__init__.py b/contrib/python/pandas/py2/pandas/tests/indexes/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/common.py b/contrib/python/pandas/py2/pandas/tests/indexes/common.py
new file mode 100644
index 00000000000..24207da416b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/common.py
@@ -0,0 +1,928 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import iNaT
+import pandas.compat as compat
+from pandas.compat import PY3
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import (
+ CategoricalIndex, DatetimeIndex, Float64Index, Index, Int64Index,
+ IntervalIndex, MultiIndex, PeriodIndex, RangeIndex, Series, TimedeltaIndex,
+ UInt64Index, isna)
+from pandas.core.indexes.base import InvalidIndexError
+from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
+import pandas.util.testing as tm
+
+
+class Base(object):
+ """ base class for index sub-class tests """
+ _holder = None
+ _compat_props = ['shape', 'ndim', 'size', 'nbytes']
+
+ def setup_indices(self):
+ for name, idx in self.indices.items():
+ setattr(self, name, idx)
+
+ def test_pickle_compat_construction(self):
+        # the index class cannot be constructed without data
+ pytest.raises(TypeError, self._holder)
+
+ def test_to_series(self):
+ # assert that we are creating a copy of the index
+
+ idx = self.create_index()
+ s = idx.to_series()
+ assert s.values is not idx.values
+ assert s.index is not idx
+ assert s.name == idx.name
+
+ def test_to_series_with_arguments(self):
+ # GH18699
+
+ # index kwarg
+ idx = self.create_index()
+ s = idx.to_series(index=idx)
+
+ assert s.values is not idx.values
+ assert s.index is idx
+ assert s.name == idx.name
+
+ # name kwarg
+ idx = self.create_index()
+ s = idx.to_series(name='__test')
+
+ assert s.values is not idx.values
+ assert s.index is not idx
+ assert s.name != idx.name
+
+ @pytest.mark.parametrize("name", [None, "new_name"])
+ def test_to_frame(self, name):
+ # see GH-15230, GH-22580
+ idx = self.create_index()
+
+ if name:
+ idx_name = name
+ else:
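+            # an unnamed index becomes column 0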
+ idx_name = idx.name or 0
+
+ df = idx.to_frame(name=idx_name)
+
+ assert df.index is idx
+ assert len(df.columns) == 1
+ assert df.columns[0] == idx_name
+ assert df[idx_name].values is not idx.values
+
+ df = idx.to_frame(index=False, name=idx_name)
+ assert df.index is not idx
+
+ def test_shift(self):
+
+ # GH8083 test the base class for shift
+ idx = self.create_index()
+ pytest.raises(NotImplementedError, idx.shift, 1)
+ pytest.raises(NotImplementedError, idx.shift, 1, 2)
+
+ def test_create_index_existing_name(self):
+
+ # GH11193, when an existing index is passed, and a new name is not
+ # specified, the new index should inherit the previous object name
+ expected = self.create_index()
+ if not isinstance(expected, MultiIndex):
+ expected.name = 'foo'
+ result = pd.Index(expected)
+ tm.assert_index_equal(result, expected)
+
+ result = pd.Index(expected, name='bar')
+ expected.name = 'bar'
+ tm.assert_index_equal(result, expected)
+ else:
+ expected.names = ['foo', 'bar']
+ result = pd.Index(expected)
+ tm.assert_index_equal(
+ result, Index(Index([('foo', 'one'), ('foo', 'two'),
+ ('bar', 'one'), ('baz', 'two'),
+ ('qux', 'one'), ('qux', 'two')],
+ dtype='object'),
+ names=['foo', 'bar']))
+
+ result = pd.Index(expected, names=['A', 'B'])
+ tm.assert_index_equal(
+ result,
+ Index(Index([('foo', 'one'), ('foo', 'two'), ('bar', 'one'),
+ ('baz', 'two'), ('qux', 'one'), ('qux', 'two')],
+ dtype='object'), names=['A', 'B']))
+
+ def test_numeric_compat(self):
+
+ idx = self.create_index()
+ with pytest.raises(TypeError, match="cannot perform __mul__"):
+ idx * 1
+ with pytest.raises(TypeError, match="cannot perform __rmul__"):
+ 1 * idx
+
+ div_err = ("cannot perform __truediv__" if PY3
+ else "cannot perform __div__")
+ with pytest.raises(TypeError, match=div_err):
+ idx / 1
+
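+        # the reflected variants report the corresponding __r*__ name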
+ div_err = div_err.replace(' __', ' __r')
+ with pytest.raises(TypeError, match=div_err):
+ 1 / idx
+ with pytest.raises(TypeError, match="cannot perform __floordiv__"):
+ idx // 1
+ with pytest.raises(TypeError, match="cannot perform __rfloordiv__"):
+ 1 // idx
+
+ def test_logical_compat(self):
+ idx = self.create_index()
+ with pytest.raises(TypeError, match='cannot perform all'):
+ idx.all()
+ with pytest.raises(TypeError, match='cannot perform any'):
+ idx.any()
+
+ def test_boolean_context_compat(self):
+
+ # boolean context compat
+ idx = self.create_index()
+
+ with pytest.raises(ValueError, match='The truth value of a'):
+ if idx:
+ pass
+
+ def test_reindex_base(self):
+ idx = self.create_index()
+ expected = np.arange(idx.size, dtype=np.intp)
+
+ actual = idx.get_indexer(idx)
+ tm.assert_numpy_array_equal(expected, actual)
+
+ with pytest.raises(ValueError, match='Invalid fill method'):
+ idx.get_indexer(idx, method='invalid')
+
+ def test_get_indexer_consistency(self):
+ # See GH 16819
+ for name, index in self.indices.items():
+ if isinstance(index, IntervalIndex):
+ continue
+
+ if index.is_unique or isinstance(index, CategoricalIndex):
+ indexer = index.get_indexer(index[0:2])
+ assert isinstance(indexer, np.ndarray)
+ assert indexer.dtype == np.intp
+ else:
+ e = "Reindexing only valid with uniquely valued Index objects"
+ with pytest.raises(InvalidIndexError, match=e):
+ index.get_indexer(index[0:2])
+
+ indexer, _ = index.get_indexer_non_unique(index[0:2])
+ assert isinstance(indexer, np.ndarray)
+ assert indexer.dtype == np.intp
+
+ def test_ndarray_compat_properties(self):
+ idx = self.create_index()
+ assert idx.T.equals(idx)
+ assert idx.transpose().equals(idx)
+
+ values = idx.values
+ for prop in self._compat_props:
+ assert getattr(idx, prop) == getattr(values, prop)
+
+ # test for validity
+ idx.nbytes
+ idx.values.nbytes
+
+ def test_repr_roundtrip(self):
+
+ idx = self.create_index()
+ tm.assert_index_equal(eval(repr(idx)), idx)
+
+ def test_str(self):
+
+ # test the string repr
+ idx = self.create_index()
+ idx.name = 'foo'
+ assert "'foo'" in str(idx)
+ assert idx.__class__.__name__ in str(idx)
+
+ def test_repr_max_seq_item_setting(self):
+ # GH10182
+ idx = self.create_index()
+ idx = idx.repeat(50)
+ with pd.option_context("display.max_seq_items", None):
+ repr(idx)
+ assert '...' not in str(idx)
+
+ def test_copy_name(self):
+ # gh-12309: Check that the "name" argument
+ # passed at initialization is honored.
+
+ for name, index in compat.iteritems(self.indices):
+ if isinstance(index, MultiIndex):
+ continue
+
+ first = index.__class__(index, copy=True, name='mario')
+ second = first.__class__(first, copy=False)
+
+ # Even though "copy=False", we want a new object.
+ assert first is not second
+
+ # Not using tm.assert_index_equal() since names differ.
+ assert index.equals(first)
+
+ assert first.name == 'mario'
+ assert second.name == 'mario'
+
+ s1 = Series(2, index=first)
+ s2 = Series(3, index=second[:-1])
+
+ if not isinstance(index, CategoricalIndex):
+ # See gh-13365
+ s3 = s1 * s2
+ assert s3.index.name == 'mario'
+
+ def test_ensure_copied_data(self):
+ # Check the "copy" argument of each Index.__new__ is honoured
+ # GH12309
+ for name, index in compat.iteritems(self.indices):
+ init_kwargs = {}
+ if isinstance(index, PeriodIndex):
+ # Needs "freq" specification:
+ init_kwargs['freq'] = index.freq
+ elif isinstance(index, (RangeIndex, MultiIndex, CategoricalIndex)):
+ # RangeIndex cannot be initialized from data
+ # MultiIndex and CategoricalIndex are tested separately
+ continue
+
+ index_type = index.__class__
+ result = index_type(index.values, copy=True, **init_kwargs)
+ tm.assert_index_equal(index, result)
+ tm.assert_numpy_array_equal(index._ndarray_values,
+ result._ndarray_values,
+ check_same='copy')
+
+ if isinstance(index, PeriodIndex):
+                # .values is an object array of Periods, thus copied
+ result = index_type(ordinal=index.asi8, copy=False,
+ **init_kwargs)
+ tm.assert_numpy_array_equal(index._ndarray_values,
+ result._ndarray_values,
+ check_same='same')
+ elif isinstance(index, IntervalIndex):
+ # checked in test_interval.py
+ pass
+ else:
+ result = index_type(index.values, copy=False, **init_kwargs)
+ tm.assert_numpy_array_equal(index.values, result.values,
+ check_same='same')
+ tm.assert_numpy_array_equal(index._ndarray_values,
+ result._ndarray_values,
+ check_same='same')
+
+ def test_memory_usage(self):
+ for name, index in compat.iteritems(self.indices):
+ result = index.memory_usage()
+ if len(index):
+ index.get_loc(index[0])
+ result2 = index.memory_usage()
+ result3 = index.memory_usage(deep=True)
+
+ # RangeIndex, IntervalIndex
+ # don't have engines
+ if not isinstance(index, (RangeIndex, IntervalIndex)):
+ assert result2 > result
+
+ if index.inferred_type == 'object':
+ assert result3 > result2
+
+ else:
+
+ # we report 0 for no-length
+ assert result == 0
+
+ def test_argsort(self):
+ for k, ind in self.indices.items():
+
+ # separately tested
+ if k in ['catIndex']:
+ continue
+
+ result = ind.argsort()
+ expected = np.array(ind).argsort()
+ tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
+ def test_numpy_argsort(self):
+ for k, ind in self.indices.items():
+ result = np.argsort(ind)
+ expected = ind.argsort()
+ tm.assert_numpy_array_equal(result, expected)
+
+ # these are the only two types that perform
+ # pandas compatibility input validation - the
+ # rest already perform separate (or no) such
+ # validation via their 'values' attribute as
+            # defined in pandas.core.indexes.base - they
+ # cannot be changed at the moment due to
+ # backwards compatibility concerns
+            if isinstance(ind, (CategoricalIndex, RangeIndex)):
+ msg = "the 'axis' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.argsort(ind, axis=1)
+
+ msg = "the 'kind' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.argsort(ind, kind='mergesort')
+
+ msg = "the 'order' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.argsort(ind, order=('a', 'b'))
+
+ def test_take(self):
+ indexer = [4, 3, 0, 2]
+ for k, ind in self.indices.items():
+
+ # separate
+ if k in ['boolIndex', 'tuples', 'empty']:
+ continue
+
+ result = ind.take(indexer)
+ expected = ind[indexer]
+ assert result.equals(expected)
+
+ if not isinstance(ind,
+ (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
+ # GH 10791
+ with pytest.raises(AttributeError):
+ ind.freq
+
+ def test_take_invalid_kwargs(self):
+ idx = self.create_index()
+ indices = [1, 2]
+
+ msg = r"take\(\) got an unexpected keyword argument 'foo'"
+ with pytest.raises(TypeError, match=msg):
+ idx.take(indices, foo=2)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, out=indices)
+
+ msg = "the 'mode' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, mode='clip')
+
+ def test_repeat(self):
+ rep = 2
+ i = self.create_index()
+ expected = pd.Index(i.values.repeat(rep), name=i.name)
+ tm.assert_index_equal(i.repeat(rep), expected)
+
+ i = self.create_index()
+ rep = np.arange(len(i))
+ expected = pd.Index(i.values.repeat(rep), name=i.name)
+ tm.assert_index_equal(i.repeat(rep), expected)
+
+ def test_numpy_repeat(self):
+ rep = 2
+ i = self.create_index()
+ expected = i.repeat(rep)
+ tm.assert_index_equal(np.repeat(i, rep), expected)
+
+ msg = "the 'axis' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.repeat(i, rep, axis=0)
+
+ @pytest.mark.parametrize('klass', [list, tuple, np.array, Series])
+ def test_where(self, klass):
+ i = self.create_index()
+
+ cond = [True] * len(i)
+ result = i.where(klass(cond))
+ expected = i
+ tm.assert_index_equal(result, expected)
+
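+        # masking out the first element should insert the index's NA value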
+ cond = [False] + [True] * len(i[1:])
+ expected = pd.Index([i._na_value] + i[1:].tolist(), dtype=i.dtype)
+ result = i.where(klass(cond))
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("case", [0.5, "xxx"])
+ @pytest.mark.parametrize("method", ["intersection", "union",
+ "difference", "symmetric_difference"])
+ def test_set_ops_error_cases(self, case, method):
+ for name, idx in compat.iteritems(self.indices):
+ # non-iterable input
+
+ msg = "Input must be Index or array-like"
+ with pytest.raises(TypeError, match=msg):
+ getattr(idx, method)(case)
+
+ def test_intersection_base(self):
+ for name, idx in compat.iteritems(self.indices):
+ first = idx[:5]
+ second = idx[:3]
+ intersect = first.intersection(second)
+
+ if isinstance(idx, CategoricalIndex):
+ pass
+ else:
+ assert tm.equalContents(intersect, second)
+
+ # GH 10149
+ cases = [klass(second.values)
+ for klass in [np.array, Series, list]]
+ for case in cases:
+ if isinstance(idx, PeriodIndex):
+ msg = "can only call with other PeriodIndex-ed objects"
+ with pytest.raises(ValueError, match=msg):
+ first.intersection(case)
+ elif isinstance(idx, CategoricalIndex):
+ pass
+ else:
+ result = first.intersection(case)
+ assert tm.equalContents(result, second)
+
+ if isinstance(idx, MultiIndex):
+ msg = "other must be a MultiIndex or a list of tuples"
+ with pytest.raises(TypeError, match=msg):
+ first.intersection([1, 2, 3])
+
+ def test_union_base(self):
+ for name, idx in compat.iteritems(self.indices):
+ first = idx[3:]
+ second = idx[:5]
+ everything = idx
+ union = first.union(second)
+ assert tm.equalContents(union, everything)
+
+ # GH 10149
+ cases = [klass(second.values)
+ for klass in [np.array, Series, list]]
+ for case in cases:
+ if isinstance(idx, PeriodIndex):
+ msg = "can only call with other PeriodIndex-ed objects"
+ with pytest.raises(ValueError, match=msg):
+ first.union(case)
+ elif isinstance(idx, CategoricalIndex):
+ pass
+ else:
+ result = first.union(case)
+ assert tm.equalContents(result, everything)
+
+ if isinstance(idx, MultiIndex):
+ msg = "other must be a MultiIndex or a list of tuples"
+ with pytest.raises(TypeError, match=msg):
+ first.union([1, 2, 3])
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_base(self, sort):
+ for name, idx in compat.iteritems(self.indices):
+ first = idx[2:]
+ second = idx[:4]
+ answer = idx[4:]
+ result = first.difference(second, sort)
+
+ if isinstance(idx, CategoricalIndex):
+ pass
+ else:
+ assert tm.equalContents(result, answer)
+
+ # GH 10149
+ cases = [klass(second.values)
+ for klass in [np.array, Series, list]]
+ for case in cases:
+ if isinstance(idx, PeriodIndex):
+ msg = "can only call with other PeriodIndex-ed objects"
+ with pytest.raises(ValueError, match=msg):
+ first.difference(case, sort)
+ elif isinstance(idx, CategoricalIndex):
+ pass
+ elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)):
+ assert result.__class__ == answer.__class__
+ tm.assert_numpy_array_equal(result.sort_values().asi8,
+ answer.sort_values().asi8)
+ else:
+ result = first.difference(case, sort)
+ assert tm.equalContents(result, answer)
+
+ if isinstance(idx, MultiIndex):
+ msg = "other must be a MultiIndex or a list of tuples"
+ with pytest.raises(TypeError, match=msg):
+ first.difference([1, 2, 3], sort)
+
+ def test_symmetric_difference(self):
+ for name, idx in compat.iteritems(self.indices):
+ first = idx[1:]
+ second = idx[:-1]
+ if isinstance(idx, CategoricalIndex):
+ pass
+ else:
+ answer = idx[[0, -1]]
+ result = first.symmetric_difference(second)
+ assert tm.equalContents(result, answer)
+
+ # GH 10149
+ cases = [klass(second.values)
+ for klass in [np.array, Series, list]]
+ for case in cases:
+ if isinstance(idx, PeriodIndex):
+ msg = "can only call with other PeriodIndex-ed objects"
+ with pytest.raises(ValueError, match=msg):
+ first.symmetric_difference(case)
+ elif isinstance(idx, CategoricalIndex):
+ pass
+ else:
+ result = first.symmetric_difference(case)
+ assert tm.equalContents(result, answer)
+
+ if isinstance(idx, MultiIndex):
+ msg = "other must be a MultiIndex or a list of tuples"
+ with pytest.raises(TypeError, match=msg):
+ first.symmetric_difference([1, 2, 3])
+
+ def test_insert_base(self):
+
+ for name, idx in compat.iteritems(self.indices):
+ result = idx[1:4]
+
+ if not len(idx):
+ continue
+
+ # test 0th element
+ assert idx[0:4].equals(result.insert(0, idx[0]))
+
+ def test_delete_base(self):
+
+ for name, idx in compat.iteritems(self.indices):
+
+ if not len(idx):
+ continue
+
+ if isinstance(idx, RangeIndex):
+ # tested in class
+ continue
+
+ expected = idx[1:]
+ result = idx.delete(0)
+ assert result.equals(expected)
+ assert result.name == expected.name
+
+ expected = idx[:-1]
+ result = idx.delete(-1)
+ assert result.equals(expected)
+ assert result.name == expected.name
+
+ with pytest.raises((IndexError, ValueError)):
+ # either depending on numpy version
+ idx.delete(len(idx))
+
+ def test_equals(self):
+
+ for name, idx in compat.iteritems(self.indices):
+ assert idx.equals(idx)
+ assert idx.equals(idx.copy())
+ assert idx.equals(idx.astype(object))
+
+ assert not idx.equals(list(idx))
+ assert not idx.equals(np.array(idx))
+
+ # Cannot pass in non-int64 dtype to RangeIndex
+ if not isinstance(idx, RangeIndex):
+ same_values = Index(idx, dtype=object)
+ assert idx.equals(same_values)
+ assert same_values.equals(idx)
+
+ if idx.nlevels == 1:
+ # do not test MultiIndex
+ assert not idx.equals(pd.Series(idx))
+
+ def test_equals_op(self):
+ # GH9947, GH10637
+ index_a = self.create_index()
+ if isinstance(index_a, PeriodIndex):
+ pytest.skip('Skip check for PeriodIndex')
+
+ n = len(index_a)
+ index_b = index_a[0:-1]
+ index_c = index_a[0:-1].append(index_a[-2:-1])
+ index_d = index_a[0:1]
+
+ msg = "Lengths must match|could not be broadcast"
+ with pytest.raises(ValueError, match=msg):
+ index_a == index_b
+ expected1 = np.array([True] * n)
+ expected2 = np.array([True] * (n - 1) + [False])
+ tm.assert_numpy_array_equal(index_a == index_a, expected1)
+ tm.assert_numpy_array_equal(index_a == index_c, expected2)
+
+ # test comparisons with numpy arrays
+ array_a = np.array(index_a)
+ array_b = np.array(index_a[0:-1])
+ array_c = np.array(index_a[0:-1].append(index_a[-2:-1]))
+ array_d = np.array(index_a[0:1])
+ with pytest.raises(ValueError, match=msg):
+ index_a == array_b
+ tm.assert_numpy_array_equal(index_a == array_a, expected1)
+ tm.assert_numpy_array_equal(index_a == array_c, expected2)
+
+ # test comparisons with Series
+ series_a = Series(array_a)
+ series_b = Series(array_b)
+ series_c = Series(array_c)
+ series_d = Series(array_d)
+ with pytest.raises(ValueError, match=msg):
+ index_a == series_b
+
+ tm.assert_numpy_array_equal(index_a == series_a, expected1)
+ tm.assert_numpy_array_equal(index_a == series_c, expected2)
+
+ # cases where length is 1 for one of them
+ with pytest.raises(ValueError, match="Lengths must match"):
+ index_a == index_d
+ with pytest.raises(ValueError, match="Lengths must match"):
+ index_a == series_d
+ with pytest.raises(ValueError, match="Lengths must match"):
+ index_a == array_d
+ msg = "Can only compare identically-labeled Series objects"
+ with pytest.raises(ValueError, match=msg):
+ series_a == series_d
+ with pytest.raises(ValueError, match="Lengths must match"):
+ series_a == array_d
+
+ # comparing with a scalar should broadcast; note that we are excluding
+ # MultiIndex because in this case each item in the index is a tuple of
+ # length 2, and therefore is considered an array of length 2 in the
+ # comparison instead of a scalar
+ if not isinstance(index_a, MultiIndex):
+ expected3 = np.array([False] * (len(index_a) - 2) + [True, False])
+ # assuming the 2nd to last item is unique in the data
+ item = index_a[-2]
+ tm.assert_numpy_array_equal(index_a == item, expected3)
+ tm.assert_series_equal(series_a == item, Series(expected3))
+
+ def test_numpy_ufuncs(self):
+ # test ufuncs of numpy, see:
+ # http://docs.scipy.org/doc/numpy/reference/ufuncs.html
+
+ for name, idx in compat.iteritems(self.indices):
+ for func in [np.exp, np.exp2, np.expm1, np.log, np.log2, np.log10,
+ np.log1p, np.sqrt, np.sin, np.cos, np.tan, np.arcsin,
+ np.arccos, np.arctan, np.sinh, np.cosh, np.tanh,
+ np.arcsinh, np.arccosh, np.arctanh, np.deg2rad,
+ np.rad2deg]:
+ if isinstance(idx, DatetimeIndexOpsMixin):
+ # raise TypeError or ValueError (PeriodIndex)
+ # PeriodIndex behavior should be changed in future version
+ with pytest.raises(Exception):
+ with np.errstate(all='ignore'):
+ func(idx)
+ elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)):
+ # coerces to float (e.g. np.sin)
+ with np.errstate(all='ignore'):
+ result = func(idx)
+ exp = Index(func(idx.values), name=idx.name)
+
+ tm.assert_index_equal(result, exp)
+ assert isinstance(result, pd.Float64Index)
+ else:
+ # raise AttributeError or TypeError
+ if len(idx) == 0:
+ continue
+ else:
+ with pytest.raises(Exception):
+ with np.errstate(all='ignore'):
+ func(idx)
+
+ for func in [np.isfinite, np.isinf, np.isnan, np.signbit]:
+ if isinstance(idx, DatetimeIndexOpsMixin):
+ # raise TypeError or ValueError (PeriodIndex)
+ with pytest.raises(Exception):
+ func(idx)
+ elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)):
+ # Results in bool array
+ result = func(idx)
+ assert isinstance(result, np.ndarray)
+ assert not isinstance(result, Index)
+ else:
+ if len(idx) == 0:
+ continue
+ else:
+ with pytest.raises(Exception):
+ func(idx)
+
+ def test_hasnans_isnans(self):
+ # GH 11343, added tests for hasnans / isnans
+ for name, index in self.indices.items():
+ if isinstance(index, MultiIndex):
+ pass
+ else:
+ idx = index.copy()
+
+                # the cases in indices don't include NaN
+ expected = np.array([False] * len(idx), dtype=bool)
+ tm.assert_numpy_array_equal(idx._isnan, expected)
+ assert idx.hasnans is False
+
+ idx = index.copy()
+ values = np.asarray(idx.values)
+
+ if len(index) == 0:
+ continue
+ elif isinstance(index, DatetimeIndexOpsMixin):
+ values[1] = iNaT
+ elif isinstance(index, (Int64Index, UInt64Index)):
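+                    # integer indexes cannot hold NaN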
+ continue
+ else:
+ values[1] = np.nan
+
+ if isinstance(index, PeriodIndex):
+ idx = index.__class__(values, freq=index.freq)
+ else:
+ idx = index.__class__(values)
+
+ expected = np.array([False] * len(idx), dtype=bool)
+ expected[1] = True
+ tm.assert_numpy_array_equal(idx._isnan, expected)
+ assert idx.hasnans is True
+
+ def test_fillna(self):
+ # GH 11343
+ for name, index in self.indices.items():
+ if len(index) == 0:
+ pass
+ elif isinstance(index, MultiIndex):
+ idx = index.copy()
+ msg = "isna is not defined for MultiIndex"
+ with pytest.raises(NotImplementedError, match=msg):
+ idx.fillna(idx[0])
+ else:
+ idx = index.copy()
+ result = idx.fillna(idx[0])
+ tm.assert_index_equal(result, idx)
+ assert result is not idx
+
+ msg = "'value' must be a scalar, passed: "
+ with pytest.raises(TypeError, match=msg):
+ idx.fillna([idx[0]])
+
+ idx = index.copy()
+ values = np.asarray(idx.values)
+
+ if isinstance(index, DatetimeIndexOpsMixin):
+ values[1] = iNaT
+ elif isinstance(index, (Int64Index, UInt64Index)):
+ continue
+ else:
+ values[1] = np.nan
+
+ if isinstance(index, PeriodIndex):
+ idx = index.__class__(values, freq=index.freq)
+ else:
+ idx = index.__class__(values)
+
+ expected = np.array([False] * len(idx), dtype=bool)
+ expected[1] = True
+ tm.assert_numpy_array_equal(idx._isnan, expected)
+ assert idx.hasnans is True
+
+ def test_nulls(self):
+        # this is really a smoke test for the methods,
+        # as their behavior is adequately tested elsewhere
+
+ for name, index in self.indices.items():
+ if len(index) == 0:
+ tm.assert_numpy_array_equal(
+ index.isna(), np.array([], dtype=bool))
+ elif isinstance(index, MultiIndex):
+ idx = index.copy()
+ msg = "isna is not defined for MultiIndex"
+ with pytest.raises(NotImplementedError, match=msg):
+ idx.isna()
+ else:
+
+ if not index.hasnans:
+ tm.assert_numpy_array_equal(
+ index.isna(), np.zeros(len(index), dtype=bool))
+ tm.assert_numpy_array_equal(
+ index.notna(), np.ones(len(index), dtype=bool))
+ else:
+ result = isna(index)
+ tm.assert_numpy_array_equal(index.isna(), result)
+ tm.assert_numpy_array_equal(index.notna(), ~result)
+
+ def test_empty(self):
+ # GH 15270
+ index = self.create_index()
+ assert not index.empty
+ assert index[:0].empty
+
+ def test_join_self_unique(self, join_type):
+ index = self.create_index()
+ if index.is_unique:
+ joined = index.join(index, how=join_type)
+ assert (index == joined).all()
+
+ def test_map(self):
+ # callable
+ index = self.create_index()
+
+ # we don't infer UInt64
+ if isinstance(index, pd.UInt64Index):
+ expected = index.astype('int64')
+ else:
+ expected = index
+
+ result = index.map(lambda x: x)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "mapper",
+ [
+ lambda values, index: {i: e for e, i in zip(values, index)},
+ lambda values, index: pd.Series(values, index)])
+ def test_map_dictlike(self, mapper):
+
+ index = self.create_index()
+ if isinstance(index, (pd.CategoricalIndex, pd.IntervalIndex)):
+ pytest.skip("skipping tests for {}".format(type(index)))
+
+ identity = mapper(index.values, index)
+
+ # we don't infer to UInt64 for a dict
+ if isinstance(index, pd.UInt64Index) and isinstance(identity, dict):
+ expected = index.astype('int64')
+ else:
+ expected = index
+
+ result = index.map(identity)
+ tm.assert_index_equal(result, expected)
+
+ # empty mappable
+ expected = pd.Index([np.nan] * len(index))
+ result = index.map(mapper(expected, index))
+ tm.assert_index_equal(result, expected)
+
+ def test_putmask_with_wrong_mask(self):
+ # GH18368
+ index = self.create_index()
+
+ with pytest.raises(ValueError):
+ index.putmask(np.ones(len(index) + 1, np.bool), 1)
+
+ with pytest.raises(ValueError):
+ index.putmask(np.ones(len(index) - 1, np.bool), 1)
+
+ with pytest.raises(ValueError):
+ index.putmask('foo', 1)
+
+ @pytest.mark.parametrize('copy', [True, False])
+ @pytest.mark.parametrize('name', [None, 'foo'])
+ @pytest.mark.parametrize('ordered', [True, False])
+ def test_astype_category(self, copy, name, ordered):
+ # GH 18630
+ index = self.create_index()
+ if name:
+ index = index.rename(name)
+
+ # standard categories
+ dtype = CategoricalDtype(ordered=ordered)
+ result = index.astype(dtype, copy=copy)
+ expected = CategoricalIndex(index.values, name=name, ordered=ordered)
+ tm.assert_index_equal(result, expected)
+
+ # non-standard categories
+ dtype = CategoricalDtype(index.unique().tolist()[:-1], ordered)
+ result = index.astype(dtype, copy=copy)
+ expected = CategoricalIndex(index.values, name=name, dtype=dtype)
+ tm.assert_index_equal(result, expected)
+
+ if ordered is False:
+ # dtype='category' defaults to ordered=False, so only test once
+ result = index.astype('category', copy=copy)
+ expected = CategoricalIndex(index.values, name=name)
+ tm.assert_index_equal(result, expected)
+
+ def test_is_unique(self):
+ # initialize a unique index
+ index = self.create_index().drop_duplicates()
+ assert index.is_unique is True
+
+ # empty index should be unique
+ index_empty = index[:0]
+ assert index_empty.is_unique is True
+
+ # test basic dupes
+ index_dup = index.insert(0, index[0])
+ assert index_dup.is_unique is False
+
+ # single NA should be unique
+ index_na = index.insert(0, np.nan)
+ assert index_na.is_unique is True
+
+ # multiple NA should not be unique
+ index_na_dup = index_na.insert(0, np.nan)
+ assert index_na_dup.is_unique is False
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/conftest.py b/contrib/python/pandas/py2/pandas/tests/indexes/conftest.py
new file mode 100644
index 00000000000..e82cce873e7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/conftest.py
@@ -0,0 +1,49 @@
+import numpy as np
+import pytest
+
+from pandas.compat import long, lzip
+
+import pandas as pd
+from pandas.core.indexes.api import Index, MultiIndex
+import pandas.util.testing as tm
+
+
+@pytest.fixture(params=[tm.makeUnicodeIndex(100),
+ tm.makeStringIndex(100),
+ tm.makeDateIndex(100),
+ tm.makePeriodIndex(100),
+ tm.makeTimedeltaIndex(100),
+ tm.makeIntIndex(100),
+ tm.makeUIntIndex(100),
+ tm.makeRangeIndex(100),
+ tm.makeFloatIndex(100),
+ Index([True, False]),
+ tm.makeCategoricalIndex(100),
+ Index([]),
+ MultiIndex.from_tuples(lzip(
+ ['foo', 'bar', 'baz'], [1, 2, 3])),
+ Index([0, 0, 1, 1, 2, 2])],
+ ids=lambda x: type(x).__name__)
+def indices(request):
+ return request.param
+
+
+@pytest.fixture(params=[1, np.array(1, dtype=np.int64)])
+def one(request):
+ # zero-dim integer array behaves like an integer
+ return request.param
+
+
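+# several flavors of zero: length-5 Index/array zeros for each numeric dtype,
+# zero-dim arrays, and plain scalar zeros (int, float, long)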
+zeros = [box([0] * 5, dtype=dtype)
+ for box in [pd.Index, np.array]
+ for dtype in [np.int64, np.uint64, np.float64]]
+zeros.extend([np.array(0, dtype=dtype)
+ for dtype in [np.int64, np.uint64, np.float64]])
+zeros.extend([0, 0.0, long(0)])
+
+
+@pytest.fixture(params=zeros)
+def zero(request):
+ # For testing division by (or of) zero for Index with length 5, this
+ # gives several scalar-zeros and length-5 vector-zeros
+ return request.param
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimelike.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimelike.py
new file mode 100644
index 00000000000..180033c2d26
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimelike.py
@@ -0,0 +1,101 @@
+""" generic datetimelike tests """
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+from .common import Base
+
+
+class DatetimeLike(Base):
+
+ def test_argmax_axis_invalid(self):
+ # GH#23081
+ rng = self.create_index()
+ with pytest.raises(ValueError):
+ rng.argmax(axis=1)
+ with pytest.raises(ValueError):
+ rng.argmin(axis=2)
+ with pytest.raises(ValueError):
+ rng.min(axis=-2)
+ with pytest.raises(ValueError):
+ rng.max(axis=-3)
+
+ def test_can_hold_identifiers(self):
+ idx = self.create_index()
+ key = idx[0]
+ assert idx._can_hold_identifiers_and_holds_name(key) is False
+
+ def test_shift_identity(self):
+
+ idx = self.create_index()
+ tm.assert_index_equal(idx, idx.shift(0))
+
+ def test_str(self):
+
+ # test the string repr
+ idx = self.create_index()
+ idx.name = 'foo'
+ assert not "length=%s" % len(idx) in str(idx)
+ assert "'foo'" in str(idx)
+ assert idx.__class__.__name__ in str(idx)
+
+ if hasattr(idx, 'tz'):
+ if idx.tz is not None:
+ assert idx.tz in str(idx)
+ if hasattr(idx, 'freq'):
+ assert "freq='%s'" % idx.freqstr in str(idx)
+
+ def test_view(self):
+ i = self.create_index()
+
+ i_view = i.view('i8')
+ result = self._holder(i)
+ tm.assert_index_equal(result, i)
+
+ i_view = i.view(self._holder)
+ result = self._holder(i)
+ tm.assert_index_equal(result, i_view)
+
+ def test_map_callable(self):
+ expected = self.index + self.index.freq
+ result = self.index.map(lambda x: x + x.freq)
+ tm.assert_index_equal(result, expected)
+
+ # map to NaT
+ result = self.index.map(lambda x: pd.NaT if x == self.index[0] else x)
+ expected = pd.Index([pd.NaT] + self.index[1:].tolist())
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "mapper",
+ [
+ lambda values, index: {i: e for e, i in zip(values, index)},
+ lambda values, index: pd.Series(values, index)])
+ def test_map_dictlike(self, mapper):
+ expected = self.index + self.index.freq
+
+ # don't compare the freqs
+ if isinstance(expected, pd.DatetimeIndex):
+ expected.freq = None
+
+ result = self.index.map(mapper(expected, self.index))
+ tm.assert_index_equal(result, expected)
+
+ expected = pd.Index([pd.NaT] + self.index[1:].tolist())
+ result = self.index.map(mapper(expected, self.index))
+ tm.assert_index_equal(result, expected)
+
+ # empty map; these map to np.nan because we cannot know
+        # how to re-infer the dtype
+ expected = pd.Index([np.nan] * len(self.index))
+ result = self.index.map(mapper([], []))
+ tm.assert_index_equal(result, expected)
+
+ def test_asobject_deprecated(self):
+ # GH18572
+ d = self.create_index()
+ with tm.assert_produces_warning(FutureWarning):
+ i = d.asobject
+ assert isinstance(i, pd.Index)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/__init__.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_arithmetic.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_arithmetic.py
new file mode 100644
index 00000000000..1b75d6bd347
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_arithmetic.py
@@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime
+
+import pytest
+import pytz
+
+from pandas.errors import NullFrequencyError
+
+import pandas as pd
+from pandas import DatetimeIndex, Series, date_range
+import pandas.util.testing as tm
+
+
+class TestDatetimeIndexArithmetic(object):
+
+ # -------------------------------------------------------------
+ # DatetimeIndex.shift is used in integer addition
+
+ def test_dti_shift_tzaware(self, tz_naive_fixture):
+ # GH#9903
+ tz = tz_naive_fixture
+ idx = pd.DatetimeIndex([], name='xxx', tz=tz)
+ tm.assert_index_equal(idx.shift(0, freq='H'), idx)
+ tm.assert_index_equal(idx.shift(3, freq='H'), idx)
+
+ idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01 11:00',
+ '2011-01-01 12:00'], name='xxx', tz=tz)
+ tm.assert_index_equal(idx.shift(0, freq='H'), idx)
+ exp = pd.DatetimeIndex(['2011-01-01 13:00', '2011-01-01 14:00',
+ '2011-01-01 15:00'], name='xxx', tz=tz)
+ tm.assert_index_equal(idx.shift(3, freq='H'), exp)
+ exp = pd.DatetimeIndex(['2011-01-01 07:00', '2011-01-01 08:00',
+ '2011-01-01 09:00'], name='xxx', tz=tz)
+ tm.assert_index_equal(idx.shift(-3, freq='H'), exp)
+
+ def test_dti_shift_freqs(self):
+ # test shift for DatetimeIndex and non DatetimeIndex
+ # GH#8083
+ drange = pd.date_range('20130101', periods=5)
+ result = drange.shift(1)
+ expected = pd.DatetimeIndex(['2013-01-02', '2013-01-03', '2013-01-04',
+ '2013-01-05',
+ '2013-01-06'], freq='D')
+ tm.assert_index_equal(result, expected)
+
+ result = drange.shift(-1)
+ expected = pd.DatetimeIndex(['2012-12-31', '2013-01-01', '2013-01-02',
+ '2013-01-03', '2013-01-04'],
+ freq='D')
+ tm.assert_index_equal(result, expected)
+
+ result = drange.shift(3, freq='2D')
+ expected = pd.DatetimeIndex(['2013-01-07', '2013-01-08', '2013-01-09',
+ '2013-01-10',
+ '2013-01-11'], freq='D')
+ tm.assert_index_equal(result, expected)
+
+ def test_dti_shift_int(self):
+ rng = date_range('1/1/2000', periods=20)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # GH#22535
+ result = rng + 5
+
+ expected = rng.shift(5)
+ tm.assert_index_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # GH#22535
+ result = rng - 5
+
+ expected = rng.shift(-5)
+ tm.assert_index_equal(result, expected)
+
+ def test_dti_shift_no_freq(self):
+ # GH#19147
+ dti = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01'], freq=None)
+ with pytest.raises(NullFrequencyError):
+ dti.shift(2)
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_dti_shift_localized(self, tzstr):
+ dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
+ dr_tz = dr.tz_localize(tzstr)
+
+ result = dr_tz.shift(1, '10T')
+ assert result.tz == dr_tz.tz
+
+ def test_dti_shift_across_dst(self):
+ # GH 8616
+ idx = date_range('2013-11-03', tz='America/Chicago',
+ periods=7, freq='H')
+ s = Series(index=idx[:-1])
+ result = s.shift(freq='H')
+ expected = Series(index=idx[1:])
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('shift, result_time', [
+ [0, '2014-11-14 00:00:00'],
+ [-1, '2014-11-13 23:00:00'],
+ [1, '2014-11-14 01:00:00']])
+ def test_dti_shift_near_midnight(self, shift, result_time):
+ # GH 8616
+ dt = datetime(2014, 11, 14, 0)
+ dt_est = pytz.timezone('EST').localize(dt)
+ s = Series(data=[1], index=[dt_est])
+ result = s.shift(shift, freq='H')
+ expected = Series(1, index=DatetimeIndex([result_time], tz='EST'))
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_astype.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_astype.py
new file mode 100644
index 00000000000..ddf6a6ded69
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_astype.py
@@ -0,0 +1,343 @@
+from datetime import datetime
+
+import dateutil
+from dateutil.tz import tzlocal
+import numpy as np
+import pytest
+import pytz
+
+import pandas as pd
+from pandas import (
+ DatetimeIndex, Index, Int64Index, NaT, Period, Series, Timestamp,
+ date_range)
+import pandas.util.testing as tm
+
+
+class TestDatetimeIndex(object):
+
+ def test_astype(self):
+ # GH 13149, GH 13209
+ idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
+
+ result = idx.astype(object)
+ expected = Index([Timestamp('2016-05-16')] + [NaT] * 3, dtype=object)
+ tm.assert_index_equal(result, expected)
+
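+        # NaT casts to iNaT, the minimum int64 value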
+ result = idx.astype(int)
+ expected = Int64Index([1463356800000000000] +
+ [-9223372036854775808] * 3, dtype=np.int64)
+ tm.assert_index_equal(result, expected)
+
+ rng = date_range('1/1/2000', periods=10)
+ result = rng.astype('i8')
+ tm.assert_index_equal(result, Index(rng.asi8))
+ tm.assert_numpy_array_equal(result.values, rng.asi8)
+
+ def test_astype_uint(self):
+ arr = date_range('2000', periods=2)
+ expected = pd.UInt64Index(
+ np.array([946684800000000000, 946771200000000000], dtype="uint64")
+ )
+
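+        # a uint32 request is upcast, so both produce a UInt64Index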
+ tm.assert_index_equal(arr.astype("uint64"), expected)
+ tm.assert_index_equal(arr.astype("uint32"), expected)
+
+ def test_astype_with_tz(self):
+
+ # with tz
+ rng = date_range('1/1/2000', periods=10, tz='US/Eastern')
+ result = rng.astype('datetime64[ns]')
+ expected = (date_range('1/1/2000', periods=10,
+ tz='US/Eastern')
+ .tz_convert('UTC').tz_localize(None))
+ tm.assert_index_equal(result, expected)
+
+        # GH#10442: testing astype(str) is correct for Series/DatetimeIndex
+ result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str)
+ expected = pd.Series(
+ ['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object)
+ tm.assert_series_equal(result, expected)
+
+ result = Series(pd.date_range('2012-01-01', periods=3,
+ tz='US/Eastern')).astype(str)
+ expected = Series(['2012-01-01 00:00:00-05:00',
+ '2012-01-02 00:00:00-05:00',
+ '2012-01-03 00:00:00-05:00'],
+ dtype=object)
+ tm.assert_series_equal(result, expected)
+
+ # GH 18951: tz-aware to tz-aware
+ idx = date_range('20170101', periods=4, tz='US/Pacific')
+ result = idx.astype('datetime64[ns, US/Eastern]')
+ expected = date_range('20170101 03:00:00', periods=4, tz='US/Eastern')
+ tm.assert_index_equal(result, expected)
+
+ # GH 18951: tz-naive to tz-aware
+ idx = date_range('20170101', periods=4)
+ result = idx.astype('datetime64[ns, US/Eastern]')
+ expected = date_range('20170101', periods=4, tz='US/Eastern')
+ tm.assert_index_equal(result, expected)
+
+ def test_astype_str_compat(self):
+ # GH 13149, GH 13209
+ # verify that we are returning NaT as a string (and not unicode)
+
+ idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
+ result = idx.astype(str)
+ expected = Index(['2016-05-16', 'NaT', 'NaT', 'NaT'], dtype=object)
+ tm.assert_index_equal(result, expected)
+
+ def test_astype_str(self):
+ # test astype string - #10442
+ result = date_range('2012-01-01', periods=4,
+ name='test_name').astype(str)
+ expected = Index(['2012-01-01', '2012-01-02', '2012-01-03',
+ '2012-01-04'], name='test_name', dtype=object)
+ tm.assert_index_equal(result, expected)
+
+ # test astype string with tz and name
+ result = date_range('2012-01-01', periods=3, name='test_name',
+ tz='US/Eastern').astype(str)
+ expected = Index(['2012-01-01 00:00:00-05:00',
+ '2012-01-02 00:00:00-05:00',
+ '2012-01-03 00:00:00-05:00'],
+ name='test_name', dtype=object)
+ tm.assert_index_equal(result, expected)
+
+        # test astype string with freq 'H' and name
+ result = date_range('1/1/2011', periods=3, freq='H',
+ name='test_name').astype(str)
+ expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00',
+ '2011-01-01 02:00:00'],
+ name='test_name', dtype=object)
+ tm.assert_index_equal(result, expected)
+
+        # test astype string with freq 'H' and timezone
+ result = date_range('3/6/2012 00:00', periods=2, freq='H',
+ tz='Europe/London', name='test_name').astype(str)
+ expected = Index(['2012-03-06 00:00:00+00:00',
+ '2012-03-06 01:00:00+00:00'],
+ dtype=object, name='test_name')
+ tm.assert_index_equal(result, expected)
+
+ def test_astype_datetime64(self):
+ # GH 13149, GH 13209
+ idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
+
+ result = idx.astype('datetime64[ns]')
+ tm.assert_index_equal(result, idx)
+ assert result is not idx
+
+ result = idx.astype('datetime64[ns]', copy=False)
+ tm.assert_index_equal(result, idx)
+ assert result is idx
+
+ idx_tz = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN], tz='EST')
+ result = idx_tz.astype('datetime64[ns]')
+ expected = DatetimeIndex(['2016-05-16 05:00:00', 'NaT', 'NaT', 'NaT'],
+ dtype='datetime64[ns]')
+ tm.assert_index_equal(result, expected)
+
+ def test_astype_object(self):
+ rng = date_range('1/1/2000', periods=20)
+
+ casted = rng.astype('O')
+ exp_values = list(rng)
+
+ tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_))
+ assert casted.tolist() == exp_values
+
+ @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo'])
+ def test_astype_object_tz(self, tz):
+ idx = pd.date_range(start='2013-01-01', periods=4, freq='M',
+ name='idx', tz=tz)
+ expected_list = [Timestamp('2013-01-31', tz=tz),
+ Timestamp('2013-02-28', tz=tz),
+ Timestamp('2013-03-31', tz=tz),
+ Timestamp('2013-04-30', tz=tz)]
+ expected = pd.Index(expected_list, dtype=object, name='idx')
+ result = idx.astype(object)
+ tm.assert_index_equal(result, expected)
+ assert idx.tolist() == expected_list
+
+ def test_astype_object_with_nat(self):
+ idx = DatetimeIndex([datetime(2013, 1, 1), datetime(2013, 1, 2),
+ pd.NaT, datetime(2013, 1, 4)], name='idx')
+ expected_list = [Timestamp('2013-01-01'),
+ Timestamp('2013-01-02'), pd.NaT,
+ Timestamp('2013-01-04')]
+ expected = pd.Index(expected_list, dtype=object, name='idx')
+ result = idx.astype(object)
+ tm.assert_index_equal(result, expected)
+ assert idx.tolist() == expected_list
+
+ @pytest.mark.parametrize('dtype', [
+ float, 'timedelta64', 'timedelta64[ns]', 'datetime64',
+ 'datetime64[D]'])
+ def test_astype_raises(self, dtype):
+ # GH 13149, GH 13209
+ idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
+ msg = 'Cannot cast DatetimeArray to dtype'
+ with pytest.raises(TypeError, match=msg):
+ idx.astype(dtype)
+
+ def test_index_convert_to_datetime_array(self):
+ def _check_rng(rng):
+ converted = rng.to_pydatetime()
+ assert isinstance(converted, np.ndarray)
+ for x, stamp in zip(converted, rng):
+ assert isinstance(x, datetime)
+ assert x == stamp.to_pydatetime()
+ assert x.tzinfo == stamp.tzinfo
+
+ rng = date_range('20090415', '20090519')
+ rng_eastern = date_range('20090415', '20090519', tz='US/Eastern')
+ rng_utc = date_range('20090415', '20090519', tz='utc')
+
+ _check_rng(rng)
+ _check_rng(rng_eastern)
+ _check_rng(rng_utc)
+
+ def test_index_convert_to_datetime_array_explicit_pytz(self):
+ def _check_rng(rng):
+ converted = rng.to_pydatetime()
+ assert isinstance(converted, np.ndarray)
+ for x, stamp in zip(converted, rng):
+ assert isinstance(x, datetime)
+ assert x == stamp.to_pydatetime()
+ assert x.tzinfo == stamp.tzinfo
+
+ rng = date_range('20090415', '20090519')
+ rng_eastern = date_range('20090415', '20090519',
+ tz=pytz.timezone('US/Eastern'))
+ rng_utc = date_range('20090415', '20090519', tz=pytz.utc)
+
+ _check_rng(rng)
+ _check_rng(rng_eastern)
+ _check_rng(rng_utc)
+
+ def test_index_convert_to_datetime_array_dateutil(self):
+ def _check_rng(rng):
+ converted = rng.to_pydatetime()
+ assert isinstance(converted, np.ndarray)
+ for x, stamp in zip(converted, rng):
+ assert isinstance(x, datetime)
+ assert x == stamp.to_pydatetime()
+ assert x.tzinfo == stamp.tzinfo
+
+ rng = date_range('20090415', '20090519')
+ rng_eastern = date_range('20090415', '20090519',
+ tz='dateutil/US/Eastern')
+ rng_utc = date_range('20090415', '20090519', tz=dateutil.tz.tzutc())
+
+ _check_rng(rng)
+ _check_rng(rng_eastern)
+ _check_rng(rng_utc)
+
+ @pytest.mark.parametrize('tz, dtype', [
+ ['US/Pacific', 'datetime64[ns, US/Pacific]'],
+ [None, 'datetime64[ns]']])
+ def test_integer_index_astype_datetime(self, tz, dtype):
+ # GH 20997, 20964, 24559
+ val = [pd.Timestamp('2018-01-01', tz=tz).value]
+ result = pd.Index(val).astype(dtype)
+ expected = pd.DatetimeIndex(["2018-01-01"], tz=tz)
+ tm.assert_index_equal(result, expected)
+
+
+class TestToPeriod(object):
+
+ def setup_method(self, method):
+ data = [Timestamp('2007-01-01 10:11:12.123456Z'),
+ Timestamp('2007-01-01 10:11:13.789123Z')]
+ self.index = DatetimeIndex(data)
+
+ def test_to_period_millisecond(self):
+ index = self.index
+
+ with tm.assert_produces_warning(UserWarning):
+ # warning that timezone info will be lost
+ period = index.to_period(freq='L')
+ assert 2 == len(period)
+ assert period[0] == Period('2007-01-01 10:11:12.123Z', 'L')
+ assert period[1] == Period('2007-01-01 10:11:13.789Z', 'L')
+
+ def test_to_period_microsecond(self):
+ index = self.index
+
+ with tm.assert_produces_warning(UserWarning):
+ # warning that timezone info will be lost
+ period = index.to_period(freq='U')
+ assert 2 == len(period)
+ assert period[0] == Period('2007-01-01 10:11:12.123456Z', 'U')
+ assert period[1] == Period('2007-01-01 10:11:13.789123Z', 'U')
+
+ @pytest.mark.parametrize('tz', [
+ 'US/Eastern', pytz.utc, tzlocal(), 'dateutil/US/Eastern',
+ dateutil.tz.tzutc()])
+ def test_to_period_tz(self, tz):
+ ts = date_range('1/1/2000', '2/1/2000', tz=tz)
+
+ with tm.assert_produces_warning(UserWarning):
+ # GH#21333 warning that timezone info will be lost
+ result = ts.to_period()[0]
+ expected = ts[0].to_period()
+
+ assert result == expected
+
+ expected = date_range('1/1/2000', '2/1/2000').to_period()
+
+ with tm.assert_produces_warning(UserWarning):
+ # GH#21333 warning that timezone info will be lost
+ result = ts.to_period()
+
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('tz', ['Etc/GMT-1', 'Etc/GMT+1'])
+ def test_to_period_tz_utc_offset_consistency(self, tz):
+ # GH 22905
+ ts = pd.date_range('1/1/2000', '2/1/2000', tz='Etc/GMT-1')
+ with tm.assert_produces_warning(UserWarning):
+ result = ts.to_period()[0]
+ expected = ts[0].to_period()
+ assert result == expected
+
+ def test_to_period_nofreq(self):
+ idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04'])
+ with pytest.raises(ValueError):
+ idx.to_period()
+
+ idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'],
+ freq='infer')
+ assert idx.freqstr == 'D'
+ expected = pd.PeriodIndex(['2000-01-01', '2000-01-02',
+ '2000-01-03'], freq='D')
+ tm.assert_index_equal(idx.to_period(), expected)
+
+ # GH 7606
+ idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'])
+ assert idx.freqstr is None
+ tm.assert_index_equal(idx.to_period(), expected)
+
+ @pytest.mark.parametrize('tz', [None, 'US/Central'])
+ def test_astype_category(self, tz):
+ obj = pd.date_range("2000", periods=2, tz=tz)
+ result = obj.astype('category')
+ expected = pd.CategoricalIndex([pd.Timestamp('2000-01-01', tz=tz),
+ pd.Timestamp('2000-01-02', tz=tz)])
+ tm.assert_index_equal(result, expected)
+
+ result = obj._data.astype('category')
+ expected = expected.values
+ tm.assert_categorical_equal(result, expected)
+
+ @pytest.mark.parametrize('tz', [None, 'US/Central'])
+ def test_astype_array_fallback(self, tz):
+ obj = pd.date_range("2000", periods=2, tz=tz)
+ result = obj.astype(bool)
+ expected = pd.Index(np.array([True, True]))
+ tm.assert_index_equal(result, expected)
+
+ result = obj._data.astype(bool)
+ expected = np.array([True, True])
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_construction.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_construction.py
new file mode 100644
index 00000000000..7ebebbf6dee
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_construction.py
@@ -0,0 +1,794 @@
+from datetime import timedelta
+from functools import partial
+from operator import attrgetter
+
+import dateutil
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs.tslibs import OutOfBoundsDatetime, conversion
+
+import pandas as pd
+from pandas import (
+ DatetimeIndex, Index, Timestamp, date_range, datetime, offsets,
+ to_datetime)
+from pandas.core.arrays import DatetimeArray, period_array
+import pandas.util.testing as tm
+
+
+class TestDatetimeIndex(object):
+
+ @pytest.mark.parametrize('dt_cls', [DatetimeIndex,
+ DatetimeArray._from_sequence])
+ def test_freq_validation_with_nat(self, dt_cls):
+ # GH#11587 make sure we get a useful error message when generate_range
+ # raises
+ msg = ("Inferred frequency None from passed values does not conform "
+ "to passed frequency D")
+ with pytest.raises(ValueError, match=msg):
+ dt_cls([pd.NaT, pd.Timestamp('2011-01-01')], freq='D')
+ with pytest.raises(ValueError, match=msg):
+ dt_cls([pd.NaT, pd.Timestamp('2011-01-01').value],
+ freq='D')
+
+ def test_categorical_preserves_tz(self):
+ # GH#18664 retain tz when going DTI-->Categorical-->DTI
+ # TODO: parametrize over DatetimeIndex/DatetimeArray
+ # once CategoricalIndex(DTA) works
+
+ dti = pd.DatetimeIndex(
+ [pd.NaT, '2015-01-01', '1999-04-06 15:14:13', '2015-01-01'],
+ tz='US/Eastern')
+
+ ci = pd.CategoricalIndex(dti)
+ carr = pd.Categorical(dti)
+ cser = pd.Series(ci)
+
+ for obj in [ci, carr, cser]:
+ result = pd.DatetimeIndex(obj)
+ tm.assert_index_equal(result, dti)
+
+ def test_dti_with_period_data_raises(self):
+ # GH#23675
+ data = pd.PeriodIndex(['2016Q1', '2016Q2'], freq='Q')
+
+ with pytest.raises(TypeError, match="PeriodDtype data is invalid"):
+ DatetimeIndex(data)
+
+ with pytest.raises(TypeError, match="PeriodDtype data is invalid"):
+ to_datetime(data)
+
+ with pytest.raises(TypeError, match="PeriodDtype data is invalid"):
+ DatetimeIndex(period_array(data))
+
+ with pytest.raises(TypeError, match="PeriodDtype data is invalid"):
+ to_datetime(period_array(data))
+
+ def test_dti_with_timedelta64_data_deprecation(self):
+ # GH#23675
+ data = np.array([0], dtype='m8[ns]')
+ with tm.assert_produces_warning(FutureWarning):
+ result = DatetimeIndex(data)
+
+ assert result[0] == Timestamp('1970-01-01')
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = to_datetime(data)
+
+ assert result[0] == Timestamp('1970-01-01')
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = DatetimeIndex(pd.TimedeltaIndex(data))
+
+ assert result[0] == Timestamp('1970-01-01')
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = to_datetime(pd.TimedeltaIndex(data))
+
+ assert result[0] == Timestamp('1970-01-01')
+
+ def test_construction_caching(self):
+
+ df = pd.DataFrame({'dt': pd.date_range('20130101', periods=3),
+ 'dttz': pd.date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'dt_with_null': [pd.Timestamp('20130101'), pd.NaT,
+ pd.Timestamp('20130103')],
+ 'dtns': pd.date_range('20130101', periods=3,
+ freq='ns')})
+ assert df.dttz.dtype.tz.zone == 'US/Eastern'
+
+ @pytest.mark.parametrize('kwargs', [
+ {'tz': 'dtype.tz'},
+ {'dtype': 'dtype'},
+ {'dtype': 'dtype', 'tz': 'dtype.tz'}])
+ def test_construction_with_alt(self, kwargs, tz_aware_fixture):
+ tz = tz_aware_fixture
+ i = pd.date_range('20130101', periods=5, freq='H', tz=tz)
+ kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()}
+ result = DatetimeIndex(i, **kwargs)
+ tm.assert_index_equal(i, result)
+
+ @pytest.mark.parametrize('kwargs', [
+ {'tz': 'dtype.tz'},
+ {'dtype': 'dtype'},
+ {'dtype': 'dtype', 'tz': 'dtype.tz'}])
+ def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture):
+ tz = tz_aware_fixture
+ i = pd.date_range('20130101', periods=5, freq='H', tz=tz)
+ kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()}
+
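+        # GH#24559: integer data with a non-UTC tz is deprecated, since the
+        # values will be reinterpreted (wall time vs. epoch); UTC is exempt
+        # because the two interpretations coincide there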
+ if str(tz) in ('UTC', 'tzutc()'):
+ warn = None
+ else:
+ warn = FutureWarning
+
+ with tm.assert_produces_warning(warn, check_stacklevel=False):
+ result = DatetimeIndex(i.tz_localize(None).asi8, **kwargs)
+ expected = DatetimeIndex(i, **kwargs)
+ tm.assert_index_equal(result, expected)
+
+ # localize into the provided tz
+ i2 = DatetimeIndex(i.tz_localize(None).asi8, tz='UTC')
+ expected = i.tz_localize(None).tz_localize('UTC')
+ tm.assert_index_equal(i2, expected)
+
+ # incompat tz/dtype
+ pytest.raises(ValueError, lambda: DatetimeIndex(
+ i.tz_localize(None).asi8, dtype=i.dtype, tz='US/Pacific'))
+
+ def test_construction_index_with_mixed_timezones(self):
+ # gh-11488: no tz results in DatetimeIndex
+ result = Index([Timestamp('2011-01-01'),
+ Timestamp('2011-01-02')], name='idx')
+ exp = DatetimeIndex([Timestamp('2011-01-01'),
+ Timestamp('2011-01-02')], name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz is None
+
+ # same tz results in DatetimeIndex
+ result = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
+ Timestamp('2011-01-02 10:00', tz='Asia/Tokyo')],
+ name='idx')
+ exp = DatetimeIndex(
+ [Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00')
+ ], tz='Asia/Tokyo', name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz is not None
+ assert result.tz == exp.tz
+
+ # same tz results in DatetimeIndex (DST)
+ result = Index([Timestamp('2011-01-01 10:00', tz='US/Eastern'),
+ Timestamp('2011-08-01 10:00', tz='US/Eastern')],
+ name='idx')
+ exp = DatetimeIndex([Timestamp('2011-01-01 10:00'),
+ Timestamp('2011-08-01 10:00')],
+ tz='US/Eastern', name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz is not None
+ assert result.tz == exp.tz
+
+ # Different tz results in Index(dtype=object)
+ result = Index([Timestamp('2011-01-01 10:00'),
+ Timestamp('2011-01-02 10:00', tz='US/Eastern')],
+ name='idx')
+ exp = Index([Timestamp('2011-01-01 10:00'),
+ Timestamp('2011-01-02 10:00', tz='US/Eastern')],
+ dtype='object', name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert not isinstance(result, DatetimeIndex)
+
+ result = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
+ Timestamp('2011-01-02 10:00', tz='US/Eastern')],
+ name='idx')
+ exp = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
+ Timestamp('2011-01-02 10:00', tz='US/Eastern')],
+ dtype='object', name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert not isinstance(result, DatetimeIndex)
+
+ # length = 1
+ result = Index([Timestamp('2011-01-01')], name='idx')
+ exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz is None
+
+ # length = 1 with tz
+ result = Index(
+ [Timestamp('2011-01-01 10:00', tz='Asia/Tokyo')], name='idx')
+ exp = DatetimeIndex([Timestamp('2011-01-01 10:00')], tz='Asia/Tokyo',
+ name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz is not None
+ assert result.tz == exp.tz
+
+ def test_construction_index_with_mixed_timezones_with_NaT(self):
+ # see gh-11488
+ result = Index([pd.NaT, Timestamp('2011-01-01'),
+ pd.NaT, Timestamp('2011-01-02')], name='idx')
+ exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01'),
+ pd.NaT, Timestamp('2011-01-02')], name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz is None
+
+ # Same tz results in DatetimeIndex
+ result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
+ pd.NaT, Timestamp('2011-01-02 10:00',
+ tz='Asia/Tokyo')],
+ name='idx')
+ exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 10:00'),
+ pd.NaT, Timestamp('2011-01-02 10:00')],
+ tz='Asia/Tokyo', name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz is not None
+ assert result.tz == exp.tz
+
+ # same tz results in DatetimeIndex (DST)
+ result = Index([Timestamp('2011-01-01 10:00', tz='US/Eastern'),
+ pd.NaT,
+ Timestamp('2011-08-01 10:00', tz='US/Eastern')],
+ name='idx')
+ exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), pd.NaT,
+ Timestamp('2011-08-01 10:00')],
+ tz='US/Eastern', name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz is not None
+ assert result.tz == exp.tz
+
+ # different tz results in Index(dtype=object)
+ result = Index([pd.NaT, Timestamp('2011-01-01 10:00'),
+ pd.NaT, Timestamp('2011-01-02 10:00',
+ tz='US/Eastern')],
+ name='idx')
+ exp = Index([pd.NaT, Timestamp('2011-01-01 10:00'),
+ pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')],
+ dtype='object', name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert not isinstance(result, DatetimeIndex)
+
+ result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
+ pd.NaT, Timestamp('2011-01-02 10:00',
+ tz='US/Eastern')], name='idx')
+ exp = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
+ pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')],
+ dtype='object', name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert not isinstance(result, DatetimeIndex)
+
+ # all NaT
+ result = Index([pd.NaT, pd.NaT], name='idx')
+ exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz is None
+
+ # all NaT with tz
+ result = Index([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx')
+ exp = DatetimeIndex([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx')
+
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz is not None
+ assert result.tz == exp.tz
+
+ def test_construction_dti_with_mixed_timezones(self):
+        # GH 11488 (behavior unchanged; explicit tests added)
+
+ # no tz results in DatetimeIndex
+ result = DatetimeIndex(
+ [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx')
+ exp = DatetimeIndex(
+ [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+
+ # same tz results in DatetimeIndex
+ result = DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
+ Timestamp('2011-01-02 10:00',
+ tz='Asia/Tokyo')],
+ name='idx')
+ exp = DatetimeIndex([Timestamp('2011-01-01 10:00'),
+ Timestamp('2011-01-02 10:00')],
+ tz='Asia/Tokyo', name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+
+ # same tz results in DatetimeIndex (DST)
+ result = DatetimeIndex([Timestamp('2011-01-01 10:00', tz='US/Eastern'),
+ Timestamp('2011-08-01 10:00',
+ tz='US/Eastern')],
+ name='idx')
+ exp = DatetimeIndex([Timestamp('2011-01-01 10:00'),
+ Timestamp('2011-08-01 10:00')],
+ tz='US/Eastern', name='idx')
+ tm.assert_index_equal(result, exp, exact=True)
+ assert isinstance(result, DatetimeIndex)
+
+        # a tz mismatch involving tz-aware values raises ValueError
+
+ with pytest.raises(ValueError):
+ DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
+ Timestamp('2011-01-02 10:00', tz='US/Eastern')],
+ name='idx')
+
+ msg = 'cannot be converted to datetime64'
+ with pytest.raises(ValueError, match=msg):
+ DatetimeIndex([Timestamp('2011-01-01 10:00'),
+ Timestamp('2011-01-02 10:00', tz='US/Eastern')],
+ tz='Asia/Tokyo', name='idx')
+
+ with pytest.raises(ValueError):
+ DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
+ Timestamp('2011-01-02 10:00', tz='US/Eastern')],
+ tz='US/Eastern', name='idx')
+
+ with pytest.raises(ValueError, match=msg):
+            # passing tz should result in a DatetimeIndex; the tz mismatch
+            # then raises ValueError
+ Index([pd.NaT, Timestamp('2011-01-01 10:00'),
+ pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')],
+ tz='Asia/Tokyo', name='idx')
+
+ def test_construction_base_constructor(self):
+ arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')]
+ tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr))
+ tm.assert_index_equal(pd.Index(np.array(arr)),
+ pd.DatetimeIndex(np.array(arr)))
+
+ arr = [np.nan, pd.NaT, pd.Timestamp('2011-01-03')]
+ tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr))
+ tm.assert_index_equal(pd.Index(np.array(arr)),
+ pd.DatetimeIndex(np.array(arr)))
+
+ def test_construction_outofbounds(self):
+ # GH 13663
+ dates = [datetime(3000, 1, 1), datetime(4000, 1, 1),
+ datetime(5000, 1, 1), datetime(6000, 1, 1)]
+ exp = Index(dates, dtype=object)
+ # coerces to object
+ tm.assert_index_equal(Index(dates), exp)
+
+ with pytest.raises(OutOfBoundsDatetime):
+ # can't create DatetimeIndex
+ DatetimeIndex(dates)
+
+ def test_construction_with_ndarray(self):
+ # GH 5152
+ dates = [datetime(2013, 10, 7),
+ datetime(2013, 10, 8),
+ datetime(2013, 10, 9)]
+ data = DatetimeIndex(dates, freq=pd.offsets.BDay()).values
+ result = DatetimeIndex(data, freq=pd.offsets.BDay())
+ expected = DatetimeIndex(['2013-10-07',
+ '2013-10-08',
+ '2013-10-09'],
+ freq='B')
+ tm.assert_index_equal(result, expected)
+
+ def test_verify_integrity_deprecated(self):
+ # GH#23919
+ with tm.assert_produces_warning(FutureWarning):
+ DatetimeIndex(['1/1/2000'], verify_integrity=False)
+
+ def test_range_kwargs_deprecated(self):
+ # GH#23919
+ with tm.assert_produces_warning(FutureWarning):
+ DatetimeIndex(start='1/1/2000', end='1/10/2000', freq='D')
+
+ def test_integer_values_and_tz_deprecated(self):
+ # GH-24559
+ values = np.array([946684800000000000])
+ with tm.assert_produces_warning(FutureWarning):
+ result = DatetimeIndex(values, tz='US/Central')
+ expected = pd.DatetimeIndex(['2000-01-01T00:00:00'], tz="US/Central")
+ tm.assert_index_equal(result, expected)
+
+ # but UTC is *not* deprecated.
+ with tm.assert_produces_warning(None):
+ result = DatetimeIndex(values, tz='UTC')
+ expected = pd.DatetimeIndex(['2000-01-01T00:00:00'], tz="US/Central")
+
+ def test_constructor_coverage(self):
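+        # float periods are accepted and coerced to an integer (10.5 -> 10)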
+ rng = date_range('1/1/2000', periods=10.5)
+ exp = date_range('1/1/2000', periods=10)
+ tm.assert_index_equal(rng, exp)
+
+ msg = 'periods must be a number, got foo'
+ with pytest.raises(TypeError, match=msg):
+ date_range(start='1/1/2000', periods='foo', freq='D')
+
+ with pytest.raises(ValueError):
+ with tm.assert_produces_warning(FutureWarning):
+ DatetimeIndex(start='1/1/2000', end='1/10/2000')
+
+ with pytest.raises(TypeError):
+ DatetimeIndex('1/1/2000')
+
+ # generator expression
+ gen = (datetime(2000, 1, 1) + timedelta(i) for i in range(10))
+ result = DatetimeIndex(gen)
+ expected = DatetimeIndex([datetime(2000, 1, 1) + timedelta(i)
+ for i in range(10)])
+ tm.assert_index_equal(result, expected)
+
+ # NumPy string array
+ strings = np.array(['2000-01-01', '2000-01-02', '2000-01-03'])
+ result = DatetimeIndex(strings)
+ expected = DatetimeIndex(strings.astype('O'))
+ tm.assert_index_equal(result, expected)
+
+ from_ints = DatetimeIndex(expected.asi8)
+ tm.assert_index_equal(from_ints, expected)
+
+ # string with NaT
+ strings = np.array(['2000-01-01', '2000-01-02', 'NaT'])
+ result = DatetimeIndex(strings)
+ expected = DatetimeIndex(strings.astype('O'))
+ tm.assert_index_equal(result, expected)
+
+ from_ints = DatetimeIndex(expected.asi8)
+ tm.assert_index_equal(from_ints, expected)
+
+ # non-conforming
+ pytest.raises(ValueError, DatetimeIndex,
+ ['2000-01-01', '2000-01-02', '2000-01-04'], freq='D')
+
+ pytest.raises(ValueError, date_range, start='2011-01-01',
+ freq='b')
+ pytest.raises(ValueError, date_range, end='2011-01-01',
+ freq='B')
+ pytest.raises(ValueError, date_range, periods=10, freq='D')
+
+ @pytest.mark.parametrize('freq', ['AS', 'W-SUN'])
+ def test_constructor_datetime64_tzformat(self, freq):
+ # see GH#6572: ISO 8601 format results in pytz.FixedOffset
+ idx = date_range('2013-01-01T00:00:00-05:00',
+ '2016-01-01T23:59:59-05:00', freq=freq)
+ expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59',
+ freq=freq, tz=pytz.FixedOffset(-300))
+ tm.assert_index_equal(idx, expected)
+ # Unable to use `US/Eastern` because of DST
+ expected_i8 = date_range('2013-01-01T00:00:00',
+ '2016-01-01T23:59:59', freq=freq,
+ tz='America/Lima')
+ tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8)
+
+ idx = date_range('2013-01-01T00:00:00+09:00',
+ '2016-01-01T23:59:59+09:00', freq=freq)
+ expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59',
+ freq=freq, tz=pytz.FixedOffset(540))
+ tm.assert_index_equal(idx, expected)
+ expected_i8 = date_range('2013-01-01T00:00:00',
+ '2016-01-01T23:59:59', freq=freq,
+ tz='Asia/Tokyo')
+ tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8)
+
+ # Non ISO 8601 format results in dateutil.tz.tzoffset
+ idx = date_range('2013/1/1 0:00:00-5:00', '2016/1/1 23:59:59-5:00',
+ freq=freq)
+ expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59',
+ freq=freq, tz=pytz.FixedOffset(-300))
+ tm.assert_index_equal(idx, expected)
+ # Unable to use `US/Eastern` because of DST
+ expected_i8 = date_range('2013-01-01T00:00:00',
+ '2016-01-01T23:59:59', freq=freq,
+ tz='America/Lima')
+ tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8)
+
+ idx = date_range('2013/1/1 0:00:00+9:00',
+ '2016/1/1 23:59:59+09:00', freq=freq)
+ expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59',
+ freq=freq, tz=pytz.FixedOffset(540))
+ tm.assert_index_equal(idx, expected)
+ expected_i8 = date_range('2013-01-01T00:00:00',
+ '2016-01-01T23:59:59', freq=freq,
+ tz='Asia/Tokyo')
+ tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8)
+
+ def test_constructor_dtype(self):
+
+ # passing a dtype with a tz should localize
+ idx = DatetimeIndex(['2013-01-01', '2013-01-02'],
+ dtype='datetime64[ns, US/Eastern]')
+ expected = DatetimeIndex(['2013-01-01', '2013-01-02']
+ ).tz_localize('US/Eastern')
+ tm.assert_index_equal(idx, expected)
+
+ idx = DatetimeIndex(['2013-01-01', '2013-01-02'],
+ tz='US/Eastern')
+ tm.assert_index_equal(idx, expected)
+
+        # if we already have a tz and it's not the same, then raise
+ idx = DatetimeIndex(['2013-01-01', '2013-01-02'],
+ dtype='datetime64[ns, US/Eastern]')
+
+ pytest.raises(ValueError,
+ lambda: DatetimeIndex(idx,
+ dtype='datetime64[ns]'))
+
+ # this is effectively trying to convert tz's
+ pytest.raises(TypeError,
+ lambda: DatetimeIndex(idx,
+ dtype='datetime64[ns, CET]'))
+ pytest.raises(ValueError,
+ lambda: DatetimeIndex(
+ idx, tz='CET',
+ dtype='datetime64[ns, US/Eastern]'))
+ result = DatetimeIndex(idx, dtype='datetime64[ns, US/Eastern]')
+ tm.assert_index_equal(idx, result)
+
+ def test_constructor_name(self):
+ idx = date_range(start='2000-01-01', periods=1, freq='A',
+ name='TEST')
+ assert idx.name == 'TEST'
+
+ def test_000constructor_resolution(self):
+ # 2252
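+        # (the leading 000 presumably forced this test to run first under
+        # unittest's alphabetical test ordering)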
+ t1 = Timestamp((1352934390 * 1000000000) + 1000000 + 1000 + 1)
+ idx = DatetimeIndex([t1])
+
+ assert idx.nanosecond[0] == t1.nanosecond
+
+ def test_disallow_setting_tz(self):
+ # GH 3746
+ dti = DatetimeIndex(['2010'], tz='UTC')
+ with pytest.raises(AttributeError):
+ dti.tz = pytz.timezone('US/Pacific')
+
+ @pytest.mark.parametrize('tz', [
+ None, 'America/Los_Angeles', pytz.timezone('America/Los_Angeles'),
+ Timestamp('2000', tz='America/Los_Angeles').tz])
+ def test_constructor_start_end_with_tz(self, tz):
+ # GH 18595
+ start = Timestamp('2013-01-01 06:00:00', tz='America/Los_Angeles')
+ end = Timestamp('2013-01-02 06:00:00', tz='America/Los_Angeles')
+ result = date_range(freq='D', start=start, end=end, tz=tz)
+ expected = DatetimeIndex(['2013-01-01 06:00:00',
+ '2013-01-02 06:00:00'],
+ tz='America/Los_Angeles')
+ tm.assert_index_equal(result, expected)
+        # in particular, the canonical pytz timezone instance is preserved
+ assert pytz.timezone('America/Los_Angeles') is result.tz
+
+ @pytest.mark.parametrize('tz', ['US/Pacific', 'US/Eastern', 'Asia/Tokyo'])
+ def test_constructor_with_non_normalized_pytz(self, tz):
+ # GH 18595
+ non_norm_tz = Timestamp('2010', tz=tz).tz
+ result = DatetimeIndex(['2010'], tz=non_norm_tz)
+ assert pytz.timezone(tz) is result.tz
+
+ def test_constructor_timestamp_near_dst(self):
+ # GH 20854
+ ts = [Timestamp('2016-10-30 03:00:00+0300', tz='Europe/Helsinki'),
+ Timestamp('2016-10-30 03:00:00+0200', tz='Europe/Helsinki')]
+ result = DatetimeIndex(ts)
+ expected = DatetimeIndex([ts[0].to_pydatetime(),
+ ts[1].to_pydatetime()])
+ tm.assert_index_equal(result, expected)
+
+ # TODO(GH-24559): Remove the xfail for the tz-aware case.
+ @pytest.mark.parametrize('klass', [Index, DatetimeIndex])
+ @pytest.mark.parametrize('box', [
+ np.array, partial(np.array, dtype=object), list])
+ @pytest.mark.parametrize('tz, dtype', [
+ pytest.param('US/Pacific', 'datetime64[ns, US/Pacific]',
+ marks=[pytest.mark.xfail(),
+ pytest.mark.filterwarnings(
+ "ignore:\\n Passing:FutureWarning")]),
+ [None, 'datetime64[ns]'],
+ ])
+ def test_constructor_with_int_tz(self, klass, box, tz, dtype):
+ # GH 20997, 20964
+ ts = Timestamp('2018-01-01', tz=tz)
+ result = klass(box([ts.value]), dtype=dtype)
+ expected = klass([ts])
+ assert result == expected
+
+ # This is the desired future behavior
+ @pytest.mark.xfail(reason="Future behavior", strict=False)
+ @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning")
+    def test_construction_int_roundtrip(self, tz_naive_fixture):
+ # GH 12619
+ # TODO(GH-24559): Remove xfail
+ tz = tz_naive_fixture
+ result = 1293858000000000000
+ expected = DatetimeIndex([1293858000000000000], tz=tz).asi8[0]
+ assert result == expected
+
+ def test_construction_from_replaced_timestamps_with_dst(self):
+ # GH 18785
+ index = pd.date_range(pd.Timestamp(2000, 1, 1),
+ pd.Timestamp(2005, 1, 1),
+ freq='MS', tz='Australia/Melbourne')
+ test = pd.DataFrame({'data': range(len(index))}, index=index)
+ test = test.resample('Y').mean()
+ result = pd.DatetimeIndex([x.replace(month=6, day=1)
+ for x in test.index])
+ expected = pd.DatetimeIndex(['2000-06-01 00:00:00',
+ '2001-06-01 00:00:00',
+ '2002-06-01 00:00:00',
+ '2003-06-01 00:00:00',
+ '2004-06-01 00:00:00',
+ '2005-06-01 00:00:00'],
+ tz='Australia/Melbourne')
+ tm.assert_index_equal(result, expected)
+
+ def test_construction_with_tz_and_tz_aware_dti(self):
+ # GH 23579
+ dti = date_range('2016-01-01', periods=3, tz='US/Central')
+ with pytest.raises(TypeError):
+ DatetimeIndex(dti, tz='Asia/Tokyo')
+
+ def test_construction_with_nat_and_tzlocal(self):
+ tz = dateutil.tz.tzlocal()
+ result = DatetimeIndex(['2018', 'NaT'], tz=tz)
+ expected = DatetimeIndex([Timestamp('2018', tz=tz), pd.NaT])
+ tm.assert_index_equal(result, expected)
+
+ def test_constructor_no_precision_warns(self):
+ # GH-24753, GH-24739
+ expected = pd.DatetimeIndex(['2000'], dtype='datetime64[ns]')
+
+ # we set the stacklevel for DatetimeIndex
+ with tm.assert_produces_warning(FutureWarning):
+ result = pd.DatetimeIndex(['2000'], dtype='datetime64')
+ tm.assert_index_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = pd.Index(['2000'], dtype='datetime64')
+ tm.assert_index_equal(result, expected)
+
+ def test_constructor_wrong_precision_raises(self):
+ with pytest.raises(ValueError):
+ pd.DatetimeIndex(['2000'], dtype='datetime64[us]')
+
+
+class TestTimeSeries(object):
+
+ def test_dti_constructor_preserve_dti_freq(self):
+ rng = date_range('1/1/2000', '1/2/2000', freq='5min')
+
+ rng2 = DatetimeIndex(rng)
+ assert rng.freq == rng2.freq
+
+ def test_dti_constructor_years_only(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ # GH 6961
+ rng1 = date_range('2014', '2015', freq='M', tz=tz)
+ expected1 = date_range('2014-01-31', '2014-12-31', freq='M', tz=tz)
+
+ rng2 = date_range('2014', '2015', freq='MS', tz=tz)
+ expected2 = date_range('2014-01-01', '2015-01-01', freq='MS', tz=tz)
+
+ rng3 = date_range('2014', '2020', freq='A', tz=tz)
+ expected3 = date_range('2014-12-31', '2019-12-31', freq='A', tz=tz)
+
+ rng4 = date_range('2014', '2020', freq='AS', tz=tz)
+ expected4 = date_range('2014-01-01', '2020-01-01', freq='AS', tz=tz)
+
+ for rng, expected in [(rng1, expected1), (rng2, expected2),
+ (rng3, expected3), (rng4, expected4)]:
+ tm.assert_index_equal(rng, expected)
+
+ def test_dti_constructor_small_int(self, any_int_dtype):
+ # see gh-13721
+ exp = DatetimeIndex(['1970-01-01 00:00:00.00000000',
+ '1970-01-01 00:00:00.00000001',
+ '1970-01-01 00:00:00.00000002'])
+
+ arr = np.array([0, 10, 20], dtype=any_int_dtype)
+ tm.assert_index_equal(DatetimeIndex(arr), exp)
+
+ def test_ctor_str_intraday(self):
+ rng = DatetimeIndex(['1-1-2000 00:00:01'])
+ assert rng[0].second == 1
+
+ def test_is_(self):
+ dti = date_range(start='1/1/2005', end='12/1/2005', freq='M')
+ assert dti.is_(dti)
+ assert dti.is_(dti.view())
+ assert not dti.is_(dti.copy())
+
+ def test_index_cast_datetime64_other_units(self):
+ arr = np.arange(0, 100, 10, dtype=np.int64).view('M8[D]')
+ idx = Index(arr)
+
+ assert (idx.values == conversion.ensure_datetime64ns(arr)).all()
+
+ def test_constructor_int64_nocopy(self):
+ # GH#1624
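+        # by default the index takes a view on int64 data, so mutating the
+        # source array is visible through asi8; copy=True severs that link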
+ arr = np.arange(1000, dtype=np.int64)
+ index = DatetimeIndex(arr)
+
+ arr[50:100] = -1
+ assert (index.asi8[50:100] == -1).all()
+
+ arr = np.arange(1000, dtype=np.int64)
+ index = DatetimeIndex(arr, copy=True)
+
+ arr[50:100] = -1
+ assert (index.asi8[50:100] != -1).all()
+
+ @pytest.mark.parametrize('freq', ['M', 'Q', 'A', 'D', 'B', 'BH',
+ 'T', 'S', 'L', 'U', 'H', 'N', 'C'])
+ def test_from_freq_recreate_from_data(self, freq):
+ org = date_range(start='2001/02/01 09:00', freq=freq, periods=1)
+ idx = DatetimeIndex(org, freq=freq)
+ tm.assert_index_equal(idx, org)
+
+ org = date_range(start='2001/02/01 09:00', freq=freq,
+ tz='US/Pacific', periods=1)
+ idx = DatetimeIndex(org, freq=freq, tz='US/Pacific')
+ tm.assert_index_equal(idx, org)
+
+ def test_datetimeindex_constructor_misc(self):
+ arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04']
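+        # 'Jn 3, 2005' is deliberately malformed and should raise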
+ pytest.raises(Exception, DatetimeIndex, arr)
+
+ arr = ['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04']
+ idx1 = DatetimeIndex(arr)
+
+ arr = [datetime(2005, 1, 1), '1/2/2005', '1/3/2005', '2005-01-04']
+ idx2 = DatetimeIndex(arr)
+
+ arr = [Timestamp(datetime(2005, 1, 1)), '1/2/2005', '1/3/2005',
+ '2005-01-04']
+ idx3 = DatetimeIndex(arr)
+
+ arr = np.array(['1/1/2005', '1/2/2005', '1/3/2005',
+ '2005-01-04'], dtype='O')
+ idx4 = DatetimeIndex(arr)
+
+ arr = to_datetime(['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04'])
+ idx5 = DatetimeIndex(arr)
+
+ arr = to_datetime(['1/1/2005', '1/2/2005', 'Jan 3, 2005', '2005-01-04'
+ ])
+ idx6 = DatetimeIndex(arr)
+
+ idx7 = DatetimeIndex(['12/05/2007', '25/01/2008'], dayfirst=True)
+ idx8 = DatetimeIndex(['2007/05/12', '2008/01/25'], dayfirst=False,
+ yearfirst=True)
+ tm.assert_index_equal(idx7, idx8)
+
+ for other in [idx2, idx3, idx4, idx5, idx6]:
+ assert (idx1.values == other.values).all()
+
+ sdate = datetime(1999, 12, 25)
+ edate = datetime(2000, 1, 1)
+ idx = date_range(start=sdate, freq='1B', periods=20)
+ assert len(idx) == 20
+ assert idx[0] == sdate + 0 * offsets.BDay()
+ assert idx.freq == 'B'
+
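+        # the tuple form ('D', 5) is legacy shorthand for freq='5D'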
+ idx = date_range(end=edate, freq=('D', 5), periods=20)
+ assert len(idx) == 20
+ assert idx[-1] == edate
+ assert idx.freq == '5D'
+
+ idx1 = date_range(start=sdate, end=edate, freq='W-SUN')
+ idx2 = date_range(start=sdate, end=edate,
+ freq=offsets.Week(weekday=6))
+ assert len(idx1) == len(idx2)
+ assert idx1.freq == idx2.freq
+
+ idx1 = date_range(start=sdate, end=edate, freq='QS')
+ idx2 = date_range(start=sdate, end=edate,
+ freq=offsets.QuarterBegin(startingMonth=1))
+ assert len(idx1) == len(idx2)
+ assert idx1.freq == idx2.freq
+
+ idx1 = date_range(start=sdate, end=edate, freq='BQ')
+ idx2 = date_range(start=sdate, end=edate,
+ freq=offsets.BQuarterEnd(startingMonth=12))
+ assert len(idx1) == len(idx2)
+ assert idx1.freq == idx2.freq
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_date_range.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_date_range.py
new file mode 100644
index 00000000000..a9bece248e9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_date_range.py
@@ -0,0 +1,842 @@
+"""
+test date_range, bdate_range construction from the convenience range functions
+"""
+
+from datetime import datetime, time, timedelta
+
+import numpy as np
+import pytest
+import pytz
+from pytz import timezone
+
+import pandas.compat as compat
+from pandas.errors import OutOfBoundsDatetime
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import DatetimeIndex, Timestamp, bdate_range, date_range, offsets
+from pandas.tests.series.common import TestData
+import pandas.util.testing as tm
+
+from pandas.tseries.offsets import (
+ BDay, CDay, DateOffset, MonthEnd, generate_range, prefix_mapping)
+
+START, END = datetime(2009, 1, 1), datetime(2010, 1, 1)
+
+
+class TestTimestampEquivDateRange(object):
+ # Older tests in TestTimeSeries constructed their `stamp` objects
+ # using `date_range` instead of the `Timestamp` constructor.
+ # TestTimestampEquivDateRange checks that these are equivalent in the
+ # pertinent cases.
+
+ def test_date_range_timestamp_equiv(self):
+ rng = date_range('20090415', '20090519', tz='US/Eastern')
+ stamp = rng[0]
+
+ ts = Timestamp('20090415', tz='US/Eastern', freq='D')
+ assert ts == stamp
+
+ def test_date_range_timestamp_equiv_dateutil(self):
+ rng = date_range('20090415', '20090519', tz='dateutil/US/Eastern')
+ stamp = rng[0]
+
+ ts = Timestamp('20090415', tz='dateutil/US/Eastern', freq='D')
+ assert ts == stamp
+
+ def test_date_range_timestamp_equiv_explicit_pytz(self):
+ rng = date_range('20090415', '20090519',
+ tz=pytz.timezone('US/Eastern'))
+ stamp = rng[0]
+
+ ts = Timestamp('20090415', tz=pytz.timezone('US/Eastern'), freq='D')
+ assert ts == stamp
+
+ @td.skip_if_windows_python_3
+ def test_date_range_timestamp_equiv_explicit_dateutil(self):
+ from pandas._libs.tslibs.timezones import dateutil_gettz as gettz
+
+ rng = date_range('20090415', '20090519', tz=gettz('US/Eastern'))
+ stamp = rng[0]
+
+ ts = Timestamp('20090415', tz=gettz('US/Eastern'), freq='D')
+ assert ts == stamp
+
+ def test_date_range_timestamp_equiv_from_datetime_instance(self):
+ datetime_instance = datetime(2014, 3, 4)
+        # build a timestamp with a frequency so that it supports
+        # addition/subtraction of integers
+ timestamp_instance = date_range(datetime_instance, periods=1,
+ freq='D')[0]
+
+ ts = Timestamp(datetime_instance, freq='D')
+ assert ts == timestamp_instance
+
+ def test_date_range_timestamp_equiv_preserve_frequency(self):
+ timestamp_instance = date_range('2014-03-05', periods=1, freq='D')[0]
+ ts = Timestamp('2014-03-05', freq='D')
+
+ assert timestamp_instance == ts
+
+
+class TestDateRanges(TestData):
+ def test_date_range_nat(self):
+ # GH#11587
+ msg = "Neither `start` nor `end` can be NaT"
+ with pytest.raises(ValueError, match=msg):
+ date_range(start='2016-01-01', end=pd.NaT, freq='D')
+ with pytest.raises(ValueError, match=msg):
+ date_range(start=pd.NaT, end='2016-01-01', freq='D')
+
+ def test_date_range_multiplication_overflow(self):
+ # GH#24255
+ # check that overflows in calculating `addend = periods * stride`
+ # are caught
+ with tm.assert_produces_warning(None):
+            # we should _not_ be seeing an overflow RuntimeWarning
+ dti = date_range(start='1677-09-22', periods=213503, freq='D')
+
+ assert dti[0] == Timestamp('1677-09-22')
+ assert len(dti) == 213503
+
+ msg = "Cannot generate range with"
+ with pytest.raises(OutOfBoundsDatetime, match=msg):
+ date_range('1969-05-04', periods=200000000, freq='30000D')
+
+ def test_date_range_unsigned_overflow_handling(self):
+ # GH#24255
+ # case where `addend = periods * stride` overflows int64 bounds
+ # but not uint64 bounds
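+        # (the 1677..2262 span is ~1.8e19 ns, above the ~9.2e18 int64
+        # maximum but within uint64 range)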
+ dti = date_range(start='1677-09-22', end='2262-04-11', freq='D')
+
+ dti2 = date_range(start=dti[0], periods=len(dti), freq='D')
+ assert dti2.equals(dti)
+
+ dti3 = date_range(end=dti[-1], periods=len(dti), freq='D')
+ assert dti3.equals(dti)
+
+ def test_date_range_int64_overflow_non_recoverable(self):
+ # GH#24255
+ # case with start later than 1970-01-01, overflow int64 but not uint64
+ msg = "Cannot generate range with"
+ with pytest.raises(OutOfBoundsDatetime, match=msg):
+ date_range(start='1970-02-01', periods=106752 * 24, freq='H')
+
+ # case with end before 1970-01-01, overflow int64 but not uint64
+ with pytest.raises(OutOfBoundsDatetime, match=msg):
+ date_range(end='1969-11-14', periods=106752 * 24, freq='H')
+
+ def test_date_range_int64_overflow_stride_endpoint_different_signs(self):
+        # cases where stride * periods overflows int64 and stride/endpoint
+        # have different signs
+ start = Timestamp('2262-02-23')
+ end = Timestamp('1969-11-14')
+
+ expected = date_range(start=start, end=end, freq='-1H')
+ assert expected[0] == start
+ assert expected[-1] == end
+
+ dti = date_range(end=end, periods=len(expected), freq='-1H')
+ tm.assert_index_equal(dti, expected)
+
+ start2 = Timestamp('1970-02-01')
+ end2 = Timestamp('1677-10-22')
+
+ expected2 = date_range(start=start2, end=end2, freq='-1H')
+ assert expected2[0] == start2
+ assert expected2[-1] == end2
+
+ dti2 = date_range(start=start2, periods=len(expected2), freq='-1H')
+ tm.assert_index_equal(dti2, expected2)
+
+ def test_date_range_out_of_bounds(self):
+ # GH#14187
+ with pytest.raises(OutOfBoundsDatetime):
+ date_range('2016-01-01', periods=100000, freq='D')
+ with pytest.raises(OutOfBoundsDatetime):
+ date_range(end='1763-10-12', periods=100000, freq='D')
+
+ def test_date_range_gen_error(self):
+ rng = date_range('1/1/2000 00:00', '1/1/2000 00:18', freq='5min')
+ assert len(rng) == 4
+
+ @pytest.mark.parametrize("freq", ["AS", "YS"])
+ def test_begin_year_alias(self, freq):
+ # see gh-9313
+ rng = date_range("1/1/2013", "7/1/2017", freq=freq)
+ exp = pd.DatetimeIndex(["2013-01-01", "2014-01-01",
+ "2015-01-01", "2016-01-01",
+ "2017-01-01"], freq=freq)
+ tm.assert_index_equal(rng, exp)
+
+ @pytest.mark.parametrize("freq", ["A", "Y"])
+ def test_end_year_alias(self, freq):
+ # see gh-9313
+ rng = date_range("1/1/2013", "7/1/2017", freq=freq)
+ exp = pd.DatetimeIndex(["2013-12-31", "2014-12-31",
+ "2015-12-31", "2016-12-31"], freq=freq)
+ tm.assert_index_equal(rng, exp)
+
+ @pytest.mark.parametrize("freq", ["BA", "BY"])
+ def test_business_end_year_alias(self, freq):
+ # see gh-9313
+ rng = date_range("1/1/2013", "7/1/2017", freq=freq)
+ exp = pd.DatetimeIndex(["2013-12-31", "2014-12-31",
+ "2015-12-31", "2016-12-30"], freq=freq)
+ tm.assert_index_equal(rng, exp)
+
+ def test_date_range_negative_freq(self):
+ # GH 11018
+ rng = date_range('2011-12-31', freq='-2A', periods=3)
+ exp = pd.DatetimeIndex(['2011-12-31', '2009-12-31',
+ '2007-12-31'], freq='-2A')
+ tm.assert_index_equal(rng, exp)
+ assert rng.freq == '-2A'
+
+ rng = date_range('2011-01-31', freq='-2M', periods=3)
+ exp = pd.DatetimeIndex(['2011-01-31', '2010-11-30',
+ '2010-09-30'], freq='-2M')
+ tm.assert_index_equal(rng, exp)
+ assert rng.freq == '-2M'
+
+ def test_date_range_bms_bug(self):
+ # #1645
+ rng = date_range('1/1/2000', periods=10, freq='BMS')
+
+ ex_first = Timestamp('2000-01-03')
+ assert rng[0] == ex_first
+
+ def test_date_range_normalize(self):
+ snap = datetime.today()
+ n = 50
+
+ rng = date_range(snap, periods=n, normalize=False, freq='2D')
+
+ offset = timedelta(2)
+ values = DatetimeIndex([snap + i * offset for i in range(n)])
+
+ tm.assert_index_equal(rng, values)
+
+ rng = date_range('1/1/2000 08:15', periods=n, normalize=False,
+ freq='B')
+ the_time = time(8, 15)
+ for val in rng:
+ assert val.time() == the_time
+
+    def test_date_range_fy5253(self):
+ dr = date_range(start="2013-01-01", periods=2, freq=offsets.FY5253(
+ startingMonth=1, weekday=3, variation="nearest"))
+ assert dr[0] == Timestamp('2013-01-31')
+ assert dr[1] == Timestamp('2014-01-30')
+
+ def test_date_range_ambiguous_arguments(self):
+ # #2538
+ start = datetime(2011, 1, 1, 5, 3, 40)
+ end = datetime(2011, 1, 1, 8, 9, 40)
+
+ msg = ('Of the four parameters: start, end, periods, and '
+ 'freq, exactly three must be specified')
+ with pytest.raises(ValueError, match=msg):
+ date_range(start, end, periods=10, freq='s')
+
+ def test_date_range_convenience_periods(self):
+ # GH 20808
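+        # with start, end and periods but no freq, points are spaced linearly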
+ result = date_range('2018-04-24', '2018-04-27', periods=3)
+ expected = DatetimeIndex(['2018-04-24 00:00:00',
+ '2018-04-25 12:00:00',
+ '2018-04-27 00:00:00'], freq=None)
+
+ tm.assert_index_equal(result, expected)
+
+        # spacing should stay linear even when the range crosses a DST shift
+ result = date_range('2018-04-01 01:00:00',
+ '2018-04-01 04:00:00',
+ tz='Australia/Sydney',
+ periods=3)
+ expected = DatetimeIndex([Timestamp('2018-04-01 01:00:00+1100',
+ tz='Australia/Sydney'),
+ Timestamp('2018-04-01 02:00:00+1000',
+ tz='Australia/Sydney'),
+ Timestamp('2018-04-01 04:00:00+1000',
+ tz='Australia/Sydney')])
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('start,end,result_tz', [
+ ['20180101', '20180103', 'US/Eastern'],
+ [datetime(2018, 1, 1), datetime(2018, 1, 3), 'US/Eastern'],
+ [Timestamp('20180101'), Timestamp('20180103'), 'US/Eastern'],
+ [Timestamp('20180101', tz='US/Eastern'),
+ Timestamp('20180103', tz='US/Eastern'), 'US/Eastern'],
+ [Timestamp('20180101', tz='US/Eastern'),
+ Timestamp('20180103', tz='US/Eastern'), None]])
+ def test_date_range_linspacing_tz(self, start, end, result_tz):
+ # GH 20983
+ result = date_range(start, end, periods=3, tz=result_tz)
+ expected = date_range('20180101', periods=3, freq='D', tz='US/Eastern')
+ tm.assert_index_equal(result, expected)
+
+ def test_date_range_businesshour(self):
+ idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00',
+ '2014-07-04 11:00',
+ '2014-07-04 12:00', '2014-07-04 13:00',
+ '2014-07-04 14:00',
+ '2014-07-04 15:00', '2014-07-04 16:00'],
+ freq='BH')
+ rng = date_range('2014-07-04 09:00', '2014-07-04 16:00', freq='BH')
+ tm.assert_index_equal(idx, rng)
+
+ idx = DatetimeIndex(
+ ['2014-07-04 16:00', '2014-07-07 09:00'], freq='BH')
+ rng = date_range('2014-07-04 16:00', '2014-07-07 09:00', freq='BH')
+ tm.assert_index_equal(idx, rng)
+
+ idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00',
+ '2014-07-04 11:00',
+ '2014-07-04 12:00', '2014-07-04 13:00',
+ '2014-07-04 14:00',
+ '2014-07-04 15:00', '2014-07-04 16:00',
+ '2014-07-07 09:00', '2014-07-07 10:00',
+ '2014-07-07 11:00',
+ '2014-07-07 12:00', '2014-07-07 13:00',
+ '2014-07-07 14:00',
+ '2014-07-07 15:00', '2014-07-07 16:00',
+ '2014-07-08 09:00', '2014-07-08 10:00',
+ '2014-07-08 11:00',
+ '2014-07-08 12:00', '2014-07-08 13:00',
+ '2014-07-08 14:00',
+ '2014-07-08 15:00', '2014-07-08 16:00'],
+ freq='BH')
+ rng = date_range('2014-07-04 09:00', '2014-07-08 16:00', freq='BH')
+ tm.assert_index_equal(idx, rng)
+
+ def test_range_misspecified(self):
+ # GH #1095
+ msg = ('Of the four parameters: start, end, periods, and '
+ 'freq, exactly three must be specified')
+
+ with pytest.raises(ValueError, match=msg):
+ date_range(start='1/1/2000')
+
+ with pytest.raises(ValueError, match=msg):
+ date_range(end='1/1/2000')
+
+ with pytest.raises(ValueError, match=msg):
+ date_range(periods=10)
+
+ with pytest.raises(ValueError, match=msg):
+ date_range(start='1/1/2000', freq='H')
+
+ with pytest.raises(ValueError, match=msg):
+ date_range(end='1/1/2000', freq='H')
+
+ with pytest.raises(ValueError, match=msg):
+ date_range(periods=10, freq='H')
+
+ with pytest.raises(ValueError, match=msg):
+ date_range()
+
+ @pytest.mark.parametrize('f', [compat.long, int])
+ def test_compat_replace(self, f):
+ # https://github.com/statsmodels/statsmodels/issues/3349
+ # replace should take ints/longs for compat
+ result = date_range(Timestamp('1960-04-01 00:00:00', freq='QS-JAN'),
+ periods=f(76), freq='QS-JAN')
+ assert len(result) == 76
+
+ def test_catch_infinite_loop(self):
+ offset = offsets.DateOffset(minute=5)
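+        # the singular 'minute' kwarg *sets* the minute field instead of
+        # stepping, so applying the offset never advances the date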
+ # blow up, don't loop forever
+ pytest.raises(Exception, date_range, datetime(2011, 11, 11),
+ datetime(2011, 11, 12), freq=offset)
+
+ @pytest.mark.parametrize('periods', (1, 2))
+ def test_wom_len(self, periods):
+ # https://github.com/pandas-dev/pandas/issues/20517
+ res = date_range(start='20110101', periods=periods, freq='WOM-1MON')
+ assert len(res) == periods
+
+ def test_construct_over_dst(self):
+ # GH 20854
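+        # 2010-11-07 01:00 occurs twice in US/Pacific; ambiguous=True picks
+        # the first (PDT) occurrence, ambiguous=False the second (PST)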
+ pre_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific',
+ ambiguous=True)
+ pst_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific',
+ ambiguous=False)
+ expect_data = [Timestamp('2010-11-07 00:00:00', tz='US/Pacific'),
+ pre_dst,
+ pst_dst]
+ expected = DatetimeIndex(expect_data)
+ result = date_range(start='2010-11-7', periods=3,
+ freq='H', tz='US/Pacific')
+ tm.assert_index_equal(result, expected)
+
+ def test_construct_with_different_start_end_string_format(self):
+ # GH 12064
+ result = date_range('2013-01-01 00:00:00+09:00',
+ '2013/01/01 02:00:00+09:00', freq='H')
+ expected = DatetimeIndex([Timestamp('2013-01-01 00:00:00+09:00'),
+ Timestamp('2013-01-01 01:00:00+09:00'),
+ Timestamp('2013-01-01 02:00:00+09:00')])
+ tm.assert_index_equal(result, expected)
+
+ def test_error_with_zero_monthends(self):
+ msg = r'Offset <0 \* MonthEnds> did not increment date'
+ with pytest.raises(ValueError, match=msg):
+ date_range('1/1/2000', '1/1/2001', freq=MonthEnd(0))
+
+ def test_range_bug(self):
+ # GH #770
+ offset = DateOffset(months=3)
+ result = date_range("2011-1-1", "2012-1-31", freq=offset)
+
+ start = datetime(2011, 1, 1)
+ expected = DatetimeIndex([start + i * offset for i in range(5)])
+ tm.assert_index_equal(result, expected)
+
+ def test_range_tz_pytz(self):
+ # see gh-2906
+ tz = timezone('US/Eastern')
+ start = tz.localize(datetime(2011, 1, 1))
+ end = tz.localize(datetime(2011, 1, 3))
+
+ dr = date_range(start=start, periods=3)
+ assert dr.tz.zone == tz.zone
+ assert dr[0] == start
+ assert dr[2] == end
+
+ dr = date_range(end=end, periods=3)
+ assert dr.tz.zone == tz.zone
+ assert dr[0] == start
+ assert dr[2] == end
+
+ dr = date_range(start=start, end=end)
+ assert dr.tz.zone == tz.zone
+ assert dr[0] == start
+ assert dr[2] == end
+
+ @pytest.mark.parametrize('start, end', [
+ [Timestamp(datetime(2014, 3, 6), tz='US/Eastern'),
+ Timestamp(datetime(2014, 3, 12), tz='US/Eastern')],
+ [Timestamp(datetime(2013, 11, 1), tz='US/Eastern'),
+ Timestamp(datetime(2013, 11, 6), tz='US/Eastern')]
+ ])
+ def test_range_tz_dst_straddle_pytz(self, start, end):
+ dr = date_range(start, end, freq='D')
+ assert dr[0] == start
+ assert dr[-1] == end
+ assert np.all(dr.hour == 0)
+
+ dr = date_range(start, end, freq='D', tz='US/Eastern')
+ assert dr[0] == start
+ assert dr[-1] == end
+ assert np.all(dr.hour == 0)
+
+ dr = date_range(start.replace(tzinfo=None), end.replace(
+ tzinfo=None), freq='D', tz='US/Eastern')
+ assert dr[0] == start
+ assert dr[-1] == end
+ assert np.all(dr.hour == 0)
+
+ def test_range_tz_dateutil(self):
+ # see gh-2906
+
+ # Use maybe_get_tz to fix filename in tz under dateutil.
+ from pandas._libs.tslibs.timezones import maybe_get_tz
+ tz = lambda x: maybe_get_tz('dateutil/' + x)
+
+ start = datetime(2011, 1, 1, tzinfo=tz('US/Eastern'))
+ end = datetime(2011, 1, 3, tzinfo=tz('US/Eastern'))
+
+ dr = date_range(start=start, periods=3)
+ assert dr.tz == tz('US/Eastern')
+ assert dr[0] == start
+ assert dr[2] == end
+
+ dr = date_range(end=end, periods=3)
+ assert dr.tz == tz('US/Eastern')
+ assert dr[0] == start
+ assert dr[2] == end
+
+ dr = date_range(start=start, end=end)
+ assert dr.tz == tz('US/Eastern')
+ assert dr[0] == start
+ assert dr[2] == end
+
+ @pytest.mark.parametrize('freq', ["1D", "3D", "2M", "7W", "3H", "A"])
+ def test_range_closed(self, freq):
+ begin = datetime(2011, 1, 1)
+ end = datetime(2014, 1, 1)
+
+ closed = date_range(begin, end, closed=None, freq=freq)
+ left = date_range(begin, end, closed="left", freq=freq)
+ right = date_range(begin, end, closed="right", freq=freq)
+ expected_left = left
+ expected_right = right
+
+ if end == closed[-1]:
+ expected_left = closed[:-1]
+ if begin == closed[0]:
+ expected_right = closed[1:]
+
+ tm.assert_index_equal(expected_left, left)
+ tm.assert_index_equal(expected_right, right)
+
+ def test_range_closed_with_tz_aware_start_end(self):
+ # GH12409, GH12684
+ begin = Timestamp('2011/1/1', tz='US/Eastern')
+ end = Timestamp('2014/1/1', tz='US/Eastern')
+
+ for freq in ["1D", "3D", "2M", "7W", "3H", "A"]:
+ closed = date_range(begin, end, closed=None, freq=freq)
+ left = date_range(begin, end, closed="left", freq=freq)
+ right = date_range(begin, end, closed="right", freq=freq)
+ expected_left = left
+ expected_right = right
+
+ if end == closed[-1]:
+ expected_left = closed[:-1]
+ if begin == closed[0]:
+ expected_right = closed[1:]
+
+ tm.assert_index_equal(expected_left, left)
+ tm.assert_index_equal(expected_right, right)
+
+ begin = Timestamp('2011/1/1')
+ end = Timestamp('2014/1/1')
+ begintz = Timestamp('2011/1/1', tz='US/Eastern')
+ endtz = Timestamp('2014/1/1', tz='US/Eastern')
+
+ for freq in ["1D", "3D", "2M", "7W", "3H", "A"]:
+ closed = date_range(begin, end, closed=None, freq=freq,
+ tz='US/Eastern')
+ left = date_range(begin, end, closed="left", freq=freq,
+ tz='US/Eastern')
+ right = date_range(begin, end, closed="right", freq=freq,
+ tz='US/Eastern')
+ expected_left = left
+ expected_right = right
+
+ if endtz == closed[-1]:
+ expected_left = closed[:-1]
+ if begintz == closed[0]:
+ expected_right = closed[1:]
+
+ tm.assert_index_equal(expected_left, left)
+ tm.assert_index_equal(expected_right, right)
+
+ @pytest.mark.parametrize('closed', ['right', 'left', None])
+ def test_range_closed_boundary(self, closed):
+ # GH#11804
+ right_boundary = date_range('2015-09-12', '2015-12-01',
+ freq='QS-MAR', closed=closed)
+ left_boundary = date_range('2015-09-01', '2015-09-12',
+ freq='QS-MAR', closed=closed)
+ both_boundary = date_range('2015-09-01', '2015-12-01',
+ freq='QS-MAR', closed=closed)
+ expected_right = expected_left = expected_both = both_boundary
+
+ if closed == 'right':
+ expected_left = both_boundary[1:]
+ if closed == 'left':
+ expected_right = both_boundary[:-1]
+ if closed is None:
+ expected_right = both_boundary[1:]
+ expected_left = both_boundary[:-1]
+
+ tm.assert_index_equal(right_boundary, expected_right)
+ tm.assert_index_equal(left_boundary, expected_left)
+ tm.assert_index_equal(both_boundary, expected_both)
+
+ def test_years_only(self):
+ # GH 6961
+ dr = date_range('2014', '2015', freq='M')
+ assert dr[0] == datetime(2014, 1, 31)
+ assert dr[-1] == datetime(2014, 12, 31)
+
+ def test_freq_divides_end_in_nanos(self):
+ # GH 10885
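+        # 345min does not evenly divide the 6-hour span, so the stated end
+        # is not itself a generated point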
+ result_1 = date_range('2005-01-12 10:00', '2005-01-12 16:00',
+ freq='345min')
+ result_2 = date_range('2005-01-13 10:00', '2005-01-13 16:00',
+ freq='345min')
+ expected_1 = DatetimeIndex(['2005-01-12 10:00:00',
+ '2005-01-12 15:45:00'],
+ dtype='datetime64[ns]', freq='345T',
+ tz=None)
+ expected_2 = DatetimeIndex(['2005-01-13 10:00:00',
+ '2005-01-13 15:45:00'],
+ dtype='datetime64[ns]', freq='345T',
+ tz=None)
+ tm.assert_index_equal(result_1, expected_1)
+ tm.assert_index_equal(result_2, expected_2)
+
+ def test_cached_range_bug(self):
+ rng = date_range('2010-09-01 05:00:00', periods=50,
+ freq=DateOffset(hours=6))
+ assert len(rng) == 50
+ assert rng[0] == datetime(2010, 9, 1, 5)
+
+    def test_timezone_comparison_bug(self):
+ # smoke test
+ start = Timestamp('20130220 10:00', tz='US/Eastern')
+ result = date_range(start, periods=2, tz='US/Eastern')
+ assert len(result) == 2
+
+    def test_timezone_comparison_assert(self):
+ start = Timestamp('20130220 10:00', tz='US/Eastern')
+ msg = 'Inferred time zone not equal to passed time zone'
+ with pytest.raises(AssertionError, match=msg):
+ date_range(start, periods=2, tz='Europe/Berlin')
+
+ def test_negative_non_tick_frequency_descending_dates(self,
+ tz_aware_fixture):
+ # GH 23270
+ tz = tz_aware_fixture
+ result = pd.date_range(start='2011-06-01', end='2011-01-01',
+ freq='-1MS', tz=tz)
+ expected = pd.date_range(end='2011-06-01', start='2011-01-01',
+ freq='1MS', tz=tz)[::-1]
+ tm.assert_index_equal(result, expected)
+
+
+class TestGenRangeGeneration(object):
+
+ def test_generate(self):
+ rng1 = list(generate_range(START, END, offset=BDay()))
+ rng2 = list(generate_range(START, END, offset='B'))
+ assert rng1 == rng2
+
+ def test_generate_cday(self):
+ rng1 = list(generate_range(START, END, offset=CDay()))
+ rng2 = list(generate_range(START, END, offset='C'))
+ assert rng1 == rng2
+
+ def test_1(self):
+ rng = list(generate_range(start=datetime(2009, 3, 25), periods=2))
+ expected = [datetime(2009, 3, 25), datetime(2009, 3, 26)]
+ assert rng == expected
+
+ def test_2(self):
+ rng = list(generate_range(start=datetime(2008, 1, 1),
+ end=datetime(2008, 1, 3)))
+ expected = [datetime(2008, 1, 1),
+ datetime(2008, 1, 2),
+ datetime(2008, 1, 3)]
+ assert rng == expected
+
+ def test_3(self):
+ rng = list(generate_range(start=datetime(2008, 1, 5),
+ end=datetime(2008, 1, 6)))
+ expected = []
+ assert rng == expected
+
+ def test_precision_finer_than_offset(self):
+ # GH#9907
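+        # the seconds component of `start` is finer than the Q/W offsets and
+        # should be preserved in every generated timestamp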
+ result1 = pd.date_range(start='2015-04-15 00:00:03',
+ end='2016-04-22 00:00:00', freq='Q')
+ result2 = pd.date_range(start='2015-04-15 00:00:03',
+ end='2015-06-22 00:00:04', freq='W')
+ expected1_list = ['2015-06-30 00:00:03', '2015-09-30 00:00:03',
+ '2015-12-31 00:00:03', '2016-03-31 00:00:03']
+ expected2_list = ['2015-04-19 00:00:03', '2015-04-26 00:00:03',
+ '2015-05-03 00:00:03', '2015-05-10 00:00:03',
+ '2015-05-17 00:00:03', '2015-05-24 00:00:03',
+ '2015-05-31 00:00:03', '2015-06-07 00:00:03',
+ '2015-06-14 00:00:03', '2015-06-21 00:00:03']
+ expected1 = DatetimeIndex(expected1_list, dtype='datetime64[ns]',
+ freq='Q-DEC', tz=None)
+ expected2 = DatetimeIndex(expected2_list, dtype='datetime64[ns]',
+ freq='W-SUN', tz=None)
+ tm.assert_index_equal(result1, expected1)
+ tm.assert_index_equal(result2, expected2)
+
+ dt1, dt2 = '2017-01-01', '2017-01-01'
+ tz1, tz2 = 'US/Eastern', 'Europe/London'
+
+ @pytest.mark.parametrize("start,end", [
+ (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2)),
+ (pd.Timestamp(dt1), pd.Timestamp(dt2, tz=tz2)),
+ (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2, tz=tz2)),
+ (pd.Timestamp(dt1, tz=tz2), pd.Timestamp(dt2, tz=tz1))
+ ])
+ def test_mismatching_tz_raises_err(self, start, end):
+ # issue 18488
+ with pytest.raises(TypeError):
+ pd.date_range(start, end)
+ with pytest.raises(TypeError):
+ pd.date_range(start, end, freq=BDay())
+
+
+class TestBusinessDateRange(object):
+
+ def test_constructor(self):
+ bdate_range(START, END, freq=BDay())
+ bdate_range(START, periods=20, freq=BDay())
+ bdate_range(end=START, periods=20, freq=BDay())
+
+ msg = 'periods must be a number, got B'
+ with pytest.raises(TypeError, match=msg):
+ date_range('2011-1-1', '2012-1-1', 'B')
+
+ with pytest.raises(TypeError, match=msg):
+ bdate_range('2011-1-1', '2012-1-1', 'B')
+
+ msg = 'freq must be specified for bdate_range; use date_range instead'
+ with pytest.raises(TypeError, match=msg):
+ bdate_range(START, END, periods=10, freq=None)
+
+ def test_naive_aware_conflicts(self):
+ naive = bdate_range(START, END, freq=BDay(), tz=None)
+ aware = bdate_range(START, END, freq=BDay(), tz="Asia/Hong_Kong")
+
+ msg = 'tz-naive.*tz-aware'
+ with pytest.raises(TypeError, match=msg):
+ naive.join(aware)
+
+ with pytest.raises(TypeError, match=msg):
+ aware.join(naive)
+
+ def test_misc(self):
+ end = datetime(2009, 5, 13)
+ dr = bdate_range(end=end, periods=20)
+ firstDate = end - 19 * BDay()
+
+ assert len(dr) == 20
+ assert dr[0] == firstDate
+ assert dr[-1] == end
+
+ def test_date_parse_failure(self):
+ badly_formed_date = '2007/100/1'
+
+ with pytest.raises(ValueError):
+ Timestamp(badly_formed_date)
+
+ with pytest.raises(ValueError):
+ bdate_range(start=badly_formed_date, periods=10)
+
+ with pytest.raises(ValueError):
+ bdate_range(end=badly_formed_date, periods=10)
+
+ with pytest.raises(ValueError):
+ bdate_range(badly_formed_date, badly_formed_date)
+
+ def test_daterange_bug_456(self):
+ # GH #456
+ rng1 = bdate_range('12/5/2011', '12/5/2011')
+ rng2 = bdate_range('12/2/2011', '12/5/2011')
+ rng2.freq = BDay()
+
+ result = rng1.union(rng2)
+ assert isinstance(result, DatetimeIndex)
+
+ @pytest.mark.parametrize('closed', ['left', 'right'])
+ def test_bdays_and_open_boundaries(self, closed):
+ # GH 6673
+ start = '2018-07-21' # Saturday
+ end = '2018-07-29' # Sunday
+ result = pd.date_range(start, end, freq='B', closed=closed)
+
+ bday_start = '2018-07-23' # Monday
+ bday_end = '2018-07-27' # Friday
+ expected = pd.date_range(bday_start, bday_end, freq='D')
+ tm.assert_index_equal(result, expected)
+
+
+class TestCustomDateRange(object):
+
+ def test_constructor(self):
+ bdate_range(START, END, freq=CDay())
+ bdate_range(START, periods=20, freq=CDay())
+ bdate_range(end=START, periods=20, freq=CDay())
+
+ msg = 'periods must be a number, got C'
+ with pytest.raises(TypeError, match=msg):
+ date_range('2011-1-1', '2012-1-1', 'C')
+
+ with pytest.raises(TypeError, match=msg):
+ bdate_range('2011-1-1', '2012-1-1', 'C')
+
+ def test_misc(self):
+ end = datetime(2009, 5, 13)
+ dr = bdate_range(end=end, periods=20, freq='C')
+ firstDate = end - 19 * CDay()
+
+ assert len(dr) == 20
+ assert dr[0] == firstDate
+ assert dr[-1] == end
+
+ def test_daterange_bug_456(self):
+ # GH #456
+ rng1 = bdate_range('12/5/2011', '12/5/2011', freq='C')
+ rng2 = bdate_range('12/2/2011', '12/5/2011', freq='C')
+ rng2.freq = CDay()
+
+ result = rng1.union(rng2)
+ assert isinstance(result, DatetimeIndex)
+
+ def test_cdaterange(self):
+ result = bdate_range('2013-05-01', periods=3, freq='C')
+ expected = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-03'])
+ tm.assert_index_equal(result, expected)
+
+ def test_cdaterange_weekmask(self):
+ result = bdate_range('2013-05-01', periods=3, freq='C',
+ weekmask='Sun Mon Tue Wed Thu')
+ expected = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-05'])
+ tm.assert_index_equal(result, expected)
+
+ # raise with non-custom freq
+ msg = ('a custom frequency string is required when holidays or '
+ 'weekmask are passed, got frequency B')
+ with pytest.raises(ValueError, match=msg):
+ bdate_range('2013-05-01', periods=3,
+ weekmask='Sun Mon Tue Wed Thu')
+
+ def test_cdaterange_holidays(self):
+ result = bdate_range('2013-05-01', periods=3, freq='C',
+ holidays=['2013-05-01'])
+ expected = DatetimeIndex(['2013-05-02', '2013-05-03', '2013-05-06'])
+ tm.assert_index_equal(result, expected)
+
+ # raise with non-custom freq
+ msg = ('a custom frequency string is required when holidays or '
+ 'weekmask are passed, got frequency B')
+ with pytest.raises(ValueError, match=msg):
+ bdate_range('2013-05-01', periods=3, holidays=['2013-05-01'])
+
+ def test_cdaterange_weekmask_and_holidays(self):
+ result = bdate_range('2013-05-01', periods=3, freq='C',
+ weekmask='Sun Mon Tue Wed Thu',
+ holidays=['2013-05-01'])
+ expected = DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06'])
+ tm.assert_index_equal(result, expected)
+
+ # raise with non-custom freq
+ msg = ('a custom frequency string is required when holidays or '
+ 'weekmask are passed, got frequency B')
+ with pytest.raises(ValueError, match=msg):
+ bdate_range('2013-05-01', periods=3,
+ weekmask='Sun Mon Tue Wed Thu',
+ holidays=['2013-05-01'])
+
+ @pytest.mark.parametrize('freq', [freq for freq in prefix_mapping
+ if freq.startswith('C')])
+ def test_all_custom_freq(self, freq):
+ # should not raise
+ bdate_range(START, END, freq=freq, weekmask='Mon Wed Fri',
+ holidays=['2009-03-14'])
+
+ bad_freq = freq + 'FOO'
+ msg = 'invalid custom frequency string: {freq}'
+ with pytest.raises(ValueError, match=msg.format(freq=bad_freq)):
+ bdate_range(START, END, freq=bad_freq)
+
+ @pytest.mark.parametrize('start_end', [
+ ('2018-01-01T00:00:01.000Z', '2018-01-03T00:00:01.000Z'),
+ ('2018-01-01T00:00:00.010Z', '2018-01-03T00:00:00.010Z'),
+ ('2001-01-01T00:00:00.010Z', '2001-01-03T00:00:00.010Z')])
+ def test_range_with_millisecond_resolution(self, start_end):
+ # https://github.com/pandas-dev/pandas/issues/24110
+ start, end = start_end
+ result = pd.date_range(start=start, end=end, periods=2, closed='left')
+ expected = DatetimeIndex([start])
+ tm.assert_index_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_datetime.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_datetime.py
new file mode 100644
index 00000000000..e1ba0e17084
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_datetime.py
@@ -0,0 +1,436 @@
+from datetime import date
+
+import dateutil
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+
+import pandas as pd
+from pandas import (
+ DataFrame, DatetimeIndex, Index, Timestamp, date_range, offsets)
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal
+
+randn = np.random.randn
+
+
+class TestDatetimeIndex(object):
+
+ def test_roundtrip_pickle_with_tz(self):
+
+ # GH 8367
+ # round-trip of timezone
+ index = date_range('20130101', periods=3, tz='US/Eastern', name='foo')
+ unpickled = tm.round_trip_pickle(index)
+ tm.assert_index_equal(index, unpickled)
+
+ def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self):
+ # GH7774
+ index = date_range('20130101', periods=3, tz='US/Eastern')
+ assert str(index.reindex([])[0].tz) == 'US/Eastern'
+ assert str(index.reindex(np.array([]))[0].tz) == 'US/Eastern'
+
+ def test_time_loc(self): # GH8667
+ from datetime import time
+ from pandas._libs.index import _SIZE_CUTOFF
+
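+ # ns straddles _SIZE_CUTOFF so that (per our reading of the index
+ # engine) both lookup paths get exercised: the hash-table path on
+ # the smaller index and the binary-search path on the larger one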
+ ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64)
+ key = time(15, 11, 30)
+ start = key.hour * 3600 + key.minute * 60 + key.second
+ step = 24 * 3600
+
+ for n in ns:
+ idx = pd.date_range('2014-11-26', periods=n, freq='S')
+ ts = pd.Series(np.random.randn(n), index=idx)
+ i = np.arange(start, n, step)
+
+ tm.assert_numpy_array_equal(ts.index.get_loc(key), i,
+ check_dtype=False)
+ tm.assert_series_equal(ts[key], ts.iloc[i])
+
+ left, right = ts.copy(), ts.copy()
+ left[key] *= -10
+ right.iloc[i] *= -10
+ tm.assert_series_equal(left, right)
+
+ def test_time_overflow_for_32bit_machines(self):
+ # GH8943. On some machines NumPy defaults to np.int32 (for example,
+ # 32-bit Linux machines). In _generate_regular_range (tseries/index.py),
+ # `periods` gets multiplied by `strides` (which has value 1e9); since
+ # the max value for np.int32 is ~2e9 and those machines won't promote
+ # np.int32 to np.int64, the multiplication overflows.
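+ # Illustration (hypothetical arithmetic, not executed here): with
+ # int32 wrap-around, 1000 * 10**9 evaluates to
+ # 10**12 % 2**32 - 2**32 == -727379968 instead of 10**12.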
+ periods = np.int_(1000)
+
+ idx1 = pd.date_range(start='2000', periods=periods, freq='S')
+ assert len(idx1) == periods
+
+ idx2 = pd.date_range(end='2000', periods=periods, freq='S')
+ assert len(idx2) == periods
+
+ def test_nat(self):
+ assert DatetimeIndex([np.nan])[0] is pd.NaT
+
+ def test_week_of_month_frequency(self):
+ # GH 5348: combining week-of-month indexes shouldn't raise
+ # "ValueError: Could not evaluate WOM-1SUN"
+ d1 = date(2002, 9, 1)
+ d2 = date(2013, 10, 27)
+ d3 = date(2012, 9, 30)
+ idx1 = DatetimeIndex([d1, d2])
+ idx2 = DatetimeIndex([d3])
+ result_append = idx1.append(idx2)
+ expected = DatetimeIndex([d1, d2, d3])
+ tm.assert_index_equal(result_append, expected)
+ result_union = idx1.union(idx2)
+ expected = DatetimeIndex([d1, d3, d2])
+ tm.assert_index_equal(result_union, expected)
+
+ # GH 5115
+ result = date_range("2013-1-1", periods=4, freq='WOM-1SAT')
+ dates = ['2013-01-05', '2013-02-02', '2013-03-02', '2013-04-06']
+ expected = DatetimeIndex(dates, freq='WOM-1SAT')
+ tm.assert_index_equal(result, expected)
+
+ def test_hash_error(self):
+ index = date_range('20010101', periods=10)
+ with pytest.raises(TypeError, match=("unhashable type: %r" %
+ type(index).__name__)):
+ hash(index)
+
+ def test_stringified_slice_with_tz(self):
+ # GH#2658
+ import datetime
+ start = datetime.datetime.now()
+ idx = date_range(start=start, freq="1d", periods=10)
+ df = DataFrame(lrange(10), index=idx)
+ df["2013-01-14 23:44:34.437768-05:00":] # no exception here
+
+ def test_append_join_nondatetimeindex(self):
+ rng = date_range('1/1/2000', periods=10)
+ idx = Index(['a', 'b', 'c', 'd'])
+
+ result = rng.append(idx)
+ assert isinstance(result[0], Timestamp)
+
+ # it works
+ rng.join(idx, how='outer')
+
+ def test_map(self):
+ rng = date_range('1/1/2000', periods=10)
+
+ f = lambda x: x.strftime('%Y%m%d')
+ result = rng.map(f)
+ exp = Index([f(x) for x in rng], dtype='<U8')
+ tm.assert_index_equal(result, exp)
+
+ def test_map_fallthrough(self, capsys):
+ # GH#22067, check we don't get warnings about silently ignored errors
+ dti = date_range('2017-01-01', '2018-01-01', freq='B')
+
+ dti.map(lambda x: pd.Period(year=x.year, month=x.month, freq='M'))
+
+ captured = capsys.readouterr()
+ assert captured.err == ''
+
+ def test_iteration_preserves_tz(self):
+ # see gh-8890
+ index = date_range("2012-01-01", periods=3, freq='H', tz='US/Eastern')
+
+ for i, ts in enumerate(index):
+ result = ts
+ expected = index[i]
+ assert result == expected
+
+ index = date_range("2012-01-01", periods=3, freq='H',
+ tz=dateutil.tz.tzoffset(None, -28800))
+
+ for i, ts in enumerate(index):
+ result = ts
+ expected = index[i]
+ assert result._repr_base == expected._repr_base
+ assert result == expected
+
+ # GH#9100
+ index = pd.DatetimeIndex(['2014-12-01 03:32:39.987000-08:00',
+ '2014-12-01 04:12:34.987000-08:00'])
+ for i, ts in enumerate(index):
+ result = ts
+ expected = index[i]
+ assert result._repr_base == expected._repr_base
+ assert result == expected
+
+ @pytest.mark.parametrize('periods', [0, 9999, 10000, 10001])
+ def test_iteration_over_chunksize(self, periods):
+ # GH21012
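+ # the parametrized values bracket what appears to be a 10,000-element
+ # internal iteration chunk, covering the empty, just-below-boundary,
+ # exact-boundary and just-above-boundary cases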
+
+ index = date_range('2000-01-01 00:00:00', periods=periods, freq='min')
+ num = 0
+ for stamp in index:
+ assert index[num] == stamp
+ num += 1
+ assert num == len(index)
+
+ def test_misc_coverage(self):
+ rng = date_range('1/1/2000', periods=5)
+ result = rng.groupby(rng.day)
+ assert isinstance(list(result.values())[0][0], Timestamp)
+
+ idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02'])
+ assert not idx.equals(list(idx))
+
+ non_datetime = Index(list('abc'))
+ assert not idx.equals(list(non_datetime))
+
+ def test_string_index_series_name_converted(self):
+ # GH#1644
+ df = DataFrame(np.random.randn(10, 4),
+ index=date_range('1/1/2000', periods=10))
+
+ result = df.loc['1/3/2000']
+ assert result.name == df.index[2]
+
+ result = df.T['1/3/2000']
+ assert result.name == df.index[2]
+
+ def test_get_duplicates(self):
+ idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02',
+ '2000-01-03', '2000-01-03', '2000-01-04'])
+
+ with tm.assert_produces_warning(FutureWarning):
+ # Deprecated - see GH20239
+ result = idx.get_duplicates()
+
+ ex = DatetimeIndex(['2000-01-02', '2000-01-03'])
+ tm.assert_index_equal(result, ex)
+
+ def test_argmin_argmax(self):
+ idx = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-02'])
+ assert idx.argmin() == 1
+ assert idx.argmax() == 0
+
+ def test_sort_values(self):
+ idx = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-02'])
+
+ ordered = idx.sort_values()
+ assert ordered.is_monotonic
+
+ ordered = idx.sort_values(ascending=False)
+ assert ordered[::-1].is_monotonic
+
+ ordered, dexer = idx.sort_values(return_indexer=True)
+ assert ordered.is_monotonic
+ tm.assert_numpy_array_equal(dexer, np.array([1, 2, 0], dtype=np.intp))
+
+ ordered, dexer = idx.sort_values(return_indexer=True, ascending=False)
+ assert ordered[::-1].is_monotonic
+ tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1], dtype=np.intp))
+
+ def test_map_bug_1677(self):
+ index = DatetimeIndex(['2012-04-25 09:30:00.393000'])
+ f = index.asof
+
+ result = index.map(f)
+ expected = Index([f(index[0])])
+ tm.assert_index_equal(result, expected)
+
+ def test_groupby_function_tuple_1677(self):
+ df = DataFrame(np.random.rand(100),
+ index=date_range("1/1/2000", periods=100))
+ monthly_group = df.groupby(lambda x: (x.year, x.month))
+
+ result = monthly_group.mean()
+ assert isinstance(result.index[0], tuple)
+
+ def test_append_numpy_bug_1681(self):
+ # another datetime64 bug
+ dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
+ a = DataFrame()
+ c = DataFrame({'A': 'foo', 'B': dr}, index=dr)
+
+ result = a.append(c)
+ assert (result['B'] == dr).all()
+
+ def test_isin(self):
+ index = tm.makeDateIndex(4)
+ result = index.isin(index)
+ assert result.all()
+
+ result = index.isin(list(index))
+ assert result.all()
+
+ assert_almost_equal(index.isin([index[2], 5]),
+ np.array([False, False, True, False]))
+
+ def test_does_not_convert_mixed_integer(self):
+ df = tm.makeCustomDataframe(10, 10,
+ data_gen_f=lambda *args, **kwargs: randn(),
+ r_idx_type='i', c_idx_type='dt')
+ cols = df.columns.join(df.index, how='outer')
+ joined = cols.join(df.columns)
+ assert cols.dtype == np.dtype('O')
+ assert cols.dtype == joined.dtype
+ tm.assert_numpy_array_equal(cols.values, joined.values)
+
+ def test_join_self(self, join_type):
+ index = date_range('1/1/2000', periods=10)
+ joined = index.join(index, how=join_type)
+ assert index is joined
+
+ def assert_index_parameters(self, index):
+ assert index.freq == '40960N'
+ assert index.inferred_freq == '40960N'
+
+ def test_ns_index(self):
+ nsamples = 400
+ ns = int(1e9 / 24414)
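+ # ~24414 Hz sampling: int(1e9 / 24414) == 40960, matching the
+ # '40960N' frequency checked in assert_index_parameters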
+ dtstart = np.datetime64('2012-09-20T00:00:00')
+
+ dt = dtstart + np.arange(nsamples) * np.timedelta64(ns, 'ns')
+ freq = ns * offsets.Nano()
+ index = pd.DatetimeIndex(dt, freq=freq, name='time')
+ self.assert_index_parameters(index)
+
+ new_index = pd.date_range(start=index[0], end=index[-1],
+ freq=index.freq)
+ self.assert_index_parameters(new_index)
+
+ def test_join_with_period_index(self, join_type):
+ df = tm.makeCustomDataframe(
+ 10, 10, data_gen_f=lambda *args: np.random.randint(2),
+ c_idx_type='p', r_idx_type='dt')
+ s = df.iloc[:5, 0]
+
+ msg = 'can only call with other PeriodIndex-ed objects'
+ with pytest.raises(ValueError, match=msg):
+ df.columns.join(s.index, how=join_type)
+
+ def test_factorize(self):
+ idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02', '2014-02',
+ '2014-03', '2014-03'])
+
+ exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp)
+ exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03'])
+
+ arr, idx = idx1.factorize()
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, exp_idx)
+
+ arr, idx = idx1.factorize(sort=True)
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, exp_idx)
+
+ # tz must be preserved
+ idx1 = idx1.tz_localize('Asia/Tokyo')
+ exp_idx = exp_idx.tz_localize('Asia/Tokyo')
+
+ arr, idx = idx1.factorize()
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, exp_idx)
+
+ idx2 = pd.DatetimeIndex(['2014-03', '2014-03', '2014-02', '2014-01',
+ '2014-03', '2014-01'])
+
+ exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp)
+ exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03'])
+ arr, idx = idx2.factorize(sort=True)
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, exp_idx)
+
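+ # without sort=True the uniques come back in first-appearance order,
+ # so the codes differ from the sorted case above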
+ exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp)
+ exp_idx = DatetimeIndex(['2014-03', '2014-02', '2014-01'])
+ arr, idx = idx2.factorize()
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, exp_idx)
+
+ # freq must be preserved
+ idx3 = date_range('2000-01', periods=4, freq='M', tz='Asia/Tokyo')
+ exp_arr = np.array([0, 1, 2, 3], dtype=np.intp)
+ arr, idx = idx3.factorize()
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, idx3)
+
+ def test_factorize_tz(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ # GH#13750
+ base = pd.date_range('2016-11-05', freq='H', periods=100, tz=tz)
+ idx = base.repeat(5)
+
+ exp_arr = np.arange(100, dtype=np.intp).repeat(5)
+
+ for obj in [idx, pd.Series(idx)]:
+ arr, res = obj.factorize()
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(res, base)
+
+ def test_factorize_dst(self):
+ # GH 13750
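+ # 2016-11-06 is a US DST transition, so wall-clock hours repeat;
+ # factorize should still treat the 12 stamps as distinct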
+ idx = pd.date_range('2016-11-06', freq='H', periods=12,
+ tz='US/Eastern')
+
+ for obj in [idx, pd.Series(idx)]:
+ arr, res = obj.factorize()
+ tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
+ tm.assert_index_equal(res, idx)
+
+ idx = pd.date_range('2016-06-13', freq='H', periods=12,
+ tz='US/Eastern')
+
+ for obj in [idx, pd.Series(idx)]:
+ arr, res = obj.factorize()
+ tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
+ tm.assert_index_equal(res, idx)
+
+ @pytest.mark.parametrize('arr, expected', [
+ (pd.DatetimeIndex(['2017', '2017']), pd.DatetimeIndex(['2017'])),
+ (pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern'),
+ pd.DatetimeIndex(['2017'], tz='US/Eastern')),
+ ])
+ def test_unique(self, arr, expected):
+ result = arr.unique()
+ tm.assert_index_equal(result, expected)
+ # GH 21737
+ # Ensure the underlying data is consistent
+ assert result[0] == expected[0]
+
+ def test_asarray_tz_naive(self):
+ # This shouldn't produce a warning.
+ idx = pd.date_range('2000', periods=2)
+ # M8[ns] by default
+ with tm.assert_produces_warning(None):
+ result = np.asarray(idx)
+
+ expected = np.array(['2000-01-01', '2000-01-02'], dtype='M8[ns]')
+ tm.assert_numpy_array_equal(result, expected)
+
+ # optionally, object
+ with tm.assert_produces_warning(None):
+ result = np.asarray(idx, dtype=object)
+
+ expected = np.array([pd.Timestamp('2000-01-01'),
+ pd.Timestamp('2000-01-02')])
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_asarray_tz_aware(self):
+ tz = 'US/Central'
+ idx = pd.date_range('2000', periods=2, tz=tz)
+ expected = np.array(['2000-01-01T06', '2000-01-02T06'], dtype='M8[ns]')
+ # We warn by default and return an ndarray[M8[ns]]
+ with tm.assert_produces_warning(FutureWarning):
+ result = np.asarray(idx)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ # Old behavior with no warning
+ with tm.assert_produces_warning(None):
+ result = np.asarray(idx, dtype="M8[ns]")
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ # Future behavior with no warning
+ expected = np.array([pd.Timestamp("2000-01-01", tz=tz),
+ pd.Timestamp("2000-01-02", tz=tz)])
+ with tm.assert_produces_warning(None):
+ result = np.asarray(idx, dtype=object)
+
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_datetimelike.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_datetimelike.py
new file mode 100644
index 00000000000..f095e0a06c3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_datetimelike.py
@@ -0,0 +1,31 @@
+""" generic tests from the Datetimelike class """
+
+from pandas import DatetimeIndex, date_range
+from pandas.util import testing as tm
+
+from ..datetimelike import DatetimeLike
+
+
+class TestDatetimeIndex(DatetimeLike):
+ _holder = DatetimeIndex
+
+ def setup_method(self, method):
+ self.indices = dict(index=tm.makeDateIndex(10),
+ index_dec=date_range('20130110', periods=10,
+ freq='-1D'))
+ self.setup_indices()
+
+ def create_index(self):
+ return date_range('20130101', periods=5)
+
+ def test_shift(self):
+ pass # handled in test_ops
+
+ def test_pickle_compat_construction(self):
+ pass
+
+ def test_intersection(self):
+ pass # handled in test_setops
+
+ def test_union(self):
+ pass # handled in test_setops
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_formats.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_formats.py
new file mode 100644
index 00000000000..df0a5742e7a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_formats.py
@@ -0,0 +1,221 @@
+from datetime import datetime
+
+import dateutil.tz
+import numpy as np
+import pytest
+import pytz
+
+import pandas as pd
+from pandas import DatetimeIndex, Series
+import pandas.util.testing as tm
+
+
+def test_to_native_types():
+ index = pd.date_range(freq='1D', periods=3, start='2017-01-01')
+
+ # First, with no arguments.
+ expected = np.array(['2017-01-01', '2017-01-02',
+ '2017-01-03'], dtype=object)
+
+ result = index.to_native_types()
+ tm.assert_numpy_array_equal(result, expected)
+
+ # No NaN values, so na_rep has no effect
+ result = index.to_native_types(na_rep='pandas')
+ tm.assert_numpy_array_equal(result, expected)
+
+ # Make sure slicing works
+ expected = np.array(['2017-01-01', '2017-01-03'], dtype=object)
+
+ result = index.to_native_types([0, 2])
+ tm.assert_numpy_array_equal(result, expected)
+
+ # Make sure date formatting works
+ expected = np.array(['01-2017-01', '01-2017-02',
+ '01-2017-03'], dtype=object)
+
+ result = index.to_native_types(date_format='%m-%Y-%d')
+ tm.assert_numpy_array_equal(result, expected)
+
+ # NULL object handling should work
+ index = DatetimeIndex(['2017-01-01', pd.NaT, '2017-01-03'])
+ expected = np.array(['2017-01-01', 'NaT', '2017-01-03'], dtype=object)
+
+ result = index.to_native_types()
+ tm.assert_numpy_array_equal(result, expected)
+
+ expected = np.array(['2017-01-01', 'pandas',
+ '2017-01-03'], dtype=object)
+
+ result = index.to_native_types(na_rep='pandas')
+ tm.assert_numpy_array_equal(result, expected)
+
+
+class TestDatetimeIndexRendering(object):
+ def test_dti_repr_short(self):
+ dr = pd.date_range(start='1/1/2012', periods=1)
+ repr(dr)
+
+ dr = pd.date_range(start='1/1/2012', periods=2)
+ repr(dr)
+
+ dr = pd.date_range(start='1/1/2012', periods=3)
+ repr(dr)
+
+ @pytest.mark.parametrize('method', ['__repr__', '__unicode__', '__str__'])
+ def test_dti_representation(self, method):
+ idxs = []
+ idxs.append(DatetimeIndex([], freq='D'))
+ idxs.append(DatetimeIndex(['2011-01-01'], freq='D'))
+ idxs.append(DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D'))
+ idxs.append(DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'],
+ freq='D'))
+ idxs.append(DatetimeIndex(
+ ['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'
+ ], freq='H', tz='Asia/Tokyo'))
+ idxs.append(DatetimeIndex(
+ ['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='US/Eastern'))
+ idxs.append(DatetimeIndex(
+ ['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='UTC'))
+
+ exp = []
+ exp.append("""DatetimeIndex([], dtype='datetime64[ns]', freq='D')""")
+ exp.append("DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', "
+ "freq='D')")
+ exp.append("DatetimeIndex(['2011-01-01', '2011-01-02'], "
+ "dtype='datetime64[ns]', freq='D')")
+ exp.append("DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], "
+ "dtype='datetime64[ns]', freq='D')")
+ exp.append("DatetimeIndex(['2011-01-01 09:00:00+09:00', "
+ "'2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00']"
+ ", dtype='datetime64[ns, Asia/Tokyo]', freq='H')")
+ exp.append("DatetimeIndex(['2011-01-01 09:00:00-05:00', "
+ "'2011-01-01 10:00:00-05:00', 'NaT'], "
+ "dtype='datetime64[ns, US/Eastern]', freq=None)")
+ exp.append("DatetimeIndex(['2011-01-01 09:00:00+00:00', "
+ "'2011-01-01 10:00:00+00:00', 'NaT'], "
+ "dtype='datetime64[ns, UTC]', freq=None)""")
+
+ with pd.option_context('display.width', 300):
+ for indx, expected in zip(idxs, exp):
+ result = getattr(indx, method)()
+ assert result == expected
+
+ def test_dti_representation_to_series(self):
+ idx1 = DatetimeIndex([], freq='D')
+ idx2 = DatetimeIndex(['2011-01-01'], freq='D')
+ idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D')
+ idx4 = DatetimeIndex(
+ ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D')
+ idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00',
+ '2011-01-01 11:00'], freq='H', tz='Asia/Tokyo')
+ idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT],
+ tz='US/Eastern')
+ idx7 = DatetimeIndex(['2011-01-01 09:00', '2011-01-02 10:15'])
+
+ exp1 = """Series([], dtype: datetime64[ns])"""
+
+ exp2 = ("0 2011-01-01\n"
+ "dtype: datetime64[ns]")
+
+ exp3 = ("0 2011-01-01\n"
+ "1 2011-01-02\n"
+ "dtype: datetime64[ns]")
+
+ exp4 = ("0 2011-01-01\n"
+ "1 2011-01-02\n"
+ "2 2011-01-03\n"
+ "dtype: datetime64[ns]")
+
+ exp5 = ("0 2011-01-01 09:00:00+09:00\n"
+ "1 2011-01-01 10:00:00+09:00\n"
+ "2 2011-01-01 11:00:00+09:00\n"
+ "dtype: datetime64[ns, Asia/Tokyo]")
+
+ exp6 = ("0 2011-01-01 09:00:00-05:00\n"
+ "1 2011-01-01 10:00:00-05:00\n"
+ "2 NaT\n"
+ "dtype: datetime64[ns, US/Eastern]")
+
+ exp7 = ("0 2011-01-01 09:00:00\n"
+ "1 2011-01-02 10:15:00\n"
+ "dtype: datetime64[ns]")
+
+ with pd.option_context('display.width', 300):
+ for idx, expected in zip([idx1, idx2, idx3, idx4,
+ idx5, idx6, idx7],
+ [exp1, exp2, exp3, exp4,
+ exp5, exp6, exp7]):
+ result = repr(Series(idx))
+ assert result == expected
+
+ def test_dti_summary(self):
+ # GH#9116
+ idx1 = DatetimeIndex([], freq='D')
+ idx2 = DatetimeIndex(['2011-01-01'], freq='D')
+ idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D')
+ idx4 = DatetimeIndex(
+ ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D')
+ idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00',
+ '2011-01-01 11:00'],
+ freq='H', tz='Asia/Tokyo')
+ idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT],
+ tz='US/Eastern')
+
+ exp1 = ("DatetimeIndex: 0 entries\n"
+ "Freq: D")
+
+ exp2 = ("DatetimeIndex: 1 entries, 2011-01-01 to 2011-01-01\n"
+ "Freq: D")
+
+ exp3 = ("DatetimeIndex: 2 entries, 2011-01-01 to 2011-01-02\n"
+ "Freq: D")
+
+ exp4 = ("DatetimeIndex: 3 entries, 2011-01-01 to 2011-01-03\n"
+ "Freq: D")
+
+ exp5 = ("DatetimeIndex: 3 entries, 2011-01-01 09:00:00+09:00 "
+ "to 2011-01-01 11:00:00+09:00\n"
+ "Freq: H")
+
+ exp6 = """DatetimeIndex: 3 entries, 2011-01-01 09:00:00-05:00 to NaT"""
+
+ for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6],
+ [exp1, exp2, exp3, exp4, exp5, exp6]):
+ result = idx._summary()
+ assert result == expected
+
+ def test_dti_business_repr(self):
+ # only really care that it works
+ repr(pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1)))
+
+ def test_dti_business_summary(self):
+ rng = pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1))
+ rng._summary()
+ rng[2:2]._summary()
+
+ def test_dti_business_summary_pytz(self):
+ pd.bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc)._summary()
+
+ def test_dti_business_summary_dateutil(self):
+ pd.bdate_range('1/1/2005', '1/1/2009',
+ tz=dateutil.tz.tzutc())._summary()
+
+ def test_dti_custom_business_repr(self):
+ # only really care that it works
+ repr(pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1),
+ freq='C'))
+
+ def test_dti_custom_business_summary(self):
+ rng = pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1),
+ freq='C')
+ rng._summary()
+ rng[2:2]._summary()
+
+ def test_dti_custom_business_summary_pytz(self):
+ pd.bdate_range('1/1/2005', '1/1/2009', freq='C',
+ tz=pytz.utc)._summary()
+
+ def test_dti_custom_business_summary_dateutil(self):
+ pd.bdate_range('1/1/2005', '1/1/2009', freq='C',
+ tz=dateutil.tz.tzutc())._summary()
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_indexing.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_indexing.py
new file mode 100644
index 00000000000..c3b00133228
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_indexing.py
@@ -0,0 +1,612 @@
+from datetime import datetime, time, timedelta
+
+import numpy as np
+import pytest
+import pytz
+
+import pandas.compat as compat
+
+import pandas as pd
+from pandas import DatetimeIndex, Index, Timestamp, date_range, notna
+import pandas.util.testing as tm
+
+from pandas.tseries.offsets import BDay, CDay
+
+START, END = datetime(2009, 1, 1), datetime(2010, 1, 1)
+
+
+class TestGetItem(object):
+ def test_ellipsis(self):
+ # GH#21282
+ idx = pd.date_range('2011-01-01', '2011-01-31', freq='D',
+ tz='Asia/Tokyo', name='idx')
+
+ result = idx[...]
+ assert result.equals(idx)
+ assert result is not idx
+
+ def test_getitem(self):
+ idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
+ idx2 = pd.date_range('2011-01-01', '2011-01-31', freq='D',
+ tz='Asia/Tokyo', name='idx')
+
+ for idx in [idx1, idx2]:
+ result = idx[0]
+ assert result == Timestamp('2011-01-01', tz=idx.tz)
+
+ result = idx[0:5]
+ expected = pd.date_range('2011-01-01', '2011-01-05', freq='D',
+ tz=idx.tz, name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx[0:10:2]
+ expected = pd.date_range('2011-01-01', '2011-01-09', freq='2D',
+ tz=idx.tz, name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx[-20:-5:3]
+ expected = pd.date_range('2011-01-12', '2011-01-24', freq='3D',
+ tz=idx.tz, name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx[4::-1]
+ expected = DatetimeIndex(['2011-01-05', '2011-01-04', '2011-01-03',
+ '2011-01-02', '2011-01-01'],
+ freq='-1D', tz=idx.tz, name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ def test_dti_business_getitem(self):
+ rng = pd.bdate_range(START, END)
+ smaller = rng[:5]
+ exp = DatetimeIndex(rng.view(np.ndarray)[:5])
+ tm.assert_index_equal(smaller, exp)
+
+ assert smaller.freq == rng.freq
+
+ sliced = rng[::5]
+ assert sliced.freq == BDay() * 5
+
+ fancy_indexed = rng[[4, 3, 2, 1, 0]]
+ assert len(fancy_indexed) == 5
+ assert isinstance(fancy_indexed, DatetimeIndex)
+ assert fancy_indexed.freq is None
+
+ # np.int_ is int32 on 32-bit (and Windows) builds and int64
+ # elsewhere; both should work as scalar indexers
+ assert rng[4] == rng[np.int_(4)]
+
+ def test_dti_business_getitem_matplotlib_hackaround(self):
+ rng = pd.bdate_range(START, END)
+ values = rng[:, None]
+ expected = rng.values[:, None]
+ tm.assert_numpy_array_equal(values, expected)
+
+ def test_dti_custom_getitem(self):
+ rng = pd.bdate_range(START, END, freq='C')
+ smaller = rng[:5]
+ exp = DatetimeIndex(rng.view(np.ndarray)[:5])
+ tm.assert_index_equal(smaller, exp)
+ assert smaller.freq == rng.freq
+
+ sliced = rng[::5]
+ assert sliced.freq == CDay() * 5
+
+ fancy_indexed = rng[[4, 3, 2, 1, 0]]
+ assert len(fancy_indexed) == 5
+ assert isinstance(fancy_indexed, DatetimeIndex)
+ assert fancy_indexed.freq is None
+
+ # np.int_ is int32 on 32-bit (and Windows) builds and int64
+ # elsewhere; both should work as scalar indexers
+ assert rng[4] == rng[np.int_(4)]
+
+ def test_dti_custom_getitem_matplotlib_hackaround(self):
+ rng = pd.bdate_range(START, END, freq='C')
+ values = rng[:, None]
+ expected = rng.values[:, None]
+ tm.assert_numpy_array_equal(values, expected)
+
+
+class TestWhere(object):
+ def test_where_other(self):
+ # other is ndarray or Index
+ i = pd.date_range('20130101', periods=3, tz='US/Eastern')
+
+ for arr in [np.nan, pd.NaT]:
+ result = i.where(notna(i), other=arr)
+ expected = i
+ tm.assert_index_equal(result, expected)
+
+ i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist())
+ result = i.where(notna(i2), i2)
+ tm.assert_index_equal(result, i2)
+
+ i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist())
+ result = i.where(notna(i2), i2.values)
+ tm.assert_index_equal(result, i2)
+
+ def test_where_tz(self):
+ i = pd.date_range('20130101', periods=3, tz='US/Eastern')
+ result = i.where(notna(i))
+ expected = i
+ tm.assert_index_equal(result, expected)
+
+ i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist())
+ result = i.where(notna(i2))
+ expected = i2
+ tm.assert_index_equal(result, expected)
+
+
+class TestTake(object):
+ def test_take(self):
+ # GH#10295
+ idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
+ idx2 = pd.date_range('2011-01-01', '2011-01-31', freq='D',
+ tz='Asia/Tokyo', name='idx')
+
+ for idx in [idx1, idx2]:
+ result = idx.take([0])
+ assert result == Timestamp('2011-01-01', tz=idx.tz)
+
+ result = idx.take([0, 1, 2])
+ expected = pd.date_range('2011-01-01', '2011-01-03', freq='D',
+ tz=idx.tz, name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx.take([0, 2, 4])
+ expected = pd.date_range('2011-01-01', '2011-01-05', freq='2D',
+ tz=idx.tz, name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx.take([7, 4, 1])
+ expected = pd.date_range('2011-01-08', '2011-01-02', freq='-3D',
+ tz=idx.tz, name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx.take([3, 2, 5])
+ expected = DatetimeIndex(['2011-01-04', '2011-01-03',
+ '2011-01-06'],
+ freq=None, tz=idx.tz, name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq is None
+
+ result = idx.take([-3, 2, 5])
+ expected = DatetimeIndex(['2011-01-29', '2011-01-03',
+ '2011-01-06'],
+ freq=None, tz=idx.tz, name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq is None
+
+ def test_take_invalid_kwargs(self):
+ idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
+ indices = [1, 6, 5, 9, 10, 13, 15, 3]
+
+ msg = r"take\(\) got an unexpected keyword argument 'foo'"
+ with pytest.raises(TypeError, match=msg):
+ idx.take(indices, foo=2)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, out=indices)
+
+ msg = "the 'mode' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, mode='clip')
+
+ # TODO: This method came from test_datetime; de-dup with version above
+ @pytest.mark.parametrize('tz', [None, 'US/Eastern', 'Asia/Tokyo'])
+ def test_take2(self, tz):
+ dates = [datetime(2010, 1, 1, 14), datetime(2010, 1, 1, 15),
+ datetime(2010, 1, 1, 17), datetime(2010, 1, 1, 21)]
+
+ idx = pd.date_range(start='2010-01-01 09:00',
+ end='2010-02-01 09:00', freq='H', tz=tz,
+ name='idx')
+ expected = DatetimeIndex(dates, freq=None, name='idx', tz=tz)
+
+ taken1 = idx.take([5, 6, 8, 12])
+ taken2 = idx[[5, 6, 8, 12]]
+
+ for taken in [taken1, taken2]:
+ tm.assert_index_equal(taken, expected)
+ assert isinstance(taken, DatetimeIndex)
+ assert taken.freq is None
+ assert taken.tz == expected.tz
+ assert taken.name == expected.name
+
+ def test_take_fill_value(self):
+ # GH#12631
+ idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
+ name='xxx')
+ result = idx.take(np.array([1, 0, -1]))
+ expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'],
+ name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ # fill_value
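+ # with fill_value=True, -1 is treated as a missing-value marker
+ # (filled with NaT) rather than as "last element"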
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'],
+ name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'],
+ name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ msg = ('When allow_fill=True and fill_value is not None, '
+ 'all indices must be >= -1')
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+ def test_take_fill_value_with_timezone(self):
+ idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
+ name='xxx', tz='US/Eastern')
+ result = idx.take(np.array([1, 0, -1]))
+ expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'],
+ name='xxx', tz='US/Eastern')
+ tm.assert_index_equal(result, expected)
+
+ # fill_value
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'],
+ name='xxx', tz='US/Eastern')
+ tm.assert_index_equal(result, expected)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'],
+ name='xxx', tz='US/Eastern')
+ tm.assert_index_equal(result, expected)
+
+ msg = ('When allow_fill=True and fill_value is not None, '
+ 'all indices must be >= -1')
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+
+class TestDatetimeIndex(object):
+ @pytest.mark.parametrize('null', [None, np.nan, pd.NaT])
+ @pytest.mark.parametrize('tz', [None, 'UTC', 'US/Eastern'])
+ def test_insert_nat(self, tz, null):
+ # GH#16537, GH#18295 (test missing)
+ idx = pd.DatetimeIndex(['2017-01-01'], tz=tz)
+ expected = pd.DatetimeIndex(['NaT', '2017-01-01'], tz=tz)
+ res = idx.insert(0, null)
+ tm.assert_index_equal(res, expected)
+
+ def test_insert(self):
+ idx = DatetimeIndex(
+ ['2000-01-04', '2000-01-01', '2000-01-02'], name='idx')
+
+ result = idx.insert(2, datetime(2000, 1, 5))
+ exp = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-05',
+ '2000-01-02'], name='idx')
+ tm.assert_index_equal(result, exp)
+
+ # insertion of non-datetime should coerce to object index
+ result = idx.insert(1, 'inserted')
+ expected = Index([datetime(2000, 1, 4), 'inserted',
+ datetime(2000, 1, 1),
+ datetime(2000, 1, 2)], name='idx')
+ assert not isinstance(result, DatetimeIndex)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+
+ idx = date_range('1/1/2000', periods=3, freq='M', name='idx')
+
+ # preserve freq
+ expected_0 = DatetimeIndex(['1999-12-31', '2000-01-31', '2000-02-29',
+ '2000-03-31'], name='idx', freq='M')
+ expected_3 = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31',
+ '2000-04-30'], name='idx', freq='M')
+
+ # reset freq to None
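+ # (an insert that breaks the regular monthly spacing cannot keep
+ # freq='M', so the result is expected to carry freq=None)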
+ expected_1_nofreq = DatetimeIndex(['2000-01-31', '2000-01-31',
+ '2000-02-29',
+ '2000-03-31'], name='idx',
+ freq=None)
+ expected_3_nofreq = DatetimeIndex(['2000-01-31', '2000-02-29',
+ '2000-03-31',
+ '2000-01-02'], name='idx',
+ freq=None)
+
+ cases = [(0, datetime(1999, 12, 31), expected_0),
+ (-3, datetime(1999, 12, 31), expected_0),
+ (3, datetime(2000, 4, 30), expected_3),
+ (1, datetime(2000, 1, 31), expected_1_nofreq),
+ (3, datetime(2000, 1, 2), expected_3_nofreq)]
+
+ for n, d, expected in cases:
+ result = idx.insert(n, d)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+
+ # reset freq to None
+ result = idx.insert(3, datetime(2000, 1, 2))
+ expected = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31',
+ '2000-01-02'], name='idx', freq=None)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq is None
+
+ # see gh-7299
+ idx = date_range('1/1/2000', periods=3, freq='D', tz='Asia/Tokyo',
+ name='idx')
+ with pytest.raises(ValueError):
+ idx.insert(3, pd.Timestamp('2000-01-04'))
+ with pytest.raises(ValueError):
+ idx.insert(3, datetime(2000, 1, 4))
+ with pytest.raises(ValueError):
+ idx.insert(3, pd.Timestamp('2000-01-04', tz='US/Eastern'))
+ with pytest.raises(ValueError):
+ idx.insert(3, datetime(2000, 1, 4,
+ tzinfo=pytz.timezone('US/Eastern')))
+
+ for tz in ['US/Pacific', 'Asia/Singapore']:
+ idx = date_range('1/1/2000 09:00', periods=6, freq='H', tz=tz,
+ name='idx')
+ # preserve freq
+ expected = date_range('1/1/2000 09:00', periods=7, freq='H', tz=tz,
+ name='idx')
+ for d in [pd.Timestamp('2000-01-01 15:00', tz=tz),
+ pytz.timezone(tz).localize(datetime(2000, 1, 1, 15))]:
+
+ result = idx.insert(6, d)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+ assert result.tz == expected.tz
+
+ expected = DatetimeIndex(['2000-01-01 09:00', '2000-01-01 10:00',
+ '2000-01-01 11:00',
+ '2000-01-01 12:00', '2000-01-01 13:00',
+ '2000-01-01 14:00',
+ '2000-01-01 10:00'], name='idx',
+ tz=tz, freq=None)
+ # reset freq to None
+ for d in [pd.Timestamp('2000-01-01 10:00', tz=tz),
+ pytz.timezone(tz).localize(datetime(2000, 1, 1, 10))]:
+ result = idx.insert(6, d)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.tz == expected.tz
+ assert result.freq is None
+
+ def test_delete(self):
+ idx = date_range(start='2000-01-01', periods=5, freq='M', name='idx')
+
+ # preserve freq
+ expected_0 = date_range(start='2000-02-01', periods=4, freq='M',
+ name='idx')
+ expected_4 = date_range(start='2000-01-01', periods=4, freq='M',
+ name='idx')
+
+ # reset freq to None
+ expected_1 = DatetimeIndex(['2000-01-31', '2000-03-31', '2000-04-30',
+ '2000-05-31'], freq=None, name='idx')
+
+ cases = {0: expected_0,
+ -5: expected_0,
+ -1: expected_4,
+ 4: expected_4,
+ 1: expected_1}
+ for n, expected in compat.iteritems(cases):
+ result = idx.delete(n)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+
+ with pytest.raises((IndexError, ValueError)):
+ # either depending on numpy version
+ idx.delete(5)
+
+ for tz in [None, 'Asia/Tokyo', 'US/Pacific']:
+ idx = date_range(start='2000-01-01 09:00', periods=10, freq='H',
+ name='idx', tz=tz)
+
+ expected = date_range(start='2000-01-01 10:00', periods=9,
+ freq='H', name='idx', tz=tz)
+ result = idx.delete(0)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freqstr == 'H'
+ assert result.tz == expected.tz
+
+ expected = date_range(start='2000-01-01 09:00', periods=9,
+ freq='H', name='idx', tz=tz)
+ result = idx.delete(-1)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freqstr == 'H'
+ assert result.tz == expected.tz
+
+ def test_delete_slice(self):
+ idx = date_range(start='2000-01-01', periods=10, freq='D', name='idx')
+
+ # preserve freq
+ expected_0_2 = date_range(start='2000-01-04', periods=7, freq='D',
+ name='idx')
+ expected_7_9 = date_range(start='2000-01-01', periods=7, freq='D',
+ name='idx')
+
+ # reset freq to None
+ expected_3_5 = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03',
+ '2000-01-07', '2000-01-08', '2000-01-09',
+ '2000-01-10'], freq=None, name='idx')
+
+ cases = {(0, 1, 2): expected_0_2,
+ (7, 8, 9): expected_7_9,
+ (3, 4, 5): expected_3_5}
+ for n, expected in compat.iteritems(cases):
+ result = idx.delete(n)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+
+ result = idx.delete(slice(n[0], n[-1] + 1))
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+
+ for tz in [None, 'Asia/Tokyo', 'US/Pacific']:
+ ts = pd.Series(1, index=pd.date_range(
+ '2000-01-01 09:00', periods=10, freq='H', name='idx', tz=tz))
+ # preserve freq
+ result = ts.drop(ts.index[:5]).index
+ expected = pd.date_range('2000-01-01 14:00', periods=5, freq='H',
+ name='idx', tz=tz)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+ assert result.tz == expected.tz
+
+ # reset freq to None
+ result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index
+ expected = DatetimeIndex(['2000-01-01 09:00', '2000-01-01 11:00',
+ '2000-01-01 13:00',
+ '2000-01-01 15:00', '2000-01-01 17:00'],
+ freq=None, name='idx', tz=tz)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+ assert result.tz == expected.tz
+
+ def test_get_loc(self):
+ idx = pd.date_range('2000-01-01', periods=3)
+
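+ # get_loc should accept Timestamp, datetime and string keys alike,
+ # under every fill method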
+ for method in [None, 'pad', 'backfill', 'nearest']:
+ assert idx.get_loc(idx[1], method) == 1
+ assert idx.get_loc(idx[1].to_pydatetime(), method) == 1
+ assert idx.get_loc(str(idx[1]), method) == 1
+
+ if method is not None:
+ assert idx.get_loc(idx[1], method,
+ tolerance=pd.Timedelta('0 days')) == 1
+
+ assert idx.get_loc('2000-01-01', method='nearest') == 0
+ assert idx.get_loc('2000-01-01T12', method='nearest') == 1
+
+ assert idx.get_loc('2000-01-01T12', method='nearest',
+ tolerance='1 day') == 1
+ assert idx.get_loc('2000-01-01T12', method='nearest',
+ tolerance=pd.Timedelta('1D')) == 1
+ assert idx.get_loc('2000-01-01T12', method='nearest',
+ tolerance=np.timedelta64(1, 'D')) == 1
+ assert idx.get_loc('2000-01-01T12', method='nearest',
+ tolerance=timedelta(1)) == 1
+ with pytest.raises(ValueError, match='unit abbreviation w/o a number'):
+ idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo')
+ with pytest.raises(KeyError):
+ idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours')
+ with pytest.raises(
+ ValueError,
+ match='tolerance size must match target index size'):
+ idx.get_loc('2000-01-01', method='nearest',
+ tolerance=[pd.Timedelta('1day').to_timedelta64(),
+ pd.Timedelta('1day').to_timedelta64()])
+
+ assert idx.get_loc('2000', method='nearest') == slice(0, 3)
+ assert idx.get_loc('2000-01', method='nearest') == slice(0, 3)
+
+ assert idx.get_loc('1999', method='nearest') == 0
+ assert idx.get_loc('2001', method='nearest') == 2
+
+ with pytest.raises(KeyError):
+ idx.get_loc('1999', method='pad')
+ with pytest.raises(KeyError):
+ idx.get_loc('2001', method='backfill')
+
+ with pytest.raises(KeyError):
+ idx.get_loc('foobar')
+ with pytest.raises(TypeError):
+ idx.get_loc(slice(2))
+
+ idx = pd.to_datetime(['2000-01-01', '2000-01-04'])
+ assert idx.get_loc('2000-01-02', method='nearest') == 0
+ assert idx.get_loc('2000-01-03', method='nearest') == 1
+ assert idx.get_loc('2000-01', method='nearest') == slice(0, 2)
+
+ # time indexing
+ idx = pd.date_range('2000-01-01', periods=24, freq='H')
+ tm.assert_numpy_array_equal(idx.get_loc(time(12)),
+ np.array([12]), check_dtype=False)
+ tm.assert_numpy_array_equal(idx.get_loc(time(12, 30)),
+ np.array([]), check_dtype=False)
+ with pytest.raises(NotImplementedError):
+ idx.get_loc(time(12, 30), method='pad')
+
+ def test_get_indexer(self):
+ idx = pd.date_range('2000-01-01', periods=3)
+ exp = np.array([0, 1, 2], dtype=np.intp)
+ tm.assert_numpy_array_equal(idx.get_indexer(idx), exp)
+
+ target = idx[0] + pd.to_timedelta(['-1 hour', '12 hours',
+ '1 day 1 hour'])
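+ # pad takes the previous label, backfill the next, nearest whichever
+ # is closer; -1 marks targets with no match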
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'),
+ np.array([-1, 0, 1], dtype=np.intp))
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'),
+ np.array([0, 1, 2], dtype=np.intp))
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'),
+ np.array([0, 1, 1], dtype=np.intp))
+ tm.assert_numpy_array_equal(
+ idx.get_indexer(target, 'nearest',
+ tolerance=pd.Timedelta('1 hour')),
+ np.array([0, -1, 1], dtype=np.intp))
+ tol_raw = [pd.Timedelta('1 hour'),
+ pd.Timedelta('1 hour'),
+ pd.Timedelta('1 hour').to_timedelta64(), ]
+ tm.assert_numpy_array_equal(
+ idx.get_indexer(target, 'nearest',
+ tolerance=[np.timedelta64(x) for x in tol_raw]),
+ np.array([0, -1, 1], dtype=np.intp))
+ tol_bad = [pd.Timedelta('2 hour').to_timedelta64(),
+ pd.Timedelta('1 hour').to_timedelta64(),
+ 'foo', ]
+ with pytest.raises(
+ ValueError, match='abbreviation w/o a number'):
+ idx.get_indexer(target, 'nearest', tolerance=tol_bad)
+ with pytest.raises(ValueError):
+ idx.get_indexer(idx[[0]], method='nearest', tolerance='foo')
+
+ def test_reasonable_key_error(self):
+ # GH#1062
+ index = DatetimeIndex(['1/3/2000'])
+ with pytest.raises(KeyError, match='2000'):
+ index.get_loc('1/1/2000')
+
+ @pytest.mark.parametrize('key', [pd.Timedelta(0),
+ pd.Timedelta(1),
+ timedelta(0)])
+ def test_timedelta_invalid_key(self, key):
+ # GH#20464
+ dti = pd.date_range('1970-01-01', periods=10)
+ with pytest.raises(TypeError):
+ dti.get_loc(key)
+
+ def test_get_loc_nat(self):
+ # GH#20464
+ index = DatetimeIndex(['1/3/2000', 'NaT'])
+ assert index.get_loc(pd.NaT) == 1
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_misc.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_misc.py
new file mode 100644
index 00000000000..cec181161fc
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_misc.py
@@ -0,0 +1,312 @@
+import calendar
+import locale
+import unicodedata
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+ DatetimeIndex, Index, Timestamp, compat, date_range, datetime, offsets)
+import pandas.util.testing as tm
+
+
+class TestTimeSeries(object):
+
+ def test_pass_datetimeindex_to_index(self):
+ # GH#1396
+ rng = date_range('1/1/2000', '3/1/2000')
+ idx = Index(rng, dtype=object)
+
+ expected = Index(rng.to_pydatetime(), dtype=object)
+
+ tm.assert_numpy_array_equal(idx.values, expected.values)
+
+ def test_range_edges(self):
+ # GH#13672
+ idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.000000001'),
+ end=Timestamp('1970-01-01 00:00:00.000000004'),
+ freq='N')
+ exp = DatetimeIndex(['1970-01-01 00:00:00.000000001',
+ '1970-01-01 00:00:00.000000002',
+ '1970-01-01 00:00:00.000000003',
+ '1970-01-01 00:00:00.000000004'])
+ tm.assert_index_equal(idx, exp)
+
+ idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.000000004'),
+ end=Timestamp('1970-01-01 00:00:00.000000001'),
+ freq='N')
+ exp = DatetimeIndex([])
+ tm.assert_index_equal(idx, exp)
+
+ idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.000000001'),
+ end=Timestamp('1970-01-01 00:00:00.000000001'),
+ freq='N')
+ exp = DatetimeIndex(['1970-01-01 00:00:00.000000001'])
+ tm.assert_index_equal(idx, exp)
+
+ idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.000001'),
+ end=Timestamp('1970-01-01 00:00:00.000004'),
+ freq='U')
+ exp = DatetimeIndex(['1970-01-01 00:00:00.000001',
+ '1970-01-01 00:00:00.000002',
+ '1970-01-01 00:00:00.000003',
+ '1970-01-01 00:00:00.000004'])
+ tm.assert_index_equal(idx, exp)
+
+ idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.001'),
+ end=Timestamp('1970-01-01 00:00:00.004'),
+ freq='L')
+ exp = DatetimeIndex(['1970-01-01 00:00:00.001',
+ '1970-01-01 00:00:00.002',
+ '1970-01-01 00:00:00.003',
+ '1970-01-01 00:00:00.004'])
+ tm.assert_index_equal(idx, exp)
+
+ idx = pd.date_range(start=Timestamp('1970-01-01 00:00:01'),
+ end=Timestamp('1970-01-01 00:00:04'), freq='S')
+ exp = DatetimeIndex(['1970-01-01 00:00:01', '1970-01-01 00:00:02',
+ '1970-01-01 00:00:03', '1970-01-01 00:00:04'])
+ tm.assert_index_equal(idx, exp)
+
+ idx = pd.date_range(start=Timestamp('1970-01-01 00:01'),
+ end=Timestamp('1970-01-01 00:04'), freq='T')
+ exp = DatetimeIndex(['1970-01-01 00:01', '1970-01-01 00:02',
+ '1970-01-01 00:03', '1970-01-01 00:04'])
+ tm.assert_index_equal(idx, exp)
+
+ idx = pd.date_range(start=Timestamp('1970-01-01 01:00'),
+ end=Timestamp('1970-01-01 04:00'), freq='H')
+ exp = DatetimeIndex(['1970-01-01 01:00', '1970-01-01 02:00',
+ '1970-01-01 03:00', '1970-01-01 04:00'])
+ tm.assert_index_equal(idx, exp)
+
+ idx = pd.date_range(start=Timestamp('1970-01-01'),
+ end=Timestamp('1970-01-04'), freq='D')
+ exp = DatetimeIndex(['1970-01-01', '1970-01-02',
+ '1970-01-03', '1970-01-04'])
+ tm.assert_index_equal(idx, exp)
+
+
+class TestDatetime64(object):
+
+ def test_datetimeindex_accessors(self):
+ dti_naive = pd.date_range(freq='D', start=datetime(1998, 1, 1),
+ periods=365)
+ # GH#13303
+ dti_tz = pd.date_range(freq='D', start=datetime(1998, 1, 1),
+ periods=365, tz='US/Eastern')
+ for dti in [dti_naive, dti_tz]:
+
+ assert dti.year[0] == 1998
+ assert dti.month[0] == 1
+ assert dti.day[0] == 1
+ assert dti.hour[0] == 0
+ assert dti.minute[0] == 0
+ assert dti.second[0] == 0
+ assert dti.microsecond[0] == 0
+ assert dti.dayofweek[0] == 3
+
+ assert dti.dayofyear[0] == 1
+ assert dti.dayofyear[120] == 121
+
+ assert dti.weekofyear[0] == 1
+ assert dti.weekofyear[120] == 18
+
+ assert dti.quarter[0] == 1
+ assert dti.quarter[120] == 2
+
+ assert dti.days_in_month[0] == 31
+ assert dti.days_in_month[90] == 30
+
+ assert dti.is_month_start[0]
+ assert not dti.is_month_start[1]
+ assert dti.is_month_start[31]
+ assert dti.is_quarter_start[0]
+ assert dti.is_quarter_start[90]
+ assert dti.is_year_start[0]
+ assert not dti.is_year_start[364]
+ assert not dti.is_month_end[0]
+ assert dti.is_month_end[30]
+ assert not dti.is_month_end[31]
+ assert dti.is_month_end[364]
+ assert not dti.is_quarter_end[0]
+ assert not dti.is_quarter_end[30]
+ assert dti.is_quarter_end[89]
+ assert dti.is_quarter_end[364]
+ assert not dti.is_year_end[0]
+ assert dti.is_year_end[364]
+
+ assert len(dti.year) == 365
+ assert len(dti.month) == 365
+ assert len(dti.day) == 365
+ assert len(dti.hour) == 365
+ assert len(dti.minute) == 365
+ assert len(dti.second) == 365
+ assert len(dti.microsecond) == 365
+ assert len(dti.dayofweek) == 365
+ assert len(dti.dayofyear) == 365
+ assert len(dti.weekofyear) == 365
+ assert len(dti.quarter) == 365
+ assert len(dti.is_month_start) == 365
+ assert len(dti.is_month_end) == 365
+ assert len(dti.is_quarter_start) == 365
+ assert len(dti.is_quarter_end) == 365
+ assert len(dti.is_year_start) == 365
+ assert len(dti.is_year_end) == 365
+ assert len(dti.weekday_name) == 365
+
+ dti.name = 'name'
+
+ # non-boolean accessors -> return Index
+ for accessor in DatetimeIndex._field_ops:
+ res = getattr(dti, accessor)
+ assert len(res) == 365
+ assert isinstance(res, Index)
+ assert res.name == 'name'
+
+ # boolean accessors -> return array
+ for accessor in DatetimeIndex._bool_ops:
+ res = getattr(dti, accessor)
+ assert len(res) == 365
+ assert isinstance(res, np.ndarray)
+
+ # test boolean indexing
+ res = dti[dti.is_quarter_start]
+ exp = dti[[0, 90, 181, 273]]
+ tm.assert_index_equal(res, exp)
+ res = dti[dti.is_leap_year]
+ exp = DatetimeIndex([], freq='D', tz=dti.tz, name='name')
+ tm.assert_index_equal(res, exp)
+
+ dti = pd.date_range(freq='BQ-FEB', start=datetime(1998, 1, 1),
+ periods=4)
+
+ assert sum(dti.is_quarter_start) == 0
+ assert sum(dti.is_quarter_end) == 4
+ assert sum(dti.is_year_start) == 0
+ assert sum(dti.is_year_end) == 1
+
+ # Ensure is_start/end accessors raise ValueError for CustomBusinessDay
+ bday_egypt = offsets.CustomBusinessDay(weekmask='Sun Mon Tue Wed Thu')
+ dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt)
+ pytest.raises(ValueError, lambda: dti.is_month_start)
+
+ dti = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'])
+
+ assert dti.is_month_start[0] == 1
+
+ tests = [
+ (Timestamp('2013-06-01', freq='M').is_month_start, 1),
+ (Timestamp('2013-06-01', freq='BM').is_month_start, 0),
+ (Timestamp('2013-06-03', freq='M').is_month_start, 0),
+ (Timestamp('2013-06-03', freq='BM').is_month_start, 1),
+ (Timestamp('2013-02-28', freq='Q-FEB').is_month_end, 1),
+ (Timestamp('2013-02-28', freq='Q-FEB').is_quarter_end, 1),
+ (Timestamp('2013-02-28', freq='Q-FEB').is_year_end, 1),
+ (Timestamp('2013-03-01', freq='Q-FEB').is_month_start, 1),
+ (Timestamp('2013-03-01', freq='Q-FEB').is_quarter_start, 1),
+ (Timestamp('2013-03-01', freq='Q-FEB').is_year_start, 1),
+ (Timestamp('2013-03-31', freq='QS-FEB').is_month_end, 1),
+ (Timestamp('2013-03-31', freq='QS-FEB').is_quarter_end, 0),
+ (Timestamp('2013-03-31', freq='QS-FEB').is_year_end, 0),
+ (Timestamp('2013-02-01', freq='QS-FEB').is_month_start, 1),
+ (Timestamp('2013-02-01', freq='QS-FEB').is_quarter_start, 1),
+ (Timestamp('2013-02-01', freq='QS-FEB').is_year_start, 1),
+ (Timestamp('2013-06-30', freq='BQ').is_month_end, 0),
+ (Timestamp('2013-06-30', freq='BQ').is_quarter_end, 0),
+ (Timestamp('2013-06-30', freq='BQ').is_year_end, 0),
+ (Timestamp('2013-06-28', freq='BQ').is_month_end, 1),
+ (Timestamp('2013-06-28', freq='BQ').is_quarter_end, 1),
+ (Timestamp('2013-06-28', freq='BQ').is_year_end, 0),
+ (Timestamp('2013-06-30', freq='BQS-APR').is_month_end, 0),
+ (Timestamp('2013-06-30', freq='BQS-APR').is_quarter_end, 0),
+ (Timestamp('2013-06-30', freq='BQS-APR').is_year_end, 0),
+ (Timestamp('2013-06-28', freq='BQS-APR').is_month_end, 1),
+ (Timestamp('2013-06-28', freq='BQS-APR').is_quarter_end, 1),
+ (Timestamp('2013-03-29', freq='BQS-APR').is_year_end, 1),
+ (Timestamp('2013-11-01', freq='AS-NOV').is_year_start, 1),
+ (Timestamp('2013-10-31', freq='AS-NOV').is_year_end, 1),
+ (Timestamp('2012-02-01').days_in_month, 29),
+ (Timestamp('2013-02-01').days_in_month, 28)]
+
+ for ts, value in tests:
+ assert ts == value
+
+ # GH 6538: Check that DatetimeIndex and its Timestamp elements
+ # return the same weekofyear accessor close to new year w/ tz
+ dates = ["2013/12/29", "2013/12/30", "2013/12/31"]
+ dates = DatetimeIndex(dates, tz="Europe/Brussels")
+ expected = [52, 1, 1]
+ assert dates.weekofyear.tolist() == expected
+ assert [d.weekofyear for d in dates] == expected
+
+ # GH 12806
+ @pytest.mark.parametrize('time_locale', [
+ None] if tm.get_locales() is None else [None] + tm.get_locales())
+ def test_datetime_name_accessors(self, time_locale):
+ # Test Monday -> Sunday and January -> December, in that sequence
+ if time_locale is None:
+ # If time_locale is None, day_name and month_name should
+ # return the English names
+ expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
+ 'Friday', 'Saturday', 'Sunday']
+ expected_months = ['January', 'February', 'March', 'April', 'May',
+ 'June', 'July', 'August', 'September',
+ 'October', 'November', 'December']
+ else:
+ with tm.set_locale(time_locale, locale.LC_TIME):
+ expected_days = calendar.day_name[:]
+ expected_months = calendar.month_name[1:]
+
+ # GH#11128
+ dti = pd.date_range(freq='D', start=datetime(1998, 1, 1),
+ periods=365)
+ english_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
+ 'Friday', 'Saturday', 'Sunday']
+ for day, name, eng_name in zip(range(4, 11),
+ expected_days,
+ english_days):
+ name = name.capitalize()
+ assert dti.weekday_name[day] == eng_name
+ assert dti.day_name(locale=time_locale)[day] == name
+ ts = Timestamp(datetime(2016, 4, day))
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ assert ts.weekday_name == eng_name
+ assert ts.day_name(locale=time_locale) == name
+ dti = dti.append(DatetimeIndex([pd.NaT]))
+ assert np.isnan(dti.day_name(locale=time_locale)[-1])
+ ts = Timestamp(pd.NaT)
+ assert np.isnan(ts.day_name(locale=time_locale))
+
+ # GH#12805
+ dti = pd.date_range(freq='M', start='2012', end='2013')
+ result = dti.month_name(locale=time_locale)
+ expected = Index([month.capitalize() for month in expected_months])
+
+ # work around different normalization schemes
+ # https://github.com/pandas-dev/pandas/issues/22342
+ if not compat.PY2:
+ result = result.str.normalize("NFD")
+ expected = expected.str.normalize("NFD")
+
+ tm.assert_index_equal(result, expected)
+
+ for date, expected in zip(dti, expected_months):
+ result = date.month_name(locale=time_locale)
+ expected = expected.capitalize()
+
+ if not compat.PY2:
+ result = unicodedata.normalize("NFD", result)
+ expected = unicodedata.normalize("NFD", result)
+
+ assert result == expected
+ dti = dti.append(DatetimeIndex([pd.NaT]))
+ assert np.isnan(dti.month_name(locale=time_locale)[-1])
+
+ def test_nanosecond_field(self):
+ dti = DatetimeIndex(np.arange(10))
+
+ tm.assert_index_equal(dti.nanosecond,
+ pd.Index(np.arange(10, dtype=np.int64)))
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_missing.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_missing.py
new file mode 100644
index 00000000000..c8d47caa7e9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_missing.py
@@ -0,0 +1,52 @@
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+class TestDatetimeIndex(object):
+
+ @pytest.mark.parametrize('tz', ['US/Eastern', 'Asia/Tokyo'])
+ def test_fillna_datetime64(self, tz):
+ # GH 11343
+ idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT,
+ '2011-01-01 11:00'])
+
+ exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00',
+ '2011-01-01 11:00'])
+ tm.assert_index_equal(
+ idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp)
+
+ # tz mismatch
+ exp = pd.Index([pd.Timestamp('2011-01-01 09:00'),
+ pd.Timestamp('2011-01-01 10:00', tz=tz),
+ pd.Timestamp('2011-01-01 11:00')], dtype=object)
+ tm.assert_index_equal(
+ idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp)
+
+ # object
+ exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), 'x',
+ pd.Timestamp('2011-01-01 11:00')], dtype=object)
+ tm.assert_index_equal(idx.fillna('x'), exp)
+
+ idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT,
+ '2011-01-01 11:00'], tz=tz)
+
+ exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00',
+ '2011-01-01 11:00'], tz=tz)
+ tm.assert_index_equal(
+ idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp)
+
+ exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz),
+ pd.Timestamp('2011-01-01 10:00'),
+ pd.Timestamp('2011-01-01 11:00', tz=tz)],
+ dtype=object)
+ tm.assert_index_equal(
+ idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp)
+
+ # object
+ exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz),
+ 'x',
+ pd.Timestamp('2011-01-01 11:00', tz=tz)],
+ dtype=object)
+ tm.assert_index_equal(idx.fillna('x'), exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_ops.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_ops.py
new file mode 100644
index 00000000000..2a546af7993
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_ops.py
@@ -0,0 +1,498 @@
+from datetime import datetime
+import warnings
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.generic import ABCDateOffset
+
+import pandas as pd
+from pandas import (
+ DatetimeIndex, Index, PeriodIndex, Series, Timestamp, bdate_range,
+ date_range)
+from pandas.tests.test_base import Ops
+import pandas.util.testing as tm
+
+from pandas.tseries.offsets import BDay, BMonthEnd, CDay, Day, Hour
+
+START, END = datetime(2009, 1, 1), datetime(2010, 1, 1)
+
+
+class TestDatetimeIndexOps(Ops):
+
+ def setup_method(self, method):
+ super(TestDatetimeIndexOps, self).setup_method(method)
+ mask = lambda x: (isinstance(x, DatetimeIndex) or
+ isinstance(x, PeriodIndex))
+ self.is_valid_objs = [o for o in self.objs if mask(o)]
+ self.not_valid_objs = [o for o in self.objs if not mask(o)]
+
+ def test_ops_properties(self):
+ f = lambda x: isinstance(x, DatetimeIndex)
+ self.check_ops_properties(DatetimeIndex._field_ops, f)
+ self.check_ops_properties(DatetimeIndex._object_ops, f)
+ self.check_ops_properties(DatetimeIndex._bool_ops, f)
+
+ def test_ops_properties_basic(self):
+
+ # sanity check that the behavior didn't change
+ # GH#7206
+ for op in ['year', 'day', 'second', 'weekday']:
+ pytest.raises(TypeError, lambda x: getattr(self.dt_series, op))
+
+ # attribute access should still work!
+ s = Series(dict(year=2000, month=1, day=10))
+ assert s.year == 2000
+ assert s.month == 1
+ assert s.day == 10
+ pytest.raises(AttributeError, lambda: s.weekday)
+
+ def test_repeat_range(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ rng = date_range('1/1/2000', '1/1/2001')
+
+ result = rng.repeat(5)
+ assert result.freq is None
+ assert len(result) == 5 * len(rng)
+
+ index = pd.date_range('2001-01-01', periods=2, freq='D', tz=tz)
+ exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01',
+ '2001-01-02', '2001-01-02'], tz=tz)
+ for res in [index.repeat(2), np.repeat(index, 2)]:
+ tm.assert_index_equal(res, exp)
+ assert res.freq is None
+
+ index = pd.date_range('2001-01-01', periods=2, freq='2D', tz=tz)
+ exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01',
+ '2001-01-03', '2001-01-03'], tz=tz)
+ for res in [index.repeat(2), np.repeat(index, 2)]:
+ tm.assert_index_equal(res, exp)
+ assert res.freq is None
+
+ index = pd.DatetimeIndex(['2001-01-01', 'NaT', '2003-01-01'],
+ tz=tz)
+ exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01',
+ 'NaT', 'NaT', 'NaT',
+ '2003-01-01', '2003-01-01', '2003-01-01'],
+ tz=tz)
+ for res in [index.repeat(3), np.repeat(index, 3)]:
+ tm.assert_index_equal(res, exp)
+ assert res.freq is None
+
+ def test_repeat(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ reps = 2
+ msg = "the 'axis' parameter is not supported"
+
+ rng = pd.date_range(start='2016-01-01', periods=2,
+ freq='30Min', tz=tz)
+
+ expected_rng = DatetimeIndex([
+ Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'),
+ Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'),
+ Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'),
+ Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'),
+ ])
+
+ res = rng.repeat(reps)
+ tm.assert_index_equal(res, expected_rng)
+ assert res.freq is None
+
+ tm.assert_index_equal(np.repeat(rng, reps), expected_rng)
+ with pytest.raises(ValueError, match=msg):
+ np.repeat(rng, reps, axis=1)
+
+ def test_resolution(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T',
+ 'S', 'L', 'U'],
+ ['day', 'day', 'day', 'day', 'hour',
+ 'minute', 'second', 'millisecond',
+ 'microsecond']):
+ idx = pd.date_range(start='2013-04-01', periods=30, freq=freq,
+ tz=tz)
+ assert idx.resolution == expected
+
+ def test_value_counts_unique(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ # GH 7735
+ idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10)
+ # create repeated values: the n-th element is repeated n + 1 times
+ idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)),
+ tz=tz)
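+ # e.g. 09:00 appears once, 10:00 twice, ..., 18:00 ten times, so
+ # value_counts lists 18:00 (count 10) first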
+
+ exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10,
+ tz=tz)
+ expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64')
+
+ for obj in [idx, Series(idx)]:
+ tm.assert_series_equal(obj.value_counts(), expected)
+
+ expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10,
+ tz=tz)
+ tm.assert_index_equal(idx.unique(), expected)
+
+ idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 09:00',
+ '2013-01-01 09:00', '2013-01-01 08:00',
+ '2013-01-01 08:00', pd.NaT], tz=tz)
+
+ exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'],
+ tz=tz)
+ expected = Series([3, 2], index=exp_idx)
+
+ for obj in [idx, Series(idx)]:
+ tm.assert_series_equal(obj.value_counts(), expected)
+
+ exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00',
+ pd.NaT], tz=tz)
+ expected = Series([3, 2, 1], index=exp_idx)
+
+ for obj in [idx, Series(idx)]:
+ tm.assert_series_equal(obj.value_counts(dropna=False),
+ expected)
+
+ tm.assert_index_equal(idx.unique(), exp_idx)
+
+ def test_nonunique_contains(self):
+ # GH 9512
+ for idx in map(DatetimeIndex,
+ ([0, 1, 0], [0, 0, -1], [0, -1, -1],
+ ['2015', '2015', '2016'], ['2015', '2015', '2014'])):
+ assert idx[0] in idx
+
+ @pytest.mark.parametrize('idx',
+ [
+ DatetimeIndex(
+ ['2011-01-01',
+ '2011-01-02',
+ '2011-01-03'],
+ freq='D', name='idx'),
+ DatetimeIndex(
+ ['2011-01-01 09:00',
+ '2011-01-01 10:00',
+ '2011-01-01 11:00'],
+ freq='H', name='tzidx', tz='Asia/Tokyo')
+ ])
+ def test_order_with_freq(self, idx):
+ ordered = idx.sort_values()
+ tm.assert_index_equal(ordered, idx)
+ assert ordered.freq == idx.freq
+
+ ordered = idx.sort_values(ascending=False)
+ expected = idx[::-1]
+ tm.assert_index_equal(ordered, expected)
+ assert ordered.freq == expected.freq
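+ # a descending sort flips the sign of the frequency step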
+ assert ordered.freq.n == -1
+
+ ordered, indexer = idx.sort_values(return_indexer=True)
+ tm.assert_index_equal(ordered, idx)
+ tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]),
+ check_dtype=False)
+ assert ordered.freq == idx.freq
+
+ ordered, indexer = idx.sort_values(return_indexer=True,
+ ascending=False)
+ expected = idx[::-1]
+ tm.assert_index_equal(ordered, expected)
+ tm.assert_numpy_array_equal(indexer,
+ np.array([2, 1, 0]),
+ check_dtype=False)
+ assert ordered.freq == expected.freq
+ assert ordered.freq.n == -1
+
+ @pytest.mark.parametrize('index_dates,expected_dates', [
+ (['2011-01-01', '2011-01-03', '2011-01-05',
+ '2011-01-02', '2011-01-01'],
+ ['2011-01-01', '2011-01-01', '2011-01-02',
+ '2011-01-03', '2011-01-05']),
+ ([pd.NaT, '2011-01-03', '2011-01-05',
+ '2011-01-02', pd.NaT],
+ [pd.NaT, pd.NaT, '2011-01-02', '2011-01-03',
+ '2011-01-05'])
+ ])
+ def test_order_without_freq(self, index_dates, expected_dates,
+ tz_naive_fixture):
+ tz = tz_naive_fixture
+
+ # without freq
+ index = DatetimeIndex(index_dates, tz=tz, name='idx')
+ expected = DatetimeIndex(expected_dates, tz=tz, name='idx')
+
+ ordered = index.sort_values()
+ tm.assert_index_equal(ordered, expected)
+ assert ordered.freq is None
+
+ ordered = index.sort_values(ascending=False)
+ tm.assert_index_equal(ordered, expected[::-1])
+ assert ordered.freq is None
+
+ ordered, indexer = index.sort_values(return_indexer=True)
+ tm.assert_index_equal(ordered, expected)
+
+ exp = np.array([0, 4, 3, 1, 2])
+ tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
+ assert ordered.freq is None
+
+ ordered, indexer = index.sort_values(return_indexer=True,
+ ascending=False)
+ tm.assert_index_equal(ordered, expected[::-1])
+
+ exp = np.array([2, 1, 3, 4, 0])
+ tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
+ assert ordered.freq is None
+
+ def test_drop_duplicates_metadata(self):
+ # GH 10115
+ idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
+ result = idx.drop_duplicates()
+ tm.assert_index_equal(idx, result)
+ assert idx.freq == result.freq
+
+ idx_dup = idx.append(idx)
+ assert idx_dup.freq is None # freq is reset
+ result = idx_dup.drop_duplicates()
+ tm.assert_index_equal(idx, result)
+ assert result.freq is None
+
+ def test_drop_duplicates(self):
+ # to check Index/Series compat
+ base = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
+ idx = base.append(base[:5])
+
+ res = idx.drop_duplicates()
+ tm.assert_index_equal(res, base)
+ res = Series(idx).drop_duplicates()
+ tm.assert_series_equal(res, Series(base))
+
+ res = idx.drop_duplicates(keep='last')
+ exp = base[5:].append(base[:5])
+ tm.assert_index_equal(res, exp)
+ res = Series(idx).drop_duplicates(keep='last')
+ tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
+
+ res = idx.drop_duplicates(keep=False)
+ tm.assert_index_equal(res, base[5:])
+ res = Series(idx).drop_duplicates(keep=False)
+ tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
+
+ @pytest.mark.parametrize('freq', [
+ 'A', '2A', '-2A', 'Q', '-1Q', 'M', '-1M', 'D', '3D',
+ '-3D', 'W', '-1W', 'H', '2H', '-2H', 'T', '2T', 'S',
+ '-3S'])
+ def test_infer_freq(self, freq):
+ # GH 11018
+ idx = pd.date_range('2011-01-01 09:00:00', freq=freq, periods=10)
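+ # round-tripping through int64 nanos with freq='infer' should
+ # recover both the values and the original frequency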
+ result = pd.DatetimeIndex(idx.asi8, freq='infer')
+ tm.assert_index_equal(idx, result)
+ assert result.freq == freq
+
+ def test_nat(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ assert pd.DatetimeIndex._na_value is pd.NaT
+ assert pd.DatetimeIndex([])._na_value is pd.NaT
+
+ idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz)
+ assert idx._can_hold_na
+
+ tm.assert_numpy_array_equal(idx._isnan, np.array([False, False]))
+ assert idx.hasnans is False
+ tm.assert_numpy_array_equal(idx._nan_idxs,
+ np.array([], dtype=np.intp))
+
+ idx = pd.DatetimeIndex(['2011-01-01', 'NaT'], tz=tz)
+ assert idx._can_hold_na
+
+ tm.assert_numpy_array_equal(idx._isnan, np.array([False, True]))
+ assert idx.hasnans is True
+ tm.assert_numpy_array_equal(idx._nan_idxs,
+ np.array([1], dtype=np.intp))
+
+ def test_equals(self):
+ # GH 13107
+ idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT'])
+ assert idx.equals(idx)
+ assert idx.equals(idx.copy())
+ assert idx.equals(idx.astype(object))
+ assert idx.astype(object).equals(idx)
+ assert idx.astype(object).equals(idx.astype(object))
+ assert not idx.equals(list(idx))
+ assert not idx.equals(pd.Series(idx))
+
+ idx2 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT'],
+ tz='US/Pacific')
+ assert not idx.equals(idx2)
+ assert not idx.equals(idx2.copy())
+ assert not idx.equals(idx2.astype(object))
+ assert not idx.astype(object).equals(idx2)
+ assert not idx.equals(list(idx2))
+ assert not idx.equals(pd.Series(idx2))
+
+ # same internal, different tz
+ idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz='US/Pacific')
+ tm.assert_numpy_array_equal(idx.asi8, idx3.asi8)
+ assert not idx.equals(idx3)
+ assert not idx.equals(idx3.copy())
+ assert not idx.equals(idx3.astype(object))
+ assert not idx.astype(object).equals(idx3)
+ assert not idx.equals(list(idx3))
+ assert not idx.equals(pd.Series(idx3))
+
+ @pytest.mark.parametrize('values', [
+ ['20180101', '20180103', '20180105'], []])
+ @pytest.mark.parametrize('freq', [
+ '2D', Day(2), '2B', BDay(2), '48H', Hour(48)])
+ @pytest.mark.parametrize('tz', [None, 'US/Eastern'])
+ def test_freq_setter(self, values, freq, tz):
+ # GH 20678
+ idx = DatetimeIndex(values, tz=tz)
+
+ # can set to an offset, converting from string if necessary
+ idx.freq = freq
+ assert idx.freq == freq
+ assert isinstance(idx.freq, ABCDateOffset)
+
+ # can reset to None
+ idx.freq = None
+ assert idx.freq is None
+
+ def test_freq_setter_errors(self):
+ # GH 20678
+ idx = DatetimeIndex(['20180101', '20180103', '20180105'])
+
+ # setting with an incompatible freq
+ msg = ('Inferred frequency 2D from passed values does not conform to '
+ 'passed frequency 5D')
+ with pytest.raises(ValueError, match=msg):
+ idx.freq = '5D'
+
+ # setting with non-freq string
+ with pytest.raises(ValueError, match='Invalid frequency'):
+ idx.freq = 'foo'
+
+ def test_offset_deprecated(self):
+ # GH 20716
+ idx = pd.DatetimeIndex(['20180101', '20180102'])
+
+ # getter deprecated
+ with tm.assert_produces_warning(FutureWarning):
+ idx.offset
+
+ # setter deprecated
+ with tm.assert_produces_warning(FutureWarning):
+ idx.offset = BDay()
+
+
+class TestBusinessDatetimeIndex(object):
+
+ def setup_method(self, method):
+ self.rng = bdate_range(START, END)
+
+ def test_comparison(self):
+ d = self.rng[10]
+
+ comp = self.rng > d
+ assert comp[11]
+ assert not comp[9]
+
+ def test_pickle_unpickle(self):
+ unpickled = tm.round_trip_pickle(self.rng)
+ assert unpickled.freq is not None
+
+ def test_copy(self):
+ cp = self.rng.copy()
+ repr(cp)
+ tm.assert_index_equal(cp, self.rng)
+
+ def test_shift(self):
+ shifted = self.rng.shift(5)
+ assert shifted[0] == self.rng[5]
+ assert shifted.freq == self.rng.freq
+
+ shifted = self.rng.shift(-5)
+ assert shifted[5] == self.rng[0]
+ assert shifted.freq == self.rng.freq
+
+ shifted = self.rng.shift(0)
+ assert shifted[0] == self.rng[0]
+ assert shifted.freq == self.rng.freq
+
+ rng = date_range(START, END, freq=BMonthEnd())
+ shifted = rng.shift(1, freq=BDay())
+ assert shifted[0] == rng[0] + BDay()
+
+ def test_equals(self):
+ assert not self.rng.equals(list(self.rng))
+
+ def test_identical(self):
+ t1 = self.rng.copy()
+ t2 = self.rng.copy()
+ assert t1.identical(t2)
+
+ # name
+ t1 = t1.rename('foo')
+ assert t1.equals(t2)
+ assert not t1.identical(t2)
+ t2 = t2.rename('foo')
+ assert t1.identical(t2)
+
+ # freq
+ t2v = Index(t2.values)
+ assert t1.equals(t2v)
+ assert not t1.identical(t2v)
+
+
+class TestCustomDatetimeIndex(object):
+ def setup_method(self, method):
+ self.rng = bdate_range(START, END, freq='C')
+
+ def test_comparison(self):
+ d = self.rng[10]
+
+ comp = self.rng > d
+ assert comp[11]
+ assert not comp[9]
+
+ def test_copy(self):
+ cp = self.rng.copy()
+ repr(cp)
+ tm.assert_index_equal(cp, self.rng)
+
+ def test_shift(self):
+
+ shifted = self.rng.shift(5)
+ assert shifted[0] == self.rng[5]
+ assert shifted.freq == self.rng.freq
+
+ shifted = self.rng.shift(-5)
+ assert shifted[5] == self.rng[0]
+ assert shifted.freq == self.rng.freq
+
+ shifted = self.rng.shift(0)
+ assert shifted[0] == self.rng[0]
+ assert shifted.freq == self.rng.freq
+
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", pd.errors.PerformanceWarning)
+ rng = date_range(START, END, freq=BMonthEnd())
+ shifted = rng.shift(1, freq=CDay())
+ assert shifted[0] == rng[0] + CDay()
+
+ def test_shift_periods(self):
+ # GH#22458 : argument 'n' was deprecated in favor of 'periods'
+ idx = pd.date_range(start=START, end=END, periods=3)
+ tm.assert_index_equal(idx.shift(periods=0), idx)
+ tm.assert_index_equal(idx.shift(0), idx)
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=True):
+ tm.assert_index_equal(idx.shift(n=0), idx)
+
+ def test_pickle_unpickle(self):
+ unpickled = tm.round_trip_pickle(self.rng)
+ assert unpickled.freq is not None
+
+ def test_equals(self):
+ assert not self.rng.equals(list(self.rng))
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_partial_slicing.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_partial_slicing.py
new file mode 100644
index 00000000000..1b2aab9d370
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_partial_slicing.py
@@ -0,0 +1,388 @@
+""" test partial slicing on Series/Frame """
+
+from datetime import datetime
+import operator as op
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+ DataFrame, DatetimeIndex, Index, Series, Timedelta, Timestamp, date_range)
+from pandas.core.indexing import IndexingError
+from pandas.util import testing as tm
+
+
+class TestSlicing(object):
+ def test_dti_slicing(self):
+ dti = date_range(start='1/1/2005', end='12/1/2005', freq='M')
+ dti2 = dti[[1, 3, 5]]
+
+ v1 = dti2[0]
+ v2 = dti2[1]
+ v3 = dti2[2]
+
+ assert v1 == Timestamp('2/28/2005')
+ assert v2 == Timestamp('4/30/2005')
+ assert v3 == Timestamp('6/30/2005')
+
+ # don't carry freq through irregular slicing
+ assert dti2.freq is None
+
+ def test_slice_keeps_name(self):
+ # GH4226
+ st = pd.Timestamp('2013-07-01 00:00:00', tz='America/Los_Angeles')
+ et = pd.Timestamp('2013-07-02 00:00:00', tz='America/Los_Angeles')
+ dr = pd.date_range(st, et, freq='H', name='timebucket')
+ assert dr[1:].name == dr.name
+
+ def test_slice_with_negative_step(self):
+ ts = Series(np.arange(20),
+ date_range('2014-01-01', periods=20, freq='MS'))
+ SLC = pd.IndexSlice
+
+ def assert_slices_equivalent(l_slc, i_slc):
+ tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc])
+ tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc])
+
+ assert_slices_equivalent(SLC[Timestamp('2014-10-01')::-1], SLC[9::-1])
+ assert_slices_equivalent(SLC['2014-10-01'::-1], SLC[9::-1])
+
+ assert_slices_equivalent(SLC[:Timestamp('2014-10-01'):-1], SLC[:8:-1])
+ assert_slices_equivalent(SLC[:'2014-10-01':-1], SLC[:8:-1])
+
+ assert_slices_equivalent(SLC['2015-02-01':'2014-10-01':-1],
+ SLC[13:8:-1])
+ assert_slices_equivalent(SLC[Timestamp('2015-02-01'):Timestamp(
+ '2014-10-01'):-1], SLC[13:8:-1])
+ assert_slices_equivalent(SLC['2015-02-01':Timestamp('2014-10-01'):-1],
+ SLC[13:8:-1])
+ assert_slices_equivalent(SLC[Timestamp('2015-02-01'):'2014-10-01':-1],
+ SLC[13:8:-1])
+
+ assert_slices_equivalent(SLC['2014-10-01':'2015-02-01':-1], SLC[:0])
+
+ def test_slice_with_zero_step_raises(self):
+ ts = Series(np.arange(20),
+ date_range('2014-01-01', periods=20, freq='MS'))
+ with pytest.raises(ValueError, match='slice step cannot be zero'):
+ ts[::0]
+ with pytest.raises(ValueError, match='slice step cannot be zero'):
+ ts.loc[::0]
+
+ def test_slice_bounds_empty(self):
+ # GH#14354
+ empty_idx = date_range(freq='1H', periods=0, end='2015')
+
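+ # a date-only string bound is cast to the edge of that day:
+ # 'right' -> the day's last nanosecond, 'left' -> its midnight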
+ right = empty_idx._maybe_cast_slice_bound('2015-01-02', 'right', 'loc')
+ exp = Timestamp('2015-01-02 23:59:59.999999999')
+ assert right == exp
+
+ left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc')
+ exp = Timestamp('2015-01-02 00:00:00')
+ assert left == exp
+
+ def test_slice_duplicate_monotonic(self):
+ # https://github.com/pandas-dev/pandas/issues/16515
+ idx = pd.DatetimeIndex(['2017', '2017'])
+ result = idx._maybe_cast_slice_bound('2017-01-01', 'left', 'loc')
+ expected = Timestamp('2017-01-01')
+ assert result == expected
+
+ def test_monotone_DTI_indexing_bug(self):
+ # GH 19362
+ # Test accessing the first element of a monotonic-decreasing index
+ # via partial string indexing.
+
+ df = pd.DataFrame(list(range(5)))
+ date_list = ['2018-01-02', '2017-02-10', '2016-03-10',
+ '2015-03-15', '2014-03-16']
+ date_index = pd.to_datetime(date_list)
+ df['date'] = date_index
+ expected = pd.DataFrame({0: list(range(5)), 'date': date_index})
+ tm.assert_frame_equal(df, expected)
+
+ df = pd.DataFrame({'A': [1, 2, 3]},
+ index=pd.date_range('20170101',
+ periods=3)[::-1])
+ expected = pd.DataFrame({'A': 1},
+ index=pd.date_range('20170103',
+ periods=1))
+ tm.assert_frame_equal(df.loc['2017-01-03'], expected)
+
+ def test_slice_year(self):
+ dti = date_range(freq='B', start=datetime(2005, 1, 1), periods=500)
+
+ s = Series(np.arange(len(dti)), index=dti)
+ result = s['2005']
+ expected = s[s.index.year == 2005]
+ tm.assert_series_equal(result, expected)
+
+ df = DataFrame(np.random.rand(len(dti), 5), index=dti)
+ result = df.loc['2005']
+ expected = df[df.index.year == 2005]
+ tm.assert_frame_equal(result, expected)
+
+ rng = date_range('1/1/2000', '1/1/2010')
+
+ result = rng.get_loc('2009')
+ expected = slice(3288, 3653)
+ assert result == expected
+
+ def test_slice_quarter(self):
+ dti = date_range(freq='D', start=datetime(2000, 6, 1), periods=500)
+
+ s = Series(np.arange(len(dti)), index=dti)
+ assert len(s['2001Q1']) == 90
+
+ df = DataFrame(np.random.rand(len(dti), 5), index=dti)
+ assert len(df.loc['1Q01']) == 90
+
+ def test_slice_month(self):
+ dti = date_range(freq='D', start=datetime(2005, 1, 1), periods=500)
+ s = Series(np.arange(len(dti)), index=dti)
+ assert len(s['2005-11']) == 30
+
+ df = DataFrame(np.random.rand(len(dti), 5), index=dti)
+ assert len(df.loc['2005-11']) == 30
+
+ tm.assert_series_equal(s['2005-11'], s['11-2005'])
+
+ def test_partial_slice(self):
+ rng = date_range(freq='D', start=datetime(2005, 1, 1), periods=500)
+ s = Series(np.arange(len(rng)), index=rng)
+
+ result = s['2005-05':'2006-02']
+ expected = s['20050501':'20060228']
+ tm.assert_series_equal(result, expected)
+
+ result = s['2005-05':]
+ expected = s['20050501':]
+ tm.assert_series_equal(result, expected)
+
+ result = s[:'2006-02']
+ expected = s[:'20060228']
+ tm.assert_series_equal(result, expected)
+
+ result = s['2005-1-1']
+ assert result == s.iloc[0]
+
+ pytest.raises(Exception, s.__getitem__, '2004-12-31')
+
+ def test_partial_slice_daily(self):
+ rng = date_range(freq='H', start=datetime(2005, 1, 31), periods=500)
+ s = Series(np.arange(len(rng)), index=rng)
+
+ result = s['2005-1-31']
+ tm.assert_series_equal(result, s.iloc[:24])
+
+ pytest.raises(Exception, s.__getitem__, '2004-12-31 00')
+
+ def test_partial_slice_hourly(self):
+ rng = date_range(freq='T', start=datetime(2005, 1, 1, 20, 0, 0),
+ periods=500)
+ s = Series(np.arange(len(rng)), index=rng)
+
+ result = s['2005-1-1']
+ tm.assert_series_equal(result, s.iloc[:60 * 4])
+
+ result = s['2005-1-1 20']
+ tm.assert_series_equal(result, s.iloc[:60])
+
+ assert s['2005-1-1 20:00'] == s.iloc[0]
+ pytest.raises(Exception, s.__getitem__, '2004-12-31 00:15')
+
+ def test_partial_slice_minutely(self):
+ rng = date_range(freq='S', start=datetime(2005, 1, 1, 23, 59, 0),
+ periods=500)
+ s = Series(np.arange(len(rng)), index=rng)
+
+ result = s['2005-1-1 23:59']
+ tm.assert_series_equal(result, s.iloc[:60])
+
+ result = s['2005-1-1']
+ tm.assert_series_equal(result, s.iloc[:60])
+
+ assert s[Timestamp('2005-1-1 23:59:00')] == s.iloc[0]
+ pytest.raises(Exception, s.__getitem__, '2004-12-31 00:00:00')
+
+ def test_partial_slice_second_precision(self):
+ rng = date_range(start=datetime(2005, 1, 1, 0, 0, 59,
+ microsecond=999990),
+ periods=20, freq='US')
+ s = Series(np.arange(20), rng)
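+ # 20 stamps at 1us steps: the first 10 fall within second :59,
+ # the last 10 within minute 00:01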
+
+ tm.assert_series_equal(s['2005-1-1 00:00'], s.iloc[:10])
+ tm.assert_series_equal(s['2005-1-1 00:00:59'], s.iloc[:10])
+
+ tm.assert_series_equal(s['2005-1-1 00:01'], s.iloc[10:])
+ tm.assert_series_equal(s['2005-1-1 00:01:00'], s.iloc[10:])
+
+ assert s[Timestamp('2005-1-1 00:00:59.999990')] == s.iloc[0]
+ with pytest.raises(KeyError, match='2005-1-1 00:00:00'):
+ s['2005-1-1 00:00:00']
+
+ def test_partial_slicing_dataframe(self):
+ # GH14856
+ # Test various combinations of string slicing resolution vs.
+ # index resolution
+ # - If string resolution is less precise than index resolution,
+ # string is considered a slice
+ # - If string resolution is equal to or more precise than index
+ # resolution, string is considered an exact match
+ formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%d %H',
+ '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S']
+ resolutions = ['year', 'month', 'day', 'hour', 'minute', 'second']
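+ # formats[i] renders a timestamp at the precision named in
+ # resolutions[i], e.g. '%Y-%m-%d %H' corresponds to 'hour'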
+ for rnum, resolution in enumerate(resolutions[2:], 2):
+ # we check only 'day', 'hour', 'minute' and 'second'
+ unit = Timedelta("1 " + resolution)
+ middate = datetime(2012, 1, 1, 0, 0, 0)
+ index = DatetimeIndex([middate - unit,
+ middate, middate + unit])
+ values = [1, 2, 3]
+ df = DataFrame({'a': values}, index, dtype=np.int64)
+ assert df.index.resolution == resolution
+
+ # Timestamp with the same resolution as index
+ # Should be exact match for Series (return scalar)
+ # and raise KeyError for Frame
+ for timestamp, expected in zip(index, values):
+ ts_string = timestamp.strftime(formats[rnum])
+ # make ts_string as precise as index
+ result = df['a'][ts_string]
+ assert isinstance(result, np.int64)
+ assert result == expected
+ pytest.raises(KeyError, df.__getitem__, ts_string)
+
+ # Timestamp with resolution less precise than index
+ for fmt in formats[:rnum]:
+ for element, theslice in [[0, slice(None, 1)],
+ [1, slice(1, None)]]:
+ ts_string = index[element].strftime(fmt)
+
+ # Series should return slice
+ result = df['a'][ts_string]
+ expected = df['a'][theslice]
+ tm.assert_series_equal(result, expected)
+
+ # Frame should return slice as well
+ result = df[ts_string]
+ expected = df[theslice]
+ tm.assert_frame_equal(result, expected)
+
+ # Timestamp with resolution more precise than index
+ # Compatible with existing key
+ # Should return scalar for Series
+ # and raise KeyError for Frame
+ for fmt in formats[rnum + 1:]:
+ ts_string = index[1].strftime(fmt)
+ result = df['a'][ts_string]
+ assert isinstance(result, np.int64)
+ assert result == 2
+ pytest.raises(KeyError, df.__getitem__, ts_string)
+
+ # Not compatible with existing key
+ # Should raise KeyError
+ for fmt, res in list(zip(formats, resolutions))[rnum + 1:]:
+ ts = index[1] + Timedelta("1 " + res)
+ ts_string = ts.strftime(fmt)
+ pytest.raises(KeyError, df['a'].__getitem__, ts_string)
+ pytest.raises(KeyError, df.__getitem__, ts_string)
+
+ def test_partial_slicing_with_multiindex(self):
+
+ # GH 4758
+ # partial string indexing with a MultiIndex used to be buggy
+ df = DataFrame({'ACCOUNT': ["ACCT1", "ACCT1", "ACCT1", "ACCT2"],
+ 'TICKER': ["ABC", "MNP", "XYZ", "XYZ"],
+ 'val': [1, 2, 3, 4]},
+ index=date_range("2013-06-19 09:30:00",
+ periods=4, freq='5T'))
+ df_multi = df.set_index(['ACCOUNT', 'TICKER'], append=True)
+
+ expected = DataFrame([
+ [1]
+ ], index=Index(['ABC'], name='TICKER'), columns=['val'])
+ result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1')]
+ tm.assert_frame_equal(result, expected)
+
+ expected = df_multi.loc[
+ (pd.Timestamp('2013-06-19 09:30:00', tz=None), 'ACCT1', 'ABC')]
+ result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1', 'ABC')]
+ tm.assert_series_equal(result, expected)
+
+ # this is an IndexingError as we don't do partial string selection on
+ # multi-levels.
+ def f():
+ df_multi.loc[('2013-06-19', 'ACCT1', 'ABC')]
+
+ pytest.raises(IndexingError, f)
+
+ # GH 4294
+ # partial slice on a Series with a MultiIndex
+ s = pd.DataFrame(np.random.rand(1000, 1000), index=pd.date_range(
+ '2000-1-1', periods=1000)).stack()
+
+ s2 = s[:-1].copy()
+ expected = s2['2000-1-4']
+ result = s2[pd.Timestamp('2000-1-4')]
+ tm.assert_series_equal(result, expected)
+
+ result = s[pd.Timestamp('2000-1-4')]
+ expected = s['2000-1-4']
+ tm.assert_series_equal(result, expected)
+
+ df2 = pd.DataFrame(s)
+ expected = df2.xs('2000-1-4')
+ result = df2.loc[pd.Timestamp('2000-1-4')]
+ tm.assert_frame_equal(result, expected)
+
+ def test_partial_slice_doesnt_require_monotonicity(self):
+ # For historical reasons.
+ s = pd.Series(np.arange(10), pd.date_range('2014-01-01', periods=10))
+
+ nonmonotonic = s[[3, 5, 4]]
+ expected = nonmonotonic.iloc[:0]
+ timestamp = pd.Timestamp('2014-01-10')
+
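+ # a string slice is allowed on a non-monotonic index (empty here,
+ # since the label lies past the end), but slicing with an actual
+ # Timestamp requires monotonicity and raises KeyError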
+ tm.assert_series_equal(nonmonotonic['2014-01-10':], expected)
+ with pytest.raises(KeyError,
+ match=r"Timestamp\('2014-01-10 00:00:00'\)"):
+ nonmonotonic[timestamp:]
+
+ tm.assert_series_equal(nonmonotonic.loc['2014-01-10':], expected)
+ with pytest.raises(KeyError,
+ match=r"Timestamp\('2014-01-10 00:00:00'\)"):
+ nonmonotonic.loc[timestamp:]
+
+ def test_loc_datetime_length_one(self):
+ # GH16071
+ df = pd.DataFrame(columns=['1'],
+ index=pd.date_range('2016-10-01T00:00:00',
+ '2016-10-01T23:59:59'))
+ result = df.loc[datetime(2016, 10, 1):]
+ tm.assert_frame_equal(result, df)
+
+ result = df.loc['2016-10-01T00:00:00':]
+ tm.assert_frame_equal(result, df)
+
+ @pytest.mark.parametrize('datetimelike', [
+ Timestamp('20130101'), datetime(2013, 1, 1),
+ np.datetime64('2013-01-01T00:00', 'ns')])
+ @pytest.mark.parametrize('op,expected', [
+ (op.lt, [True, False, False, False]),
+ (op.le, [True, True, False, False]),
+ (op.eq, [False, True, False, False]),
+ (op.gt, [False, False, False, True])])
+ def test_selection_by_datetimelike(self, datetimelike, op, expected):
+ # GH issue #17965, test for ability to compare datetime64[ns] columns
+ # to datetimelike
+ df = DataFrame({'A': [pd.Timestamp('20120101'),
+ pd.Timestamp('20130101'),
+ np.nan, pd.Timestamp('20130103')]})
+ result = op(df.A, datetimelike)
+ expected = Series(expected, name='A')
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_scalar_compat.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_scalar_compat.py
new file mode 100644
index 00000000000..680eddd27cf
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_scalar_compat.py
@@ -0,0 +1,280 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for DatetimeIndex methods behaving like their Timestamp counterparts
+"""
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DatetimeIndex, Timestamp, date_range
+import pandas.util.testing as tm
+
+from pandas.tseries.frequencies import to_offset
+
+
+class TestDatetimeIndexOps(object):
+ def test_dti_time(self):
+ rng = date_range('1/1/2000', freq='12min', periods=10)
+ result = pd.Index(rng).time
+ expected = [t.time() for t in rng]
+ assert (result == expected).all()
+
+ def test_dti_date(self):
+ rng = date_range('1/1/2000', freq='12H', periods=10)
+ result = pd.Index(rng).date
+ expected = [t.date() for t in rng]
+ assert (result == expected).all()
+
+ def test_dti_date_out_of_range(self):
+ # GH#1475
+ pytest.raises(ValueError, DatetimeIndex, ['1400-01-01'])
+ pytest.raises(ValueError, DatetimeIndex, [datetime(1400, 1, 1)])
+
+ @pytest.mark.parametrize('field', [
+ 'dayofweek', 'dayofyear', 'week', 'weekofyear', 'quarter',
+ 'days_in_month', 'is_month_start', 'is_month_end',
+ 'is_quarter_start', 'is_quarter_end', 'is_year_start',
+ 'is_year_end', 'weekday_name'])
+ def test_dti_timestamp_fields(self, field):
+ # extra fields from DatetimeIndex like quarter and week
+ idx = tm.makeDateIndex(100)
+ expected = getattr(idx, field)[-1]
+ if field == 'weekday_name':
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = getattr(Timestamp(idx[-1]), field)
+ else:
+ result = getattr(Timestamp(idx[-1]), field)
+ assert result == expected
+
+ def test_dti_timestamp_freq_fields(self):
+ # extra fields from DatetimeIndex like quarter and week
+ idx = tm.makeDateIndex(100)
+
+ assert idx.freq == Timestamp(idx[-1], idx.freq).freq
+ assert idx.freqstr == Timestamp(idx[-1], idx.freq).freqstr
+
+ # ----------------------------------------------------------------
+ # DatetimeIndex.round
+
+ def test_round_daily(self):
+ dti = date_range('20130101 09:10:11', periods=5)
+ result = dti.round('D')
+ expected = date_range('20130101', periods=5)
+ tm.assert_index_equal(result, expected)
+
+ dti = dti.tz_localize('UTC').tz_convert('US/Eastern')
+ result = dti.round('D')
+ expected = date_range('20130101',
+ periods=5).tz_localize('US/Eastern')
+ tm.assert_index_equal(result, expected)
+
+ result = dti.round('s')
+ tm.assert_index_equal(result, dti)
+
+ # invalid
+ for freq in ['Y', 'M', 'foobar']:
+ pytest.raises(ValueError, lambda: dti.round(freq))
+
+ def test_round(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ rng = date_range(start='2016-01-01', periods=5,
+ freq='30Min', tz=tz)
+ elt = rng[1]
+
+ expected_rng = DatetimeIndex([
+ Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'),
+ Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'),
+ Timestamp('2016-01-01 01:00:00', tz=tz, freq='30T'),
+ Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'),
+ Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'),
+ ])
+ expected_elt = expected_rng[1]
+
+ tm.assert_index_equal(rng.round(freq='H'), expected_rng)
+ assert elt.round(freq='H') == expected_elt
+
+ msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG
+ with pytest.raises(ValueError, match=msg):
+ rng.round(freq='foo')
+ with pytest.raises(ValueError, match=msg):
+ elt.round(freq='foo')
+
+ msg = "<MonthEnd> is a non-fixed frequency"
+ with pytest.raises(ValueError, match=msg):
+ rng.round(freq='M')
+ with pytest.raises(ValueError, match=msg):
+ elt.round(freq='M')
+
+ # GH#14440 & GH#15578
+ index = DatetimeIndex(['2016-10-17 12:00:00.0015'], tz=tz)
+ result = index.round('ms')
+ expected = DatetimeIndex(['2016-10-17 12:00:00.002000'], tz=tz)
+ tm.assert_index_equal(result, expected)
+
+ for freq in ['us', 'ns']:
+ tm.assert_index_equal(index, index.round(freq))
+
+ index = DatetimeIndex(['2016-10-17 12:00:00.00149'], tz=tz)
+ result = index.round('ms')
+ expected = DatetimeIndex(['2016-10-17 12:00:00.001000'], tz=tz)
+ tm.assert_index_equal(result, expected)
+
+ index = DatetimeIndex(['2016-10-17 12:00:00.001501031'])
+ result = index.round('10ns')
+ expected = DatetimeIndex(['2016-10-17 12:00:00.001501030'])
+ tm.assert_index_equal(result, expected)
+
+ with tm.assert_produces_warning(False):
+ ts = '2016-10-17 12:00:00.001501031'
+ DatetimeIndex([ts]).round('1010ns')
+
+ def test_no_rounding_occurs(self, tz_naive_fixture):
+ # GH 21262
+ tz = tz_naive_fixture
+ rng = date_range(start='2016-01-01', periods=5,
+ freq='2Min', tz=tz)
+
+ expected_rng = DatetimeIndex([
+ Timestamp('2016-01-01 00:00:00', tz=tz, freq='2T'),
+ Timestamp('2016-01-01 00:02:00', tz=tz, freq='2T'),
+ Timestamp('2016-01-01 00:04:00', tz=tz, freq='2T'),
+ Timestamp('2016-01-01 00:06:00', tz=tz, freq='2T'),
+ Timestamp('2016-01-01 00:08:00', tz=tz, freq='2T'),
+ ])
+
+ tm.assert_index_equal(rng.round(freq='2T'), expected_rng)
+
+ @pytest.mark.parametrize('test_input, rounder, freq, expected', [
+ (['2117-01-01 00:00:45'], 'floor', '15s', ['2117-01-01 00:00:45']),
+ (['2117-01-01 00:00:45'], 'ceil', '15s', ['2117-01-01 00:00:45']),
+ (['2117-01-01 00:00:45.000000012'], 'floor', '10ns',
+ ['2117-01-01 00:00:45.000000010']),
+ (['1823-01-01 00:00:01.000000012'], 'ceil', '10ns',
+ ['1823-01-01 00:00:01.000000020']),
+ (['1823-01-01 00:00:01'], 'floor', '1s', ['1823-01-01 00:00:01']),
+ (['1823-01-01 00:00:01'], 'ceil', '1s', ['1823-01-01 00:00:01']),
+ (['2018-01-01 00:15:00'], 'ceil', '15T', ['2018-01-01 00:15:00']),
+ (['2018-01-01 00:15:00'], 'floor', '15T', ['2018-01-01 00:15:00']),
+ (['1823-01-01 03:00:00'], 'ceil', '3H', ['1823-01-01 03:00:00']),
+ (['1823-01-01 03:00:00'], 'floor', '3H', ['1823-01-01 03:00:00']),
+ (('NaT', '1823-01-01 00:00:01'), 'floor', '1s',
+ ('NaT', '1823-01-01 00:00:01')),
+ (('NaT', '1823-01-01 00:00:01'), 'ceil', '1s',
+ ('NaT', '1823-01-01 00:00:01'))
+ ])
+ def test_ceil_floor_edge(self, test_input, rounder, freq, expected):
+ dt = DatetimeIndex(list(test_input))
+ func = getattr(dt, rounder)
+ result = func(freq)
+ expected = DatetimeIndex(list(expected))
+ assert expected.equals(result)
+
+ @pytest.mark.parametrize('start, index_freq, periods', [
+ ('2018-01-01', '12H', 25),
+ ('2018-01-01 0:0:0.124999', '1ns', 1000),
+ ])
+ @pytest.mark.parametrize('round_freq', [
+ '2ns', '3ns', '4ns', '5ns', '6ns', '7ns',
+ '250ns', '500ns', '750ns',
+ '1us', '19us', '250us', '500us', '750us',
+ '1s', '2s', '3s',
+ '12H', '1D',
+ ])
+ def test_round_int64(self, start, index_freq, periods, round_freq):
+ dt = date_range(start=start, freq=index_freq, periods=periods)
+ unit = to_offset(round_freq).nanos
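+ # invariants checked below, all in integer nanoseconds:
+ # floor: 0 <= dt - result < unit
+ # ceil: 0 <= result - dt < unit
+ # round: |result - dt| <= unit // 2, with ties broken to even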
+
+ # test floor
+ result = dt.floor(round_freq)
+ diff = dt.asi8 - result.asi8
+ mod = result.asi8 % unit
+ assert (mod == 0).all(), "floor not a {} multiple".format(round_freq)
+ assert (0 <= diff).all() and (diff < unit).all(), "floor error"
+
+ # test ceil
+ result = dt.ceil(round_freq)
+ diff = result.asi8 - dt.asi8
+ mod = result.asi8 % unit
+ assert (mod == 0).all(), "ceil not a {} multiple".format(round_freq)
+ assert (0 <= diff).all() and (diff < unit).all(), "ceil error"
+
+ # test round
+ result = dt.round(round_freq)
+ diff = abs(result.asi8 - dt.asi8)
+ mod = result.asi8 % unit
+ assert (mod == 0).all(), "round not a {} multiple".format(round_freq)
+ assert (diff <= unit // 2).all(), "round error"
+ if unit % 2 == 0:
+ assert (
+ result.asi8[diff == unit // 2] % 2 == 0
+ ).all(), "round half to even error"
+
+ # ----------------------------------------------------------------
+ # DatetimeIndex.normalize
+
+ def test_normalize(self):
+ rng = date_range('1/1/2000 9:30', periods=10, freq='D')
+
+ result = rng.normalize()
+ expected = date_range('1/1/2000', periods=10, freq='D')
+ tm.assert_index_equal(result, expected)
+
+ arr_ns = np.array([1380585623454345752,
+ 1380585612343234312]).astype("datetime64[ns]")
+ rng_ns = DatetimeIndex(arr_ns)
+ rng_ns_normalized = rng_ns.normalize()
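+ # 1380585600000000000 ns since the epoch is 2013-10-01 00:00:00,
+ # i.e. both stamps are normalized to that day's midnight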
+
+ arr_ns = np.array([1380585600000000000,
+ 1380585600000000000]).astype("datetime64[ns]")
+ expected = DatetimeIndex(arr_ns)
+ tm.assert_index_equal(rng_ns_normalized, expected)
+
+ assert result.is_normalized
+ assert not rng.is_normalized
+
+ def test_normalize_nat(self):
+ dti = DatetimeIndex([pd.NaT, Timestamp('2018-01-01 01:00:00')])
+ result = dti.normalize()
+ expected = DatetimeIndex([pd.NaT, Timestamp('2018-01-01')])
+ tm.assert_index_equal(result, expected)
+
+
+class TestDateTimeIndexToJulianDate(object):
+
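+ # to_julian_date returns the astronomical Julian date (fractional
+ # days since noon on January 1, 4713 BC) as a Float64Index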
+ def test_1700(self):
+ dr = date_range(start=Timestamp('1710-10-01'), periods=5, freq='D')
+ r1 = pd.Index([x.to_julian_date() for x in dr])
+ r2 = dr.to_julian_date()
+ assert isinstance(r2, pd.Float64Index)
+ tm.assert_index_equal(r1, r2)
+
+ def test_2000(self):
+ dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='D')
+ r1 = pd.Index([x.to_julian_date() for x in dr])
+ r2 = dr.to_julian_date()
+ assert isinstance(r2, pd.Float64Index)
+ tm.assert_index_equal(r1, r2)
+
+ def test_hour(self):
+ dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='H')
+ r1 = pd.Index([x.to_julian_date() for x in dr])
+ r2 = dr.to_julian_date()
+ assert isinstance(r2, pd.Float64Index)
+ tm.assert_index_equal(r1, r2)
+
+ def test_minute(self):
+ dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='T')
+ r1 = pd.Index([x.to_julian_date() for x in dr])
+ r2 = dr.to_julian_date()
+ assert isinstance(r2, pd.Float64Index)
+ tm.assert_index_equal(r1, r2)
+
+ def test_second(self):
+ dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='S')
+ r1 = pd.Index([x.to_julian_date() for x in dr])
+ r2 = dr.to_julian_date()
+ assert isinstance(r2, pd.Float64Index)
+ tm.assert_index_equal(r1, r2)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_setops.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_setops.py
new file mode 100644
index 00000000000..19009e45ee8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_setops.py
@@ -0,0 +1,500 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+ DataFrame, DatetimeIndex, Index, Int64Index, Series, bdate_range,
+ date_range, to_datetime)
+import pandas.util.testing as tm
+
+from pandas.tseries.offsets import BMonthEnd, Minute, MonthEnd
+
+START, END = datetime(2009, 1, 1), datetime(2010, 1, 1)
+
+
+class TestDatetimeIndexSetOps(object):
+ tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Singapore',
+ 'dateutil/US/Pacific']
+
+ # TODO: moved from test_datetimelike; dedup with version below
+ def test_union2(self):
+ everything = tm.makeDateIndex(10)
+ first = everything[:5]
+ second = everything[5:]
+ union = first.union(second)
+ assert tm.equalContents(union, everything)
+
+ # GH 10149
+ cases = [klass(second.values) for klass in [np.array, Series, list]]
+ for case in cases:
+ result = first.union(case)
+ assert tm.equalContents(result, everything)
+
+ @pytest.mark.parametrize("tz", tz)
+ def test_union(self, tz):
+ rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz)
+ other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz)
+ expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz)
+
+ rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz)
+ other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz)
+ expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz)
+
+ rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz)
+ other3 = pd.DatetimeIndex([], tz=tz)
+ expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz)
+
+ for rng, other, expected in [(rng1, other1, expected1),
+ (rng2, other2, expected2),
+ (rng3, other3, expected3)]:
+
+ result_union = rng.union(other)
+ tm.assert_index_equal(result_union, expected)
+
+ def test_union_coverage(self):
+ idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02'])
+ ordered = DatetimeIndex(idx.sort_values(), freq='infer')
+ result = ordered.union(idx)
+ tm.assert_index_equal(result, ordered)
+
+ result = ordered[:0].union(ordered)
+ tm.assert_index_equal(result, ordered)
+ assert result.freq == ordered.freq
+
+ def test_union_bug_1730(self):
+ rng_a = date_range('1/1/2012', periods=4, freq='3H')
+ rng_b = date_range('1/1/2012', periods=4, freq='4H')
+
+ result = rng_a.union(rng_b)
+ exp = DatetimeIndex(sorted(set(list(rng_a)) | set(list(rng_b))))
+ tm.assert_index_equal(result, exp)
+
+ def test_union_bug_1745(self):
+ left = DatetimeIndex(['2012-05-11 15:19:49.695000'])
+ right = DatetimeIndex(['2012-05-29 13:04:21.322000',
+ '2012-05-11 15:27:24.873000',
+ '2012-05-11 15:31:05.350000'])
+
+ result = left.union(right)
+ exp = DatetimeIndex(sorted(set(list(left)) | set(list(right))))
+ tm.assert_index_equal(result, exp)
+
+ def test_union_bug_4564(self):
+ from pandas import DateOffset
+ left = date_range("2013-01-01", "2013-02-01")
+ right = left + DateOffset(minutes=15)
+
+ result = left.union(right)
+ exp = DatetimeIndex(sorted(set(list(left)) | set(list(right))))
+ tm.assert_index_equal(result, exp)
+
+ def test_union_freq_both_none(self):
+ # GH11086
+ expected = bdate_range('20150101', periods=10)
+ expected.freq = None
+
+ result = expected.union(expected)
+ tm.assert_index_equal(result, expected)
+ assert result.freq is None
+
+ def test_union_dataframe_index(self):
+ rng1 = date_range('1/1/1999', '1/1/2012', freq='MS')
+ s1 = Series(np.random.randn(len(rng1)), rng1)
+
+ rng2 = date_range('1/1/1980', '12/1/2001', freq='MS')
+ s2 = Series(np.random.randn(len(rng2)), rng2)
+ df = DataFrame({'s1': s1, 's2': s2})
+
+ exp = pd.date_range('1/1/1980', '1/1/2012', freq='MS')
+ tm.assert_index_equal(df.index, exp)
+
+ def test_union_with_DatetimeIndex(self):
+ i1 = Int64Index(np.arange(0, 20, 2))
+ i2 = date_range(start='2012-01-03 00:00:00', periods=10, freq='D')
+ i1.union(i2) # Works
+ i2.union(i1)  # regression: used to raise "AttributeError: can't set attribute"
+
+ # TODO: moved from test_datetimelike; de-duplicate with version below
+ def test_intersection2(self):
+ first = tm.makeDateIndex(10)
+ second = first[5:]
+ intersect = first.intersection(second)
+ assert tm.equalContents(intersect, second)
+
+ # GH 10149
+ cases = [klass(second.values) for klass in [np.array, Series, list]]
+ for case in cases:
+ result = first.intersection(case)
+ assert tm.equalContents(result, second)
+
+ third = Index(['a', 'b', 'c'])
+ result = first.intersection(third)
+ expected = pd.Index([], dtype=object)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("tz", [None, 'Asia/Tokyo', 'US/Eastern',
+ 'dateutil/US/Pacific'])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection(self, tz, sort):
+ # GH 4690 (with tz)
+ base = date_range('6/1/2000', '6/30/2000', freq='D', name='idx')
+
+ # if target has the same name, it is preserved
+ rng2 = date_range('5/15/2000', '6/20/2000', freq='D', name='idx')
+ expected2 = date_range('6/1/2000', '6/20/2000', freq='D', name='idx')
+
+ # if target name is different, it will be reset
+ rng3 = date_range('5/15/2000', '6/20/2000', freq='D', name='other')
+ expected3 = date_range('6/1/2000', '6/20/2000', freq='D', name=None)
+
+ rng4 = date_range('7/1/2000', '7/31/2000', freq='D', name='idx')
+ expected4 = DatetimeIndex([], name='idx')
+
+ for (rng, expected) in [(rng2, expected2), (rng3, expected3),
+ (rng4, expected4)]:
+ result = base.intersection(rng)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+ assert result.tz == expected.tz
+
+ # non-monotonic
+ base = DatetimeIndex(['2011-01-05', '2011-01-04',
+ '2011-01-02', '2011-01-03'],
+ tz=tz, name='idx')
+
+ rng2 = DatetimeIndex(['2011-01-04', '2011-01-02',
+ '2011-02-02', '2011-02-03'],
+ tz=tz, name='idx')
+ expected2 = DatetimeIndex(['2011-01-04', '2011-01-02'],
+ tz=tz, name='idx')
+
+ rng3 = DatetimeIndex(['2011-01-04', '2011-01-02',
+ '2011-02-02', '2011-02-03'],
+ tz=tz, name='other')
+ expected3 = DatetimeIndex(['2011-01-04', '2011-01-02'],
+ tz=tz, name=None)
+
+ # GH 7880
+ rng4 = date_range('7/1/2000', '7/31/2000', freq='D', tz=tz,
+ name='idx')
+ expected4 = DatetimeIndex([], tz=tz, name='idx')
+
+ for (rng, expected) in [(rng2, expected2), (rng3, expected3),
+ (rng4, expected4)]:
+ result = base.intersection(rng, sort=sort)
+ if sort is None:
+ expected = expected.sort_values()
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq is None
+ assert result.tz == expected.tz
+
+ def test_intersection_empty(self):
+ # empty same freq GH2129
+ rng = date_range('6/1/2000', '6/15/2000', freq='T')
+ result = rng[0:0].intersection(rng)
+ assert len(result) == 0
+
+ result = rng.intersection(rng[0:0])
+ assert len(result) == 0
+
+ def test_intersection_bug_1708(self):
+ from pandas import DateOffset
+ index_1 = date_range('1/1/2012', periods=4, freq='12H')
+ index_2 = index_1 + DateOffset(hours=1)
+
+ result = index_1 & index_2
+ assert len(result) == 0
+
+ @pytest.mark.parametrize("tz", tz)
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference(self, tz, sort):
+ rng_dates = ['1/2/2000', '1/3/2000', '1/1/2000', '1/4/2000',
+ '1/5/2000']
+
+ rng1 = pd.DatetimeIndex(rng_dates, tz=tz)
+ other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz)
+ expected1 = pd.DatetimeIndex(rng_dates, tz=tz)
+
+ rng2 = pd.DatetimeIndex(rng_dates, tz=tz)
+ other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz)
+ expected2 = pd.DatetimeIndex(rng_dates[:3], tz=tz)
+
+ rng3 = pd.DatetimeIndex(rng_dates, tz=tz)
+ other3 = pd.DatetimeIndex([], tz=tz)
+ expected3 = pd.DatetimeIndex(rng_dates, tz=tz)
+
+ for rng, other, expected in [(rng1, other1, expected1),
+ (rng2, other2, expected2),
+ (rng3, other3, expected3)]:
+ result_diff = rng.difference(other, sort)
+ if sort is None:
+ expected = expected.sort_values()
+ tm.assert_index_equal(result_diff, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_freq(self, sort):
+ # GH14323: difference of DatetimeIndex should not preserve frequency
+
+ index = date_range("20160920", "20160925", freq="D")
+ other = date_range("20160921", "20160924", freq="D")
+ expected = DatetimeIndex(["20160920", "20160925"], freq=None)
+ idx_diff = index.difference(other, sort)
+ tm.assert_index_equal(idx_diff, expected)
+ tm.assert_attr_equal('freq', idx_diff, expected)
+
+ other = date_range("20160922", "20160925", freq="D")
+ idx_diff = index.difference(other, sort)
+ expected = DatetimeIndex(["20160920", "20160921"], freq=None)
+ tm.assert_index_equal(idx_diff, expected)
+ tm.assert_attr_equal('freq', idx_diff, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_datetimeindex_diff(self, sort):
+ dti1 = date_range(freq='Q-JAN', start=datetime(1997, 12, 31),
+ periods=100)
+ dti2 = date_range(freq='Q-JAN', start=datetime(1997, 12, 31),
+ periods=98)
+ assert len(dti1.difference(dti2, sort)) == 2
+
+ def test_datetimeindex_union_join_empty(self):
+ dti = date_range(start='1/1/2001', end='2/1/2001', freq='D')
+ empty = Index([])
+
+ result = dti.union(empty)
+ assert isinstance(result, DatetimeIndex)
+ assert result.equals(dti)  # union with empty leaves the index unchanged
+
+ result = dti.join(empty)
+ assert isinstance(result, DatetimeIndex)
+
+ def test_join_nonunique(self):
+ idx1 = to_datetime(['2012-11-06 16:00:11.477563',
+ '2012-11-06 16:00:11.477563'])
+ idx2 = to_datetime(['2012-11-06 15:11:09.006507',
+ '2012-11-06 15:11:09.006507'])
+ rs = idx1.join(idx2, how='outer')
+ assert rs.is_monotonic
+
+
+class TestBusinessDatetimeIndex(object):
+
+ def setup_method(self, method):
+ self.rng = bdate_range(START, END)
+
+ def test_union(self):
+ # overlapping
+ left = self.rng[:10]
+ right = self.rng[5:10]
+
+ the_union = left.union(right)
+ assert isinstance(the_union, DatetimeIndex)
+
+ # non-overlapping, gap in middle
+ left = self.rng[:5]
+ right = self.rng[10:]
+
+ the_union = left.union(right)
+ assert isinstance(the_union, Index)
+
+ # non-overlapping, no gap
+ left = self.rng[:5]
+ right = self.rng[5:10]
+
+ the_union = left.union(right)
+ assert isinstance(the_union, DatetimeIndex)
+
+ # order does not matter
+ tm.assert_index_equal(right.union(left), the_union)
+
+ # overlapping, but different offset
+ rng = date_range(START, END, freq=BMonthEnd())
+
+ the_union = self.rng.union(rng)
+ assert isinstance(the_union, DatetimeIndex)
+
+ def test_outer_join(self):
+ # should just behave as union
+
+ # overlapping
+ left = self.rng[:10]
+ right = self.rng[5:10]
+
+ the_join = left.join(right, how='outer')
+ assert isinstance(the_join, DatetimeIndex)
+
+ # non-overlapping, gap in middle
+ left = self.rng[:5]
+ right = self.rng[10:]
+
+ the_join = left.join(right, how='outer')
+ assert isinstance(the_join, DatetimeIndex)
+ assert the_join.freq is None
+
+ # non-overlapping, no gap
+ left = self.rng[:5]
+ right = self.rng[5:10]
+
+ the_join = left.join(right, how='outer')
+ assert isinstance(the_join, DatetimeIndex)
+
+ # overlapping, but different offset
+ rng = date_range(START, END, freq=BMonthEnd())
+
+ the_join = self.rng.join(rng, how='outer')
+ assert isinstance(the_join, DatetimeIndex)
+ assert the_join.freq is None
+
+ def test_union_not_cacheable(self):
+ rng = date_range('1/1/2000', periods=50, freq=Minute())
+ rng1 = rng[10:]
+ rng2 = rng[:25]
+ the_union = rng1.union(rng2)
+ tm.assert_index_equal(the_union, rng)
+
+ rng1 = rng[10:]
+ rng2 = rng[15:35]
+ the_union = rng1.union(rng2)
+ expected = rng[10:]
+ tm.assert_index_equal(the_union, expected)
+
+ def test_intersection(self):
+ rng = date_range('1/1/2000', periods=50, freq=Minute())
+ rng1 = rng[10:]
+ rng2 = rng[:25]
+ the_int = rng1.intersection(rng2)
+ expected = rng[10:25]
+ tm.assert_index_equal(the_int, expected)
+ assert isinstance(the_int, DatetimeIndex)
+ assert the_int.freq == rng.freq
+
+ the_int = rng1.intersection(rng2.view(DatetimeIndex))
+ tm.assert_index_equal(the_int, expected)
+
+ # non-overlapping
+ the_int = rng[:10].intersection(rng[10:])
+ expected = DatetimeIndex([])
+ tm.assert_index_equal(the_int, expected)
+
+ def test_intersection_bug(self):
+ # GH #771
+ a = bdate_range('11/30/2011', '12/31/2011')
+ b = bdate_range('12/10/2011', '12/20/2011')
+ result = a.intersection(b)
+ tm.assert_index_equal(result, b)
+
+ def test_month_range_union_tz_pytz(self):
+ from pytz import timezone
+ tz = timezone('US/Eastern')
+
+ early_start = datetime(2011, 1, 1)
+ early_end = datetime(2011, 3, 1)
+
+ late_start = datetime(2011, 3, 1)
+ late_end = datetime(2011, 5, 1)
+
+ early_dr = date_range(start=early_start, end=early_end, tz=tz,
+ freq=MonthEnd())
+ late_dr = date_range(start=late_start, end=late_end, tz=tz,
+ freq=MonthEnd())
+
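+ # smoke test: the union of adjoining tz-aware monthly ranges
+ # should simply not raise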
+ early_dr.union(late_dr)
+
+ @td.skip_if_windows_python_3
+ def test_month_range_union_tz_dateutil(self):
+ from pandas._libs.tslibs.timezones import dateutil_gettz
+ tz = dateutil_gettz('US/Eastern')
+
+ early_start = datetime(2011, 1, 1)
+ early_end = datetime(2011, 3, 1)
+
+ late_start = datetime(2011, 3, 1)
+ late_end = datetime(2011, 5, 1)
+
+ early_dr = date_range(start=early_start, end=early_end, tz=tz,
+ freq=MonthEnd())
+ late_dr = date_range(start=late_start, end=late_end, tz=tz,
+ freq=MonthEnd())
+
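+ # smoke test: as above, but with a dateutil timezone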
+ early_dr.union(late_dr)
+
+
+class TestCustomDatetimeIndex(object):
+
+ def setup_method(self, method):
+ self.rng = bdate_range(START, END, freq='C')
+
+ def test_union(self):
+ # overlapping
+ left = self.rng[:10]
+ right = self.rng[5:10]
+
+ the_union = left.union(right)
+ assert isinstance(the_union, DatetimeIndex)
+
+ # non-overlapping, gap in middle
+ left = self.rng[:5]
+ right = self.rng[10:]
+
+ the_union = left.union(right)
+ assert isinstance(the_union, Index)
+
+ # non-overlapping, no gap
+ left = self.rng[:5]
+ right = self.rng[5:10]
+
+ the_union = left.union(right)
+ assert isinstance(the_union, DatetimeIndex)
+
+ # order does not matter
+ tm.assert_index_equal(right.union(left), the_union)
+
+ # overlapping, but different offset
+ rng = date_range(START, END, freq=BMonthEnd())
+
+ the_union = self.rng.union(rng)
+ assert isinstance(the_union, DatetimeIndex)
+
+ def test_outer_join(self):
+ # should just behave as union
+
+ # overlapping
+ left = self.rng[:10]
+ right = self.rng[5:10]
+
+ the_join = left.join(right, how='outer')
+ assert isinstance(the_join, DatetimeIndex)
+
+ # non-overlapping, gap in middle
+ left = self.rng[:5]
+ right = self.rng[10:]
+
+ the_join = left.join(right, how='outer')
+ assert isinstance(the_join, DatetimeIndex)
+ assert the_join.freq is None
+
+ # non-overlapping, no gap
+ left = self.rng[:5]
+ right = self.rng[5:10]
+
+ the_join = left.join(right, how='outer')
+ assert isinstance(the_join, DatetimeIndex)
+
+ # overlapping, but different offset
+ rng = date_range(START, END, freq=BMonthEnd())
+
+ the_join = self.rng.join(rng, how='outer')
+ assert isinstance(the_join, DatetimeIndex)
+ assert the_join.freq is None
+
+ def test_intersection_bug(self):
+ # GH #771
+ a = bdate_range('11/30/2011', '12/31/2011', freq='C')
+ b = bdate_range('12/10/2011', '12/20/2011', freq='C')
+ result = a.intersection(b)
+ tm.assert_index_equal(result, b)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_timezones.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_timezones.py
new file mode 100644
index 00000000000..8bcc9296cb0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_timezones.py
@@ -0,0 +1,1161 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for DatetimeIndex timezone-related methods
+"""
+from datetime import date, datetime, time, timedelta, tzinfo
+from distutils.version import LooseVersion
+
+import dateutil
+from dateutil.tz import gettz, tzlocal
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs.tslibs import conversion, timezones
+from pandas.compat import PY3, lrange, zip
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+ DatetimeIndex, Index, Timestamp, bdate_range, date_range, isna,
+ to_datetime)
+import pandas.util.testing as tm
+
+
+class FixedOffset(tzinfo):
+ """Fixed offset in minutes east from UTC."""
+
+ def __init__(self, offset, name):
+ self.__offset = timedelta(minutes=offset)
+ self.__name = name
+
+ def utcoffset(self, dt):
+ return self.__offset
+
+ def tzname(self, dt):
+ return self.__name
+
+ def dst(self, dt):
+ return timedelta(0)
+
+
+fixed_off = FixedOffset(-420, '-07:00')
+fixed_off_no_name = FixedOffset(-330, None)
+
+
+class TestDatetimeIndexTimezones(object):
+ # -------------------------------------------------------------
+ # DatetimeIndex.tz_convert
+ def test_tz_convert_nat(self):
+ # GH#5546
+ dates = [pd.NaT]
+ idx = DatetimeIndex(dates)
+ idx = idx.tz_localize('US/Pacific')
+ tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific'))
+ idx = idx.tz_convert('US/Eastern')
+ tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Eastern'))
+ idx = idx.tz_convert('UTC')
+ tm.assert_index_equal(idx, DatetimeIndex(dates, tz='UTC'))
+
+ dates = ['2010-12-01 00:00', '2010-12-02 00:00', pd.NaT]
+ idx = DatetimeIndex(dates)
+ idx = idx.tz_localize('US/Pacific')
+ tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific'))
+ idx = idx.tz_convert('US/Eastern')
+ expected = ['2010-12-01 03:00', '2010-12-02 03:00', pd.NaT]
+ tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern'))
+
+ idx = idx + pd.offsets.Hour(5)
+ expected = ['2010-12-01 08:00', '2010-12-02 08:00', pd.NaT]
+ tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern'))
+ idx = idx.tz_convert('US/Pacific')
+ expected = ['2010-12-01 05:00', '2010-12-02 05:00', pd.NaT]
+ tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific'))
+
+ idx = idx + np.timedelta64(3, 'h')
+ expected = ['2010-12-01 08:00', '2010-12-02 08:00', pd.NaT]
+ tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific'))
+
+ idx = idx.tz_convert('US/Eastern')
+ expected = ['2010-12-01 11:00', '2010-12-02 11:00', pd.NaT]
+ tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern'))
+
+ @pytest.mark.parametrize('prefix', ['', 'dateutil/'])
+ def test_dti_tz_convert_compat_timestamp(self, prefix):
+ strdates = ['1/1/2012', '3/1/2012', '4/1/2012']
+ idx = DatetimeIndex(strdates, tz=prefix + 'US/Eastern')
+
+ conv = idx[0].tz_convert(prefix + 'US/Pacific')
+ expected = idx.tz_convert(prefix + 'US/Pacific')[0]
+
+ assert conv == expected
+
+ def test_dti_tz_convert_hour_overflow_dst(self):
+ # Regression test for:
+ # https://github.com/pandas-dev/pandas/issues/13306
+
+ # sorted case US/Eastern -> UTC
+ ts = ['2008-05-12 09:50:00',
+ '2008-12-12 09:50:35',
+ '2009-05-12 09:50:32']
+ tt = DatetimeIndex(ts).tz_localize('US/Eastern')
+ ut = tt.tz_convert('UTC')
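+ # 09:50 Eastern is 13:50 UTC under EDT (UTC-4) but 14:50 UTC
+ # under EST (UTC-5), hence [13, 14, 13]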
+ expected = Index([13, 14, 13])
+ tm.assert_index_equal(ut.hour, expected)
+
+ # sorted case UTC -> US/Eastern
+ ts = ['2008-05-12 13:50:00',
+ '2008-12-12 14:50:35',
+ '2009-05-12 13:50:32']
+ tt = DatetimeIndex(ts).tz_localize('UTC')
+ ut = tt.tz_convert('US/Eastern')
+ expected = Index([9, 9, 9])
+ tm.assert_index_equal(ut.hour, expected)
+
+ # unsorted case US/Eastern -> UTC
+ ts = ['2008-05-12 09:50:00',
+ '2008-12-12 09:50:35',
+ '2008-05-12 09:50:32']
+ tt = DatetimeIndex(ts).tz_localize('US/Eastern')
+ ut = tt.tz_convert('UTC')
+ expected = Index([13, 14, 13])
+ tm.assert_index_equal(ut.hour, expected)
+
+ # unsorted case UTC -> US/Eastern
+ ts = ['2008-05-12 13:50:00',
+ '2008-12-12 14:50:35',
+ '2008-05-12 13:50:32']
+ tt = DatetimeIndex(ts).tz_localize('UTC')
+ ut = tt.tz_convert('US/Eastern')
+ expected = Index([9, 9, 9])
+ tm.assert_index_equal(ut.hour, expected)
+
+ @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz):
+ # Regression test for GH#13306
+
+ # sorted case US/Eastern -> UTC
+ ts = [Timestamp('2008-05-12 09:50:00', tz=tz),
+ Timestamp('2008-12-12 09:50:35', tz=tz),
+ Timestamp('2009-05-12 09:50:32', tz=tz)]
+ tt = DatetimeIndex(ts)
+ ut = tt.tz_convert('UTC')
+ expected = Index([13, 14, 13])
+ tm.assert_index_equal(ut.hour, expected)
+
+ # sorted case UTC -> US/Eastern
+ ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'),
+ Timestamp('2008-12-12 14:50:35', tz='UTC'),
+ Timestamp('2009-05-12 13:50:32', tz='UTC')]
+ tt = DatetimeIndex(ts)
+ ut = tt.tz_convert('US/Eastern')
+ expected = Index([9, 9, 9])
+ tm.assert_index_equal(ut.hour, expected)
+
+ # unsorted case US/Eastern -> UTC
+ ts = [Timestamp('2008-05-12 09:50:00', tz=tz),
+ Timestamp('2008-12-12 09:50:35', tz=tz),
+ Timestamp('2008-05-12 09:50:32', tz=tz)]
+ tt = DatetimeIndex(ts)
+ ut = tt.tz_convert('UTC')
+ expected = Index([13, 14, 13])
+ tm.assert_index_equal(ut.hour, expected)
+
+ # unsorted case UTC -> US/Eastern
+ ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'),
+ Timestamp('2008-12-12 14:50:35', tz='UTC'),
+ Timestamp('2008-05-12 13:50:32', tz='UTC')]
+ tt = DatetimeIndex(ts)
+ ut = tt.tz_convert('US/Eastern')
+ expected = Index([9, 9, 9])
+ tm.assert_index_equal(ut.hour, expected)
+
+ @pytest.mark.parametrize('freq, n', [('H', 1), ('T', 60), ('S', 3600)])
+ def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n):
+ # Regression test for tslib.tz_convert(vals, tz1, tz2).
+ # See https://github.com/pandas-dev/pandas/issues/4496 for details.
+ idx = date_range(datetime(2011, 3, 26, 23),
+ datetime(2011, 3, 27, 1), freq=freq)
+ idx = idx.tz_localize('UTC')
+ idx = idx.tz_convert('Europe/Moscow')
+
+ expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1]))
+ tm.assert_index_equal(idx.hour, Index(expected))
+
+ def test_dti_tz_convert_dst(self):
+ for freq, n in [('H', 1), ('T', 60), ('S', 3600)]:
+ # Start DST
+ idx = date_range('2014-03-08 23:00', '2014-03-09 09:00', freq=freq,
+ tz='UTC')
+ idx = idx.tz_convert('US/Eastern')
+ expected = np.repeat(np.array([18, 19, 20, 21, 22, 23,
+ 0, 1, 3, 4, 5]),
+ np.array([n, n, n, n, n, n, n, n, n, n, 1]))
+ tm.assert_index_equal(idx.hour, Index(expected))
+
+ idx = date_range('2014-03-08 18:00', '2014-03-09 05:00', freq=freq,
+ tz='US/Eastern')
+ idx = idx.tz_convert('UTC')
+ expected = np.repeat(np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+ np.array([n, n, n, n, n, n, n, n, n, n, 1]))
+ tm.assert_index_equal(idx.hour, Index(expected))
+
+ # End DST
+ idx = date_range('2014-11-01 23:00', '2014-11-02 09:00', freq=freq,
+ tz='UTC')
+ idx = idx.tz_convert('US/Eastern')
+ expected = np.repeat(np.array([19, 20, 21, 22, 23,
+ 0, 1, 1, 2, 3, 4]),
+ np.array([n, n, n, n, n, n, n, n, n, n, 1]))
+ tm.assert_index_equal(idx.hour, Index(expected))
+
+ idx = date_range('2014-11-01 18:00', '2014-11-02 05:00', freq=freq,
+ tz='US/Eastern')
+ idx = idx.tz_convert('UTC')
+ expected = np.repeat(np.array([22, 23, 0, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10]),
+ np.array([n, n, n, n, n, n, n, n, n,
+ n, n, n, 1]))
+ tm.assert_index_equal(idx.hour, Index(expected))
+
+ # daily
+ # Start DST
+ idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D',
+ tz='UTC')
+ idx = idx.tz_convert('US/Eastern')
+ tm.assert_index_equal(idx.hour, Index([19, 19]))
+
+ idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D',
+ tz='US/Eastern')
+ idx = idx.tz_convert('UTC')
+ tm.assert_index_equal(idx.hour, Index([5, 5]))
+
+ # End DST
+ idx = date_range('2014-11-01 00:00', '2014-11-02 00:00', freq='D',
+ tz='UTC')
+ idx = idx.tz_convert('US/Eastern')
+ tm.assert_index_equal(idx.hour, Index([20, 20]))
+
+ idx = date_range('2014-11-01 00:00', '2014-11-02 00:00', freq='D',
+ tz='US/Eastern')
+ idx = idx.tz_convert('UTC')
+ tm.assert_index_equal(idx.hour, Index([4, 4]))
+
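+ # Note on the expected arrays above: US/Eastern springs forward at
+ # 2014-03-09 02:00 (local hour 2 never occurs) and falls back at
+ # 2014-11-02 02:00 (local hour 1 occurs twice), which is why the hourly
+ # expectations skip 2 in March and repeat 1 in November.
+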
+ def test_tz_convert_roundtrip(self, tz_aware_fixture):
+ tz = tz_aware_fixture
+ idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M',
+ tz='UTC')
+ exp1 = date_range(start='2014-01-01', end='2014-12-31', freq='M')
+
+ idx2 = date_range(start='2014-01-01', end='2014-12-31', freq='D',
+ tz='UTC')
+ exp2 = date_range(start='2014-01-01', end='2014-12-31', freq='D')
+
+ idx3 = date_range(start='2014-01-01', end='2014-03-01', freq='H',
+ tz='UTC')
+ exp3 = date_range(start='2014-01-01', end='2014-03-01', freq='H')
+
+ idx4 = date_range(start='2014-08-01', end='2014-10-31', freq='T',
+ tz='UTC')
+ exp4 = date_range(start='2014-08-01', end='2014-10-31', freq='T')
+
+ for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3),
+ (idx4, exp4)]:
+ converted = idx.tz_convert(tz)
+ reset = converted.tz_convert(None)
+ tm.assert_index_equal(reset, expected)
+ assert reset.tzinfo is None
+ expected = converted.tz_convert('UTC').tz_localize(None)
+ tm.assert_index_equal(reset, expected)
+
+ def test_dti_tz_convert_tzlocal(self):
+ # GH#13583
+ # tz_convert does not change the internal (UTC) values
+ dti = date_range(start='2001-01-01', end='2001-03-01', tz='UTC')
+ dti2 = dti.tz_convert(dateutil.tz.tzlocal())
+ tm.assert_numpy_array_equal(dti2.asi8, dti.asi8)
+
+ dti = date_range(start='2001-01-01', end='2001-03-01',
+ tz=dateutil.tz.tzlocal())
+ dti2 = dti.tz_convert(None)
+ tm.assert_numpy_array_equal(dti2.asi8, dti.asi8)
+
+ @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern',
+ pytz.timezone('US/Eastern'),
+ gettz('US/Eastern')])
+ def test_dti_tz_convert_utc_to_local_no_modify(self, tz):
+ rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc')
+ rng_eastern = rng.tz_convert(tz)
+
+ # Values are unmodified
+ tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8)
+
+ assert timezones.tz_compare(rng_eastern.tz, timezones.maybe_get_tz(tz))
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_tz_convert_unsorted(self, tzstr):
+ dr = date_range('2012-03-09', freq='H', periods=100, tz='utc')
+ dr = dr.tz_convert(tzstr)
+
+ result = dr[::-1].hour
+ exp = dr.hour[::-1]
+ tm.assert_almost_equal(result, exp)
+
+ # -------------------------------------------------------------
+ # DatetimeIndex.tz_localize
+
+ def test_dti_tz_localize_nonexistent_raise_coerce(self):
+ # GH#13057
+ times = ['2015-03-08 01:00', '2015-03-08 02:00', '2015-03-08 03:00']
+ index = DatetimeIndex(times)
+ tz = 'US/Eastern'
+ with pytest.raises(pytz.NonExistentTimeError):
+ index.tz_localize(tz=tz)
+
+ with pytest.raises(pytz.NonExistentTimeError):
+ with tm.assert_produces_warning(FutureWarning):
+ index.tz_localize(tz=tz, errors='raise')
+
+ with tm.assert_produces_warning(FutureWarning,
+ clear=FutureWarning,
+ check_stacklevel=False):
+ result = index.tz_localize(tz=tz, errors='coerce')
+ test_times = ['2015-03-08 01:00-05:00', 'NaT',
+ '2015-03-08 03:00-04:00']
+ dti = to_datetime(test_times, utc=True)
+ expected = dti.tz_convert('US/Eastern')
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'),
+ gettz('US/Eastern')])
+ def test_dti_tz_localize_ambiguous_infer(self, tz):
+ # November 6, 2011, fall back, repeat 2 AM hour
+ # With no repeated hours, we cannot infer the transition
+ dr = date_range(datetime(2011, 11, 6, 0), periods=5,
+ freq=pd.offsets.Hour())
+ with pytest.raises(pytz.AmbiguousTimeError):
+ dr.tz_localize(tz)
+
+ # With repeated hours, we can infer the transition
+ dr = date_range(datetime(2011, 11, 6, 0), periods=5,
+ freq=pd.offsets.Hour(), tz=tz)
+ times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00',
+ '11/06/2011 02:00', '11/06/2011 03:00']
+ di = DatetimeIndex(times)
+ localized = di.tz_localize(tz, ambiguous='infer')
+ tm.assert_index_equal(dr, localized)
+ tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz,
+ ambiguous='infer'))
+
+ # When there is no dst transition, nothing special happens
+ dr = date_range(datetime(2011, 6, 1, 0), periods=10,
+ freq=pd.offsets.Hour())
+ localized = dr.tz_localize(tz)
+ localized_infer = dr.tz_localize(tz, ambiguous='infer')
+ tm.assert_index_equal(localized, localized_infer)
+
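+ # Note: ambiguous='infer' relies on the wall times being ordered; the
+ # first occurrence of a repeated hour is read as DST and the repeat as
+ # standard time. Inputs whose repeats cannot be ordered this way must
+ # pass explicit flags instead (see test_dti_tz_localize_ambiguous_flags
+ # below).
+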
+ @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'),
+ gettz('US/Eastern')])
+ def test_dti_tz_localize_ambiguous_times(self, tz):
+ # March 13, 2011, spring forward, skip from 2 AM to 3 AM
+ dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3,
+ freq=pd.offsets.Hour())
+ with pytest.raises(pytz.NonExistentTimeError):
+ dr.tz_localize(tz)
+
+ # after dst transition, it works
+ dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3,
+ freq=pd.offsets.Hour(), tz=tz)
+
+ # November 6, 2011, fall back, repeat 2 AM hour
+ dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3,
+ freq=pd.offsets.Hour())
+ with pytest.raises(pytz.AmbiguousTimeError):
+ dr.tz_localize(tz)
+
+ # UTC is OK
+ dr = date_range(datetime(2011, 3, 13), periods=48,
+ freq=pd.offsets.Minute(30), tz=pytz.utc)
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_dti_tz_localize_pass_dates_to_utc(self, tzstr):
+ strdates = ['1/1/2012', '3/1/2012', '4/1/2012']
+
+ idx = DatetimeIndex(strdates)
+ conv = idx.tz_localize(tzstr)
+
+ fromdates = DatetimeIndex(strdates, tz=tzstr)
+
+ assert conv.tz == fromdates.tz
+ tm.assert_numpy_array_equal(conv.values, fromdates.values)
+
+ @pytest.mark.parametrize('prefix', ['', 'dateutil/'])
+ def test_dti_tz_localize(self, prefix):
+ tzstr = prefix + 'US/Eastern'
+ dti = pd.date_range(start='1/1/2005', end='1/1/2005 0:00:30.256',
+ freq='L')
+ dti2 = dti.tz_localize(tzstr)
+
+ dti_utc = pd.date_range(start='1/1/2005 05:00',
+ end='1/1/2005 5:00:30.256', freq='L', tz='utc')
+
+ tm.assert_numpy_array_equal(dti2.values, dti_utc.values)
+
+ dti3 = dti2.tz_convert(prefix + 'US/Pacific')
+ tm.assert_numpy_array_equal(dti3.values, dti_utc.values)
+
+ dti = pd.date_range(start='11/6/2011 1:59', end='11/6/2011 2:00',
+ freq='L')
+ with pytest.raises(pytz.AmbiguousTimeError):
+ dti.tz_localize(tzstr)
+
+ dti = pd.date_range(start='3/13/2011 1:59', end='3/13/2011 2:00',
+ freq='L')
+ with pytest.raises(pytz.NonExistentTimeError):
+ dti.tz_localize(tzstr)
+
+ @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern',
+ pytz.timezone('US/Eastern'),
+ gettz('US/Eastern')])
+ def test_dti_tz_localize_utc_conversion(self, tz):
+ # Localizing to time zone should:
+ # 1) check for DST ambiguities
+ # 2) convert to UTC
+
+ rng = date_range('3/10/2012', '3/11/2012', freq='30T')
+
+ converted = rng.tz_localize(tz)
+ expected_naive = rng + pd.offsets.Hour(5)
+ tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8)
+
+ # 3/11/2012 crosses the spring-forward gap, so the skipped wall
+ # times are nonexistent and localization should fail
+ rng = date_range('3/11/2012', '3/12/2012', freq='30T')
+ with pytest.raises(pytz.NonExistentTimeError):
+ rng.tz_localize(tz)
+
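+ # A worked version of the arithmetic above (illustrative, not an
+ # upstream test): 2012-03-10 precedes the US spring-forward, so
+ # US/Eastern is UTC-5 and localizing a naive wall time t stores
+ # t + 5h as the UTC value:
+ #   >>> ts = Timestamp('2012-03-10 00:00').tz_localize('US/Eastern')
+ #   >>> ts.value == Timestamp('2012-03-10 05:00').value
+ #   True
+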
+ @pytest.mark.parametrize('idx', [
+ date_range(start='2014-01-01', end='2014-12-31', freq='M'),
+ date_range(start='2014-01-01', end='2014-12-31', freq='D'),
+ date_range(start='2014-01-01', end='2014-03-01', freq='H'),
+ date_range(start='2014-08-01', end='2014-10-31', freq='T')
+ ])
+ def test_dti_tz_localize_roundtrip(self, tz_aware_fixture, idx):
+ tz = tz_aware_fixture
+ localized = idx.tz_localize(tz)
+ expected = date_range(start=idx[0], end=idx[-1], freq=idx.freq,
+ tz=tz)
+ tm.assert_index_equal(localized, expected)
+ with pytest.raises(TypeError):
+ localized.tz_localize(tz)
+
+ reset = localized.tz_localize(None)
+ tm.assert_index_equal(reset, idx)
+ assert reset.tzinfo is None
+
+ def test_dti_tz_localize_naive(self):
+ rng = date_range('1/1/2011', periods=100, freq='H')
+
+ conv = rng.tz_localize('US/Pacific')
+ exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific')
+
+ tm.assert_index_equal(conv, exp)
+
+ def test_dti_tz_localize_tzlocal(self):
+ # GH#13583
+ offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1))
+ offset = int(offset.total_seconds() * 1000000000)
+
+ dti = date_range(start='2001-01-01', end='2001-03-01')
+ dti2 = dti.tz_localize(dateutil.tz.tzlocal())
+ tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8)
+
+ dti = date_range(start='2001-01-01', end='2001-03-01',
+ tz=dateutil.tz.tzlocal())
+ dti2 = dti.tz_localize(None)
+ tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8)
+
+ @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'),
+ gettz('US/Eastern')])
+ def test_dti_tz_localize_ambiguous_nat(self, tz):
+ times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00',
+ '11/06/2011 02:00', '11/06/2011 03:00']
+ di = DatetimeIndex(times)
+ localized = di.tz_localize(tz, ambiguous='NaT')
+
+ times = ['11/06/2011 00:00', np.NaN, np.NaN, '11/06/2011 02:00',
+ '11/06/2011 03:00']
+ di_test = DatetimeIndex(times, tz='US/Eastern')
+
+ # left dtype is datetime64[ns, US/Eastern]
+ # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')]
+ tm.assert_numpy_array_equal(di_test.values, localized.values)
+
+ @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'),
+ gettz('US/Eastern')])
+ def test_dti_tz_localize_ambiguous_flags(self, tz):
+ # November 6, 2011, fall back, repeat 2 AM hour
+
+ # Pass in flags to determine right dst transition
+ dr = date_range(datetime(2011, 11, 6, 0), periods=5,
+ freq=pd.offsets.Hour(), tz=tz)
+ times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00',
+ '11/06/2011 02:00', '11/06/2011 03:00']
+
+ # Test tz_localize
+ di = DatetimeIndex(times)
+ is_dst = [1, 1, 0, 0, 0]
+ localized = di.tz_localize(tz, ambiguous=is_dst)
+ tm.assert_index_equal(dr, localized)
+ tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz,
+ ambiguous=is_dst))
+
+ localized = di.tz_localize(tz, ambiguous=np.array(is_dst))
+ tm.assert_index_equal(dr, localized)
+
+ localized = di.tz_localize(tz,
+ ambiguous=np.array(is_dst).astype('bool'))
+ tm.assert_index_equal(dr, localized)
+
+ # Test constructor
+ localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst)
+ tm.assert_index_equal(dr, localized)
+
+ # Test duplicate times where inferring the dst fails
+ times += times
+ di = DatetimeIndex(times)
+
+ # When the sizes are incompatible, make sure error is raised
+ with pytest.raises(Exception):
+ di.tz_localize(tz, ambiguous=is_dst)
+
+ # When sizes are compatible and there are repeats ('infer' won't work)
+ is_dst = np.hstack((is_dst, is_dst))
+ localized = di.tz_localize(tz, ambiguous=is_dst)
+ dr = dr.append(dr)
+ tm.assert_index_equal(dr, localized)
+
+ # When there is no dst transition, nothing special happens
+ dr = date_range(datetime(2011, 6, 1, 0), periods=10,
+ freq=pd.offsets.Hour())
+ is_dst = np.array([1] * 10)
+ localized = dr.tz_localize(tz)
+ localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst)
+ tm.assert_index_equal(localized, localized_is_dst)
+
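+ # Note: the ambiguous= array above is applied element-wise, so only the
+ # entries that land on a repeated wall time matter; a truthy flag picks
+ # the DST (earlier, UTC-4) reading and a falsy flag the standard
+ # (later, UTC-5) one.
+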
+ # TODO: belongs outside tz_localize tests?
+ @pytest.mark.parametrize('tz', ['Europe/London', 'dateutil/Europe/London'])
+ def test_dti_construction_ambiguous_endpoint(self, tz):
+ # construction with an ambiguous end-point
+ # GH#11626
+
+ # FIXME: This next block fails to raise; it was taken from an older
+ # version of this test where an indentation mistake prevented it from
+ # being executed.
+ # with pytest.raises(pytz.AmbiguousTimeError):
+ # date_range("2013-10-26 23:00", "2013-10-27 01:00",
+ # tz="Europe/London", freq="H")
+
+ times = date_range("2013-10-26 23:00", "2013-10-27 01:00", freq="H",
+ tz=tz, ambiguous='infer')
+ assert times[0] == Timestamp('2013-10-26 23:00', tz=tz, freq="H")
+
+ if str(tz).startswith('dateutil'):
+ if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'):
+ # see GH#14621
+ assert times[-1] == Timestamp('2013-10-27 01:00:00+0000',
+ tz=tz, freq="H")
+ elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'):
+ # fixed ambiguous behavior
+ assert times[-1] == Timestamp('2013-10-27 01:00:00+0100',
+ tz=tz, freq="H")
+ else:
+ assert times[-1] == Timestamp('2013-10-27 01:00:00+0000',
+ tz=tz, freq="H")
+
+ def test_dti_tz_localize_bdate_range(self):
+ dr = pd.bdate_range('1/1/2009', '1/1/2010')
+ dr_utc = pd.bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc)
+ localized = dr.tz_localize(pytz.utc)
+ tm.assert_index_equal(dr_utc, localized)
+
+ @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw'])
+ @pytest.mark.parametrize('method, exp', [
+ ['NaT', pd.NaT],
+ ['raise', None],
+ ['foo', 'invalid']
+ ])
+ def test_dti_tz_localize_nonexistent(self, tz, method, exp):
+ # GH 8917
+ n = 60
+ dti = date_range(start='2015-03-29 02:00:00', periods=n, freq='min')
+ if method == 'raise':
+ with pytest.raises(pytz.NonExistentTimeError):
+ dti.tz_localize(tz, nonexistent=method)
+ elif exp == 'invalid':
+ with pytest.raises(ValueError):
+ dti.tz_localize(tz, nonexistent=method)
+ else:
+ result = dti.tz_localize(tz, nonexistent=method)
+ expected = DatetimeIndex([exp] * n, tz=tz)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('start_ts, tz, end_ts, shift', [
+ ['2015-03-29 02:20:00', 'Europe/Warsaw', '2015-03-29 03:00:00',
+ 'forward'],
+ ['2015-03-29 02:20:00', 'Europe/Warsaw',
+ '2015-03-29 01:59:59.999999999', 'backward'],
+ ['2015-03-29 02:20:00', 'Europe/Warsaw',
+ '2015-03-29 03:20:00', timedelta(hours=1)],
+ ['2015-03-29 02:20:00', 'Europe/Warsaw',
+ '2015-03-29 01:20:00', timedelta(hours=-1)],
+ ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:00:00',
+ 'forward'],
+ ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:59:59.999999999',
+ 'backward'],
+ ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:33:00',
+ timedelta(hours=1)],
+ ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:33:00',
+ timedelta(hours=-1)]
+ ])
+ @pytest.mark.parametrize('tz_type', ['', 'dateutil/'])
+ def test_dti_tz_localize_nonexistent_shift(self, start_ts, tz,
+ end_ts, shift,
+ tz_type):
+ # GH 8917
+ tz = tz_type + tz
+ if isinstance(shift, str):
+ shift = 'shift_' + shift
+ dti = DatetimeIndex([Timestamp(start_ts)])
+ result = dti.tz_localize(tz, nonexistent=shift)
+ expected = DatetimeIndex([Timestamp(end_ts)]).tz_localize(tz)
+ tm.assert_index_equal(result, expected)
+
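+ # Sketch of the shift semantics exercised above (illustrative): inside
+ # the spring-forward gap, 'shift_forward' snaps to the first instant
+ # after the gap, 'shift_backward' to the last instant before it, and a
+ # timedelta relocalizes at ts + delta, which must itself exist:
+ #   >>> Timestamp('2015-03-29 02:20:00').tz_localize(
+ #   ...     'Europe/Warsaw', nonexistent='shift_forward')
+ #   Timestamp('2015-03-29 03:00:00+0200', tz='Europe/Warsaw')
+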
+ @pytest.mark.parametrize('offset', [-1, 1])
+ @pytest.mark.parametrize('tz_type', ['', 'dateutil/'])
+ def test_dti_tz_localize_nonexistent_shift_invalid(self, offset, tz_type):
+ # GH 8917
+ tz = tz_type + 'Europe/Warsaw'
+ dti = DatetimeIndex([Timestamp('2015-03-29 02:20:00')])
+ msg = "The provided timedelta will relocalize on a nonexistent time"
+ with pytest.raises(ValueError, match=msg):
+ dti.tz_localize(tz, nonexistent=timedelta(seconds=offset))
+
+ @pytest.mark.filterwarnings('ignore::FutureWarning')
+ def test_dti_tz_localize_errors_deprecation(self):
+ # GH 22644
+ tz = 'Europe/Warsaw'
+ n = 60
+ dti = date_range(start='2015-03-29 02:00:00', periods=n, freq='min')
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ with pytest.raises(ValueError):
+ dti.tz_localize(tz, errors='foo')
+ # make sure errors='coerce' gets mapped correctly to nonexistent
+ result = dti.tz_localize(tz, errors='coerce')
+ expected = dti.tz_localize(tz, nonexistent='NaT')
+ tm.assert_index_equal(result, expected)
+
+ # -------------------------------------------------------------
+ # DatetimeIndex.normalize
+
+ def test_normalize_tz(self):
+ rng = date_range('1/1/2000 9:30', periods=10, freq='D',
+ tz='US/Eastern')
+
+ result = rng.normalize()
+ expected = date_range('1/1/2000', periods=10, freq='D',
+ tz='US/Eastern')
+ tm.assert_index_equal(result, expected)
+
+ assert result.is_normalized
+ assert not rng.is_normalized
+
+ rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz='UTC')
+
+ result = rng.normalize()
+ expected = date_range('1/1/2000', periods=10, freq='D', tz='UTC')
+ tm.assert_index_equal(result, expected)
+
+ assert result.is_normalized
+ assert not rng.is_normalized
+
+ rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz=tzlocal())
+ result = rng.normalize()
+ expected = date_range('1/1/2000', periods=10, freq='D', tz=tzlocal())
+ tm.assert_index_equal(result, expected)
+
+ assert result.is_normalized
+ assert not rng.is_normalized
+
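+ # Note: normalize() floors every element to midnight in the index's own
+ # time zone (the tz itself is preserved); is_normalized reports whether
+ # all elements already sit at local midnight.
+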
+ @td.skip_if_windows
+ @pytest.mark.parametrize('timezone', ['US/Pacific', 'US/Eastern', 'UTC',
+ 'Asia/Kolkata', 'Asia/Shanghai',
+ 'Australia/Canberra'])
+ def test_normalize_tz_local(self, timezone):
+ # GH#13459
+ with tm.set_timezone(timezone):
+ rng = date_range('1/1/2000 9:30', periods=10, freq='D',
+ tz=tzlocal())
+
+ result = rng.normalize()
+ expected = date_range('1/1/2000', periods=10, freq='D',
+ tz=tzlocal())
+ tm.assert_index_equal(result, expected)
+
+ assert result.is_normalized
+ assert not rng.is_normalized
+
+ # ------------------------------------------------------------
+ # DatetimeIndex.__new__
+
+ @pytest.mark.parametrize('prefix', ['', 'dateutil/'])
+ def test_dti_constructor_static_tzinfo(self, prefix):
+ # it works!
+ index = DatetimeIndex([datetime(2012, 1, 1)], tz=prefix + 'EST')
+ index.hour
+ index[0]
+
+ def test_dti_constructor_with_fixed_tz(self):
+ off = FixedOffset(420, '+07:00')
+ start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off)
+ end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off)
+ rng = date_range(start=start, end=end)
+ assert off == rng.tz
+
+ rng2 = date_range(start, periods=len(rng), tz=off)
+ tm.assert_index_equal(rng, rng2)
+
+ rng3 = date_range('3/11/2012 05:00:00+07:00',
+ '6/11/2012 05:00:00+07:00')
+ assert (rng.values == rng3.values).all()
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_dti_convert_datetime_list(self, tzstr):
+ dr = date_range('2012-06-02', periods=10,
+ tz=tzstr, name='foo')
+ dr2 = DatetimeIndex(list(dr), name='foo')
+ tm.assert_index_equal(dr, dr2)
+ assert dr.tz == dr2.tz
+ assert dr2.name == 'foo'
+
+ def test_dti_construction_univalent(self):
+ rng = date_range('03/12/2012 00:00', periods=10, freq='W-FRI',
+ tz='US/Eastern')
+ rng2 = DatetimeIndex(data=rng, tz='US/Eastern')
+ tm.assert_index_equal(rng, rng2)
+
+ @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'),
+ gettz('US/Eastern')])
+ def test_dti_from_tzaware_datetime(self, tz):
+ d = [datetime(2012, 8, 19, tzinfo=tz)]
+
+ index = DatetimeIndex(d)
+ assert timezones.tz_compare(index.tz, tz)
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_dti_tz_constructors(self, tzstr):
+ """ Test different DatetimeIndex constructions with timezone
+ Follow-up of GH#4229
+ """
+
+ arr = ['11/10/2005 08:00:00', '11/10/2005 09:00:00']
+
+ idx1 = to_datetime(arr).tz_localize(tzstr)
+ idx2 = pd.date_range(start="2005-11-10 08:00:00", freq='H', periods=2,
+ tz=tzstr)
+ idx3 = DatetimeIndex(arr, tz=tzstr)
+ idx4 = DatetimeIndex(np.array(arr), tz=tzstr)
+
+ for other in [idx2, idx3, idx4]:
+ tm.assert_index_equal(idx1, other)
+
+ # -------------------------------------------------------------
+ # Unsorted
+
+ def test_join_utc_convert(self, join_type):
+ rng = date_range('1/1/2011', periods=100, freq='H', tz='utc')
+
+ left = rng.tz_convert('US/Eastern')
+ right = rng.tz_convert('Europe/Berlin')
+
+ result = left.join(left[:-5], how=join_type)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz == left.tz
+
+ result = left.join(right[:-5], how=join_type)
+ assert isinstance(result, DatetimeIndex)
+ assert result.tz.zone == 'UTC'
+
+ @pytest.mark.parametrize("dtype", [
+ None, 'datetime64[ns, CET]',
+ 'datetime64[ns, EST]', 'datetime64[ns, UTC]'
+ ])
+ def test_date_accessor(self, dtype):
+ # Regression test for GH#21230
+ expected = np.array([date(2018, 6, 4), pd.NaT])
+
+ index = DatetimeIndex(['2018-06-04 10:00:00', pd.NaT], dtype=dtype)
+ result = index.date
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize("dtype", [
+ None, 'datetime64[ns, CET]',
+ 'datetime64[ns, EST]', 'datetime64[ns, UTC]'
+ ])
+ def test_time_accessor(self, dtype):
+ # Regression test for GH#21267
+ expected = np.array([time(10, 20, 30), pd.NaT])
+
+ index = DatetimeIndex(['2018-06-04 10:20:30', pd.NaT], dtype=dtype)
+ result = index.time
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_timetz_accessor(self, tz_naive_fixture):
+ # GH21358
+ tz = timezones.maybe_get_tz(tz_naive_fixture)
+
+ expected = np.array([time(10, 20, 30, tzinfo=tz), pd.NaT])
+
+ index = DatetimeIndex(['2018-06-04 10:20:30', pd.NaT], tz=tz)
+ result = index.timetz
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_dti_drop_dont_lose_tz(self):
+ # GH#2621
+ ind = date_range("2012-12-01", periods=10, tz="utc")
+ ind = ind.drop(ind[-1])
+
+ assert ind.tz is not None
+
+ def test_drop_dst_boundary(self):
+ # see gh-18031
+ tz = "Europe/Brussels"
+ freq = "15min"
+
+ start = pd.Timestamp("201710290100", tz=tz)
+ end = pd.Timestamp("201710290300", tz=tz)
+ index = pd.date_range(start=start, end=end, freq=freq)
+
+ expected = DatetimeIndex(["201710290115", "201710290130",
+ "201710290145", "201710290200",
+ "201710290215", "201710290230",
+ "201710290245", "201710290200",
+ "201710290215", "201710290230",
+ "201710290245", "201710290300"],
+ tz=tz, freq=freq,
+ ambiguous=[True, True, True, True,
+ True, True, True, False,
+ False, False, False, False])
+ result = index.drop(index[0])
+ tm.assert_index_equal(result, expected)
+
+ def test_date_range_localize(self):
+ rng = date_range('3/11/2012 03:00', periods=15, freq='H',
+ tz='US/Eastern')
+ rng2 = DatetimeIndex(['3/11/2012 03:00', '3/11/2012 04:00'],
+ tz='US/Eastern')
+ rng3 = date_range('3/11/2012 03:00', periods=15, freq='H')
+ rng3 = rng3.tz_localize('US/Eastern')
+
+ tm.assert_index_equal(rng, rng3)
+
+ # DST transition time
+ val = rng[0]
+ exp = Timestamp('3/11/2012 03:00', tz='US/Eastern')
+
+ assert val.hour == 3
+ assert exp.hour == 3
+ assert val == exp # same UTC value
+ tm.assert_index_equal(rng[:2], rng2)
+
+ # Right before the DST transition
+ rng = date_range('3/11/2012 00:00', periods=2, freq='H',
+ tz='US/Eastern')
+ rng2 = DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'],
+ tz='US/Eastern')
+ tm.assert_index_equal(rng, rng2)
+ exp = Timestamp('3/11/2012 00:00', tz='US/Eastern')
+ assert exp.hour == 0
+ assert rng[0] == exp
+ exp = Timestamp('3/11/2012 01:00', tz='US/Eastern')
+ assert exp.hour == 1
+ assert rng[1] == exp
+
+ rng = date_range('3/11/2012 00:00', periods=10, freq='H',
+ tz='US/Eastern')
+ assert rng[2].hour == 3
+
+ def test_timestamp_equality_different_timezones(self):
+ utc_range = date_range('1/1/2000', periods=20, tz='UTC')
+ eastern_range = utc_range.tz_convert('US/Eastern')
+ berlin_range = utc_range.tz_convert('Europe/Berlin')
+
+ for a, b, c in zip(utc_range, eastern_range, berlin_range):
+ assert a == b
+ assert b == c
+ assert a == c
+
+ assert (utc_range == eastern_range).all()
+ assert (utc_range == berlin_range).all()
+ assert (berlin_range == eastern_range).all()
+
+ def test_dti_intersection(self):
+ rng = date_range('1/1/2011', periods=100, freq='H', tz='utc')
+
+ left = rng[10:90][::-1]
+ right = rng[20:80][::-1]
+
+ assert left.tz == rng.tz
+ result = left.intersection(right)
+ assert result.tz == left.tz
+
+ def test_dti_equals_with_tz(self):
+ left = date_range('1/1/2011', periods=100, freq='H', tz='utc')
+ right = date_range('1/1/2011', periods=100, freq='H', tz='US/Eastern')
+
+ assert not left.equals(right)
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_dti_tz_nat(self, tzstr):
+ idx = DatetimeIndex([Timestamp("2013-1-1", tz=tzstr), pd.NaT])
+
+ assert isna(idx[1])
+ assert idx[0].tzinfo is not None
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_dti_astype_asobject_tzinfos(self, tzstr):
+ # GH#1345
+
+ # dates around a dst transition
+ rng = date_range('2/13/2010', '5/6/2010', tz=tzstr)
+
+ objs = rng.astype(object)
+ for i, x in enumerate(objs):
+ exval = rng[i]
+ assert x == exval
+ assert x.tzinfo == exval.tzinfo
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_dti_with_timezone_repr(self, tzstr):
+ rng = date_range('4/13/2010', '5/6/2010')
+
+ rng_eastern = rng.tz_localize(tzstr)
+
+ rng_repr = repr(rng_eastern)
+ assert '2010-04-13 00:00:00' in rng_repr
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_dti_take_dont_lose_meta(self, tzstr):
+ rng = date_range('1/1/2000', periods=20, tz=tzstr)
+
+ result = rng.take(lrange(5))
+ assert result.tz == rng.tz
+ assert result.freq == rng.freq
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_utc_box_timestamp_and_localize(self, tzstr):
+ tz = timezones.maybe_get_tz(tzstr)
+
+ rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc')
+ rng_eastern = rng.tz_convert(tzstr)
+
+ expected = rng[-1].astimezone(tz)
+
+ stamp = rng_eastern[-1]
+ assert stamp == expected
+ assert stamp.tzinfo == expected.tzinfo
+
+ # right tzinfo
+ rng = date_range('3/13/2012', '3/14/2012', freq='H', tz='utc')
+ rng_eastern = rng.tz_convert(tzstr)
+ # test not valid for dateutil timezones.
+ # assert 'EDT' in repr(rng_eastern[0].tzinfo)
+ assert ('EDT' in repr(rng_eastern[0].tzinfo) or
+ 'tzfile' in repr(rng_eastern[0].tzinfo))
+
+ def test_dti_to_pydatetime(self):
+ dt = dateutil.parser.parse('2012-06-13T01:39:00Z')
+ dt = dt.replace(tzinfo=tzlocal())
+
+ arr = np.array([dt], dtype=object)
+
+ result = to_datetime(arr, utc=True)
+ assert result.tz is pytz.utc
+
+ rng = date_range('2012-11-03 03:00', '2012-11-05 03:00', tz=tzlocal())
+ arr = rng.to_pydatetime()
+ result = to_datetime(arr, utc=True)
+ assert result.tz is pytz.utc
+
+ def test_dti_to_pydatetime_fixedtz(self):
+ dates = np.array([datetime(2000, 1, 1, tzinfo=fixed_off),
+ datetime(2000, 1, 2, tzinfo=fixed_off),
+ datetime(2000, 1, 3, tzinfo=fixed_off)])
+ dti = DatetimeIndex(dates)
+
+ result = dti.to_pydatetime()
+ tm.assert_numpy_array_equal(dates, result)
+
+ result = dti._mpl_repr()
+ tm.assert_numpy_array_equal(dates, result)
+
+ @pytest.mark.parametrize('tz', [pytz.timezone('US/Central'),
+ gettz('US/Central')])
+ def test_with_tz(self, tz):
+ # just want it to work
+ start = datetime(2011, 3, 12, tzinfo=pytz.utc)
+ dr = bdate_range(start, periods=50, freq=pd.offsets.Hour())
+ assert dr.tz is pytz.utc
+
+ # DateRange with naive datetimes
+ dr = bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc)
+ dr = bdate_range('1/1/2005', '1/1/2009', tz=tz)
+
+ # normalized
+ central = dr.tz_convert(tz)
+ assert central.tz is tz
+ naive = central[0].to_pydatetime().replace(tzinfo=None)
+ comp = conversion.localize_pydatetime(naive, tz).tzinfo
+ assert central[0].tz is comp
+
+ # compare vs a localized tz
+ naive = dr[0].to_pydatetime().replace(tzinfo=None)
+ comp = conversion.localize_pydatetime(naive, tz).tzinfo
+ assert central[0].tz is comp
+
+ # datetimes with tzinfo set
+ dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc),
+ datetime(2009, 1, 1, tzinfo=pytz.utc))
+ with pytest.raises(Exception):
+ bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), '1/1/2009',
+ tz=tz)
+
+ @pytest.mark.parametrize('prefix', ['', 'dateutil/'])
+ def test_field_access_localize(self, prefix):
+ strdates = ['1/1/2012', '3/1/2012', '4/1/2012']
+ rng = DatetimeIndex(strdates, tz=prefix + 'US/Eastern')
+ assert (rng.hour == 0).all()
+
+ # a more unusual time zone, #1946
+ dr = date_range('2011-10-02 00:00', freq='h', periods=10,
+ tz=prefix + 'America/Atikokan')
+
+ expected = Index(np.arange(10, dtype=np.int64))
+ tm.assert_index_equal(dr.hour, expected)
+
+ @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'),
+ gettz('US/Eastern')])
+ def test_dti_convert_tz_aware_datetime_datetime(self, tz):
+ # GH#1581
+ dates = [datetime(2000, 1, 1), datetime(2000, 1, 2),
+ datetime(2000, 1, 3)]
+
+ dates_aware = [conversion.localize_pydatetime(x, tz) for x in dates]
+ result = DatetimeIndex(dates_aware)
+ assert timezones.tz_compare(result.tz, tz)
+
+ converted = to_datetime(dates_aware, utc=True)
+ ex_vals = np.array([Timestamp(x).value for x in dates_aware])
+ tm.assert_numpy_array_equal(converted.asi8, ex_vals)
+ assert converted.tz is pytz.utc
+
+ def test_dti_union_aware(self):
+ # non-overlapping
+ rng = date_range("2012-11-15 00:00:00", periods=6, freq="H",
+ tz="US/Central")
+
+ rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H",
+ tz="US/Eastern")
+
+ result = rng.union(rng2)
+ assert result.tz.zone == 'UTC'
+
+ @pytest.mark.parametrize('tz', [None, 'UTC', "US/Central",
+ dateutil.tz.tzoffset(None, -28800)])
+ @pytest.mark.usefixtures("datetime_tz_utc")
+ @pytest.mark.skipif(not PY3, reason="datetime.timezone not in PY2")
+ def test_iteration_preserves_nanoseconds(self, tz):
+ # GH 19603
+ index = DatetimeIndex(["2018-02-08 15:00:00.168456358",
+ "2018-02-08 15:00:00.168456359"], tz=tz)
+ for i, ts in enumerate(index):
+ assert ts == index[i]
+
+
+class TestDateRange(object):
+ """Tests for date_range with timezones"""
+ def test_hongkong_tz_convert(self):
+ # GH#1673 smoke test
+ dr = date_range('2012-01-01', '2012-01-10', freq='D', tz='Hongkong')
+
+ # it works!
+ dr.hour
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_date_range_span_dst_transition(self, tzstr):
+ # GH#1778
+
+ # Standard -> Daylight Savings Time
+ dr = date_range('03/06/2012 00:00', periods=200, freq='W-FRI',
+ tz='US/Eastern')
+
+ assert (dr.hour == 0).all()
+
+ dr = date_range('2012-11-02', periods=10, tz=tzstr)
+ result = dr.hour
+ expected = Index([0] * 10)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_date_range_timezone_str_argument(self, tzstr):
+ tz = timezones.maybe_get_tz(tzstr)
+ result = date_range('1/1/2000', periods=10, tz=tzstr)
+ expected = date_range('1/1/2000', periods=10, tz=tz)
+
+ tm.assert_index_equal(result, expected)
+
+ def test_date_range_with_fixedoffset_noname(self):
+ off = fixed_off_no_name
+ start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off)
+ end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off)
+ rng = date_range(start=start, end=end)
+ assert off == rng.tz
+
+ idx = Index([start, end])
+ assert off == idx.tz
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_date_range_with_tz(self, tzstr):
+ stamp = Timestamp('3/11/2012 05:00', tz=tzstr)
+ assert stamp.hour == 5
+
+ rng = date_range('3/11/2012 04:00', periods=10, freq='H',
+ tz=tzstr)
+
+ assert stamp == rng[1]
+
+
+class TestToDatetime(object):
+ """Tests for the to_datetime constructor with timezones"""
+ def test_to_datetime_utc(self):
+ arr = np.array([dateutil.parser.parse('2012-06-13T01:39:00Z')],
+ dtype=object)
+
+ result = to_datetime(arr, utc=True)
+ assert result.tz is pytz.utc
+
+ def test_to_datetime_fixed_offset(self):
+ dates = [datetime(2000, 1, 1, tzinfo=fixed_off),
+ datetime(2000, 1, 2, tzinfo=fixed_off),
+ datetime(2000, 1, 3, tzinfo=fixed_off)]
+ result = to_datetime(dates)
+ assert result.tz == fixed_off
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_tools.py b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_tools.py
new file mode 100644
index 00000000000..bec2fa66c43
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/datetimes/test_tools.py
@@ -0,0 +1,1841 @@
+""" test to_datetime """
+
+import calendar
+from datetime import datetime, time
+from distutils.version import LooseVersion
+import locale
+
+import dateutil
+from dateutil.parser import parse
+from dateutil.tz.tz import tzoffset
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs import tslib
+from pandas._libs.tslibs import iNaT, parsing
+from pandas.compat import PY3, lmap
+from pandas.errors import OutOfBoundsDatetime
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.common import is_datetime64_ns_dtype
+
+import pandas as pd
+from pandas import (
+ DataFrame, DatetimeIndex, Index, NaT, Series, Timestamp, compat,
+ date_range, isna, to_datetime)
+from pandas.core.arrays import DatetimeArray
+from pandas.core.tools import datetimes as tools
+from pandas.util import testing as tm
+from pandas.util.testing import assert_series_equal
+
+
+class TestTimeConversionFormats(object):
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_format(self, cache):
+ values = ['1/1/2000', '1/2/2000', '1/3/2000']
+
+ results1 = [Timestamp('20000101'), Timestamp('20000201'),
+ Timestamp('20000301')]
+ results2 = [Timestamp('20000101'), Timestamp('20000102'),
+ Timestamp('20000103')]
+ for vals, expecteds in [(values, (Index(results1), Index(results2))),
+ (Series(values),
+ (Series(results1), Series(results2))),
+ (values[0], (results1[0], results2[0])),
+ (values[1], (results1[1], results2[1])),
+ (values[2], (results1[2], results2[2]))]:
+
+ for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']):
+ result = to_datetime(vals, format=fmt, cache=cache)
+ expected = expecteds[i]
+
+ if isinstance(expected, Series):
+ assert_series_equal(result, Series(expected))
+ elif isinstance(expected, Timestamp):
+ assert result == expected
+ else:
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_format_YYYYMMDD(self, cache):
+ s = Series([19801222, 19801222] + [19810105] * 5)
+ expected = Series([Timestamp(x) for x in s.apply(str)])
+
+ result = to_datetime(s, format='%Y%m%d', cache=cache)
+ assert_series_equal(result, expected)
+
+ result = to_datetime(s.apply(str), format='%Y%m%d', cache=cache)
+ assert_series_equal(result, expected)
+
+ # with NaT
+ expected = Series([Timestamp("19801222"), Timestamp("19801222")] +
+ [Timestamp("19810105")] * 5)
+ expected[2] = np.nan
+ s[2] = np.nan
+
+ result = to_datetime(s, format='%Y%m%d', cache=cache)
+ assert_series_equal(result, expected)
+
+ # string with NaT
+ s = s.apply(str)
+ s[2] = 'nat'
+ result = to_datetime(s, format='%Y%m%d', cache=cache)
+ assert_series_equal(result, expected)
+
+ # error handling: ignore and coerce
+ # GH 7930
+ s = Series([20121231, 20141231, 99991231])
+ result = pd.to_datetime(s, format='%Y%m%d', errors='ignore',
+ cache=cache)
+ expected = Series([datetime(2012, 12, 31),
+ datetime(2014, 12, 31), datetime(9999, 12, 31)],
+ dtype=object)
+ tm.assert_series_equal(result, expected)
+
+ result = pd.to_datetime(s, format='%Y%m%d', errors='coerce',
+ cache=cache)
+ expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]')
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_format_integer(self, cache):
+ # GH 10178
+ s = Series([2000, 2001, 2002])
+ expected = Series([Timestamp(x) for x in s.apply(str)])
+
+ result = to_datetime(s, format='%Y', cache=cache)
+ assert_series_equal(result, expected)
+
+ s = Series([200001, 200105, 200206])
+ expected = Series([Timestamp(x[:4] + '-' + x[4:])
+ for x in s.apply(str)])
+
+ result = to_datetime(s, format='%Y%m', cache=cache)
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_format_microsecond(self, cache):
+
+ # these are locale dependent
+ lang, _ = locale.getlocale()
+ month_abbr = calendar.month_abbr[4]
+ val = '01-{}-2011 00:00:01.978'.format(month_abbr)
+
+ format = '%d-%b-%Y %H:%M:%S.%f'
+ result = to_datetime(val, format=format, cache=cache)
+ exp = datetime.strptime(val, format)
+ assert result == exp
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_format_time(self, cache):
+ data = [
+ ['01/10/2010 15:20', '%m/%d/%Y %H:%M',
+ Timestamp('2010-01-10 15:20')],
+ ['01/10/2010 05:43', '%m/%d/%Y %I:%M',
+ Timestamp('2010-01-10 05:43')],
+ ['01/10/2010 13:56:01', '%m/%d/%Y %H:%M:%S',
+ Timestamp('2010-01-10 13:56:01')] # ,
+ # ['01/10/2010 08:14 PM', '%m/%d/%Y %I:%M %p',
+ # Timestamp('2010-01-10 20:14')],
+ # ['01/10/2010 07:40 AM', '%m/%d/%Y %I:%M %p',
+ # Timestamp('2010-01-10 07:40')],
+ # ['01/10/2010 09:12:56 AM', '%m/%d/%Y %I:%M:%S %p',
+ # Timestamp('2010-01-10 09:12:56')]
+ ]
+ for s, format, dt in data:
+ assert to_datetime(s, format=format, cache=cache) == dt
+
+ @td.skip_if_has_locale
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_with_non_exact(self, cache):
+ # GH 10834
+ # 8904
+ # exact kw
+ s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00',
+ '19MAY11 00:00:00Z'])
+ result = to_datetime(s, format='%d%b%y', exact=False, cache=cache)
+ expected = to_datetime(s.str.extract(r'(\d+\w+\d+)', expand=False),
+ format='%d%b%y', cache=cache)
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_parse_nanoseconds_with_formula(self, cache):
+
+ # GH8989
+ # truncating the nanoseconds when a format was provided
+ for v in ["2012-01-01 09:00:00.000000001",
+ "2012-01-01 09:00:00.000001",
+ "2012-01-01 09:00:00.001",
+ "2012-01-01 09:00:00.001000",
+ "2012-01-01 09:00:00.001000000", ]:
+ expected = pd.to_datetime(v, cache=cache)
+ result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f",
+ cache=cache)
+ assert result == expected
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_format_weeks(self, cache):
+ data = [
+ ['2009324', '%Y%W%w', Timestamp('2009-08-13')],
+ ['2013020', '%Y%U%w', Timestamp('2013-01-13')]
+ ]
+ for s, format, dt in data:
+ assert to_datetime(s, format=format, cache=cache) == dt
+
+ @pytest.mark.parametrize("box,const", [
+ [True, pd.Index],
+ [False, np.array]])
+ @pytest.mark.parametrize("fmt,dates,expected_dates", [
+ ['%Y-%m-%d %H:%M:%S %Z',
+ ['2010-01-01 12:00:00 UTC'] * 2,
+ [pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2],
+ ['%Y-%m-%d %H:%M:%S %Z',
+ ['2010-01-01 12:00:00 UTC',
+ '2010-01-01 12:00:00 GMT',
+ '2010-01-01 12:00:00 US/Pacific'],
+ [pd.Timestamp('2010-01-01 12:00:00', tz='UTC'),
+ pd.Timestamp('2010-01-01 12:00:00', tz='GMT'),
+ pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]],
+ ['%Y-%m-%d %H:%M:%S%z',
+ ['2010-01-01 12:00:00+0100'] * 2,
+ [pd.Timestamp('2010-01-01 12:00:00',
+ tzinfo=pytz.FixedOffset(60))] * 2],
+ ['%Y-%m-%d %H:%M:%S %z',
+ ['2010-01-01 12:00:00 +0100'] * 2,
+ [pd.Timestamp('2010-01-01 12:00:00',
+ tzinfo=pytz.FixedOffset(60))] * 2],
+ ['%Y-%m-%d %H:%M:%S %z',
+ ['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'],
+ [pd.Timestamp('2010-01-01 12:00:00',
+ tzinfo=pytz.FixedOffset(60)),
+ pd.Timestamp('2010-01-01 12:00:00',
+ tzinfo=pytz.FixedOffset(-60))]],
+ ['%Y-%m-%d %H:%M:%S %z',
+ ['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'],
+ [pd.Timestamp('2010-01-01 12:00:00',
+ tzinfo=pytz.FixedOffset(0)), # pytz coerces to UTC
+ pd.Timestamp('2010-01-01 12:00:00',
+ tzinfo=pytz.FixedOffset(0))]]])
+ def test_to_datetime_parse_tzname_or_tzoffset(self, box, const,
+ fmt, dates, expected_dates):
+ # GH 13486
+ result = pd.to_datetime(dates, format=fmt, box=box)
+ expected = const(expected_dates)
+ tm.assert_equal(result, expected)
+
+ with pytest.raises(ValueError):
+ pd.to_datetime(dates, format=fmt, box=box, utc=True)
+
+ @pytest.mark.parametrize('offset', [
+ '+0', '-1foo', 'UTCbar', ':10', '+01:000:01', ''])
+ def test_to_datetime_parse_timezone_malformed(self, offset):
+ fmt = '%Y-%m-%d %H:%M:%S %z'
+ date = '2010-01-01 12:00:00 ' + offset
+ with pytest.raises(ValueError):
+ pd.to_datetime([date], format=fmt)
+
+ def test_to_datetime_parse_timezone_keeps_name(self):
+ # GH 21697
+ fmt = '%Y-%m-%d %H:%M:%S %z'
+ arg = pd.Index(['2010-01-01 12:00:00 Z'], name='foo')
+ result = pd.to_datetime(arg, format=fmt)
+ expected = pd.DatetimeIndex(['2010-01-01 12:00:00'], tz='UTC',
+ name='foo')
+ tm.assert_index_equal(result, expected)
+
+
+class TestToDatetime(object):
+ @pytest.mark.parametrize('tz', [None, 'US/Central'])
+ def test_to_datetime_dtarr(self, tz):
+ # DatetimeArray
+ dti = date_range('1965-04-03', periods=19, freq='2W', tz=tz)
+ arr = DatetimeArray(dti)
+
+ result = to_datetime(arr)
+ assert result is arr
+
+ result = to_datetime(arr, box=True)
+ assert result is arr
+
+ def test_to_datetime_pydatetime(self):
+ actual = pd.to_datetime(datetime(2008, 1, 15))
+ assert actual == datetime(2008, 1, 15)
+
+ def test_to_datetime_YYYYMMDD(self):
+ actual = pd.to_datetime('20080115')
+ assert actual == datetime(2008, 1, 15)
+
+ def test_to_datetime_unparseable_ignore(self):
+ # unparseable
+ s = 'Month 1, 1999'
+ assert pd.to_datetime(s, errors='ignore') == s
+
+ @td.skip_if_windows # `tm.set_timezone` does not work in windows
+ def test_to_datetime_now(self):
+ # See GH#18666
+ with tm.set_timezone('US/Eastern'):
+ npnow = np.datetime64('now').astype('datetime64[ns]')
+ pdnow = pd.to_datetime('now')
+ pdnow2 = pd.to_datetime(['now'])[0]
+
+ # These should all be equal with infinite perf; this gives
+ # a generous margin of 10 seconds
+ assert abs(pdnow.value - npnow.astype(np.int64)) < 1e10
+ assert abs(pdnow2.value - npnow.astype(np.int64)) < 1e10
+
+ assert pdnow.tzinfo is None
+ assert pdnow2.tzinfo is None
+
+ @td.skip_if_windows # `tm.set_timezone` does not work in windows
+ def test_to_datetime_today(self):
+ # See GH#18666
+ # Test with one timezone far ahead of UTC and another far behind, so
+ # one of these will _almost_ always be in a different day from UTC.
+ # Unfortunately, between 12 and 1 AM Samoa time both of these time
+ # zones _and_ UTC will all be in the same day, so this test will not
+ # detect the regression introduced in #18666.
+ with tm.set_timezone('Pacific/Auckland'): # 12-13 hours ahead of UTC
+ nptoday = np.datetime64('today')\
+ .astype('datetime64[ns]').astype(np.int64)
+ pdtoday = pd.to_datetime('today')
+ pdtoday2 = pd.to_datetime(['today'])[0]
+
+ tstoday = pd.Timestamp('today')
+ tstoday2 = pd.Timestamp.today()
+
+ # These should all be equal with infinite perf; this gives
+ # a generous margin of 10 seconds
+ assert abs(pdtoday.normalize().value - nptoday) < 1e10
+ assert abs(pdtoday2.normalize().value - nptoday) < 1e10
+ assert abs(pdtoday.value - tstoday.value) < 1e10
+ assert abs(pdtoday.value - tstoday2.value) < 1e10
+
+ assert pdtoday.tzinfo is None
+ assert pdtoday2.tzinfo is None
+
+ with tm.set_timezone('US/Samoa'): # 11 hours behind UTC
+ nptoday = np.datetime64('today')\
+ .astype('datetime64[ns]').astype(np.int64)
+ pdtoday = pd.to_datetime('today')
+ pdtoday2 = pd.to_datetime(['today'])[0]
+
+ # These should all be equal with infinite perf; this gives
+ # a generous margin of 10 seconds
+ assert abs(pdtoday.normalize().value - nptoday) < 1e10
+ assert abs(pdtoday2.normalize().value - nptoday) < 1e10
+
+ assert pdtoday.tzinfo is None
+ assert pdtoday2.tzinfo is None
+
+ def test_to_datetime_today_now_unicode_bytes(self):
+ to_datetime([u'now'])
+ to_datetime([u'today'])
+ if not PY3:
+ to_datetime(['now'])
+ to_datetime(['today'])
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_dt64s(self, cache):
+ in_bound_dts = [
+ np.datetime64('2000-01-01'),
+ np.datetime64('2000-01-02'),
+ ]
+
+ for dt in in_bound_dts:
+ assert pd.to_datetime(dt, cache=cache) == Timestamp(dt)
+
+ oob_dts = [np.datetime64('1000-01-01'), np.datetime64('5000-01-02'), ]
+
+ for dt in oob_dts:
+ pytest.raises(ValueError, pd.to_datetime, dt, errors='raise')
+ pytest.raises(ValueError, Timestamp, dt)
+ assert pd.to_datetime(dt, errors='coerce', cache=cache) is NaT
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_array_of_dt64s(self, cache):
+ dts = [np.datetime64('2000-01-01'), np.datetime64('2000-01-02'), ]
+
+ # Assuming all datetimes are in bounds, to_datetime() returns
+ # an array that is equal to Timestamp() parsing
+ tm.assert_numpy_array_equal(
+ pd.to_datetime(dts, box=False, cache=cache),
+ np.array([Timestamp(x).asm8 for x in dts])
+ )
+
+ # A list of datetimes where the last one is out of bounds
+ dts_with_oob = dts + [np.datetime64('9999-01-01')]
+
+ pytest.raises(ValueError, pd.to_datetime, dts_with_oob,
+ errors='raise')
+
+ tm.assert_numpy_array_equal(
+ pd.to_datetime(dts_with_oob, box=False, errors='coerce',
+ cache=cache),
+ np.array(
+ [
+ Timestamp(dts_with_oob[0]).asm8,
+ Timestamp(dts_with_oob[1]).asm8,
+ tslib.iNaT,
+ ],
+ dtype='M8'
+ )
+ )
+
+ # With errors='ignore', out of bounds datetime64s
+ # are converted to their .item(), which depending on the version of
+ # numpy is either a python datetime.datetime or datetime.date
+ tm.assert_numpy_array_equal(
+ pd.to_datetime(dts_with_oob, box=False, errors='ignore',
+ cache=cache),
+ np.array(
+ [dt.item() for dt in dts_with_oob],
+ dtype='O'
+ )
+ )
+
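+ # Note: "out of bounds" here means outside the span representable as
+ # int64 nanoseconds since the epoch, roughly 1677-09-21 to 2262-04-11;
+ # errors='coerce' maps such values to NaT while errors='ignore' hands
+ # back the raw inputs, e.g.
+ #   >>> pd.to_datetime(np.datetime64('9999-01-01'), errors='coerce')
+ #   NaT
+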
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_tz(self, cache):
+
+ # xref 8260
+ # uniform returns a DatetimeIndex
+ arr = [pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
+ pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]
+ result = pd.to_datetime(arr, cache=cache)
+ expected = DatetimeIndex(
+ ['2013-01-01 13:00:00', '2013-01-02 14:00:00'], tz='US/Pacific')
+ tm.assert_index_equal(result, expected)
+
+ # mixed tzs will raise
+ arr = [pd.Timestamp('2013-01-01 13:00:00', tz='US/Pacific'),
+ pd.Timestamp('2013-01-02 14:00:00', tz='US/Eastern')]
+ pytest.raises(ValueError, lambda: pd.to_datetime(arr, cache=cache))
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_tz_pytz(self, cache):
+ # see gh-8260
+ us_eastern = pytz.timezone('US/Eastern')
+ arr = np.array([us_eastern.localize(datetime(year=2000, month=1, day=1,
+ hour=3, minute=0)),
+ us_eastern.localize(datetime(year=2000, month=6, day=1,
+ hour=3, minute=0))],
+ dtype=object)
+ result = pd.to_datetime(arr, utc=True, cache=cache)
+ expected = DatetimeIndex(['2000-01-01 08:00:00+00:00',
+ '2000-06-01 07:00:00+00:00'],
+ dtype='datetime64[ns, UTC]', freq=None)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ @pytest.mark.parametrize("init_constructor, end_constructor, test_method",
+ [(Index, DatetimeIndex, tm.assert_index_equal),
+ (list, DatetimeIndex, tm.assert_index_equal),
+ (np.array, DatetimeIndex, tm.assert_index_equal),
+ (Series, Series, tm.assert_series_equal)])
+ def test_to_datetime_utc_true(self,
+ cache,
+ init_constructor,
+ end_constructor,
+ test_method):
+ # See gh-11934 & gh-6415
+ data = ['20100102 121314', '20100102 121315']
+ expected_data = [pd.Timestamp('2010-01-02 12:13:14', tz='utc'),
+ pd.Timestamp('2010-01-02 12:13:15', tz='utc')]
+
+ result = pd.to_datetime(init_constructor(data),
+ format='%Y%m%d %H%M%S',
+ utc=True,
+ cache=cache)
+ expected = end_constructor(expected_data)
+ test_method(result, expected)
+
+ # Test scalar case as well
+ for scalar, expected in zip(data, expected_data):
+ result = pd.to_datetime(scalar, format='%Y%m%d %H%M%S', utc=True,
+ cache=cache)
+ assert result == expected
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_utc_true_with_series_single_value(self, cache):
+ # GH 15760 UTC=True with Series
+ ts = 1.5e18
+ result = pd.to_datetime(pd.Series([ts]), utc=True, cache=cache)
+ expected = pd.Series([pd.Timestamp(ts, tz='utc')])
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_utc_true_with_series_tzaware_string(self, cache):
+ ts = '2013-01-01 00:00:00-01:00'
+ expected_ts = '2013-01-01 01:00:00'
+ data = pd.Series([ts] * 3)
+ result = pd.to_datetime(data, utc=True, cache=cache)
+ expected = pd.Series([pd.Timestamp(expected_ts, tz='utc')] * 3)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ @pytest.mark.parametrize('date, dtype',
+ [('2013-01-01 01:00:00', 'datetime64[ns]'),
+ ('2013-01-01 01:00:00', 'datetime64[ns, UTC]')])
+ def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date,
+ dtype):
+ expected = pd.Series([pd.Timestamp('2013-01-01 01:00:00', tz='UTC')])
+ result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True,
+ cache=cache)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_tz_psycopg2(self, cache):
+
+ # xref 8260
+ try:
+ import psycopg2
+ except ImportError:
+ pytest.skip("no psycopg2 installed")
+
+ # misc cases
+ tz1 = psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)
+ tz2 = psycopg2.tz.FixedOffsetTimezone(offset=-240, name=None)
+ arr = np.array([datetime(2000, 1, 1, 3, 0, tzinfo=tz1),
+ datetime(2000, 6, 1, 3, 0, tzinfo=tz2)],
+ dtype=object)
+
+ result = pd.to_datetime(arr, errors='coerce', utc=True, cache=cache)
+ expected = DatetimeIndex(['2000-01-01 08:00:00+00:00',
+ '2000-06-01 07:00:00+00:00'],
+ dtype='datetime64[ns, UTC]', freq=None)
+ tm.assert_index_equal(result, expected)
+
+ # dtype coercion
+ i = pd.DatetimeIndex([
+ '2000-01-01 08:00:00'
+ ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None))
+ assert is_datetime64_ns_dtype(i)
+
+ # tz coercion
+ result = pd.to_datetime(i, errors='coerce', cache=cache)
+ tm.assert_index_equal(result, i)
+
+ result = pd.to_datetime(i, errors='coerce', utc=True, cache=cache)
+ expected = pd.DatetimeIndex(['2000-01-01 13:00:00'],
+ dtype='datetime64[ns, UTC]')
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ 'cache',
+ [pytest.param(True,
+ marks=pytest.mark.skipif(True, reason="GH 18111")),
+ False])
+ def test_datetime_bool(self, cache):
+ # GH13176
+ with pytest.raises(TypeError):
+ to_datetime(False)
+ assert to_datetime(False, errors="coerce", cache=cache) is NaT
+ assert to_datetime(False, errors="ignore", cache=cache) is False
+ with pytest.raises(TypeError):
+ to_datetime(True)
+ assert to_datetime(True, errors="coerce", cache=cache) is NaT
+ assert to_datetime(True, errors="ignore", cache=cache) is True
+ with pytest.raises(TypeError):
+ to_datetime([False, datetime.today()], cache=cache)
+ with pytest.raises(TypeError):
+ to_datetime(['20130101', True], cache=cache)
+ tm.assert_index_equal(to_datetime([0, False, NaT, 0.0],
+ errors="coerce", cache=cache),
+ DatetimeIndex([to_datetime(0, cache=cache),
+ NaT,
+ NaT,
+ to_datetime(0, cache=cache)]))
+
+ def test_datetime_invalid_datatype(self):
+ # GH13176
+
+ with pytest.raises(TypeError):
+ pd.to_datetime(bool)
+ with pytest.raises(TypeError):
+ pd.to_datetime(pd.to_datetime)
+
+ @pytest.mark.parametrize('value', ["a", "00:01:99"])
+ @pytest.mark.parametrize('infer', [True, False])
+ @pytest.mark.parametrize('format', [None, 'H%:M%:S%'])
+ def test_datetime_invalid_scalar(self, value, format, infer):
+ # GH24763
+ res = pd.to_datetime(value, errors='ignore', format=format,
+ infer_datetime_format=infer)
+ assert res == value
+
+ res = pd.to_datetime(value, errors='coerce', format=format,
+ infer_datetime_format=infer)
+ assert res is pd.NaT
+
+ with pytest.raises(ValueError):
+ pd.to_datetime(value, errors='raise', format=format,
+ infer_datetime_format=infer)
+
+ @pytest.mark.parametrize('value', ["3000/12/11 00:00:00"])
+ @pytest.mark.parametrize('infer', [True, False])
+ @pytest.mark.parametrize('format', [None, 'H%:M%:S%'])
+ def test_datetime_outofbounds_scalar(self, value, format, infer):
+ # GH24763
+ res = pd.to_datetime(value, errors='ignore', format=format,
+ infer_datetime_format=infer)
+ assert res == value
+
+ res = pd.to_datetime(value, errors='coerce', format=format,
+ infer_datetime_format=infer)
+ assert res is pd.NaT
+
+ if format is not None:
+ with pytest.raises(ValueError):
+ pd.to_datetime(value, errors='raise', format=format,
+ infer_datetime_format=infer)
+ else:
+ with pytest.raises(OutOfBoundsDatetime):
+ pd.to_datetime(value, errors='raise', format=format,
+ infer_datetime_format=infer)
+
+ @pytest.mark.parametrize('values', [["a"], ["00:01:99"],
+ ["a", "b", "99:00:00"]])
+ @pytest.mark.parametrize('infer', [True, False])
+ @pytest.mark.parametrize('format', [None, 'H%:M%:S%'])
+ def test_datetime_invalid_index(self, values, format, infer):
+ # GH24763
+ res = pd.to_datetime(values, errors='ignore', format=format,
+ infer_datetime_format=infer)
+ tm.assert_index_equal(res, pd.Index(values))
+
+ res = pd.to_datetime(values, errors='coerce', format=format,
+ infer_datetime_format=infer)
+ tm.assert_index_equal(res, pd.DatetimeIndex([pd.NaT] * len(values)))
+
+ with pytest.raises(ValueError):
+ pd.to_datetime(values, errors='raise', format=format,
+ infer_datetime_format=infer)
+
+ @pytest.mark.parametrize("utc", [True, None])
+ @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None])
+ @pytest.mark.parametrize("box", [True, False])
+ @pytest.mark.parametrize("constructor", [list, tuple, np.array, pd.Index])
+ def test_to_datetime_cache(self, utc, format, box, constructor):
+ date = '20130101 00:00:00'
+ test_dates = [date] * 10**5
+ data = constructor(test_dates)
+ result = pd.to_datetime(data, utc=utc, format=format, box=box,
+ cache=True)
+ expected = pd.to_datetime(data, utc=utc, format=format, box=box,
+ cache=False)
+ if box:
+ tm.assert_index_equal(result, expected)
+ else:
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize("utc", [True, None])
+ @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None])
+ def test_to_datetime_cache_series(self, utc, format):
+ date = '20130101 00:00:00'
+ test_dates = [date] * 10**5
+ data = pd.Series(test_dates)
+ result = pd.to_datetime(data, utc=utc, format=format, cache=True)
+ expected = pd.to_datetime(data, utc=utc, format=format, cache=False)
+ tm.assert_series_equal(result, expected)
+
+ def test_to_datetime_cache_scalar(self):
+ date = '20130101 00:00:00'
+ result = pd.to_datetime(date, cache=True)
+ expected = pd.Timestamp('20130101 00:00:00')
+ assert result == expected
+
+ @pytest.mark.parametrize('date, format',
+ [('2017-20', '%Y-%W'),
+ ('20 Sunday', '%W %A'),
+ ('20 Sun', '%W %a'),
+ ('2017-21', '%Y-%U'),
+ ('20 Sunday', '%U %A'),
+ ('20 Sun', '%U %a')])
+ def test_week_without_day_and_calendar_year(self, date, format):
+ # GH16774
+
+ msg = "Cannot use '%W' or '%U' without day and year"
+ with pytest.raises(ValueError, match=msg):
+ pd.to_datetime(date, format=format)
+
+ def test_iso_8601_strings_with_same_offset(self):
+ # GH 17697, 11736
+ ts_str = "2015-11-18 15:30:00+05:30"
+ result = to_datetime(ts_str)
+ expected = Timestamp(ts_str)
+ assert result == expected
+
+ expected = DatetimeIndex([Timestamp(ts_str)] * 2)
+ result = to_datetime([ts_str] * 2)
+ tm.assert_index_equal(result, expected)
+
+ result = DatetimeIndex([ts_str] * 2)
+ tm.assert_index_equal(result, expected)
+
+ def test_iso_8601_strings_same_offset_no_box(self):
+ # GH 22446
+ data = ['2018-01-04 09:01:00+09:00', '2018-01-04 09:02:00+09:00']
+ result = pd.to_datetime(data, box=False)
+ expected = np.array([
+ datetime(2018, 1, 4, 9, 1, tzinfo=pytz.FixedOffset(540)),
+ datetime(2018, 1, 4, 9, 2, tzinfo=pytz.FixedOffset(540))
+ ],
+ dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_iso_8601_strings_with_different_offsets(self):
+ # GH 17697, 11736
+ ts_strings = ["2015-11-18 15:30:00+05:30",
+ "2015-11-18 16:30:00+06:30",
+ NaT]
+ result = to_datetime(ts_strings)
+ expected = np.array([datetime(2015, 11, 18, 15, 30,
+ tzinfo=tzoffset(None, 19800)),
+ datetime(2015, 11, 18, 16, 30,
+ tzinfo=tzoffset(None, 23400)),
+ NaT],
+ dtype=object)
+ # GH 21864
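+        # mixed UTC offsets cannot be represented by a single tz-aware
+        # DatetimeIndex, so the result is an object-dtype Index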
+ expected = Index(expected)
+ tm.assert_index_equal(result, expected)
+
+ result = to_datetime(ts_strings, utc=True)
+ expected = DatetimeIndex([Timestamp(2015, 11, 18, 10),
+ Timestamp(2015, 11, 18, 10),
+ NaT], tz='UTC')
+ tm.assert_index_equal(result, expected)
+
+ def test_non_iso_strings_with_tz_offset(self):
+ result = to_datetime(['March 1, 2018 12:00:00+0400'] * 2)
+ expected = DatetimeIndex([datetime(2018, 3, 1, 12,
+ tzinfo=pytz.FixedOffset(240))] * 2)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('ts, expected', [
+ (Timestamp('2018-01-01'),
+ Timestamp('2018-01-01', tz='UTC')),
+ (Timestamp('2018-01-01', tz='US/Pacific'),
+ Timestamp('2018-01-01 08:00', tz='UTC'))])
+ def test_timestamp_utc_true(self, ts, expected):
+ # GH 24415
+ result = to_datetime(ts, utc=True)
+ assert result == expected
+
+
+class TestToDatetimeUnit(object):
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_unit(self, cache):
+ # GH 11758
+        # test proper behavior with errors
+
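+        # unit and format are mutually exclusive; passing both raises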
+ with pytest.raises(ValueError):
+ to_datetime([1], unit='D', format='%Y%m%d', cache=cache)
+
+ values = [11111111, 1, 1.0, iNaT, NaT, np.nan,
+ 'NaT', '']
+ result = to_datetime(values, unit='D', errors='ignore', cache=cache)
+ expected = Index([11111111, Timestamp('1970-01-02'),
+ Timestamp('1970-01-02'), NaT,
+ NaT, NaT, NaT, NaT],
+ dtype=object)
+ tm.assert_index_equal(result, expected)
+
+ result = to_datetime(values, unit='D', errors='coerce', cache=cache)
+ expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02',
+ 'NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(tslib.OutOfBoundsDatetime):
+ to_datetime(values, unit='D', errors='raise', cache=cache)
+
+ values = [1420043460000, iNaT, NaT, np.nan, 'NaT']
+
+ result = to_datetime(values, errors='ignore', unit='s', cache=cache)
+ expected = Index([1420043460000, NaT, NaT,
+ NaT, NaT], dtype=object)
+ tm.assert_index_equal(result, expected)
+
+ result = to_datetime(values, errors='coerce', unit='s', cache=cache)
+ expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(tslib.OutOfBoundsDatetime):
+ to_datetime(values, errors='raise', unit='s', cache=cache)
+
+        # for a string or Timestamp input we raise a ValueError
+        # and NOT an OutOfBoundsDatetime
+ for val in ['foo', Timestamp('20130101')]:
+ try:
+ to_datetime(val, errors='raise', unit='s', cache=cache)
+ except tslib.OutOfBoundsDatetime:
+ raise AssertionError("incorrect exception raised")
+ except ValueError:
+ pass
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_unit_consistency(self, cache):
+
+ # consistency of conversions
+ expected = Timestamp('1970-05-09 14:25:11')
+ result = pd.to_datetime(11111111, unit='s', errors='raise',
+ cache=cache)
+ assert result == expected
+ assert isinstance(result, Timestamp)
+
+ result = pd.to_datetime(11111111, unit='s', errors='coerce',
+ cache=cache)
+ assert result == expected
+ assert isinstance(result, Timestamp)
+
+ result = pd.to_datetime(11111111, unit='s', errors='ignore',
+ cache=cache)
+ assert result == expected
+ assert isinstance(result, Timestamp)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_unit_with_numeric(self, cache):
+
+ # GH 13180
+ # coercions from floats/ints are ok
+ expected = DatetimeIndex(['2015-06-19 05:33:20',
+ '2015-05-27 22:33:20'])
+ arr1 = [1.434692e+18, 1.432766e+18]
+ arr2 = np.array(arr1).astype('int64')
+ for errors in ['ignore', 'raise', 'coerce']:
+ result = pd.to_datetime(arr1, errors=errors, cache=cache)
+ tm.assert_index_equal(result, expected)
+
+ result = pd.to_datetime(arr2, errors=errors, cache=cache)
+ tm.assert_index_equal(result, expected)
+
+        # but we want to make sure that we are coercing
+        # if we have mixed strings and numeric values
+ expected = DatetimeIndex(['NaT',
+ '2015-06-19 05:33:20',
+ '2015-05-27 22:33:20'])
+ arr = ['foo', 1.434692e+18, 1.432766e+18]
+ result = pd.to_datetime(arr, errors='coerce', cache=cache)
+ tm.assert_index_equal(result, expected)
+
+ expected = DatetimeIndex(['2015-06-19 05:33:20',
+ '2015-05-27 22:33:20',
+ 'NaT',
+ 'NaT'])
+ arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT']
+ result = pd.to_datetime(arr, errors='coerce', cache=cache)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_unit_mixed(self, cache):
+
+ # mixed integers/datetimes
+ expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT'])
+ arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18]
+ result = pd.to_datetime(arr, errors='coerce', cache=cache)
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(ValueError):
+ pd.to_datetime(arr, errors='raise', cache=cache)
+
+ expected = DatetimeIndex(['NaT',
+ 'NaT',
+ '2013-01-01'])
+ arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')]
+ result = pd.to_datetime(arr, errors='coerce', cache=cache)
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(ValueError):
+ pd.to_datetime(arr, errors='raise', cache=cache)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_unit_rounding(self, cache):
+ # GH 14156: argument will incur floating point errors but no
+ # premature rounding
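+        # note: 1434743731.8770001 s is not exactly representable as a
+        # float64; the closest representable value corresponds to
+        # ...877000093 ns, which should be preserved rather than rounded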
+ result = pd.to_datetime(1434743731.8770001, unit='s', cache=cache)
+ expected = pd.Timestamp('2015-06-19 19:55:31.877000093')
+ assert result == expected
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_unit_ignore_keeps_name(self, cache):
+ # GH 21697
+ expected = pd.Index([15e9] * 2, name='name')
+ result = pd.to_datetime(expected, errors='ignore', box=True, unit='s',
+ cache=cache)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_dataframe(self, cache):
+
+ df = DataFrame({'year': [2015, 2016],
+ 'month': [2, 3],
+ 'day': [4, 5],
+ 'hour': [6, 7],
+ 'minute': [58, 59],
+ 'second': [10, 11],
+ 'ms': [1, 1],
+ 'us': [2, 2],
+ 'ns': [3, 3]})
+
+ result = to_datetime({'year': df['year'],
+ 'month': df['month'],
+ 'day': df['day']}, cache=cache)
+ expected = Series([Timestamp('20150204 00:00:00'),
+                           Timestamp('20160305 00:00:00')])
+ assert_series_equal(result, expected)
+
+ # dict-like
+ result = to_datetime(df[['year', 'month', 'day']].to_dict(),
+ cache=cache)
+ assert_series_equal(result, expected)
+
+        # dict-like input with a scalar value broadcast across rows
+ df2 = df[['year', 'month', 'day']].to_dict()
+ df2['month'] = 2
+ result = to_datetime(df2, cache=cache)
+ expected2 = Series([Timestamp('20150204 00:00:00'),
+                            Timestamp('20160205 00:00:00')])
+ assert_series_equal(result, expected2)
+
+ # unit mappings
+ units = [{'year': 'years',
+ 'month': 'months',
+ 'day': 'days',
+ 'hour': 'hours',
+ 'minute': 'minutes',
+ 'second': 'seconds'},
+ {'year': 'year',
+ 'month': 'month',
+ 'day': 'day',
+ 'hour': 'hour',
+ 'minute': 'minute',
+ 'second': 'second'},
+ ]
+
+ for d in units:
+ result = to_datetime(df[list(d.keys())].rename(columns=d),
+ cache=cache)
+ expected = Series([Timestamp('20150204 06:58:10'),
+ Timestamp('20160305 07:59:11')])
+ assert_series_equal(result, expected)
+
+ d = {'year': 'year',
+ 'month': 'month',
+ 'day': 'day',
+ 'hour': 'hour',
+ 'minute': 'minute',
+ 'second': 'second',
+ 'ms': 'ms',
+ 'us': 'us',
+ 'ns': 'ns'}
+
+ result = to_datetime(df.rename(columns=d), cache=cache)
+ expected = Series([Timestamp('20150204 06:58:10.001002003'),
+ Timestamp('20160305 07:59:11.001002003')])
+ assert_series_equal(result, expected)
+
+        # string columns are coerced back to numeric internally
+ result = to_datetime(df.astype(str), cache=cache)
+ assert_series_equal(result, expected)
+
+ # passing coerce
+ df2 = DataFrame({'year': [2015, 2016],
+ 'month': [2, 20],
+ 'day': [4, 5]})
+
+ msg = ("cannot assemble the datetimes: time data .+ does not "
+ r"match format '%Y%m%d' \(match\)")
+ with pytest.raises(ValueError, match=msg):
+ to_datetime(df2, cache=cache)
+ result = to_datetime(df2, errors='coerce', cache=cache)
+ expected = Series([Timestamp('20150204 00:00:00'),
+ NaT])
+ assert_series_equal(result, expected)
+
+ # extra columns
+ msg = ("extra keys have been passed to the datetime assemblage: "
+ r"\[foo\]")
+        df2 = df.copy()
+        df2['foo'] = 1
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(df2, cache=cache)
+
+        # not enough fields to assemble a date
+ msg = (r'to assemble mappings requires at least that \[year, month, '
+ r'day\] be specified: \[.+\] is missing')
+ for c in [['year'],
+ ['year', 'month'],
+ ['year', 'month', 'second'],
+ ['month', 'day'],
+ ['year', 'day', 'second']]:
+ with pytest.raises(ValueError, match=msg):
+ to_datetime(df[c], cache=cache)
+
+ # duplicates
+ msg = 'cannot assemble with duplicate keys'
+ df2 = DataFrame({'year': [2015, 2016],
+ 'month': [2, 20],
+ 'day': [4, 5]})
+ df2.columns = ['year', 'year', 'day']
+ with pytest.raises(ValueError, match=msg):
+ to_datetime(df2, cache=cache)
+
+ df2 = DataFrame({'year': [2015, 2016],
+ 'month': [2, 20],
+ 'day': [4, 5],
+ 'hour': [4, 5]})
+ df2.columns = ['year', 'month', 'day', 'day']
+ with pytest.raises(ValueError, match=msg):
+ to_datetime(df2, cache=cache)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_dataframe_dtypes(self, cache):
+ # #13451
+ df = DataFrame({'year': [2015, 2016],
+ 'month': [2, 3],
+ 'day': [4, 5]})
+
+ # int16
+ result = to_datetime(df.astype('int16'), cache=cache)
+ expected = Series([Timestamp('20150204 00:00:00'),
+ Timestamp('20160305 00:00:00')])
+ assert_series_equal(result, expected)
+
+ # mixed dtypes
+ df['month'] = df['month'].astype('int8')
+ df['day'] = df['day'].astype('int8')
+ result = to_datetime(df, cache=cache)
+ expected = Series([Timestamp('20150204 00:00:00'),
+ Timestamp('20160305 00:00:00')])
+ assert_series_equal(result, expected)
+
+ # float
+ df = DataFrame({'year': [2000, 2001],
+ 'month': [1.5, 1],
+ 'day': [1, 1]})
+ with pytest.raises(ValueError):
+ to_datetime(df, cache=cache)
+
+ def test_dataframe_box_false(self):
+ # GH 23760
+ df = pd.DataFrame({'year': [2015, 2016],
+ 'month': [2, 3],
+ 'day': [4, 5]})
+ result = pd.to_datetime(df, box=False)
+ expected = np.array(['2015-02-04', '2016-03-05'],
+ dtype='datetime64[ns]')
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_dataframe_utc_true(self):
+ # GH 23760
+ df = pd.DataFrame({'year': [2015, 2016],
+ 'month': [2, 3],
+ 'day': [4, 5]})
+ result = pd.to_datetime(df, utc=True)
+ expected = pd.Series(np.array(['2015-02-04', '2016-03-05'],
+ dtype='datetime64[ns]')).dt.tz_localize('UTC')
+ tm.assert_series_equal(result, expected)
+
+ def test_to_datetime_errors_ignore_utc_true(self):
+ # GH 23758
+ result = pd.to_datetime([1], unit='s', box=True, utc=True,
+ errors='ignore')
+ expected = DatetimeIndex(['1970-01-01 00:00:01'], tz='UTC')
+ tm.assert_index_equal(result, expected)
+
+
+class TestToDatetimeMisc(object):
+ def test_to_datetime_barely_out_of_bounds(self):
+ # GH#19529
+ # GH#19382 close enough to bounds that dropping nanos would result
+ # in an in-bounds datetime
+ arr = np.array(['2262-04-11 23:47:16.854775808'], dtype=object)
+
+ with pytest.raises(OutOfBoundsDatetime):
+ to_datetime(arr)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_iso8601(self, cache):
+ result = to_datetime(["2012-01-01 00:00:00"], cache=cache)
+ exp = Timestamp("2012-01-01 00:00:00")
+ assert result[0] == exp
+
+ result = to_datetime(['20121001'], cache=cache) # bad iso 8601
+ exp = Timestamp('2012-10-01')
+ assert result[0] == exp
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_default(self, cache):
+ rs = to_datetime('2001', cache=cache)
+ xp = datetime(2001, 1, 1)
+ assert rs == xp
+
+ # dayfirst is essentially broken
+
+ # to_datetime('01-13-2012', dayfirst=True)
+ # pytest.raises(ValueError, to_datetime('01-13-2012',
+ # dayfirst=True))
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_on_datetime64_series(self, cache):
+ # #2699
+ s = Series(date_range('1/1/2000', periods=10))
+
+ result = to_datetime(s, cache=cache)
+ assert result[0] == s[0]
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_with_space_in_series(self, cache):
+ # GH 6428
+ s = Series(['10/18/2006', '10/18/2008', ' '])
+ pytest.raises(ValueError, lambda: to_datetime(s,
+ errors='raise',
+ cache=cache))
+ result_coerce = to_datetime(s, errors='coerce', cache=cache)
+ expected_coerce = Series([datetime(2006, 10, 18),
+ datetime(2008, 10, 18),
+ NaT])
+ tm.assert_series_equal(result_coerce, expected_coerce)
+ result_ignore = to_datetime(s, errors='ignore', cache=cache)
+ tm.assert_series_equal(result_ignore, s)
+
+ @td.skip_if_has_locale
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_with_apply(self, cache):
+        # this is tested only with US/None locales
+ # GH 5195
+ # with a format and coerce a single item to_datetime fails
+        ser = Series(['May 04', 'Jun 02', 'Dec 11'], index=[1, 2, 3])
+        expected = pd.to_datetime(ser, format='%b %y', cache=cache)
+        result = ser.apply(pd.to_datetime, format='%b %y', cache=cache)
+        assert_series_equal(result, expected)
+
+        ser = pd.Series(['May 04', 'Jun 02', ''], index=[1, 2, 3])
+        pytest.raises(ValueError,
+                      lambda: pd.to_datetime(ser, format='%b %y',
+                                             errors='raise',
+                                             cache=cache))
+        pytest.raises(ValueError,
+                      lambda: ser.apply(pd.to_datetime, format='%b %y',
+                                        errors='raise', cache=cache))
+        expected = pd.to_datetime(ser, format='%b %y', errors='coerce',
+                                  cache=cache)
+
+        result = ser.apply(
+            lambda x: pd.to_datetime(x, format='%b %y', errors='coerce',
+                                     cache=cache))
+        assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_types(self, cache):
+
+ # empty string
+ result = to_datetime('', cache=cache)
+ assert result is NaT
+
+ result = to_datetime(['', ''], cache=cache)
+ assert isna(result).all()
+
+ # ints
+ result = Timestamp(0)
+ expected = to_datetime(0, cache=cache)
+ assert result == expected
+
+ # GH 3888 (strings)
+ expected = to_datetime(['2012'], cache=cache)[0]
+ result = to_datetime('2012', cache=cache)
+ assert result == expected
+
+ # array = ['2012','20120101','20120101 12:01:01']
+ array = ['20120101', '20120101 12:01:01']
+ expected = list(to_datetime(array, cache=cache))
+ result = lmap(Timestamp, array)
+ tm.assert_almost_equal(result, expected)
+
+ # currently fails ###
+ # result = Timestamp('2012')
+ # expected = to_datetime('2012')
+ # assert result == expected
+
+ @pytest.mark.parametrize('cache', [True, False])
+ @pytest.mark.parametrize('box, klass', [
+ [True, Index],
+ [False, np.array]
+ ])
+ def test_to_datetime_unprocessable_input(self, cache, box, klass):
+ # GH 4928
+ # GH 21864
+ result = to_datetime([1, '1'], errors='ignore', cache=cache, box=box)
+ expected = klass(np.array([1, '1'], dtype='O'))
+ tm.assert_equal(result, expected)
+ pytest.raises(TypeError, to_datetime, [1, '1'], errors='raise',
+ cache=cache, box=box)
+
+ def test_to_datetime_other_datetime64_units(self):
+ # 5/25/2012
+ scalar = np.int64(1337904000000000).view('M8[us]')
+ as_obj = scalar.astype('O')
+
+ index = DatetimeIndex([scalar])
+        assert index[0] == as_obj
+
+ value = Timestamp(scalar)
+ assert value == as_obj
+
+ def test_to_datetime_list_of_integers(self):
+ rng = date_range('1/1/2000', periods=20)
+ rng = DatetimeIndex(rng.values)
+
+ ints = list(rng.asi8)
+
+ result = DatetimeIndex(ints)
+
+ tm.assert_index_equal(rng, result)
+
+ def test_to_datetime_overflow(self):
+ # gh-17637
+ # we are overflowing Timedelta range here
+
+ with pytest.raises(OverflowError):
+ date_range(start='1/1/1700', freq='B', periods=100000)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_string_na_nat_conversion(self, cache):
+ # GH #999, #858
+
+ from pandas.compat import parse_date
+
+ strings = np.array(['1/1/2000', '1/2/2000', np.nan,
+ '1/4/2000, 12:34:56'], dtype=object)
+
+ expected = np.empty(4, dtype='M8[ns]')
+ for i, val in enumerate(strings):
+ if isna(val):
+ expected[i] = iNaT
+ else:
+ expected[i] = parse_date(val)
+
+ result = tslib.array_to_datetime(strings)[0]
+ tm.assert_almost_equal(result, expected)
+
+ result2 = to_datetime(strings, cache=cache)
+ assert isinstance(result2, DatetimeIndex)
+ tm.assert_numpy_array_equal(result, result2.values)
+
+ malformed = np.array(['1/100/2000', np.nan], dtype=object)
+
+ # GH 10636, default is now 'raise'
+ pytest.raises(ValueError,
+ lambda: to_datetime(malformed, errors='raise',
+ cache=cache))
+
+ result = to_datetime(malformed, errors='ignore', cache=cache)
+ # GH 21864
+ expected = Index(malformed)
+ tm.assert_index_equal(result, expected)
+
+ pytest.raises(ValueError, to_datetime, malformed, errors='raise',
+ cache=cache)
+
+ idx = ['a', 'b', 'c', 'd', 'e']
+ series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan,
+ '1/5/2000'], index=idx, name='foo')
+ dseries = Series([to_datetime('1/1/2000', cache=cache), np.nan,
+ to_datetime('1/3/2000', cache=cache), np.nan,
+ to_datetime('1/5/2000', cache=cache)],
+ index=idx, name='foo')
+
+ result = to_datetime(series, cache=cache)
+ dresult = to_datetime(dseries, cache=cache)
+
+ expected = Series(np.empty(5, dtype='M8[ns]'), index=idx)
+ for i in range(5):
+ x = series[i]
+ if isna(x):
+ expected[i] = iNaT
+ else:
+ expected[i] = to_datetime(x, cache=cache)
+
+ assert_series_equal(result, expected, check_names=False)
+ assert result.name == 'foo'
+
+ assert_series_equal(dresult, expected, check_names=False)
+ assert dresult.name == 'foo'
+
+ @pytest.mark.parametrize('dtype', [
+ 'datetime64[h]', 'datetime64[m]',
+ 'datetime64[s]', 'datetime64[ms]',
+ 'datetime64[us]', 'datetime64[ns]'])
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_dti_constructor_numpy_timeunits(self, cache, dtype):
+ # GH 9114
+ base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT'],
+ cache=cache)
+
+ values = base.values.astype(dtype)
+
+ tm.assert_index_equal(DatetimeIndex(values), base)
+ tm.assert_index_equal(to_datetime(values, cache=cache), base)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_dayfirst(self, cache):
+ # GH 5917
+ arr = ['10/02/2014', '11/02/2014', '12/02/2014']
+ expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11),
+ datetime(2014, 2, 12)])
+ idx1 = DatetimeIndex(arr, dayfirst=True)
+ idx2 = DatetimeIndex(np.array(arr), dayfirst=True)
+ idx3 = to_datetime(arr, dayfirst=True, cache=cache)
+ idx4 = to_datetime(np.array(arr), dayfirst=True, cache=cache)
+ idx5 = DatetimeIndex(Index(arr), dayfirst=True)
+ idx6 = DatetimeIndex(Series(arr), dayfirst=True)
+ tm.assert_index_equal(expected, idx1)
+ tm.assert_index_equal(expected, idx2)
+ tm.assert_index_equal(expected, idx3)
+ tm.assert_index_equal(expected, idx4)
+ tm.assert_index_equal(expected, idx5)
+ tm.assert_index_equal(expected, idx6)
+
+
+class TestGuessDatetimeFormat(object):
+
+ @td.skip_if_not_us_locale
+ def test_guess_datetime_format_for_array(self):
+ expected_format = '%Y-%m-%d %H:%M:%S.%f'
+ dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format)
+
+ test_arrays = [
+ np.array([dt_string, dt_string, dt_string], dtype='O'),
+ np.array([np.nan, np.nan, dt_string], dtype='O'),
+ np.array([dt_string, 'random_string'], dtype='O'),
+ ]
+
+ for test_array in test_arrays:
+ assert tools._guess_datetime_format_for_array(
+ test_array) == expected_format
+
+ format_for_string_of_nans = tools._guess_datetime_format_for_array(
+ np.array(
+ [np.nan, np.nan, np.nan], dtype='O'))
+ assert format_for_string_of_nans is None
+
+
+class TestToDatetimeInferFormat(object):
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_infer_datetime_format_consistent_format(self, cache):
+ s = pd.Series(pd.date_range('20000101', periods=50, freq='H'))
+
+ test_formats = ['%m-%d-%Y', '%m/%d/%Y %H:%M:%S.%f',
+ '%Y-%m-%dT%H:%M:%S.%f']
+
+ for test_format in test_formats:
+ s_as_dt_strings = s.apply(lambda x: x.strftime(test_format))
+
+ with_format = pd.to_datetime(s_as_dt_strings, format=test_format,
+ cache=cache)
+ no_infer = pd.to_datetime(s_as_dt_strings,
+ infer_datetime_format=False,
+ cache=cache)
+ yes_infer = pd.to_datetime(s_as_dt_strings,
+ infer_datetime_format=True,
+ cache=cache)
+
+            # Whether the format is explicitly passed, inferred, or not
+            # inferred at all, the results should all be the same
+ tm.assert_series_equal(with_format, no_infer)
+ tm.assert_series_equal(no_infer, yes_infer)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_infer_datetime_format_inconsistent_format(self,
+ cache):
+ s = pd.Series(np.array(['01/01/2011 00:00:00',
+ '01-02-2011 00:00:00',
+ '2011-01-03T00:00:00']))
+
+        # When the format is inconsistent, infer_datetime_format should
+        # just fall back to the default parsing
+ tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False,
+ cache=cache),
+ pd.to_datetime(s, infer_datetime_format=True,
+ cache=cache))
+
+ s = pd.Series(np.array(['Jan/01/2011', 'Feb/01/2011', 'Mar/01/2011']))
+
+ tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False,
+ cache=cache),
+ pd.to_datetime(s, infer_datetime_format=True,
+ cache=cache))
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_infer_datetime_format_series_with_nans(self, cache):
+ s = pd.Series(np.array(['01/01/2011 00:00:00', np.nan,
+ '01/03/2011 00:00:00', np.nan]))
+ tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False,
+ cache=cache),
+ pd.to_datetime(s, infer_datetime_format=True,
+ cache=cache))
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_infer_datetime_format_series_start_with_nans(self,
+ cache):
+ s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00',
+ '01/02/2011 00:00:00', '01/03/2011 00:00:00']))
+
+ tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False,
+ cache=cache),
+ pd.to_datetime(s, infer_datetime_format=True,
+ cache=cache))
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_to_datetime_iso8601_noleading_0s(self, cache):
+ # GH 11871
+ s = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3'])
+ expected = pd.Series([pd.Timestamp('2014-01-01'),
+ pd.Timestamp('2014-02-02'),
+ pd.Timestamp('2015-03-03')])
+ tm.assert_series_equal(pd.to_datetime(s, cache=cache), expected)
+ tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d',
+ cache=cache), expected)
+
+
+class TestDaysInMonth(object):
+ # tests for issue #10154
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_day_not_in_month_coerce(self, cache):
+ assert isna(to_datetime('2015-02-29', errors='coerce', cache=cache))
+ assert isna(to_datetime('2015-02-29', format="%Y-%m-%d",
+ errors='coerce', cache=cache))
+ assert isna(to_datetime('2015-02-32', format="%Y-%m-%d",
+ errors='coerce', cache=cache))
+ assert isna(to_datetime('2015-04-31', format="%Y-%m-%d",
+ errors='coerce', cache=cache))
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_day_not_in_month_raise(self, cache):
+ pytest.raises(ValueError, to_datetime, '2015-02-29',
+ errors='raise', cache=cache)
+ pytest.raises(ValueError, to_datetime, '2015-02-29',
+ errors='raise', format="%Y-%m-%d", cache=cache)
+ pytest.raises(ValueError, to_datetime, '2015-02-32',
+ errors='raise', format="%Y-%m-%d", cache=cache)
+ pytest.raises(ValueError, to_datetime, '2015-04-31',
+ errors='raise', format="%Y-%m-%d", cache=cache)
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_day_not_in_month_ignore(self, cache):
+ assert to_datetime('2015-02-29', errors='ignore',
+ cache=cache) == '2015-02-29'
+ assert to_datetime('2015-02-29', errors='ignore',
+ format="%Y-%m-%d", cache=cache) == '2015-02-29'
+ assert to_datetime('2015-02-32', errors='ignore',
+ format="%Y-%m-%d", cache=cache) == '2015-02-32'
+ assert to_datetime('2015-04-31', errors='ignore',
+ format="%Y-%m-%d", cache=cache) == '2015-04-31'
+
+
+class TestDatetimeParsingWrappers(object):
+
+ @pytest.mark.parametrize('date_str,expected', list({
+ '2011-01-01': datetime(2011, 1, 1),
+ '2Q2005': datetime(2005, 4, 1),
+ '2Q05': datetime(2005, 4, 1),
+ '2005Q1': datetime(2005, 1, 1),
+ '05Q1': datetime(2005, 1, 1),
+ '2011Q3': datetime(2011, 7, 1),
+ '11Q3': datetime(2011, 7, 1),
+ '3Q2011': datetime(2011, 7, 1),
+ '3Q11': datetime(2011, 7, 1),
+
+ # quarterly without space
+ '2000Q4': datetime(2000, 10, 1),
+ '00Q4': datetime(2000, 10, 1),
+ '4Q2000': datetime(2000, 10, 1),
+ '4Q00': datetime(2000, 10, 1),
+ '2000q4': datetime(2000, 10, 1),
+ '2000-Q4': datetime(2000, 10, 1),
+ '00-Q4': datetime(2000, 10, 1),
+ '4Q-2000': datetime(2000, 10, 1),
+ '4Q-00': datetime(2000, 10, 1),
+ '00q4': datetime(2000, 10, 1),
+ '2005': datetime(2005, 1, 1),
+ '2005-11': datetime(2005, 11, 1),
+ '2005 11': datetime(2005, 11, 1),
+ '11-2005': datetime(2005, 11, 1),
+ '11 2005': datetime(2005, 11, 1),
+ '200511': datetime(2020, 5, 11),
+ '20051109': datetime(2005, 11, 9),
+ '20051109 10:15': datetime(2005, 11, 9, 10, 15),
+ '20051109 08H': datetime(2005, 11, 9, 8, 0),
+ '2005-11-09 10:15': datetime(2005, 11, 9, 10, 15),
+ '2005-11-09 08H': datetime(2005, 11, 9, 8, 0),
+ '2005/11/09 10:15': datetime(2005, 11, 9, 10, 15),
+ '2005/11/09 08H': datetime(2005, 11, 9, 8, 0),
+ "Thu Sep 25 10:36:28 2003": datetime(2003, 9, 25, 10, 36, 28),
+ "Thu Sep 25 2003": datetime(2003, 9, 25),
+ "Sep 25 2003": datetime(2003, 9, 25),
+ "January 1 2014": datetime(2014, 1, 1),
+
+    # GH 10537
+ '2014-06': datetime(2014, 6, 1),
+ '06-2014': datetime(2014, 6, 1),
+ '2014-6': datetime(2014, 6, 1),
+ '6-2014': datetime(2014, 6, 1),
+
+ '20010101 12': datetime(2001, 1, 1, 12),
+ '20010101 1234': datetime(2001, 1, 1, 12, 34),
+ '20010101 123456': datetime(2001, 1, 1, 12, 34, 56)}.items()))
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_parsers(self, date_str, expected, cache):
+
+ # dateutil >= 2.5.0 defaults to yearfirst=True
+ # https://github.com/dateutil/dateutil/issues/217
+ yearfirst = True
+
+ result1, _, _ = parsing.parse_time_string(date_str,
+ yearfirst=yearfirst)
+ result2 = to_datetime(date_str, yearfirst=yearfirst)
+ result3 = to_datetime([date_str], yearfirst=yearfirst)
+ # result5 is used below
+ result4 = to_datetime(np.array([date_str], dtype=object),
+ yearfirst=yearfirst, cache=cache)
+ result6 = DatetimeIndex([date_str], yearfirst=yearfirst)
+ # result7 is used below
+ result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst)
+ result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst)
+
+ for res in [result1, result2]:
+ assert res == expected
+ for res in [result3, result4, result6, result8, result9]:
+ exp = DatetimeIndex([pd.Timestamp(expected)])
+ tm.assert_index_equal(res, exp)
+
+        # these really need yearfirst, but we don't support it yet
+ if not yearfirst:
+ result5 = Timestamp(date_str)
+ assert result5 == expected
+ result7 = date_range(date_str, freq='S', periods=1,
+ yearfirst=yearfirst)
+ assert result7 == expected
+
+ def test_parsers_nat(self):
+        # Test that each of several string-accepting methods returns pd.NaT
+ result1, _, _ = parsing.parse_time_string('NaT')
+ result2 = to_datetime('NaT')
+ result3 = Timestamp('NaT')
+ result4 = DatetimeIndex(['NaT'])[0]
+ assert result1 is NaT
+ assert result2 is NaT
+ assert result3 is NaT
+ assert result4 is NaT
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_parsers_dayfirst_yearfirst(self, cache):
+ # OK
+ # 2.5.1 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00
+        # 2.5.2 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00
+ # 2.5.3 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00
+
+ # OK
+ # 2.5.1 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00
+ # 2.5.2 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00
+ # 2.5.3 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00
+
+ # bug fix in 2.5.2
+ # 2.5.1 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-11-12 00:00:00
+ # 2.5.2 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-12-11 00:00:00
+ # 2.5.3 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-12-11 00:00:00
+
+ # OK
+ # 2.5.1 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00
+ # 2.5.2 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00
+ # 2.5.3 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00
+
+ # OK
+ # 2.5.1 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00
+ # 2.5.2 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00
+ # 2.5.3 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00
+
+ # OK
+ # 2.5.1 20/12/21 [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00
+ # 2.5.2 20/12/21 [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00
+ # 2.5.3 20/12/21 [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00
+
+ # revert of bug in 2.5.2
+ # 2.5.1 20/12/21 [dayfirst=1, yearfirst=1] -> 2020-12-21 00:00:00
+ # 2.5.2 20/12/21 [dayfirst=1, yearfirst=1] -> month must be in 1..12
+ # 2.5.3 20/12/21 [dayfirst=1, yearfirst=1] -> 2020-12-21 00:00:00
+
+ # OK
+ # 2.5.1 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00
+ # 2.5.2 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00
+ # 2.5.3 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00
+
+ is_lt_253 = LooseVersion(dateutil.__version__) < LooseVersion('2.5.3')
+
+ # str : dayfirst, yearfirst, expected
+ cases = {'10-11-12': [(False, False,
+ datetime(2012, 10, 11)),
+ (True, False,
+ datetime(2012, 11, 10)),
+ (False, True,
+ datetime(2010, 11, 12)),
+ (True, True,
+ datetime(2010, 12, 11))],
+ '20/12/21': [(False, False,
+ datetime(2021, 12, 20)),
+ (True, False,
+ datetime(2021, 12, 20)),
+ (False, True,
+ datetime(2020, 12, 21)),
+ (True, True,
+ datetime(2020, 12, 21))]}
+
+ for date_str, values in compat.iteritems(cases):
+ for dayfirst, yearfirst, expected in values:
+
+ # odd comparisons across version
+ # let's just skip
+ if dayfirst and yearfirst and is_lt_253:
+ continue
+
+ # compare with dateutil result
+ dateutil_result = parse(date_str, dayfirst=dayfirst,
+ yearfirst=yearfirst)
+ assert dateutil_result == expected
+
+ result1, _, _ = parsing.parse_time_string(date_str,
+ dayfirst=dayfirst,
+ yearfirst=yearfirst)
+
+ # we don't support dayfirst/yearfirst here:
+ if not dayfirst and not yearfirst:
+ result2 = Timestamp(date_str)
+ assert result2 == expected
+
+ result3 = to_datetime(date_str, dayfirst=dayfirst,
+ yearfirst=yearfirst, cache=cache)
+
+ result4 = DatetimeIndex([date_str], dayfirst=dayfirst,
+ yearfirst=yearfirst)[0]
+
+ assert result1 == expected
+ assert result3 == expected
+ assert result4 == expected
+
+ @pytest.mark.parametrize('cache', [True, False])
+ def test_parsers_timestring(self, cache):
+ # must be the same as dateutil result
+ cases = {'10:15': (parse('10:15'), datetime(1, 1, 1, 10, 15)),
+ '9:05': (parse('9:05'), datetime(1, 1, 1, 9, 5))}
+
+ for date_str, (exp_now, exp_def) in compat.iteritems(cases):
+ result1, _, _ = parsing.parse_time_string(date_str)
+ result2 = to_datetime(date_str)
+ result3 = to_datetime([date_str])
+ result4 = Timestamp(date_str)
+ result5 = DatetimeIndex([date_str])[0]
+            # parse_time_string resolves a bare time against the default
+            # date, while the other constructors use the current date;
+            # this can't be changed because time series plotting relies
+            # on the current-date behavior
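+            # e.g. (illustrative) '10:15' -> 0001-01-01 10:15 via
+            # parse_time_string, but 10:15 on today's date via
+            # to_datetime / Timestamp / DatetimeIndex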
+ assert result1 == exp_def
+ assert result2 == exp_now
+ assert result3 == exp_now
+ assert result4 == exp_now
+ assert result5 == exp_now
+
+ @td.skip_if_has_locale
+ def test_parsers_time(self):
+ # GH11818
+ strings = ["14:15", "1415", "2:15pm", "0215pm", "14:15:00", "141500",
+ "2:15:00pm", "021500pm", time(14, 15)]
+ expected = time(14, 15)
+
+ for time_string in strings:
+ assert tools.to_time(time_string) == expected
+
+ new_string = "14.15"
+ pytest.raises(ValueError, tools.to_time, new_string)
+ assert tools.to_time(new_string, format="%H.%M") == expected
+
+ arg = ["14:15", "20:20"]
+ expected_arr = [time(14, 15), time(20, 20)]
+ assert tools.to_time(arg) == expected_arr
+ assert tools.to_time(arg, format="%H:%M") == expected_arr
+ assert tools.to_time(arg, infer_time_format=True) == expected_arr
+ assert tools.to_time(arg, format="%I:%M%p",
+ errors="coerce") == [None, None]
+
+ res = tools.to_time(arg, format="%I:%M%p", errors="ignore")
+ tm.assert_numpy_array_equal(res, np.array(arg, dtype=np.object_))
+
+ with pytest.raises(ValueError):
+ tools.to_time(arg, format="%I:%M%p", errors="raise")
+
+ tm.assert_series_equal(tools.to_time(Series(arg, name="test")),
+ Series(expected_arr, name="test"))
+
+ res = tools.to_time(np.array(arg))
+ assert isinstance(res, list)
+ assert res == expected_arr
+
+ @pytest.mark.parametrize('cache', [True, False])
+ @pytest.mark.parametrize('dt_string, tz, dt_string_repr', [
+ ('2013-01-01 05:45+0545', pytz.FixedOffset(345),
+ "Timestamp('2013-01-01 05:45:00+0545', tz='pytz.FixedOffset(345)')"),
+ ('2013-01-01 05:30+0530', pytz.FixedOffset(330),
+ "Timestamp('2013-01-01 05:30:00+0530', tz='pytz.FixedOffset(330)')")])
+ def test_parsers_timezone_minute_offsets_roundtrip(self, cache, dt_string,
+ tz, dt_string_repr):
+ # GH11708
+ base = to_datetime("2013-01-01 00:00:00", cache=cache)
+ base = base.tz_localize('UTC').tz_convert(tz)
+ dt_time = to_datetime(dt_string, cache=cache)
+ assert base == dt_time
+ assert dt_string_repr == repr(dt_time)
+
+
[email protected](params=['D', 's', 'ms', 'us', 'ns'])
+def units(request):
+ """Day and some time units.
+
+ * D
+ * s
+ * ms
+ * us
+ * ns
+ """
+ return request.param
+
+
[email protected]
+def epoch_1960():
+ """Timestamp at 1960-01-01."""
+ return Timestamp('1960-01-01')
+
+
[email protected]
+def units_from_epochs():
+ return list(range(5))
+
+
[email protected](params=['timestamp', 'pydatetime', 'datetime64', 'str_1960'])
+def epochs(epoch_1960, request):
+ """Timestamp at 1960-01-01 in various forms.
+
+ * pd.Timestamp
+ * datetime.datetime
+ * numpy.datetime64
+ * str
+ """
+ assert request.param in {'timestamp', 'pydatetime', 'datetime64',
+ "str_1960"}
+ if request.param == 'timestamp':
+ return epoch_1960
+ elif request.param == 'pydatetime':
+ return epoch_1960.to_pydatetime()
+ elif request.param == "datetime64":
+ return epoch_1960.to_datetime64()
+ else:
+ return str(epoch_1960)
+
+
[email protected]
+def julian_dates():
+ return pd.date_range('2014-1-1', periods=10).to_julian_date().values
+
+
+class TestOrigin(object):
+
+ def test_to_basic(self, julian_dates):
+ # gh-11276, gh-11745
+ # for origin as julian
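+        # pd.Timestamp(0).to_julian_date() is 2440587.5, the Julian day
+        # number of the Unix epoch, so subtracting it converts Julian
+        # day numbers to days since 1970-01-01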
+
+ result = Series(pd.to_datetime(
+ julian_dates, unit='D', origin='julian'))
+ expected = Series(pd.to_datetime(
+ julian_dates - pd.Timestamp(0).to_julian_date(), unit='D'))
+ assert_series_equal(result, expected)
+
+ result = Series(pd.to_datetime(
+ [0, 1, 2], unit='D', origin='unix'))
+ expected = Series([Timestamp('1970-01-01'),
+ Timestamp('1970-01-02'),
+ Timestamp('1970-01-03')])
+ assert_series_equal(result, expected)
+
+ # default
+ result = Series(pd.to_datetime(
+ [0, 1, 2], unit='D'))
+ expected = Series([Timestamp('1970-01-01'),
+ Timestamp('1970-01-02'),
+ Timestamp('1970-01-03')])
+ assert_series_equal(result, expected)
+
+ def test_julian_round_trip(self):
+ result = pd.to_datetime(2456658, origin='julian', unit='D')
+ assert result.to_julian_date() == 2456658
+
+ # out-of-bounds
+ with pytest.raises(ValueError):
+ pd.to_datetime(1, origin="julian", unit='D')
+
+ def test_invalid_unit(self, units, julian_dates):
+
+ # checking for invalid combination of origin='julian' and unit != D
+ if units != 'D':
+ with pytest.raises(ValueError):
+ pd.to_datetime(julian_dates, unit=units, origin='julian')
+
+ def test_invalid_origin(self):
+
+ # need to have a numeric specified
+ with pytest.raises(ValueError):
+ pd.to_datetime("2005-01-01", origin="1960-01-01")
+
+ with pytest.raises(ValueError):
+ pd.to_datetime("2005-01-01", origin="1960-01-01", unit='D')
+
+ def test_epoch(self, units, epochs, epoch_1960, units_from_epochs):
+
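+        # e.g. for units='D' with the 1960-01-01 origin, inputs 0..4
+        # map to 1960-01-01 through 1960-01-05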
+ expected = Series(
+ [pd.Timedelta(x, unit=units) +
+ epoch_1960 for x in units_from_epochs])
+
+ result = Series(pd.to_datetime(
+ units_from_epochs, unit=units, origin=epochs))
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("origin, exc",
+ [('random_string', ValueError),
+ ('epoch', ValueError),
+ ('13-24-1990', ValueError),
+ (datetime(1, 1, 1), tslib.OutOfBoundsDatetime)])
+ def test_invalid_origins(self, origin, exc, units, units_from_epochs):
+
+ with pytest.raises(exc):
+ pd.to_datetime(units_from_epochs, unit=units,
+ origin=origin)
+
+ def test_invalid_origins_tzinfo(self):
+ # GH16842
+ with pytest.raises(ValueError):
+ pd.to_datetime(1, unit='D',
+ origin=datetime(2000, 1, 1, tzinfo=pytz.utc))
+
+ def test_processing_order(self):
+ # make sure we handle out-of-bounds *before*
+ # constructing the dates
+
+ result = pd.to_datetime(200 * 365, unit='D')
+ expected = Timestamp('2169-11-13 00:00:00')
+ assert result == expected
+
+ result = pd.to_datetime(200 * 365, unit='D', origin='1870-01-01')
+ expected = Timestamp('2069-11-13 00:00:00')
+ assert result == expected
+
+ result = pd.to_datetime(300 * 365, unit='D', origin='1870-01-01')
+ expected = Timestamp('2169-10-20 00:00:00')
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/interval/__init__.py b/contrib/python/pandas/py2/pandas/tests/indexes/interval/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/interval/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_astype.py b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_astype.py
new file mode 100644
index 00000000000..2932a46f9bd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_astype.py
@@ -0,0 +1,206 @@
+from __future__ import division
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import CategoricalDtype, IntervalDtype
+
+from pandas import (
+ CategoricalIndex, Index, IntervalIndex, NaT, Timedelta, Timestamp,
+ interval_range)
+import pandas.util.testing as tm
+
+
+class Base(object):
+ """Tests common to IntervalIndex with any subtype"""
+
+ def test_astype_idempotent(self, index):
+ result = index.astype('interval')
+ tm.assert_index_equal(result, index)
+
+ result = index.astype(index.dtype)
+ tm.assert_index_equal(result, index)
+
+ def test_astype_object(self, index):
+ result = index.astype(object)
+ expected = Index(index.values, dtype='object')
+ tm.assert_index_equal(result, expected)
+ assert not result.equals(index)
+
+ def test_astype_category(self, index):
+ result = index.astype('category')
+ expected = CategoricalIndex(index.values)
+ tm.assert_index_equal(result, expected)
+
+ result = index.astype(CategoricalDtype())
+ tm.assert_index_equal(result, expected)
+
+ # non-default params
+ categories = index.dropna().unique().values[:-1]
+ dtype = CategoricalDtype(categories=categories, ordered=True)
+ result = index.astype(dtype)
+ expected = CategoricalIndex(
+ index.values, categories=categories, ordered=True)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', [
+ 'int64', 'uint64', 'float64', 'complex128', 'period[M]',
+ 'timedelta64', 'timedelta64[ns]', 'datetime64', 'datetime64[ns]',
+ 'datetime64[ns, US/Eastern]'])
+ def test_astype_cannot_cast(self, index, dtype):
+ msg = 'Cannot cast IntervalIndex to dtype'
+ with pytest.raises(TypeError, match=msg):
+ index.astype(dtype)
+
+ def test_astype_invalid_dtype(self, index):
+ msg = "data type 'fake_dtype' not understood"
+ with pytest.raises(TypeError, match=msg):
+ index.astype('fake_dtype')
+
+
+class TestIntSubtype(Base):
+ """Tests specific to IntervalIndex with integer-like subtype"""
+
+ indexes = [
+ IntervalIndex.from_breaks(np.arange(-10, 11, dtype='int64')),
+ IntervalIndex.from_breaks(
+ np.arange(100, dtype='uint64'), closed='left'),
+ ]
+
+ @pytest.fixture(params=indexes)
+ def index(self, request):
+ return request.param
+
+ @pytest.mark.parametrize('subtype', [
+ 'float64', 'datetime64[ns]', 'timedelta64[ns]'])
+ def test_subtype_conversion(self, index, subtype):
+ dtype = IntervalDtype(subtype)
+ result = index.astype(dtype)
+ expected = IntervalIndex.from_arrays(index.left.astype(subtype),
+ index.right.astype(subtype),
+ closed=index.closed)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('subtype_start, subtype_end', [
+ ('int64', 'uint64'), ('uint64', 'int64')])
+ def test_subtype_integer(self, subtype_start, subtype_end):
+ index = IntervalIndex.from_breaks(np.arange(100, dtype=subtype_start))
+ dtype = IntervalDtype(subtype_end)
+ result = index.astype(dtype)
+ expected = IntervalIndex.from_arrays(index.left.astype(subtype_end),
+ index.right.astype(subtype_end),
+ closed=index.closed)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.xfail(reason='GH#15832')
+ def test_subtype_integer_errors(self):
+ # int64 -> uint64 fails with negative values
+ index = interval_range(-10, 10)
+ dtype = IntervalDtype('uint64')
+ with pytest.raises(ValueError):
+ index.astype(dtype)
+
+
+class TestFloatSubtype(Base):
+ """Tests specific to IntervalIndex with float subtype"""
+
+ indexes = [
+ interval_range(-10.0, 10.0, closed='neither'),
+ IntervalIndex.from_arrays([-1.5, np.nan, 0., 0., 1.5],
+ [-0.5, np.nan, 1., 1., 3.],
+ closed='both'),
+ ]
+
+ @pytest.fixture(params=indexes)
+ def index(self, request):
+ return request.param
+
+ @pytest.mark.parametrize('subtype', ['int64', 'uint64'])
+ def test_subtype_integer(self, subtype):
+ index = interval_range(0.0, 10.0)
+ dtype = IntervalDtype(subtype)
+ result = index.astype(dtype)
+ expected = IntervalIndex.from_arrays(index.left.astype(subtype),
+ index.right.astype(subtype),
+ closed=index.closed)
+ tm.assert_index_equal(result, expected)
+
+ # raises with NA
+ msg = 'Cannot convert NA to integer'
+ with pytest.raises(ValueError, match=msg):
+ index.insert(0, np.nan).astype(dtype)
+
+ @pytest.mark.xfail(reason='GH#15832')
+ def test_subtype_integer_errors(self):
+ # float64 -> uint64 fails with negative values
+ index = interval_range(-10.0, 10.0)
+ dtype = IntervalDtype('uint64')
+ with pytest.raises(ValueError):
+ index.astype(dtype)
+
+ # float64 -> integer-like fails with non-integer valued floats
+ index = interval_range(0.0, 10.0, freq=0.25)
+ dtype = IntervalDtype('int64')
+ with pytest.raises(ValueError):
+ index.astype(dtype)
+
+ dtype = IntervalDtype('uint64')
+ with pytest.raises(ValueError):
+ index.astype(dtype)
+
+ @pytest.mark.parametrize('subtype', ['datetime64[ns]', 'timedelta64[ns]'])
+ def test_subtype_datetimelike(self, index, subtype):
+ dtype = IntervalDtype(subtype)
+ msg = 'Cannot convert .* to .*; subtypes are incompatible'
+ with pytest.raises(TypeError, match=msg):
+ index.astype(dtype)
+
+
+class TestDatetimelikeSubtype(Base):
+ """Tests specific to IntervalIndex with datetime-like subtype"""
+
+ indexes = [
+ interval_range(Timestamp('2018-01-01'), periods=10, closed='neither'),
+ interval_range(Timestamp('2018-01-01'), periods=10).insert(2, NaT),
+ interval_range(Timestamp('2018-01-01', tz='US/Eastern'), periods=10),
+ interval_range(Timedelta('0 days'), periods=10, closed='both'),
+ interval_range(Timedelta('0 days'), periods=10).insert(2, NaT),
+ ]
+
+ @pytest.fixture(params=indexes)
+ def index(self, request):
+ return request.param
+
+ @pytest.mark.parametrize('subtype', ['int64', 'uint64'])
+ def test_subtype_integer(self, index, subtype):
+ dtype = IntervalDtype(subtype)
+ result = index.astype(dtype)
+ expected = IntervalIndex.from_arrays(index.left.astype(subtype),
+ index.right.astype(subtype),
+ closed=index.closed)
+ tm.assert_index_equal(result, expected)
+
+ def test_subtype_float(self, index):
+ dtype = IntervalDtype('float64')
+ msg = 'Cannot convert .* to .*; subtypes are incompatible'
+ with pytest.raises(TypeError, match=msg):
+ index.astype(dtype)
+
+ def test_subtype_datetimelike(self):
+ # datetime -> timedelta raises
+ dtype = IntervalDtype('timedelta64[ns]')
+ msg = 'Cannot convert .* to .*; subtypes are incompatible'
+
+ index = interval_range(Timestamp('2018-01-01'), periods=10)
+ with pytest.raises(TypeError, match=msg):
+ index.astype(dtype)
+
+ index = interval_range(Timestamp('2018-01-01', tz='CET'), periods=10)
+ with pytest.raises(TypeError, match=msg):
+ index.astype(dtype)
+
+ # timedelta -> datetime raises
+ dtype = IntervalDtype('datetime64[ns]')
+ index = interval_range(Timedelta('0 days'), periods=10)
+ with pytest.raises(TypeError, match=msg):
+ index.astype(dtype)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_construction.py b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_construction.py
new file mode 100644
index 00000000000..483978b40fe
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_construction.py
@@ -0,0 +1,389 @@
+from __future__ import division
+
+from functools import partial
+
+import numpy as np
+import pytest
+
+from pandas.compat import lzip
+
+from pandas.core.dtypes.common import is_categorical_dtype
+from pandas.core.dtypes.dtypes import IntervalDtype
+
+from pandas import (
+ Categorical, CategoricalIndex, Float64Index, Index, Int64Index, Interval,
+ IntervalIndex, date_range, notna, period_range, timedelta_range)
+from pandas.core.arrays import IntervalArray
+import pandas.core.common as com
+import pandas.util.testing as tm
+
+
[email protected](params=[None, 'foo'])
+def name(request):
+ return request.param
+
+
+class Base(object):
+ """
+ Common tests for all variations of IntervalIndex construction. Input data
+ to be supplied in breaks format, then converted by the subclass method
+ get_kwargs_from_breaks to the expected format.
+ """
+
+ @pytest.mark.parametrize('breaks', [
+ [3, 14, 15, 92, 653],
+ np.arange(10, dtype='int64'),
+ Int64Index(range(-10, 11)),
+ Float64Index(np.arange(20, 30, 0.5)),
+ date_range('20180101', periods=10),
+ date_range('20180101', periods=10, tz='US/Eastern'),
+ timedelta_range('1 day', periods=10)])
+ def test_constructor(self, constructor, breaks, closed, name):
+ result_kwargs = self.get_kwargs_from_breaks(breaks, closed)
+ result = constructor(closed=closed, name=name, **result_kwargs)
+
+ assert result.closed == closed
+ assert result.name == name
+ assert result.dtype.subtype == getattr(breaks, 'dtype', 'int64')
+ tm.assert_index_equal(result.left, Index(breaks[:-1]))
+ tm.assert_index_equal(result.right, Index(breaks[1:]))
+
+ @pytest.mark.parametrize('breaks, subtype', [
+ (Int64Index([0, 1, 2, 3, 4]), 'float64'),
+ (Int64Index([0, 1, 2, 3, 4]), 'datetime64[ns]'),
+ (Int64Index([0, 1, 2, 3, 4]), 'timedelta64[ns]'),
+ (Float64Index([0, 1, 2, 3, 4]), 'int64'),
+ (date_range('2017-01-01', periods=5), 'int64'),
+ (timedelta_range('1 day', periods=5), 'int64')])
+ def test_constructor_dtype(self, constructor, breaks, subtype):
+ # GH 19262: conversion via dtype parameter
+ expected_kwargs = self.get_kwargs_from_breaks(breaks.astype(subtype))
+ expected = constructor(**expected_kwargs)
+
+ result_kwargs = self.get_kwargs_from_breaks(breaks)
+ iv_dtype = IntervalDtype(subtype)
+ for dtype in (iv_dtype, str(iv_dtype)):
+ result = constructor(dtype=dtype, **result_kwargs)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('breaks', [
+ [np.nan] * 2, [np.nan] * 4, [np.nan] * 50])
+ def test_constructor_nan(self, constructor, breaks, closed):
+ # GH 18421
+ result_kwargs = self.get_kwargs_from_breaks(breaks)
+ result = constructor(closed=closed, **result_kwargs)
+
+ expected_subtype = np.float64
+ expected_values = np.array(breaks[:-1], dtype=object)
+
+ assert result.closed == closed
+ assert result.dtype.subtype == expected_subtype
+ tm.assert_numpy_array_equal(result._ndarray_values, expected_values)
+
+ @pytest.mark.parametrize('breaks', [
+ [],
+ np.array([], dtype='int64'),
+ np.array([], dtype='float64'),
+ np.array([], dtype='datetime64[ns]'),
+ np.array([], dtype='timedelta64[ns]')])
+ def test_constructor_empty(self, constructor, breaks, closed):
+ # GH 18421
+ result_kwargs = self.get_kwargs_from_breaks(breaks)
+ result = constructor(closed=closed, **result_kwargs)
+
+ expected_values = np.array([], dtype=object)
+ expected_subtype = getattr(breaks, 'dtype', np.int64)
+
+ assert result.empty
+ assert result.closed == closed
+ assert result.dtype.subtype == expected_subtype
+ tm.assert_numpy_array_equal(result._ndarray_values, expected_values)
+
+ @pytest.mark.parametrize('breaks', [
+ tuple('0123456789'),
+ list('abcdefghij'),
+ np.array(list('abcdefghij'), dtype=object),
+ np.array(list('abcdefghij'), dtype='<U1')])
+ def test_constructor_string(self, constructor, breaks):
+ # GH 19016
+ msg = ('category, object, and string subtypes are not supported '
+ 'for IntervalIndex')
+ with pytest.raises(TypeError, match=msg):
+ constructor(**self.get_kwargs_from_breaks(breaks))
+
+ @pytest.mark.parametrize('cat_constructor', [
+ Categorical, CategoricalIndex])
+ def test_constructor_categorical_valid(self, constructor, cat_constructor):
+ # GH 21243/21253
+ if isinstance(constructor, partial) and constructor.func is Index:
+ # Index is defined to create CategoricalIndex from categorical data
+ pytest.skip()
+
+ breaks = np.arange(10, dtype='int64')
+ expected = IntervalIndex.from_breaks(breaks)
+
+ cat_breaks = cat_constructor(breaks)
+ result_kwargs = self.get_kwargs_from_breaks(cat_breaks)
+ result = constructor(**result_kwargs)
+ tm.assert_index_equal(result, expected)
+
+ def test_generic_errors(self, constructor):
+ # filler input data to be used when supplying invalid kwargs
+ filler = self.get_kwargs_from_breaks(range(10))
+
+ # invalid closed
+ msg = "invalid option for 'closed': invalid"
+ with pytest.raises(ValueError, match=msg):
+ constructor(closed='invalid', **filler)
+
+ # unsupported dtype
+ msg = 'dtype must be an IntervalDtype, got int64'
+ with pytest.raises(TypeError, match=msg):
+ constructor(dtype='int64', **filler)
+
+ # invalid dtype
+ msg = "data type 'invalid' not understood"
+ with pytest.raises(TypeError, match=msg):
+ constructor(dtype='invalid', **filler)
+
+ # no point in nesting periods in an IntervalIndex
+ periods = period_range('2000-01-01', periods=10)
+ periods_kwargs = self.get_kwargs_from_breaks(periods)
+ msg = 'Period dtypes are not supported, use a PeriodIndex instead'
+ with pytest.raises(ValueError, match=msg):
+ constructor(**periods_kwargs)
+
+ # decreasing values
+ decreasing_kwargs = self.get_kwargs_from_breaks(range(10, -1, -1))
+ msg = 'left side of interval must be <= right side'
+ with pytest.raises(ValueError, match=msg):
+ constructor(**decreasing_kwargs)
+
+
+class TestFromArrays(Base):
+ """Tests specific to IntervalIndex.from_arrays"""
+
+ @pytest.fixture
+ def constructor(self):
+ return IntervalIndex.from_arrays
+
+ def get_kwargs_from_breaks(self, breaks, closed='right'):
+ """
+        converts intervals in breaks format to a dictionary of kwargs
+        specific to the format expected by IntervalIndex.from_arrays
+ """
+ return {'left': breaks[:-1], 'right': breaks[1:]}
+
+ def test_constructor_errors(self):
+ # GH 19016: categorical data
+ data = Categorical(list('01234abcde'), ordered=True)
+ msg = ('category, object, and string subtypes are not supported '
+ 'for IntervalIndex')
+ with pytest.raises(TypeError, match=msg):
+ IntervalIndex.from_arrays(data[:-1], data[1:])
+
+ # unequal length
+ left = [0, 1, 2]
+ right = [2, 3]
+ msg = 'left and right must have the same length'
+ with pytest.raises(ValueError, match=msg):
+ IntervalIndex.from_arrays(left, right)
+
+ @pytest.mark.parametrize('left_subtype, right_subtype', [
+ (np.int64, np.float64), (np.float64, np.int64)])
+ def test_mixed_float_int(self, left_subtype, right_subtype):
+ """mixed int/float left/right results in float for both sides"""
+ left = np.arange(9, dtype=left_subtype)
+ right = np.arange(1, 10, dtype=right_subtype)
+ result = IntervalIndex.from_arrays(left, right)
+
+ expected_left = Float64Index(left)
+ expected_right = Float64Index(right)
+ expected_subtype = np.float64
+
+ tm.assert_index_equal(result.left, expected_left)
+ tm.assert_index_equal(result.right, expected_right)
+ assert result.dtype.subtype == expected_subtype
+
+
+class TestFromBreaks(Base):
+ """Tests specific to IntervalIndex.from_breaks"""
+
+ @pytest.fixture
+ def constructor(self):
+ return IntervalIndex.from_breaks
+
+ def get_kwargs_from_breaks(self, breaks, closed='right'):
+ """
+        converts intervals in breaks format to a dictionary of kwargs
+        specific to the format expected by IntervalIndex.from_breaks
+ """
+ return {'breaks': breaks}
+
+ def test_constructor_errors(self):
+ # GH 19016: categorical data
+ data = Categorical(list('01234abcde'), ordered=True)
+ msg = ('category, object, and string subtypes are not supported '
+ 'for IntervalIndex')
+ with pytest.raises(TypeError, match=msg):
+ IntervalIndex.from_breaks(data)
+
+ def test_length_one(self):
+ """breaks of length one produce an empty IntervalIndex"""
+ breaks = [0]
+ result = IntervalIndex.from_breaks(breaks)
+ expected = IntervalIndex.from_breaks([])
+ tm.assert_index_equal(result, expected)
+
+
+class TestFromTuples(Base):
+ """Tests specific to IntervalIndex.from_tuples"""
+
+ @pytest.fixture
+ def constructor(self):
+ return IntervalIndex.from_tuples
+
+ def get_kwargs_from_breaks(self, breaks, closed='right'):
+ """
+        converts intervals in breaks format to a dictionary of kwargs
+        specific to the format expected by IntervalIndex.from_tuples
+ """
+ if len(breaks) == 0:
+ return {'data': breaks}
+
+ tuples = lzip(breaks[:-1], breaks[1:])
+ if isinstance(breaks, (list, tuple)):
+ return {'data': tuples}
+ elif is_categorical_dtype(breaks):
+ return {'data': breaks._constructor(tuples)}
+ return {'data': com.asarray_tuplesafe(tuples)}
+
+ def test_constructor_errors(self):
+ # non-tuple
+ tuples = [(0, 1), 2, (3, 4)]
+ msg = 'IntervalIndex.from_tuples received an invalid item, 2'
+        with pytest.raises(TypeError, match=msg):
+ IntervalIndex.from_tuples(tuples)
+
+ # too few/many items
+ tuples = [(0, 1), (2,), (3, 4)]
+ msg = 'IntervalIndex.from_tuples requires tuples of length 2, got {t}'
+ with pytest.raises(ValueError, match=msg.format(t=tuples)):
+ IntervalIndex.from_tuples(tuples)
+
+ tuples = [(0, 1), (2, 3, 4), (5, 6)]
+ with pytest.raises(ValueError, match=msg.format(t=tuples)):
+ IntervalIndex.from_tuples(tuples)
+
+ def test_na_tuples(self):
+        # a tuple of (NA, NA) evaluates the same as NA as an element
+ na_tuple = [(0, 1), (np.nan, np.nan), (2, 3)]
+ idx_na_tuple = IntervalIndex.from_tuples(na_tuple)
+ idx_na_element = IntervalIndex.from_tuples([(0, 1), np.nan, (2, 3)])
+ tm.assert_index_equal(idx_na_tuple, idx_na_element)
+
+
+class TestClassConstructors(Base):
+ """Tests specific to the IntervalIndex/Index constructors"""
+
+ @pytest.fixture(params=[IntervalIndex, partial(Index, dtype='interval')],
+ ids=['IntervalIndex', 'Index'])
+ def constructor(self, request):
+ return request.param
+
+ def get_kwargs_from_breaks(self, breaks, closed='right'):
+ """
+        converts intervals in breaks format to a dictionary of kwargs
+        specific to the format expected by the IntervalIndex/Index constructors
+ """
+ if len(breaks) == 0:
+ return {'data': breaks}
+
+ ivs = [Interval(l, r, closed) if notna(l) else l
+ for l, r in zip(breaks[:-1], breaks[1:])]
+
+ if isinstance(breaks, list):
+ return {'data': ivs}
+ elif is_categorical_dtype(breaks):
+ return {'data': breaks._constructor(ivs)}
+ return {'data': np.array(ivs, dtype=object)}
+
+ def test_generic_errors(self, constructor):
+ """
+        override the base class implementation since errors are handled
+        differently; these checks are unnecessary since they are caught at
+        the Interval level
+ """
+ pass
+
+ def test_constructor_errors(self, constructor):
+ # mismatched closed within intervals with no constructor override
+ ivs = [Interval(0, 1, closed='right'), Interval(2, 3, closed='left')]
+ msg = 'intervals must all be closed on the same side'
+ with pytest.raises(ValueError, match=msg):
+ constructor(ivs)
+
+ # scalar
+ msg = (r'IntervalIndex\(...\) must be called with a collection of '
+ 'some kind, 5 was passed')
+ with pytest.raises(TypeError, match=msg):
+ constructor(5)
+
+ # not an interval
+ msg = ("type <(class|type) 'numpy.int64'> with value 0 "
+ "is not an interval")
+ with pytest.raises(TypeError, match=msg):
+ constructor([0, 1])
+
+ @pytest.mark.parametrize('data, closed', [
+ ([], 'both'),
+ ([np.nan, np.nan], 'neither'),
+ ([Interval(0, 3, closed='neither'),
+ Interval(2, 5, closed='neither')], 'left'),
+ ([Interval(0, 3, closed='left'),
+ Interval(2, 5, closed='right')], 'neither'),
+ (IntervalIndex.from_breaks(range(5), closed='both'), 'right')])
+ def test_override_inferred_closed(self, constructor, data, closed):
+ # GH 19370
+ if isinstance(data, IntervalIndex):
+ tuples = data.to_tuples()
+ else:
+ tuples = [(iv.left, iv.right) if notna(iv) else iv for iv in data]
+ expected = IntervalIndex.from_tuples(tuples, closed=closed)
+ result = constructor(data, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('values_constructor', [
+ list, np.array, IntervalIndex, IntervalArray])
+ def test_index_object_dtype(self, values_constructor):
+ # Index(intervals, dtype=object) is an Index (not an IntervalIndex)
+ intervals = [Interval(0, 1), Interval(1, 2), Interval(2, 3)]
+ values = values_constructor(intervals)
+ result = Index(values, dtype=object)
+
+ assert type(result) is Index
+ tm.assert_numpy_array_equal(result.values, np.array(values))
+
+
+class TestFromIntervals(TestClassConstructors):
+ """
+    Tests for IntervalIndex.from_intervals, which is deprecated in favor of
+    the IntervalIndex constructor. Runs the same tests as the IntervalIndex
+    constructor, plus a deprecation test. This class can be deleted once
+    from_intervals is removed.
+ """
+
+ @pytest.fixture
+ def constructor(self):
+ def from_intervals_ignore_warnings(*args, **kwargs):
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ return IntervalIndex.from_intervals(*args, **kwargs)
+ return from_intervals_ignore_warnings
+
+ def test_deprecated(self):
+ ivs = [Interval(0, 1), Interval(1, 2)]
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ IntervalIndex.from_intervals(ivs)
+
+ @pytest.mark.skip(reason='parent class test that is not applicable')
+ def test_index_object_dtype(self):
+ pass
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval.py b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval.py
new file mode 100644
index 00000000000..e4f25ff1432
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval.py
@@ -0,0 +1,1246 @@
+from __future__ import division
+
+from itertools import permutations
+import re
+
+import numpy as np
+import pytest
+
+from pandas.compat import lzip
+
+import pandas as pd
+from pandas import (
+ Index, Interval, IntervalIndex, Timedelta, Timestamp, date_range,
+ interval_range, isna, notna, timedelta_range)
+import pandas.core.common as com
+from pandas.tests.indexes.common import Base
+import pandas.util.testing as tm
+
+
[email protected](scope='class', params=[None, 'foo'])
+def name(request):
+ return request.param
+
+
+class TestIntervalIndex(Base):
+ _holder = IntervalIndex
+
+ def setup_method(self, method):
+ self.index = IntervalIndex.from_arrays([0, 1], [1, 2])
+ self.index_with_nan = IntervalIndex.from_tuples(
+ [(0, 1), np.nan, (1, 2)])
+ self.indices = dict(intervalIndex=tm.makeIntervalIndex(10))
+
+ def create_index(self, closed='right'):
+ return IntervalIndex.from_breaks(range(11), closed=closed)
+
+ def create_index_with_nan(self, closed='right'):
+ mask = [True, False] + [True] * 8
+ return IntervalIndex.from_arrays(
+ np.where(mask, np.arange(10), np.nan),
+ np.where(mask, np.arange(1, 11), np.nan), closed=closed)
+
+ def test_properties(self, closed):
+ index = self.create_index(closed=closed)
+ assert len(index) == 10
+ assert index.size == 10
+ assert index.shape == (10, )
+
+ tm.assert_index_equal(index.left, Index(np.arange(10)))
+ tm.assert_index_equal(index.right, Index(np.arange(1, 11)))
+ tm.assert_index_equal(index.mid, Index(np.arange(0.5, 10.5)))
+
+ assert index.closed == closed
+
+ ivs = [Interval(l, r, closed) for l, r in zip(range(10), range(1, 11))]
+ expected = np.array(ivs, dtype=object)
+ tm.assert_numpy_array_equal(np.asarray(index), expected)
+
+ # with nans
+ index = self.create_index_with_nan(closed=closed)
+ assert len(index) == 10
+ assert index.size == 10
+ assert index.shape == (10, )
+
+ expected_left = Index([0, np.nan, 2, 3, 4, 5, 6, 7, 8, 9])
+ expected_right = expected_left + 1
+ expected_mid = expected_left + 0.5
+ tm.assert_index_equal(index.left, expected_left)
+ tm.assert_index_equal(index.right, expected_right)
+ tm.assert_index_equal(index.mid, expected_mid)
+
+ assert index.closed == closed
+
+ ivs = [Interval(l, r, closed) if notna(l) else np.nan
+ for l, r in zip(expected_left, expected_right)]
+ expected = np.array(ivs, dtype=object)
+ tm.assert_numpy_array_equal(np.asarray(index), expected)
+
+ @pytest.mark.parametrize('breaks', [
+ [1, 1, 2, 5, 15, 53, 217, 1014, 5335, 31240, 201608],
+ [-np.inf, -100, -10, 0.5, 1, 1.5, 3.8, 101, 202, np.inf],
+ pd.to_datetime(['20170101', '20170202', '20170303', '20170404']),
+ pd.to_timedelta(['1ns', '2ms', '3s', '4M', '5H', '6D'])])
+ def test_length(self, closed, breaks):
+ # GH 18789
+ index = IntervalIndex.from_breaks(breaks, closed=closed)
+ result = index.length
+ expected = Index(iv.length for iv in index)
+ tm.assert_index_equal(result, expected)
+
+ # with NA
+ index = index.insert(1, np.nan)
+ result = index.length
+ expected = Index(iv.length if notna(iv) else iv for iv in index)
+ tm.assert_index_equal(result, expected)
+
+ def test_with_nans(self, closed):
+ index = self.create_index(closed=closed)
+ assert index.hasnans is False
+
+ result = index.isna()
+ expected = np.repeat(False, len(index))
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = index.notna()
+ expected = np.repeat(True, len(index))
+ tm.assert_numpy_array_equal(result, expected)
+
+ index = self.create_index_with_nan(closed=closed)
+ assert index.hasnans is True
+
+ result = index.isna()
+ expected = np.array([False, True] + [False] * (len(index) - 2))
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = index.notna()
+ expected = np.array([True, False] + [True] * (len(index) - 2))
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_copy(self, closed):
+ expected = self.create_index(closed=closed)
+
+ result = expected.copy()
+ assert result.equals(expected)
+
+ result = expected.copy(deep=True)
+ assert result.equals(expected)
+ assert result.left is not expected.left
+
+ def test_ensure_copied_data(self, closed):
+ # exercise the copy flag in the constructor
+
+ # not copying
+ index = self.create_index(closed=closed)
+ result = IntervalIndex(index, copy=False)
+ tm.assert_numpy_array_equal(index.left.values, result.left.values,
+ check_same='same')
+ tm.assert_numpy_array_equal(index.right.values, result.right.values,
+ check_same='same')
+
+        # by definition, this makes a copy
+ result = IntervalIndex(index._ndarray_values, copy=False)
+ tm.assert_numpy_array_equal(index.left.values, result.left.values,
+ check_same='copy')
+ tm.assert_numpy_array_equal(index.right.values, result.right.values,
+ check_same='copy')
+
+ def test_equals(self, closed):
+ expected = IntervalIndex.from_breaks(np.arange(5), closed=closed)
+ assert expected.equals(expected)
+ assert expected.equals(expected.copy())
+
+ assert not expected.equals(expected.astype(object))
+ assert not expected.equals(np.array(expected))
+ assert not expected.equals(list(expected))
+
+ assert not expected.equals([1, 2])
+ assert not expected.equals(np.array([1, 2]))
+ assert not expected.equals(pd.date_range('20130101', periods=2))
+
+ expected_name1 = IntervalIndex.from_breaks(
+ np.arange(5), closed=closed, name='foo')
+ expected_name2 = IntervalIndex.from_breaks(
+ np.arange(5), closed=closed, name='bar')
+ assert expected.equals(expected_name1)
+ assert expected_name1.equals(expected_name2)
+
+ for other_closed in {'left', 'right', 'both', 'neither'} - {closed}:
+ expected_other_closed = IntervalIndex.from_breaks(
+ np.arange(5), closed=other_closed)
+ assert not expected.equals(expected_other_closed)
+
+ @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series])
+ def test_where(self, closed, klass):
+ idx = self.create_index(closed=closed)
+ cond = [True] * len(idx)
+ expected = idx
+ result = expected.where(klass(cond))
+ tm.assert_index_equal(result, expected)
+
+ cond = [False] + [True] * len(idx[1:])
+ expected = IntervalIndex([np.nan] + idx[1:].tolist())
+ result = idx.where(klass(cond))
+ tm.assert_index_equal(result, expected)
+
+ def test_delete(self, closed):
+ expected = IntervalIndex.from_breaks(np.arange(1, 11), closed=closed)
+ result = self.create_index(closed=closed).delete(0)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('data', [
+ interval_range(0, periods=10, closed='neither'),
+ interval_range(1.7, periods=8, freq=2.5, closed='both'),
+ interval_range(Timestamp('20170101'), periods=12, closed='left'),
+ interval_range(Timedelta('1 day'), periods=6, closed='right')])
+ def test_insert(self, data):
+ item = data[0]
+ idx_item = IntervalIndex([item])
+
+ # start
+ expected = idx_item.append(data)
+ result = data.insert(0, item)
+ tm.assert_index_equal(result, expected)
+
+ # end
+ expected = data.append(idx_item)
+ result = data.insert(len(data), item)
+ tm.assert_index_equal(result, expected)
+
+ # mid
+ expected = data[:3].append(idx_item).append(data[3:])
+ result = data.insert(3, item)
+ tm.assert_index_equal(result, expected)
+
+ # invalid type
+ msg = 'can only insert Interval objects and NA into an IntervalIndex'
+ with pytest.raises(ValueError, match=msg):
+ data.insert(1, 'foo')
+
+ # invalid closed
+ msg = 'inserted item must be closed on the same side as the index'
+ for closed in {'left', 'right', 'both', 'neither'} - {item.closed}:
+ with pytest.raises(ValueError, match=msg):
+ bad_item = Interval(item.left, item.right, closed=closed)
+ data.insert(1, bad_item)
+
+ # GH 18295 (test missing)
+ na_idx = IntervalIndex([np.nan], closed=data.closed)
+ for na in (np.nan, pd.NaT, None):
+ expected = data[:1].append(na_idx).append(data[1:])
+ result = data.insert(1, na)
+ tm.assert_index_equal(result, expected)
+
+ def test_take(self, closed):
+ index = self.create_index(closed=closed)
+
+ result = index.take(range(10))
+ tm.assert_index_equal(result, index)
+
+ result = index.take([0, 0, 1])
+ expected = IntervalIndex.from_arrays(
+ [0, 0, 1], [1, 1, 2], closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ def test_is_unique_interval(self, closed):
+ """
+ Interval specific tests for is_unique in addition to base class tests
+ """
+ # unique overlapping - distinct endpoints
+ idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)], closed=closed)
+ assert idx.is_unique is True
+
+ # unique overlapping - shared endpoints
+ idx = pd.IntervalIndex.from_tuples(
+ [(1, 2), (1, 3), (2, 3)], closed=closed)
+ assert idx.is_unique is True
+
+ # unique nested
+ idx = IntervalIndex.from_tuples([(-1, 1), (-2, 2)], closed=closed)
+ assert idx.is_unique is True
+
+ def test_monotonic(self, closed):
+ # increasing non-overlapping
+ idx = IntervalIndex.from_tuples(
+ [(0, 1), (2, 3), (4, 5)], closed=closed)
+ assert idx.is_monotonic is True
+ assert idx._is_strictly_monotonic_increasing is True
+ assert idx.is_monotonic_decreasing is False
+ assert idx._is_strictly_monotonic_decreasing is False
+
+ # decreasing non-overlapping
+ idx = IntervalIndex.from_tuples(
+ [(4, 5), (2, 3), (1, 2)], closed=closed)
+ assert idx.is_monotonic is False
+ assert idx._is_strictly_monotonic_increasing is False
+ assert idx.is_monotonic_decreasing is True
+ assert idx._is_strictly_monotonic_decreasing is True
+
+ # unordered non-overlapping
+ idx = IntervalIndex.from_tuples(
+ [(0, 1), (4, 5), (2, 3)], closed=closed)
+ assert idx.is_monotonic is False
+ assert idx._is_strictly_monotonic_increasing is False
+ assert idx.is_monotonic_decreasing is False
+ assert idx._is_strictly_monotonic_decreasing is False
+
+ # increasing overlapping
+ idx = IntervalIndex.from_tuples(
+ [(0, 2), (0.5, 2.5), (1, 3)], closed=closed)
+ assert idx.is_monotonic is True
+ assert idx._is_strictly_monotonic_increasing is True
+ assert idx.is_monotonic_decreasing is False
+ assert idx._is_strictly_monotonic_decreasing is False
+
+ # decreasing overlapping
+ idx = IntervalIndex.from_tuples(
+ [(1, 3), (0.5, 2.5), (0, 2)], closed=closed)
+ assert idx.is_monotonic is False
+ assert idx._is_strictly_monotonic_increasing is False
+ assert idx.is_monotonic_decreasing is True
+ assert idx._is_strictly_monotonic_decreasing is True
+
+ # unordered overlapping
+ idx = IntervalIndex.from_tuples(
+ [(0.5, 2.5), (0, 2), (1, 3)], closed=closed)
+ assert idx.is_monotonic is False
+ assert idx._is_strictly_monotonic_increasing is False
+ assert idx.is_monotonic_decreasing is False
+ assert idx._is_strictly_monotonic_decreasing is False
+
+ # increasing overlapping shared endpoints
+ idx = pd.IntervalIndex.from_tuples(
+ [(1, 2), (1, 3), (2, 3)], closed=closed)
+ assert idx.is_monotonic is True
+ assert idx._is_strictly_monotonic_increasing is True
+ assert idx.is_monotonic_decreasing is False
+ assert idx._is_strictly_monotonic_decreasing is False
+
+ # decreasing overlapping shared endpoints
+ idx = pd.IntervalIndex.from_tuples(
+ [(2, 3), (1, 3), (1, 2)], closed=closed)
+ assert idx.is_monotonic is False
+ assert idx._is_strictly_monotonic_increasing is False
+ assert idx.is_monotonic_decreasing is True
+ assert idx._is_strictly_monotonic_decreasing is True
+
+ # stationary
+ idx = IntervalIndex.from_tuples([(0, 1), (0, 1)], closed=closed)
+ assert idx.is_monotonic is True
+ assert idx._is_strictly_monotonic_increasing is False
+ assert idx.is_monotonic_decreasing is True
+ assert idx._is_strictly_monotonic_decreasing is False
+
+ # empty
+ idx = IntervalIndex([], closed=closed)
+ assert idx.is_monotonic is True
+ assert idx._is_strictly_monotonic_increasing is True
+ assert idx.is_monotonic_decreasing is True
+ assert idx._is_strictly_monotonic_decreasing is True
+
+ @pytest.mark.skip(reason='not a valid repr as we use interval notation')
+ def test_repr(self):
+ i = IntervalIndex.from_tuples([(0, 1), (1, 2)], closed='right')
+ expected = ("IntervalIndex(left=[0, 1],"
+ "\n right=[1, 2],"
+ "\n closed='right',"
+ "\n dtype='interval[int64]')")
+ assert repr(i) == expected
+
+        i = IntervalIndex.from_tuples([(Timestamp('20130101'),
+                                        Timestamp('20130102')),
+                                       (Timestamp('20130102'),
+                                        Timestamp('20130103'))],
+                                      closed='right')
+ expected = ("IntervalIndex(left=['2013-01-01', '2013-01-02'],"
+ "\n right=['2013-01-02', '2013-01-03'],"
+ "\n closed='right',"
+ "\n dtype='interval[datetime64[ns]]')")
+ assert repr(i) == expected
+
+ @pytest.mark.skip(reason='not a valid repr as we use interval notation')
+ def test_repr_max_seq_item_setting(self):
+ super(TestIntervalIndex, self).test_repr_max_seq_item_setting()
+
+ @pytest.mark.skip(reason='not a valid repr as we use interval notation')
+ def test_repr_roundtrip(self):
+ super(TestIntervalIndex, self).test_repr_roundtrip()
+
+ def test_frame_repr(self):
+ # https://github.com/pandas-dev/pandas/pull/24134/files
+ df = pd.DataFrame({'A': [1, 2, 3, 4]},
+ index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4]))
+ result = repr(df)
+ expected = (
+ ' A\n'
+ '(0, 1] 1\n'
+ '(1, 2] 2\n'
+ '(2, 3] 3\n'
+ '(3, 4] 4'
+ )
+ assert result == expected
+
+ # TODO: check this behavior is consistent with test_interval_new.py
+ def test_get_item(self, closed):
+ i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan),
+ closed=closed)
+ assert i[0] == Interval(0.0, 1.0, closed=closed)
+ assert i[1] == Interval(1.0, 2.0, closed=closed)
+ assert isna(i[2])
+
+ result = i[0:1]
+ expected = IntervalIndex.from_arrays((0.,), (1.,), closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ result = i[0:2]
+ expected = IntervalIndex.from_arrays((0., 1), (1., 2.), closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ result = i[1:3]
+ expected = IntervalIndex.from_arrays((1., np.nan), (2., np.nan),
+ closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_get_loc_value(self):
+ pytest.raises(KeyError, self.index.get_loc, 0)
+ assert self.index.get_loc(0.5) == 0
+ assert self.index.get_loc(1) == 0
+ assert self.index.get_loc(1.5) == 1
+ assert self.index.get_loc(2) == 1
+ pytest.raises(KeyError, self.index.get_loc, -1)
+ pytest.raises(KeyError, self.index.get_loc, 3)
+
+ idx = IntervalIndex.from_tuples([(0, 2), (1, 3)])
+ assert idx.get_loc(0.5) == 0
+ assert idx.get_loc(1) == 0
+ tm.assert_numpy_array_equal(idx.get_loc(1.5),
+ np.array([0, 1], dtype='intp'))
+ tm.assert_numpy_array_equal(np.sort(idx.get_loc(2)),
+ np.array([0, 1], dtype='intp'))
+ assert idx.get_loc(3) == 1
+ pytest.raises(KeyError, idx.get_loc, 3.5)
+
+ idx = IntervalIndex.from_arrays([0, 2], [1, 3])
+ pytest.raises(KeyError, idx.get_loc, 1.5)
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def slice_locs_cases(self, breaks):
+ # TODO: same tests for more index types
+        index = IntervalIndex.from_breaks(breaks, closed='right')
+ assert index.slice_locs() == (0, 2)
+ assert index.slice_locs(0, 1) == (0, 1)
+ assert index.slice_locs(1, 1) == (0, 1)
+ assert index.slice_locs(0, 2) == (0, 2)
+ assert index.slice_locs(0.5, 1.5) == (0, 2)
+ assert index.slice_locs(0, 0.5) == (0, 1)
+ assert index.slice_locs(start=1) == (0, 2)
+ assert index.slice_locs(start=1.2) == (1, 2)
+ assert index.slice_locs(end=1) == (0, 1)
+ assert index.slice_locs(end=1.1) == (0, 2)
+ assert index.slice_locs(end=1.0) == (0, 1)
+ assert index.slice_locs(-1, -1) == (0, 0)
+
+        index = IntervalIndex.from_breaks(breaks, closed='neither')
+ assert index.slice_locs(0, 1) == (0, 1)
+ assert index.slice_locs(0, 2) == (0, 2)
+ assert index.slice_locs(0.5, 1.5) == (0, 2)
+ assert index.slice_locs(1, 1) == (1, 1)
+ assert index.slice_locs(1, 2) == (1, 2)
+
+ index = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)],
+ closed='both')
+ assert index.slice_locs(1, 1) == (0, 1)
+ assert index.slice_locs(1, 2) == (0, 2)
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_slice_locs_int64(self):
+ self.slice_locs_cases([0, 1, 2])
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_slice_locs_float64(self):
+ self.slice_locs_cases([0.0, 1.0, 2.0])
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def slice_locs_decreasing_cases(self, tuples):
+ index = IntervalIndex.from_tuples(tuples)
+ assert index.slice_locs(1.5, 0.5) == (1, 3)
+ assert index.slice_locs(2, 0) == (1, 3)
+ assert index.slice_locs(2, 1) == (1, 3)
+ assert index.slice_locs(3, 1.1) == (0, 3)
+ assert index.slice_locs(3, 3) == (0, 2)
+ assert index.slice_locs(3.5, 3.3) == (0, 1)
+ assert index.slice_locs(1, -3) == (2, 3)
+
+ slice_locs = index.slice_locs(-1, -1)
+ assert slice_locs[0] == slice_locs[1]
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_slice_locs_decreasing_int64(self):
+        self.slice_locs_decreasing_cases([(2, 4), (1, 3), (0, 2)])
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_slice_locs_decreasing_float64(self):
+        self.slice_locs_decreasing_cases([(2., 4.), (1., 3.), (0., 2.)])
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_slice_locs_fails(self):
+ index = IntervalIndex.from_tuples([(1, 2), (0, 1), (2, 3)])
+ with pytest.raises(KeyError):
+ index.slice_locs(1, 2)
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_get_loc_interval(self):
+ assert self.index.get_loc(Interval(0, 1)) == 0
+ assert self.index.get_loc(Interval(0, 0.5)) == 0
+ assert self.index.get_loc(Interval(0, 1, 'left')) == 0
+ pytest.raises(KeyError, self.index.get_loc, Interval(2, 3))
+ pytest.raises(KeyError, self.index.get_loc,
+ Interval(-1, 0, 'left'))
+
+ # Make consistent with test_interval_new.py (see #16316, #16386)
+ @pytest.mark.parametrize('item', [3, Interval(1, 4)])
+ def test_get_loc_length_one(self, item, closed):
+ # GH 20921
+ index = IntervalIndex.from_tuples([(0, 5)], closed=closed)
+ result = index.get_loc(item)
+ assert result == 0
+
+ # Make consistent with test_interval_new.py (see #16316, #16386)
+ @pytest.mark.parametrize('breaks', [
+ date_range('20180101', periods=4),
+ date_range('20180101', periods=4, tz='US/Eastern'),
+ timedelta_range('0 days', periods=4)], ids=lambda x: str(x.dtype))
+ def test_get_loc_datetimelike_nonoverlapping(self, breaks):
+ # GH 20636
+ # nonoverlapping = IntervalIndex method and no i8 conversion
+ index = IntervalIndex.from_breaks(breaks)
+
+ value = index[0].mid
+ result = index.get_loc(value)
+ expected = 0
+ assert result == expected
+
+ interval = Interval(index[0].left, index[1].right)
+ result = index.get_loc(interval)
+ expected = slice(0, 2)
+ assert result == expected
+
+ # Make consistent with test_interval_new.py (see #16316, #16386)
+ @pytest.mark.parametrize('arrays', [
+ (date_range('20180101', periods=4), date_range('20180103', periods=4)),
+ (date_range('20180101', periods=4, tz='US/Eastern'),
+ date_range('20180103', periods=4, tz='US/Eastern')),
+ (timedelta_range('0 days', periods=4),
+ timedelta_range('2 days', periods=4))], ids=lambda x: str(x[0].dtype))
+ def test_get_loc_datetimelike_overlapping(self, arrays):
+ # GH 20636
+ # overlapping = IntervalTree method with i8 conversion
+ index = IntervalIndex.from_arrays(*arrays)
+
+ value = index[0].mid + Timedelta('12 hours')
+ result = np.sort(index.get_loc(value))
+ expected = np.array([0, 1], dtype='intp')
+        tm.assert_numpy_array_equal(result, expected)
+
+ interval = Interval(index[0].left, index[1].right)
+ result = np.sort(index.get_loc(interval))
+ expected = np.array([0, 1, 2], dtype='intp')
+        tm.assert_numpy_array_equal(result, expected)
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_get_indexer(self):
+ actual = self.index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3])
+ expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ actual = self.index.get_indexer(self.index)
+ expected = np.array([0, 1], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ index = IntervalIndex.from_breaks([0, 1, 2], closed='left')
+ actual = index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3])
+ expected = np.array([-1, 0, 0, 1, 1, -1, -1], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ actual = self.index.get_indexer(index[:1])
+ expected = np.array([0], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ actual = self.index.get_indexer(index)
+ expected = np.array([-1, 1], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_get_indexer_subintervals(self):
+
+ # TODO: is this right?
+ # return indexers for wholly contained subintervals
+ target = IntervalIndex.from_breaks(np.linspace(0, 2, 5))
+ actual = self.index.get_indexer(target)
+        expected = np.array([0, 0, 1, 1], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2])
+ actual = self.index.get_indexer(target)
+ expected = np.array([0, 0, 1, 1], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ actual = self.index.get_indexer(target[[0, -1]])
+ expected = np.array([0, 1], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ target = IntervalIndex.from_breaks([0, 0.33, 0.67, 1], closed='left')
+ actual = self.index.get_indexer(target)
+ expected = np.array([0, 0, 0], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ # Make consistent with test_interval_new.py (see #16316, #16386)
+ @pytest.mark.parametrize('item', [
+ [3], np.arange(1, 5), [Interval(1, 4)], interval_range(1, 4)])
+ def test_get_indexer_length_one(self, item, closed):
+ # GH 17284
+ index = IntervalIndex.from_tuples([(0, 5)], closed=closed)
+ result = index.get_indexer(item)
+ expected = np.array([0] * len(item), dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ # Make consistent with test_interval_new.py (see #16316, #16386)
+ @pytest.mark.parametrize('arrays', [
+ (date_range('20180101', periods=4), date_range('20180103', periods=4)),
+ (date_range('20180101', periods=4, tz='US/Eastern'),
+ date_range('20180103', periods=4, tz='US/Eastern')),
+ (timedelta_range('0 days', periods=4),
+ timedelta_range('2 days', periods=4))], ids=lambda x: str(x[0].dtype))
+ def test_get_reindexer_datetimelike(self, arrays):
+ # GH 20636
+ index = IntervalIndex.from_arrays(*arrays)
+ tuples = [(index[0].left, index[0].left + pd.Timedelta('12H')),
+ (index[-1].right - pd.Timedelta('12H'), index[-1].right)]
+ target = IntervalIndex.from_tuples(tuples)
+
+ result = index._get_reindexer(target)
+ expected = np.array([0, 3], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('breaks', [
+ date_range('20180101', periods=4),
+ date_range('20180101', periods=4, tz='US/Eastern'),
+ timedelta_range('0 days', periods=4)], ids=lambda x: str(x.dtype))
+ def test_maybe_convert_i8(self, breaks):
+ # GH 20636
+ index = IntervalIndex.from_breaks(breaks)
+
+ # intervalindex
+ result = index._maybe_convert_i8(index)
+ expected = IntervalIndex.from_breaks(breaks.asi8)
+ tm.assert_index_equal(result, expected)
+
+ # interval
+ interval = Interval(breaks[0], breaks[1])
+ result = index._maybe_convert_i8(interval)
+ expected = Interval(breaks[0].value, breaks[1].value)
+ assert result == expected
+
+ # datetimelike index
+ result = index._maybe_convert_i8(breaks)
+ expected = Index(breaks.asi8)
+ tm.assert_index_equal(result, expected)
+
+ # datetimelike scalar
+ result = index._maybe_convert_i8(breaks[0])
+ expected = breaks[0].value
+ assert result == expected
+
+ # list-like of datetimelike scalars
+ result = index._maybe_convert_i8(list(breaks))
+ expected = Index(breaks.asi8)
+ tm.assert_index_equal(result, expected)
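+        # Taken together with the engine comments in the get_loc tests above
+        # ("no i8 conversion" vs "IntervalTree method with i8 conversion"),
+        # these cases show what _maybe_convert_i8 is for: datetime-like keys
+        # are mapped to their i8 (int64 nanosecond) values so the numeric
+        # IntervalTree engine can handle them, while numeric keys pass
+        # through unchanged (see test_maybe_convert_i8_numeric below).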
+
+ @pytest.mark.parametrize('breaks', [
+ date_range('2018-01-01', periods=5),
+ timedelta_range('0 days', periods=5)])
+ def test_maybe_convert_i8_nat(self, breaks):
+ # GH 20636
+ index = IntervalIndex.from_breaks(breaks)
+
+ to_convert = breaks._constructor([pd.NaT] * 3)
+ expected = pd.Float64Index([np.nan] * 3)
+ result = index._maybe_convert_i8(to_convert)
+ tm.assert_index_equal(result, expected)
+
+ to_convert = to_convert.insert(0, breaks[0])
+ expected = expected.insert(0, float(breaks[0].value))
+ result = index._maybe_convert_i8(to_convert)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('breaks', [
+ np.arange(5, dtype='int64'),
+ np.arange(5, dtype='float64')], ids=lambda x: str(x.dtype))
+ @pytest.mark.parametrize('make_key', [
+ IntervalIndex.from_breaks,
+ lambda breaks: Interval(breaks[0], breaks[1]),
+ lambda breaks: breaks,
+ lambda breaks: breaks[0],
+ list], ids=['IntervalIndex', 'Interval', 'Index', 'scalar', 'list'])
+ def test_maybe_convert_i8_numeric(self, breaks, make_key):
+ # GH 20636
+ index = IntervalIndex.from_breaks(breaks)
+ key = make_key(breaks)
+
+ # no conversion occurs for numeric
+ result = index._maybe_convert_i8(key)
+ assert result is key
+
+ @pytest.mark.parametrize('breaks1, breaks2', permutations([
+ date_range('20180101', periods=4),
+ date_range('20180101', periods=4, tz='US/Eastern'),
+ timedelta_range('0 days', periods=4)], 2), ids=lambda x: str(x.dtype))
+ @pytest.mark.parametrize('make_key', [
+ IntervalIndex.from_breaks,
+ lambda breaks: Interval(breaks[0], breaks[1]),
+ lambda breaks: breaks,
+ lambda breaks: breaks[0],
+ list], ids=['IntervalIndex', 'Interval', 'Index', 'scalar', 'list'])
+ def test_maybe_convert_i8_errors(self, breaks1, breaks2, make_key):
+ # GH 20636
+ index = IntervalIndex.from_breaks(breaks1)
+ key = make_key(breaks2)
+
+ msg = ('Cannot index an IntervalIndex of subtype {dtype1} with '
+ 'values of dtype {dtype2}')
+ msg = re.escape(msg.format(dtype1=breaks1.dtype, dtype2=breaks2.dtype))
+ with pytest.raises(ValueError, match=msg):
+ index._maybe_convert_i8(key)
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_contains(self):
+        # __contains__ only matches whole Interval objects, not scalars.
+ i = IntervalIndex.from_arrays([0, 1], [1, 2])
+
+ # Invalid
+ assert 0 not in i
+ assert 1 not in i
+ assert 2 not in i
+
+ # Valid
+ assert Interval(0, 1) in i
+ assert Interval(0, 2) in i
+ assert Interval(0, 0.5) in i
+ assert Interval(3, 5) not in i
+ assert Interval(-1, 0, closed='left') not in i
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+    def test_contains_method(self):
+        # .contains can select scalar values that fall IN the range of an
+        # interval
+ i = IntervalIndex.from_arrays([0, 1], [1, 2])
+
+ assert i.contains(0.1)
+ assert i.contains(0.5)
+ assert i.contains(1)
+ assert i.contains(Interval(0, 1))
+ assert i.contains(Interval(0, 2))
+
+        # these overlap completely
+ assert i.contains(Interval(0, 3))
+ assert i.contains(Interval(1, 3))
+
+ assert not i.contains(20)
+ assert not i.contains(-20)
+
+ def test_dropna(self, closed):
+
+ expected = IntervalIndex.from_tuples(
+ [(0.0, 1.0), (1.0, 2.0)], closed=closed)
+
+ ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan], closed=closed)
+ result = ii.dropna()
+ tm.assert_index_equal(result, expected)
+
+ ii = IntervalIndex.from_arrays(
+ [0, 1, np.nan], [1, 2, np.nan], closed=closed)
+ result = ii.dropna()
+ tm.assert_index_equal(result, expected)
+
+ # TODO: check this behavior is consistent with test_interval_new.py
+ def test_non_contiguous(self, closed):
+ index = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed)
+ target = [0.5, 1.5, 2.5]
+ actual = index.get_indexer(target)
+ expected = np.array([0, -1, 1], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ assert 1.5 not in index
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_union(self, closed, sort):
+ index = self.create_index(closed=closed)
+ other = IntervalIndex.from_breaks(range(5, 13), closed=closed)
+
+ expected = IntervalIndex.from_breaks(range(13), closed=closed)
+ result = index[::-1].union(other, sort=sort)
+ if sort is None:
+ tm.assert_index_equal(result, expected)
+ assert tm.equalContents(result, expected)
+
+ result = other[::-1].union(index, sort=sort)
+ if sort is None:
+ tm.assert_index_equal(result, expected)
+ assert tm.equalContents(result, expected)
+
+ tm.assert_index_equal(index.union(index, sort=sort), index)
+ tm.assert_index_equal(index.union(index[:1], sort=sort), index)
+
+ # GH 19101: empty result, same dtype
+ index = IntervalIndex(np.array([], dtype='int64'), closed=closed)
+ result = index.union(index, sort=sort)
+ tm.assert_index_equal(result, index)
+
+ # GH 19101: empty result, different dtypes
+ other = IntervalIndex(np.array([], dtype='float64'), closed=closed)
+ result = index.union(other, sort=sort)
+ tm.assert_index_equal(result, index)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection(self, closed, sort):
+ index = self.create_index(closed=closed)
+ other = IntervalIndex.from_breaks(range(5, 13), closed=closed)
+
+ expected = IntervalIndex.from_breaks(range(5, 11), closed=closed)
+ result = index[::-1].intersection(other, sort=sort)
+ if sort is None:
+ tm.assert_index_equal(result, expected)
+ assert tm.equalContents(result, expected)
+
+ result = other[::-1].intersection(index, sort=sort)
+ if sort is None:
+ tm.assert_index_equal(result, expected)
+ assert tm.equalContents(result, expected)
+
+ tm.assert_index_equal(index.intersection(index, sort=sort), index)
+
+ # GH 19101: empty result, same dtype
+ other = IntervalIndex.from_breaks(range(300, 314), closed=closed)
+ expected = IntervalIndex(np.array([], dtype='int64'), closed=closed)
+ result = index.intersection(other, sort=sort)
+ tm.assert_index_equal(result, expected)
+
+ # GH 19101: empty result, different dtypes
+ breaks = np.arange(300, 314, dtype='float64')
+ other = IntervalIndex.from_breaks(breaks, closed=closed)
+ result = index.intersection(other, sort=sort)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference(self, closed, sort):
+ index = IntervalIndex.from_arrays([1, 0, 3, 2],
+ [1, 2, 3, 4],
+ closed=closed)
+ result = index.difference(index[:1], sort=sort)
+ expected = index[1:]
+ if sort is None:
+ expected = expected.sort_values()
+ tm.assert_index_equal(result, expected)
+
+ # GH 19101: empty result, same dtype
+ result = index.difference(index, sort=sort)
+ expected = IntervalIndex(np.array([], dtype='int64'), closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ # GH 19101: empty result, different dtypes
+ other = IntervalIndex.from_arrays(index.left.astype('float64'),
+ index.right, closed=closed)
+ result = index.difference(other, sort=sort)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_symmetric_difference(self, closed, sort):
+ index = self.create_index(closed=closed)
+ result = index[1:].symmetric_difference(index[:-1], sort=sort)
+ expected = IntervalIndex([index[0], index[-1]])
+ if sort is None:
+ tm.assert_index_equal(result, expected)
+ assert tm.equalContents(result, expected)
+
+ # GH 19101: empty result, same dtype
+ result = index.symmetric_difference(index, sort=sort)
+ expected = IntervalIndex(np.array([], dtype='int64'), closed=closed)
+ if sort is None:
+ tm.assert_index_equal(result, expected)
+ assert tm.equalContents(result, expected)
+
+ # GH 19101: empty result, different dtypes
+ other = IntervalIndex.from_arrays(index.left.astype('float64'),
+ index.right, closed=closed)
+ result = index.symmetric_difference(other, sort=sort)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('op_name', [
+ 'union', 'intersection', 'difference', 'symmetric_difference'])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_set_operation_errors(self, closed, op_name, sort):
+ index = self.create_index(closed=closed)
+ set_op = getattr(index, op_name)
+
+ # non-IntervalIndex
+ msg = ('the other index needs to be an IntervalIndex too, but '
+ 'was type Int64Index')
+ with pytest.raises(TypeError, match=msg):
+ set_op(Index([1, 2, 3]), sort=sort)
+
+ # mixed closed
+ msg = ('can only do set operations between two IntervalIndex objects '
+ 'that are closed on the same side')
+ for other_closed in {'right', 'left', 'both', 'neither'} - {closed}:
+ other = self.create_index(closed=other_closed)
+ with pytest.raises(ValueError, match=msg):
+ set_op(other, sort=sort)
+
+ # GH 19016: incompatible dtypes
+ other = interval_range(Timestamp('20180101'), periods=9, closed=closed)
+ msg = ('can only do {op} between two IntervalIndex objects that have '
+ 'compatible dtypes').format(op=op_name)
+ with pytest.raises(TypeError, match=msg):
+ set_op(other, sort=sort)
+
+ def test_isin(self, closed):
+ index = self.create_index(closed=closed)
+
+ expected = np.array([True] + [False] * (len(index) - 1))
+ result = index.isin(index[:1])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = index.isin([index[0]])
+ tm.assert_numpy_array_equal(result, expected)
+
+ other = IntervalIndex.from_breaks(np.arange(-2, 10), closed=closed)
+ expected = np.array([True] * (len(index) - 1) + [False])
+ result = index.isin(other)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = index.isin(other.tolist())
+ tm.assert_numpy_array_equal(result, expected)
+
+ for other_closed in {'right', 'left', 'both', 'neither'}:
+ other = self.create_index(closed=other_closed)
+ expected = np.repeat(closed == other_closed, len(index))
+ result = index.isin(other)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = index.isin(other.tolist())
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_comparison(self):
+ actual = Interval(0, 1) < self.index
+ expected = np.array([False, True])
+ tm.assert_numpy_array_equal(actual, expected)
+
+ actual = Interval(0.5, 1.5) < self.index
+ expected = np.array([False, True])
+ tm.assert_numpy_array_equal(actual, expected)
+ actual = self.index > Interval(0.5, 1.5)
+ tm.assert_numpy_array_equal(actual, expected)
+
+ actual = self.index == self.index
+ expected = np.array([True, True])
+ tm.assert_numpy_array_equal(actual, expected)
+ actual = self.index <= self.index
+ tm.assert_numpy_array_equal(actual, expected)
+ actual = self.index >= self.index
+ tm.assert_numpy_array_equal(actual, expected)
+
+ actual = self.index < self.index
+ expected = np.array([False, False])
+ tm.assert_numpy_array_equal(actual, expected)
+ actual = self.index > self.index
+ tm.assert_numpy_array_equal(actual, expected)
+
+ actual = self.index == IntervalIndex.from_breaks([0, 1, 2], 'left')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ actual = self.index == self.index.values
+ tm.assert_numpy_array_equal(actual, np.array([True, True]))
+ actual = self.index.values == self.index
+ tm.assert_numpy_array_equal(actual, np.array([True, True]))
+ actual = self.index <= self.index.values
+ tm.assert_numpy_array_equal(actual, np.array([True, True]))
+ actual = self.index != self.index.values
+ tm.assert_numpy_array_equal(actual, np.array([False, False]))
+ actual = self.index > self.index.values
+ tm.assert_numpy_array_equal(actual, np.array([False, False]))
+ actual = self.index.values > self.index
+ tm.assert_numpy_array_equal(actual, np.array([False, False]))
+
+ # invalid comparisons
+ actual = self.index == 0
+ tm.assert_numpy_array_equal(actual, np.array([False, False]))
+ actual = self.index == self.index.left
+ tm.assert_numpy_array_equal(actual, np.array([False, False]))
+
+ with pytest.raises(TypeError, match='unorderable types'):
+ self.index > 0
+ with pytest.raises(TypeError, match='unorderable types'):
+ self.index <= 0
+ with pytest.raises(TypeError):
+ self.index > np.arange(2)
+ with pytest.raises(ValueError):
+ self.index > np.arange(3)
+
+ def test_missing_values(self, closed):
+ idx = Index([np.nan, Interval(0, 1, closed=closed),
+ Interval(1, 2, closed=closed)])
+ idx2 = IntervalIndex.from_arrays(
+ [np.nan, 0, 1], [np.nan, 1, 2], closed=closed)
+ assert idx.equals(idx2)
+
+ with pytest.raises(ValueError):
+ IntervalIndex.from_arrays(
+ [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed)
+
+ tm.assert_numpy_array_equal(isna(idx),
+ np.array([True, False, False]))
+
+ def test_sort_values(self, closed):
+ index = self.create_index(closed=closed)
+
+ result = index.sort_values()
+ tm.assert_index_equal(result, index)
+
+ result = index.sort_values(ascending=False)
+ tm.assert_index_equal(result, index[::-1])
+
+ # with nan
+ index = IntervalIndex([Interval(1, 2), np.nan, Interval(0, 1)])
+
+ result = index.sort_values()
+ expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan])
+ tm.assert_index_equal(result, expected)
+
+ result = index.sort_values(ascending=False)
+ expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)])
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('tz', [None, 'US/Eastern'])
+ def test_datetime(self, tz):
+ start = Timestamp('2000-01-01', tz=tz)
+ dates = date_range(start=start, periods=10)
+ index = IntervalIndex.from_breaks(dates)
+
+ # test mid
+ start = Timestamp('2000-01-01T12:00', tz=tz)
+ expected = date_range(start=start, periods=9)
+ tm.assert_index_equal(index.mid, expected)
+
+ # __contains__ doesn't check individual points
+ assert Timestamp('2000-01-01', tz=tz) not in index
+ assert Timestamp('2000-01-01T12', tz=tz) not in index
+ assert Timestamp('2000-01-02', tz=tz) not in index
+ iv_true = Interval(Timestamp('2000-01-01T08', tz=tz),
+ Timestamp('2000-01-01T18', tz=tz))
+ iv_false = Interval(Timestamp('1999-12-31', tz=tz),
+ Timestamp('2000-01-01', tz=tz))
+ assert iv_true in index
+ assert iv_false not in index
+
+ # .contains does check individual points
+ assert not index.contains(Timestamp('2000-01-01', tz=tz))
+ assert index.contains(Timestamp('2000-01-01T12', tz=tz))
+ assert index.contains(Timestamp('2000-01-02', tz=tz))
+ assert index.contains(iv_true)
+ assert not index.contains(iv_false)
+
+ # test get_indexer
+ start = Timestamp('1999-12-31T12:00', tz=tz)
+ target = date_range(start=start, periods=7, freq='12H')
+ actual = index.get_indexer(target)
+ expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ start = Timestamp('2000-01-08T18:00', tz=tz)
+ target = date_range(start=start, periods=7, freq='6H')
+ actual = index.get_indexer(target)
+ expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype='intp')
+ tm.assert_numpy_array_equal(actual, expected)
+
+ def test_append(self, closed):
+
+ index1 = IntervalIndex.from_arrays([0, 1], [1, 2], closed=closed)
+ index2 = IntervalIndex.from_arrays([1, 2], [2, 3], closed=closed)
+
+ result = index1.append(index2)
+ expected = IntervalIndex.from_arrays(
+ [0, 1, 1, 2], [1, 2, 2, 3], closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ result = index1.append([index1, index2])
+ expected = IntervalIndex.from_arrays(
+ [0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ msg = ('can only append two IntervalIndex objects that are closed '
+ 'on the same side')
+ for other_closed in {'left', 'right', 'both', 'neither'} - {closed}:
+ index_other_closed = IntervalIndex.from_arrays(
+ [0, 1], [1, 2], closed=other_closed)
+ with pytest.raises(ValueError, match=msg):
+ index1.append(index_other_closed)
+
+ def test_is_non_overlapping_monotonic(self, closed):
+ # Should be True in all cases
+ tpls = [(0, 1), (2, 3), (4, 5), (6, 7)]
+ idx = IntervalIndex.from_tuples(tpls, closed=closed)
+ assert idx.is_non_overlapping_monotonic is True
+
+ idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
+ assert idx.is_non_overlapping_monotonic is True
+
+ # Should be False in all cases (overlapping)
+ tpls = [(0, 2), (1, 3), (4, 5), (6, 7)]
+ idx = IntervalIndex.from_tuples(tpls, closed=closed)
+ assert idx.is_non_overlapping_monotonic is False
+
+ idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
+ assert idx.is_non_overlapping_monotonic is False
+
+ # Should be False in all cases (non-monotonic)
+ tpls = [(0, 1), (2, 3), (6, 7), (4, 5)]
+ idx = IntervalIndex.from_tuples(tpls, closed=closed)
+ assert idx.is_non_overlapping_monotonic is False
+
+ idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
+ assert idx.is_non_overlapping_monotonic is False
+
+ # Should be False for closed='both', otherwise True (GH16560)
+ if closed == 'both':
+ idx = IntervalIndex.from_breaks(range(4), closed=closed)
+ assert idx.is_non_overlapping_monotonic is False
+ else:
+ idx = IntervalIndex.from_breaks(range(4), closed=closed)
+ assert idx.is_non_overlapping_monotonic is True
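+        # the closed='both' special case follows from shared endpoints:
+        # adjacent intervals built from the same breaks, e.g. [0, 1] and
+        # [1, 2], both contain the point 1 only when closed on both sides,
+        # which makes them overlapping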
+
+ @pytest.mark.parametrize('start, shift, na_value', [
+ (0, 1, np.nan),
+ (Timestamp('2018-01-01'), Timedelta('1 day'), pd.NaT),
+ (Timedelta('0 days'), Timedelta('1 day'), pd.NaT)])
+ def test_is_overlapping(self, start, shift, na_value, closed):
+ # GH 23309
+ # see test_interval_tree.py for extensive tests; interface tests here
+
+ # non-overlapping
+ tuples = [(start + n * shift, start + (n + 1) * shift)
+ for n in (0, 2, 4)]
+ index = IntervalIndex.from_tuples(tuples, closed=closed)
+ assert index.is_overlapping is False
+
+ # non-overlapping with NA
+ tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)]
+ index = IntervalIndex.from_tuples(tuples, closed=closed)
+ assert index.is_overlapping is False
+
+ # overlapping
+ tuples = [(start + n * shift, start + (n + 2) * shift)
+ for n in range(3)]
+ index = IntervalIndex.from_tuples(tuples, closed=closed)
+ assert index.is_overlapping is True
+
+ # overlapping with NA
+ tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)]
+ index = IntervalIndex.from_tuples(tuples, closed=closed)
+ assert index.is_overlapping is True
+
+ # common endpoints
+ tuples = [(start + n * shift, start + (n + 1) * shift)
+ for n in range(3)]
+ index = IntervalIndex.from_tuples(tuples, closed=closed)
+ result = index.is_overlapping
+ expected = closed == 'both'
+ assert result is expected
+
+ # common endpoints with NA
+ tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)]
+ index = IntervalIndex.from_tuples(tuples, closed=closed)
+ result = index.is_overlapping
+ assert result is expected
+
+ @pytest.mark.parametrize('tuples', [
+ lzip(range(10), range(1, 11)),
+ lzip(date_range('20170101', periods=10),
+ date_range('20170101', periods=10)),
+ lzip(timedelta_range('0 days', periods=10),
+ timedelta_range('1 day', periods=10))])
+ def test_to_tuples(self, tuples):
+ # GH 18756
+ idx = IntervalIndex.from_tuples(tuples)
+ result = idx.to_tuples()
+ expected = Index(com.asarray_tuplesafe(tuples))
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('tuples', [
+ lzip(range(10), range(1, 11)) + [np.nan],
+ lzip(date_range('20170101', periods=10),
+ date_range('20170101', periods=10)) + [np.nan],
+ lzip(timedelta_range('0 days', periods=10),
+ timedelta_range('1 day', periods=10)) + [np.nan]])
+ @pytest.mark.parametrize('na_tuple', [True, False])
+ def test_to_tuples_na(self, tuples, na_tuple):
+ # GH 18756
+ idx = IntervalIndex.from_tuples(tuples)
+ result = idx.to_tuples(na_tuple=na_tuple)
+
+ # check the non-NA portion
+ expected_notna = Index(com.asarray_tuplesafe(tuples[:-1]))
+ result_notna = result[:-1]
+ tm.assert_index_equal(result_notna, expected_notna)
+
+ # check the NA portion
+ result_na = result[-1]
+ if na_tuple:
+ assert isinstance(result_na, tuple)
+ assert len(result_na) == 2
+ assert all(isna(x) for x in result_na)
+ else:
+ assert isna(result_na)
+
+ def test_nbytes(self):
+ # GH 19209
+ left = np.arange(0, 4, dtype='i8')
+ right = np.arange(1, 5, dtype='i8')
+
+ result = IntervalIndex.from_arrays(left, right).nbytes
+ expected = 64 # 4 * 8 * 2
+ assert result == expected
+
+ def test_itemsize(self):
+ # GH 19209
+ left = np.arange(0, 4, dtype='i8')
+ right = np.arange(1, 5, dtype='i8')
+ expected = 16 # 8 * 2
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = IntervalIndex.from_arrays(left, right).itemsize
+
+ assert result == expected
+
+ @pytest.mark.parametrize('new_closed', [
+ 'left', 'right', 'both', 'neither'])
+ def test_set_closed(self, name, closed, new_closed):
+ # GH 21670
+ index = interval_range(0, 5, closed=closed, name=name)
+ result = index.set_closed(new_closed)
+ expected = interval_range(0, 5, closed=new_closed, name=name)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('bad_closed', ['foo', 10, 'LEFT', True, False])
+ def test_set_closed_errors(self, bad_closed):
+ # GH 21670
+ index = interval_range(0, 5)
+ msg = "invalid option for 'closed': {closed}".format(closed=bad_closed)
+ with pytest.raises(ValueError, match=msg):
+ index.set_closed(bad_closed)
+
+ def test_is_all_dates(self):
+ # GH 23576
+ year_2017 = pd.Interval(pd.Timestamp('2017-01-01 00:00:00'),
+ pd.Timestamp('2018-01-01 00:00:00'))
+ year_2017_index = pd.IntervalIndex([year_2017])
+ assert not year_2017_index.is_all_dates
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_new.py b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_new.py
new file mode 100644
index 00000000000..fcffa29f7ea
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_new.py
@@ -0,0 +1,271 @@
+from __future__ import division
+
+import numpy as np
+import pytest
+
+from pandas import Int64Index, Interval, IntervalIndex
+import pandas.util.testing as tm
+
+pytestmark = pytest.mark.skip(reason="new indexing tests for issue 16316")
+
+
+class TestIntervalIndex(object):
+
+ @pytest.mark.parametrize("side", ['right', 'left', 'both', 'neither'])
+ def test_get_loc_interval(self, closed, side):
+
+ idx = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed)
+
+ for bound in [[0, 1], [1, 2], [2, 3], [3, 4],
+ [0, 2], [2.5, 3], [-1, 4]]:
+            # if get_loc is supplied an interval, it should only search for
+            # exact matches, not overlaps or covers; otherwise raise KeyError.
+ if closed == side:
+ if bound == [0, 1]:
+ assert idx.get_loc(Interval(0, 1, closed=side)) == 0
+ elif bound == [2, 3]:
+ assert idx.get_loc(Interval(2, 3, closed=side)) == 1
+ else:
+ with pytest.raises(KeyError):
+ idx.get_loc(Interval(*bound, closed=side))
+ else:
+ with pytest.raises(KeyError):
+ idx.get_loc(Interval(*bound, closed=side))
+
+ @pytest.mark.parametrize("scalar", [-0.5, 0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5])
+ def test_get_loc_scalar(self, closed, scalar):
+
+ # correct = {side: {query: answer}}.
+ # If query is not in the dict, that query should raise a KeyError
+ correct = {'right': {0.5: 0, 1: 0, 2.5: 1, 3: 1},
+ 'left': {0: 0, 0.5: 0, 2: 1, 2.5: 1},
+ 'both': {0: 0, 0.5: 0, 1: 0, 2: 1, 2.5: 1, 3: 1},
+ 'neither': {0.5: 0, 2.5: 1}}
+
+ idx = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed)
+
+        # if get_loc is supplied a scalar, it should return the index of the
+        # interval which contains the scalar, or raise KeyError otherwise.
+ if scalar in correct[closed].keys():
+ assert idx.get_loc(scalar) == correct[closed][scalar]
+ else:
+ pytest.raises(KeyError, idx.get_loc, scalar)
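+        # as a concrete reading of the dict above: with closed='right' the
+        # first interval is (0, 1], so get_loc(1) == 0 and get_loc(0.5) == 0,
+        # while get_loc(0) raises KeyError since the left endpoint is open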
+
+ def test_slice_locs_with_interval(self):
+
+ # increasing monotonically
+ index = IntervalIndex.from_tuples([(0, 2), (1, 3), (2, 4)])
+
+ assert index.slice_locs(
+ start=Interval(0, 2), end=Interval(2, 4)) == (0, 3)
+ assert index.slice_locs(start=Interval(0, 2)) == (0, 3)
+ assert index.slice_locs(end=Interval(2, 4)) == (0, 3)
+ assert index.slice_locs(end=Interval(0, 2)) == (0, 1)
+ assert index.slice_locs(
+ start=Interval(2, 4), end=Interval(0, 2)) == (2, 1)
+
+ # decreasing monotonically
+ index = IntervalIndex.from_tuples([(2, 4), (1, 3), (0, 2)])
+
+ assert index.slice_locs(
+ start=Interval(0, 2), end=Interval(2, 4)) == (2, 1)
+ assert index.slice_locs(start=Interval(0, 2)) == (2, 3)
+ assert index.slice_locs(end=Interval(2, 4)) == (0, 1)
+ assert index.slice_locs(end=Interval(0, 2)) == (0, 3)
+ assert index.slice_locs(
+ start=Interval(2, 4), end=Interval(0, 2)) == (0, 3)
+
+ # sorted duplicates
+ index = IntervalIndex.from_tuples([(0, 2), (0, 2), (2, 4)])
+
+ assert index.slice_locs(
+ start=Interval(0, 2), end=Interval(2, 4)) == (0, 3)
+ assert index.slice_locs(start=Interval(0, 2)) == (0, 3)
+ assert index.slice_locs(end=Interval(2, 4)) == (0, 3)
+ assert index.slice_locs(end=Interval(0, 2)) == (0, 2)
+ assert index.slice_locs(
+ start=Interval(2, 4), end=Interval(0, 2)) == (2, 2)
+
+ # unsorted duplicates
+ index = IntervalIndex.from_tuples([(0, 2), (2, 4), (0, 2)])
+
+        with pytest.raises(KeyError):
+            index.slice_locs(start=Interval(0, 2), end=Interval(2, 4))
+        with pytest.raises(KeyError):
+            index.slice_locs(start=Interval(0, 2))
+        assert index.slice_locs(end=Interval(2, 4)) == (0, 2)
+        with pytest.raises(KeyError):
+            index.slice_locs(end=Interval(0, 2))
+        with pytest.raises(KeyError):
+            index.slice_locs(start=Interval(2, 4), end=Interval(0, 2))
+
+ # another unsorted duplicates
+ index = IntervalIndex.from_tuples([(0, 2), (0, 2), (2, 4), (1, 3)])
+
+ assert index.slice_locs(
+ start=Interval(0, 2), end=Interval(2, 4)) == (0, 3)
+ assert index.slice_locs(start=Interval(0, 2)) == (0, 4)
+ assert index.slice_locs(end=Interval(2, 4)) == (0, 3)
+ assert index.slice_locs(end=Interval(0, 2)) == (0, 2)
+ assert index.slice_locs(
+ start=Interval(2, 4), end=Interval(0, 2)) == (2, 2)
+
+ def test_slice_locs_with_ints_and_floats_succeeds(self):
+
+ # increasing non-overlapping
+ index = IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)])
+
+ assert index.slice_locs(0, 1) == (0, 1)
+ assert index.slice_locs(0, 2) == (0, 2)
+ assert index.slice_locs(0, 3) == (0, 2)
+ assert index.slice_locs(3, 1) == (2, 1)
+ assert index.slice_locs(3, 4) == (2, 3)
+ assert index.slice_locs(0, 4) == (0, 3)
+
+ # decreasing non-overlapping
+ index = IntervalIndex.from_tuples([(3, 4), (1, 2), (0, 1)])
+ assert index.slice_locs(0, 1) == (3, 2)
+ assert index.slice_locs(0, 2) == (3, 1)
+ assert index.slice_locs(0, 3) == (3, 1)
+ assert index.slice_locs(3, 1) == (1, 2)
+ assert index.slice_locs(3, 4) == (1, 0)
+ assert index.slice_locs(0, 4) == (3, 0)
+
+ @pytest.mark.parametrize("query", [
+ [0, 1], [0, 2], [0, 3], [3, 1], [3, 4], [0, 4]])
+ @pytest.mark.parametrize("tuples", [
+ [(0, 2), (1, 3), (2, 4)], [(2, 4), (1, 3), (0, 2)],
+ [(0, 2), (0, 2), (2, 4)], [(0, 2), (2, 4), (0, 2)],
+ [(0, 2), (0, 2), (2, 4), (1, 3)]])
+ def test_slice_locs_with_ints_and_floats_errors(self, tuples, query):
+ index = IntervalIndex.from_tuples(tuples)
+ with pytest.raises(KeyError):
+ index.slice_locs(query)
+
+ @pytest.mark.parametrize('query, expected', [
+ ([Interval(1, 3, closed='right')], [1]),
+ ([Interval(1, 3, closed='left')], [-1]),
+ ([Interval(1, 3, closed='both')], [-1]),
+ ([Interval(1, 3, closed='neither')], [-1]),
+ ([Interval(1, 4, closed='right')], [-1]),
+ ([Interval(0, 4, closed='right')], [-1]),
+ ([Interval(1, 2, closed='right')], [-1]),
+ ([Interval(2, 4, closed='right'), Interval(1, 3, closed='right')],
+ [2, 1]),
+ ([Interval(1, 3, closed='right'), Interval(0, 2, closed='right')],
+ [1, -1]),
+ ([Interval(1, 3, closed='right'), Interval(1, 3, closed='left')],
+ [1, -1])])
+ def test_get_indexer_with_interval(self, query, expected):
+
+ tuples = [(0, 2.5), (1, 3), (2, 4)]
+ index = IntervalIndex.from_tuples(tuples, closed='right')
+
+ result = index.get_indexer(query)
+ expected = np.array(expected, dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('query, expected', [
+ ([-0.5], [-1]),
+ ([0], [-1]),
+ ([0.5], [0]),
+ ([1], [0]),
+ ([1.5], [1]),
+ ([2], [1]),
+ ([2.5], [-1]),
+ ([3], [-1]),
+ ([3.5], [2]),
+ ([4], [2]),
+ ([4.5], [-1]),
+ ([1, 2], [0, 1]),
+ ([1, 2, 3], [0, 1, -1]),
+ ([1, 2, 3, 4], [0, 1, -1, 2]),
+ ([1, 2, 3, 4, 2], [0, 1, -1, 2, 1])])
+ def test_get_indexer_with_int_and_float(self, query, expected):
+
+ tuples = [(0, 1), (1, 2), (3, 4)]
+ index = IntervalIndex.from_tuples(tuples, closed='right')
+
+ result = index.get_indexer(query)
+ expected = np.array(expected, dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('tuples, closed', [
+ ([(0, 2), (1, 3), (3, 4)], 'neither'),
+ ([(0, 5), (1, 4), (6, 7)], 'left'),
+ ([(0, 1), (0, 1), (1, 2)], 'right'),
+ ([(0, 1), (2, 3), (3, 4)], 'both')])
+ def test_get_indexer_errors(self, tuples, closed):
+        # IntervalIndex must be non-overlapping for query results to be unique
+ index = IntervalIndex.from_tuples(tuples, closed=closed)
+
+ msg = ('cannot handle overlapping indices; use '
+ 'IntervalIndex.get_indexer_non_unique')
+ with pytest.raises(ValueError, match=msg):
+ index.get_indexer([0, 2])
+
+ @pytest.mark.parametrize('query, expected', [
+ ([-0.5], ([-1], [0])),
+ ([0], ([0], [])),
+ ([0.5], ([0], [])),
+ ([1], ([0, 1], [])),
+ ([1.5], ([0, 1], [])),
+ ([2], ([0, 1, 2], [])),
+ ([2.5], ([1, 2], [])),
+ ([3], ([2], [])),
+ ([3.5], ([2], [])),
+ ([4], ([-1], [0])),
+ ([4.5], ([-1], [0])),
+ ([1, 2], ([0, 1, 0, 1, 2], [])),
+ ([1, 2, 3], ([0, 1, 0, 1, 2, 2], [])),
+ ([1, 2, 3, 4], ([0, 1, 0, 1, 2, 2, -1], [3])),
+ ([1, 2, 3, 4, 2], ([0, 1, 0, 1, 2, 2, -1, 0, 1, 2], [3]))])
+ def test_get_indexer_non_unique_with_int_and_float(self, query, expected):
+
+ tuples = [(0, 2.5), (1, 3), (2, 4)]
+ index = IntervalIndex.from_tuples(tuples, closed='left')
+
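+ # overlapping intervals can match a query several times: matches are
+ # concatenated per query, and missing holds the positions of queries
+ # (here position 3, the value 4) that matched nothing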
+ result_indexer, result_missing = index.get_indexer_non_unique(query)
+ expected_indexer = Int64Index(expected[0])
+ expected_missing = np.array(expected[1], dtype='intp')
+
+ tm.assert_index_equal(result_indexer, expected_indexer)
+ tm.assert_numpy_array_equal(result_missing, expected_missing)
+
+ # TODO: we may also want to test get_indexer for the case when
+ # the intervals are duplicated, decreasing, non-monotonic, etc.
+
+ def test_contains(self):
+
+ index = IntervalIndex.from_arrays([0, 1], [1, 2], closed='right')
+
+ # __contains__ requires perfect matches to intervals.
+ assert 0 not in index
+ assert 1 not in index
+ assert 2 not in index
+
+ assert Interval(0, 1, closed='right') in index
+ assert Interval(0, 2, closed='right') not in index
+ assert Interval(0, 0.5, closed='right') not in index
+ assert Interval(3, 5, closed='right') not in index
+ assert Interval(-1, 0, closed='left') not in index
+ assert Interval(0, 1, closed='left') not in index
+ assert Interval(0, 1, closed='both') not in index
+
+ def test_contains_method(self):
+
+ index = IntervalIndex.from_arrays([0, 1], [1, 2], closed='right')
+
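+ # unlike __contains__ above, .contains checks whether a scalar falls
+ # inside any interval; 0 is excluded because both intervals are open
+ # on the left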
+ assert not index.contains(0)
+ assert index.contains(0.1)
+ assert index.contains(0.5)
+ assert index.contains(1)
+
+ assert index.contains(Interval(0, 1, closed='right'))
+ assert not index.contains(Interval(0, 1, closed='left'))
+ assert not index.contains(Interval(0, 1, closed='both'))
+ assert not index.contains(Interval(0, 2, closed='right'))
+
+ assert not index.contains(Interval(0, 3, closed='right'))
+ assert not index.contains(Interval(1, 3, closed='right'))
+
+ assert not index.contains(20)
+ assert not index.contains(-20)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_range.py b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_range.py
new file mode 100644
index 00000000000..13b7b643999
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_range.py
@@ -0,0 +1,316 @@
+from __future__ import division
+
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.common import is_integer
+
+from pandas import (
+ DateOffset, Interval, IntervalIndex, Timedelta, Timestamp, date_range,
+ interval_range, timedelta_range)
+import pandas.util.testing as tm
+
+from pandas.tseries.offsets import Day
+
+
[email protected](scope='class', params=[None, 'foo'])
+def name(request):
+ return request.param
+
+
+class TestIntervalRange(object):
+
+ @pytest.mark.parametrize('freq, periods', [
+ (1, 100), (2.5, 40), (5, 20), (25, 4)])
+ def test_constructor_numeric(self, closed, name, freq, periods):
+ start, end = 0, 100
+ breaks = np.arange(101, step=freq)
+ expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed)
+
+ # defined from start/end/freq
+ result = interval_range(
+ start=start, end=end, freq=freq, name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ # defined from start/periods/freq
+ result = interval_range(
+ start=start, periods=periods, freq=freq, name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ # defined from end/periods/freq
+ result = interval_range(
+ end=end, periods=periods, freq=freq, name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ # GH 20976: linspace behavior defined from start/end/periods
+ result = interval_range(
+ start=start, end=end, periods=periods, name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('tz', [None, 'US/Eastern'])
+ @pytest.mark.parametrize('freq, periods', [
+ ('D', 364), ('2D', 182), ('22D18H', 16), ('M', 11)])
+ def test_constructor_timestamp(self, closed, name, freq, periods, tz):
+ start, end = Timestamp('20180101', tz=tz), Timestamp('20181231', tz=tz)
+ breaks = date_range(start=start, end=end, freq=freq)
+ expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed)
+
+ # defined from start/end/freq
+ result = interval_range(
+ start=start, end=end, freq=freq, name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ # defined from start/periods/freq
+ result = interval_range(
+ start=start, periods=periods, freq=freq, name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ # defined from end/periods/freq
+ result = interval_range(
+ end=end, periods=periods, freq=freq, name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ # GH 20976: linspace behavior defined from start/end/periods
+ if not breaks.freq.isAnchored() and tz is None:
+ # matches expected only for non-anchored offsets and tz naive
+ # (anchored/DST transitions cause unequal spacing in expected)
+ result = interval_range(start=start, end=end, periods=periods,
+ name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('freq, periods', [
+ ('D', 100), ('2D12H', 40), ('5D', 20), ('25D', 4)])
+ def test_constructor_timedelta(self, closed, name, freq, periods):
+ start, end = Timedelta('0 days'), Timedelta('100 days')
+ breaks = timedelta_range(start=start, end=end, freq=freq)
+ expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed)
+
+ # defined from start/end/freq
+ result = interval_range(
+ start=start, end=end, freq=freq, name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ # defined from start/periods/freq
+ result = interval_range(
+ start=start, periods=periods, freq=freq, name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ # defined from end/periods/freq
+ result = interval_range(
+ end=end, periods=periods, freq=freq, name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ # GH 20976: linspace behavior defined from start/end/periods
+ result = interval_range(
+ start=start, end=end, periods=periods, name=name, closed=closed)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('start, end, freq, expected_endpoint', [
+ (0, 10, 3, 9),
+ (0, 10, 1.5, 9),
+ (0.5, 10, 3, 9.5),
+ (Timedelta('0D'), Timedelta('10D'), '2D4H', Timedelta('8D16H')),
+ (Timestamp('2018-01-01'),
+ Timestamp('2018-02-09'),
+ 'MS',
+ Timestamp('2018-02-01')),
+ (Timestamp('2018-01-01', tz='US/Eastern'),
+ Timestamp('2018-01-20', tz='US/Eastern'),
+ '5D12H',
+ Timestamp('2018-01-17 12:00:00', tz='US/Eastern'))])
+ def test_early_truncation(self, start, end, freq, expected_endpoint):
+ # index truncates early if freq causes end to be skipped
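+ # e.g. start=0, end=10, freq=3 yields breaks [0, 3, 6, 9]: 10 is not
+ # reachable in steps of 3, so the last right endpoint is 9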
+ result = interval_range(start=start, end=end, freq=freq)
+ result_endpoint = result.right[-1]
+ assert result_endpoint == expected_endpoint
+
+ @pytest.mark.parametrize('start, end, freq', [
+ (0.5, None, None),
+ (None, 4.5, None),
+ (0.5, None, 1.5),
+ (None, 6.5, 1.5)])
+ def test_no_invalid_float_truncation(self, start, end, freq):
+ # GH 21161
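+ # freq defaults to 1 when omitted, so the four intervals step by 1
+ # from the 0.5 offset; an explicit freq=1.5 steps the breaks by 1.5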
+ if freq is None:
+ breaks = [0.5, 1.5, 2.5, 3.5, 4.5]
+ else:
+ breaks = [0.5, 2.0, 3.5, 5.0, 6.5]
+ expected = IntervalIndex.from_breaks(breaks)
+
+ result = interval_range(start=start, end=end, periods=4, freq=freq)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('start, mid, end', [
+ (Timestamp('2018-03-10', tz='US/Eastern'),
+ Timestamp('2018-03-10 23:30:00', tz='US/Eastern'),
+ Timestamp('2018-03-12', tz='US/Eastern')),
+ (Timestamp('2018-11-03', tz='US/Eastern'),
+ Timestamp('2018-11-04 00:30:00', tz='US/Eastern'),
+ Timestamp('2018-11-05', tz='US/Eastern'))])
+ def test_linspace_dst_transition(self, start, mid, end):
+ # GH 20976: linspace behavior defined from start/end/periods
+ # accounts for the hour gained/lost during DST transition
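+ # e.g. 2018-03-10 -> 2018-03-12 spans 47 real hours across the
+ # spring-forward transition, so the wall-clock midpoint is 23:30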
+ result = interval_range(start=start, end=end, periods=2)
+ expected = IntervalIndex.from_breaks([start, mid, end])
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('freq', [2, 2.0])
+ @pytest.mark.parametrize('end', [10, 10.0])
+ @pytest.mark.parametrize('start', [0, 0.0])
+ def test_float_subtype(self, start, end, freq):
+ # Has float subtype if any of start/end/freq are float, even if all
+ # resulting endpoints can safely be upcast to integers
+
+ # defined from start/end/freq
+ index = interval_range(start=start, end=end, freq=freq)
+ result = index.dtype.subtype
+ expected = 'int64' if is_integer(start + end + freq) else 'float64'
+ assert result == expected
+
+ # defined from start/periods/freq
+ index = interval_range(start=start, periods=5, freq=freq)
+ result = index.dtype.subtype
+ expected = 'int64' if is_integer(start + freq) else 'float64'
+ assert result == expected
+
+ # defined from end/periods/freq
+ index = interval_range(end=end, periods=5, freq=freq)
+ result = index.dtype.subtype
+ expected = 'int64' if is_integer(end + freq) else 'float64'
+ assert result == expected
+
+ # GH 20976: linspace behavior defined from start/end/periods
+ index = interval_range(start=start, end=end, periods=5)
+ result = index.dtype.subtype
+ expected = 'int64' if is_integer(start + end) else 'float64'
+ assert result == expected
+
+ def test_constructor_coverage(self):
+ # float value for periods
+ expected = interval_range(start=0, periods=10)
+ result = interval_range(start=0, periods=10.5)
+ tm.assert_index_equal(result, expected)
+
+ # equivalent timestamp-like start/end
+ start, end = Timestamp('2017-01-01'), Timestamp('2017-01-15')
+ expected = interval_range(start=start, end=end)
+
+ result = interval_range(start=start.to_pydatetime(),
+ end=end.to_pydatetime())
+ tm.assert_index_equal(result, expected)
+
+ result = interval_range(start=start.asm8, end=end.asm8)
+ tm.assert_index_equal(result, expected)
+
+ # equivalent freq with timestamp
+ equiv_freq = ['D', Day(), Timedelta(days=1), timedelta(days=1),
+ DateOffset(days=1)]
+ for freq in equiv_freq:
+ result = interval_range(start=start, end=end, freq=freq)
+ tm.assert_index_equal(result, expected)
+
+ # equivalent timedelta-like start/end
+ start, end = Timedelta(days=1), Timedelta(days=10)
+ expected = interval_range(start=start, end=end)
+
+ result = interval_range(start=start.to_pytimedelta(),
+ end=end.to_pytimedelta())
+ tm.assert_index_equal(result, expected)
+
+ result = interval_range(start=start.asm8, end=end.asm8)
+ tm.assert_index_equal(result, expected)
+
+ # equivalent freq with timedelta
+ equiv_freq = ['D', Day(), Timedelta(days=1), timedelta(days=1)]
+ for freq in equiv_freq:
+ result = interval_range(start=start, end=end, freq=freq)
+ tm.assert_index_equal(result, expected)
+
+ def test_errors(self):
+ # not enough params
+ msg = ('Of the four parameters: start, end, periods, and freq, '
+ 'exactly three must be specified')
+
+ with pytest.raises(ValueError, match=msg):
+ interval_range(start=0)
+
+ with pytest.raises(ValueError, match=msg):
+ interval_range(end=5)
+
+ with pytest.raises(ValueError, match=msg):
+ interval_range(periods=2)
+
+ with pytest.raises(ValueError, match=msg):
+ interval_range()
+
+ # too many params
+ with pytest.raises(ValueError, match=msg):
+ interval_range(start=0, end=5, periods=6, freq=1.5)
+
+ # mixed units
+ msg = 'start, end, freq need to be type compatible'
+ with pytest.raises(TypeError, match=msg):
+ interval_range(start=0, end=Timestamp('20130101'), freq=2)
+
+ with pytest.raises(TypeError, match=msg):
+ interval_range(start=0, end=Timedelta('1 day'), freq=2)
+
+ with pytest.raises(TypeError, match=msg):
+ interval_range(start=0, end=10, freq='D')
+
+ with pytest.raises(TypeError, match=msg):
+ interval_range(start=Timestamp('20130101'), end=10, freq='D')
+
+ with pytest.raises(TypeError, match=msg):
+ interval_range(start=Timestamp('20130101'),
+ end=Timedelta('1 day'), freq='D')
+
+ with pytest.raises(TypeError, match=msg):
+ interval_range(start=Timestamp('20130101'),
+ end=Timestamp('20130110'), freq=2)
+
+ with pytest.raises(TypeError, match=msg):
+ interval_range(start=Timedelta('1 day'), end=10, freq='D')
+
+ with pytest.raises(TypeError, match=msg):
+ interval_range(start=Timedelta('1 day'),
+ end=Timestamp('20130110'), freq='D')
+
+ with pytest.raises(TypeError, match=msg):
+ interval_range(start=Timedelta('1 day'),
+ end=Timedelta('10 days'), freq=2)
+
+ # invalid periods
+ msg = 'periods must be a number, got foo'
+ with pytest.raises(TypeError, match=msg):
+ interval_range(start=0, periods='foo')
+
+ # invalid start
+ msg = 'start must be numeric or datetime-like, got foo'
+ with pytest.raises(ValueError, match=msg):
+ interval_range(start='foo', periods=10)
+
+ # invalid end
+ msg = r'end must be numeric or datetime-like, got \(0, 1\]'
+ with pytest.raises(ValueError, match=msg):
+ interval_range(end=Interval(0, 1), periods=10)
+
+ # invalid freq for datetime-like
+ msg = 'freq must be numeric or convertible to DateOffset, got foo'
+ with pytest.raises(ValueError, match=msg):
+ interval_range(start=0, end=10, freq='foo')
+
+ with pytest.raises(ValueError, match=msg):
+ interval_range(start=Timestamp('20130101'), periods=10, freq='foo')
+
+ with pytest.raises(ValueError, match=msg):
+ interval_range(end=Timedelta('1 day'), periods=10, freq='foo')
+
+ # mixed tz
+ start = Timestamp('2017-01-01', tz='US/Eastern')
+ end = Timestamp('2017-01-07', tz='US/Pacific')
+ msg = 'Start and end cannot both be tz-aware with different timezones'
+ with pytest.raises(TypeError, match=msg):
+ interval_range(start=start, end=end)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_tree.py b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_tree.py
new file mode 100644
index 00000000000..5d9ef2a9a6c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/interval/test_interval_tree.py
@@ -0,0 +1,184 @@
+from __future__ import division
+
+from itertools import permutations
+
+import numpy as np
+import pytest
+
+from pandas._libs.interval import IntervalTree
+
+from pandas import compat
+import pandas.util.testing as tm
+
+
+def skipif_32bit(param):
+ """
+ Skip parameters in a parametrize on 32bit systems. Specifically used
+ here to skip leaf_size parameters related to GH 23440.
+ """
+ marks = pytest.mark.skipif(compat.is_platform_32bit(),
+ reason='GH 23440: int type mismatch on 32bit')
+ return pytest.param(param, marks=marks)
+
+
[email protected](
+ scope='class', params=['int32', 'int64', 'float32', 'float64', 'uint64'])
+def dtype(request):
+ return request.param
+
+
[email protected](params=[skipif_32bit(1), skipif_32bit(2), 10])
+def leaf_size(request):
+ """
+ Fixture to specify IntervalTree leaf_size parameter; to be used with the
+ tree fixture.
+ """
+ return request.param
+
+
[email protected](params=[
+ np.arange(5, dtype='int64'),
+ np.arange(5, dtype='int32'),
+ np.arange(5, dtype='uint64'),
+ np.arange(5, dtype='float64'),
+ np.arange(5, dtype='float32'),
+ np.array([0, 1, 2, 3, 4, np.nan], dtype='float64'),
+ np.array([0, 1, 2, 3, 4, np.nan], dtype='float32')])
+def tree(request, leaf_size):
+ left = request.param
+ return IntervalTree(left, left + 2, leaf_size=leaf_size)
+
+
+class TestIntervalTree(object):
+
+ def test_get_loc(self, tree):
+ result = tree.get_loc(1)
+ expected = np.array([0], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = np.sort(tree.get_loc(2))
+ expected = np.array([0, 1], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ with pytest.raises(KeyError):
+ tree.get_loc(-1)
+
+ def test_get_indexer(self, tree):
+ result = tree.get_indexer(np.array([1.0, 5.5, 6.5]))
+ expected = np.array([0, 4, -1], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ with pytest.raises(KeyError):
+ tree.get_indexer(np.array([3.0]))
+
+ def test_get_indexer_non_unique(self, tree):
+ indexer, missing = tree.get_indexer_non_unique(
+ np.array([1.0, 2.0, 6.5]))
+
+ result = indexer[:1]
+ expected = np.array([0], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = np.sort(indexer[1:3])
+ expected = np.array([0, 1], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = np.sort(indexer[3:])
+ expected = np.array([-1], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = missing
+ expected = np.array([2], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_duplicates(self, dtype):
+ left = np.array([0, 0, 0], dtype=dtype)
+ tree = IntervalTree(left, left + 1)
+
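+ # three identical intervals: get_loc returns every match, get_indexer
+ # (unique matches only) raises, and get_indexer_non_unique returns all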
+ result = np.sort(tree.get_loc(0.5))
+ expected = np.array([0, 1, 2], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ with pytest.raises(KeyError):
+ tree.get_indexer(np.array([0.5]))
+
+ indexer, missing = tree.get_indexer_non_unique(np.array([0.5]))
+ result = np.sort(indexer)
+ expected = np.array([0, 1, 2], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = missing
+ expected = np.array([], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_get_loc_closed(self, closed):
+ tree = IntervalTree([0], [1], closed=closed)
+ for p, errors in [(0, tree.open_left),
+ (1, tree.open_right)]:
+ if errors:
+ with pytest.raises(KeyError):
+ tree.get_loc(p)
+ else:
+ result = tree.get_loc(p)
+ expected = np.array([0], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('leaf_size', [
+ skipif_32bit(1), skipif_32bit(10), skipif_32bit(100), 10000])
+ def test_get_indexer_closed(self, closed, leaf_size):
+ x = np.arange(1000, dtype='float64')
+ found = x.astype('intp')
+ not_found = (-1 * np.ones(1000)).astype('intp')
+
+ tree = IntervalTree(x, x + 0.5, closed=closed, leaf_size=leaf_size)
+ tm.assert_numpy_array_equal(found, tree.get_indexer(x + 0.25))
+
+ expected = found if tree.closed_left else not_found
+ tm.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.0))
+
+ expected = found if tree.closed_right else not_found
+ tm.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.5))
+
+ @pytest.mark.parametrize('left, right, expected', [
+ (np.array([0, 1, 4]), np.array([2, 3, 5]), True),
+ (np.array([0, 1, 2]), np.array([5, 4, 3]), True),
+ (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True),
+ (np.array([0, 2, 4]), np.array([1, 3, 5]), False),
+ (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False)])
+ @pytest.mark.parametrize('order', map(list, permutations(range(3))))
+ def test_is_overlapping(self, closed, order, left, right, expected):
+ # GH 23309
+ tree = IntervalTree(left[order], right[order], closed=closed)
+ result = tree.is_overlapping
+ assert result is expected
+
+ @pytest.mark.parametrize('order', map(list, permutations(range(3))))
+ def test_is_overlapping_endpoints(self, closed, order):
+ """shared endpoints are marked as overlapping"""
+ # GH 23309
+ left, right = np.arange(3), np.arange(1, 4)
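+ # the intervals (0, 1), (1, 2), (2, 3) touch only at shared endpoints,
+ # so they overlap only when closed on both sides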
+ tree = IntervalTree(left[order], right[order], closed=closed)
+ result = tree.is_overlapping
+ expected = closed == 'both'
+ assert result is expected
+
+ @pytest.mark.parametrize('left, right', [
+ (np.array([], dtype='int64'), np.array([], dtype='int64')),
+ (np.array([0], dtype='int64'), np.array([1], dtype='int64')),
+ (np.array([np.nan]), np.array([np.nan])),
+ (np.array([np.nan] * 3), np.array([np.nan] * 3))])
+ def test_is_overlapping_trivial(self, closed, left, right):
+ # GH 23309
+ tree = IntervalTree(left, right, closed=closed)
+ assert tree.is_overlapping is False
+
+ @pytest.mark.skipif(compat.is_platform_32bit(), reason='GH 23440')
+ def test_construction_overflow(self):
+ # GH 25485
+ left, right = np.arange(101), [np.iinfo(np.int64).max] * 101
+ tree = IntervalTree(left, right)
+
+ # pivot should be average of left/right medians
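+ # median(left) is 50 and median(right) is int64 max, so summing them
+ # in int64 would wrap around; the pivot must be computed without that
+ # overflow (GH 25485)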
+ result = tree.root.pivot
+ expected = (50 + np.iinfo(np.int64).max) / 2
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/__init__.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/conftest.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/conftest.py
new file mode 100644
index 00000000000..7fb862c69f5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/conftest.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas import Index, MultiIndex
+
+
[email protected]
+def idx():
+ # a MultiIndex used to test the general functionality of this object
+ major_axis = Index(['foo', 'bar', 'baz', 'qux'])
+ minor_axis = Index(['one', 'two'])
+
+ major_codes = np.array([0, 0, 1, 2, 3, 3])
+ minor_codes = np.array([0, 1, 0, 1, 0, 1])
+ index_names = ['first', 'second']
+ mi = MultiIndex(levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes],
+ names=index_names, verify_integrity=False)
+ return mi
+
+
[email protected]
+def idx_dup():
+ # a MultiIndex with duplicate entries; compare the idx fixture above
+ major_axis = Index(['foo', 'bar', 'baz', 'qux'])
+ minor_axis = Index(['one', 'two'])
+
+ major_codes = np.array([0, 0, 1, 0, 1, 1])
+ minor_codes = np.array([0, 1, 0, 1, 0, 1])
+ index_names = ['first', 'second']
+ mi = MultiIndex(levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes],
+ names=index_names, verify_integrity=False)
+ return mi
+
+
[email protected]
+def index_names():
+ # names that match those in the idx fixture for testing equality of
+ # names assigned to the idx
+ return ['first', 'second']
+
+
[email protected]
+def holder():
+ # the MultiIndex constructor, used to test pickle compatibility
+ return MultiIndex
+
+
[email protected]
+def compat_props():
+ # a MultiIndex must have these properties associated with it
+ return ['shape', 'ndim', 'size']
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_analytics.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_analytics.py
new file mode 100644
index 00000000000..632ab7aa7be
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_analytics.py
@@ -0,0 +1,328 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+from pandas.compat.numpy import _np_version_under1p17
+
+import pandas as pd
+from pandas import Index, MultiIndex, date_range, period_range
+import pandas.util.testing as tm
+
+
+def test_shift(idx):
+
+ # GH8083 test the base class for shift
+ pytest.raises(NotImplementedError, idx.shift, 1)
+ pytest.raises(NotImplementedError, idx.shift, 1, 2)
+
+
+def test_groupby(idx):
+ groups = idx.groupby(np.array([1, 1, 1, 2, 2, 2]))
+ labels = idx.get_values().tolist()
+ exp = {1: labels[:3], 2: labels[3:]}
+ tm.assert_dict_equal(groups, exp)
+
+ # GH5620
+ groups = idx.groupby(idx)
+ exp = {key: [key] for key in idx}
+ tm.assert_dict_equal(groups, exp)
+
+
+def test_truncate():
+ major_axis = Index(lrange(4))
+ minor_axis = Index(lrange(2))
+
+ major_codes = np.array([0, 0, 1, 2, 3, 3])
+ minor_codes = np.array([0, 1, 0, 1, 0, 1])
+
+ index = MultiIndex(levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes])
+
+ result = index.truncate(before=1)
+ assert 'foo' not in result.levels[0]
+ assert 1 in result.levels[0]
+
+ result = index.truncate(after=1)
+ assert 2 not in result.levels[0]
+ assert 1 in result.levels[0]
+
+ result = index.truncate(before=1, after=2)
+ assert len(result.levels[0]) == 2
+
+ # after < before
+ pytest.raises(ValueError, index.truncate, 3, 1)
+
+
+def test_where():
+ i = MultiIndex.from_tuples([('A', 1), ('A', 2)])
+
+ with pytest.raises(NotImplementedError):
+ i.where(True)
+
+
+def test_where_array_like():
+ i = MultiIndex.from_tuples([('A', 1), ('A', 2)])
+ klasses = [list, tuple, np.array, pd.Series]
+ cond = [False, True]
+
+ for klass in klasses:
+ with pytest.raises(NotImplementedError):
+ i.where(klass(cond))
+
+
+# TODO: reshape
+
+
+def test_reorder_levels(idx):
+ # this blows up
+ with pytest.raises(IndexError, match='^Too many levels'):
+ idx.reorder_levels([2, 1, 0])
+
+
+def test_numpy_repeat():
+ reps = 2
+ numbers = [1, 2, 3]
+ names = np.array(['foo', 'bar'])
+
+ m = MultiIndex.from_product([
+ numbers, names], names=names)
+ expected = MultiIndex.from_product([
+ numbers, names.repeat(reps)], names=names)
+ tm.assert_index_equal(np.repeat(m, reps), expected)
+
+ msg = "the 'axis' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.repeat(m, reps, axis=1)
+
+
+def test_append_mixed_dtypes():
+ # GH 13660
+ dti = date_range('2011-01-01', freq='M', periods=3)
+ dti_tz = date_range('2011-01-01', freq='M', periods=3, tz='US/Eastern')
+ pi = period_range('2011-01', freq='M', periods=3)
+
+ mi = MultiIndex.from_arrays([[1, 2, 3],
+ [1.1, np.nan, 3.3],
+ ['a', 'b', 'c'],
+ dti, dti_tz, pi])
+ assert mi.nlevels == 6
+
+ res = mi.append(mi)
+ exp = MultiIndex.from_arrays([[1, 2, 3, 1, 2, 3],
+ [1.1, np.nan, 3.3, 1.1, np.nan, 3.3],
+ ['a', 'b', 'c', 'a', 'b', 'c'],
+ dti.append(dti),
+ dti_tz.append(dti_tz),
+ pi.append(pi)])
+ tm.assert_index_equal(res, exp)
+
+ other = MultiIndex.from_arrays([['x', 'y', 'z'], ['x', 'y', 'z'],
+ ['x', 'y', 'z'], ['x', 'y', 'z'],
+ ['x', 'y', 'z'], ['x', 'y', 'z']])
+
+ res = mi.append(other)
+ exp = MultiIndex.from_arrays([[1, 2, 3, 'x', 'y', 'z'],
+ [1.1, np.nan, 3.3, 'x', 'y', 'z'],
+ ['a', 'b', 'c', 'x', 'y', 'z'],
+ dti.append(pd.Index(['x', 'y', 'z'])),
+ dti_tz.append(pd.Index(['x', 'y', 'z'])),
+ pi.append(pd.Index(['x', 'y', 'z']))])
+ tm.assert_index_equal(res, exp)
+
+
+def test_take(idx):
+ indexer = [4, 3, 0, 2]
+ result = idx.take(indexer)
+ expected = idx[indexer]
+ assert result.equals(expected)
+
+ # GH 10791: MultiIndex has no freq attribute
+ with pytest.raises(AttributeError):
+ idx.freq
+
+
+def test_take_invalid_kwargs(idx):
+ indices = [1, 2]
+
+ msg = r"take\(\) got an unexpected keyword argument 'foo'"
+ with pytest.raises(TypeError, match=msg):
+ idx.take(indices, foo=2)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, out=indices)
+
+ msg = "the 'mode' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, mode='clip')
+
+
+def test_take_fill_value():
+ # GH 12631
+ vals = [['A', 'B'],
+ [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]]
+ idx = pd.MultiIndex.from_product(vals, names=['str', 'dt'])
+
+ result = idx.take(np.array([1, 0, -1]))
+ exp_vals = [('A', pd.Timestamp('2011-01-02')),
+ ('A', pd.Timestamp('2011-01-01')),
+ ('B', pd.Timestamp('2011-01-02'))]
+ expected = pd.MultiIndex.from_tuples(exp_vals, names=['str', 'dt'])
+ tm.assert_index_equal(result, expected)
+
+ # fill_value
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ exp_vals = [('A', pd.Timestamp('2011-01-02')),
+ ('A', pd.Timestamp('2011-01-01')),
+ (np.nan, pd.NaT)]
+ expected = pd.MultiIndex.from_tuples(exp_vals, names=['str', 'dt'])
+ tm.assert_index_equal(result, expected)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ exp_vals = [('A', pd.Timestamp('2011-01-02')),
+ ('A', pd.Timestamp('2011-01-01')),
+ ('B', pd.Timestamp('2011-01-02'))]
+ expected = pd.MultiIndex.from_tuples(exp_vals, names=['str', 'dt'])
+ tm.assert_index_equal(result, expected)
+
+ msg = ('When allow_fill=True and fill_value is not None, '
+ 'all indices must be >= -1')
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+
+def test_iter(idx):
+ result = list(idx)
+ expected = [('foo', 'one'), ('foo', 'two'), ('bar', 'one'),
+ ('baz', 'two'), ('qux', 'one'), ('qux', 'two')]
+ assert result == expected
+
+
+def test_sub(idx):
+
+ first = idx
+
+ # - now raises (previously was set op difference)
+ with pytest.raises(TypeError):
+ first - idx[-3:]
+ with pytest.raises(TypeError):
+ idx[-3:] - first
+ with pytest.raises(TypeError):
+ idx[-3:] - first.tolist()
+ with pytest.raises(TypeError):
+ first.tolist() - idx[-3:]
+
+
+def test_map(idx):
+ # callable
+ index = idx
+
+ # we don't infer UInt64
+ if isinstance(index, pd.UInt64Index):
+ expected = index.astype('int64')
+ else:
+ expected = index
+
+ result = index.map(lambda x: x)
+ tm.assert_index_equal(result, expected)
+
+
+ "mapper",
+ [
+ lambda values, idx: {i: e for e, i in zip(values, idx)},
+ lambda values, idx: pd.Series(values, idx)])
+def test_map_dictlike(idx, mapper):
+
+ if isinstance(idx, (pd.CategoricalIndex, pd.IntervalIndex)):
+ pytest.skip("skipping tests for {}".format(type(idx)))
+
+ identity = mapper(idx.values, idx)
+
+ # we don't infer to UInt64 for a dict
+ if isinstance(idx, pd.UInt64Index) and isinstance(identity, dict):
+ expected = idx.astype('int64')
+ else:
+ expected = idx
+
+ result = idx.map(identity)
+ tm.assert_index_equal(result, expected)
+
+ # empty mappable
+ expected = pd.Index([np.nan] * len(idx))
+ result = idx.map(mapper(expected, idx))
+ tm.assert_index_equal(result, expected)
+
+
[email protected]('func', [
+ np.exp, np.exp2, np.expm1, np.log, np.log2, np.log10,
+ np.log1p, np.sqrt, np.sin, np.cos, np.tan, np.arcsin,
+ np.arccos, np.arctan, np.sinh, np.cosh, np.tanh,
+ np.arcsinh, np.arccosh, np.arctanh, np.deg2rad,
+ np.rad2deg
+])
+def test_numpy_ufuncs(func):
+ # test ufuncs of numpy. see:
+ # http://docs.scipy.org/doc/numpy/reference/ufuncs.html
+
+ # copy and paste from idx fixture as pytest doesn't support
+ # parameters and fixtures at the same time.
+ major_axis = Index(['foo', 'bar', 'baz', 'qux'])
+ minor_axis = Index(['one', 'two'])
+ major_codes = np.array([0, 0, 1, 2, 3, 3])
+ minor_codes = np.array([0, 1, 0, 1, 0, 1])
+ index_names = ['first', 'second']
+
+ idx = MultiIndex(
+ levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes],
+ names=index_names,
+ verify_integrity=False
+ )
+
+ if _np_version_under1p17:
+ expected_exception = AttributeError
+ msg = "'tuple' object has no attribute '{}'".format(func.__name__)
+ else:
+ expected_exception = TypeError
+ msg = ("loop of ufunc does not support argument 0 of type tuple which"
+ " has no callable {} method").format(func.__name__)
+ with pytest.raises(expected_exception, match=msg):
+ func(idx)
+
+
[email protected]('func', [
+ np.isfinite, np.isinf, np.isnan, np.signbit
+])
+def test_numpy_type_funcs(func):
+ # copy and paste from idx fixture as pytest doesn't support
+ # parameters and fixtures at the same time.
+ major_axis = Index(['foo', 'bar', 'baz', 'qux'])
+ minor_axis = Index(['one', 'two'])
+ major_codes = np.array([0, 0, 1, 2, 3, 3])
+ minor_codes = np.array([0, 1, 0, 1, 0, 1])
+ index_names = ['first', 'second']
+
+ idx = MultiIndex(
+ levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes],
+ names=index_names,
+ verify_integrity=False
+ )
+
+ with pytest.raises(Exception):
+ func(idx)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_astype.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_astype.py
new file mode 100644
index 00000000000..c77b23c7400
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_astype.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+from pandas.util.testing import assert_copy
+
+
+def test_astype(idx):
+ expected = idx.copy()
+ actual = idx.astype('O')
+ assert_copy(actual.levels, expected.levels)
+ assert_copy(actual.codes, expected.codes)
+ assert [level.name for level in actual.levels] == list(expected.names)
+
+ with pytest.raises(TypeError, match="^Setting.*dtype.*object"):
+ idx.astype(np.dtype(int))
+
+
[email protected]('ordered', [True, False])
+def test_astype_category(idx, ordered):
+ # GH 18630
+ msg = '> 1 ndim Categorical are not supported at this time'
+ with pytest.raises(NotImplementedError, match=msg):
+ idx.astype(CategoricalDtype(ordered=ordered))
+
+ if ordered is False:
+ # dtype='category' defaults to ordered=False, so only test once
+ with pytest.raises(NotImplementedError, match=msg):
+ idx.astype('category')
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_compat.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_compat.py
new file mode 100644
index 00000000000..f405fc659c7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_compat.py
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY3, long
+
+from pandas import MultiIndex
+import pandas.util.testing as tm
+
+
+def test_numeric_compat(idx):
+ with pytest.raises(TypeError, match="cannot perform __mul__"):
+ idx * 1
+
+ with pytest.raises(TypeError, match="cannot perform __rmul__"):
+ 1 * idx
+
+ div_err = ("cannot perform __truediv__" if PY3
+ else "cannot perform __div__")
+ with pytest.raises(TypeError, match=div_err):
+ idx / 1
+
+ div_err = div_err.replace(" __", " __r")
+ with pytest.raises(TypeError, match=div_err):
+ 1 / idx
+
+ with pytest.raises(TypeError, match="cannot perform __floordiv__"):
+ idx // 1
+
+ with pytest.raises(TypeError, match="cannot perform __rfloordiv__"):
+ 1 // idx
+
+
[email protected]("method", ["all", "any"])
+def test_logical_compat(idx, method):
+ msg = "cannot perform {method}".format(method=method)
+
+ with pytest.raises(TypeError, match=msg):
+ getattr(idx, method)()
+
+
+def test_boolean_context_compat(idx):
+
+ with pytest.raises(ValueError):
+ bool(idx)
+
+
+def test_boolean_context_compat2():
+
+ # boolean context compat
+ # GH7897
+ i1 = MultiIndex.from_tuples([('A', 1), ('A', 2)])
+ i2 = MultiIndex.from_tuples([('A', 1), ('A', 3)])
+ common = i1.intersection(i2)
+
+ with pytest.raises(ValueError):
+ bool(common)
+
+
+def test_inplace_mutation_resets_values():
+ levels = [['a', 'b', 'c'], [4]]
+ levels2 = [[1, 2, 3], ['a']]
+ codes = [[0, 1, 0, 2, 2, 0], [0, 0, 0, 0, 0, 0]]
+
+ mi1 = MultiIndex(levels=levels, codes=codes)
+ mi2 = MultiIndex(levels=levels2, codes=codes)
+ vals = mi1.values.copy()
+ vals2 = mi2.values.copy()
+
+ assert mi1._tuples is not None
+
+ # Make sure level setting works
+ new_vals = mi1.set_levels(levels2).values
+ tm.assert_almost_equal(vals2, new_vals)
+
+ # Non-inplace doesn't kill _tuples [implementation detail]
+ tm.assert_almost_equal(mi1._tuples, vals)
+
+ # ...and values is still same too
+ tm.assert_almost_equal(mi1.values, vals)
+
+ # Inplace should kill _tuples
+ mi1.set_levels(levels2, inplace=True)
+ tm.assert_almost_equal(mi1.values, vals2)
+
+ # Make sure label setting works too
+ codes2 = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
+ exp_values = np.empty((6,), dtype=object)
+ exp_values[:] = [(long(1), 'a')] * 6
+
+ # Must be 1d array of tuples
+ assert exp_values.shape == (6,)
+ new_values = mi2.set_codes(codes2).values
+
+ # Not inplace shouldn't change
+ tm.assert_almost_equal(mi2._tuples, vals2)
+
+ # Should have correct values
+ tm.assert_almost_equal(exp_values, new_values)
+
+ # ...and again setting inplace should kill _tuples, etc
+ mi2.set_codes(codes2, inplace=True)
+ tm.assert_almost_equal(mi2.values, new_values)
+
+
+def test_ndarray_compat_properties(idx, compat_props):
+ assert idx.T.equals(idx)
+ assert idx.transpose().equals(idx)
+
+ values = idx.values
+ for prop in compat_props:
+ assert getattr(idx, prop) == getattr(values, prop)
+
+ # test for validity
+ idx.nbytes
+ idx.values.nbytes
+
+
+def test_compat(indices):
+ assert indices.tolist() == list(indices)
+
+
+def test_pickle_compat_construction(holder):
+ # this is testing for pickle compat
+ if holder is None:
+ return
+
+ # need an object to create with
+ pytest.raises(TypeError, holder)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_constructor.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_constructor.py
new file mode 100644
index 00000000000..e6678baf8a9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_constructor.py
@@ -0,0 +1,577 @@
+# -*- coding: utf-8 -*-
+
+from collections import OrderedDict
+import re
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import Timestamp
+from pandas.compat import lrange, range
+
+from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
+
+import pandas as pd
+from pandas import Index, MultiIndex, date_range
+import pandas.util.testing as tm
+
+
+def test_constructor_single_level():
+ result = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
+ codes=[[0, 1, 2, 3]], names=['first'])
+ assert isinstance(result, MultiIndex)
+ expected = Index(['foo', 'bar', 'baz', 'qux'], name='first')
+ tm.assert_index_equal(result.levels[0], expected)
+ assert result.names == ['first']
+
+
+def test_constructor_no_levels():
+ msg = "non-zero number of levels/codes"
+ with pytest.raises(ValueError, match=msg):
+ MultiIndex(levels=[], codes=[])
+
+ both_re = re.compile('Must pass both levels and codes')
+ with pytest.raises(TypeError, match=both_re):
+ MultiIndex(levels=[])
+ with pytest.raises(TypeError, match=both_re):
+ MultiIndex(codes=[])
+
+
+def test_constructor_nonhashable_names():
+ # GH 20527
+ levels = [[1, 2], [u'one', u'two']]
+ codes = [[0, 0, 1, 1], [0, 1, 0, 1]]
+ names = (['foo'], ['bar'])
+ message = "MultiIndex.name must be a hashable type"
+ with pytest.raises(TypeError, match=message):
+ MultiIndex(levels=levels, codes=codes, names=names)
+
+ # With .rename()
+ mi = MultiIndex(levels=[[1, 2], [u'one', u'two']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+ names=('foo', 'bar'))
+ renamed = [['foor'], ['barr']]
+ with pytest.raises(TypeError, match=message):
+ mi.rename(names=renamed)
+
+ # With .set_names()
+ with pytest.raises(TypeError, match=message):
+ mi.set_names(names=renamed)
+
+
+def test_constructor_mismatched_codes_levels(idx):
+ codes = [np.array([1]), np.array([2]), np.array([3])]
+ levels = ["a"]
+
+ msg = "Length of levels and codes must be the same"
+ with pytest.raises(ValueError, match=msg):
+ MultiIndex(levels=levels, codes=codes)
+
+ length_error = re.compile('>= length of level')
+ label_error = re.compile(r'Unequal code lengths: \[4, 2\]')
+
+ # important to check that it's looking at the right thing.
+ with pytest.raises(ValueError, match=length_error):
+ MultiIndex(levels=[['a'], ['b']],
+ codes=[[0, 1, 2, 3], [0, 3, 4, 1]])
+
+ with pytest.raises(ValueError, match=label_error):
+ MultiIndex(levels=[['a'], ['b']], codes=[[0, 0, 0, 0], [0, 0]])
+
+ # external API
+ with pytest.raises(ValueError, match=length_error):
+ idx.copy().set_levels([['a'], ['b']])
+
+ with pytest.raises(ValueError, match=label_error):
+ idx.copy().set_codes([[0, 0, 0, 0], [0, 0]])
+
+
+def test_labels_deprecated(idx):
+ # GH23752
+ with tm.assert_produces_warning(FutureWarning):
+ MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
+ labels=[[0, 1, 2, 3]], names=['first'])
+ with tm.assert_produces_warning(FutureWarning):
+ idx.labels
+
+
+def test_copy_in_constructor():
+ levels = np.array(["a", "b", "c"])
+ codes = np.array([1, 1, 2, 0, 0, 1, 1])
+ val = codes[0]
+ mi = MultiIndex(levels=[levels, levels], codes=[codes, codes],
+ copy=True)
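+ # with copy=True the constructor snapshots its inputs, so mutating
+ # the original arrays afterwards must not leak into codes/levels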
+ assert mi.codes[0][0] == val
+ codes[0] = 15
+ assert mi.codes[0][0] == val
+ val = levels[0]
+ levels[0] = "PANDA"
+ assert mi.levels[0][0] == val
+
+
+# ----------------------------------------------------------------------------
+# from_arrays
+# ----------------------------------------------------------------------------
+def test_from_arrays(idx):
+ arrays = [np.asarray(lev).take(level_codes)
+ for lev, level_codes in zip(idx.levels, idx.codes)]
+
+ # list of arrays as input
+ result = MultiIndex.from_arrays(arrays, names=idx.names)
+ tm.assert_index_equal(result, idx)
+
+ # infer correctly
+ result = MultiIndex.from_arrays([[pd.NaT, Timestamp('20130101')],
+ ['a', 'b']])
+ assert result.levels[0].equals(Index([Timestamp('20130101')]))
+ assert result.levels[1].equals(Index(['a', 'b']))
+
+
+def test_from_arrays_iterator(idx):
+ # GH 18434
+ arrays = [np.asarray(lev).take(level_codes)
+ for lev, level_codes in zip(idx.levels, idx.codes)]
+
+ # iterator as input
+ result = MultiIndex.from_arrays(iter(arrays), names=idx.names)
+ tm.assert_index_equal(result, idx)
+
+ # invalid iterator input
+ msg = "Input must be a list / sequence of array-likes."
+ with pytest.raises(TypeError, match=msg):
+ MultiIndex.from_arrays(0)
+
+
+def test_from_arrays_index_series_datetimetz():
+ idx1 = pd.date_range('2015-01-01 10:00', freq='D', periods=3,
+ tz='US/Eastern')
+ idx2 = pd.date_range('2015-01-01 10:00', freq='H', periods=3,
+ tz='Asia/Tokyo')
+ result = pd.MultiIndex.from_arrays([idx1, idx2])
+ tm.assert_index_equal(result.get_level_values(0), idx1)
+ tm.assert_index_equal(result.get_level_values(1), idx2)
+
+ result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)])
+ tm.assert_index_equal(result2.get_level_values(0), idx1)
+ tm.assert_index_equal(result2.get_level_values(1), idx2)
+
+ tm.assert_index_equal(result, result2)
+
+
+def test_from_arrays_index_series_timedelta():
+ idx1 = pd.timedelta_range('1 days', freq='D', periods=3)
+ idx2 = pd.timedelta_range('2 hours', freq='H', periods=3)
+ result = pd.MultiIndex.from_arrays([idx1, idx2])
+ tm.assert_index_equal(result.get_level_values(0), idx1)
+ tm.assert_index_equal(result.get_level_values(1), idx2)
+
+ result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)])
+ tm.assert_index_equal(result2.get_level_values(0), idx1)
+ tm.assert_index_equal(result2.get_level_values(1), idx2)
+
+ tm.assert_index_equal(result, result2)
+
+
+def test_from_arrays_index_series_period():
+ idx1 = pd.period_range('2011-01-01', freq='D', periods=3)
+ idx2 = pd.period_range('2015-01-01', freq='H', periods=3)
+ result = pd.MultiIndex.from_arrays([idx1, idx2])
+ tm.assert_index_equal(result.get_level_values(0), idx1)
+ tm.assert_index_equal(result.get_level_values(1), idx2)
+
+ result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)])
+ tm.assert_index_equal(result2.get_level_values(0), idx1)
+ tm.assert_index_equal(result2.get_level_values(1), idx2)
+
+ tm.assert_index_equal(result, result2)
+
+
+def test_from_arrays_index_datetimelike_mixed():
+ idx1 = pd.date_range('2015-01-01 10:00', freq='D', periods=3,
+ tz='US/Eastern')
+ idx2 = pd.date_range('2015-01-01 10:00', freq='H', periods=3)
+ idx3 = pd.timedelta_range('1 days', freq='D', periods=3)
+ idx4 = pd.period_range('2011-01-01', freq='D', periods=3)
+
+ result = pd.MultiIndex.from_arrays([idx1, idx2, idx3, idx4])
+ tm.assert_index_equal(result.get_level_values(0), idx1)
+ tm.assert_index_equal(result.get_level_values(1), idx2)
+ tm.assert_index_equal(result.get_level_values(2), idx3)
+ tm.assert_index_equal(result.get_level_values(3), idx4)
+
+ result2 = pd.MultiIndex.from_arrays([pd.Series(idx1),
+ pd.Series(idx2),
+ pd.Series(idx3),
+ pd.Series(idx4)])
+ tm.assert_index_equal(result2.get_level_values(0), idx1)
+ tm.assert_index_equal(result2.get_level_values(1), idx2)
+ tm.assert_index_equal(result2.get_level_values(2), idx3)
+ tm.assert_index_equal(result2.get_level_values(3), idx4)
+
+ tm.assert_index_equal(result, result2)
+
+
+def test_from_arrays_index_series_categorical():
+ # GH13743
+ idx1 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"),
+ ordered=False)
+ idx2 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"),
+ ordered=True)
+
+ result = pd.MultiIndex.from_arrays([idx1, idx2])
+ tm.assert_index_equal(result.get_level_values(0), idx1)
+ tm.assert_index_equal(result.get_level_values(1), idx2)
+
+ result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)])
+ tm.assert_index_equal(result2.get_level_values(0), idx1)
+ tm.assert_index_equal(result2.get_level_values(1), idx2)
+
+ result3 = pd.MultiIndex.from_arrays([idx1.values, idx2.values])
+ tm.assert_index_equal(result3.get_level_values(0), idx1)
+ tm.assert_index_equal(result3.get_level_values(1), idx2)
+
+
+def test_from_arrays_empty():
+ # 0 levels
+ msg = "Must pass non-zero number of levels/codes"
+ with pytest.raises(ValueError, match=msg):
+ MultiIndex.from_arrays(arrays=[])
+
+ # 1 level
+ result = MultiIndex.from_arrays(arrays=[[]], names=['A'])
+ assert isinstance(result, MultiIndex)
+ expected = Index([], name='A')
+ tm.assert_index_equal(result.levels[0], expected)
+
+ # N levels
+ for N in [2, 3]:
+ arrays = [[]] * N
+ names = list('ABC')[:N]
+ result = MultiIndex.from_arrays(arrays=arrays, names=names)
+ expected = MultiIndex(levels=[[]] * N, codes=[[]] * N,
+ names=names)
+ tm.assert_index_equal(result, expected)
+
+
[email protected]('invalid_array', [
+ (1),
+ ([1]),
+ ([1, 2]),
+ ([[1], 2]),
+ ('a'),
+ (['a']),
+ (['a', 'b']),
+ ([['a'], 'b']),
+])
+def test_from_arrays_invalid_input(invalid_array):
+ with pytest.raises(TypeError):
+ MultiIndex.from_arrays(arrays=invalid_array)
+
+
[email protected]('idx1, idx2', [
+ ([1, 2, 3], ['a', 'b']),
+ ([], ['a', 'b']),
+ ([1, 2, 3], [])
+])
+def test_from_arrays_different_lengths(idx1, idx2):
+ # see gh-13599
+ msg = '^all arrays must be same length$'
+ with pytest.raises(ValueError, match=msg):
+ MultiIndex.from_arrays([idx1, idx2])
+
+
+# ----------------------------------------------------------------------------
+# from_tuples
+# ----------------------------------------------------------------------------
+def test_from_tuples():
+ msg = 'Cannot infer number of levels from empty list'
+ with pytest.raises(TypeError, match=msg):
+ MultiIndex.from_tuples([])
+
+ expected = MultiIndex(levels=[[1, 3], [2, 4]],
+ codes=[[0, 1], [0, 1]],
+ names=['a', 'b'])
+
+ # input tuples
+ result = MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b'])
+ tm.assert_index_equal(result, expected)
+
+
+def test_from_tuples_iterator():
+ # GH 18434
+ # input iterator for tuples
+ expected = MultiIndex(levels=[[1, 3], [2, 4]],
+ codes=[[0, 1], [0, 1]],
+ names=['a', 'b'])
+
+ result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=['a', 'b'])
+ tm.assert_index_equal(result, expected)
+
+ # input non-iterables
+ msg = 'Input must be a list / sequence of tuple-likes.'
+ with pytest.raises(TypeError, match=msg):
+ MultiIndex.from_tuples(0)
+
+
+def test_from_tuples_empty():
+ # GH 16777
+ result = MultiIndex.from_tuples([], names=['a', 'b'])
+ expected = MultiIndex.from_arrays(arrays=[[], []],
+ names=['a', 'b'])
+ tm.assert_index_equal(result, expected)
+
+
+def test_from_tuples_index_values(idx):
+ result = MultiIndex.from_tuples(idx)
+ assert (result.values == idx.values).all()
+
+
+def test_tuples_with_name_string():
+ # GH 15110 and GH 14848
+
+ li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)]
+ with pytest.raises(ValueError):
+ pd.Index(li, name='abc')
+ with pytest.raises(ValueError):
+ pd.Index(li, name='a')
+
+
+def test_from_tuples_with_tuple_label():
+ # GH 15457
+ expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]],
+ columns=['a', 'b', 'c']).set_index(['a', 'b'])
+ idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b'))
+ result = pd.DataFrame([2, 3], columns=['c'], index=idx)
+ tm.assert_frame_equal(expected, result)
+
+
+# ----------------------------------------------------------------------------
+# from_product
+# ----------------------------------------------------------------------------
+def test_from_product_empty_zero_levels():
+ # 0 levels
+ msg = "Must pass non-zero number of levels/codes"
+ with pytest.raises(ValueError, match=msg):
+ MultiIndex.from_product([])
+
+
+def test_from_product_empty_one_level():
+ result = MultiIndex.from_product([[]], names=['A'])
+ expected = pd.Index([], name='A')
+ tm.assert_index_equal(result.levels[0], expected)
+
+
[email protected]('first, second', [
+ ([], []),
+ (['foo', 'bar', 'baz'], []),
+ ([], ['a', 'b', 'c']),
+])
+def test_from_product_empty_two_levels(first, second):
+ names = ['A', 'B']
+ result = MultiIndex.from_product([first, second], names=names)
+ expected = MultiIndex(levels=[first, second],
+ codes=[[], []], names=names)
+ tm.assert_index_equal(result, expected)
+
+
[email protected]('N', list(range(4)))
+def test_from_product_empty_three_levels(N):
+ # GH12258
+ names = ['A', 'B', 'C']
+ lvl2 = lrange(N)
+ result = MultiIndex.from_product([[], lvl2, []], names=names)
+ expected = MultiIndex(levels=[[], lvl2, []],
+ codes=[[], [], []], names=names)
+ tm.assert_index_equal(result, expected)
+
+
[email protected]('invalid_input', [
+ 1,
+ [1],
+ [1, 2],
+ [[1], 2],
+ 'a',
+ ['a'],
+ ['a', 'b'],
+ [['a'], 'b'],
+])
+def test_from_product_invalid_input(invalid_input):
+ pytest.raises(TypeError, MultiIndex.from_product, iterables=invalid_input)
+
+
+def test_from_product_datetimeindex():
+ dt_index = date_range('2000-01-01', periods=2)
+ mi = pd.MultiIndex.from_product([[1, 2], dt_index])
+ etalon = construct_1d_object_array_from_listlike([
+ (1, pd.Timestamp('2000-01-01')),
+ (1, pd.Timestamp('2000-01-02')),
+ (2, pd.Timestamp('2000-01-01')),
+ (2, pd.Timestamp('2000-01-02')),
+ ])
+ tm.assert_numpy_array_equal(mi.values, etalon)
+
+
[email protected]('ordered', [False, True])
[email protected]('f', [
+ lambda x: x,
+ lambda x: pd.Series(x),
+ lambda x: x.values
+])
+def test_from_product_index_series_categorical(ordered, f):
+ # GH13743
+ first = ['foo', 'bar']
+
+ idx = pd.CategoricalIndex(list("abcaab"), categories=list("bac"),
+ ordered=ordered)
+ expected = pd.CategoricalIndex(list("abcaab") + list("abcaab"),
+ categories=list("bac"),
+ ordered=ordered)
+
+ result = pd.MultiIndex.from_product([first, f(idx)])
+ tm.assert_index_equal(result.get_level_values(1), expected)
+
+
+def test_from_product():
+
+ first = ['foo', 'bar', 'buz']
+ second = ['a', 'b', 'c']
+ names = ['first', 'second']
+ result = MultiIndex.from_product([first, second], names=names)
+
+ tuples = [('foo', 'a'), ('foo', 'b'), ('foo', 'c'), ('bar', 'a'),
+ ('bar', 'b'), ('bar', 'c'), ('buz', 'a'), ('buz', 'b'),
+ ('buz', 'c')]
+ expected = MultiIndex.from_tuples(tuples, names=names)
+
+ tm.assert_index_equal(result, expected)
+
+
+def test_from_product_iterator():
+ # GH 18434
+ first = ['foo', 'bar', 'buz']
+ second = ['a', 'b', 'c']
+ names = ['first', 'second']
+ tuples = [('foo', 'a'), ('foo', 'b'), ('foo', 'c'), ('bar', 'a'),
+ ('bar', 'b'), ('bar', 'c'), ('buz', 'a'), ('buz', 'b'),
+ ('buz', 'c')]
+ expected = MultiIndex.from_tuples(tuples, names=names)
+
+ # iterator as input
+ result = MultiIndex.from_product(iter([first, second]), names=names)
+ tm.assert_index_equal(result, expected)
+
+ # Invalid non-iterable input
+ msg = "Input must be a list / sequence of iterables."
+ with pytest.raises(TypeError, match=msg):
+ MultiIndex.from_product(0)
+
+
+def test_create_index_existing_name(idx):
+
+ # GH11193, when an existing index is passed, and a new name is not
+ # specified, the new index should inherit the previous object name
+ index = idx
+ index.names = ['foo', 'bar']
+ result = pd.Index(index)
+ expected = Index(
+ Index([
+ ('foo', 'one'), ('foo', 'two'),
+ ('bar', 'one'), ('baz', 'two'),
+ ('qux', 'one'), ('qux', 'two')],
+ dtype='object'
+ ),
+ names=['foo', 'bar']
+ )
+ tm.assert_index_equal(result, expected)
+
+ result = pd.Index(index, names=['A', 'B'])
+ expected = Index(
+ Index([
+ ('foo', 'one'), ('foo', 'two'),
+ ('bar', 'one'), ('baz', 'two'),
+ ('qux', 'one'), ('qux', 'two')],
+ dtype='object'
+ ),
+ names=['A', 'B']
+ )
+ tm.assert_index_equal(result, expected)
+
+
+# ----------------------------------------------------------------------------
+# from_frame
+# ----------------------------------------------------------------------------
+def test_from_frame():
+ # GH 22420
+ df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']],
+ columns=['L1', 'L2'])
+ expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'),
+ ('b', 'a'), ('b', 'b')],
+ names=['L1', 'L2'])
+ result = pd.MultiIndex.from_frame(df)
+ tm.assert_index_equal(expected, result)
+
+
[email protected]('non_frame', [
+ pd.Series([1, 2, 3, 4]),
+ [1, 2, 3, 4],
+ [[1, 2], [3, 4], [5, 6]],
+ pd.Index([1, 2, 3, 4]),
+ np.array([[1, 2], [3, 4], [5, 6]]),
+ 27
+])
+def test_from_frame_error(non_frame):
+ # GH 22420
+ with pytest.raises(TypeError, match='Input must be a DataFrame'):
+ pd.MultiIndex.from_frame(non_frame)
+
+
+def test_from_frame_dtype_fidelity():
+ # GH 22420
+ df = pd.DataFrame(OrderedDict([
+ ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')),
+ ('a', [1, 1, 1, 2, 2, 2]),
+ ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)),
+ ('c', ['x', 'x', 'y', 'z', 'x', 'y'])
+ ]))
+ original_dtypes = df.dtypes.to_dict()
+
+ expected_mi = pd.MultiIndex.from_arrays([
+ pd.date_range('19910905', periods=6, tz='US/Eastern'),
+ [1, 1, 1, 2, 2, 2],
+ pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True),
+ ['x', 'x', 'y', 'z', 'x', 'y']
+ ], names=['dates', 'a', 'b', 'c'])
+ mi = pd.MultiIndex.from_frame(df)
+ mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)}
+
+ tm.assert_index_equal(expected_mi, mi)
+ assert original_dtypes == mi_dtypes
+
+
[email protected]('names_in,names_out', [
+ (None, [('L1', 'x'), ('L2', 'y')]),
+ (['x', 'y'], ['x', 'y']),
+])
+def test_from_frame_valid_names(names_in, names_out):
+ # GH 22420
+ df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']],
+ columns=pd.MultiIndex.from_tuples([('L1', 'x'),
+ ('L2', 'y')]))
+ mi = pd.MultiIndex.from_frame(df, names=names_in)
+ assert mi.names == names_out
+
+
[email protected]('names_in,names_out', [
+ ('bad_input', ValueError("Names should be list-like for a MultiIndex")),
+ (['a', 'b', 'c'], ValueError("Length of names must match number of "
+ "levels in MultiIndex."))
+])
+def test_from_frame_invalid_names(names_in, names_out):
+ # GH 22420
+ df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']],
+ columns=pd.MultiIndex.from_tuples([('L1', 'x'),
+ ('L2', 'y')]))
+ with pytest.raises(type(names_out), match=names_out.args[0]):
+ pd.MultiIndex.from_frame(df, names=names_in)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_contains.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_contains.py
new file mode 100644
index 00000000000..b73ff11a4dd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_contains.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas.compat import PYPY
+
+import pandas as pd
+from pandas import MultiIndex
+import pandas.util.testing as tm
+
+
+def test_contains_top_level():
+ midx = MultiIndex.from_product([['A', 'B'], [1, 2]])
+ assert 'A' in midx
+ assert 'A' not in midx._engine
+
+
+def test_contains_with_nat():
+ # MI with a NaT
+ mi = MultiIndex(levels=[['C'],
+ pd.date_range('2012-01-01', periods=5)],
+ codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
+ names=[None, 'B'])
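+ # code -1 in the datetime level encodes a missing value (NaT)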
+ assert ('C', pd.Timestamp('2012-01-01')) in mi
+ for val in mi.values:
+ assert val in mi
+
+
+def test_contains(idx):
+ assert ('foo', 'two') in idx
+ assert ('bar', 'two') not in idx
+ assert None not in idx
+
+
[email protected](not PYPY, reason="tuples cmp recursively on PyPy")
+def test_isin_nan_pypy():
+ idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
+ tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]),
+ np.array([False, True]))
+ tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]),
+ np.array([False, True]))
+
+
+def test_isin():
+ values = [('foo', 2), ('bar', 3), ('quux', 4)]
+
+ idx = MultiIndex.from_arrays([
+ ['qux', 'baz', 'foo', 'bar'],
+ np.arange(4)
+ ])
+ result = idx.isin(values)
+ expected = np.array([False, False, True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ # empty, return dtype bool
+ idx = MultiIndex.from_arrays([[], []])
+ result = idx.isin(values)
+ assert len(result) == 0
+ assert result.dtype == np.bool_
+
+
[email protected](PYPY, reason="tuples cmp recursively on PyPy")
+def test_isin_nan_not_pypy():
+ idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
+ tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]),
+ np.array([False, False]))
+ tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]),
+ np.array([False, False]))
+
+
+def test_isin_level_kwarg():
+ idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'], np.arange(
+ 4)])
+
+ vals_0 = ['foo', 'bar', 'quux']
+ vals_1 = [2, 3, 10]
+
+ expected = np.array([False, False, True, True])
+ tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level=0))
+ tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level=-2))
+
+ tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level=1))
+ tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level=-1))
+
+ pytest.raises(IndexError, idx.isin, vals_0, level=5)
+ pytest.raises(IndexError, idx.isin, vals_0, level=-5)
+
+ pytest.raises(KeyError, idx.isin, vals_0, level=1.0)
+ pytest.raises(KeyError, idx.isin, vals_1, level=-1.0)
+ pytest.raises(KeyError, idx.isin, vals_1, level='A')
+
+ idx.names = ['A', 'B']
+ tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level='A'))
+ tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level='B'))
+
+ pytest.raises(KeyError, idx.isin, vals_1, level='C')
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_conversion.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_conversion.py
new file mode 100644
index 00000000000..00b935521ba
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_conversion.py
@@ -0,0 +1,224 @@
+# -*- coding: utf-8 -*-
+
+from collections import OrderedDict
+
+import numpy as np
+import pytest
+
+from pandas.compat import range
+
+import pandas as pd
+from pandas import DataFrame, MultiIndex, date_range
+import pandas.util.testing as tm
+
+
+def test_tolist(idx):
+ result = idx.tolist()
+ exp = list(idx.values)
+ assert result == exp
+
+
+def test_to_numpy(idx):
+ result = idx.to_numpy()
+ exp = idx.values
+ tm.assert_numpy_array_equal(result, exp)
+
+
+def test_to_frame():
+ tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
+
+ index = MultiIndex.from_tuples(tuples)
+ result = index.to_frame(index=False)
+ expected = DataFrame(tuples)
+ tm.assert_frame_equal(result, expected)
+
+ result = index.to_frame()
+ expected.index = index
+ tm.assert_frame_equal(result, expected)
+
+ tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
+ index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
+ result = index.to_frame(index=False)
+ expected = DataFrame(tuples)
+ expected.columns = ['first', 'second']
+ tm.assert_frame_equal(result, expected)
+
+ result = index.to_frame()
+ expected.index = index
+ tm.assert_frame_equal(result, expected)
+
+ # See GH-22580
+ index = MultiIndex.from_tuples(tuples)
+ result = index.to_frame(index=False, name=['first', 'second'])
+ expected = DataFrame(tuples)
+ expected.columns = ['first', 'second']
+ tm.assert_frame_equal(result, expected)
+
+ result = index.to_frame(name=['first', 'second'])
+ expected.index = index
+ expected.columns = ['first', 'second']
+ tm.assert_frame_equal(result, expected)
+
+ msg = "'name' must be a list / sequence of column names."
+ with pytest.raises(TypeError, match=msg):
+ index.to_frame(name='first')
+
+ msg = "'name' should have same length as number of levels on index."
+ with pytest.raises(ValueError, match=msg):
+ index.to_frame(name=['first'])
+
+ # Tests for datetime index
+ index = MultiIndex.from_product([range(5),
+ pd.date_range('20130101', periods=3)])
+ result = index.to_frame(index=False)
+ expected = DataFrame(
+ {0: np.repeat(np.arange(5, dtype='int64'), 3),
+ 1: np.tile(pd.date_range('20130101', periods=3), 5)})
+ tm.assert_frame_equal(result, expected)
+
+ result = index.to_frame()
+ expected.index = index
+ tm.assert_frame_equal(result, expected)
+
+ # See GH-22580
+ result = index.to_frame(index=False, name=['first', 'second'])
+ expected = DataFrame(
+ {'first': np.repeat(np.arange(5, dtype='int64'), 3),
+ 'second': np.tile(pd.date_range('20130101', periods=3), 5)})
+ tm.assert_frame_equal(result, expected)
+
+ result = index.to_frame(name=['first', 'second'])
+ expected.index = index
+ tm.assert_frame_equal(result, expected)
+
+
+def test_to_frame_dtype_fidelity():
+ # GH 22420
+ mi = pd.MultiIndex.from_arrays([
+ pd.date_range('19910905', periods=6, tz='US/Eastern'),
+ [1, 1, 1, 2, 2, 2],
+ pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True),
+ ['x', 'x', 'y', 'z', 'x', 'y']
+ ], names=['dates', 'a', 'b', 'c'])
+ original_dtypes = {name: mi.levels[i].dtype
+ for i, name in enumerate(mi.names)}
+
+ expected_df = pd.DataFrame(OrderedDict([
+ ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')),
+ ('a', [1, 1, 1, 2, 2, 2]),
+ ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)),
+ ('c', ['x', 'x', 'y', 'z', 'x', 'y'])
+ ]))
+ df = mi.to_frame(index=False)
+ df_dtypes = df.dtypes.to_dict()
+
+ tm.assert_frame_equal(df, expected_df)
+ assert original_dtypes == df_dtypes
+
+
+def test_to_frame_resulting_column_order():
+ # GH 22420
+ expected = ['z', 0, 'a']
+ mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z'],
+ ['q', 'w', 'e']], names=expected)
+ result = mi.to_frame().columns.tolist()
+ assert result == expected
+
+
+def test_to_hierarchical():
+    index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'),
+                                    (2, 'one'), (2, 'two')])
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = index.to_hierarchical(3)
+ expected = MultiIndex(levels=[[1, 2], ['one', 'two']],
+ codes=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+ [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]])
+ tm.assert_index_equal(result, expected)
+ assert result.names == index.names
+
+ # K > 1
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = index.to_hierarchical(3, 2)
+ expected = MultiIndex(levels=[[1, 2], ['one', 'two']],
+ codes=[[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
+ [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]])
+ tm.assert_index_equal(result, expected)
+ assert result.names == index.names
+
+ # non-sorted
+ index = MultiIndex.from_tuples([(2, 'c'), (1, 'b'),
+ (2, 'a'), (2, 'b')],
+ names=['N1', 'N2'])
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = index.to_hierarchical(2)
+ expected = MultiIndex.from_tuples([(2, 'c'), (2, 'c'), (1, 'b'),
+ (1, 'b'),
+ (2, 'a'), (2, 'a'),
+ (2, 'b'), (2, 'b')],
+ names=['N1', 'N2'])
+ tm.assert_index_equal(result, expected)
+ assert result.names == index.names
+
+
+def test_roundtrip_pickle_with_tz():
+    # a bare ``return`` silently disabled this test; make the skip
+    # explicit instead (assuming the deactivation in this copy is
+    # deliberate)
+    pytest.skip("pickle round-trip disabled in this copy")
+
+ # GH 8367
+ # round-trip of timezone
+ index = MultiIndex.from_product(
+ [[1, 2], ['a', 'b'], date_range('20130101', periods=3,
+ tz='US/Eastern')
+ ], names=['one', 'two', 'three'])
+ unpickled = tm.round_trip_pickle(index)
+ assert index.equal_levels(unpickled)
+
+
+def test_pickle(indices):
+    # as above, turn the silent deactivation into an explicit skip
+    pytest.skip("pickle round-trip disabled in this copy")
+
+ unpickled = tm.round_trip_pickle(indices)
+ assert indices.equals(unpickled)
+ original_name, indices.name = indices.name, 'foo'
+ unpickled = tm.round_trip_pickle(indices)
+ assert indices.equals(unpickled)
+ indices.name = original_name
+
+
+def test_to_series(idx):
+ # assert that we are creating a copy of the index
+
+ s = idx.to_series()
+ assert s.values is not idx.values
+ assert s.index is not idx
+ assert s.name == idx.name
+
+
+def test_to_series_with_arguments(idx):
+ # GH18699
+
+ # index kwarg
+ s = idx.to_series(index=idx)
+
+ assert s.values is not idx.values
+ assert s.index is idx
+ assert s.name == idx.name
+
+ # name kwarg
+ s = idx.to_series(name='__test')
+
+ assert s.values is not idx.values
+ assert s.index is not idx
+ assert s.name != idx.name
+
+
+def test_to_flat_index(idx):
+ expected = pd.Index((('foo', 'one'), ('foo', 'two'), ('bar', 'one'),
+ ('baz', 'two'), ('qux', 'one'), ('qux', 'two')),
+ tupleize_cols=False)
+ result = idx.to_flat_index()
+ tm.assert_index_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_copy.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_copy.py
new file mode 100644
index 00000000000..aaf2fe1cb63
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_copy.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+
+from copy import copy, deepcopy
+
+import pytest
+
+from pandas import MultiIndex
+import pandas.util.testing as tm
+
+
+def assert_multiindex_copied(copy, original):
+    # Levels should be (at least) shallow-copied
+    tm.assert_copy(copy.levels, original.levels)
+
+    # Codes must be equal but not the same object, however they were
+    # copied
+    tm.assert_almost_equal(copy.codes, original.codes)
+    assert copy.codes is not original.codes
+
+    # Names must be equal but not the same object, however they were
+    # copied
+    assert copy.names == original.names
+    assert copy.names is not original.names
+
+    # Sort order should be copied
+    assert copy.sortorder == original.sortorder
+
+
+def test_copy(idx):
+ i_copy = idx.copy()
+
+ assert_multiindex_copied(i_copy, idx)
+
+
+def test_shallow_copy(idx):
+ i_copy = idx._shallow_copy()
+
+ assert_multiindex_copied(i_copy, idx)
+
+
+def test_labels_deprecated(idx):
+ # GH23752
+ with tm.assert_produces_warning(FutureWarning):
+ idx.copy(labels=idx.codes)
+
+
+def test_view(idx):
+ i_view = idx.view()
+ assert_multiindex_copied(i_view, idx)
+
+
[email protected]('func', [copy, deepcopy])
+def test_copy_and_deepcopy(func):
+
+ idx = MultiIndex(
+ levels=[['foo', 'bar'], ['fizz', 'buzz']],
+ codes=[[0, 0, 0, 1], [0, 0, 1, 1]],
+ names=['first', 'second']
+ )
+ idx_copy = func(idx)
+ assert idx_copy is not idx
+ assert idx_copy.equals(idx)
+
+
[email protected]('deep', [True, False])
+def test_copy_method(deep):
+ idx = MultiIndex(
+ levels=[['foo', 'bar'], ['fizz', 'buzz']],
+ codes=[[0, 0, 0, 1], [0, 0, 1, 1]],
+ names=['first', 'second']
+ )
+ idx_copy = idx.copy(deep=deep)
+ assert idx_copy.equals(idx)
+
+
[email protected]('deep', [True, False])
[email protected]('kwarg, value', [
+    ('names', ['third', 'fourth']),
+ ('levels', [['foo2', 'bar2'], ['fizz2', 'buzz2']]),
+ ('codes', [[1, 0, 0, 0], [1, 1, 0, 0]])
+])
+def test_copy_method_kwargs(deep, kwarg, value):
+    # gh-12309: Check that the "name" argument as well as other kwargs
+    # are honored
+ idx = MultiIndex(
+ levels=[['foo', 'bar'], ['fizz', 'buzz']],
+ codes=[[0, 0, 0, 1], [0, 0, 1, 1]],
+ names=['first', 'second']
+ )
+    # a bare ``return`` disabled the checks below; skip explicitly
+    # (assuming the deactivation in this copy is deliberate)
+    pytest.skip("copy() kwargs checks disabled in this copy")
+ idx_copy = idx.copy(**{kwarg: value, 'deep': deep})
+ if kwarg == 'names':
+ assert getattr(idx_copy, kwarg) == value
+ else:
+ assert [list(i) for i in getattr(idx_copy, kwarg)] == value
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_drop.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_drop.py
new file mode 100644
index 00000000000..0cf73d3d752
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_drop.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+from pandas.errors import PerformanceWarning
+
+import pandas as pd
+from pandas import Index, MultiIndex
+import pandas.util.testing as tm
+
+
+def test_drop(idx):
+ dropped = idx.drop([('foo', 'two'), ('qux', 'one')])
+
+ index = MultiIndex.from_tuples([('foo', 'two'), ('qux', 'one')])
+ dropped2 = idx.drop(index)
+
+ expected = idx[[0, 2, 3, 5]]
+ tm.assert_index_equal(dropped, expected)
+ tm.assert_index_equal(dropped2, expected)
+
+ dropped = idx.drop(['bar'])
+ expected = idx[[0, 1, 3, 4, 5]]
+ tm.assert_index_equal(dropped, expected)
+
+ dropped = idx.drop('foo')
+ expected = idx[[2, 3, 4, 5]]
+ tm.assert_index_equal(dropped, expected)
+
+ index = MultiIndex.from_tuples([('bar', 'two')])
+ pytest.raises(KeyError, idx.drop, [('bar', 'two')])
+ pytest.raises(KeyError, idx.drop, index)
+ pytest.raises(KeyError, idx.drop, ['foo', 'two'])
+
+ # partially correct argument
+ mixed_index = MultiIndex.from_tuples([('qux', 'one'), ('bar', 'two')])
+ pytest.raises(KeyError, idx.drop, mixed_index)
+
+    # errors='ignore'
+ dropped = idx.drop(index, errors='ignore')
+ expected = idx[[0, 1, 2, 3, 4, 5]]
+ tm.assert_index_equal(dropped, expected)
+
+ dropped = idx.drop(mixed_index, errors='ignore')
+ expected = idx[[0, 1, 2, 3, 5]]
+ tm.assert_index_equal(dropped, expected)
+
+ dropped = idx.drop(['foo', 'two'], errors='ignore')
+ expected = idx[[2, 3, 4, 5]]
+ tm.assert_index_equal(dropped, expected)
+
+ # mixed partial / full drop
+ dropped = idx.drop(['foo', ('qux', 'one')])
+ expected = idx[[2, 3, 5]]
+ tm.assert_index_equal(dropped, expected)
+
+    # mixed partial / full drop / errors='ignore'
+ mixed_index = ['foo', ('qux', 'one'), 'two']
+ pytest.raises(KeyError, idx.drop, mixed_index)
+ dropped = idx.drop(mixed_index, errors='ignore')
+ expected = idx[[2, 3, 5]]
+ tm.assert_index_equal(dropped, expected)
+
+
+def test_droplevel_with_names(idx):
+ index = idx[idx.get_loc('foo')]
+ dropped = index.droplevel(0)
+ assert dropped.name == 'second'
+
+ index = MultiIndex(
+ levels=[Index(lrange(4)), Index(lrange(4)), Index(lrange(4))],
+        codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]),
+               np.array([0, 1, 0, 0, 0, 1, 0, 1]),
+               np.array([1, 0, 1, 1, 0, 0, 1, 0])],
+ names=['one', 'two', 'three'])
+ dropped = index.droplevel(0)
+ assert dropped.names == ('two', 'three')
+
+ dropped = index.droplevel('two')
+ expected = index.droplevel(1)
+ assert dropped.equals(expected)
+
+
+def test_droplevel_list():
+ index = MultiIndex(
+ levels=[Index(lrange(4)), Index(lrange(4)), Index(lrange(4))],
+        codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]),
+               np.array([0, 1, 0, 0, 0, 1, 0, 1]),
+               np.array([1, 0, 1, 1, 0, 0, 1, 0])],
+ names=['one', 'two', 'three'])
+
+ dropped = index[:2].droplevel(['three', 'one'])
+ expected = index[:2].droplevel(2).droplevel(0)
+ assert dropped.equals(expected)
+
+ dropped = index[:2].droplevel([])
+ expected = index[:2]
+ assert dropped.equals(expected)
+
+ with pytest.raises(ValueError):
+ index[:2].droplevel(['one', 'two', 'three'])
+
+ with pytest.raises(KeyError):
+ index[:2].droplevel(['one', 'four'])
+
+
+def test_drop_not_lexsorted():
+ # GH 12078
+
+ # define the lexsorted version of the multi-index
+ tuples = [('a', ''), ('b1', 'c1'), ('b2', 'c2')]
+ lexsorted_mi = MultiIndex.from_tuples(tuples, names=['b', 'c'])
+ assert lexsorted_mi.is_lexsorted()
+
+ # and the not-lexsorted version
+ df = pd.DataFrame(columns=['a', 'b', 'c', 'd'],
+ data=[[1, 'b1', 'c1', 3], [1, 'b2', 'c2', 4]])
+ df = df.pivot_table(index='a', columns=['b', 'c'], values='d')
+ df = df.reset_index()
+ not_lexsorted_mi = df.columns
+ assert not not_lexsorted_mi.is_lexsorted()
+
+ # compare the results
+ tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi)
+ with tm.assert_produces_warning(PerformanceWarning):
+ tm.assert_index_equal(lexsorted_mi.drop('a'),
+ not_lexsorted_mi.drop('a'))
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_duplicates.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_duplicates.py
new file mode 100644
index 00000000000..35034dc57b4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_duplicates.py
@@ -0,0 +1,278 @@
+# -*- coding: utf-8 -*-
+
+from itertools import product
+
+import numpy as np
+import pytest
+
+from pandas._libs import hashtable
+from pandas.compat import range, u
+
+from pandas import DatetimeIndex, MultiIndex
+import pandas.util.testing as tm
+
+
[email protected]('names', [None, ['first', 'second']])
+def test_unique(names):
+ mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names)
+
+ res = mi.unique()
+ exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names)
+ tm.assert_index_equal(res, exp)
+
+ mi = MultiIndex.from_arrays([list('aaaa'), list('abab')],
+ names=names)
+ res = mi.unique()
+ exp = MultiIndex.from_arrays([list('aa'), list('ab')], names=mi.names)
+ tm.assert_index_equal(res, exp)
+
+ mi = MultiIndex.from_arrays([list('aaaa'), list('aaaa')], names=names)
+ res = mi.unique()
+ exp = MultiIndex.from_arrays([['a'], ['a']], names=mi.names)
+ tm.assert_index_equal(res, exp)
+
+ # GH #20568 - empty MI
+ mi = MultiIndex.from_arrays([[], []], names=names)
+ res = mi.unique()
+ tm.assert_index_equal(mi, res)
+
+
+def test_unique_datetimelike():
+ idx1 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
+ '2015-01-01', 'NaT', 'NaT'])
+ idx2 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02',
+ '2015-01-02', 'NaT', '2015-01-01'],
+ tz='Asia/Tokyo')
+ result = MultiIndex.from_arrays([idx1, idx2]).unique()
+
+ eidx1 = DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT'])
+ eidx2 = DatetimeIndex(['2015-01-01', '2015-01-02',
+ 'NaT', '2015-01-01'],
+ tz='Asia/Tokyo')
+ exp = MultiIndex.from_arrays([eidx1, eidx2])
+ tm.assert_index_equal(result, exp)
+
+
[email protected]('level', [0, 'first', 1, 'second'])
+def test_unique_level(idx, level):
+ # GH #17896 - with level= argument
+ result = idx.unique(level=level)
+ expected = idx.get_level_values(level).unique()
+ tm.assert_index_equal(result, expected)
+
+ # With already unique level
+ mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]],
+ names=['first', 'second'])
+ result = mi.unique(level=level)
+ expected = mi.get_level_values(level)
+ tm.assert_index_equal(result, expected)
+
+ # With empty MI
+ mi = MultiIndex.from_arrays([[], []], names=['first', 'second'])
+ result = mi.unique(level=level)
+    expected = mi.get_level_values(level)
+    tm.assert_index_equal(result, expected)
+
+
[email protected]('dropna', [True, False])
+def test_get_unique_index(idx, dropna):
+ mi = idx[[0, 1, 0, 1, 1, 0, 0]]
+ expected = mi._shallow_copy(mi[[0, 1]])
+
+ result = mi._get_unique_index(dropna=dropna)
+    assert result.is_unique
+ tm.assert_index_equal(result, expected)
+
+
+def test_duplicate_multiindex_codes():
+ # GH 17464
+ # Make sure that a MultiIndex with duplicate levels throws a ValueError
+ with pytest.raises(ValueError):
+        MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)])
+
+ # And that using set_levels with duplicate levels fails
+ mi = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'],
+ [1, 2, 1, 2, 3]])
+ with pytest.raises(ValueError):
+ mi.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
+ inplace=True)
+
+
[email protected]('names', [['a', 'b', 'a'], [1, 1, 2],
+ [1, 'a', 1]])
+def test_duplicate_level_names(names):
+ # GH18872, GH19029
+ mi = MultiIndex.from_product([[0, 1]] * 3, names=names)
+ assert mi.names == names
+
+ # With .rename()
+ mi = MultiIndex.from_product([[0, 1]] * 3)
+ mi = mi.rename(names)
+ assert mi.names == names
+
+    # With .rename(..., level=)
+ mi.rename(names[1], level=1, inplace=True)
+ mi = mi.rename([names[0], names[2]], level=[0, 2])
+ assert mi.names == names
+
+
+def test_duplicate_meta_data():
+ # GH 10115
+ mi = MultiIndex(
+ levels=[[0, 1], [0, 1, 2]],
+ codes=[[0, 0, 0, 0, 1, 1, 1],
+ [0, 1, 2, 0, 0, 1, 2]])
+
+ for idx in [mi,
+ mi.set_names([None, None]),
+ mi.set_names([None, 'Num']),
+ mi.set_names(['Upper', 'Num']), ]:
+ assert idx.has_duplicates
+ assert idx.drop_duplicates().names == idx.names
+
+
+def test_has_duplicates(idx, idx_dup):
+ # see fixtures
+ assert idx.is_unique is True
+ assert idx.has_duplicates is False
+ assert idx_dup.is_unique is False
+ assert idx_dup.has_duplicates is True
+
+ mi = MultiIndex(levels=[[0, 1], [0, 1, 2]],
+ codes=[[0, 0, 0, 0, 1, 1, 1],
+ [0, 1, 2, 0, 0, 1, 2]])
+ assert mi.is_unique is False
+ assert mi.has_duplicates is True
+
+ # single instance of NaN
+ mi_nan = MultiIndex(levels=[['a', 'b'], [0, 1]],
+ codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]])
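+    # (-1 in the codes marks the missing value)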
+ assert mi_nan.is_unique is True
+ assert mi_nan.has_duplicates is False
+
+ # multiple instances of NaN
+ mi_nan_dup = MultiIndex(levels=[['a', 'b'], [0, 1]],
+ codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]])
+ assert mi_nan_dup.is_unique is False
+ assert mi_nan_dup.has_duplicates is True
+
+
+def test_has_duplicates_from_tuples():
+ # GH 9075
+ t = [(u('x'), u('out'), u('z'), 5, u('y'), u('in'), u('z'), 169),
+ (u('x'), u('out'), u('z'), 7, u('y'), u('in'), u('z'), 119),
+ (u('x'), u('out'), u('z'), 9, u('y'), u('in'), u('z'), 135),
+ (u('x'), u('out'), u('z'), 13, u('y'), u('in'), u('z'), 145),
+ (u('x'), u('out'), u('z'), 14, u('y'), u('in'), u('z'), 158),
+ (u('x'), u('out'), u('z'), 16, u('y'), u('in'), u('z'), 122),
+ (u('x'), u('out'), u('z'), 17, u('y'), u('in'), u('z'), 160),
+ (u('x'), u('out'), u('z'), 18, u('y'), u('in'), u('z'), 180),
+ (u('x'), u('out'), u('z'), 20, u('y'), u('in'), u('z'), 143),
+ (u('x'), u('out'), u('z'), 21, u('y'), u('in'), u('z'), 128),
+ (u('x'), u('out'), u('z'), 22, u('y'), u('in'), u('z'), 129),
+ (u('x'), u('out'), u('z'), 25, u('y'), u('in'), u('z'), 111),
+ (u('x'), u('out'), u('z'), 28, u('y'), u('in'), u('z'), 114),
+ (u('x'), u('out'), u('z'), 29, u('y'), u('in'), u('z'), 121),
+ (u('x'), u('out'), u('z'), 31, u('y'), u('in'), u('z'), 126),
+ (u('x'), u('out'), u('z'), 32, u('y'), u('in'), u('z'), 155),
+ (u('x'), u('out'), u('z'), 33, u('y'), u('in'), u('z'), 123),
+ (u('x'), u('out'), u('z'), 12, u('y'), u('in'), u('z'), 144)]
+
+ mi = MultiIndex.from_tuples(t)
+ assert not mi.has_duplicates
+
+
+def test_has_duplicates_overflow():
+ # handle int64 overflow if possible
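+    # with enough levels the product of level sizes no longer fits in
+    # int64, so duplicated() has to fall back to compressing the codes
+    # (hash-based relabelling) instead of a single flat group index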
+ def check(nlevels, with_nulls):
+ codes = np.tile(np.arange(500), 2)
+ level = np.arange(500)
+
+ if with_nulls: # inject some null values
+ codes[500] = -1 # common nan value
+ codes = [codes.copy() for i in range(nlevels)]
+ for i in range(nlevels):
+ codes[i][500 + i - nlevels // 2] = -1
+
+ codes += [np.array([-1, 1]).repeat(500)]
+ else:
+ codes = [codes] * nlevels + [np.arange(2).repeat(500)]
+
+ levels = [level] * nlevels + [[0, 1]]
+
+ # no dups
+ mi = MultiIndex(levels=levels, codes=codes)
+ assert not mi.has_duplicates
+
+ # with a dup
+ if with_nulls:
+ def f(a):
+ return np.insert(a, 1000, a[0])
+ codes = list(map(f, codes))
+ mi = MultiIndex(levels=levels, codes=codes)
+ else:
+ values = mi.values.tolist()
+ mi = MultiIndex.from_tuples(values + [values[0]])
+
+ assert mi.has_duplicates
+
+ # no overflow
+ check(4, False)
+ check(4, True)
+
+ # overflow possible
+ check(8, False)
+ check(8, True)
+
+
[email protected]('keep, expected', [
+ ('first', np.array([False, False, False, True, True, False])),
+ ('last', np.array([False, True, True, False, False, False])),
+ (False, np.array([False, True, True, True, True, False]))
+])
+def test_duplicated(idx_dup, keep, expected):
+ result = idx_dup.duplicated(keep=keep)
+ tm.assert_numpy_array_equal(result, expected)
+
+
[email protected]('keep', ['first', 'last', False])
+def test_duplicated_large(keep):
+ # GH 9125
+ n, k = 200, 5000
+ levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
+    codes = [np.random.choice(n, k * n) for _ in levels]
+ mi = MultiIndex(levels=levels, codes=codes)
+
+ result = mi.duplicated(keep=keep)
+ expected = hashtable.duplicated_object(mi.values, keep=keep)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_get_duplicates():
+ # GH5873
+ for a in [101, 102]:
+ mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
+ assert not mi.has_duplicates
+
+ with tm.assert_produces_warning(FutureWarning):
+ # Deprecated - see GH20239
+ assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []]))
+
+ tm.assert_numpy_array_equal(mi.duplicated(),
+ np.zeros(2, dtype='bool'))
+
+ for n in range(1, 6): # 1st level shape
+ for m in range(1, 5): # 2nd level shape
+ # all possible unique combinations, including nan
+ codes = product(range(-1, n), range(-1, m))
+ mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]],
+ codes=np.random.permutation(list(codes)).T)
+ assert len(mi) == (n + 1) * (m + 1)
+ assert not mi.has_duplicates
+
+ with tm.assert_produces_warning(FutureWarning):
+ # Deprecated - see GH20239
+ assert mi.get_duplicates().equals(MultiIndex.from_arrays(
+ [[], []]))
+
+ tm.assert_numpy_array_equal(mi.duplicated(),
+ np.zeros(len(mi), dtype='bool'))
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_equivalence.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_equivalence.py
new file mode 100644
index 00000000000..6a9eb662dd9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_equivalence.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange, lzip, range
+
+import pandas as pd
+from pandas import Index, MultiIndex, Series
+import pandas.util.testing as tm
+
+
+def test_equals(idx):
+ assert idx.equals(idx)
+ assert idx.equals(idx.copy())
+ assert idx.equals(idx.astype(object))
+
+ assert not idx.equals(list(idx))
+ assert not idx.equals(np.array(idx))
+
+ same_values = Index(idx, dtype=object)
+ assert idx.equals(same_values)
+ assert same_values.equals(idx)
+
+ if idx.nlevels == 1:
+ # do not test MultiIndex
+ assert not idx.equals(pd.Series(idx))
+
+
+def test_equals_op(idx):
+ # GH9947, GH10637
+ index_a = idx
+
+ n = len(index_a)
+ index_b = index_a[0:-1]
+ index_c = index_a[0:-1].append(index_a[-2:-1])
+ index_d = index_a[0:1]
+ with pytest.raises(ValueError, match="Lengths must match"):
+ index_a == index_b
+ expected1 = np.array([True] * n)
+ expected2 = np.array([True] * (n - 1) + [False])
+ tm.assert_numpy_array_equal(index_a == index_a, expected1)
+ tm.assert_numpy_array_equal(index_a == index_c, expected2)
+
+ # test comparisons with numpy arrays
+ array_a = np.array(index_a)
+ array_b = np.array(index_a[0:-1])
+ array_c = np.array(index_a[0:-1].append(index_a[-2:-1]))
+ array_d = np.array(index_a[0:1])
+ with pytest.raises(ValueError, match="Lengths must match"):
+ index_a == array_b
+ tm.assert_numpy_array_equal(index_a == array_a, expected1)
+ tm.assert_numpy_array_equal(index_a == array_c, expected2)
+
+ # test comparisons with Series
+ series_a = Series(array_a)
+ series_b = Series(array_b)
+ series_c = Series(array_c)
+ series_d = Series(array_d)
+ with pytest.raises(ValueError, match="Lengths must match"):
+ index_a == series_b
+
+ tm.assert_numpy_array_equal(index_a == series_a, expected1)
+ tm.assert_numpy_array_equal(index_a == series_c, expected2)
+
+ # cases where length is 1 for one of them
+ with pytest.raises(ValueError, match="Lengths must match"):
+ index_a == index_d
+ with pytest.raises(ValueError, match="Lengths must match"):
+ index_a == series_d
+ with pytest.raises(ValueError, match="Lengths must match"):
+ index_a == array_d
+ msg = "Can only compare identically-labeled Series objects"
+ with pytest.raises(ValueError, match=msg):
+ series_a == series_d
+ with pytest.raises(ValueError, match="Lengths must match"):
+ series_a == array_d
+
+ # comparing with a scalar should broadcast; note that we are excluding
+ # MultiIndex because in this case each item in the index is a tuple of
+ # length 2, and therefore is considered an array of length 2 in the
+ # comparison instead of a scalar
+ if not isinstance(index_a, MultiIndex):
+ expected3 = np.array([False] * (len(index_a) - 2) + [True, False])
+ # assuming the 2nd to last item is unique in the data
+ item = index_a[-2]
+ tm.assert_numpy_array_equal(index_a == item, expected3)
+ tm.assert_series_equal(series_a == item, Series(expected3))
+
+
+def test_equals_multi(idx):
+ assert idx.equals(idx)
+ assert not idx.equals(idx.values)
+ assert idx.equals(Index(idx.values))
+
+ assert idx.equal_levels(idx)
+ assert not idx.equals(idx[:-1])
+ assert not idx.equals(idx[-1])
+
+ # different number of levels
+    index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)),
+                               Index(lrange(4))],
+                       codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]),
+                              np.array([0, 1, 0, 0, 0, 1, 0, 1]),
+                              np.array([1, 0, 1, 1, 0, 0, 1, 0])])
+
+ index2 = MultiIndex(levels=index.levels[:-1], codes=index.codes[:-1])
+ assert not index.equals(index2)
+ assert not index.equal_levels(index2)
+
+ # levels are different
+ major_axis = Index(lrange(4))
+ minor_axis = Index(lrange(2))
+
+ major_codes = np.array([0, 0, 1, 2, 2, 3])
+ minor_codes = np.array([0, 1, 0, 0, 1, 0])
+
+ index = MultiIndex(levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes])
+ assert not idx.equals(index)
+ assert not idx.equal_levels(index)
+
+ # some of the labels are different
+ major_axis = Index(['foo', 'bar', 'baz', 'qux'])
+ minor_axis = Index(['one', 'two'])
+
+ major_codes = np.array([0, 0, 2, 2, 3, 3])
+ minor_codes = np.array([0, 1, 0, 1, 0, 1])
+
+ index = MultiIndex(levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes])
+ assert not idx.equals(index)
+
+
+def test_identical(idx):
+ mi = idx.copy()
+ mi2 = idx.copy()
+ assert mi.identical(mi2)
+
+ mi = mi.set_names(['new1', 'new2'])
+ assert mi.equals(mi2)
+ assert not mi.identical(mi2)
+
+ mi2 = mi2.set_names(['new1', 'new2'])
+ assert mi.identical(mi2)
+
+ mi3 = Index(mi.tolist(), names=mi.names)
+ mi4 = Index(mi.tolist(), names=mi.names, tupleize_cols=False)
+ assert mi.identical(mi3)
+ assert not mi.identical(mi4)
+ assert mi.equals(mi4)
+
+
+def test_equals_operator(idx):
+ # GH9785
+ assert (idx == idx).all()
+
+
+def test_equals_missing_values():
+ # make sure take is not using -1
+ i = pd.MultiIndex.from_tuples([(0, pd.NaT),
+ (0, pd.Timestamp('20130101'))])
+ result = i[0:1].equals(i[0])
+ assert not result
+ result = i[1:2].equals(i[1])
+ assert not result
+
+
+def test_is_():
+ mi = MultiIndex.from_tuples(lzip(range(10), range(10)))
+ assert mi.is_(mi)
+ assert mi.is_(mi.view())
+ assert mi.is_(mi.view().view().view().view())
+ mi2 = mi.view()
+ # names are metadata, they don't change id
+ mi2.names = ["A", "B"]
+ assert mi2.is_(mi)
+ assert mi.is_(mi2)
+
+ assert mi.is_(mi.set_names(["C", "D"]))
+ mi2 = mi.view()
+ mi2.set_names(["E", "F"], inplace=True)
+ assert mi.is_(mi2)
+ # levels are inherent properties, they change identity
+ mi3 = mi2.set_levels([lrange(10), lrange(10)])
+ assert not mi3.is_(mi2)
+ # shouldn't change
+ assert mi2.is_(mi)
+ mi4 = mi3.view()
+
+ # GH 17464 - Remove duplicate MultiIndex levels
+ mi4.set_levels([lrange(10), lrange(10)], inplace=True)
+ assert not mi4.is_(mi3)
+ mi5 = mi.view()
+ mi5.set_levels(mi5.levels, inplace=True)
+ assert not mi5.is_(mi)
+
+
+def test_is_all_dates(idx):
+ assert not idx.is_all_dates
+
+
+def test_is_numeric(idx):
+ # MultiIndex is never numeric
+ assert not idx.is_numeric()
+
+
+def test_multiindex_compare():
+ # GH 21149
+ # Ensure comparison operations for MultiIndex with nlevels == 1
+ # behave consistently with those for MultiIndex with nlevels > 1
+
+ midx = pd.MultiIndex.from_product([[0, 1]])
+
+ # Equality self-test: MultiIndex object vs self
+ expected = pd.Series([True, True])
+ result = pd.Series(midx == midx)
+ tm.assert_series_equal(result, expected)
+
+ # Greater than comparison: MultiIndex object vs self
+ expected = pd.Series([False, False])
+ result = pd.Series(midx > midx)
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_format.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_format.py
new file mode 100644
index 00000000000..a10b7220b8a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_format.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+
+
+import warnings
+
+import pytest
+
+from pandas.compat import PY3, range, u
+
+import pandas as pd
+from pandas import MultiIndex, compat
+import pandas.util.testing as tm
+
+
+def test_dtype_str(indices):
+ dtype = indices.dtype_str
+ assert isinstance(dtype, compat.string_types)
+ assert dtype == str(indices.dtype)
+
+
+def test_format(idx):
+ idx.format()
+ idx[:0].format()
+
+
+def test_format_integer_names():
+ index = MultiIndex(levels=[[0, 1], [0, 1]],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1])
+ index.format(names=True)
+
+
+def test_format_sparse_config(idx):
+ warn_filters = warnings.filters
+ warnings.filterwarnings('ignore', category=FutureWarning,
+ module=".*format")
+ # GH1538
+ pd.set_option('display.multi_sparse', False)
+
+ result = idx.format()
+ assert result[1] == 'foo two'
+
+ tm.reset_display_options()
+
+ warnings.filters = warn_filters
+
+
+def test_format_sparse_display():
+ index = MultiIndex(levels=[[0, 1], [0, 1], [0, 1], [0]],
+ codes=[[0, 0, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1],
+ [0, 1, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0]])
+
+ result = index.format()
+ assert result[3] == '1 0 0 0'
+
+
+def test_repr_with_unicode_data():
+ with pd.core.config.option_context("display.encoding", 'UTF-8'):
+ d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
+ index = pd.DataFrame(d).set_index(["a", "b"]).index
+ assert "\\u" not in repr(index) # we don't want unicode-escaped
+
+
[email protected](reason="#22511 will remove this test")
+def test_repr_roundtrip():
+
+ mi = MultiIndex.from_product([list('ab'), range(3)],
+ names=['first', 'second'])
+ str(mi)
+
+ if PY3:
+ tm.assert_index_equal(eval(repr(mi)), mi, exact=True)
+ else:
+ result = eval(repr(mi))
+ # string coerces to unicode
+ tm.assert_index_equal(result, mi, exact=False)
+ assert mi.get_level_values('first').inferred_type == 'string'
+ assert result.get_level_values('first').inferred_type == 'unicode'
+
+ mi_u = MultiIndex.from_product(
+ [list(u'ab'), range(3)], names=['first', 'second'])
+ result = eval(repr(mi_u))
+ tm.assert_index_equal(result, mi_u, exact=True)
+
+ # formatting
+ if PY3:
+ str(mi)
+ else:
+ compat.text_type(mi)
+
+ # long format
+ mi = MultiIndex.from_product([list('abcdefg'), range(10)],
+ names=['first', 'second'])
+
+ if PY3:
+ tm.assert_index_equal(eval(repr(mi)), mi, exact=True)
+ else:
+ result = eval(repr(mi))
+ # string coerces to unicode
+ tm.assert_index_equal(result, mi, exact=False)
+ assert mi.get_level_values('first').inferred_type == 'string'
+ assert result.get_level_values('first').inferred_type == 'unicode'
+
+ result = eval(repr(mi_u))
+ tm.assert_index_equal(result, mi_u, exact=True)
+
+
+def test_unicode_string_with_unicode():
+ d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
+ idx = pd.DataFrame(d).set_index(["a", "b"]).index
+
+ if PY3:
+ str(idx)
+ else:
+ compat.text_type(idx)
+
+
+def test_bytestring_with_unicode():
+ d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
+ idx = pd.DataFrame(d).set_index(["a", "b"]).index
+
+ if PY3:
+ bytes(idx)
+ else:
+ str(idx)
+
+
+def test_repr_max_seq_item_setting(idx):
+ # GH10182
+ idx = idx.repeat(50)
+ with pd.option_context("display.max_seq_items", None):
+ repr(idx)
+ assert '...' not in str(idx)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_get_set.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_get_set.py
new file mode 100644
index 00000000000..d201cb2eb17
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_get_set.py
@@ -0,0 +1,454 @@
+# -*- coding: utf-8 -*-
+
+
+import numpy as np
+import pytest
+
+from pandas.compat import range
+
+import pandas as pd
+from pandas import CategoricalIndex, Index, MultiIndex
+import pandas.util.testing as tm
+
+
+def assert_matching(actual, expected, check_dtype=False):
+ # avoid specifying internal representation
+ # as much as possible
+ assert len(actual) == len(expected)
+ for act, exp in zip(actual, expected):
+ act = np.asarray(act)
+ exp = np.asarray(exp)
+ tm.assert_numpy_array_equal(act, exp, check_dtype=check_dtype)
+
+
+def test_get_level_number_integer(idx):
+ idx.names = [1, 0]
+ assert idx._get_level_number(1) == 0
+ assert idx._get_level_number(0) == 1
+ pytest.raises(IndexError, idx._get_level_number, 2)
+ with pytest.raises(KeyError, match='Level fourth not found'):
+ idx._get_level_number('fourth')
+
+
+def test_get_level_values(idx):
+ result = idx.get_level_values(0)
+ expected = Index(['foo', 'foo', 'bar', 'baz', 'qux', 'qux'],
+ name='first')
+ tm.assert_index_equal(result, expected)
+ assert result.name == 'first'
+
+ result = idx.get_level_values('first')
+ expected = idx.get_level_values(0)
+ tm.assert_index_equal(result, expected)
+
+ # GH 10460
+ index = MultiIndex(
+ levels=[CategoricalIndex(['A', 'B']),
+ CategoricalIndex([1, 2, 3])],
+ codes=[np.array([0, 0, 0, 1, 1, 1]),
+ np.array([0, 1, 2, 0, 1, 2])])
+
+ exp = CategoricalIndex(['A', 'A', 'A', 'B', 'B', 'B'])
+ tm.assert_index_equal(index.get_level_values(0), exp)
+ exp = CategoricalIndex([1, 2, 3, 1, 2, 3])
+ tm.assert_index_equal(index.get_level_values(1), exp)
+
+
+def test_get_value_duplicates():
+ index = MultiIndex(levels=[['D', 'B', 'C'],
+ [0, 26, 27, 37, 57, 67, 75, 82]],
+ codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2],
+ [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
+ names=['tag', 'day'])
+
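+    # get_loc resolves the non-unique partial key to a slice, while the
+    # low-level engine lookup raises for it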
+ assert index.get_loc('D') == slice(0, 3)
+ with pytest.raises(KeyError):
+ index._engine.get_value(np.array([]), 'D')
+
+
+def test_get_level_values_all_na():
+ # GH 17924 when level entirely consists of nan
+ arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]]
+ index = pd.MultiIndex.from_arrays(arrays)
+ result = index.get_level_values(0)
+ expected = pd.Index([np.nan, np.nan, np.nan], dtype=np.float64)
+ tm.assert_index_equal(result, expected)
+
+ result = index.get_level_values(1)
+ expected = pd.Index(['a', np.nan, 1], dtype=object)
+ tm.assert_index_equal(result, expected)
+
+
+def test_get_level_values_int_with_na():
+ # GH 17924
+ arrays = [['a', 'b', 'b'], [1, np.nan, 2]]
+ index = pd.MultiIndex.from_arrays(arrays)
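+    # the NaN forces the otherwise-integer level values to float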
+ result = index.get_level_values(1)
+ expected = Index([1, np.nan, 2])
+ tm.assert_index_equal(result, expected)
+
+ arrays = [['a', 'b', 'b'], [np.nan, np.nan, 2]]
+ index = pd.MultiIndex.from_arrays(arrays)
+ result = index.get_level_values(1)
+ expected = Index([np.nan, np.nan, 2])
+ tm.assert_index_equal(result, expected)
+
+
+def test_get_level_values_na():
+ arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]]
+ index = pd.MultiIndex.from_arrays(arrays)
+ result = index.get_level_values(0)
+ expected = pd.Index([np.nan, np.nan, np.nan])
+ tm.assert_index_equal(result, expected)
+
+ result = index.get_level_values(1)
+ expected = pd.Index(['a', np.nan, 1])
+ tm.assert_index_equal(result, expected)
+
+ arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])]
+ index = pd.MultiIndex.from_arrays(arrays)
+ result = index.get_level_values(1)
+ expected = pd.DatetimeIndex([0, 1, pd.NaT])
+ tm.assert_index_equal(result, expected)
+
+ arrays = [[], []]
+ index = pd.MultiIndex.from_arrays(arrays)
+ result = index.get_level_values(0)
+ expected = pd.Index([], dtype=object)
+ tm.assert_index_equal(result, expected)
+
+
+def test_set_name_methods(idx, index_names):
+ # so long as these are synonyms, we don't need to test set_names
+ assert idx.rename == idx.set_names
+ new_names = [name + "SUFFIX" for name in index_names]
+ ind = idx.set_names(new_names)
+ assert idx.names == index_names
+ assert ind.names == new_names
+ with pytest.raises(ValueError, match="^Length"):
+ ind.set_names(new_names + new_names)
+ new_names2 = [name + "SUFFIX2" for name in new_names]
+ res = ind.set_names(new_names2, inplace=True)
+ assert res is None
+ assert ind.names == new_names2
+
+    # set names for specific level (GH 7792)
+ ind = idx.set_names(new_names[0], level=0)
+ assert idx.names == index_names
+ assert ind.names == [new_names[0], index_names[1]]
+
+ res = ind.set_names(new_names2[0], level=0, inplace=True)
+ assert res is None
+ assert ind.names == [new_names2[0], index_names[1]]
+
+ # set names for multiple levels
+ ind = idx.set_names(new_names, level=[0, 1])
+ assert idx.names == index_names
+ assert ind.names == new_names
+
+ res = ind.set_names(new_names2, level=[0, 1], inplace=True)
+ assert res is None
+ assert ind.names == new_names2
+
+
+def test_set_levels_codes_directly(idx):
+ # setting levels/codes directly raises AttributeError
+
+ levels = idx.levels
+ new_levels = [[lev + 'a' for lev in level] for level in levels]
+
+ codes = idx.codes
+ major_codes, minor_codes = codes
+ major_codes = [(x + 1) % 3 for x in major_codes]
+ minor_codes = [(x + 1) % 1 for x in minor_codes]
+ new_codes = [major_codes, minor_codes]
+
+ with pytest.raises(AttributeError):
+ idx.levels = new_levels
+
+ with pytest.raises(AttributeError):
+ idx.codes = new_codes
+
+
+def test_set_levels(idx):
+ # side note - you probably wouldn't want to use levels and codes
+ # directly like this - but it is possible.
+ levels = idx.levels
+ new_levels = [[lev + 'a' for lev in level] for level in levels]
+
+ # level changing [w/o mutation]
+ ind2 = idx.set_levels(new_levels)
+ assert_matching(ind2.levels, new_levels)
+ assert_matching(idx.levels, levels)
+
+ # level changing [w/ mutation]
+ ind2 = idx.copy()
+ inplace_return = ind2.set_levels(new_levels, inplace=True)
+ assert inplace_return is None
+ assert_matching(ind2.levels, new_levels)
+
+ # level changing specific level [w/o mutation]
+ ind2 = idx.set_levels(new_levels[0], level=0)
+ assert_matching(ind2.levels, [new_levels[0], levels[1]])
+ assert_matching(idx.levels, levels)
+
+ ind2 = idx.set_levels(new_levels[1], level=1)
+ assert_matching(ind2.levels, [levels[0], new_levels[1]])
+ assert_matching(idx.levels, levels)
+
+ # level changing multiple levels [w/o mutation]
+ ind2 = idx.set_levels(new_levels, level=[0, 1])
+ assert_matching(ind2.levels, new_levels)
+ assert_matching(idx.levels, levels)
+
+ # level changing specific level [w/ mutation]
+ ind2 = idx.copy()
+ inplace_return = ind2.set_levels(new_levels[0], level=0, inplace=True)
+ assert inplace_return is None
+ assert_matching(ind2.levels, [new_levels[0], levels[1]])
+ assert_matching(idx.levels, levels)
+
+ ind2 = idx.copy()
+ inplace_return = ind2.set_levels(new_levels[1], level=1, inplace=True)
+ assert inplace_return is None
+ assert_matching(ind2.levels, [levels[0], new_levels[1]])
+ assert_matching(idx.levels, levels)
+
+ # level changing multiple levels [w/ mutation]
+ ind2 = idx.copy()
+ inplace_return = ind2.set_levels(new_levels, level=[0, 1],
+ inplace=True)
+ assert inplace_return is None
+ assert_matching(ind2.levels, new_levels)
+ assert_matching(idx.levels, levels)
+
+ # illegal level changing should not change levels
+ # GH 13754
+ original_index = idx.copy()
+ for inplace in [True, False]:
+ with pytest.raises(ValueError, match="^On"):
+ idx.set_levels(['c'], level=0, inplace=inplace)
+ assert_matching(idx.levels, original_index.levels,
+ check_dtype=True)
+
+ with pytest.raises(ValueError, match="^On"):
+ idx.set_codes([0, 1, 2, 3, 4, 5], level=0,
+ inplace=inplace)
+ assert_matching(idx.codes, original_index.codes,
+ check_dtype=True)
+
+ with pytest.raises(TypeError, match="^Levels"):
+ idx.set_levels('c', level=0, inplace=inplace)
+ assert_matching(idx.levels, original_index.levels,
+ check_dtype=True)
+
+ with pytest.raises(TypeError, match="^Codes"):
+ idx.set_codes(1, level=0, inplace=inplace)
+ assert_matching(idx.codes, original_index.codes,
+ check_dtype=True)
+
+
+def test_set_codes(idx):
+ # side note - you probably wouldn't want to use levels and codes
+ # directly like this - but it is possible.
+ codes = idx.codes
+ major_codes, minor_codes = codes
+ major_codes = [(x + 1) % 3 for x in major_codes]
+ minor_codes = [(x + 1) % 1 for x in minor_codes]
+ new_codes = [major_codes, minor_codes]
+
+ # changing codes w/o mutation
+ ind2 = idx.set_codes(new_codes)
+ assert_matching(ind2.codes, new_codes)
+ assert_matching(idx.codes, codes)
+
+    # changing codes w/ mutation
+ ind2 = idx.copy()
+ inplace_return = ind2.set_codes(new_codes, inplace=True)
+ assert inplace_return is None
+ assert_matching(ind2.codes, new_codes)
+
+ # codes changing specific level w/o mutation
+ ind2 = idx.set_codes(new_codes[0], level=0)
+ assert_matching(ind2.codes, [new_codes[0], codes[1]])
+ assert_matching(idx.codes, codes)
+
+ ind2 = idx.set_codes(new_codes[1], level=1)
+ assert_matching(ind2.codes, [codes[0], new_codes[1]])
+ assert_matching(idx.codes, codes)
+
+ # codes changing multiple levels w/o mutation
+ ind2 = idx.set_codes(new_codes, level=[0, 1])
+ assert_matching(ind2.codes, new_codes)
+ assert_matching(idx.codes, codes)
+
+    # codes changing specific level w/ mutation
+ ind2 = idx.copy()
+ inplace_return = ind2.set_codes(new_codes[0], level=0, inplace=True)
+ assert inplace_return is None
+ assert_matching(ind2.codes, [new_codes[0], codes[1]])
+ assert_matching(idx.codes, codes)
+
+ ind2 = idx.copy()
+ inplace_return = ind2.set_codes(new_codes[1], level=1, inplace=True)
+ assert inplace_return is None
+ assert_matching(ind2.codes, [codes[0], new_codes[1]])
+ assert_matching(idx.codes, codes)
+
+ # codes changing multiple levels [w/ mutation]
+ ind2 = idx.copy()
+ inplace_return = ind2.set_codes(new_codes, level=[0, 1],
+ inplace=True)
+ assert inplace_return is None
+ assert_matching(ind2.codes, new_codes)
+ assert_matching(idx.codes, codes)
+
+    # codes changing for levels with different numbers of categories
+ ind = pd.MultiIndex.from_tuples([(0, i) for i in range(130)])
+ new_codes = range(129, -1, -1)
+ expected = pd.MultiIndex.from_tuples(
+ [(0, i) for i in new_codes])
+
+ # [w/o mutation]
+ result = ind.set_codes(codes=new_codes, level=1)
+ assert result.equals(expected)
+
+ # [w/ mutation]
+ result = ind.copy()
+ result.set_codes(codes=new_codes, level=1, inplace=True)
+ assert result.equals(expected)
+
+ with tm.assert_produces_warning(FutureWarning):
+ ind.set_codes(labels=new_codes, level=1)
+
+
+def test_set_labels_deprecated():
+ # GH23752
+ ind = pd.MultiIndex.from_tuples([(0, i) for i in range(130)])
+ new_labels = range(129, -1, -1)
+ expected = pd.MultiIndex.from_tuples(
+ [(0, i) for i in new_labels])
+
+ # [w/o mutation]
+ with tm.assert_produces_warning(FutureWarning):
+ result = ind.set_labels(labels=new_labels, level=1)
+ assert result.equals(expected)
+
+ # [w/ mutation]
+ result = ind.copy()
+ with tm.assert_produces_warning(FutureWarning):
+ result.set_labels(labels=new_labels, level=1, inplace=True)
+ assert result.equals(expected)
+
+
+def test_set_levels_codes_names_bad_input(idx):
+ levels, codes = idx.levels, idx.codes
+ names = idx.names
+
+ with pytest.raises(ValueError, match='Length of levels'):
+ idx.set_levels([levels[0]])
+
+ with pytest.raises(ValueError, match='Length of codes'):
+ idx.set_codes([codes[0]])
+
+ with pytest.raises(ValueError, match='Length of names'):
+ idx.set_names([names[0]])
+
+    # scalar data shouldn't raise a length error; it should demand
+    # list-like input instead
+ with pytest.raises(TypeError, match='list of lists-like'):
+ idx.set_levels(levels[0])
+
+    # scalar data shouldn't raise a length error; it should demand
+    # list-like input instead
+ with pytest.raises(TypeError, match='list of lists-like'):
+ idx.set_codes(codes[0])
+
+    # scalar data shouldn't raise a length error; it should demand
+    # list-like input instead
+ with pytest.raises(TypeError, match='list-like'):
+ idx.set_names(names[0])
+
+ # should have equal lengths
+ with pytest.raises(TypeError, match='list of lists-like'):
+ idx.set_levels(levels[0], level=[0, 1])
+
+ with pytest.raises(TypeError, match='list-like'):
+ idx.set_levels(levels, level=0)
+
+ # should have equal lengths
+ with pytest.raises(TypeError, match='list of lists-like'):
+ idx.set_codes(codes[0], level=[0, 1])
+
+ with pytest.raises(TypeError, match='list-like'):
+ idx.set_codes(codes, level=0)
+
+ # should have equal lengths
+ with pytest.raises(ValueError, match='Length of names'):
+ idx.set_names(names[0], level=[0, 1])
+
+ with pytest.raises(TypeError, match='Names must be a'):
+ idx.set_names(names, level=0)
+
+
[email protected]('inplace', [True, False])
+def test_set_names_with_nlevel_1(inplace):
+ # GH 21149
+ # Ensure that .set_names for MultiIndex with
+ # nlevels == 1 does not raise any errors
+ expected = pd.MultiIndex(levels=[[0, 1]],
+ codes=[[0, 1]],
+ names=['first'])
+ m = pd.MultiIndex.from_product([[0, 1]])
+ result = m.set_names('first', level=0, inplace=inplace)
+
+ if inplace:
+ result = m
+
+ tm.assert_index_equal(result, expected)
+
+
[email protected]('ordered', [True, False])
+def test_set_levels_categorical(ordered):
+ # GH13854
+ index = MultiIndex.from_arrays([list("xyzx"), [0, 1, 2, 3]])
+
+ cidx = CategoricalIndex(list("bac"), ordered=ordered)
+ result = index.set_levels(cidx, 0)
+ expected = MultiIndex(levels=[cidx, [0, 1, 2, 3]],
+ codes=index.codes)
+ tm.assert_index_equal(result, expected)
+
+ result_lvl = result.get_level_values(0)
+ expected_lvl = CategoricalIndex(list("bacb"),
+ categories=cidx.categories,
+ ordered=cidx.ordered)
+ tm.assert_index_equal(result_lvl, expected_lvl)
+
+
+def test_set_value_keeps_names():
+ # motivating example from #3742
+ lev1 = ['hans', 'hans', 'hans', 'grethe', 'grethe', 'grethe']
+ lev2 = ['1', '2', '3'] * 2
+ idx = pd.MultiIndex.from_arrays([lev1, lev2], names=['Name', 'Number'])
+ df = pd.DataFrame(
+ np.random.randn(6, 4),
+ columns=['one', 'two', 'three', 'four'],
+ index=idx)
+ df = df.sort_index()
+ assert df._is_copy is None
+ assert df.index.names == ('Name', 'Number')
+ df.at[('grethe', '4'), 'one'] = 99.34
+ assert df._is_copy is None
+ assert df.index.names == ('Name', 'Number')
+
+
+def test_set_levels_with_iterable():
+ # GH23273
+ sizes = [1, 2, 3]
+ colors = ['black'] * 3
+ index = pd.MultiIndex.from_arrays([sizes, colors], names=['size', 'color'])
+
+ result = index.set_levels(map(int, ['3', '2', '1']), level='size')
+
+ expected_sizes = [3, 2, 1]
+ expected = pd.MultiIndex.from_arrays([expected_sizes, colors],
+ names=['size', 'color'])
+ tm.assert_index_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_indexing.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_indexing.py
new file mode 100644
index 00000000000..c40ecd9e82a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_indexing.py
@@ -0,0 +1,375 @@
+# -*- coding: utf-8 -*-
+
+
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+
+import pandas as pd
+from pandas import (
+ Categorical, CategoricalIndex, Index, IntervalIndex, MultiIndex,
+ date_range)
+from pandas.core.indexes.base import InvalidIndexError
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal
+
+
+def test_slice_locs_partial(idx):
+ sorted_idx, _ = idx.sortlevel(0)
+
+ result = sorted_idx.slice_locs(('foo', 'two'), ('qux', 'one'))
+ assert result == (1, 5)
+
+ result = sorted_idx.slice_locs(None, ('qux', 'one'))
+ assert result == (0, 5)
+
+ result = sorted_idx.slice_locs(('foo', 'two'), None)
+ assert result == (1, len(sorted_idx))
+
+ result = sorted_idx.slice_locs('bar', 'baz')
+ assert result == (2, 4)
+
+
+def test_slice_locs():
+ df = tm.makeTimeDataFrame()
+ stacked = df.stack()
+ idx = stacked.index
+
+ slob = slice(*idx.slice_locs(df.index[5], df.index[15]))
+ sliced = stacked[slob]
+ expected = df[5:16].stack()
+ tm.assert_almost_equal(sliced.values, expected.values)
+
+ slob = slice(*idx.slice_locs(df.index[5] + timedelta(seconds=30),
+ df.index[15] - timedelta(seconds=30)))
+ sliced = stacked[slob]
+ expected = df[6:15].stack()
+ tm.assert_almost_equal(sliced.values, expected.values)
+
+
+def test_slice_locs_with_type_mismatch():
+ df = tm.makeTimeDataFrame()
+ stacked = df.stack()
+ idx = stacked.index
+ with pytest.raises(TypeError, match='^Level type mismatch'):
+ idx.slice_locs((1, 3))
+ with pytest.raises(TypeError, match='^Level type mismatch'):
+ idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2))
+ df = tm.makeCustomDataframe(5, 5)
+ stacked = df.stack()
+ idx = stacked.index
+ with pytest.raises(TypeError, match='^Level type mismatch'):
+ idx.slice_locs(timedelta(seconds=30))
+ # TODO: Try creating a UnicodeDecodeError in exception message
+ with pytest.raises(TypeError, match='^Level type mismatch'):
+ idx.slice_locs(df.index[1], (16, "a"))
+
+
+def test_slice_locs_not_sorted():
+    index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)),
+                               Index(lrange(4))],
+                       codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]),
+                              np.array([0, 1, 0, 0, 0, 1, 0, 1]),
+                              np.array([1, 0, 1, 1, 0, 0, 1, 0])])
+
+ msg = "[Kk]ey length.*greater than MultiIndex lexsort depth"
+ with pytest.raises(KeyError, match=msg):
+ index.slice_locs((1, 0, 1), (2, 1, 0))
+
+ # works
+ sorted_index, _ = index.sortlevel(0)
+    # smoke test: after sorting, the same slice_locs call should no
+    # longer raise
+ sorted_index.slice_locs((1, 0, 1), (2, 1, 0))
+
+
+def test_slice_locs_not_contained():
+    # keys absent from the index are positioned via searchsorted on the
+    # sorted index
+
+ index = MultiIndex(levels=[[0, 2, 4, 6], [0, 2, 4]],
+ codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3],
+ [0, 1, 2, 1, 2, 2, 0, 1, 2]], sortorder=0)
+
+ result = index.slice_locs((1, 0), (5, 2))
+ assert result == (3, 6)
+
+ result = index.slice_locs(1, 5)
+ assert result == (3, 6)
+
+ result = index.slice_locs((2, 2), (5, 2))
+ assert result == (3, 6)
+
+ result = index.slice_locs(2, 5)
+ assert result == (3, 6)
+
+ result = index.slice_locs((1, 0), (6, 3))
+ assert result == (3, 8)
+
+ result = index.slice_locs(-1, 10)
+ assert result == (0, len(index))
+
+
+def test_putmask_with_wrong_mask(idx):
+ # GH18368
+
+ with pytest.raises(ValueError):
+ idx.putmask(np.ones(len(idx) + 1, np.bool), 1)
+
+ with pytest.raises(ValueError):
+ idx.putmask(np.ones(len(idx) - 1, np.bool), 1)
+
+ with pytest.raises(ValueError):
+ idx.putmask('foo', 1)
+
+
+def test_get_indexer():
+ major_axis = Index(lrange(4))
+ minor_axis = Index(lrange(2))
+
+ major_codes = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp)
+ minor_codes = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp)
+
+ index = MultiIndex(levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes])
+ idx1 = index[:5]
+ idx2 = index[[1, 3, 5]]
+
+ r1 = idx1.get_indexer(idx2)
+ assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp))
+
+ r1 = idx2.get_indexer(idx1, method='pad')
+ e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp)
+ assert_almost_equal(r1, e1)
+
+ r2 = idx2.get_indexer(idx1[::-1], method='pad')
+ assert_almost_equal(r2, e1[::-1])
+
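+    # 'ffill' is an alias for 'pad'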
+ rffill1 = idx2.get_indexer(idx1, method='ffill')
+ assert_almost_equal(r1, rffill1)
+
+ r1 = idx2.get_indexer(idx1, method='backfill')
+ e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp)
+ assert_almost_equal(r1, e1)
+
+ r2 = idx2.get_indexer(idx1[::-1], method='backfill')
+ assert_almost_equal(r2, e1[::-1])
+
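+    # 'bfill' is an alias for 'backfill'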
+ rbfill1 = idx2.get_indexer(idx1, method='bfill')
+ assert_almost_equal(r1, rbfill1)
+
+ # pass non-MultiIndex
+ r1 = idx1.get_indexer(idx2.values)
+ rexp1 = idx1.get_indexer(idx2)
+ assert_almost_equal(r1, rexp1)
+
+ r1 = idx1.get_indexer([1, 2, 3])
+ assert (r1 == [-1, -1, -1]).all()
+
+ # create index with duplicates
+ idx1 = Index(lrange(10) + lrange(10))
+ idx2 = Index(lrange(20))
+
+ msg = "Reindexing only valid with uniquely valued Index objects"
+ with pytest.raises(InvalidIndexError, match=msg):
+ idx1.get_indexer(idx2)
+
+
+def test_get_indexer_nearest():
+ midx = MultiIndex.from_tuples([('a', 1), ('b', 2)])
+ with pytest.raises(NotImplementedError):
+ midx.get_indexer(['a'], method='nearest')
+ with pytest.raises(NotImplementedError):
+ midx.get_indexer(['a'], method='pad', tolerance=2)
+
+
+def test_getitem(idx):
+ # scalar
+ assert idx[2] == ('bar', 'one')
+
+ # slice
+ result = idx[2:5]
+ expected = idx[[2, 3, 4]]
+ assert result.equals(expected)
+
+ # boolean
+ result = idx[[True, False, True, False, True, True]]
+ result2 = idx[np.array([True, False, True, False, True, True])]
+ expected = idx[[0, 2, 4, 5]]
+ assert result.equals(expected)
+ assert result2.equals(expected)
+
+
+def test_getitem_group_select(idx):
+ sorted_idx, _ = idx.sortlevel(0)
+ assert sorted_idx.get_loc('baz') == slice(3, 4)
+ assert sorted_idx.get_loc('foo') == slice(0, 2)
+
+
+def test_get_indexer_consistency(idx):
+ # See GH 16819
+    if isinstance(idx, IntervalIndex):
+        # nothing to check for IntervalIndex; the branch is moot for the
+        # MultiIndex fixture, so bail out instead of silently passing
+        return
+
+ if idx.is_unique or isinstance(idx, CategoricalIndex):
+ indexer = idx.get_indexer(idx[0:2])
+ assert isinstance(indexer, np.ndarray)
+ assert indexer.dtype == np.intp
+ else:
+ e = "Reindexing only valid with uniquely valued Index objects"
+ with pytest.raises(InvalidIndexError, match=e):
+ idx.get_indexer(idx[0:2])
+
+ indexer, _ = idx.get_indexer_non_unique(idx[0:2])
+ assert isinstance(indexer, np.ndarray)
+ assert indexer.dtype == np.intp
+
+
[email protected]('ind1', [[True] * 5, pd.Index([True] * 5)])
[email protected]('ind2', [[True, False, True, False, False],
+ pd.Index([True, False, True, False,
+ False])])
+def test_getitem_bool_index_all(ind1, ind2):
+ # GH#22533
+ idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3),
+ (40, 4), (50, 5)])
+ tm.assert_index_equal(idx[ind1], idx)
+
+ expected = MultiIndex.from_tuples([(10, 1), (30, 3)])
+ tm.assert_index_equal(idx[ind2], expected)
+
+
[email protected]('ind1', [[True], pd.Index([True])])
[email protected]('ind2', [[False], pd.Index([False])])
+def test_getitem_bool_index_single(ind1, ind2):
+ # GH#22533
+ idx = MultiIndex.from_tuples([(10, 1)])
+ tm.assert_index_equal(idx[ind1], idx)
+
+ expected = pd.MultiIndex(levels=[np.array([], dtype=np.int64),
+ np.array([], dtype=np.int64)],
+ codes=[[], []])
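+    # even the empty selection keeps the inferred int64 level dtype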
+ tm.assert_index_equal(idx[ind2], expected)
+
+
+def test_get_loc(idx):
+ assert idx.get_loc(('foo', 'two')) == 1
+ assert idx.get_loc(('baz', 'two')) == 3
+ pytest.raises(KeyError, idx.get_loc, ('bar', 'two'))
+ pytest.raises(KeyError, idx.get_loc, 'quux')
+
+ pytest.raises(NotImplementedError, idx.get_loc, 'foo',
+ method='nearest')
+
+ # 3 levels
+ index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index(
+ lrange(4))], codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array(
+ [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])])
+ pytest.raises(KeyError, index.get_loc, (1, 1))
+ assert index.get_loc((2, 0)) == slice(3, 5)
+
+
+def test_get_loc_duplicates():
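+ # with duplicates, get_loc returns a slice (or a scalar position) rather than raising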
+ index = Index([2, 2, 2, 2])
+ result = index.get_loc(2)
+ expected = slice(0, 4)
+ assert result == expected
+ # pytest.raises(Exception, index.get_loc, 2)
+
+ index = Index(['c', 'a', 'a', 'b', 'b'])
+ rs = index.get_loc('c')
+ xp = 0
+ assert rs == xp
+
+
+def test_get_loc_level():
+ index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index(
+ lrange(4))], codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array(
+ [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])])
+
+ loc, new_index = index.get_loc_level((0, 1))
+ expected = slice(1, 2)
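+ # a 2-tuple key consumes levels 0 and 1, so only level 2 remains in the result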
+ exp_index = index[expected].droplevel(0).droplevel(0)
+ assert loc == expected
+ assert new_index.equals(exp_index)
+
+ loc, new_index = index.get_loc_level((0, 1, 0))
+ expected = 1
+ assert loc == expected
+ assert new_index is None
+
+ pytest.raises(KeyError, index.get_loc_level, (2, 2))
+ # GH 22221: unused label
+ pytest.raises(KeyError, index.drop(2).get_loc_level, 2)
+ # Unused label on unsorted level:
+ pytest.raises(KeyError, index.drop(1, level=2).get_loc_level, 2, 2)
+
+ index = MultiIndex(levels=[[2000], lrange(4)], codes=[np.array(
+ [0, 0, 0, 0]), np.array([0, 1, 2, 3])])
+ result, new_index = index.get_loc_level((2000, slice(None, None)))
+ expected = slice(None, None)
+ assert result == expected
+ assert new_index.equals(index.droplevel(0))
+
+
[email protected]('dtype1', [int, float, bool, str])
[email protected]('dtype2', [int, float, bool, str])
+def test_get_loc_multiple_dtypes(dtype1, dtype2):
+ # GH 18520
+ levels = [np.array([0, 1]).astype(dtype1),
+ np.array([0, 1]).astype(dtype2)]
+ idx = pd.MultiIndex.from_product(levels)
+ assert idx.get_loc(idx[2]) == 2
+
+
[email protected]('level', [0, 1])
[email protected]('dtypes', [[int, float], [float, int]])
+def test_get_loc_implicit_cast(level, dtypes):
+ # GH 18818, GH 15994 : as flat index, cast int to float and vice-versa
+ levels = [['a', 'b'], ['c', 'd']]
+ key = ['b', 'd']
+ lev_dtype, key_dtype = dtypes
+ levels[level] = np.array([0, 1], dtype=lev_dtype)
+ key[level] = key_dtype(1)
+ idx = MultiIndex.from_product(levels)
+ assert idx.get_loc(tuple(key)) == 3
+
+
+def test_get_loc_cast_bool():
+ # GH 19086 : int is casted to bool, but not vice-versa
+ levels = [[False, True], np.arange(2, dtype='int64')]
+ idx = MultiIndex.from_product(levels)
+
+ assert idx.get_loc((0, 1)) == 1
+ assert idx.get_loc((1, 0)) == 2
+
+ pytest.raises(KeyError, idx.get_loc, (False, True))
+ pytest.raises(KeyError, idx.get_loc, (True, False))
+
+
[email protected]('level', [0, 1])
+def test_get_loc_nan(level, nulls_fixture):
+ # GH 18485 : NaN in MultiIndex
+ levels = [['a', 'b'], ['c', 'd']]
+ key = ['b', 'd']
+ levels[level] = np.array([0, nulls_fixture], dtype=type(nulls_fixture))
+ key[level] = nulls_fixture
+ idx = MultiIndex.from_product(levels)
+ assert idx.get_loc(tuple(key)) == 3
+
+
+def test_get_loc_missing_nan():
+ # GH 8569
+ idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]])
+ assert isinstance(idx.get_loc(1), slice)
+ pytest.raises(KeyError, idx.get_loc, 3)
+ pytest.raises(KeyError, idx.get_loc, np.nan)
+ pytest.raises(KeyError, idx.get_loc, [np.nan])
+
+
+def test_get_indexer_categorical_time():
+ # https://github.com/pandas-dev/pandas/issues/21390
+ midx = MultiIndex.from_product(
+ [Categorical(['a', 'b', 'c']),
+ Categorical(date_range("2012-01-01", periods=3, freq='H'))])
+ result = midx.get_indexer(midx)
+ tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp))
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_integrity.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_integrity.py
new file mode 100644
index 00000000000..c1638a9cde6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_integrity.py
@@ -0,0 +1,293 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange, range
+
+from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
+
+import pandas as pd
+from pandas import IntervalIndex, MultiIndex, RangeIndex
+import pandas.util.testing as tm
+
+
+def test_labels_dtypes():
+
+ # GH 8456
+ i = MultiIndex.from_tuples([('A', 1), ('A', 2)])
+ assert i.codes[0].dtype == 'int8'
+ assert i.codes[1].dtype == 'int8'
+
+ i = MultiIndex.from_product([['a'], range(40)])
+ assert i.codes[1].dtype == 'int8'
+ i = MultiIndex.from_product([['a'], range(400)])
+ assert i.codes[1].dtype == 'int16'
+ i = MultiIndex.from_product([['a'], range(40000)])
+ assert i.codes[1].dtype == 'int32'
+
+ i = pd.MultiIndex.from_product([['a'], range(1000)])
+ assert (i.codes[0] >= 0).all()
+ assert (i.codes[1] >= 0).all()
+
+
+def test_values_boxed():
+ tuples = [(1, pd.Timestamp('2000-01-01')), (2, pd.NaT),
+ (3, pd.Timestamp('2000-01-03')),
+ (1, pd.Timestamp('2000-01-04')),
+ (2, pd.Timestamp('2000-01-02')),
+ (3, pd.Timestamp('2000-01-03'))]
+ result = pd.MultiIndex.from_tuples(tuples)
+ expected = construct_1d_object_array_from_listlike(tuples)
+ tm.assert_numpy_array_equal(result.values, expected)
+ # Check that code branches for boxed values produce identical results
+ tm.assert_numpy_array_equal(result.values[:4], result[:4].values)
+
+
+def test_values_multiindex_datetimeindex():
+ # Test to ensure we hit the boxing / nobox part of MI.values
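+ # integers near 10**18 are valid nanosecond timestamps (around 2001)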
+ ints = np.arange(10 ** 18, 10 ** 18 + 5)
+ naive = pd.DatetimeIndex(ints)
+ # TODO(GH-24559): Remove the FutureWarning
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ aware = pd.DatetimeIndex(ints, tz='US/Central')
+
+ idx = pd.MultiIndex.from_arrays([naive, aware])
+ result = idx.values
+
+ outer = pd.DatetimeIndex([x[0] for x in result])
+ tm.assert_index_equal(outer, naive)
+
+ inner = pd.DatetimeIndex([x[1] for x in result])
+ tm.assert_index_equal(inner, aware)
+
+ # n_lev > n_lab
+ result = idx[:2].values
+
+ outer = pd.DatetimeIndex([x[0] for x in result])
+ tm.assert_index_equal(outer, naive[:2])
+
+ inner = pd.DatetimeIndex([x[1] for x in result])
+ tm.assert_index_equal(inner, aware[:2])
+
+
+def test_values_multiindex_periodindex():
+ # Test to ensure we hit the boxing / nobox part of MI.values
+ ints = np.arange(2007, 2012)
+ pidx = pd.PeriodIndex(ints, freq='D')
+
+ idx = pd.MultiIndex.from_arrays([ints, pidx])
+ result = idx.values
+
+ outer = pd.Int64Index([x[0] for x in result])
+ tm.assert_index_equal(outer, pd.Int64Index(ints))
+
+ inner = pd.PeriodIndex([x[1] for x in result])
+ tm.assert_index_equal(inner, pidx)
+
+ # n_lev > n_lab
+ result = idx[:2].values
+
+ outer = pd.Int64Index([x[0] for x in result])
+ tm.assert_index_equal(outer, pd.Int64Index(ints[:2]))
+
+ inner = pd.PeriodIndex([x[1] for x in result])
+ tm.assert_index_equal(inner, pidx[:2])
+
+
+def test_consistency():
+ # need to construct an overflow
+ major_axis = lrange(70000)
+ minor_axis = lrange(10)
+
+ major_codes = np.arange(70000)
+ minor_codes = np.repeat(lrange(10), 7000)
+
+ # the fact that it works means the codes are consistent
+ index = MultiIndex(levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes])
+
+ # inconsistent
+ major_codes = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3])
+ minor_codes = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1])
+ index = MultiIndex(levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes])
+
+ assert index.is_unique is False
+
+
+def test_hash_collisions():
+ # more than a smoke test: check that we don't get hash collisions
+
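+ # 10**6 distinct keys: any hash collision would break the exact round-trips below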
+ index = MultiIndex.from_product([np.arange(1000), np.arange(1000)],
+ names=['one', 'two'])
+ result = index.get_indexer(index.values)
+ tm.assert_numpy_array_equal(result, np.arange(
+ len(index), dtype='intp'))
+
+ for i in [0, 1, len(index) - 2, len(index) - 1]:
+ result = index.get_loc(index[i])
+ assert result == i
+
+
+def test_dims():
+ pass
+
+
+def test_take_invalid_kwargs():
+ vals = [['A', 'B'],
+ [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]]
+ idx = pd.MultiIndex.from_product(vals, names=['str', 'dt'])
+ indices = [1, 2]
+
+ msg = r"take\(\) got an unexpected keyword argument 'foo'"
+ with pytest.raises(TypeError, match=msg):
+ idx.take(indices, foo=2)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, out=indices)
+
+ msg = "the 'mode' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, mode='clip')
+
+
+def test_isna_behavior(idx):
+ # should not segfault GH5123
+ # NOTE: if MI representation changes, may make sense to allow
+ # isna(MI)
+ with pytest.raises(NotImplementedError):
+ pd.isna(idx)
+
+
+def test_large_multiindex_error():
+ # GH12527
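+ # lookups should raise KeyError on both sides of the 10**6 row boundary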
+ df_below_1000000 = pd.DataFrame(
+ 1, index=pd.MultiIndex.from_product([[1, 2], range(499999)]),
+ columns=['dest'])
+ with pytest.raises(KeyError):
+ df_below_1000000.loc[(-1, 0), 'dest']
+ with pytest.raises(KeyError):
+ df_below_1000000.loc[(3, 0), 'dest']
+ df_above_1000000 = pd.DataFrame(
+ 1, index=pd.MultiIndex.from_product([[1, 2], range(500001)]),
+ columns=['dest'])
+ with pytest.raises(KeyError):
+ df_above_1000000.loc[(-1, 0), 'dest']
+ with pytest.raises(KeyError):
+ df_above_1000000.loc[(3, 0), 'dest']
+
+
+def test_million_record_attribute_error():
+ # GH 18165
+ r = list(range(1000000))
+ df = pd.DataFrame({'a': r, 'b': r},
+ index=pd.MultiIndex.from_tuples([(x, x) for x in r]))
+
+ msg = "'Series' object has no attribute 'foo'"
+ with pytest.raises(AttributeError, match=msg):
+ df['a'].foo()
+
+
+def test_can_hold_identifiers(idx):
+ key = idx[0]
+ assert idx._can_hold_identifiers_and_holds_name(key) is True
+
+
+def test_metadata_immutable(idx):
+ levels, codes = idx.levels, idx.codes
+ # shouldn't be able to set at either the top level or base level
+ mutable_regex = re.compile('does not support mutable operations')
+ with pytest.raises(TypeError, match=mutable_regex):
+ levels[0] = levels[0]
+ with pytest.raises(TypeError, match=mutable_regex):
+ levels[0][0] = levels[0][0]
+ # ditto for labels
+ with pytest.raises(TypeError, match=mutable_regex):
+ codes[0] = codes[0]
+ with pytest.raises(TypeError, match=mutable_regex):
+ codes[0][0] = codes[0][0]
+ # and for names
+ names = idx.names
+ with pytest.raises(TypeError, match=mutable_regex):
+ names[0] = names[0]
+
+
+def test_level_setting_resets_attributes():
+ ind = pd.MultiIndex.from_arrays([
+ ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
+ ])
+ assert ind.is_monotonic
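+ # swapping 2 and 3 in the second level reorders the values, breaking monotonicity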
+ ind.set_levels([['A', 'B'], [1, 3, 2]], inplace=True)
+ # if this fails, probably didn't reset the cache correctly.
+ assert not ind.is_monotonic
+
+
+def test_rangeindex_fallback_coercion_bug():
+ # GH 12893
+ foo = pd.DataFrame(np.arange(100).reshape((10, 10)))
+ bar = pd.DataFrame(np.arange(100).reshape((10, 10)))
+ df = pd.concat({'foo': foo.stack(), 'bar': bar.stack()}, axis=1)
+ df.index.names = ['fizz', 'buzz']
+
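+ # smoke check: formatting the frame should not raise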
+ str(df)
+ expected = pd.DataFrame({'bar': np.arange(100),
+ 'foo': np.arange(100)},
+ index=pd.MultiIndex.from_product(
+ [range(10), range(10)],
+ names=['fizz', 'buzz']))
+ tm.assert_frame_equal(df, expected, check_like=True)
+
+ result = df.index.get_level_values('fizz')
+ expected = pd.Int64Index(np.arange(10), name='fizz').repeat(10)
+ tm.assert_index_equal(result, expected)
+
+ result = df.index.get_level_values('buzz')
+ expected = pd.Int64Index(np.tile(np.arange(10), 10), name='buzz')
+ tm.assert_index_equal(result, expected)
+
+
+def test_hash_error(indices):
+ index = indices
+ with pytest.raises(TypeError, match=("unhashable type: %r" %
+ type(index).__name__)):
+ hash(indices)
+
+
+def test_mutability(indices):
+ if not len(indices):
+ return
+ pytest.raises(TypeError, indices.__setitem__, 0, indices[0])
+
+
+def test_wrong_number_names(indices):
+ with pytest.raises(ValueError, match="^Length"):
+ indices.names = ["apple", "banana", "carrot"]
+
+
+def test_memory_usage(idx):
+ result = idx.memory_usage()
+ if len(idx):
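+ # a lookup builds the index engine, which should increase reported memory usage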
+ idx.get_loc(idx[0])
+ result2 = idx.memory_usage()
+ result3 = idx.memory_usage(deep=True)
+
+ # RangeIndex, IntervalIndex
+ # don't have engines
+ if not isinstance(idx, (RangeIndex, IntervalIndex)):
+ assert result2 > result
+
+ if idx.inferred_type == 'object':
+ assert result3 > result2
+
+ else:
+
+ # we report 0 for no-length
+ assert result == 0
+
+
+def test_nlevels(idx):
+ assert idx.nlevels == 2
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_join.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_join.py
new file mode 100644
index 00000000000..9e6c947e647
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_join.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Index, MultiIndex
+import pandas.util.testing as tm
+
+
[email protected]('other', [
+ Index(['three', 'one', 'two']),
+ Index(['one']),
+ Index(['one', 'three']),
+])
+def test_join_level(idx, other, join_type):
+ join_index, lidx, ridx = other.join(idx, how=join_type,
+ level='second',
+ return_indexers=True)
+
+ exp_level = other.join(idx.levels[1], how=join_type)
+ assert join_index.levels[0].equals(idx.levels[0])
+ assert join_index.levels[1].equals(exp_level)
+
+ # pare down levels
+ mask = np.array(
+ [x[1] in exp_level for x in idx], dtype=bool)
+ exp_values = idx.values[mask]
+ tm.assert_numpy_array_equal(join_index.values, exp_values)
+
+ if join_type in ('outer', 'inner'):
+ join_index2, ridx2, lidx2 = \
+ idx.join(other, how=join_type, level='second',
+ return_indexers=True)
+
+ assert join_index.equals(join_index2)
+ tm.assert_numpy_array_equal(lidx, lidx2)
+ tm.assert_numpy_array_equal(ridx, ridx2)
+ tm.assert_numpy_array_equal(join_index2.values, exp_values)
+
+
+def test_join_level_corner_case(idx):
+ # some corner cases
+ index = Index(['three', 'one', 'two'])
+ result = index.join(idx, level='second')
+ assert isinstance(result, MultiIndex)
+
+ with pytest.raises(TypeError, match="Join.*MultiIndex.*ambiguous"):
+ idx.join(idx, level=1)
+
+
+def test_join_self(idx, join_type):
+ joined = idx.join(idx, how=join_type)
+ assert idx is joined
+
+
+def test_join_multi():
+ # GH 10665
+ midx = pd.MultiIndex.from_product(
+ [np.arange(4), np.arange(4)], names=['a', 'b'])
+ idx = pd.Index([1, 2, 5], name='b')
+
+ # inner
+ jidx, lidx, ridx = midx.join(idx, how='inner', return_indexers=True)
+ exp_idx = pd.MultiIndex.from_product(
+ [np.arange(4), [1, 2]], names=['a', 'b'])
+ exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.intp)
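+ # i.e. the positions in midx whose 'b' level equals 1 or 2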
+ exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.intp)
+ tm.assert_index_equal(jidx, exp_idx)
+ tm.assert_numpy_array_equal(lidx, exp_lidx)
+ tm.assert_numpy_array_equal(ridx, exp_ridx)
+ # flip
+ jidx, ridx, lidx = idx.join(midx, how='inner', return_indexers=True)
+ tm.assert_index_equal(jidx, exp_idx)
+ tm.assert_numpy_array_equal(lidx, exp_lidx)
+ tm.assert_numpy_array_equal(ridx, exp_ridx)
+
+ # keep MultiIndex
+ jidx, lidx, ridx = midx.join(idx, how='left', return_indexers=True)
+ exp_ridx = np.array([-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0,
+ 1, -1], dtype=np.intp)
+ tm.assert_index_equal(jidx, midx)
+ assert lidx is None
+ tm.assert_numpy_array_equal(ridx, exp_ridx)
+ # flip
+ jidx, ridx, lidx = idx.join(midx, how='right', return_indexers=True)
+ tm.assert_index_equal(jidx, midx)
+ assert lidx is None
+ tm.assert_numpy_array_equal(ridx, exp_ridx)
+
+
+def test_join_self_unique(idx, join_type):
+ if idx.is_unique:
+ joined = idx.join(idx, how=join_type)
+ assert (idx == joined).all()
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_missing.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_missing.py
new file mode 100644
index 00000000000..cd4adfa96ef
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_missing.py
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import iNaT
+
+import pandas as pd
+from pandas import Int64Index, MultiIndex, PeriodIndex, UInt64Index
+from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
+import pandas.util.testing as tm
+
+
+def test_fillna(idx):
+ # GH 11343
+
+ # TODO: Remove or Refactor. Not Implemented for MultiIndex
+ for name, index in [('idx', idx), ]:
+ if len(index) == 0:
+ pass
+ elif isinstance(index, MultiIndex):
+ idx = index.copy()
+ msg = "isna is not defined for MultiIndex"
+ with pytest.raises(NotImplementedError, match=msg):
+ idx.fillna(idx[0])
+ else:
+ idx = index.copy()
+ result = idx.fillna(idx[0])
+ tm.assert_index_equal(result, idx)
+ assert result is not idx
+
+ msg = "'value' must be a scalar, passed: "
+ with pytest.raises(TypeError, match=msg):
+ idx.fillna([idx[0]])
+
+ idx = index.copy()
+ values = idx.values
+
+ if isinstance(index, DatetimeIndexOpsMixin):
+ values[1] = iNaT
+ elif isinstance(index, (Int64Index, UInt64Index)):
+ continue
+ else:
+ values[1] = np.nan
+
+ if isinstance(index, PeriodIndex):
+ idx = index.__class__(values, freq=index.freq)
+ else:
+ idx = index.__class__(values)
+
+ expected = np.array([False] * len(idx), dtype=bool)
+ expected[1] = True
+ tm.assert_numpy_array_equal(idx._isnan, expected)
+ assert idx.hasnans is True
+
+
+def test_dropna():
+ # GH 6194
+ idx = pd.MultiIndex.from_arrays([[1, np.nan, 3, np.nan, 5],
+ [1, 2, np.nan, np.nan, 5],
+ ['a', 'b', 'c', np.nan, 'e']])
+
+ exp = pd.MultiIndex.from_arrays([[1, 5],
+ [1, 5],
+ ['a', 'e']])
+ tm.assert_index_equal(idx.dropna(), exp)
+ tm.assert_index_equal(idx.dropna(how='any'), exp)
+
+ exp = pd.MultiIndex.from_arrays([[1, np.nan, 3, 5],
+ [1, 2, np.nan, 5],
+ ['a', 'b', 'c', 'e']])
+ tm.assert_index_equal(idx.dropna(how='all'), exp)
+
+ msg = "invalid how option: xxx"
+ with pytest.raises(ValueError, match=msg):
+ idx.dropna(how='xxx')
+
+
+def test_nulls(idx):
+ # this is really a smoke test for the methods
+ # as their behavior is adequately tested elsewhere
+
+ msg = "isna is not defined for MultiIndex"
+ with pytest.raises(NotImplementedError, match=msg):
+ idx.isna()
+
+
+def test_hasnans_isnans(idx):
+ # GH 11343, added tests for hasnans / isnans
+ index = idx.copy()
+
+ # case: the index does not contain any NaN
+ expected = np.array([False] * len(index), dtype=bool)
+ tm.assert_numpy_array_equal(index._isnan, expected)
+ assert index.hasnans is False
+
+ index = idx.copy()
+ values = index.values
+ values[1] = np.nan
+
+ index = idx.__class__(values)
+
+ expected = np.array([False] * len(index), dtype=bool)
+ expected[1] = True
+ tm.assert_numpy_array_equal(index._isnan, expected)
+ assert index.hasnans is True
+
+
+def test_nan_stays_float():
+
+ # GH 7031
+ idx0 = pd.MultiIndex(levels=[["A", "B"], []],
+ codes=[[1, 0], [-1, -1]],
+ names=[0, 1])
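+ # code -1 in the second level marks missing (NaN) entries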
+ idx1 = pd.MultiIndex(levels=[["C"], ["D"]],
+ codes=[[0], [0]],
+ names=[0, 1])
+ idxm = idx0.join(idx1, how='outer')
+ assert pd.isna(idx0.get_level_values(1)).all()
+ # the following failed in 0.14.1
+ assert pd.isna(idxm.get_level_values(1)[:-1]).all()
+
+ df0 = pd.DataFrame([[1, 2]], index=idx0)
+ df1 = pd.DataFrame([[3, 4]], index=idx1)
+ dfm = df0 - df1
+ assert pd.isna(df0.index.get_level_values(1)).all()
+ # the following failed in 0.14.1
+ assert pd.isna(dfm.index.get_level_values(1)[:-1]).all()
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_monotonic.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_monotonic.py
new file mode 100644
index 00000000000..72e9bcc1e2e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_monotonic.py
@@ -0,0 +1,213 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Index, IntervalIndex, MultiIndex
+from pandas.api.types import is_scalar
+
+
+def test_is_monotonic_increasing():
+ i = MultiIndex.from_product([np.arange(10),
+ np.arange(10)], names=['one', 'two'])
+ assert i.is_monotonic is True
+ assert i._is_strictly_monotonic_increasing is True
+ assert Index(i.values).is_monotonic is True
+ assert i._is_strictly_monotonic_increasing is True
+
+ i = MultiIndex.from_product([np.arange(10, 0, -1),
+ np.arange(10)], names=['one', 'two'])
+ assert i.is_monotonic is False
+ assert i._is_strictly_monotonic_increasing is False
+ assert Index(i.values).is_monotonic is False
+ assert Index(i.values)._is_strictly_monotonic_increasing is False
+
+ i = MultiIndex.from_product([np.arange(10),
+ np.arange(10, 0, -1)],
+ names=['one', 'two'])
+ assert i.is_monotonic is False
+ assert i._is_strictly_monotonic_increasing is False
+ assert Index(i.values).is_monotonic is False
+ assert Index(i.values)._is_strictly_monotonic_increasing is False
+
+ i = MultiIndex.from_product([[1.0, np.nan, 2.0], ['a', 'b', 'c']])
+ assert i.is_monotonic is False
+ assert i._is_strictly_monotonic_increasing is False
+ assert Index(i.values).is_monotonic is False
+ assert Index(i.values)._is_strictly_monotonic_increasing is False
+
+ # string ordering
+ i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
+ ['one', 'two', 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ assert i.is_monotonic is False
+ assert Index(i.values).is_monotonic is False
+ assert i._is_strictly_monotonic_increasing is False
+ assert Index(i.values)._is_strictly_monotonic_increasing is False
+
+ i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'],
+ ['mom', 'next', 'zenith']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ assert i.is_monotonic is True
+ assert Index(i.values).is_monotonic is True
+ assert i._is_strictly_monotonic_increasing is True
+ assert Index(i.values)._is_strictly_monotonic_increasing is True
+
+ # mixed levels, hits the TypeError
+ i = MultiIndex(
+ levels=[[1, 2, 3, 4], ['gb00b03mlx29', 'lu0197800237',
+ 'nl0000289783',
+ 'nl0000289965', 'nl0000301109']],
+ codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]],
+ names=['household_id', 'asset_id'])
+
+ assert i.is_monotonic is False
+ assert i._is_strictly_monotonic_increasing is False
+
+ # empty
+ i = MultiIndex.from_arrays([[], []])
+ assert i.is_monotonic is True
+ assert Index(i.values).is_monotonic is True
+ assert i._is_strictly_monotonic_increasing is True
+ assert Index(i.values)._is_strictly_monotonic_increasing is True
+
+
+def test_is_monotonic_decreasing():
+ i = MultiIndex.from_product([np.arange(9, -1, -1),
+ np.arange(9, -1, -1)],
+ names=['one', 'two'])
+ assert i.is_monotonic_decreasing is True
+ assert i._is_strictly_monotonic_decreasing is True
+ assert Index(i.values).is_monotonic_decreasing is True
+ assert i._is_strictly_monotonic_decreasing is True
+
+ i = MultiIndex.from_product([np.arange(10),
+ np.arange(10, 0, -1)],
+ names=['one', 'two'])
+ assert i.is_monotonic_decreasing is False
+ assert i._is_strictly_monotonic_decreasing is False
+ assert Index(i.values).is_monotonic_decreasing is False
+ assert Index(i.values)._is_strictly_monotonic_decreasing is False
+
+ i = MultiIndex.from_product([np.arange(10, 0, -1),
+ np.arange(10)], names=['one', 'two'])
+ assert i.is_monotonic_decreasing is False
+ assert i._is_strictly_monotonic_decreasing is False
+ assert Index(i.values).is_monotonic_decreasing is False
+ assert Index(i.values)._is_strictly_monotonic_decreasing is False
+
+ i = MultiIndex.from_product([[2.0, np.nan, 1.0], ['c', 'b', 'a']])
+ assert i.is_monotonic_decreasing is False
+ assert i._is_strictly_monotonic_decreasing is False
+ assert Index(i.values).is_monotonic_decreasing is False
+ assert Index(i.values)._is_strictly_monotonic_decreasing is False
+
+ # string ordering
+ i = MultiIndex(levels=[['qux', 'foo', 'baz', 'bar'],
+ ['three', 'two', 'one']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ assert i.is_monotonic_decreasing is False
+ assert Index(i.values).is_monotonic_decreasing is False
+ assert i._is_strictly_monotonic_decreasing is False
+ assert Index(i.values)._is_strictly_monotonic_decreasing is False
+
+ i = MultiIndex(levels=[['qux', 'foo', 'baz', 'bar'],
+ ['zenith', 'next', 'mom']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ assert i.is_monotonic_decreasing is True
+ assert Index(i.values).is_monotonic_decreasing is True
+ assert i._is_strictly_monotonic_decreasing is True
+ assert Index(i.values)._is_strictly_monotonic_decreasing is True
+
+ # mixed levels, hits the TypeError
+ i = MultiIndex(
+ levels=[[4, 3, 2, 1], ['nl0000301109', 'nl0000289965',
+ 'nl0000289783', 'lu0197800237',
+ 'gb00b03mlx29']],
+ codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]],
+ names=['household_id', 'asset_id'])
+
+ assert i.is_monotonic_decreasing is False
+ assert i._is_strictly_monotonic_decreasing is False
+
+ # empty
+ i = MultiIndex.from_arrays([[], []])
+ assert i.is_monotonic_decreasing is True
+ assert Index(i.values).is_monotonic_decreasing is True
+ assert i._is_strictly_monotonic_decreasing is True
+ assert Index(i.values)._is_strictly_monotonic_decreasing is True
+
+
+def test_is_strictly_monotonic_increasing():
+ idx = pd.MultiIndex(levels=[['bar', 'baz'], ['mom', 'next']],
+ codes=[[0, 0, 1, 1], [0, 0, 0, 1]])
+ assert idx.is_monotonic_increasing is True
+ assert idx._is_strictly_monotonic_increasing is False
+
+
+def test_is_strictly_monotonic_decreasing():
+ idx = pd.MultiIndex(levels=[['baz', 'bar'], ['next', 'mom']],
+ codes=[[0, 0, 1, 1], [0, 0, 0, 1]])
+ assert idx.is_monotonic_decreasing is True
+ assert idx._is_strictly_monotonic_decreasing is False
+
+
+def test_searchsorted_monotonic(indices):
+ # GH17271
+ # not implemented for tuple searches in MultiIndex
+ # or Intervals searches in IntervalIndex
+ if isinstance(indices, (MultiIndex, IntervalIndex)):
+ return
+
+ # nothing to test if the index is empty
+ if indices.empty:
+ return
+ value = indices[0]
+
+ # determine the expected results (handle dupes for 'right')
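+ # (indices == value).argmin() gives the first position not equal to value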
+ expected_left, expected_right = 0, (indices == value).argmin()
+ if expected_right == 0:
+ # all values are the same, expected_right should be length
+ expected_right = len(indices)
+
+ # test _searchsorted_monotonic in all cases
+ # test searchsorted only for increasing
+ if indices.is_monotonic_increasing:
+ ssm_left = indices._searchsorted_monotonic(value, side='left')
+ assert is_scalar(ssm_left)
+ assert expected_left == ssm_left
+
+ ssm_right = indices._searchsorted_monotonic(value, side='right')
+ assert is_scalar(ssm_right)
+ assert expected_right == ssm_right
+
+ ss_left = indices.searchsorted(value, side='left')
+ assert is_scalar(ss_left)
+ assert expected_left == ss_left
+
+ ss_right = indices.searchsorted(value, side='right')
+ assert is_scalar(ss_right)
+ assert expected_right == ss_right
+
+ elif indices.is_monotonic_decreasing:
+ ssm_left = indices._searchsorted_monotonic(value, side='left')
+ assert is_scalar(ssm_left)
+ assert expected_left == ssm_left
+
+ ssm_right = indices._searchsorted_monotonic(value, side='right')
+ assert is_scalar(ssm_right)
+ assert expected_right == ssm_right
+
+ else:
+ # non-monotonic should raise.
+ with pytest.raises(ValueError):
+ indices._searchsorted_monotonic(value, side='left')
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_names.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_names.py
new file mode 100644
index 00000000000..1f67b3bb5d9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_names.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import pandas as pd
+from pandas import MultiIndex
+import pandas.util.testing as tm
+
+
+def check_level_names(index, names):
+ assert [level.name for level in index.levels] == list(names)
+
+
+def test_slice_keep_name():
+ x = MultiIndex.from_tuples([('a', 'b'), (1, 2), ('c', 'd')],
+ names=['x', 'y'])
+ assert x[1:].names == x.names
+
+
+def test_index_name_retained():
+ # GH9857
+ result = pd.DataFrame({'x': [1, 2, 6],
+ 'y': [2, 2, 8],
+ 'z': [-5, 0, 5]})
+ result = result.set_index('z')
+ result.loc[10] = [9, 10]
+ df_expected = pd.DataFrame({'x': [1, 2, 6, 9],
+ 'y': [2, 2, 8, 10],
+ 'z': [-5, 0, 5, 10]})
+ df_expected = df_expected.set_index('z')
+ tm.assert_frame_equal(result, df_expected)
+
+
+def test_changing_names(idx):
+
+ # names should be applied to levels
+ level_names = [level.name for level in idx.levels]
+ check_level_names(idx, idx.names)
+
+ view = idx.view()
+ copy = idx.copy()
+ shallow_copy = idx._shallow_copy()
+
+ # changing names should change level names on object
+ new_names = [name + "a" for name in idx.names]
+ idx.names = new_names
+ check_level_names(idx, new_names)
+
+ # but not on copies
+ check_level_names(view, level_names)
+ check_level_names(copy, level_names)
+ check_level_names(shallow_copy, level_names)
+
+ # and copies shouldn't change original
+ shallow_copy.names = [name + "c" for name in shallow_copy.names]
+ check_level_names(idx, new_names)
+
+
+def test_take_preserve_name(idx):
+ taken = idx.take([3, 0, 1])
+ assert taken.names == idx.names
+
+
+def test_copy_names():
+ # Check that adding a "names" parameter to the copy is honored
+ # GH14302
+ multi_idx = pd.Index([(1, 2), (3, 4)], names=['MyName1', 'MyName2'])
+ multi_idx1 = multi_idx.copy()
+
+ assert multi_idx.equals(multi_idx1)
+ assert multi_idx.names == ['MyName1', 'MyName2']
+ assert multi_idx1.names == ['MyName1', 'MyName2']
+
+ multi_idx2 = multi_idx.copy(names=['NewName1', 'NewName2'])
+
+ assert multi_idx.equals(multi_idx2)
+ assert multi_idx.names == ['MyName1', 'MyName2']
+ assert multi_idx2.names == ['NewName1', 'NewName2']
+
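+ # the singular 'name' argument should be honored just like 'names'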
+ multi_idx3 = multi_idx.copy(name=['NewName1', 'NewName2'])
+
+ assert multi_idx.equals(multi_idx3)
+ assert multi_idx.names == ['MyName1', 'MyName2']
+ assert multi_idx3.names == ['NewName1', 'NewName2']
+
+
+def test_names(idx, index_names):
+
+ # names are assigned in setup
+ names = index_names
+ level_names = [level.name for level in idx.levels]
+ assert names == level_names
+
+ # setting bad names on existing
+ index = idx
+ with pytest.raises(ValueError, match="^Length of names"):
+ setattr(index, "names", list(index.names) + ["third"])
+ with pytest.raises(ValueError, match="^Length of names"):
+ setattr(index, "names", [])
+
+ # initializing with bad names (should always be equivalent)
+ major_axis, minor_axis = idx.levels
+ major_codes, minor_codes = idx.codes
+ with pytest.raises(ValueError, match="^Length of names"):
+ MultiIndex(levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes],
+ names=['first'])
+ with pytest.raises(ValueError, match="^Length of names"):
+ MultiIndex(levels=[major_axis, minor_axis],
+ codes=[major_codes, minor_codes],
+ names=['first', 'second', 'third'])
+
+ # names are assigned
+ index.names = ["a", "b"]
+ ind_names = list(index.names)
+ level_names = [level.name for level in index.levels]
+ assert ind_names == level_names
+
+
+def test_duplicate_level_names_access_raises(idx):
+ # GH19029
+ idx.names = ['foo', 'foo']
+ with pytest.raises(ValueError, match='name foo occurs multiple times'):
+ idx._get_level_number('foo')
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_partial_indexing.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_partial_indexing.py
new file mode 100644
index 00000000000..b75396a3136
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_partial_indexing.py
@@ -0,0 +1,98 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, MultiIndex, date_range
+import pandas.util.testing as tm
+
+
+def test_partial_string_timestamp_multiindex():
+ # GH10331
+ dr = pd.date_range('2016-01-01', '2016-01-03', freq='12H')
+ abc = ['a', 'b', 'c']
+ ix = pd.MultiIndex.from_product([dr, abc])
+ df = pd.DataFrame({'c1': range(0, 15)}, index=ix)
+ idx = pd.IndexSlice
+
+ # c1
+ # 2016-01-01 00:00:00 a 0
+ # b 1
+ # c 2
+ # 2016-01-01 12:00:00 a 3
+ # b 4
+ # c 5
+ # 2016-01-02 00:00:00 a 6
+ # b 7
+ # c 8
+ # 2016-01-02 12:00:00 a 9
+ # b 10
+ # c 11
+ # 2016-01-03 00:00:00 a 12
+ # b 13
+ # c 14
+
+ # partial string matching on a single index
+ for df_swap in (df.swaplevel(),
+ df.swaplevel(0),
+ df.swaplevel(0, 1)):
+ df_swap = df_swap.sort_index()
+ just_a = df_swap.loc['a']
+ result = just_a.loc['2016-01-01']
+ expected = df.loc[idx[:, 'a'], :].iloc[0:2]
+ expected.index = expected.index.droplevel(1)
+ tm.assert_frame_equal(result, expected)
+
+ # indexing with IndexSlice
+ result = df.loc[idx['2016-01-01':'2016-02-01', :], :]
+ expected = df
+ tm.assert_frame_equal(result, expected)
+
+ # match on secondary index
+ result = df_swap.loc[idx[:, '2016-01-01':'2016-01-01'], :]
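+ # both the 00:00 and 12:00 rows of 2016-01-01 match the date-only slice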
+ expected = df_swap.iloc[[0, 1, 5, 6, 10, 11]]
+ tm.assert_frame_equal(result, expected)
+
+ # Even though this syntax works on a single index, this is somewhat
+ # ambiguous and we don't want to extend this behavior forward to work
+ # in multi-indexes. This would amount to selecting a scalar from a
+ # column.
+ with pytest.raises(KeyError):
+ df['2016-01-01']
+
+ # partial string match on year only
+ result = df.loc['2016']
+ expected = df
+ tm.assert_frame_equal(result, expected)
+
+ # partial string match on date
+ result = df.loc['2016-01-01']
+ expected = df.iloc[0:6]
+ tm.assert_frame_equal(result, expected)
+
+ # partial string match on date and hour, from middle
+ result = df.loc['2016-01-02 12']
+ expected = df.iloc[9:12]
+ tm.assert_frame_equal(result, expected)
+
+ # partial string match on secondary index
+ result = df_swap.loc[idx[:, '2016-01-02'], :]
+ expected = df_swap.iloc[[2, 3, 7, 8, 12, 13]]
+ tm.assert_frame_equal(result, expected)
+
+ # tuple selector with partial string match on date
+ result = df.loc[('2016-01-01', 'a'), :]
+ expected = df.iloc[[0, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ # Slicing date on first level should break (of course)
+ with pytest.raises(KeyError):
+ df_swap.loc['2016-01-01']
+
+ # GH12685 (partial string with daily resolution or below)
+ dr = date_range('2013-01-01', periods=100, freq='D')
+ ix = MultiIndex.from_product([dr, ['a', 'b']])
+ df = DataFrame(np.random.randn(200, 1), columns=['A'], index=ix)
+
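+ # 2013-03-01 is day 59 of the range; 31 days x 2 rows per day gives iloc[118:180]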
+ result = df.loc[idx['2013-03':'2013-03', :], :]
+ expected = df.iloc[118:180]
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_reindex.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_reindex.py
new file mode 100644
index 00000000000..341ef82c538
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_reindex.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Index, MultiIndex
+import pandas.util.testing as tm
+
+
+def check_level_names(index, names):
+ assert [level.name for level in index.levels] == list(names)
+
+
+def test_reindex(idx):
+ result, indexer = idx.reindex(list(idx[:4]))
+ assert isinstance(result, MultiIndex)
+ check_level_names(result, idx[:4].names)
+
+ result, indexer = idx.reindex(list(idx))
+ assert isinstance(result, MultiIndex)
+ assert indexer is None
+ check_level_names(result, idx.names)
+
+
+def test_reindex_level(idx):
+ index = Index(['one'])
+
+ target, indexer = idx.reindex(index, level='second')
+ target2, indexer2 = index.reindex(idx, level='second')
+
+ exp_index = idx.join(index, level='second', how='right')
+ exp_index2 = idx.join(index, level='second', how='left')
+
+ assert target.equals(exp_index)
+ exp_indexer = np.array([0, 2, 4])
+ tm.assert_numpy_array_equal(indexer, exp_indexer, check_dtype=False)
+
+ assert target2.equals(exp_index2)
+ exp_indexer2 = np.array([0, -1, 0, -1, 0, -1])
+ tm.assert_numpy_array_equal(indexer2, exp_indexer2, check_dtype=False)
+
+ with pytest.raises(TypeError, match="Fill method not supported"):
+ idx.reindex(idx, method='pad', level='second')
+
+ with pytest.raises(TypeError, match="Fill method not supported"):
+ index.reindex(index, method='bfill', level='first')
+
+
+def test_reindex_preserves_names_when_target_is_list_or_ndarray(idx):
+ # GH6552
+ idx = idx.copy()
+ target = idx.copy()
+ idx.names = target.names = [None, None]
+
+ other_dtype = pd.MultiIndex.from_product([[1, 2], [3, 4]])
+
+ # list & ndarray cases
+ assert idx.reindex([])[0].names == [None, None]
+ assert idx.reindex(np.array([]))[0].names == [None, None]
+ assert idx.reindex(target.tolist())[0].names == [None, None]
+ assert idx.reindex(target.values)[0].names == [None, None]
+ assert idx.reindex(other_dtype.tolist())[0].names == [None, None]
+ assert idx.reindex(other_dtype.values)[0].names == [None, None]
+
+ idx.names = ['foo', 'bar']
+ assert idx.reindex([])[0].names == ['foo', 'bar']
+ assert idx.reindex(np.array([]))[0].names == ['foo', 'bar']
+ assert idx.reindex(target.tolist())[0].names == ['foo', 'bar']
+ assert idx.reindex(target.values)[0].names == ['foo', 'bar']
+ assert idx.reindex(other_dtype.tolist())[0].names == ['foo', 'bar']
+ assert idx.reindex(other_dtype.values)[0].names == ['foo', 'bar']
+
+
+def test_reindex_lvl_preserves_names_when_target_is_list_or_array():
+ # GH7774
+ idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b']],
+ names=['foo', 'bar'])
+ assert idx.reindex([], level=0)[0].names == ['foo', 'bar']
+ assert idx.reindex([], level=1)[0].names == ['foo', 'bar']
+
+
+def test_reindex_lvl_preserves_type_if_target_is_empty_list_or_array():
+ # GH7774
+ idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b']])
+ assert idx.reindex([], level=0)[0].levels[0].dtype.type == np.int64
+ assert idx.reindex([], level=1)[0].levels[1].dtype.type == np.object_
+
+
+def test_reindex_base(idx):
+ idx = idx
+ expected = np.arange(idx.size, dtype=np.intp)
+
+ actual = idx.get_indexer(idx)
+ tm.assert_numpy_array_equal(expected, actual)
+
+ with pytest.raises(ValueError, match='Invalid fill method'):
+ idx.get_indexer(idx, method='invalid')
+
+
+def test_reindex_non_unique():
+ idx = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (1, 1), (2, 2)])
+ a = pd.Series(np.arange(4), index=idx)
+ new_idx = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)])
+
+ msg = 'cannot handle a non-unique multi-index!'
+ with pytest.raises(ValueError, match=msg):
+ a.reindex(new_idx)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_reshape.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_reshape.py
new file mode 100644
index 00000000000..92564a20c30
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_reshape.py
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Index, MultiIndex
+import pandas.util.testing as tm
+
+
+def test_insert(idx):
+ # key contained in all levels
+ new_index = idx.insert(0, ('bar', 'two'))
+ assert new_index.equal_levels(idx)
+ assert new_index[0] == ('bar', 'two')
+
+ # key not contained in all levels
+ new_index = idx.insert(0, ('abc', 'three'))
+
+ exp0 = Index(list(idx.levels[0]) + ['abc'], name='first')
+ tm.assert_index_equal(new_index.levels[0], exp0)
+
+ exp1 = Index(list(idx.levels[1]) + ['three'], name='second')
+ tm.assert_index_equal(new_index.levels[1], exp1)
+ assert new_index[0] == ('abc', 'three')
+
+ # key wrong length
+ msg = "Item must have length equal to number of levels"
+ with pytest.raises(ValueError, match=msg):
+ idx.insert(0, ('foo2',))
+
+ left = pd.DataFrame([['a', 'b', 0], ['b', 'd', 1]],
+ columns=['1st', '2nd', '3rd'])
+ left.set_index(['1st', '2nd'], inplace=True)
+ ts = left['3rd'].copy(deep=True)
+
+ left.loc[('b', 'x'), '3rd'] = 2
+ left.loc[('b', 'a'), '3rd'] = -1
+ left.loc[('b', 'b'), '3rd'] = 3
+ left.loc[('a', 'x'), '3rd'] = 4
+ left.loc[('a', 'w'), '3rd'] = 5
+ left.loc[('a', 'a'), '3rd'] = 6
+
+ ts.loc[('b', 'x')] = 2
+ ts.loc['b', 'a'] = -1
+ ts.loc[('b', 'b')] = 3
+ ts.loc['a', 'x'] = 4
+ ts.loc[('a', 'w')] = 5
+ ts.loc['a', 'a'] = 6
+
+ right = pd.DataFrame([['a', 'b', 0], ['b', 'd', 1], ['b', 'x', 2],
+ ['b', 'a', -1], ['b', 'b', 3], ['a', 'x', 4],
+ ['a', 'w', 5], ['a', 'a', 6]],
+ columns=['1st', '2nd', '3rd'])
+ right.set_index(['1st', '2nd'], inplace=True)
+ # FIXME: data types change to float because
+ # of intermediate nan insertion;
+ tm.assert_frame_equal(left, right, check_dtype=False)
+ tm.assert_series_equal(ts, right['3rd'])
+
+ # GH9250
+ idx = [('test1', i) for i in range(5)] + \
+ [('test2', i) for i in range(6)] + \
+ [('test', 17), ('test', 18)]
+
+ left = pd.Series(np.linspace(0, 10, 11),
+ pd.MultiIndex.from_tuples(idx[:-2]))
+
+ left.loc[('test', 17)] = 11
+ left.loc[('test', 18)] = 12
+
+ right = pd.Series(np.linspace(0, 12, 13),
+ pd.MultiIndex.from_tuples(idx))
+
+ tm.assert_series_equal(left, right)
+
+
+def test_append(idx):
+ result = idx[:3].append(idx[3:])
+ assert result.equals(idx)
+
+ foos = [idx[:1], idx[1:3], idx[3:]]
+ result = foos[0].append(foos[1:])
+ assert result.equals(idx)
+
+ # empty
+ result = idx.append([])
+ assert result.equals(idx)
+
+
+def test_repeat():
+ reps = 2
+ numbers = [1, 2, 3]
+ names = np.array(['foo', 'bar'])
+
+ m = MultiIndex.from_product([
+ numbers, names], names=names)
+ expected = MultiIndex.from_product([
+ numbers, names.repeat(reps)], names=names)
+ tm.assert_index_equal(m.repeat(reps), expected)
+
+
+def test_insert_base(idx):
+
+ result = idx[1:4]
+
+ # test 0th element
+ assert idx[0:4].equals(result.insert(0, idx[0]))
+
+
+def test_delete_base(idx):
+
+ expected = idx[1:]
+ result = idx.delete(0)
+ assert result.equals(expected)
+ assert result.name == expected.name
+
+ expected = idx[:-1]
+ result = idx.delete(-1)
+ assert result.equals(expected)
+ assert result.name == expected.name
+
+ with pytest.raises((IndexError, ValueError)):
+ # Exception raised depends on NumPy version.
+ idx.delete(len(idx))
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_set_ops.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_set_ops.py
new file mode 100644
index 00000000000..41a0e1e59e8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_set_ops.py
@@ -0,0 +1,372 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import MultiIndex, Series
+import pandas.util.testing as tm
+
+
[email protected]("case", [0.5, "xxx"])
[email protected]("sort", [None, False])
[email protected]("method", ["intersection", "union",
+ "difference", "symmetric_difference"])
+def test_set_ops_error_cases(idx, case, sort, method):
+ # non-iterable input
+ msg = "Input must be Index or array-like"
+ with pytest.raises(TypeError, match=msg):
+ getattr(idx, method)(case, sort=sort)
+
+
[email protected]("sort", [None, False])
+def test_intersection_base(idx, sort):
+ first = idx[:5]
+ second = idx[:3]
+ intersect = first.intersection(second, sort=sort)
+
+ if sort is None:
+ tm.assert_index_equal(intersect, second.sort_values())
+ assert tm.equalContents(intersect, second)
+
+ # GH 10149
+ cases = [klass(second.values)
+ for klass in [np.array, Series, list]]
+ for case in cases:
+ result = first.intersection(case, sort=sort)
+ if sort is None:
+ tm.assert_index_equal(result, second.sort_values())
+ assert tm.equalContents(result, second)
+
+ msg = "other must be a MultiIndex or a list of tuples"
+ with pytest.raises(TypeError, match=msg):
+ first.intersection([1, 2, 3], sort=sort)
+
+
[email protected]("sort", [None, False])
+def test_union_base(idx, sort):
+ first = idx[3:]
+ second = idx[:5]
+ everything = idx
+ union = first.union(second, sort=sort)
+ if sort is None:
+ tm.assert_index_equal(union, everything.sort_values())
+ assert tm.equalContents(union, everything)
+
+ # GH 10149
+ cases = [klass(second.values)
+ for klass in [np.array, Series, list]]
+ for case in cases:
+ result = first.union(case, sort=sort)
+ if sort is None:
+ tm.assert_index_equal(result, everything.sort_values())
+ assert tm.equalContents(result, everything)
+
+ msg = "other must be a MultiIndex or a list of tuples"
+ with pytest.raises(TypeError, match=msg):
+ first.union([1, 2, 3], sort=sort)
+
+
[email protected]("sort", [None, False])
+def test_difference_base(idx, sort):
+ second = idx[4:]
+ answer = idx[:4]
+ result = idx.difference(second, sort=sort)
+
+ if sort is None:
+ answer = answer.sort_values()
+
+ assert result.equals(answer)
+ tm.assert_index_equal(result, answer)
+
+ # GH 10149
+ cases = [klass(second.values)
+ for klass in [np.array, Series, list]]
+ for case in cases:
+ result = idx.difference(case, sort=sort)
+ tm.assert_index_equal(result, answer)
+
+ msg = "other must be a MultiIndex or a list of tuples"
+ with pytest.raises(TypeError, match=msg):
+ idx.difference([1, 2, 3], sort=sort)
+
+
[email protected]("sort", [None, False])
+def test_symmetric_difference(idx, sort):
+ first = idx[1:]
+ second = idx[:-1]
+ answer = idx[[-1, 0]]
+ result = first.symmetric_difference(second, sort=sort)
+
+ if sort is None:
+ answer = answer.sort_values()
+
+ tm.assert_index_equal(result, answer)
+
+ # GH 10149
+ cases = [klass(second.values)
+ for klass in [np.array, Series, list]]
+ for case in cases:
+ result = first.symmetric_difference(case, sort=sort)
+ tm.assert_index_equal(result, answer)
+
+ msg = "other must be a MultiIndex or a list of tuples"
+ with pytest.raises(TypeError, match=msg):
+ first.symmetric_difference([1, 2, 3], sort=sort)
+
+
+def test_empty(idx):
+ # GH 15270
+ assert not idx.empty
+ assert idx[:0].empty
+
+
[email protected]("sort", [None, False])
+def test_difference(idx, sort):
+
+ first = idx
+ result = first.difference(idx[-3:], sort=sort)
+ vals = idx[:-3].values
+
+ if sort is None:
+ vals = sorted(vals)
+
+ expected = MultiIndex.from_tuples(vals,
+ sortorder=0,
+ names=idx.names)
+
+ assert isinstance(result, MultiIndex)
+ assert result.equals(expected)
+ assert result.names == idx.names
+ tm.assert_index_equal(result, expected)
+
+ # empty difference: reflexive
+ result = idx.difference(idx, sort=sort)
+ expected = idx[:0]
+ assert result.equals(expected)
+ assert result.names == idx.names
+
+ # empty difference: superset
+ result = idx[-3:].difference(idx, sort=sort)
+ expected = idx[:0]
+ assert result.equals(expected)
+ assert result.names == idx.names
+
+ # empty difference: degenerate
+ result = idx[:0].difference(idx, sort=sort)
+ expected = idx[:0]
+ assert result.equals(expected)
+ assert result.names == idx.names
+
+ # names not the same
+ chunklet = idx[-3:]
+ chunklet.names = ['foo', 'baz']
+ result = first.difference(chunklet, sort=sort)
+ assert result.names == (None, None)
+
+ # empty, but non-equal
+ result = idx.difference(idx.sortlevel(1)[0], sort=sort)
+ assert len(result) == 0
+
+ # raise Exception called with non-MultiIndex
+ result = first.difference(first.values, sort=sort)
+ assert result.equals(first[:0])
+
+ # name from empty array
+ result = first.difference([], sort=sort)
+ assert first.equals(result)
+ assert first.names == result.names
+
+ # name from non-empty array
+ result = first.difference([('foo', 'one')], sort=sort)
+ expected = pd.MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'), (
+ 'foo', 'two'), ('qux', 'one'), ('qux', 'two')])
+ expected.names = first.names
+ assert first.names == result.names
+
+ msg = "other must be a MultiIndex or a list of tuples"
+ with pytest.raises(TypeError, match=msg):
+ first.difference([1, 2, 3, 4, 5], sort=sort)
+
+
+def test_difference_sort_special():
+ # GH-24959
+ idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']])
+ # sort=None, the default
+ result = idx.difference([])
+ tm.assert_index_equal(result, idx)
+
+
[email protected](reason="Not implemented.")
+def test_difference_sort_special_true():
+ # TODO decide on True behaviour
+ idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']])
+ result = idx.difference([], sort=True)
+ expected = pd.MultiIndex.from_product([[0, 1], ['a', 'b']])
+ tm.assert_index_equal(result, expected)
+
+
+def test_difference_sort_incomparable():
+ # GH-24959
+ idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000'), 2],
+ ['a', 'b']])
+
+ other = pd.MultiIndex.from_product([[3, pd.Timestamp('2000'), 4],
+ ['c', 'd']])
+ # sort=None, the default
+ # MultiIndex.difference deviates here from other difference
+ # implementations in not catching the TypeError
+ with pytest.raises(TypeError):
+ idx.difference(other)
+
+ # sort=False
+ result = idx.difference(other, sort=False)
+ tm.assert_index_equal(result, idx)
+
+
[email protected](reason="Not implemented.")
+def test_difference_sort_incomparable_true():
+ # TODO decide on True behaviour
+ # # sort=True, raises
+ idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000'), 2],
+ ['a', 'b']])
+ other = pd.MultiIndex.from_product([[3, pd.Timestamp('2000'), 4],
+ ['c', 'd']])
+
+ with pytest.raises(TypeError):
+ idx.difference(other, sort=True)
+
+
[email protected]("sort", [None, False])
+def test_union(idx, sort):
+ piece1 = idx[:5][::-1]
+ piece2 = idx[3:]
+
+ the_union = piece1.union(piece2, sort=sort)
+
+ if sort is None:
+ tm.assert_index_equal(the_union, idx.sort_values())
+
+ assert tm.equalContents(the_union, idx)
+
+ # corner case, pass self or empty thing:
+ the_union = idx.union(idx, sort=sort)
+ assert the_union is idx
+
+ the_union = idx.union(idx[:0], sort=sort)
+ assert the_union is idx
+
+ # won't work in python 3
+ # tuples = _index.values
+ # result = _index[:4] | tuples[4:]
+ # assert result.equals(tuples)
+
+ # not valid for python 3
+ # def test_union_with_regular_index(self):
+ # other = Index(['A', 'B', 'C'])
+
+ # result = other.union(idx)
+ # assert ('foo', 'one') in result
+ # assert 'B' in result
+
+ # result2 = _index.union(other)
+ # assert result.equals(result2)
+
+
[email protected]("sort", [None, False])
+def test_intersection(idx, sort):
+ piece1 = idx[:5][::-1]
+ piece2 = idx[3:]
+
+ the_int = piece1.intersection(piece2, sort=sort)
+
+ if sort is None:
+ tm.assert_index_equal(the_int, idx[3:5])
+ assert tm.equalContents(the_int, idx[3:5])
+
+ # corner case, pass self
+ the_int = idx.intersection(idx, sort=sort)
+ assert the_int is idx
+
+ # empty intersection: disjoint
+ empty = idx[:2].intersection(idx[2:], sort=sort)
+ expected = idx[:0]
+ assert empty.equals(expected)
+
+ # can't do in python 3
+ # tuples = _index.values
+ # result = _index & tuples
+ # assert result.equals(tuples)
+
+
+def test_intersect_equal_sort():
+ # GH-24959
+ idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']])
+ tm.assert_index_equal(idx.intersection(idx, sort=False), idx)
+ tm.assert_index_equal(idx.intersection(idx, sort=None), idx)
+
+
[email protected](reason="Not implemented.")
+def test_intersect_equal_sort_true():
+ # TODO decide on True behaviour
+ idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']])
+ sorted_ = pd.MultiIndex.from_product([[0, 1], ['a', 'b']])
+ tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_)
+
+
[email protected]('slice_', [slice(None), slice(0)])
+def test_union_sort_other_empty(slice_):
+ # https://github.com/pandas-dev/pandas/issues/24959
+ idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']])
+
+ # default, sort=None
+ other = idx[slice_]
+ tm.assert_index_equal(idx.union(other), idx)
+ # MultiIndex does not special case empty.union(idx)
+ # tm.assert_index_equal(other.union(idx), idx)
+
+ # sort=False
+ tm.assert_index_equal(idx.union(other, sort=False), idx)
+
+
[email protected](reason="Not implemented.")
+def test_union_sort_other_empty_sort():
+ # TODO decide on True behaviour
+ # # sort=True
+ idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']])
+ other = idx[:0]
+ result = idx.union(other, sort=True)
+ expected = pd.MultiIndex.from_product([[0, 1], ['a', 'b']])
+ tm.assert_index_equal(result, expected)
+
+
+def test_union_sort_other_incomparable():
+ # https://github.com/pandas-dev/pandas/issues/24959
+ idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000')], ['a', 'b']])
+
+ # default, sort=None
+ result = idx.union(idx[:1])
+ tm.assert_index_equal(result, idx)
+
+ # sort=False
+ result = idx.union(idx[:1], sort=False)
+ tm.assert_index_equal(result, idx)
+
+
[email protected](reason="Not implemented.")
+def test_union_sort_other_incomparable_sort():
+ # TODO decide on True behaviour
+ # # sort=True
+ idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000')], ['a', 'b']])
+ with pytest.raises(TypeError, match='Cannot compare'):
+ idx.union(idx[:1], sort=True)
+
+
[email protected]("method", ['union', 'intersection', 'difference',
+ 'symmetric_difference'])
+def test_setops_disallow_true(method):
+ idx1 = pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
+ idx2 = pd.MultiIndex.from_product([['b', 'c'], [1, 2]])
+
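+ # this pandas version accepts only sort=None or sort=False for set operations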
+ with pytest.raises(ValueError, match="The 'sort' keyword only takes"):
+ getattr(idx1, method)(idx2, sort=True)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_sorting.py b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_sorting.py
new file mode 100644
index 00000000000..1a81318e06d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/multi/test_sorting.py
@@ -0,0 +1,266 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+from pandas.errors import PerformanceWarning, UnsortedIndexError
+
+import pandas as pd
+from pandas import CategoricalIndex, DataFrame, Index, MultiIndex, RangeIndex
+import pandas.util.testing as tm
+
+
+def test_sortlevel(idx):
+ import random
+
+ tuples = list(idx)
+ random.shuffle(tuples)
+
+ index = MultiIndex.from_tuples(tuples)
+
+ sorted_idx, _ = index.sortlevel(0)
+ expected = MultiIndex.from_tuples(sorted(tuples))
+ assert sorted_idx.equals(expected)
+
+ sorted_idx, _ = index.sortlevel(0, ascending=False)
+ assert sorted_idx.equals(expected[::-1])
+
+ sorted_idx, _ = index.sortlevel(1)
+ by1 = sorted(tuples, key=lambda x: (x[1], x[0]))
+ expected = MultiIndex.from_tuples(by1)
+ assert sorted_idx.equals(expected)
+
+ sorted_idx, _ = index.sortlevel(1, ascending=False)
+ assert sorted_idx.equals(expected[::-1])
+
+
+def test_sortlevel_not_sort_remaining():
+ mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
+ sorted_idx, _ = mi.sortlevel('A', sort_remaining=False)
+ assert sorted_idx.equals(mi)
+
+
+def test_sortlevel_deterministic():
+ tuples = [('bar', 'one'), ('foo', 'two'), ('qux', 'two'),
+ ('foo', 'one'), ('baz', 'two'), ('qux', 'one')]
+
+ index = MultiIndex.from_tuples(tuples)
+
+ sorted_idx, _ = index.sortlevel(0)
+ expected = MultiIndex.from_tuples(sorted(tuples))
+ assert sorted_idx.equals(expected)
+
+ sorted_idx, _ = index.sortlevel(0, ascending=False)
+ assert sorted_idx.equals(expected[::-1])
+
+ sorted_idx, _ = index.sortlevel(1)
+ by1 = sorted(tuples, key=lambda x: (x[1], x[0]))
+ expected = MultiIndex.from_tuples(by1)
+ assert sorted_idx.equals(expected)
+
+ sorted_idx, _ = index.sortlevel(1, ascending=False)
+ assert sorted_idx.equals(expected[::-1])
+
+
+def test_sort(indices):
+ with pytest.raises(TypeError):
+ indices.sort()
+
+
+def test_numpy_argsort(idx):
+ result = np.argsort(idx)
+ expected = idx.argsort()
+ tm.assert_numpy_array_equal(result, expected)
+
+ # these are the only two types that perform
+ # pandas compatibility input validation - the
+ # rest already perform separate (or no) such
+ # validation via their 'values' attribute as
+ # defined in pandas/core/indexes/base.py - they
+ # cannot be changed at the moment due to
+ # backwards compatibility concerns
+ if isinstance(idx, (CategoricalIndex, RangeIndex)):
+ msg = "the 'axis' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.argsort(idx, axis=1)
+
+ msg = "the 'kind' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.argsort(idx, kind='mergesort')
+
+ msg = "the 'order' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.argsort(idx, order=('a', 'b'))
+
+
+def test_unsortedindex():
+ # GH 11897
+ mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'),
+ ('x', 'b'), ('y', 'a'), ('z', 'b')],
+ names=['one', 'two'])
+ df = pd.DataFrame([[i, 10 * i] for i in lrange(6)], index=mi,
+ columns=['one', 'two'])
+
+ # GH 16734: not sorted, but no real slicing
+ result = df.loc(axis=0)['z', 'a']
+ expected = df.iloc[0]
+ tm.assert_series_equal(result, expected)
+
+ with pytest.raises(UnsortedIndexError):
+ df.loc(axis=0)['z', slice('a')]
+ df.sort_index(inplace=True)
+ assert len(df.loc(axis=0)['z', :]) == 2
+
+ with pytest.raises(KeyError):
+ df.loc(axis=0)['q', :]
+
+
+def test_unsortedindex_doc_examples():
+ # http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa
+ dfm = DataFrame({'jim': [0, 0, 1, 1],
+ 'joe': ['x', 'x', 'z', 'y'],
+ 'jolie': np.random.rand(4)})
+
+ dfm = dfm.set_index(['jim', 'joe'])
+ with tm.assert_produces_warning(PerformanceWarning):
+ dfm.loc[(1, 'z')]
+
+ with pytest.raises(UnsortedIndexError):
+ dfm.loc[(0, 'y'):(1, 'z')]
+
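+    # lexsort_depth counts the leading levels that are sorted; only 'jim'
+    # is sorted here, so the depth is 1 of 2 and is_lexsorted() is False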
+ assert not dfm.index.is_lexsorted()
+ assert dfm.index.lexsort_depth == 1
+
+ # sort it
+ dfm = dfm.sort_index()
+ dfm.loc[(1, 'z')]
+ dfm.loc[(0, 'y'):(1, 'z')]
+
+ assert dfm.index.is_lexsorted()
+ assert dfm.index.lexsort_depth == 2
+
+
+def test_reconstruct_sort():
+
+ # starts off lexsorted & monotonic
+ mi = MultiIndex.from_arrays([
+ ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
+ ])
+ assert mi.is_lexsorted()
+ assert mi.is_monotonic
+
+ recons = mi._sort_levels_monotonic()
+ assert recons.is_lexsorted()
+ assert recons.is_monotonic
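+    # an already lexsorted, monotonic index is returned unchanged, so
+    # identity (not just equality) holds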
+ assert mi is recons
+
+ assert mi.equals(recons)
+ assert Index(mi.values).equals(Index(recons.values))
+
+ # cannot convert to lexsorted
+ mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'),
+ ('x', 'b'), ('y', 'a'), ('z', 'b')],
+ names=['one', 'two'])
+ assert not mi.is_lexsorted()
+ assert not mi.is_monotonic
+
+ recons = mi._sort_levels_monotonic()
+ assert not recons.is_lexsorted()
+ assert not recons.is_monotonic
+
+ assert mi.equals(recons)
+ assert Index(mi.values).equals(Index(recons.values))
+
+ # cannot convert to lexsorted
+ mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
+ codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
+ names=['col1', 'col2'])
+ assert not mi.is_lexsorted()
+ assert not mi.is_monotonic
+
+ recons = mi._sort_levels_monotonic()
+ assert not recons.is_lexsorted()
+ assert not recons.is_monotonic
+
+ assert mi.equals(recons)
+ assert Index(mi.values).equals(Index(recons.values))
+
+
+def test_reconstruct_remove_unused():
+ # xref to GH 2770
+ df = DataFrame([['deleteMe', 1, 9],
+ ['keepMe', 2, 9],
+ ['keepMeToo', 3, 9]],
+ columns=['first', 'second', 'third'])
+ df2 = df.set_index(['first', 'second'], drop=False)
+ df2 = df2[df2['first'] != 'deleteMe']
+
+    # the dropped label is still present in the levels; only codes shrink
+ expected = MultiIndex(levels=[['deleteMe', 'keepMe', 'keepMeToo'],
+ [1, 2, 3]],
+ codes=[[1, 2], [1, 2]],
+ names=['first', 'second'])
+ result = df2.index
+ tm.assert_index_equal(result, expected)
+
+ expected = MultiIndex(levels=[['keepMe', 'keepMeToo'],
+ [2, 3]],
+ codes=[[0, 1], [0, 1]],
+ names=['first', 'second'])
+ result = df2.index.remove_unused_levels()
+ tm.assert_index_equal(result, expected)
+
+ # idempotent
+ result2 = result.remove_unused_levels()
+ tm.assert_index_equal(result2, expected)
+ assert result2.is_(result)
+
+
[email protected]('first_type,second_type', [
+ ('int64', 'int64'),
+ ('datetime64[D]', 'str')
+])
+def test_remove_unused_levels_large(first_type, second_type):
+ # GH16556
+
+ # because tests should be deterministic (and this test in particular
+ # checks that levels are removed, which is not the case for every
+ # random input):
+ rng = np.random.RandomState(4) # seed is arbitrary value that works
+
+ size = 1 << 16
+ df = DataFrame(dict(
+ first=rng.randint(0, 1 << 13, size).astype(first_type),
+ second=rng.randint(0, 1 << 10, size).astype(second_type),
+ third=rng.rand(size)))
+ df = df.groupby(['first', 'second']).sum()
+ df = df[df.third < 0.1]
+
+ result = df.index.remove_unused_levels()
+ assert len(result.levels[0]) < len(df.index.levels[0])
+ assert len(result.levels[1]) < len(df.index.levels[1])
+ assert result.equals(df.index)
+
+ expected = df.reset_index().set_index(['first', 'second']).index
+ tm.assert_index_equal(result, expected)
+
+
[email protected]('level0', [['a', 'd', 'b'],
+ ['a', 'd', 'b', 'unused']])
[email protected]('level1', [['w', 'x', 'y', 'z'],
+ ['w', 'x', 'y', 'z', 'unused']])
+def test_remove_unused_nan(level0, level1):
+ # GH 18417
+ mi = pd.MultiIndex(levels=[level0, level1],
+ codes=[[0, 2, -1, 1, -1], [0, 1, 2, 3, 2]])
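+    # a code of -1 encodes NaN; remove_unused_levels should drop the
+    # 'unused' label without touching the values, so equality still holds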
+
+ result = mi.remove_unused_levels()
+ tm.assert_index_equal(result, mi)
+ for level in 0, 1:
+        assert 'unused' not in result.levels[level]
+
+
+def test_argsort(idx):
+ result = idx.argsort()
+ expected = idx.values.argsort()
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/__init__.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_arithmetic.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_arithmetic.py
new file mode 100644
index 00000000000..67b642e0138
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_arithmetic.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import PeriodIndex, period_range
+import pandas.util.testing as tm
+
+
+class TestPeriodIndexArithmetic(object):
+ # ---------------------------------------------------------------
+ # PeriodIndex.shift is used by __add__ and __sub__
+
+ def test_pi_shift_ndarray(self):
+ idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'],
+ freq='M', name='idx')
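+        # an ndarray argument shifts each element by its own offset;
+        # NaT entries are preserved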
+ result = idx.shift(np.array([1, 2, 3, 4]))
+ expected = PeriodIndex(['2011-02', '2011-04', 'NaT', '2011-08'],
+ freq='M', name='idx')
+ tm.assert_index_equal(result, expected)
+
+ result = idx.shift(np.array([1, -2, 3, -4]))
+ expected = PeriodIndex(['2011-02', '2010-12', 'NaT', '2010-12'],
+ freq='M', name='idx')
+ tm.assert_index_equal(result, expected)
+
+ def test_shift(self):
+ pi1 = period_range(freq='A', start='1/1/2001', end='12/1/2009')
+ pi2 = period_range(freq='A', start='1/1/2002', end='12/1/2010')
+
+ tm.assert_index_equal(pi1.shift(0), pi1)
+
+ assert len(pi1) == len(pi2)
+ tm.assert_index_equal(pi1.shift(1), pi2)
+
+ pi1 = period_range(freq='A', start='1/1/2001', end='12/1/2009')
+ pi2 = period_range(freq='A', start='1/1/2000', end='12/1/2008')
+ assert len(pi1) == len(pi2)
+ tm.assert_index_equal(pi1.shift(-1), pi2)
+
+ pi1 = period_range(freq='M', start='1/1/2001', end='12/1/2009')
+ pi2 = period_range(freq='M', start='2/1/2001', end='1/1/2010')
+ assert len(pi1) == len(pi2)
+ tm.assert_index_equal(pi1.shift(1), pi2)
+
+ pi1 = period_range(freq='M', start='1/1/2001', end='12/1/2009')
+ pi2 = period_range(freq='M', start='12/1/2000', end='11/1/2009')
+ assert len(pi1) == len(pi2)
+ tm.assert_index_equal(pi1.shift(-1), pi2)
+
+ pi1 = period_range(freq='D', start='1/1/2001', end='12/1/2009')
+ pi2 = period_range(freq='D', start='1/2/2001', end='12/2/2009')
+ assert len(pi1) == len(pi2)
+ tm.assert_index_equal(pi1.shift(1), pi2)
+
+ pi1 = period_range(freq='D', start='1/1/2001', end='12/1/2009')
+ pi2 = period_range(freq='D', start='12/31/2000', end='11/30/2009')
+ assert len(pi1) == len(pi2)
+ tm.assert_index_equal(pi1.shift(-1), pi2)
+
+ def test_shift_corner_cases(self):
+ # GH#9903
+ idx = pd.PeriodIndex([], name='xxx', freq='H')
+
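+        # shift moves by whole periods of the index's own freq, so an
+        # explicit freq argument is rejected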
+ with pytest.raises(TypeError):
+ # period shift doesn't accept freq
+ idx.shift(1, freq='H')
+
+ tm.assert_index_equal(idx.shift(0), idx)
+ tm.assert_index_equal(idx.shift(3), idx)
+
+        idx = pd.PeriodIndex(['2011-01-01 10:00', '2011-01-01 11:00',
+                              '2011-01-01 12:00'], name='xxx', freq='H')
+        tm.assert_index_equal(idx.shift(0), idx)
+        exp = pd.PeriodIndex(['2011-01-01 13:00', '2011-01-01 14:00',
+                              '2011-01-01 15:00'], name='xxx', freq='H')
+        tm.assert_index_equal(idx.shift(3), exp)
+        exp = pd.PeriodIndex(['2011-01-01 07:00', '2011-01-01 08:00',
+                              '2011-01-01 09:00'], name='xxx', freq='H')
+        tm.assert_index_equal(idx.shift(-3), exp)
+
+ def test_shift_nat(self):
+ idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'],
+ freq='M', name='idx')
+ result = idx.shift(1)
+ expected = PeriodIndex(['2011-02', '2011-03', 'NaT', '2011-05'],
+ freq='M', name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+
+ def test_shift_gh8083(self):
+ # test shift for PeriodIndex
+ # GH#8083
+ drange = pd.period_range('20130101', periods=5, freq='D')
+ result = drange.shift(1)
+ expected = PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04',
+ '2013-01-05', '2013-01-06'], freq='D')
+ tm.assert_index_equal(result, expected)
+
+ def test_shift_periods(self):
+ # GH #22458 : argument 'n' was deprecated in favor of 'periods'
+ idx = period_range(freq='A', start='1/1/2001', end='12/1/2009')
+ tm.assert_index_equal(idx.shift(periods=0), idx)
+ tm.assert_index_equal(idx.shift(0), idx)
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=True):
+ tm.assert_index_equal(idx.shift(n=0), idx)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_asfreq.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_asfreq.py
new file mode 100644
index 00000000000..2dd49e7e084
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_asfreq.py
@@ -0,0 +1,152 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, PeriodIndex, Series, period_range
+from pandas.util import testing as tm
+
+
+class TestPeriodIndex(object):
+
+ def test_asfreq(self):
+ pi1 = period_range(freq='A', start='1/1/2001', end='1/1/2001')
+ pi2 = period_range(freq='Q', start='1/1/2001', end='1/1/2001')
+ pi3 = period_range(freq='M', start='1/1/2001', end='1/1/2001')
+ pi4 = period_range(freq='D', start='1/1/2001', end='1/1/2001')
+ pi5 = period_range(freq='H', start='1/1/2001', end='1/1/2001 00:00')
+ pi6 = period_range(freq='Min', start='1/1/2001', end='1/1/2001 00:00')
+ pi7 = period_range(freq='S', start='1/1/2001', end='1/1/2001 00:00:00')
+
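+        # the 'how' argument is case-insensitive and accepts aliases
+        # such as 'start' and 'begin' for 'S'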
+ assert pi1.asfreq('Q', 'S') == pi2
+ assert pi1.asfreq('Q', 's') == pi2
+ assert pi1.asfreq('M', 'start') == pi3
+ assert pi1.asfreq('D', 'StarT') == pi4
+ assert pi1.asfreq('H', 'beGIN') == pi5
+ assert pi1.asfreq('Min', 'S') == pi6
+ assert pi1.asfreq('S', 'S') == pi7
+
+ assert pi2.asfreq('A', 'S') == pi1
+ assert pi2.asfreq('M', 'S') == pi3
+ assert pi2.asfreq('D', 'S') == pi4
+ assert pi2.asfreq('H', 'S') == pi5
+ assert pi2.asfreq('Min', 'S') == pi6
+ assert pi2.asfreq('S', 'S') == pi7
+
+ assert pi3.asfreq('A', 'S') == pi1
+ assert pi3.asfreq('Q', 'S') == pi2
+ assert pi3.asfreq('D', 'S') == pi4
+ assert pi3.asfreq('H', 'S') == pi5
+ assert pi3.asfreq('Min', 'S') == pi6
+ assert pi3.asfreq('S', 'S') == pi7
+
+ assert pi4.asfreq('A', 'S') == pi1
+ assert pi4.asfreq('Q', 'S') == pi2
+ assert pi4.asfreq('M', 'S') == pi3
+ assert pi4.asfreq('H', 'S') == pi5
+ assert pi4.asfreq('Min', 'S') == pi6
+ assert pi4.asfreq('S', 'S') == pi7
+
+ assert pi5.asfreq('A', 'S') == pi1
+ assert pi5.asfreq('Q', 'S') == pi2
+ assert pi5.asfreq('M', 'S') == pi3
+ assert pi5.asfreq('D', 'S') == pi4
+ assert pi5.asfreq('Min', 'S') == pi6
+ assert pi5.asfreq('S', 'S') == pi7
+
+ assert pi6.asfreq('A', 'S') == pi1
+ assert pi6.asfreq('Q', 'S') == pi2
+ assert pi6.asfreq('M', 'S') == pi3
+ assert pi6.asfreq('D', 'S') == pi4
+ assert pi6.asfreq('H', 'S') == pi5
+ assert pi6.asfreq('S', 'S') == pi7
+
+ assert pi7.asfreq('A', 'S') == pi1
+ assert pi7.asfreq('Q', 'S') == pi2
+ assert pi7.asfreq('M', 'S') == pi3
+ assert pi7.asfreq('D', 'S') == pi4
+ assert pi7.asfreq('H', 'S') == pi5
+ assert pi7.asfreq('Min', 'S') == pi6
+
+ pytest.raises(ValueError, pi7.asfreq, 'T', 'foo')
+ result1 = pi1.asfreq('3M')
+ result2 = pi1.asfreq('M')
+ expected = period_range(freq='M', start='2001-12', end='2001-12')
+ tm.assert_numpy_array_equal(result1.asi8, expected.asi8)
+ assert result1.freqstr == '3M'
+ tm.assert_numpy_array_equal(result2.asi8, expected.asi8)
+ assert result2.freqstr == 'M'
+
+ def test_asfreq_nat(self):
+ idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M')
+ result = idx.asfreq(freq='Q')
+ expected = PeriodIndex(['2011Q1', '2011Q1', 'NaT', '2011Q2'], freq='Q')
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('freq', ['D', '3D'])
+ def test_asfreq_mult_pi(self, freq):
+ pi = PeriodIndex(['2001-01', '2001-02', 'NaT', '2001-03'], freq='2M')
+
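+        # the default anchor is the end ('E') of each 2-month span;
+        # how='S' below anchors on the start instead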
+ result = pi.asfreq(freq)
+ exp = PeriodIndex(['2001-02-28', '2001-03-31', 'NaT',
+ '2001-04-30'], freq=freq)
+ tm.assert_index_equal(result, exp)
+ assert result.freq == exp.freq
+
+ result = pi.asfreq(freq, how='S')
+ exp = PeriodIndex(['2001-01-01', '2001-02-01', 'NaT',
+ '2001-03-01'], freq=freq)
+ tm.assert_index_equal(result, exp)
+ assert result.freq == exp.freq
+
+ def test_asfreq_combined_pi(self):
+ pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'],
+ freq='H')
+ exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'],
+ freq='25H')
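+        # '1D1H' and '1H1D' both normalize to a 25-hour span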
+ for freq, how in zip(['1D1H', '1H1D'], ['S', 'E']):
+ result = pi.asfreq(freq, how=how)
+ tm.assert_index_equal(result, exp)
+ assert result.freq == exp.freq
+
+ for freq in ['1D1H', '1H1D']:
+ pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00',
+ 'NaT'], freq=freq)
+ result = pi.asfreq('H')
+ exp = PeriodIndex(['2001-01-02 00:00', '2001-01-03 02:00', 'NaT'],
+ freq='H')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == exp.freq
+
+ pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00',
+ 'NaT'], freq=freq)
+ result = pi.asfreq('H', how='S')
+ exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'],
+ freq='H')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == exp.freq
+
+ def test_asfreq_ts(self):
+ index = period_range(freq='A', start='1/1/2001', end='12/31/2010')
+ ts = Series(np.random.randn(len(index)), index=index)
+ df = DataFrame(np.random.randn(len(index), 3), index=index)
+
+ result = ts.asfreq('D', how='end')
+ df_result = df.asfreq('D', how='end')
+ exp_index = index.asfreq('D', how='end')
+ assert len(result) == len(ts)
+ tm.assert_index_equal(result.index, exp_index)
+ tm.assert_index_equal(df_result.index, exp_index)
+
+ result = ts.asfreq('D', how='start')
+ assert len(result) == len(ts)
+ tm.assert_index_equal(result.index, index.asfreq('D', how='start'))
+
+ def test_astype_asfreq(self):
+ pi1 = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], freq='D')
+ exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M')
+ tm.assert_index_equal(pi1.asfreq('M'), exp)
+ tm.assert_index_equal(pi1.astype('period[M]'), exp)
+
+ exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='3M')
+ tm.assert_index_equal(pi1.asfreq('3M'), exp)
+ tm.assert_index_equal(pi1.astype('period[3M]'), exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_astype.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_astype.py
new file mode 100644
index 00000000000..6abdf5962d6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_astype.py
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Index, Int64Index, NaT, Period, PeriodIndex, period_range
+import pandas.util.testing as tm
+
+
+class TestPeriodIndexAsType(object):
+ @pytest.mark.parametrize('dtype', [
+ float, 'timedelta64', 'timedelta64[ns]'])
+ def test_astype_raises(self, dtype):
+ # GH#13149, GH#13209
+ idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')
+ msg = 'Cannot cast PeriodArray to dtype'
+ with pytest.raises(TypeError, match=msg):
+ idx.astype(dtype)
+
+ def test_astype_conversion(self):
+ # GH#13149, GH#13209
+ idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')
+
+ result = idx.astype(object)
+ expected = Index([Period('2016-05-16', freq='D')] +
+ [Period(NaT, freq='D')] * 3, dtype='object')
+ tm.assert_index_equal(result, expected)
+
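+        # the i8 view exposes period ordinals: 16937 days after
+        # 1970-01-01 is 2016-05-16, and NaT is the minimal int64 sentinel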
+ result = idx.astype(np.int64)
+ expected = Int64Index([16937] + [-9223372036854775808] * 3,
+ dtype=np.int64)
+ tm.assert_index_equal(result, expected)
+
+ result = idx.astype(str)
+ expected = Index(str(x) for x in idx)
+ tm.assert_index_equal(result, expected)
+
+ idx = period_range('1990', '2009', freq='A')
+ result = idx.astype('i8')
+ tm.assert_index_equal(result, Index(idx.asi8))
+ tm.assert_numpy_array_equal(result.values, idx.asi8)
+
+ def test_astype_uint(self):
+ arr = period_range('2000', periods=2)
+ expected = pd.UInt64Index(np.array([10957, 10958], dtype='uint64'))
+ tm.assert_index_equal(arr.astype("uint64"), expected)
+ tm.assert_index_equal(arr.astype("uint32"), expected)
+
+ def test_astype_object(self):
+ idx = pd.PeriodIndex([], freq='M')
+
+ exp = np.array([], dtype=object)
+ tm.assert_numpy_array_equal(idx.astype(object).values, exp)
+ tm.assert_numpy_array_equal(idx._mpl_repr(), exp)
+
+ idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M')
+
+ exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object)
+ tm.assert_numpy_array_equal(idx.astype(object).values, exp)
+ tm.assert_numpy_array_equal(idx._mpl_repr(), exp)
+
+ exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT],
+ dtype=object)
+ idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D')
+ tm.assert_numpy_array_equal(idx.astype(object).values, exp)
+ tm.assert_numpy_array_equal(idx._mpl_repr(), exp)
+
+ # TODO: de-duplicate this version (from test_ops) with the one above
+ # (from test_period)
+ def test_astype_object2(self):
+ idx = pd.period_range(start='2013-01-01', periods=4, freq='M',
+ name='idx')
+ expected_list = [pd.Period('2013-01-31', freq='M'),
+ pd.Period('2013-02-28', freq='M'),
+ pd.Period('2013-03-31', freq='M'),
+ pd.Period('2013-04-30', freq='M')]
+ expected = pd.Index(expected_list, dtype=object, name='idx')
+ result = idx.astype(object)
+ assert isinstance(result, Index)
+ assert result.dtype == object
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert idx.tolist() == expected_list
+
+ idx = PeriodIndex(['2013-01-01', '2013-01-02', 'NaT',
+ '2013-01-04'], freq='D', name='idx')
+ expected_list = [pd.Period('2013-01-01', freq='D'),
+ pd.Period('2013-01-02', freq='D'),
+ pd.Period('NaT', freq='D'),
+ pd.Period('2013-01-04', freq='D')]
+ expected = pd.Index(expected_list, dtype=object, name='idx')
+ result = idx.astype(object)
+ assert isinstance(result, Index)
+ assert result.dtype == object
+ tm.assert_index_equal(result, expected)
+ for i in [0, 1, 3]:
+ assert result[i] == expected[i]
+ assert result[2] is pd.NaT
+ assert result.name == expected.name
+
+ result_list = idx.tolist()
+ for i in [0, 1, 3]:
+ assert result_list[i] == expected_list[i]
+ assert result_list[2] is pd.NaT
+
+ def test_astype_category(self):
+ obj = pd.period_range("2000", periods=2)
+ result = obj.astype('category')
+ expected = pd.CategoricalIndex([pd.Period('2000-01-01', freq="D"),
+ pd.Period('2000-01-02', freq="D")])
+ tm.assert_index_equal(result, expected)
+
+ result = obj._data.astype('category')
+ expected = expected.values
+ tm.assert_categorical_equal(result, expected)
+
+ def test_astype_array_fallback(self):
+ obj = pd.period_range("2000", periods=2)
+ result = obj.astype(bool)
+ expected = pd.Index(np.array([True, True]))
+ tm.assert_index_equal(result, expected)
+
+ result = obj._data.astype(bool)
+ expected = np.array([True, True])
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_construction.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_construction.py
new file mode 100644
index 00000000000..916260c4cee
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_construction.py
@@ -0,0 +1,519 @@
+import numpy as np
+import pytest
+
+from pandas.compat import PY3, lmap, lrange, text_type
+
+from pandas.core.dtypes.dtypes import PeriodDtype
+
+import pandas as pd
+from pandas import (
+ Index, Period, PeriodIndex, Series, date_range, offsets, period_range)
+import pandas.core.indexes.period as period
+import pandas.util.testing as tm
+
+
+class TestPeriodIndex(object):
+
+ def setup_method(self, method):
+ pass
+
+ def test_construction_base_constructor(self):
+ # GH 13664
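+        # homogeneous Period data should make pd.Index dispatch to
+        # PeriodIndex; mixed freqs (below) fall back to object dtype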
+ arr = [pd.Period('2011-01', freq='M'), pd.NaT,
+ pd.Period('2011-03', freq='M')]
+ tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr))
+ tm.assert_index_equal(pd.Index(np.array(arr)),
+ pd.PeriodIndex(np.array(arr)))
+
+ arr = [np.nan, pd.NaT, pd.Period('2011-03', freq='M')]
+ tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr))
+ tm.assert_index_equal(pd.Index(np.array(arr)),
+ pd.PeriodIndex(np.array(arr)))
+
+ arr = [pd.Period('2011-01', freq='M'), pd.NaT,
+ pd.Period('2011-03', freq='D')]
+ tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object))
+
+ tm.assert_index_equal(pd.Index(np.array(arr)),
+ pd.Index(np.array(arr), dtype=object))
+
+ def test_constructor_use_start_freq(self):
+ # GH #1118
+ p = Period('4/2/2012', freq='B')
+ with tm.assert_produces_warning(FutureWarning):
+ index = PeriodIndex(start=p, periods=10)
+ expected = period_range(start='4/2/2012', periods=10, freq='B')
+ tm.assert_index_equal(index, expected)
+
+ index = period_range(start=p, periods=10)
+ tm.assert_index_equal(index, expected)
+
+ def test_constructor_field_arrays(self):
+ # GH #1264
+
+ years = np.arange(1990, 2010).repeat(4)[2:-2]
+ quarters = np.tile(np.arange(1, 5), 20)[2:-2]
+
+ index = PeriodIndex(year=years, quarter=quarters, freq='Q-DEC')
+ expected = period_range('1990Q3', '2009Q2', freq='Q-DEC')
+ tm.assert_index_equal(index, expected)
+
+ index2 = PeriodIndex(year=years, quarter=quarters, freq='2Q-DEC')
+ tm.assert_numpy_array_equal(index.asi8, index2.asi8)
+
+ index = PeriodIndex(year=years, quarter=quarters)
+ tm.assert_index_equal(index, expected)
+
+ years = [2007, 2007, 2007]
+ months = [1, 2]
+ pytest.raises(ValueError, PeriodIndex, year=years, month=months,
+ freq='M')
+ pytest.raises(ValueError, PeriodIndex, year=years, month=months,
+ freq='2M')
+ pytest.raises(ValueError, PeriodIndex, year=years, month=months,
+ freq='M', start=Period('2007-01', freq='M'))
+
+ years = [2007, 2007, 2007]
+ months = [1, 2, 3]
+ idx = PeriodIndex(year=years, month=months, freq='M')
+ exp = period_range('2007-01', periods=3, freq='M')
+ tm.assert_index_equal(idx, exp)
+
+ def test_constructor_U(self):
+        # 'U' was once the "undefined" period alias; 'X' is an
+        # unrecognized frequency and must raise
+ pytest.raises(ValueError, period_range, '2007-1-1', periods=500,
+ freq='X')
+
+ def test_constructor_nano(self):
+ idx = period_range(start=Period(ordinal=1, freq='N'),
+ end=Period(ordinal=4, freq='N'), freq='N')
+ exp = PeriodIndex([Period(ordinal=1, freq='N'),
+ Period(ordinal=2, freq='N'),
+ Period(ordinal=3, freq='N'),
+ Period(ordinal=4, freq='N')], freq='N')
+ tm.assert_index_equal(idx, exp)
+
+ def test_constructor_arrays_negative_year(self):
+ years = np.arange(1960, 2000, dtype=np.int64).repeat(4)
+ quarters = np.tile(np.array([1, 2, 3, 4], dtype=np.int64), 40)
+
+ pindex = PeriodIndex(year=years, quarter=quarters)
+
+ tm.assert_index_equal(pindex.year, pd.Index(years))
+ tm.assert_index_equal(pindex.quarter, pd.Index(quarters))
+
+ def test_constructor_invalid_quarters(self):
+ pytest.raises(ValueError, PeriodIndex, year=lrange(2000, 2004),
+ quarter=lrange(4), freq='Q-DEC')
+
+ def test_constructor_corner(self):
+ pytest.raises(ValueError, PeriodIndex, periods=10, freq='A')
+
+ start = Period('2007', freq='A-JUN')
+ end = Period('2010', freq='A-DEC')
+ pytest.raises(ValueError, PeriodIndex, start=start, end=end)
+ pytest.raises(ValueError, PeriodIndex, start=start)
+ pytest.raises(ValueError, PeriodIndex, end=end)
+
+ result = period_range('2007-01', periods=10.5, freq='M')
+ exp = period_range('2007-01', periods=10, freq='M')
+ tm.assert_index_equal(result, exp)
+
+ def test_constructor_fromarraylike(self):
+ idx = period_range('2007-01', periods=20, freq='M')
+
+        # values is an array of Period objects, so the freq can be inferred
+ tm.assert_index_equal(PeriodIndex(idx.values), idx)
+ tm.assert_index_equal(PeriodIndex(list(idx.values)), idx)
+
+ pytest.raises(ValueError, PeriodIndex, idx._ndarray_values)
+ pytest.raises(ValueError, PeriodIndex, list(idx._ndarray_values))
+ pytest.raises(TypeError, PeriodIndex,
+ data=Period('2007', freq='A'))
+
+ result = PeriodIndex(iter(idx))
+ tm.assert_index_equal(result, idx)
+
+ result = PeriodIndex(idx)
+ tm.assert_index_equal(result, idx)
+
+ result = PeriodIndex(idx, freq='M')
+ tm.assert_index_equal(result, idx)
+
+ result = PeriodIndex(idx, freq=offsets.MonthEnd())
+ tm.assert_index_equal(result, idx)
+ assert result.freq == 'M'
+
+ result = PeriodIndex(idx, freq='2M')
+ tm.assert_index_equal(result, idx.asfreq('2M'))
+ assert result.freq == '2M'
+
+ result = PeriodIndex(idx, freq=offsets.MonthEnd(2))
+ tm.assert_index_equal(result, idx.asfreq('2M'))
+ assert result.freq == '2M'
+
+ result = PeriodIndex(idx, freq='D')
+ exp = idx.asfreq('D', 'e')
+ tm.assert_index_equal(result, exp)
+
+ def test_constructor_datetime64arr(self):
+ vals = np.arange(100000, 100000 + 10000, 100, dtype=np.int64)
+ vals = vals.view(np.dtype('M8[us]'))
+
+ pytest.raises(ValueError, PeriodIndex, vals, freq='D')
+
+ @pytest.mark.parametrize('box', [None, 'series', 'index'])
+ def test_constructor_datetime64arr_ok(self, box):
+ # https://github.com/pandas-dev/pandas/issues/23438
+ data = pd.date_range('2017', periods=4, freq="M")
+ if box is None:
+ data = data._values
+ elif box == 'series':
+ data = pd.Series(data)
+
+ result = PeriodIndex(data, freq='D')
+ expected = PeriodIndex([
+ '2017-01-31', '2017-02-28', '2017-03-31', '2017-04-30'
+ ], freq="D")
+ tm.assert_index_equal(result, expected)
+
+ def test_constructor_dtype(self):
+        # passing a period dtype should set the matching freq
+ idx = PeriodIndex(['2013-01', '2013-03'], dtype='period[M]')
+ exp = PeriodIndex(['2013-01', '2013-03'], freq='M')
+ tm.assert_index_equal(idx, exp)
+ assert idx.dtype == 'period[M]'
+
+ idx = PeriodIndex(['2013-01-05', '2013-03-05'], dtype='period[3D]')
+ exp = PeriodIndex(['2013-01-05', '2013-03-05'], freq='3D')
+ tm.assert_index_equal(idx, exp)
+ assert idx.dtype == 'period[3D]'
+
+        # if the data already carries a freq and the requested dtype
+        # differs, the values are converted via asfreq (the original
+        # index is not changed in place)
+ idx = PeriodIndex(['2013-01-01', '2013-01-02'], freq='D')
+
+ res = PeriodIndex(idx, dtype='period[M]')
+ exp = PeriodIndex(['2013-01', '2013-01'], freq='M')
+ tm.assert_index_equal(res, exp)
+ assert res.dtype == 'period[M]'
+
+ res = PeriodIndex(idx, freq='M')
+ tm.assert_index_equal(res, exp)
+ assert res.dtype == 'period[M]'
+
+ msg = 'specified freq and dtype are different'
+ with pytest.raises(period.IncompatibleFrequency, match=msg):
+ PeriodIndex(['2011-01'], freq='M', dtype='period[D]')
+
+ def test_constructor_empty(self):
+ idx = pd.PeriodIndex([], freq='M')
+ assert isinstance(idx, PeriodIndex)
+ assert len(idx) == 0
+ assert idx.freq == 'M'
+
+ with pytest.raises(ValueError, match='freq not specified'):
+ pd.PeriodIndex([])
+
+ def test_constructor_pi_nat(self):
+ idx = PeriodIndex([Period('2011-01', freq='M'), pd.NaT,
+ Period('2011-01', freq='M')])
+ exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M')
+ tm.assert_index_equal(idx, exp)
+
+ idx = PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT,
+ Period('2011-01', freq='M')]))
+ tm.assert_index_equal(idx, exp)
+
+ idx = PeriodIndex([pd.NaT, pd.NaT, Period('2011-01', freq='M'),
+ Period('2011-01', freq='M')])
+ exp = PeriodIndex(['NaT', 'NaT', '2011-01', '2011-01'], freq='M')
+ tm.assert_index_equal(idx, exp)
+
+ idx = PeriodIndex(np.array([pd.NaT, pd.NaT,
+ Period('2011-01', freq='M'),
+ Period('2011-01', freq='M')]))
+ tm.assert_index_equal(idx, exp)
+
+ idx = PeriodIndex([pd.NaT, pd.NaT, '2011-01', '2011-01'], freq='M')
+ tm.assert_index_equal(idx, exp)
+
+ with pytest.raises(ValueError, match='freq not specified'):
+ PeriodIndex([pd.NaT, pd.NaT])
+
+ with pytest.raises(ValueError, match='freq not specified'):
+ PeriodIndex(np.array([pd.NaT, pd.NaT]))
+
+ with pytest.raises(ValueError, match='freq not specified'):
+ PeriodIndex(['NaT', 'NaT'])
+
+ with pytest.raises(ValueError, match='freq not specified'):
+ PeriodIndex(np.array(['NaT', 'NaT']))
+
+ def test_constructor_incompat_freq(self):
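+        # the first non-NaT element fixes the freq; any later mismatch
+        # raises IncompatibleFrequency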
+ msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)"
+
+ with pytest.raises(period.IncompatibleFrequency, match=msg):
+ PeriodIndex([Period('2011-01', freq='M'), pd.NaT,
+ Period('2011-01', freq='D')])
+
+ with pytest.raises(period.IncompatibleFrequency, match=msg):
+ PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT,
+ Period('2011-01', freq='D')]))
+
+ # first element is pd.NaT
+ with pytest.raises(period.IncompatibleFrequency, match=msg):
+ PeriodIndex([pd.NaT, Period('2011-01', freq='M'),
+ Period('2011-01', freq='D')])
+
+ with pytest.raises(period.IncompatibleFrequency, match=msg):
+ PeriodIndex(np.array([pd.NaT, Period('2011-01', freq='M'),
+ Period('2011-01', freq='D')]))
+
+ def test_constructor_mixed(self):
+ idx = PeriodIndex(['2011-01', pd.NaT, Period('2011-01', freq='M')])
+ exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M')
+ tm.assert_index_equal(idx, exp)
+
+ idx = PeriodIndex(['NaT', pd.NaT, Period('2011-01', freq='M')])
+ exp = PeriodIndex(['NaT', 'NaT', '2011-01'], freq='M')
+ tm.assert_index_equal(idx, exp)
+
+ idx = PeriodIndex([Period('2011-01-01', freq='D'), pd.NaT,
+ '2012-01-01'])
+ exp = PeriodIndex(['2011-01-01', 'NaT', '2012-01-01'], freq='D')
+ tm.assert_index_equal(idx, exp)
+
+ def test_constructor_simple_new(self):
+ idx = period_range('2007-01', name='p', periods=2, freq='M')
+ result = idx._simple_new(idx, name='p', freq=idx.freq)
+ tm.assert_index_equal(result, idx)
+
+ result = idx._simple_new(idx.astype('i8'), name='p', freq=idx.freq)
+ tm.assert_index_equal(result, idx)
+
+ def test_constructor_simple_new_empty(self):
+ # GH13079
+ idx = PeriodIndex([], freq='M', name='p')
+ result = idx._simple_new(idx, name='p', freq='M')
+ tm.assert_index_equal(result, idx)
+
+ @pytest.mark.parametrize('floats', [[1.1, 2.1], np.array([1.1, 2.1])])
+ def test_constructor_floats(self, floats):
+ with pytest.raises(TypeError):
+ pd.PeriodIndex._simple_new(floats, freq='M')
+
+ with pytest.raises(TypeError):
+ pd.PeriodIndex(floats, freq='M')
+
+ def test_constructor_nat(self):
+ pytest.raises(ValueError, period_range, start='NaT',
+ end='2011-01-01', freq='M')
+ pytest.raises(ValueError, period_range, start='2011-01-01',
+ end='NaT', freq='M')
+
+ def test_constructor_year_and_quarter(self):
+ year = pd.Series([2001, 2002, 2003])
+ quarter = year - 2000
+ idx = PeriodIndex(year=year, quarter=quarter)
+ strs = ['%dQ%d' % t for t in zip(quarter, year)]
+ lops = list(map(Period, strs))
+ p = PeriodIndex(lops)
+ tm.assert_index_equal(p, idx)
+
+ @pytest.mark.parametrize('func, warning', [
+ (PeriodIndex, FutureWarning),
+ (period_range, None)
+ ])
+ def test_constructor_freq_mult(self, func, warning):
+ # GH #7811
+ with tm.assert_produces_warning(warning):
+            # the deprecated range-based call must match period_range
+ pidx = func(start='2014-01', freq='2M', periods=4)
+ expected = PeriodIndex(['2014-01', '2014-03',
+ '2014-05', '2014-07'], freq='2M')
+ tm.assert_index_equal(pidx, expected)
+
+ with tm.assert_produces_warning(warning):
+ pidx = func(start='2014-01-02', end='2014-01-15', freq='3D')
+ expected = PeriodIndex(['2014-01-02', '2014-01-05',
+ '2014-01-08', '2014-01-11',
+ '2014-01-14'], freq='3D')
+ tm.assert_index_equal(pidx, expected)
+
+ with tm.assert_produces_warning(warning):
+ pidx = func(end='2014-01-01 17:00', freq='4H', periods=3)
+ expected = PeriodIndex(['2014-01-01 09:00', '2014-01-01 13:00',
+ '2014-01-01 17:00'], freq='4H')
+ tm.assert_index_equal(pidx, expected)
+
+ msg = ('Frequency must be positive, because it'
+ ' represents span: -1M')
+ with pytest.raises(ValueError, match=msg):
+ PeriodIndex(['2011-01'], freq='-1M')
+
+        msg = 'Frequency must be positive, because it represents span: 0M'
+ with pytest.raises(ValueError, match=msg):
+ PeriodIndex(['2011-01'], freq='0M')
+
+        msg = 'Frequency must be positive, because it represents span: 0M'
+ with pytest.raises(ValueError, match=msg):
+ period_range('2011-01', periods=3, freq='0M')
+
+ @pytest.mark.parametrize('freq', ['A', 'M', 'D', 'T', 'S'])
+ @pytest.mark.parametrize('mult', [1, 2, 3, 4, 5])
+ def test_constructor_freq_mult_dti_compat(self, mult, freq):
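+        # a multiplied freq must round-trip through date_range().to_period()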
+ freqstr = str(mult) + freq
+ pidx = period_range(start='2014-04-01', freq=freqstr, periods=10)
+ expected = date_range(start='2014-04-01', freq=freqstr,
+ periods=10).to_period(freqstr)
+ tm.assert_index_equal(pidx, expected)
+
+ def test_constructor_freq_combined(self):
+        for freq in ['1D1H', '1H1D']:
+            pidx = PeriodIndex(['2016-01-01', '2016-01-02'], freq=freq)
+            expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 00:00'],
+                                   freq='25H')
+            tm.assert_index_equal(pidx, expected)
+ for freq in ['1D1H', '1H1D']:
+ pidx = period_range(start='2016-01-01', periods=2, freq=freq)
+ expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 01:00'],
+ freq='25H')
+ tm.assert_index_equal(pidx, expected)
+
+ def test_constructor_range_based_deprecated(self):
+ with tm.assert_produces_warning(FutureWarning):
+ pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009')
+ assert len(pi) == 9
+
+ def test_constructor_range_based_deprecated_different_freq(self):
+ with tm.assert_produces_warning(FutureWarning) as m:
+ PeriodIndex(start='2000', periods=2)
+
+ warning, = m
+ assert 'freq="A-DEC"' in str(warning.message)
+
+ def test_constructor(self):
+ pi = period_range(freq='A', start='1/1/2001', end='12/1/2009')
+ assert len(pi) == 9
+
+ pi = period_range(freq='Q', start='1/1/2001', end='12/1/2009')
+ assert len(pi) == 4 * 9
+
+ pi = period_range(freq='M', start='1/1/2001', end='12/1/2009')
+ assert len(pi) == 12 * 9
+
+ pi = period_range(freq='D', start='1/1/2001', end='12/31/2009')
+ assert len(pi) == 365 * 9 + 2
+
+ pi = period_range(freq='B', start='1/1/2001', end='12/31/2009')
+ assert len(pi) == 261 * 9
+
+ pi = period_range(freq='H', start='1/1/2001', end='12/31/2001 23:00')
+ assert len(pi) == 365 * 24
+
+ pi = period_range(freq='Min', start='1/1/2001', end='1/1/2001 23:59')
+ assert len(pi) == 24 * 60
+
+ pi = period_range(freq='S', start='1/1/2001', end='1/1/2001 23:59:59')
+ assert len(pi) == 24 * 60 * 60
+
+ start = Period('02-Apr-2005', 'B')
+ i1 = period_range(start=start, periods=20)
+ assert len(i1) == 20
+ assert i1.freq == start.freq
+ assert i1[0] == start
+
+ end_intv = Period('2006-12-31', 'W')
+ i1 = period_range(end=end_intv, periods=10)
+ assert len(i1) == 10
+ assert i1.freq == end_intv.freq
+ assert i1[-1] == end_intv
+
+ end_intv = Period('2006-12-31', '1w')
+ i2 = period_range(end=end_intv, periods=10)
+ assert len(i1) == len(i2)
+ assert (i1 == i2).all()
+ assert i1.freq == i2.freq
+
+ end_intv = Period('2006-12-31', ('w', 1))
+ i2 = period_range(end=end_intv, periods=10)
+ assert len(i1) == len(i2)
+ assert (i1 == i2).all()
+ assert i1.freq == i2.freq
+
+ end_intv = Period('2005-05-01', 'B')
+ i1 = period_range(start=start, end=end_intv)
+
+ # infer freq from first element
+ i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')])
+ assert len(i2) == 2
+ assert i2[0] == end_intv
+
+ i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')]))
+ assert len(i2) == 2
+ assert i2[0] == end_intv
+
+ # Mixed freq should fail
+ vals = [end_intv, Period('2006-12-31', 'w')]
+ pytest.raises(ValueError, PeriodIndex, vals)
+ vals = np.array(vals)
+ pytest.raises(ValueError, PeriodIndex, vals)
+
+ def test_constructor_error(self):
+ start = Period('02-Apr-2005', 'B')
+ end_intv = Period('2006-12-31', ('w', 1))
+
+ msg = 'start and end must have same freq'
+ with pytest.raises(ValueError, match=msg):
+ PeriodIndex(start=start, end=end_intv)
+
+ msg = ('Of the three parameters: start, end, and periods, '
+ 'exactly two must be specified')
+ with pytest.raises(ValueError, match=msg):
+ PeriodIndex(start=start)
+
+ @pytest.mark.parametrize('freq', ['M', 'Q', 'A', 'D', 'B',
+ 'T', 'S', 'L', 'U', 'N', 'H'])
+ def test_recreate_from_data(self, freq):
+ org = period_range(start='2001/04/01', freq=freq, periods=1)
+ idx = PeriodIndex(org.values, freq=freq)
+ tm.assert_index_equal(idx, org)
+
+ def test_map_with_string_constructor(self):
+ raw = [2005, 2007, 2009]
+ index = PeriodIndex(raw, freq='A')
+ types = str,
+
+ if PY3:
+ # unicode
+ types += text_type,
+
+ for t in types:
+ expected = Index(lmap(t, raw))
+ res = index.map(t)
+
+ # should return an Index
+ assert isinstance(res, Index)
+
+ # preserve element types
+ assert all(isinstance(resi, t) for resi in res)
+
+ # lastly, values should compare equal
+ tm.assert_index_equal(res, expected)
+
+
+class TestSeriesPeriod(object):
+
+ def setup_method(self, method):
+ self.series = Series(period_range('2000-01-01', periods=10, freq='D'))
+
+ def test_constructor_cant_cast_period(self):
+ with pytest.raises(TypeError):
+ Series(period_range('2000-01-01', periods=10, freq='D'),
+ dtype=float)
+
+ def test_constructor_cast_object(self):
+ s = Series(period_range('1/1/2000', periods=10),
+ dtype=PeriodDtype("D"))
+ exp = Series(period_range('1/1/2000', periods=10))
+ tm.assert_series_equal(s, exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_formats.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_formats.py
new file mode 100644
index 00000000000..5b2940372b9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_formats.py
@@ -0,0 +1,220 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import PeriodIndex
+import pandas.util.testing as tm
+
+
+def test_to_native_types():
+ index = PeriodIndex(['2017-01-01', '2017-01-02',
+ '2017-01-03'], freq='D')
+
+ # First, with no arguments.
+ expected = np.array(['2017-01-01', '2017-01-02',
+ '2017-01-03'], dtype='=U10')
+
+ result = index.to_native_types()
+ tm.assert_numpy_array_equal(result, expected)
+
+ # No NaN values, so na_rep has no effect
+ result = index.to_native_types(na_rep='pandas')
+ tm.assert_numpy_array_equal(result, expected)
+
+ # Make sure slicing works
+ expected = np.array(['2017-01-01', '2017-01-03'], dtype='=U10')
+
+ result = index.to_native_types([0, 2])
+ tm.assert_numpy_array_equal(result, expected)
+
+ # Make sure date formatting works
+ expected = np.array(['01-2017-01', '01-2017-02',
+ '01-2017-03'], dtype='=U10')
+
+ result = index.to_native_types(date_format='%m-%Y-%d')
+ tm.assert_numpy_array_equal(result, expected)
+
+ # NULL object handling should work
+ index = PeriodIndex(['2017-01-01', pd.NaT, '2017-01-03'], freq='D')
+ expected = np.array(['2017-01-01', 'NaT', '2017-01-03'], dtype=object)
+
+ result = index.to_native_types()
+ tm.assert_numpy_array_equal(result, expected)
+
+ expected = np.array(['2017-01-01', 'pandas',
+ '2017-01-03'], dtype=object)
+
+ result = index.to_native_types(na_rep='pandas')
+ tm.assert_numpy_array_equal(result, expected)
+
+
+class TestPeriodIndexRendering(object):
+
+ def test_frame_repr(self):
+ df = pd.DataFrame({"A": [1, 2, 3]},
+ index=pd.date_range('2000', periods=3))
+ result = repr(df)
+ expected = (
+ ' A\n'
+ '2000-01-01 1\n'
+ '2000-01-02 2\n'
+ '2000-01-03 3')
+ assert result == expected
+
+ @pytest.mark.parametrize('method', ['__repr__', '__unicode__', '__str__'])
+ def test_representation(self, method):
+ # GH#7601
+ idx1 = PeriodIndex([], freq='D')
+ idx2 = PeriodIndex(['2011-01-01'], freq='D')
+ idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D')
+ idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'],
+ freq='D')
+ idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A')
+ idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'],
+ freq='H')
+ idx7 = pd.period_range('2013Q1', periods=1, freq="Q")
+ idx8 = pd.period_range('2013Q1', periods=2, freq="Q")
+ idx9 = pd.period_range('2013Q1', periods=3, freq="Q")
+ idx10 = PeriodIndex(['2011-01-01', '2011-02-01'], freq='3D')
+
+ exp1 = """PeriodIndex([], dtype='period[D]', freq='D')"""
+
+ exp2 = """PeriodIndex(['2011-01-01'], dtype='period[D]', freq='D')"""
+
+ exp3 = ("PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', "
+ "freq='D')")
+
+ exp4 = ("PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], "
+ "dtype='period[D]', freq='D')")
+
+ exp5 = ("PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]', "
+ "freq='A-DEC')")
+
+ exp6 = ("PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], "
+ "dtype='period[H]', freq='H')")
+
+ exp7 = ("PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', "
+ "freq='Q-DEC')")
+
+ exp8 = ("PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', "
+ "freq='Q-DEC')")
+
+ exp9 = ("PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], "
+ "dtype='period[Q-DEC]', freq='Q-DEC')")
+
+ exp10 = ("PeriodIndex(['2011-01-01', '2011-02-01'], "
+ "dtype='period[3D]', freq='3D')")
+
+ for idx, expected in zip([idx1, idx2, idx3, idx4, idx5,
+ idx6, idx7, idx8, idx9, idx10],
+ [exp1, exp2, exp3, exp4, exp5,
+ exp6, exp7, exp8, exp9, exp10]):
+ result = getattr(idx, method)()
+ assert result == expected
+
+ def test_representation_to_series(self):
+ # GH#10971
+ idx1 = PeriodIndex([], freq='D')
+ idx2 = PeriodIndex(['2011-01-01'], freq='D')
+ idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D')
+ idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'],
+ freq='D')
+ idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A')
+ idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'],
+ freq='H')
+
+ idx7 = pd.period_range('2013Q1', periods=1, freq="Q")
+ idx8 = pd.period_range('2013Q1', periods=2, freq="Q")
+ idx9 = pd.period_range('2013Q1', periods=3, freq="Q")
+
+ exp1 = """Series([], dtype: period[D])"""
+
+ exp2 = """0 2011-01-01
+dtype: period[D]"""
+
+ exp3 = """0 2011-01-01
+1 2011-01-02
+dtype: period[D]"""
+
+ exp4 = """0 2011-01-01
+1 2011-01-02
+2 2011-01-03
+dtype: period[D]"""
+
+ exp5 = """0 2011
+1 2012
+2 2013
+dtype: period[A-DEC]"""
+
+ exp6 = """0 2011-01-01 09:00
+1 2012-02-01 10:00
+2 NaT
+dtype: period[H]"""
+
+ exp7 = """0 2013Q1
+dtype: period[Q-DEC]"""
+
+ exp8 = """0 2013Q1
+1 2013Q2
+dtype: period[Q-DEC]"""
+
+ exp9 = """0 2013Q1
+1 2013Q2
+2 2013Q3
+dtype: period[Q-DEC]"""
+
+ for idx, expected in zip([idx1, idx2, idx3, idx4, idx5,
+ idx6, idx7, idx8, idx9],
+ [exp1, exp2, exp3, exp4, exp5,
+ exp6, exp7, exp8, exp9]):
+ result = repr(pd.Series(idx))
+ assert result == expected
+
+ def test_summary(self):
+ # GH#9116
+ idx1 = PeriodIndex([], freq='D')
+ idx2 = PeriodIndex(['2011-01-01'], freq='D')
+ idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D')
+ idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'],
+ freq='D')
+ idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A')
+ idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'],
+ freq='H')
+
+ idx7 = pd.period_range('2013Q1', periods=1, freq="Q")
+ idx8 = pd.period_range('2013Q1', periods=2, freq="Q")
+ idx9 = pd.period_range('2013Q1', periods=3, freq="Q")
+
+ exp1 = """PeriodIndex: 0 entries
+Freq: D"""
+
+ exp2 = """PeriodIndex: 1 entries, 2011-01-01 to 2011-01-01
+Freq: D"""
+
+ exp3 = """PeriodIndex: 2 entries, 2011-01-01 to 2011-01-02
+Freq: D"""
+
+ exp4 = """PeriodIndex: 3 entries, 2011-01-01 to 2011-01-03
+Freq: D"""
+
+ exp5 = """PeriodIndex: 3 entries, 2011 to 2013
+Freq: A-DEC"""
+
+ exp6 = """PeriodIndex: 3 entries, 2011-01-01 09:00 to NaT
+Freq: H"""
+
+ exp7 = """PeriodIndex: 1 entries, 2013Q1 to 2013Q1
+Freq: Q-DEC"""
+
+ exp8 = """PeriodIndex: 2 entries, 2013Q1 to 2013Q2
+Freq: Q-DEC"""
+
+ exp9 = """PeriodIndex: 3 entries, 2013Q1 to 2013Q3
+Freq: Q-DEC"""
+
+ for idx, expected in zip([idx1, idx2, idx3, idx4, idx5,
+ idx6, idx7, idx8, idx9],
+ [exp1, exp2, exp3, exp4, exp5,
+ exp6, exp7, exp8, exp9]):
+ result = idx._summary()
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_indexing.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_indexing.py
new file mode 100644
index 00000000000..d6ce4d5e357
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_indexing.py
@@ -0,0 +1,637 @@
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import period as libperiod
+from pandas.compat import lrange
+
+import pandas as pd
+from pandas import (
+ DatetimeIndex, Period, PeriodIndex, Series, notna, period_range)
+from pandas.util import testing as tm
+
+
+class TestGetItem(object):
+ def test_ellipsis(self):
+ # GH#21282
+ idx = period_range('2011-01-01', '2011-01-31', freq='D',
+ name='idx')
+
+ result = idx[...]
+ assert result.equals(idx)
+ assert result is not idx
+
+ def test_getitem(self):
+ idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D',
+ name='idx')
+
+ for idx in [idx1]:
+ result = idx[0]
+ assert result == pd.Period('2011-01-01', freq='D')
+
+ result = idx[-1]
+ assert result == pd.Period('2011-01-31', freq='D')
+
+ result = idx[0:5]
+ expected = pd.period_range('2011-01-01', '2011-01-05', freq='D',
+ name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+ assert result.freq == 'D'
+
+ result = idx[0:10:2]
+ expected = pd.PeriodIndex(['2011-01-01', '2011-01-03',
+ '2011-01-05',
+ '2011-01-07', '2011-01-09'],
+ freq='D', name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+ assert result.freq == 'D'
+
+ result = idx[-20:-5:3]
+ expected = pd.PeriodIndex(['2011-01-12', '2011-01-15',
+ '2011-01-18',
+ '2011-01-21', '2011-01-24'],
+ freq='D', name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+ assert result.freq == 'D'
+
+ result = idx[4::-1]
+ expected = PeriodIndex(['2011-01-05', '2011-01-04', '2011-01-03',
+ '2011-01-02', '2011-01-01'],
+ freq='D', name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+ assert result.freq == 'D'
+
+ def test_getitem_index(self):
+ idx = period_range('2007-01', periods=10, freq='M', name='x')
+
+ result = idx[[1, 3, 5]]
+ exp = pd.PeriodIndex(['2007-02', '2007-04', '2007-06'],
+ freq='M', name='x')
+ tm.assert_index_equal(result, exp)
+
+ result = idx[[True, True, False, False, False,
+ True, True, False, False, False]]
+ exp = pd.PeriodIndex(['2007-01', '2007-02', '2007-06', '2007-07'],
+ freq='M', name='x')
+ tm.assert_index_equal(result, exp)
+
+ def test_getitem_partial(self):
+ rng = period_range('2007-01', periods=50, freq='M')
+ ts = Series(np.random.randn(len(rng)), rng)
+
+ pytest.raises(KeyError, ts.__getitem__, '2006')
+
+ result = ts['2008']
+ assert (result.index.year == 2008).all()
+
+ result = ts['2008':'2009']
+ assert len(result) == 24
+
+ result = ts['2008-1':'2009-12']
+ assert len(result) == 24
+
+ result = ts['2008Q1':'2009Q4']
+ assert len(result) == 24
+
+ result = ts[:'2009']
+ assert len(result) == 36
+
+ result = ts['2009':]
+ assert len(result) == 50 - 24
+
+ exp = result
+ result = ts[24:]
+ tm.assert_series_equal(exp, result)
+
+ ts = ts[10:].append(ts[10:])
+ msg = "left slice bound for non-unique label: '2008'"
+ with pytest.raises(KeyError, match=msg):
+ ts[slice('2008', '2009')]
+
+ def test_getitem_datetime(self):
+ rng = period_range(start='2012-01-01', periods=10, freq='W-MON')
+ ts = Series(lrange(len(rng)), index=rng)
+
+ dt1 = datetime(2011, 10, 2)
+ dt4 = datetime(2012, 4, 20)
+
+ rs = ts[dt1:dt4]
+ tm.assert_series_equal(rs, ts)
+
+ def test_getitem_nat(self):
+ idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M')
+ assert idx[0] == pd.Period('2011-01', freq='M')
+ assert idx[1] is pd.NaT
+
+ s = pd.Series([0, 1, 2], index=idx)
+ assert s[pd.NaT] == 1
+
+ s = pd.Series(idx, index=idx)
+ assert (s[pd.Period('2011-01', freq='M')] ==
+ pd.Period('2011-01', freq='M'))
+ assert s[pd.NaT] is pd.NaT
+
+ def test_getitem_list_periods(self):
+ # GH 7710
+ rng = period_range(start='2012-01-01', periods=10, freq='D')
+ ts = Series(lrange(len(rng)), index=rng)
+ exp = ts.iloc[[1]]
+ tm.assert_series_equal(ts[[Period('2012-01-02', freq='D')]], exp)
+
+ def test_getitem_seconds(self):
+ # GH#6716
+ didx = pd.date_range(start='2013/01/01 09:00:00', freq='S',
+ periods=4000)
+ pidx = period_range(start='2013/01/01 09:00:00', freq='S',
+ periods=4000)
+
+ for idx in [didx, pidx]:
+ # getitem against index should raise ValueError
+ values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H',
+ '2013/02/01 09:00']
+ for v in values:
+ # GH7116
+ # these show deprecations as we are trying
+ # to slice with non-integer indexers
+ # with pytest.raises(IndexError):
+ # idx[v]
+ continue
+
+ s = Series(np.random.rand(len(idx)), index=idx)
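+            # partial string indexing selects whole spans: one hour covers
+            # 3600 one-second rows here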
+ tm.assert_series_equal(s['2013/01/01 10:00'], s[3600:3660])
+ tm.assert_series_equal(s['2013/01/01 9H'], s[:3600])
+ for d in ['2013/01/01', '2013/01', '2013']:
+ tm.assert_series_equal(s[d], s)
+
+ def test_getitem_day(self):
+ # GH#6716
+ # Confirm DatetimeIndex and PeriodIndex works identically
+ didx = pd.date_range(start='2013/01/01', freq='D', periods=400)
+ pidx = period_range(start='2013/01/01', freq='D', periods=400)
+
+ for idx in [didx, pidx]:
+ # getitem against index should raise ValueError
+ values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H',
+ '2013/02/01 09:00']
+ for v in values:
+
+ # GH7116
+ # these show deprecations as we are trying
+ # to slice with non-integer indexers
+ # with pytest.raises(IndexError):
+ # idx[v]
+ continue
+
+ s = Series(np.random.rand(len(idx)), index=idx)
+ tm.assert_series_equal(s['2013/01'], s[0:31])
+ tm.assert_series_equal(s['2013/02'], s[31:59])
+ tm.assert_series_equal(s['2014'], s[365:])
+
+ invalid = ['2013/02/01 9H', '2013/02/01 09:00']
+ for v in invalid:
+ with pytest.raises(KeyError):
+ s[v]
+
+
+class TestWhere(object):
+ @pytest.mark.parametrize('klass', [list, tuple, np.array, Series])
+ def test_where(self, klass):
+ i = period_range('20130101', periods=5, freq='D')
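+        # where() keeps entries where the mask is True and writes NaT
+        # elsewhere; every list-like mask type should behave the same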
+ cond = [True] * len(i)
+ expected = i
+ result = i.where(klass(cond))
+ tm.assert_index_equal(result, expected)
+
+ cond = [False] + [True] * (len(i) - 1)
+ expected = PeriodIndex([pd.NaT] + i[1:].tolist(), freq='D')
+ result = i.where(klass(cond))
+ tm.assert_index_equal(result, expected)
+
+ def test_where_other(self):
+ i = period_range('20130101', periods=5, freq='D')
+ for arr in [np.nan, pd.NaT]:
+            result = i.where(notna(i), other=arr)
+ expected = i
+ tm.assert_index_equal(result, expected)
+
+        i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(),
+                            freq='D')
+ result = i.where(notna(i2), i2)
+ tm.assert_index_equal(result, i2)
+
+        i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(),
+                            freq='D')
+ result = i.where(notna(i2), i2.values)
+ tm.assert_index_equal(result, i2)
+
+
+class TestTake(object):
+ def test_take(self):
+ # GH#10295
+ idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D',
+ name='idx')
+
+ for idx in [idx1]:
+ result = idx.take([0])
+ assert result == pd.Period('2011-01-01', freq='D')
+
+ result = idx.take([5])
+ assert result == pd.Period('2011-01-06', freq='D')
+
+ result = idx.take([0, 1, 2])
+ expected = pd.period_range('2011-01-01', '2011-01-03', freq='D',
+ name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == 'D'
+ assert result.freq == expected.freq
+
+ result = idx.take([0, 2, 4])
+ expected = pd.PeriodIndex(['2011-01-01', '2011-01-03',
+ '2011-01-05'], freq='D', name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+ assert result.freq == 'D'
+
+ result = idx.take([7, 4, 1])
+ expected = pd.PeriodIndex(['2011-01-08', '2011-01-05',
+ '2011-01-02'],
+ freq='D', name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+ assert result.freq == 'D'
+
+ result = idx.take([3, 2, 5])
+ expected = PeriodIndex(['2011-01-04', '2011-01-03', '2011-01-06'],
+ freq='D', name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+ assert result.freq == 'D'
+
+ result = idx.take([-3, 2, 5])
+ expected = PeriodIndex(['2011-01-29', '2011-01-03', '2011-01-06'],
+ freq='D', name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+ assert result.freq == 'D'
+
+ def test_take_misc(self):
+ index = period_range(start='1/1/10', end='12/31/12', freq='D',
+ name='idx')
+ expected = PeriodIndex([datetime(2010, 1, 6), datetime(2010, 1, 7),
+ datetime(2010, 1, 9), datetime(2010, 1, 13)],
+ freq='D', name='idx')
+
+ taken1 = index.take([5, 6, 8, 12])
+ taken2 = index[[5, 6, 8, 12]]
+
+ for taken in [taken1, taken2]:
+ tm.assert_index_equal(taken, expected)
+ assert isinstance(taken, PeriodIndex)
+ assert taken.freq == index.freq
+ assert taken.name == expected.name
+
+ def test_take_fill_value(self):
+ # GH#12631
+ idx = pd.PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
+ name='xxx', freq='D')
+ result = idx.take(np.array([1, 0, -1]))
+ expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'],
+ name='xxx', freq='D')
+ tm.assert_index_equal(result, expected)
+
+        # fill_value=True maps -1 indices to NaT instead of "from the end"
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', 'NaT'],
+ name='xxx', freq='D')
+ tm.assert_index_equal(result, expected)
+
+        # allow_fill=False: -1 falls back to numpy semantics (from the end)
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'],
+ name='xxx', freq='D')
+ tm.assert_index_equal(result, expected)
+
+ msg = ('When allow_fill=True and fill_value is not None, '
+ 'all indices must be >= -1')
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+
+class TestIndexing(object):
+
+ def test_get_loc_msg(self):
+ idx = period_range('2000-1-1', freq='A', periods=10)
+ bad_period = Period('2012', 'A')
+ pytest.raises(KeyError, idx.get_loc, bad_period)
+
+ try:
+ idx.get_loc(bad_period)
+ except KeyError as inst:
+ assert inst.args[0] == bad_period
+
+ def test_get_loc_nat(self):
+ didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03'])
+ pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M')
+
+ # check DatetimeIndex compat
+ for idx in [didx, pidx]:
+ assert idx.get_loc(pd.NaT) == 1
+ assert idx.get_loc(None) == 1
+ assert idx.get_loc(float('nan')) == 1
+ assert idx.get_loc(np.nan) == 1
+
+ def test_get_loc(self):
+ # GH 17717
+ p0 = pd.Period('2017-09-01')
+ p1 = pd.Period('2017-09-02')
+ p2 = pd.Period('2017-09-03')
+
+        # get the location of p1/p2 from a monotonic increasing
+        # PeriodIndex without duplicates
+ idx0 = pd.PeriodIndex([p0, p1, p2])
+ expected_idx1_p1 = 1
+ expected_idx1_p2 = 2
+
+ assert idx0.get_loc(p1) == expected_idx1_p1
+ assert idx0.get_loc(str(p1)) == expected_idx1_p1
+ assert idx0.get_loc(p2) == expected_idx1_p2
+ assert idx0.get_loc(str(p2)) == expected_idx1_p2
+
+ msg = "Cannot interpret 'foo' as period"
+ with pytest.raises(KeyError, match=msg):
+ idx0.get_loc('foo')
+ pytest.raises(KeyError, idx0.get_loc, 1.1)
+ pytest.raises(TypeError, idx0.get_loc, idx0)
+
+        # get the location of p1/p2 from a monotonic increasing
+        # PeriodIndex with duplicates; contiguous duplicates give a slice
+ idx1 = pd.PeriodIndex([p1, p1, p2])
+ expected_idx1_p1 = slice(0, 2)
+ expected_idx1_p2 = 2
+
+ assert idx1.get_loc(p1) == expected_idx1_p1
+ assert idx1.get_loc(str(p1)) == expected_idx1_p1
+ assert idx1.get_loc(p2) == expected_idx1_p2
+ assert idx1.get_loc(str(p2)) == expected_idx1_p2
+
+ msg = "Cannot interpret 'foo' as period"
+ with pytest.raises(KeyError, match=msg):
+ idx1.get_loc('foo')
+
+ pytest.raises(KeyError, idx1.get_loc, 1.1)
+ pytest.raises(TypeError, idx1.get_loc, idx1)
+
+        # get the location of p1/p2 from a non-monotonic PeriodIndex with
+        # duplicates; scattered matches come back as a boolean mask
+ idx2 = pd.PeriodIndex([p2, p1, p2])
+ expected_idx2_p1 = 1
+ expected_idx2_p2 = np.array([True, False, True])
+
+ assert idx2.get_loc(p1) == expected_idx2_p1
+ assert idx2.get_loc(str(p1)) == expected_idx2_p1
+ tm.assert_numpy_array_equal(idx2.get_loc(p2), expected_idx2_p2)
+ tm.assert_numpy_array_equal(idx2.get_loc(str(p2)), expected_idx2_p2)
+
+ def test_is_monotonic_increasing(self):
+ # GH 17717
+ p0 = pd.Period('2017-09-01')
+ p1 = pd.Period('2017-09-02')
+ p2 = pd.Period('2017-09-03')
+
+ idx_inc0 = pd.PeriodIndex([p0, p1, p2])
+ idx_inc1 = pd.PeriodIndex([p0, p1, p1])
+ idx_dec0 = pd.PeriodIndex([p2, p1, p0])
+ idx_dec1 = pd.PeriodIndex([p2, p1, p1])
+ idx = pd.PeriodIndex([p1, p2, p0])
+
+ assert idx_inc0.is_monotonic_increasing is True
+ assert idx_inc1.is_monotonic_increasing is True
+ assert idx_dec0.is_monotonic_increasing is False
+ assert idx_dec1.is_monotonic_increasing is False
+ assert idx.is_monotonic_increasing is False
+
+ def test_is_monotonic_decreasing(self):
+ # GH 17717
+ p0 = pd.Period('2017-09-01')
+ p1 = pd.Period('2017-09-02')
+ p2 = pd.Period('2017-09-03')
+
+ idx_inc0 = pd.PeriodIndex([p0, p1, p2])
+ idx_inc1 = pd.PeriodIndex([p0, p1, p1])
+ idx_dec0 = pd.PeriodIndex([p2, p1, p0])
+ idx_dec1 = pd.PeriodIndex([p2, p1, p1])
+ idx = pd.PeriodIndex([p1, p2, p0])
+
+ assert idx_inc0.is_monotonic_decreasing is False
+ assert idx_inc1.is_monotonic_decreasing is False
+ assert idx_dec0.is_monotonic_decreasing is True
+ assert idx_dec1.is_monotonic_decreasing is True
+ assert idx.is_monotonic_decreasing is False
+
+ def test_contains(self):
+ # GH 17717
+ p0 = pd.Period('2017-09-01')
+ p1 = pd.Period('2017-09-02')
+ p2 = pd.Period('2017-09-03')
+ p3 = pd.Period('2017-09-04')
+
+ ps0 = [p0, p1, p2]
+ idx0 = pd.PeriodIndex(ps0)
+
+ for p in ps0:
+ assert idx0.contains(p)
+ assert p in idx0
+
+ assert idx0.contains(str(p))
+ assert str(p) in idx0
+
+ assert idx0.contains('2017-09-01 00:00:01')
+ assert '2017-09-01 00:00:01' in idx0
+
+ assert idx0.contains('2017-09')
+ assert '2017-09' in idx0
+
+ assert not idx0.contains(p3)
+ assert p3 not in idx0
+
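+    # Sketch added for illustration (not in upstream pandas): membership
+    # parses strings at the index's own frequency, which is why the
+    # sub-daily strings above resolve into a daily period.
+    def _sketch_contains_string_resolution(self):
+        idx = pd.PeriodIndex([pd.Period('2017-09-01', freq='D')])
+        # any instant inside the day resolves to the same daily period
+        assert '2017-09-01 23:59:59' in idx
+        assert '2017-10-01' not in idx
+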
+ def test_get_value(self):
+ # GH 17717
+ p0 = pd.Period('2017-09-01')
+ p1 = pd.Period('2017-09-02')
+ p2 = pd.Period('2017-09-03')
+
+ idx0 = pd.PeriodIndex([p0, p1, p2])
+ input0 = np.array([1, 2, 3])
+ expected0 = 2
+
+ result0 = idx0.get_value(input0, p1)
+ assert result0 == expected0
+
+ idx1 = pd.PeriodIndex([p1, p1, p2])
+ input1 = np.array([1, 2, 3])
+ expected1 = np.array([1, 2])
+
+ result1 = idx1.get_value(input1, p1)
+ tm.assert_numpy_array_equal(result1, expected1)
+
+ idx2 = pd.PeriodIndex([p1, p2, p1])
+ input2 = np.array([1, 2, 3])
+ expected2 = np.array([1, 3])
+
+ result2 = idx2.get_value(input2, p1)
+ tm.assert_numpy_array_equal(result2, expected2)
+
+ def test_get_indexer(self):
+ # GH 17717
+ p1 = pd.Period('2017-09-01')
+ p2 = pd.Period('2017-09-04')
+ p3 = pd.Period('2017-09-07')
+
+ tp0 = pd.Period('2017-08-31')
+ tp1 = pd.Period('2017-09-02')
+ tp2 = pd.Period('2017-09-05')
+ tp3 = pd.Period('2017-09-09')
+
+ idx = pd.PeriodIndex([p1, p2, p3])
+
+ tm.assert_numpy_array_equal(idx.get_indexer(idx),
+ np.array([0, 1, 2], dtype=np.intp))
+
+ target = pd.PeriodIndex([tp0, tp1, tp2, tp3])
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'),
+ np.array([-1, 0, 1, 2], dtype=np.intp))
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'),
+ np.array([0, 1, 2, -1], dtype=np.intp))
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'),
+ np.array([0, 0, 1, 2], dtype=np.intp))
+
+ res = idx.get_indexer(target, 'nearest',
+ tolerance=pd.Timedelta('1 day'))
+ tm.assert_numpy_array_equal(res,
+ np.array([0, 0, 1, -1], dtype=np.intp))
+
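+    # Sketch added for illustration (not in upstream pandas): 'pad' fills
+    # from the previous label, 'backfill' from the next, and -1 marks
+    # targets with no qualifying label.
+    def _sketch_get_indexer_fill_directions(self):
+        idx = pd.PeriodIndex([pd.Period('2017-09-01'),
+                              pd.Period('2017-09-04')])
+        target = pd.PeriodIndex([pd.Period('2017-08-31')])
+        # nothing precedes 2017-08-31, so 'pad' yields -1
+        tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'),
+                                    np.array([-1], dtype=np.intp))
+        # 'backfill' takes the next label, at position 0
+        tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'),
+                                    np.array([0], dtype=np.intp))
+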
+ def test_get_indexer_non_unique(self):
+ # GH 17717
+ p1 = pd.Period('2017-09-02')
+ p2 = pd.Period('2017-09-03')
+ p3 = pd.Period('2017-09-04')
+ p4 = pd.Period('2017-09-05')
+
+ idx1 = pd.PeriodIndex([p1, p2, p1])
+ idx2 = pd.PeriodIndex([p2, p1, p3, p4])
+
+ result = idx1.get_indexer_non_unique(idx2)
+ expected_indexer = np.array([1, 0, 2, -1, -1], dtype=np.intp)
+ expected_missing = np.array([2, 3], dtype=np.int64)
+
+ tm.assert_numpy_array_equal(result[0], expected_indexer)
+ tm.assert_numpy_array_equal(result[1], expected_missing)
+
+ # TODO: This method came from test_period; de-dup with version above
+ def test_get_loc2(self):
+ idx = pd.period_range('2000-01-01', periods=3)
+
+ for method in [None, 'pad', 'backfill', 'nearest']:
+ assert idx.get_loc(idx[1], method) == 1
+ assert idx.get_loc(idx[1].asfreq('H', how='start'), method) == 1
+ assert idx.get_loc(idx[1].to_timestamp(), method) == 1
+ assert idx.get_loc(idx[1].to_timestamp()
+ .to_pydatetime(), method) == 1
+ assert idx.get_loc(str(idx[1]), method) == 1
+
+ idx = pd.period_range('2000-01-01', periods=5)[::2]
+ assert idx.get_loc('2000-01-02T12', method='nearest',
+ tolerance='1 day') == 1
+ assert idx.get_loc('2000-01-02T12', method='nearest',
+ tolerance=pd.Timedelta('1D')) == 1
+ assert idx.get_loc('2000-01-02T12', method='nearest',
+ tolerance=np.timedelta64(1, 'D')) == 1
+ assert idx.get_loc('2000-01-02T12', method='nearest',
+ tolerance=timedelta(1)) == 1
+
+ msg = 'unit abbreviation w/o a number'
+ with pytest.raises(ValueError, match=msg):
+ idx.get_loc('2000-01-10', method='nearest', tolerance='foo')
+
+ msg = 'Input has different freq=None from PeriodArray\\(freq=D\\)'
+ with pytest.raises(ValueError, match=msg):
+ idx.get_loc('2000-01-10', method='nearest', tolerance='1 hour')
+ with pytest.raises(KeyError):
+ idx.get_loc('2000-01-10', method='nearest', tolerance='1 day')
+ with pytest.raises(
+ ValueError,
+ match='list-like tolerance size must match target index size'):
+ idx.get_loc('2000-01-10', method='nearest',
+ tolerance=[pd.Timedelta('1 day').to_timedelta64(),
+ pd.Timedelta('1 day').to_timedelta64()])
+
+ # TODO: This method came from test_period; de-dup with version above
+ def test_get_indexer2(self):
+ idx = pd.period_range('2000-01-01', periods=3).asfreq('H', how='start')
+ tm.assert_numpy_array_equal(idx.get_indexer(idx),
+ np.array([0, 1, 2], dtype=np.intp))
+
+ target = pd.PeriodIndex(['1999-12-31T23', '2000-01-01T12',
+ '2000-01-02T01'], freq='H')
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'),
+ np.array([-1, 0, 1], dtype=np.intp))
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'),
+ np.array([0, 1, 2], dtype=np.intp))
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'),
+ np.array([0, 1, 1], dtype=np.intp))
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest',
+ tolerance='1 hour'),
+ np.array([0, -1, 1], dtype=np.intp))
+
+ msg = 'Input has different freq=None from PeriodArray\\(freq=H\\)'
+ with pytest.raises(ValueError, match=msg):
+ idx.get_indexer(target, 'nearest', tolerance='1 minute')
+
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest',
+ tolerance='1 day'),
+ np.array([0, 1, 1], dtype=np.intp))
+ tol_raw = [pd.Timedelta('1 hour'),
+ pd.Timedelta('1 hour'),
+ np.timedelta64(1, 'D'), ]
+ tm.assert_numpy_array_equal(
+ idx.get_indexer(target, 'nearest',
+ tolerance=[np.timedelta64(x) for x in tol_raw]),
+ np.array([0, -1, 1], dtype=np.intp))
+ tol_bad = [pd.Timedelta('2 hour').to_timedelta64(),
+ pd.Timedelta('1 hour').to_timedelta64(),
+ np.timedelta64(1, 'M'), ]
+ with pytest.raises(
+ libperiod.IncompatibleFrequency,
+ match='Input has different freq=None from'):
+ idx.get_indexer(target, 'nearest', tolerance=tol_bad)
+
+ def test_indexing(self):
+ # GH 4390, iat incorrectly indexing
+ index = period_range('1/1/2001', periods=10)
+ s = Series(np.random.randn(10), index=index)
+ expected = s[index[0]]
+ result = s.iat[0]
+ assert expected == result
+
+ def test_period_index_indexer(self):
+ # GH4125
+ idx = pd.period_range('2002-01', '2003-12', freq='M')
+        df = pd.DataFrame(np.random.randn(24, 10), index=idx)
+ tm.assert_frame_equal(df, df.loc[idx])
+ tm.assert_frame_equal(df, df.loc[list(idx)])
+ tm.assert_frame_equal(df.iloc[0:5], df.loc[idx[0:5]])
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_ops.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_ops.py
new file mode 100644
index 00000000000..8b022268897
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_ops.py
@@ -0,0 +1,329 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DatetimeIndex, Index, NaT, PeriodIndex, Series
+from pandas.core.arrays import PeriodArray
+from pandas.tests.test_base import Ops
+import pandas.util.testing as tm
+
+
+class TestPeriodIndexOps(Ops):
+
+ def setup_method(self, method):
+ super(TestPeriodIndexOps, self).setup_method(method)
+ mask = lambda x: (isinstance(x, DatetimeIndex) or
+ isinstance(x, PeriodIndex))
+ self.is_valid_objs = [o for o in self.objs if mask(o)]
+ self.not_valid_objs = [o for o in self.objs if not mask(o)]
+
+ def test_ops_properties(self):
+ f = lambda x: isinstance(x, PeriodIndex)
+ self.check_ops_properties(PeriodArray._field_ops, f)
+ self.check_ops_properties(PeriodArray._object_ops, f)
+ self.check_ops_properties(PeriodArray._bool_ops, f)
+
+ def test_resolution(self):
+ for freq, expected in zip(['A', 'Q', 'M', 'D', 'H',
+ 'T', 'S', 'L', 'U'],
+ ['day', 'day', 'day', 'day',
+ 'hour', 'minute', 'second',
+ 'millisecond', 'microsecond']):
+
+ idx = pd.period_range(start='2013-04-01', periods=30, freq=freq)
+ assert idx.resolution == expected
+
+ def test_value_counts_unique(self):
+ # GH 7735
+ idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10)
+        # create repeated values: the n-th element is repeated n+1 times
+ idx = PeriodIndex(np.repeat(idx._values, range(1, len(idx) + 1)),
+ freq='H')
+
+ exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00',
+ '2011-01-01 16:00', '2011-01-01 15:00',
+ '2011-01-01 14:00', '2011-01-01 13:00',
+ '2011-01-01 12:00', '2011-01-01 11:00',
+ '2011-01-01 10:00',
+ '2011-01-01 09:00'], freq='H')
+ expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64')
+
+ for obj in [idx, Series(idx)]:
+ tm.assert_series_equal(obj.value_counts(), expected)
+
+ expected = pd.period_range('2011-01-01 09:00', freq='H',
+ periods=10)
+ tm.assert_index_equal(idx.unique(), expected)
+
+ idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00',
+ '2013-01-01 09:00', '2013-01-01 08:00',
+ '2013-01-01 08:00', NaT], freq='H')
+
+ exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'],
+ freq='H')
+ expected = Series([3, 2], index=exp_idx)
+
+ for obj in [idx, Series(idx)]:
+ tm.assert_series_equal(obj.value_counts(), expected)
+
+ exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00',
+ NaT], freq='H')
+ expected = Series([3, 2, 1], index=exp_idx)
+
+ for obj in [idx, Series(idx)]:
+ tm.assert_series_equal(obj.value_counts(dropna=False), expected)
+
+ tm.assert_index_equal(idx.unique(), exp_idx)
+
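+    # Sketch added for illustration (not in upstream pandas): dropna=False
+    # keeps a NaT bucket in the counts, mirroring the expectations above.
+    def _sketch_value_counts_dropna(self):
+        idx = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M')
+        assert len(idx.value_counts()) == 1
+        assert len(idx.value_counts(dropna=False)) == 2
+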
+ def test_drop_duplicates_metadata(self):
+ # GH 10115
+ idx = pd.period_range('2011-01-01', '2011-01-31', freq='D', name='idx')
+ result = idx.drop_duplicates()
+ tm.assert_index_equal(idx, result)
+ assert idx.freq == result.freq
+
+ idx_dup = idx.append(idx) # freq will not be reset
+ result = idx_dup.drop_duplicates()
+ tm.assert_index_equal(idx, result)
+ assert idx.freq == result.freq
+
+ def test_drop_duplicates(self):
+ # to check Index/Series compat
+ base = pd.period_range('2011-01-01', '2011-01-31', freq='D',
+ name='idx')
+ idx = base.append(base[:5])
+
+ res = idx.drop_duplicates()
+ tm.assert_index_equal(res, base)
+ res = Series(idx).drop_duplicates()
+ tm.assert_series_equal(res, Series(base))
+
+ res = idx.drop_duplicates(keep='last')
+ exp = base[5:].append(base[:5])
+ tm.assert_index_equal(res, exp)
+ res = Series(idx).drop_duplicates(keep='last')
+ tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
+
+ res = idx.drop_duplicates(keep=False)
+ tm.assert_index_equal(res, base[5:])
+ res = Series(idx).drop_duplicates(keep=False)
+ tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
+
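+    # Sketch added for illustration (not in upstream pandas): keep='first'
+    # (the default) retains the first of each duplicate run, keep='last'
+    # the last, and keep=False drops every duplicated label.
+    def _sketch_drop_duplicates_keep(self):
+        idx = PeriodIndex(['2011-01-01', '2011-01-01', '2011-01-02'],
+                          freq='D')
+        assert len(idx.drop_duplicates()) == 2
+        assert len(idx.drop_duplicates(keep='last')) == 2
+        assert len(idx.drop_duplicates(keep=False)) == 1
+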
+ def test_order_compat(self):
+ def _check_freq(index, expected_index):
+ if isinstance(index, PeriodIndex):
+ assert index.freq == expected_index.freq
+
+ pidx = PeriodIndex(['2011', '2012', '2013'], name='pidx', freq='A')
+ # for compatibility check
+ iidx = Index([2011, 2012, 2013], name='idx')
+ for idx in [pidx, iidx]:
+ ordered = idx.sort_values()
+ tm.assert_index_equal(ordered, idx)
+ _check_freq(ordered, idx)
+
+ ordered = idx.sort_values(ascending=False)
+ tm.assert_index_equal(ordered, idx[::-1])
+ _check_freq(ordered, idx[::-1])
+
+ ordered, indexer = idx.sort_values(return_indexer=True)
+ tm.assert_index_equal(ordered, idx)
+ tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]),
+ check_dtype=False)
+ _check_freq(ordered, idx)
+
+ ordered, indexer = idx.sort_values(return_indexer=True,
+ ascending=False)
+ tm.assert_index_equal(ordered, idx[::-1])
+ tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]),
+ check_dtype=False)
+ _check_freq(ordered, idx[::-1])
+
+ pidx = PeriodIndex(['2011', '2013', '2015', '2012',
+ '2011'], name='pidx', freq='A')
+ pexpected = PeriodIndex(
+ ['2011', '2011', '2012', '2013', '2015'], name='pidx', freq='A')
+ # for compatibility check
+ iidx = Index([2011, 2013, 2015, 2012, 2011], name='idx')
+ iexpected = Index([2011, 2011, 2012, 2013, 2015], name='idx')
+ for idx, expected in [(pidx, pexpected), (iidx, iexpected)]:
+ ordered = idx.sort_values()
+ tm.assert_index_equal(ordered, expected)
+ _check_freq(ordered, idx)
+
+ ordered = idx.sort_values(ascending=False)
+ tm.assert_index_equal(ordered, expected[::-1])
+ _check_freq(ordered, idx)
+
+ ordered, indexer = idx.sort_values(return_indexer=True)
+ tm.assert_index_equal(ordered, expected)
+
+ exp = np.array([0, 4, 3, 1, 2])
+ tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
+ _check_freq(ordered, idx)
+
+ ordered, indexer = idx.sort_values(return_indexer=True,
+ ascending=False)
+ tm.assert_index_equal(ordered, expected[::-1])
+
+ exp = np.array([2, 1, 3, 4, 0])
+ tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
+ _check_freq(ordered, idx)
+
+ pidx = PeriodIndex(['2011', '2013', 'NaT', '2011'], name='pidx',
+ freq='D')
+
+ result = pidx.sort_values()
+ expected = PeriodIndex(['NaT', '2011', '2011', '2013'],
+ name='pidx', freq='D')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == 'D'
+
+ result = pidx.sort_values(ascending=False)
+ expected = PeriodIndex(
+ ['2013', '2011', '2011', 'NaT'], name='pidx', freq='D')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == 'D'
+
+ def test_order(self):
+ for freq in ['D', '2D', '4D']:
+ idx = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'],
+ freq=freq, name='idx')
+
+ ordered = idx.sort_values()
+ tm.assert_index_equal(ordered, idx)
+ assert ordered.freq == idx.freq
+
+ ordered = idx.sort_values(ascending=False)
+ expected = idx[::-1]
+ tm.assert_index_equal(ordered, expected)
+ assert ordered.freq == expected.freq
+ assert ordered.freq == freq
+
+ ordered, indexer = idx.sort_values(return_indexer=True)
+ tm.assert_index_equal(ordered, idx)
+ tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]),
+ check_dtype=False)
+ assert ordered.freq == idx.freq
+ assert ordered.freq == freq
+
+ ordered, indexer = idx.sort_values(return_indexer=True,
+ ascending=False)
+ expected = idx[::-1]
+ tm.assert_index_equal(ordered, expected)
+ tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]),
+ check_dtype=False)
+ assert ordered.freq == expected.freq
+ assert ordered.freq == freq
+
+ idx1 = PeriodIndex(['2011-01-01', '2011-01-03', '2011-01-05',
+ '2011-01-02', '2011-01-01'], freq='D', name='idx1')
+ exp1 = PeriodIndex(['2011-01-01', '2011-01-01', '2011-01-02',
+ '2011-01-03', '2011-01-05'], freq='D', name='idx1')
+
+ idx2 = PeriodIndex(['2011-01-01', '2011-01-03', '2011-01-05',
+ '2011-01-02', '2011-01-01'],
+ freq='D', name='idx2')
+ exp2 = PeriodIndex(['2011-01-01', '2011-01-01', '2011-01-02',
+ '2011-01-03', '2011-01-05'],
+ freq='D', name='idx2')
+
+ idx3 = PeriodIndex([NaT, '2011-01-03', '2011-01-05',
+ '2011-01-02', NaT], freq='D', name='idx3')
+ exp3 = PeriodIndex([NaT, NaT, '2011-01-02', '2011-01-03',
+ '2011-01-05'], freq='D', name='idx3')
+
+ for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]:
+ ordered = idx.sort_values()
+ tm.assert_index_equal(ordered, expected)
+ assert ordered.freq == 'D'
+
+ ordered = idx.sort_values(ascending=False)
+ tm.assert_index_equal(ordered, expected[::-1])
+ assert ordered.freq == 'D'
+
+ ordered, indexer = idx.sort_values(return_indexer=True)
+ tm.assert_index_equal(ordered, expected)
+
+ exp = np.array([0, 4, 3, 1, 2])
+ tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
+ assert ordered.freq == 'D'
+
+ ordered, indexer = idx.sort_values(return_indexer=True,
+ ascending=False)
+ tm.assert_index_equal(ordered, expected[::-1])
+
+ exp = np.array([2, 1, 3, 4, 0])
+ tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
+ assert ordered.freq == 'D'
+
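+    # Sketch added for illustration (not in upstream pandas): NaT sorts to
+    # the front ascending and to the back descending, as expected above.
+    def _sketch_nat_sort_position(self):
+        idx = PeriodIndex(['2011-01-02', 'NaT', '2011-01-01'], freq='D')
+        assert idx.sort_values()[0] is NaT
+        assert idx.sort_values(ascending=False)[-1] is NaT
+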
+ def test_shift(self):
+ # This is tested in test_arithmetic
+ pass
+
+ def test_nat(self):
+ assert pd.PeriodIndex._na_value is NaT
+ assert pd.PeriodIndex([], freq='M')._na_value is NaT
+
+ idx = pd.PeriodIndex(['2011-01-01', '2011-01-02'], freq='D')
+ assert idx._can_hold_na
+
+ tm.assert_numpy_array_equal(idx._isnan, np.array([False, False]))
+ assert idx.hasnans is False
+ tm.assert_numpy_array_equal(idx._nan_idxs,
+ np.array([], dtype=np.intp))
+
+ idx = pd.PeriodIndex(['2011-01-01', 'NaT'], freq='D')
+ assert idx._can_hold_na
+
+ tm.assert_numpy_array_equal(idx._isnan, np.array([False, True]))
+ assert idx.hasnans is True
+ tm.assert_numpy_array_equal(idx._nan_idxs,
+ np.array([1], dtype=np.intp))
+
+ @pytest.mark.parametrize('freq', ['D', 'M'])
+ def test_equals(self, freq):
+ # GH#13107
+ idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'],
+ freq=freq)
+ assert idx.equals(idx)
+ assert idx.equals(idx.copy())
+ assert idx.equals(idx.astype(object))
+ assert idx.astype(object).equals(idx)
+ assert idx.astype(object).equals(idx.astype(object))
+ assert not idx.equals(list(idx))
+ assert not idx.equals(pd.Series(idx))
+
+ idx2 = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'],
+ freq='H')
+ assert not idx.equals(idx2)
+ assert not idx.equals(idx2.copy())
+ assert not idx.equals(idx2.astype(object))
+ assert not idx.astype(object).equals(idx2)
+ assert not idx.equals(list(idx2))
+ assert not idx.equals(pd.Series(idx2))
+
+        # same i8 values internally, different freq
+ idx3 = pd.PeriodIndex._simple_new(
+ idx._values._simple_new(idx._values.asi8, freq="H")
+ )
+ tm.assert_numpy_array_equal(idx.asi8, idx3.asi8)
+ assert not idx.equals(idx3)
+ assert not idx.equals(idx3.copy())
+ assert not idx.equals(idx3.astype(object))
+ assert not idx.astype(object).equals(idx3)
+ assert not idx.equals(list(idx3))
+ assert not idx.equals(pd.Series(idx3))
+
+ def test_freq_setter_deprecated(self):
+ # GH 20678
+ idx = pd.period_range('2018Q1', periods=4, freq='Q')
+
+ # no warning for getter
+ with tm.assert_produces_warning(None):
+ idx.freq
+
+ # warning for setter
+ with tm.assert_produces_warning(FutureWarning):
+ idx.freq = pd.offsets.Day()
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_partial_slicing.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_partial_slicing.py
new file mode 100644
index 00000000000..0a1e7225463
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_partial_slicing.py
@@ -0,0 +1,132 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Period, Series, period_range
+from pandas.util import testing as tm
+
+
+class TestPeriodIndex(object):
+
+ def setup_method(self, method):
+ pass
+
+ def test_slice_with_negative_step(self):
+ ts = Series(np.arange(20),
+ period_range('2014-01', periods=20, freq='M'))
+ SLC = pd.IndexSlice
+
+ def assert_slices_equivalent(l_slc, i_slc):
+ tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc])
+ tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc])
+
+ assert_slices_equivalent(SLC[Period('2014-10')::-1], SLC[9::-1])
+ assert_slices_equivalent(SLC['2014-10'::-1], SLC[9::-1])
+
+ assert_slices_equivalent(SLC[:Period('2014-10'):-1], SLC[:8:-1])
+ assert_slices_equivalent(SLC[:'2014-10':-1], SLC[:8:-1])
+
+ assert_slices_equivalent(SLC['2015-02':'2014-10':-1], SLC[13:8:-1])
+ assert_slices_equivalent(SLC[Period('2015-02'):Period('2014-10'):-1],
+ SLC[13:8:-1])
+ assert_slices_equivalent(SLC['2015-02':Period('2014-10'):-1],
+ SLC[13:8:-1])
+ assert_slices_equivalent(SLC[Period('2015-02'):'2014-10':-1],
+ SLC[13:8:-1])
+
+ assert_slices_equivalent(SLC['2014-10':'2015-02':-1], SLC[:0])
+
+ def test_slice_with_zero_step_raises(self):
+ ts = Series(np.arange(20),
+ period_range('2014-01', periods=20, freq='M'))
+ with pytest.raises(ValueError, match='slice step cannot be zero'):
+ ts[::0]
+ with pytest.raises(ValueError, match='slice step cannot be zero'):
+ ts.loc[::0]
+
+ def test_slice_keep_name(self):
+ idx = period_range('20010101', periods=10, freq='D', name='bob')
+ assert idx.name == idx[1:].name
+
+ def test_pindex_slice_index(self):
+ pi = period_range(start='1/1/10', end='12/31/12', freq='M')
+ s = Series(np.random.rand(len(pi)), index=pi)
+ res = s['2010']
+ exp = s[0:12]
+ tm.assert_series_equal(res, exp)
+ res = s['2011']
+ exp = s[12:24]
+ tm.assert_series_equal(res, exp)
+
+ def test_range_slice_day(self):
+ # GH#6716
+ didx = pd.date_range(start='2013/01/01', freq='D', periods=400)
+ pidx = period_range(start='2013/01/01', freq='D', periods=400)
+
+ for idx in [didx, pidx]:
+            # slices against the bare index should raise TypeError
+ values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H',
+ '2013/02/01 09:00']
+ for v in values:
+ with pytest.raises(TypeError):
+ idx[v:]
+
+ s = Series(np.random.rand(len(idx)), index=idx)
+
+ tm.assert_series_equal(s['2013/01/02':], s[1:])
+ tm.assert_series_equal(s['2013/01/02':'2013/01/05'], s[1:5])
+ tm.assert_series_equal(s['2013/02':], s[31:])
+ tm.assert_series_equal(s['2014':], s[365:])
+
+ invalid = ['2013/02/01 9H', '2013/02/01 09:00']
+ for v in invalid:
+ with pytest.raises(TypeError):
+ idx[v:]
+
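+    # Sketch added for illustration (not in upstream pandas): a partial
+    # string bound resolves to the span it names, so '2013/02' starts the
+    # slice at Feb 1 -- offset 31 in a daily range beginning 2013-01-01.
+    def _sketch_partial_string_bounds(self):
+        idx = period_range(start='2013/01/01', freq='D', periods=60)
+        s = Series(np.arange(60), index=idx)
+        tm.assert_series_equal(s['2013/02':], s[31:])
+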
+ def test_range_slice_seconds(self):
+ # GH#6716
+ didx = pd.date_range(start='2013/01/01 09:00:00', freq='S',
+ periods=4000)
+ pidx = period_range(start='2013/01/01 09:00:00', freq='S',
+ periods=4000)
+
+ for idx in [didx, pidx]:
+            # slices against the bare index should raise TypeError
+ values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H',
+ '2013/02/01 09:00']
+ for v in values:
+ with pytest.raises(TypeError):
+ idx[v:]
+
+ s = Series(np.random.rand(len(idx)), index=idx)
+
+ tm.assert_series_equal(s['2013/01/01 09:05':'2013/01/01 09:10'],
+ s[300:660])
+ tm.assert_series_equal(s['2013/01/01 10:00':'2013/01/01 10:05'],
+ s[3600:3960])
+ tm.assert_series_equal(s['2013/01/01 10H':], s[3600:])
+ tm.assert_series_equal(s[:'2013/01/01 09:30'], s[:1860])
+ for d in ['2013/01/01', '2013/01', '2013']:
+ tm.assert_series_equal(s[d:], s)
+
+ def test_range_slice_outofbounds(self):
+ # GH#5407
+ didx = pd.date_range(start='2013/10/01', freq='D', periods=10)
+ pidx = period_range(start='2013/10/01', freq='D', periods=10)
+
+ for idx in [didx, pidx]:
+ df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx)
+ empty = DataFrame(index=idx.__class__([], freq='D'),
+ columns=['units'])
+ empty['units'] = empty['units'].astype('int64')
+
+ tm.assert_frame_equal(df['2013/09/01':'2013/09/30'], empty)
+ tm.assert_frame_equal(df['2013/09/30':'2013/10/02'], df.iloc[:2])
+ tm.assert_frame_equal(df['2013/10/01':'2013/10/02'], df.iloc[:2])
+ tm.assert_frame_equal(df['2013/10/02':'2013/09/30'], empty)
+ tm.assert_frame_equal(df['2013/10/15':'2013/10/17'], empty)
+ tm.assert_frame_equal(df['2013-06':'2013-09'], empty)
+ tm.assert_frame_equal(df['2013-11':'2013-12'], empty)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_period.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_period.py
new file mode 100644
index 00000000000..dc9a32d75d2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_period.py
@@ -0,0 +1,578 @@
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs.period import IncompatibleFrequency
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+ DataFrame, DatetimeIndex, Index, NaT, Period, PeriodIndex, Series,
+ date_range, offsets, period_range)
+from pandas.util import testing as tm
+
+from ..datetimelike import DatetimeLike
+
+
+class TestPeriodIndex(DatetimeLike):
+ _holder = PeriodIndex
+
+ def setup_method(self, method):
+ self.indices = dict(index=tm.makePeriodIndex(10),
+ index_dec=period_range('20130101', periods=10,
+ freq='D')[::-1])
+ self.setup_indices()
+
+ def create_index(self):
+ return period_range('20130101', periods=5, freq='D')
+
+ def test_pickle_compat_construction(self):
+ pass
+
+ @pytest.mark.parametrize('freq', ['D', 'M', 'A'])
+ def test_pickle_round_trip(self, freq):
+ idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq=freq)
+ result = tm.round_trip_pickle(idx)
+ tm.assert_index_equal(result, idx)
+
+ def test_where(self):
+ # This is handled in test_indexing
+ pass
+
+ @pytest.mark.parametrize('use_numpy', [True, False])
+ @pytest.mark.parametrize('index', [
+ pd.period_range('2000-01-01', periods=3, freq='D'),
+ pd.period_range('2001-01-01', periods=3, freq='2D'),
+ pd.PeriodIndex(['2001-01', 'NaT', '2003-01'], freq='M')])
+ def test_repeat_freqstr(self, index, use_numpy):
+ # GH10183
+ expected = PeriodIndex([p for p in index for _ in range(3)])
+ result = np.repeat(index, 3) if use_numpy else index.repeat(3)
+ tm.assert_index_equal(result, expected)
+ assert result.freqstr == index.freqstr
+
+ def test_fillna_period(self):
+ # GH 11343
+ idx = pd.PeriodIndex(['2011-01-01 09:00', pd.NaT,
+ '2011-01-01 11:00'], freq='H')
+
+ exp = pd.PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00',
+ '2011-01-01 11:00'], freq='H')
+ tm.assert_index_equal(
+ idx.fillna(pd.Period('2011-01-01 10:00', freq='H')), exp)
+
+ exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), 'x',
+ pd.Period('2011-01-01 11:00', freq='H')], dtype=object)
+ tm.assert_index_equal(idx.fillna('x'), exp)
+
+ exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'),
+ pd.Period('2011-01-01', freq='D'),
+ pd.Period('2011-01-01 11:00', freq='H')], dtype=object)
+ tm.assert_index_equal(idx.fillna(
+ pd.Period('2011-01-01', freq='D')), exp)
+
+ def test_no_millisecond_field(self):
+ with pytest.raises(AttributeError):
+ DatetimeIndex.millisecond
+
+ with pytest.raises(AttributeError):
+ DatetimeIndex([]).millisecond
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_freq(self, sort):
+ # GH14323: difference of Period MUST preserve frequency
+ # but the ability to union results must be preserved
+
+ index = period_range("20160920", "20160925", freq="D")
+
+ other = period_range("20160921", "20160924", freq="D")
+ expected = PeriodIndex(["20160920", "20160925"], freq='D')
+ idx_diff = index.difference(other, sort)
+ tm.assert_index_equal(idx_diff, expected)
+ tm.assert_attr_equal('freq', idx_diff, expected)
+
+ other = period_range("20160922", "20160925", freq="D")
+ idx_diff = index.difference(other, sort)
+ expected = PeriodIndex(["20160920", "20160921"], freq='D')
+ tm.assert_index_equal(idx_diff, expected)
+ tm.assert_attr_equal('freq', idx_diff, expected)
+
+ def test_hash_error(self):
+ index = period_range('20010101', periods=10)
+ with pytest.raises(TypeError, match=("unhashable type: %r" %
+ type(index).__name__)):
+ hash(index)
+
+ def test_make_time_series(self):
+ index = period_range(freq='A', start='1/1/2001', end='12/1/2009')
+ series = Series(1, index=index)
+ assert isinstance(series, Series)
+
+ def test_shallow_copy_empty(self):
+
+ # GH13067
+ idx = PeriodIndex([], freq='M')
+ result = idx._shallow_copy()
+ expected = idx
+
+ tm.assert_index_equal(result, expected)
+
+ def test_shallow_copy_i8(self):
+ # GH-24391
+ pi = period_range("2018-01-01", periods=3, freq="2D")
+ result = pi._shallow_copy(pi.asi8, freq=pi.freq)
+ tm.assert_index_equal(result, pi)
+
+ def test_shallow_copy_changing_freq_raises(self):
+ pi = period_range("2018-01-01", periods=3, freq="2D")
+ with pytest.raises(IncompatibleFrequency, match="are different"):
+ pi._shallow_copy(pi, freq="H")
+
+ def test_dtype_str(self):
+ pi = pd.PeriodIndex([], freq='M')
+ assert pi.dtype_str == 'period[M]'
+ assert pi.dtype_str == str(pi.dtype)
+
+ pi = pd.PeriodIndex([], freq='3M')
+ assert pi.dtype_str == 'period[3M]'
+ assert pi.dtype_str == str(pi.dtype)
+
+ def test_view_asi8(self):
+ idx = pd.PeriodIndex([], freq='M')
+
+ exp = np.array([], dtype=np.int64)
+ tm.assert_numpy_array_equal(idx.view('i8'), exp)
+ tm.assert_numpy_array_equal(idx.asi8, exp)
+
+ idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M')
+
+ exp = np.array([492, -9223372036854775808], dtype=np.int64)
+ tm.assert_numpy_array_equal(idx.view('i8'), exp)
+ tm.assert_numpy_array_equal(idx.asi8, exp)
+
+ exp = np.array([14975, -9223372036854775808], dtype=np.int64)
+ idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D')
+ tm.assert_numpy_array_equal(idx.view('i8'), exp)
+ tm.assert_numpy_array_equal(idx.asi8, exp)
+
+ def test_values(self):
+ idx = pd.PeriodIndex([], freq='M')
+
+ exp = np.array([], dtype=np.object)
+ tm.assert_numpy_array_equal(idx.values, exp)
+ tm.assert_numpy_array_equal(idx.get_values(), exp)
+ exp = np.array([], dtype=np.int64)
+ tm.assert_numpy_array_equal(idx._ndarray_values, exp)
+
+ idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M')
+
+ exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object)
+ tm.assert_numpy_array_equal(idx.values, exp)
+ tm.assert_numpy_array_equal(idx.get_values(), exp)
+ exp = np.array([492, -9223372036854775808], dtype=np.int64)
+ tm.assert_numpy_array_equal(idx._ndarray_values, exp)
+
+ idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D')
+
+ exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT],
+ dtype=object)
+ tm.assert_numpy_array_equal(idx.values, exp)
+ tm.assert_numpy_array_equal(idx.get_values(), exp)
+ exp = np.array([14975, -9223372036854775808], dtype=np.int64)
+ tm.assert_numpy_array_equal(idx._ndarray_values, exp)
+
+ def test_period_index_length(self):
+ pi = period_range(freq='A', start='1/1/2001', end='12/1/2009')
+ assert len(pi) == 9
+
+ pi = period_range(freq='Q', start='1/1/2001', end='12/1/2009')
+ assert len(pi) == 4 * 9
+
+ pi = period_range(freq='M', start='1/1/2001', end='12/1/2009')
+ assert len(pi) == 12 * 9
+
+ start = Period('02-Apr-2005', 'B')
+ i1 = period_range(start=start, periods=20)
+ assert len(i1) == 20
+ assert i1.freq == start.freq
+ assert i1[0] == start
+
+ end_intv = Period('2006-12-31', 'W')
+ i1 = period_range(end=end_intv, periods=10)
+ assert len(i1) == 10
+ assert i1.freq == end_intv.freq
+ assert i1[-1] == end_intv
+
+ end_intv = Period('2006-12-31', '1w')
+ i2 = period_range(end=end_intv, periods=10)
+ assert len(i1) == len(i2)
+ assert (i1 == i2).all()
+ assert i1.freq == i2.freq
+
+ end_intv = Period('2006-12-31', ('w', 1))
+ i2 = period_range(end=end_intv, periods=10)
+ assert len(i1) == len(i2)
+ assert (i1 == i2).all()
+ assert i1.freq == i2.freq
+
+        # mixed freq for start and end should raise ValueError
+        with pytest.raises(ValueError):
+            period_range(start=start, end=end_intv)
+
+ end_intv = Period('2005-05-01', 'B')
+ i1 = period_range(start=start, end=end_intv)
+
+        # must specify periods when either start or end is missing
+        with pytest.raises(ValueError):
+            period_range(start=start)
+
+ # infer freq from first element
+ i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')])
+ assert len(i2) == 2
+ assert i2[0] == end_intv
+
+ i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')]))
+ assert len(i2) == 2
+ assert i2[0] == end_intv
+
+ # Mixed freq should fail
+ vals = [end_intv, Period('2006-12-31', 'w')]
+ pytest.raises(ValueError, PeriodIndex, vals)
+ vals = np.array(vals)
+ pytest.raises(ValueError, PeriodIndex, vals)
+
+ def test_fields(self):
+ # year, month, day, hour, minute
+ # second, weekofyear, week, dayofweek, weekday, dayofyear, quarter
+ # qyear
+ pi = period_range(freq='A', start='1/1/2001', end='12/1/2005')
+ self._check_all_fields(pi)
+
+ pi = period_range(freq='Q', start='1/1/2001', end='12/1/2002')
+ self._check_all_fields(pi)
+
+ pi = period_range(freq='M', start='1/1/2001', end='1/1/2002')
+ self._check_all_fields(pi)
+
+ pi = period_range(freq='D', start='12/1/2001', end='6/1/2001')
+ self._check_all_fields(pi)
+
+ pi = period_range(freq='B', start='12/1/2001', end='6/1/2001')
+ self._check_all_fields(pi)
+
+ pi = period_range(freq='H', start='12/31/2001', end='1/1/2002 23:00')
+ self._check_all_fields(pi)
+
+ pi = period_range(freq='Min', start='12/31/2001', end='1/1/2002 00:20')
+ self._check_all_fields(pi)
+
+ pi = period_range(freq='S', start='12/31/2001 00:00:00',
+ end='12/31/2001 00:05:00')
+ self._check_all_fields(pi)
+
+ end_intv = Period('2006-12-31', 'W')
+ i1 = period_range(end=end_intv, periods=10)
+ self._check_all_fields(i1)
+
+ def _check_all_fields(self, periodindex):
+ fields = ['year', 'month', 'day', 'hour', 'minute', 'second',
+ 'weekofyear', 'week', 'dayofweek', 'dayofyear',
+ 'quarter', 'qyear', 'days_in_month']
+
+ periods = list(periodindex)
+ s = pd.Series(periodindex)
+
+ for field in fields:
+ field_idx = getattr(periodindex, field)
+ assert len(periodindex) == len(field_idx)
+ for x, val in zip(periods, field_idx):
+ assert getattr(x, field) == val
+
+ if len(s) == 0:
+ continue
+
+ field_s = getattr(s.dt, field)
+ assert len(periodindex) == len(field_s)
+ for x, val in zip(periods, field_s):
+ assert getattr(x, field) == val
+
+ def test_period_set_index_reindex(self):
+ # GH 6631
+ df = DataFrame(np.random.random(6))
+ idx1 = period_range('2011/01/01', periods=6, freq='M')
+ idx2 = period_range('2013', periods=6, freq='A')
+
+ df = df.set_index(idx1)
+ tm.assert_index_equal(df.index, idx1)
+ df = df.set_index(idx2)
+ tm.assert_index_equal(df.index, idx2)
+
+ def test_factorize(self):
+ idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
+ '2014-03', '2014-03'], freq='M')
+
+ exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp)
+ exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
+
+ arr, idx = idx1.factorize()
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, exp_idx)
+
+ arr, idx = idx1.factorize(sort=True)
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, exp_idx)
+
+ idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
+ '2014-03', '2014-01'], freq='M')
+
+ exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp)
+ arr, idx = idx2.factorize(sort=True)
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, exp_idx)
+
+ exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp)
+ exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M')
+ arr, idx = idx2.factorize()
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, exp_idx)
+
+ def test_is_(self):
+ create_index = lambda: period_range(freq='A', start='1/1/2001',
+ end='12/1/2009')
+ index = create_index()
+ assert index.is_(index)
+ assert not index.is_(create_index())
+ assert index.is_(index.view())
+ assert index.is_(index.view().view().view().view().view())
+ assert index.view().is_(index)
+ ind2 = index.view()
+ index.name = "Apple"
+ assert ind2.is_(index)
+ assert not index.is_(index[:])
+ assert not index.is_(index.asfreq('M'))
+ assert not index.is_(index.asfreq('A'))
+
+ assert not index.is_(index - 2)
+ assert not index.is_(index - 0)
+
+ def test_contains(self):
+ rng = period_range('2007-01', freq='M', periods=10)
+
+ assert Period('2007-01', freq='M') in rng
+        assert Period('2007-01', freq='D') not in rng
+        assert Period('2007-01', freq='2M') not in rng
+
+ def test_contains_nat(self):
+ # see gh-13582
+ idx = period_range('2007-01', freq='M', periods=10)
+ assert pd.NaT not in idx
+ assert None not in idx
+ assert float('nan') not in idx
+ assert np.nan not in idx
+
+ idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M')
+ assert pd.NaT in idx
+ assert None in idx
+ assert float('nan') in idx
+ assert np.nan in idx
+
+ def test_periods_number_check(self):
+ with pytest.raises(ValueError):
+ period_range('2011-1-1', '2012-1-1', 'B')
+
+ def test_start_time(self):
+ # GH 17157
+ index = period_range(freq='M', start='2016-01-01', end='2016-05-31')
+ expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS')
+ tm.assert_index_equal(index.start_time, expected_index)
+
+ def test_end_time(self):
+ # GH 17157
+ index = period_range(freq='M', start='2016-01-01', end='2016-05-31')
+ expected_index = date_range('2016-01-01', end='2016-05-31', freq='M')
+ expected_index = expected_index.shift(1, freq='D').shift(-1, freq='ns')
+ tm.assert_index_equal(index.end_time, expected_index)
+
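+    # Sketch added for illustration (not in upstream pandas): end_time is
+    # the last nanosecond of each period, i.e. the start of the following
+    # period minus one nanosecond.
+    def _sketch_end_time_is_last_nanosecond(self):
+        p = Period('2016-01', freq='M')
+        expected = (p.to_timestamp(how='start') + offsets.MonthBegin(1) -
+                    pd.Timedelta(1, 'ns'))
+        assert p.end_time == expected
+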
+ def test_index_duplicate_periods(self):
+ # monotonic
+ idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN')
+ ts = Series(np.random.randn(len(idx)), index=idx)
+
+ result = ts[2007]
+ expected = ts[1:3]
+ tm.assert_series_equal(result, expected)
+ result[:] = 1
+ assert (ts[1:3] == 1).all()
+
+ # not monotonic
+ idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN')
+ ts = Series(np.random.randn(len(idx)), index=idx)
+
+ result = ts[2007]
+ expected = ts[idx == 2007]
+ tm.assert_series_equal(result, expected)
+
+ def test_index_unique(self):
+ idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN')
+ expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN')
+ tm.assert_index_equal(idx.unique(), expected)
+ assert idx.nunique() == 3
+
+ idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN',
+ tz='US/Eastern')
+ expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN',
+ tz='US/Eastern')
+ tm.assert_index_equal(idx.unique(), expected)
+ assert idx.nunique() == 3
+
+ def test_shift(self):
+ # This is tested in test_arithmetic
+ pass
+
+ @td.skip_if_32bit
+ def test_ndarray_compat_properties(self):
+ super(TestPeriodIndex, self).test_ndarray_compat_properties()
+
+ def test_negative_ordinals(self):
+ Period(ordinal=-1000, freq='A')
+ Period(ordinal=0, freq='A')
+
+ idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq='A')
+ idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq='A')
+ tm.assert_index_equal(idx1, idx2)
+
+ def test_pindex_fieldaccessor_nat(self):
+ idx = PeriodIndex(['2011-01', '2011-02', 'NaT',
+ '2012-03', '2012-04'], freq='D', name='name')
+
+ exp = Index([2011, 2011, -1, 2012, 2012], dtype=np.int64, name='name')
+ tm.assert_index_equal(idx.year, exp)
+ exp = Index([1, 2, -1, 3, 4], dtype=np.int64, name='name')
+ tm.assert_index_equal(idx.month, exp)
+
+ def test_pindex_qaccess(self):
+ pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q')
+ s = Series(np.random.rand(len(pi)), index=pi).cumsum()
+ # Todo: fix these accessors!
+ assert s['05Q4'] == s[2]
+
+ def test_pindex_multiples(self):
+ with tm.assert_produces_warning(FutureWarning):
+ pi = PeriodIndex(start='1/1/11', end='12/31/11', freq='2M')
+ expected = PeriodIndex(['2011-01', '2011-03', '2011-05', '2011-07',
+ '2011-09', '2011-11'], freq='2M')
+ tm.assert_index_equal(pi, expected)
+ assert pi.freq == offsets.MonthEnd(2)
+ assert pi.freqstr == '2M'
+
+ pi = period_range(start='1/1/11', end='12/31/11', freq='2M')
+ tm.assert_index_equal(pi, expected)
+ assert pi.freq == offsets.MonthEnd(2)
+ assert pi.freqstr == '2M'
+
+ pi = period_range(start='1/1/11', periods=6, freq='2M')
+ tm.assert_index_equal(pi, expected)
+ assert pi.freq == offsets.MonthEnd(2)
+ assert pi.freqstr == '2M'
+
+ def test_iteration(self):
+ index = period_range(start='1/1/10', periods=4, freq='B')
+
+ result = list(index)
+ assert isinstance(result[0], Period)
+ assert result[0].freq == index.freq
+
+ def test_is_full(self):
+ index = PeriodIndex([2005, 2007, 2009], freq='A')
+ assert not index.is_full
+
+ index = PeriodIndex([2005, 2006, 2007], freq='A')
+ assert index.is_full
+
+ index = PeriodIndex([2005, 2005, 2007], freq='A')
+ assert not index.is_full
+
+ index = PeriodIndex([2005, 2005, 2006], freq='A')
+ assert index.is_full
+
+ index = PeriodIndex([2006, 2005, 2005], freq='A')
+ pytest.raises(ValueError, getattr, index, 'is_full')
+
+ assert index[:0].is_full
+
+ def test_with_multi_index(self):
+ # #1705
+ index = date_range('1/1/2012', periods=4, freq='12H')
+ index_as_arrays = [index.to_period(freq='D'), index.hour]
+
+ s = Series([0, 1, 2, 3], index_as_arrays)
+
+ assert isinstance(s.index.levels[0], PeriodIndex)
+
+ assert isinstance(s.index.values[0][0], Period)
+
+ def test_convert_array_of_periods(self):
+ rng = period_range('1/1/2000', periods=20, freq='D')
+ periods = list(rng)
+
+ result = pd.Index(periods)
+ assert isinstance(result, PeriodIndex)
+
+ def test_append_concat(self):
+ # #1815
+ d1 = date_range('12/31/1990', '12/31/1999', freq='A-DEC')
+ d2 = date_range('12/31/2000', '12/31/2009', freq='A-DEC')
+
+ s1 = Series(np.random.randn(10), d1)
+ s2 = Series(np.random.randn(10), d2)
+
+ s1 = s1.to_period()
+ s2 = s2.to_period()
+
+ # drops index
+ result = pd.concat([s1, s2])
+ assert isinstance(result.index, PeriodIndex)
+ assert result.index[0] == s1.index[0]
+
+ def test_pickle_freq(self):
+ # GH2891
+ prng = period_range('1/1/2011', '1/1/2012', freq='M')
+ new_prng = tm.round_trip_pickle(prng)
+ assert new_prng.freq == offsets.MonthEnd()
+ assert new_prng.freqstr == 'M'
+
+ def test_map(self):
+        # dict-like mappings are covered by test_map_dictlike;
+        # here we check mapping with a callable
+
+ index = PeriodIndex([2005, 2007, 2009], freq='A')
+ result = index.map(lambda x: x.ordinal)
+ exp = Index([x.ordinal for x in index])
+ tm.assert_index_equal(result, exp)
+
+ def test_join_self(self, join_type):
+ index = period_range('1/1/2000', periods=10)
+ joined = index.join(index, how=join_type)
+ assert index is joined
+
+ def test_insert(self):
+ # GH 18295 (test missing)
+ expected = PeriodIndex(
+ ['2017Q1', pd.NaT, '2017Q2', '2017Q3', '2017Q4'], freq='Q')
+ for na in (np.nan, pd.NaT, None):
+ result = period_range('2017Q1', periods=4, freq='Q').insert(1, na)
+ tm.assert_index_equal(result, expected)
+
+
+def test_maybe_convert_timedelta():
+ pi = PeriodIndex(['2000', '2001'], freq='D')
+ offset = offsets.Day(2)
+ assert pi._maybe_convert_timedelta(offset) == 2
+ assert pi._maybe_convert_timedelta(2) == 2
+
+ offset = offsets.BusinessDay()
+ with pytest.raises(ValueError, match='freq'):
+ pi._maybe_convert_timedelta(offset)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_period_range.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_period_range.py
new file mode 100644
index 00000000000..aa300111ba6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_period_range.py
@@ -0,0 +1,95 @@
+import pytest
+
+from pandas import NaT, Period, PeriodIndex, date_range, period_range
+import pandas.util.testing as tm
+
+
+class TestPeriodRange(object):
+
+ @pytest.mark.parametrize('freq', ['D', 'W', 'M', 'Q', 'A'])
+ def test_construction_from_string(self, freq):
+ # non-empty
+ expected = date_range(start='2017-01-01', periods=5,
+ freq=freq, name='foo').to_period()
+ start, end = str(expected[0]), str(expected[-1])
+
+ result = period_range(start=start, end=end, freq=freq, name='foo')
+ tm.assert_index_equal(result, expected)
+
+ result = period_range(start=start, periods=5, freq=freq, name='foo')
+ tm.assert_index_equal(result, expected)
+
+ result = period_range(end=end, periods=5, freq=freq, name='foo')
+ tm.assert_index_equal(result, expected)
+
+ # empty
+ expected = PeriodIndex([], freq=freq, name='foo')
+
+ result = period_range(start=start, periods=0, freq=freq, name='foo')
+ tm.assert_index_equal(result, expected)
+
+ result = period_range(end=end, periods=0, freq=freq, name='foo')
+ tm.assert_index_equal(result, expected)
+
+ result = period_range(start=end, end=start, freq=freq, name='foo')
+ tm.assert_index_equal(result, expected)
+
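+    # Sketch added for illustration (not in upstream pandas): exactly two
+    # of start/end/periods must be given, and reversed bounds yield an
+    # empty index rather than raising, as exercised above.
+    def _sketch_reversed_bounds_are_empty(self):
+        result = period_range(start='2017-03', end='2017-01', freq='M')
+        assert len(result) == 0
+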
+ def test_construction_from_period(self):
+ # upsampling
+ start, end = Period('2017Q1', freq='Q'), Period('2018Q1', freq='Q')
+ expected = date_range(start='2017-03-31', end='2018-03-31', freq='M',
+ name='foo').to_period()
+ result = period_range(start=start, end=end, freq='M', name='foo')
+ tm.assert_index_equal(result, expected)
+
+ # downsampling
+ start, end = Period('2017-1', freq='M'), Period('2019-12', freq='M')
+ expected = date_range(start='2017-01-31', end='2019-12-31', freq='Q',
+ name='foo').to_period()
+ result = period_range(start=start, end=end, freq='Q', name='foo')
+ tm.assert_index_equal(result, expected)
+
+ # empty
+ expected = PeriodIndex([], freq='W', name='foo')
+
+ result = period_range(start=start, periods=0, freq='W', name='foo')
+ tm.assert_index_equal(result, expected)
+
+ result = period_range(end=end, periods=0, freq='W', name='foo')
+ tm.assert_index_equal(result, expected)
+
+ result = period_range(start=end, end=start, freq='W', name='foo')
+ tm.assert_index_equal(result, expected)
+
+ def test_errors(self):
+ # not enough params
+ msg = ('Of the three parameters: start, end, and periods, '
+ 'exactly two must be specified')
+ with pytest.raises(ValueError, match=msg):
+ period_range(start='2017Q1')
+
+ with pytest.raises(ValueError, match=msg):
+ period_range(end='2017Q1')
+
+ with pytest.raises(ValueError, match=msg):
+ period_range(periods=5)
+
+ with pytest.raises(ValueError, match=msg):
+ period_range()
+
+ # too many params
+ with pytest.raises(ValueError, match=msg):
+ period_range(start='2017Q1', end='2018Q1', periods=8, freq='Q')
+
+ # start/end NaT
+ msg = 'start and end must not be NaT'
+ with pytest.raises(ValueError, match=msg):
+ period_range(start=NaT, end='2018Q1')
+
+ with pytest.raises(ValueError, match=msg):
+ period_range(start='2017Q1', end=NaT)
+
+ # invalid periods param
+ msg = 'periods must be a number, got foo'
+ with pytest.raises(TypeError, match=msg):
+ period_range(start='2017Q1', periods='foo')
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_scalar_compat.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_scalar_compat.py
new file mode 100644
index 00000000000..b140a1f3c5b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_scalar_compat.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+"""Tests for PeriodIndex behaving like a vectorized Period scalar"""
+
+from pandas import Timedelta, date_range, period_range
+import pandas.util.testing as tm
+
+
+class TestPeriodIndexOps(object):
+ def test_start_time(self):
+ index = period_range(freq='M', start='2016-01-01', end='2016-05-31')
+ expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS')
+ tm.assert_index_equal(index.start_time, expected_index)
+
+ def test_end_time(self):
+ index = period_range(freq='M', start='2016-01-01', end='2016-05-31')
+ expected_index = date_range('2016-01-01', end='2016-05-31', freq='M')
+ expected_index += Timedelta(1, 'D') - Timedelta(1, 'ns')
+ tm.assert_index_equal(index.end_time, expected_index)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_setops.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_setops.py
new file mode 100644
index 00000000000..bf29edad484
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_setops.py
@@ -0,0 +1,281 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Index, PeriodIndex, date_range, period_range
+import pandas.core.indexes.period as period
+import pandas.util.testing as tm
+
+
+def _permute(obj):
+ return obj.take(np.random.permutation(len(obj)))
+
+
+class TestPeriodIndex(object):
+
+ def test_joins(self, join_type):
+ index = period_range('1/1/2000', '1/20/2000', freq='D')
+
+ joined = index.join(index[:-5], how=join_type)
+
+ assert isinstance(joined, PeriodIndex)
+ assert joined.freq == index.freq
+
+ def test_join_self(self, join_type):
+ index = period_range('1/1/2000', '1/20/2000', freq='D')
+
+ res = index.join(index, how=join_type)
+ assert index is res
+
+ def test_join_does_not_recur(self):
+ df = tm.makeCustomDataframe(
+ 3, 2, data_gen_f=lambda *args: np.random.randint(2),
+ c_idx_type='p', r_idx_type='dt')
+ s = df.iloc[:2, 0]
+
+ res = s.index.join(df.columns, how='outer')
+ expected = Index([s.index[0], s.index[1],
+ df.columns[0], df.columns[1]], object)
+ tm.assert_index_equal(res, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_union(self, sort):
+ # union
+ other1 = pd.period_range('1/1/2000', freq='D', periods=5)
+ rng1 = pd.period_range('1/6/2000', freq='D', periods=5)
+ expected1 = pd.period_range('1/1/2000', freq='D', periods=10)
+
+ rng2 = pd.period_range('1/1/2000', freq='D', periods=5)
+ other2 = pd.period_range('1/4/2000', freq='D', periods=5)
+ expected2 = pd.period_range('1/1/2000', freq='D', periods=8)
+
+ rng3 = pd.period_range('1/1/2000', freq='D', periods=5)
+ other3 = pd.PeriodIndex([], freq='D')
+ expected3 = pd.period_range('1/1/2000', freq='D', periods=5)
+
+ rng4 = pd.period_range('2000-01-01 09:00', freq='H', periods=5)
+ other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5)
+ expected4 = pd.PeriodIndex(['2000-01-01 09:00', '2000-01-01 10:00',
+ '2000-01-01 11:00', '2000-01-01 12:00',
+ '2000-01-01 13:00', '2000-01-02 09:00',
+ '2000-01-02 10:00', '2000-01-02 11:00',
+ '2000-01-02 12:00', '2000-01-02 13:00'],
+ freq='H')
+
+ rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03',
+ '2000-01-01 09:05'], freq='T')
+        other5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:05',
+                                 '2000-01-01 09:08'], freq='T')
+ expected5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03',
+ '2000-01-01 09:05', '2000-01-01 09:08'],
+ freq='T')
+
+ rng6 = pd.period_range('2000-01-01', freq='M', periods=7)
+ other6 = pd.period_range('2000-04-01', freq='M', periods=7)
+ expected6 = pd.period_range('2000-01-01', freq='M', periods=10)
+
+ rng7 = pd.period_range('2003-01-01', freq='A', periods=5)
+ other7 = pd.period_range('1998-01-01', freq='A', periods=8)
+ expected7 = pd.period_range('1998-01-01', freq='A', periods=10)
+
+ rng8 = pd.PeriodIndex(['1/3/2000', '1/2/2000', '1/1/2000',
+ '1/5/2000', '1/4/2000'], freq='D')
+ other8 = pd.period_range('1/6/2000', freq='D', periods=5)
+ expected8 = pd.PeriodIndex(['1/3/2000', '1/2/2000', '1/1/2000',
+ '1/5/2000', '1/4/2000', '1/6/2000',
+ '1/7/2000', '1/8/2000', '1/9/2000',
+ '1/10/2000'], freq='D')
+
+ for rng, other, expected in [(rng1, other1, expected1),
+ (rng2, other2, expected2),
+ (rng3, other3, expected3),
+ (rng4, other4, expected4),
+ (rng5, other5, expected5),
+ (rng6, other6, expected6),
+ (rng7, other7, expected7),
+ (rng8, other8, expected8)]:
+
+ result_union = rng.union(other, sort=sort)
+ if sort is None:
+ expected = expected.sort_values()
+ tm.assert_index_equal(result_union, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_union_misc(self, sort):
+ index = period_range('1/1/2000', '1/20/2000', freq='D')
+
+ result = index[:-5].union(index[10:], sort=sort)
+ tm.assert_index_equal(result, index)
+
+ # not in order
+ result = _permute(index[:-5]).union(_permute(index[10:]), sort=sort)
+ if sort is None:
+ tm.assert_index_equal(result, index)
+ assert tm.equalContents(result, index)
+
+ # raise if different frequencies
+ index = period_range('1/1/2000', '1/20/2000', freq='D')
+ index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED')
+ with pytest.raises(period.IncompatibleFrequency):
+ index.union(index2, sort=sort)
+
+ msg = 'can only call with other PeriodIndex-ed objects'
+ with pytest.raises(ValueError, match=msg):
+ index.join(index.to_timestamp())
+
+ index3 = period_range('1/1/2000', '1/20/2000', freq='2D')
+ with pytest.raises(period.IncompatibleFrequency):
+ index.join(index3)
+
+ def test_union_dataframe_index(self):
+ rng1 = pd.period_range('1/1/1999', '1/1/2012', freq='M')
+ s1 = pd.Series(np.random.randn(len(rng1)), rng1)
+
+ rng2 = pd.period_range('1/1/1980', '12/1/2001', freq='M')
+ s2 = pd.Series(np.random.randn(len(rng2)), rng2)
+ df = pd.DataFrame({'s1': s1, 's2': s2})
+
+ exp = pd.period_range('1/1/1980', '1/1/2012', freq='M')
+ tm.assert_index_equal(df.index, exp)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection(self, sort):
+ index = period_range('1/1/2000', '1/20/2000', freq='D')
+
+ result = index[:-5].intersection(index[10:], sort=sort)
+ tm.assert_index_equal(result, index[10:-5])
+
+ # not in order
+ left = _permute(index[:-5])
+ right = _permute(index[10:])
+ result = left.intersection(right, sort=sort)
+ if sort is None:
+ tm.assert_index_equal(result, index[10:-5])
+ assert tm.equalContents(result, index[10:-5])
+
+ # raise if different frequencies
+ index = period_range('1/1/2000', '1/20/2000', freq='D')
+ index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED')
+ with pytest.raises(period.IncompatibleFrequency):
+ index.intersection(index2, sort=sort)
+
+ index3 = period_range('1/1/2000', '1/20/2000', freq='2D')
+ with pytest.raises(period.IncompatibleFrequency):
+ index.intersection(index3, sort=sort)
+
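+    # Sketch added for illustration (not in upstream pandas): sort=None
+    # sorts the set-op result when the inputs are unordered, while
+    # sort=False preserves encounter order.
+    def _sketch_union_sort_none(self):
+        left = pd.PeriodIndex(['2000-01-03', '2000-01-01'], freq='D')
+        right = pd.PeriodIndex(['2000-01-02'], freq='D')
+        expected = pd.PeriodIndex(['2000-01-01', '2000-01-02',
+                                   '2000-01-03'], freq='D')
+        tm.assert_index_equal(left.union(right, sort=None), expected)
+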
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection_cases(self, sort):
+ base = period_range('6/1/2000', '6/30/2000', freq='D', name='idx')
+
+ # if target has the same name, it is preserved
+ rng2 = period_range('5/15/2000', '6/20/2000', freq='D', name='idx')
+ expected2 = period_range('6/1/2000', '6/20/2000', freq='D',
+ name='idx')
+
+ # if target name is different, it will be reset
+ rng3 = period_range('5/15/2000', '6/20/2000', freq='D', name='other')
+ expected3 = period_range('6/1/2000', '6/20/2000', freq='D',
+ name=None)
+
+ rng4 = period_range('7/1/2000', '7/31/2000', freq='D', name='idx')
+ expected4 = PeriodIndex([], name='idx', freq='D')
+
+ for (rng, expected) in [(rng2, expected2), (rng3, expected3),
+ (rng4, expected4)]:
+ result = base.intersection(rng, sort=sort)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+
+ # non-monotonic
+ base = PeriodIndex(['2011-01-05', '2011-01-04', '2011-01-02',
+ '2011-01-03'], freq='D', name='idx')
+
+ rng2 = PeriodIndex(['2011-01-04', '2011-01-02',
+ '2011-02-02', '2011-02-03'],
+ freq='D', name='idx')
+ expected2 = PeriodIndex(['2011-01-04', '2011-01-02'], freq='D',
+ name='idx')
+
+ rng3 = PeriodIndex(['2011-01-04', '2011-01-02', '2011-02-02',
+ '2011-02-03'],
+ freq='D', name='other')
+ expected3 = PeriodIndex(['2011-01-04', '2011-01-02'], freq='D',
+ name=None)
+
+ rng4 = period_range('7/1/2000', '7/31/2000', freq='D', name='idx')
+ expected4 = PeriodIndex([], freq='D', name='idx')
+
+ for (rng, expected) in [(rng2, expected2), (rng3, expected3),
+ (rng4, expected4)]:
+ result = base.intersection(rng, sort=sort)
+ if sort is None:
+ expected = expected.sort_values()
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == 'D'
+
+ # empty same freq
+ rng = date_range('6/1/2000', '6/15/2000', freq='T')
+ result = rng[0:0].intersection(rng)
+ assert len(result) == 0
+
+ result = rng.intersection(rng[0:0])
+ assert len(result) == 0
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference(self, sort):
+ # diff
+ period_rng = ['1/3/2000', '1/2/2000', '1/1/2000', '1/5/2000',
+ '1/4/2000']
+ rng1 = pd.PeriodIndex(period_rng, freq='D')
+ other1 = pd.period_range('1/6/2000', freq='D', periods=5)
+ expected1 = rng1
+
+ rng2 = pd.PeriodIndex(period_rng, freq='D')
+ other2 = pd.period_range('1/4/2000', freq='D', periods=5)
+ expected2 = pd.PeriodIndex(['1/3/2000', '1/2/2000', '1/1/2000'],
+ freq='D')
+
+ rng3 = pd.PeriodIndex(period_rng, freq='D')
+ other3 = pd.PeriodIndex([], freq='D')
+ expected3 = rng3
+
+ period_rng = ['2000-01-01 10:00', '2000-01-01 09:00',
+ '2000-01-01 12:00', '2000-01-01 11:00',
+ '2000-01-01 13:00']
+ rng4 = pd.PeriodIndex(period_rng, freq='H')
+ other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5)
+ expected4 = rng4
+
+ rng5 = pd.PeriodIndex(['2000-01-01 09:03', '2000-01-01 09:01',
+ '2000-01-01 09:05'], freq='T')
+ other5 = pd.PeriodIndex(
+ ['2000-01-01 09:01', '2000-01-01 09:05'], freq='T')
+ expected5 = pd.PeriodIndex(['2000-01-01 09:03'], freq='T')
+
+ period_rng = ['2000-02-01', '2000-01-01', '2000-06-01',
+ '2000-07-01', '2000-05-01', '2000-03-01',
+ '2000-04-01']
+ rng6 = pd.PeriodIndex(period_rng, freq='M')
+ other6 = pd.period_range('2000-04-01', freq='M', periods=7)
+ expected6 = pd.PeriodIndex(['2000-02-01', '2000-01-01', '2000-03-01'],
+ freq='M')
+
+ period_rng = ['2003', '2007', '2006', '2005', '2004']
+ rng7 = pd.PeriodIndex(period_rng, freq='A')
+ other7 = pd.period_range('1998-01-01', freq='A', periods=8)
+ expected7 = pd.PeriodIndex(['2007', '2006'], freq='A')
+
+ for rng, other, expected in [(rng1, other1, expected1),
+ (rng2, other2, expected2),
+ (rng3, other3, expected3),
+ (rng4, other4, expected4),
+ (rng5, other5, expected5),
+ (rng6, other6, expected6),
+ (rng7, other7, expected7), ]:
+ result_difference = rng.difference(other, sort=sort)
+ if sort is None:
+ expected = expected.sort_values()
+ tm.assert_index_equal(result_difference, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/period/test_tools.py b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_tools.py
new file mode 100644
index 00000000000..641400ebec9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/period/test_tools.py
@@ -0,0 +1,345 @@
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs.ccalendar import MONTHS
+from pandas.compat import lrange
+
+import pandas as pd
+from pandas import (
+ DatetimeIndex, Period, PeriodIndex, Series, Timedelta, Timestamp,
+ date_range, period_range, to_datetime)
+import pandas.core.indexes.period as period
+import pandas.util.testing as tm
+
+
+class TestPeriodRepresentation(object):
+ """
+ Wish to match NumPy units
+ """
+
+ def _check_freq(self, freq, base_date):
+ rng = period_range(start=base_date, periods=10, freq=freq)
+ exp = np.arange(10, dtype=np.int64)
+
+ tm.assert_numpy_array_equal(rng.asi8, exp)
+
+ def test_annual(self):
+ self._check_freq('A', 1970)
+
+ def test_monthly(self):
+ self._check_freq('M', '1970-01')
+
+ @pytest.mark.parametrize('freq', ['W-THU', 'D', 'B', 'H', 'T',
+ 'S', 'L', 'U', 'N'])
+ def test_freq(self, freq):
+ self._check_freq(freq, '1970-01-01')
+
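+    def test_epoch_ordinal_sketch(self):
+        # A minimal sketch of the epoch anchoring checked above,
+        # assuming 0.24-era semantics: ordinals count periods elapsed
+        # since 1970, mirroring NumPy's epoch-based datetime64 units.
+        assert Period('1970-01-01', freq='D').ordinal == 0
+        assert Period('1970-01-02', freq='D').ordinal == 1
+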
+ def test_negone_ordinals(self):
+ freqs = ['A', 'M', 'Q', 'D', 'H', 'T', 'S']
+
+ period = Period(ordinal=-1, freq='D')
+ for freq in freqs:
+ repr(period.asfreq(freq))
+
+ for freq in freqs:
+ period = Period(ordinal=-1, freq=freq)
+ repr(period)
+ assert period.year == 1969
+
+ period = Period(ordinal=-1, freq='B')
+ repr(period)
+ period = Period(ordinal=-1, freq='W')
+ repr(period)
+
+
+class TestPeriodIndex(object):
+ def test_to_timestamp(self):
+ index = period_range(freq='A', start='1/1/2001', end='12/1/2009')
+ series = Series(1, index=index, name='foo')
+
+ exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC')
+ result = series.to_timestamp(how='end')
+ exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result.index, exp_index)
+ assert result.name == 'foo'
+
+ exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN')
+ result = series.to_timestamp(how='start')
+ tm.assert_index_equal(result.index, exp_index)
+
+        def _get_with_delta(delta, freq='A-DEC'):
+            return date_range(to_datetime('1/1/2001') + delta,
+                              to_datetime('12/31/2009') + delta, freq=freq)
+
+ delta = timedelta(hours=23)
+ result = series.to_timestamp('H', 'end')
+ exp_index = _get_with_delta(delta)
+ exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result.index, exp_index)
+
+ delta = timedelta(hours=23, minutes=59)
+ result = series.to_timestamp('T', 'end')
+ exp_index = _get_with_delta(delta)
+ exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result.index, exp_index)
+
+ result = series.to_timestamp('S', 'end')
+ delta = timedelta(hours=23, minutes=59, seconds=59)
+ exp_index = _get_with_delta(delta)
+ exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result.index, exp_index)
+
+ index = period_range(freq='H', start='1/1/2001', end='1/2/2001')
+ series = Series(1, index=index, name='foo')
+
+ exp_index = date_range('1/1/2001 00:59:59', end='1/2/2001 00:59:59',
+ freq='H')
+ result = series.to_timestamp(how='end')
+ exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result.index, exp_index)
+ assert result.name == 'foo'
+
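+    def test_to_timestamp_end_last_nanosecond_sketch(self):
+        # A minimal sketch of the convention asserted above, assuming
+        # the 0.24-era behavior: how='end' stamps the last nanosecond
+        # of the period, i.e. one full period past the start minus 1ns
+        # (hence the "+ Timedelta(1, unit) - Timedelta(1, 'ns')"
+        # arithmetic used for the expectations).
+        p = Period('2001', freq='A-DEC')
+        result = p.to_timestamp(how='end')
+        assert result == Timestamp('2001-12-31 23:59:59.999999999')
+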
+ def test_to_timestamp_freq(self):
+ idx = pd.period_range('2017', periods=12, freq="A-DEC")
+ result = idx.to_timestamp()
+ expected = pd.date_range("2017", periods=12, freq="AS-JAN")
+ tm.assert_index_equal(result, expected)
+
+ def test_to_timestamp_repr_is_code(self):
+ zs = [Timestamp('99-04-17 00:00:00', tz='UTC'),
+ Timestamp('2001-04-17 00:00:00', tz='UTC'),
+ Timestamp('2001-04-17 00:00:00', tz='America/Los_Angeles'),
+ Timestamp('2001-04-17 00:00:00', tz=None)]
+ for z in zs:
+ assert eval(repr(z)) == z
+
+ def test_to_timestamp_to_period_astype(self):
+ idx = DatetimeIndex([pd.NaT, '2011-01-01', '2011-02-01'], name='idx')
+
+ res = idx.astype('period[M]')
+ exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', name='idx')
+ tm.assert_index_equal(res, exp)
+
+ res = idx.astype('period[3M]')
+ exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='3M', name='idx')
+ tm.assert_index_equal(res, exp)
+
+ def test_dti_to_period(self):
+ dti = pd.date_range(start='1/1/2005', end='12/1/2005', freq='M')
+ pi1 = dti.to_period()
+ pi2 = dti.to_period(freq='D')
+ pi3 = dti.to_period(freq='3D')
+
+ assert pi1[0] == Period('Jan 2005', freq='M')
+ assert pi2[0] == Period('1/31/2005', freq='D')
+ assert pi3[0] == Period('1/31/2005', freq='3D')
+
+ assert pi1[-1] == Period('Nov 2005', freq='M')
+ assert pi2[-1] == Period('11/30/2005', freq='D')
+        assert pi3[-1] == Period('11/30/2005', freq='3D')
+
+ tm.assert_index_equal(pi1, period_range('1/1/2005', '11/1/2005',
+ freq='M'))
+ tm.assert_index_equal(pi2, period_range('1/1/2005', '11/1/2005',
+ freq='M').asfreq('D'))
+ tm.assert_index_equal(pi3, period_range('1/1/2005', '11/1/2005',
+ freq='M').asfreq('3D'))
+
+ @pytest.mark.parametrize('month', MONTHS)
+ def test_to_period_quarterly(self, month):
+ # make sure we can make the round trip
+ freq = 'Q-%s' % month
+ rng = period_range('1989Q3', '1991Q3', freq=freq)
+ stamps = rng.to_timestamp()
+ result = stamps.to_period(freq)
+ tm.assert_index_equal(rng, result)
+
+ @pytest.mark.parametrize('off', ['BQ', 'QS', 'BQS'])
+ def test_to_period_quarterlyish(self, off):
+ rng = date_range('01-Jan-2012', periods=8, freq=off)
+ prng = rng.to_period()
+ assert prng.freq == 'Q-DEC'
+
+ @pytest.mark.parametrize('off', ['BA', 'AS', 'BAS'])
+ def test_to_period_annualish(self, off):
+ rng = date_range('01-Jan-2012', periods=8, freq=off)
+ prng = rng.to_period()
+ assert prng.freq == 'A-DEC'
+
+ def test_to_period_monthish(self):
+ offsets = ['MS', 'BM']
+ for off in offsets:
+ rng = date_range('01-Jan-2012', periods=8, freq=off)
+ prng = rng.to_period()
+ assert prng.freq == 'M'
+
+ rng = date_range('01-Jan-2012', periods=8, freq='M')
+ prng = rng.to_period()
+ assert prng.freq == 'M'
+
+ msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG
+ with pytest.raises(ValueError, match=msg):
+ date_range('01-Jan-2012', periods=8, freq='EOM')
+
+ def test_period_dt64_round_trip(self):
+ dti = date_range('1/1/2000', '1/7/2002', freq='B')
+ pi = dti.to_period()
+ tm.assert_index_equal(pi.to_timestamp(), dti)
+
+ dti = date_range('1/1/2000', '1/7/2002', freq='B')
+ pi = dti.to_period(freq='H')
+ tm.assert_index_equal(pi.to_timestamp(), dti)
+
+ def test_combine_first(self):
+ # GH#3367
+ didx = pd.date_range(start='1950-01-31', end='1950-07-31', freq='M')
+ pidx = pd.period_range(start=pd.Period('1950-1'),
+ end=pd.Period('1950-7'), freq='M')
+ # check to be consistent with DatetimeIndex
+ for idx in [didx, pidx]:
+ a = pd.Series([1, np.nan, np.nan, 4, 5, np.nan, 7], index=idx)
+ b = pd.Series([9, 9, 9, 9, 9, 9, 9], index=idx)
+
+ result = a.combine_first(b)
+ expected = pd.Series([1, 9, 9, 4, 5, 9, 7], index=idx,
+ dtype=np.float64)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('freq', ['D', '2D'])
+ def test_searchsorted(self, freq):
+ pidx = pd.PeriodIndex(['2014-01-01', '2014-01-02', '2014-01-03',
+ '2014-01-04', '2014-01-05'], freq=freq)
+
+ p1 = pd.Period('2014-01-01', freq=freq)
+ assert pidx.searchsorted(p1) == 0
+
+ p2 = pd.Period('2014-01-04', freq=freq)
+ assert pidx.searchsorted(p2) == 3
+
+ msg = "Input has different freq=H from PeriodIndex"
+ with pytest.raises(period.IncompatibleFrequency, match=msg):
+ pidx.searchsorted(pd.Period('2014-01-01', freq='H'))
+
+ msg = "Input has different freq=5D from PeriodIndex"
+ with pytest.raises(period.IncompatibleFrequency, match=msg):
+ pidx.searchsorted(pd.Period('2014-01-01', freq='5D'))
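+
+    def test_searchsorted_insertion_point_sketch(self):
+        # A sketch of the numpy-style semantics assumed above: the
+        # return value is the position at which the Period would be
+        # inserted to keep the index sorted.
+        pidx = pd.PeriodIndex(['2014-01-01', '2014-01-03'], freq='D')
+        assert pidx.searchsorted(pd.Period('2014-01-02', freq='D')) == 1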
+
+
+class TestPeriodIndexConversion(object):
+ def test_tolist(self):
+ index = period_range(freq='A', start='1/1/2001', end='12/1/2009')
+ rs = index.tolist()
+ for x in rs:
+ assert isinstance(x, Period)
+
+ recon = PeriodIndex(rs)
+ tm.assert_index_equal(index, recon)
+
+ def test_to_timestamp_pi_nat(self):
+ # GH#7228
+ index = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M',
+ name='idx')
+
+ result = index.to_timestamp('D')
+ expected = DatetimeIndex([pd.NaT, datetime(2011, 1, 1),
+ datetime(2011, 2, 1)], name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.name == 'idx'
+
+ result2 = result.to_period(freq='M')
+ tm.assert_index_equal(result2, index)
+ assert result2.name == 'idx'
+
+ result3 = result.to_period(freq='3M')
+ exp = PeriodIndex(['NaT', '2011-01', '2011-02'],
+ freq='3M', name='idx')
+ tm.assert_index_equal(result3, exp)
+ assert result3.freqstr == '3M'
+
+ msg = ('Frequency must be positive, because it'
+ ' represents span: -2A')
+ with pytest.raises(ValueError, match=msg):
+ result.to_period(freq='-2A')
+
+ def test_to_timestamp_preserve_name(self):
+ index = period_range(freq='A', start='1/1/2001', end='12/1/2009',
+ name='foo')
+ assert index.name == 'foo'
+
+ conv = index.to_timestamp('D')
+ assert conv.name == 'foo'
+
+ def test_to_timestamp_quarterly_bug(self):
+ years = np.arange(1960, 2000).repeat(4)
+ quarters = np.tile(lrange(1, 5), 40)
+
+ pindex = PeriodIndex(year=years, quarter=quarters)
+
+ stamps = pindex.to_timestamp('D', 'end')
+ expected = DatetimeIndex([x.to_timestamp('D', 'end') for x in pindex])
+ tm.assert_index_equal(stamps, expected)
+
+ def test_to_timestamp_pi_mult(self):
+ idx = PeriodIndex(['2011-01', 'NaT', '2011-02'],
+ freq='2M', name='idx')
+
+ result = idx.to_timestamp()
+ expected = DatetimeIndex(['2011-01-01', 'NaT', '2011-02-01'],
+ name='idx')
+ tm.assert_index_equal(result, expected)
+
+ result = idx.to_timestamp(how='E')
+ expected = DatetimeIndex(['2011-02-28', 'NaT', '2011-03-31'],
+ name='idx')
+ expected = expected + Timedelta(1, 'D') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result, expected)
+
+ def test_to_timestamp_pi_combined(self):
+ idx = period_range(start='2011', periods=2, freq='1D1H', name='idx')
+
+ result = idx.to_timestamp()
+ expected = DatetimeIndex(['2011-01-01 00:00', '2011-01-02 01:00'],
+ name='idx')
+ tm.assert_index_equal(result, expected)
+
+ result = idx.to_timestamp(how='E')
+ expected = DatetimeIndex(['2011-01-02 00:59:59',
+ '2011-01-03 01:59:59'],
+ name='idx')
+ expected = expected + Timedelta(1, 's') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result, expected)
+
+ result = idx.to_timestamp(how='E', freq='H')
+ expected = DatetimeIndex(['2011-01-02 00:00', '2011-01-03 01:00'],
+ name='idx')
+ expected = expected + Timedelta(1, 'h') - Timedelta(1, 'ns')
+ tm.assert_index_equal(result, expected)
+
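+    def test_to_timestamp_combined_span_sketch(self):
+        # A sketch of the combined-frequency arithmetic above: each
+        # '1D1H' period is assumed to span 25 hours, so consecutive
+        # period starts land 25 hours apart.
+        idx = period_range(start='2011', periods=2, freq='1D1H')
+        starts = idx.to_timestamp()
+        assert starts[1] - starts[0] == Timedelta(hours=25)
+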
+ def test_period_astype_to_timestamp(self):
+ pi = pd.PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M')
+
+ exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'])
+ tm.assert_index_equal(pi.astype('datetime64[ns]'), exp)
+
+ exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'])
+ exp = exp + Timedelta(1, 'D') - Timedelta(1, 'ns')
+ tm.assert_index_equal(pi.astype('datetime64[ns]', how='end'), exp)
+
+ exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
+ tz='US/Eastern')
+        res = pi.astype('datetime64[ns, US/Eastern]')
+        tm.assert_index_equal(res, exp)
+
+ exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'],
+ tz='US/Eastern')
+ exp = exp + Timedelta(1, 'D') - Timedelta(1, 'ns')
+ res = pi.astype('datetime64[ns, US/Eastern]', how='end')
+ tm.assert_index_equal(res, exp)
+
+ def test_to_timestamp_1703(self):
+ index = period_range('1/1/2012', periods=4, freq='D')
+
+ result = index.to_timestamp()
+ assert result[0] == Timestamp('1/1/2012')
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/test_base.py b/contrib/python/pandas/py2/pandas/tests/indexes/test_base.py
new file mode 100644
index 00000000000..c99007cef90
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/test_base.py
@@ -0,0 +1,2587 @@
+# -*- coding: utf-8 -*-
+
+from collections import defaultdict
+from datetime import datetime, timedelta
+import math
+import operator
+import sys
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import Timestamp
+from pandas.compat import (
+ PY3, PY35, PY36, StringIO, lrange, lzip, range, text_type, u, zip)
+from pandas.compat.numpy import np_datetime64_compat
+
+from pandas.core.dtypes.common import is_unsigned_integer_dtype
+from pandas.core.dtypes.generic import ABCIndex
+
+import pandas as pd
+from pandas import (
+ CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, Int64Index,
+ PeriodIndex, RangeIndex, Series, TimedeltaIndex, UInt64Index, date_range,
+ isna, period_range)
+import pandas.core.config as cf
+from pandas.core.index import _get_combined_index, ensure_index_from_sequences
+from pandas.core.indexes.api import Index, MultiIndex
+from pandas.core.sorting import safe_sort
+from pandas.tests.indexes.common import Base
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal
+
+
+class TestIndex(Base):
+ _holder = Index
+
+ def setup_method(self, method):
+ self.indices = dict(unicodeIndex=tm.makeUnicodeIndex(100),
+ strIndex=tm.makeStringIndex(100),
+ dateIndex=tm.makeDateIndex(100),
+ periodIndex=tm.makePeriodIndex(100),
+ tdIndex=tm.makeTimedeltaIndex(100),
+ intIndex=tm.makeIntIndex(100),
+ uintIndex=tm.makeUIntIndex(100),
+ rangeIndex=tm.makeRangeIndex(100),
+ floatIndex=tm.makeFloatIndex(100),
+ boolIndex=Index([True, False]),
+ catIndex=tm.makeCategoricalIndex(100),
+ empty=Index([]),
+ tuples=MultiIndex.from_tuples(lzip(
+ ['foo', 'bar', 'baz'], [1, 2, 3])),
+ repeats=Index([0, 0, 1, 1, 2, 2]))
+ self.setup_indices()
+
+ def create_index(self):
+ return Index(list('abcde'))
+
+    def generate_index_types(self, skip_index_keys=()):
+ """
+ Return a generator of the various index types, leaving
+ out the ones with a key in skip_index_keys
+ """
+ for key, index in self.indices.items():
+ if key not in skip_index_keys:
+ yield key, index
+
+ def test_can_hold_identifiers(self):
+ index = self.create_index()
+ key = index[0]
+ assert index._can_hold_identifiers_and_holds_name(key) is True
+
+ def test_new_axis(self):
+ new_index = self.dateIndex[None, :]
+ assert new_index.ndim == 2
+ assert isinstance(new_index, np.ndarray)
+
+ def test_copy_and_deepcopy(self):
+ new_copy2 = self.intIndex.copy(dtype=int)
+ assert new_copy2.dtype.kind == 'i'
+
+ @pytest.mark.parametrize("attr", ['strIndex', 'dateIndex'])
+ def test_constructor_regular(self, attr):
+ # regular instance creation
+ index = getattr(self, attr)
+ tm.assert_contains_all(index, index)
+
+ def test_constructor_casting(self):
+ # casting
+ arr = np.array(self.strIndex)
+ index = Index(arr)
+ tm.assert_contains_all(arr, index)
+ tm.assert_index_equal(self.strIndex, index)
+
+ def test_constructor_copy(self):
+ # copy
+ arr = np.array(self.strIndex)
+ index = Index(arr, copy=True, name='name')
+ assert isinstance(index, Index)
+ assert index.name == 'name'
+ tm.assert_numpy_array_equal(arr, index.values)
+ arr[0] = "SOMEBIGLONGSTRING"
+ assert index[0] != "SOMEBIGLONGSTRING"
+
+ # what to do here?
+ # arr = np.array(5.)
+ # pytest.raises(Exception, arr.view, Index)
+
+ def test_constructor_corner(self):
+ # corner case
+ pytest.raises(TypeError, Index, 0)
+
+ @pytest.mark.parametrize("index_vals", [
+ [('A', 1), 'B'], ['B', ('A', 1)]])
+ def test_construction_list_mixed_tuples(self, index_vals):
+ # see gh-10697: if we are constructing from a mixed list of tuples,
+ # make sure that we are independent of the sorting order.
+ index = Index(index_vals)
+ assert isinstance(index, Index)
+ assert not isinstance(index, MultiIndex)
+
+ @pytest.mark.parametrize('na_value', [None, np.nan])
+ @pytest.mark.parametrize('vtype', [list, tuple, iter])
+ def test_construction_list_tuples_nan(self, na_value, vtype):
+ # GH 18505 : valid tuples containing NaN
+ values = [(1, 'two'), (3., na_value)]
+ result = Index(vtype(values))
+ expected = MultiIndex.from_tuples(values)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("cast_as_obj", [True, False])
+ @pytest.mark.parametrize("index", [
+ pd.date_range('2015-01-01 10:00', freq='D', periods=3,
+ tz='US/Eastern', name='Green Eggs & Ham'), # DTI with tz
+ pd.date_range('2015-01-01 10:00', freq='D', periods=3), # DTI no tz
+ pd.timedelta_range('1 days', freq='D', periods=3), # td
+ pd.period_range('2015-01-01', freq='D', periods=3) # period
+ ])
+ def test_constructor_from_index_dtlike(self, cast_as_obj, index):
+ if cast_as_obj:
+ result = pd.Index(index.astype(object))
+ else:
+ result = pd.Index(index)
+
+ tm.assert_index_equal(result, index)
+
+ if isinstance(index, pd.DatetimeIndex):
+ assert result.tz == index.tz
+ if cast_as_obj:
+ # GH#23524 check that Index(dti, dtype=object) does not
+ # incorrectly raise ValueError, and that nanoseconds are not
+ # dropped
+ index += pd.Timedelta(nanoseconds=50)
+ result = pd.Index(index, dtype=object)
+ assert result.dtype == np.object_
+ assert list(result) == list(index)
+
+ @pytest.mark.parametrize("index,has_tz", [
+ (pd.date_range('2015-01-01 10:00', freq='D', periods=3,
+ tz='US/Eastern'), True), # datetimetz
+ (pd.timedelta_range('1 days', freq='D', periods=3), False), # td
+ (pd.period_range('2015-01-01', freq='D', periods=3), False) # period
+ ])
+ def test_constructor_from_series_dtlike(self, index, has_tz):
+ result = pd.Index(pd.Series(index))
+ tm.assert_index_equal(result, index)
+
+ if has_tz:
+ assert result.tz == index.tz
+
+ @pytest.mark.parametrize("klass", [Index, DatetimeIndex])
+ def test_constructor_from_series(self, klass):
+ expected = DatetimeIndex([Timestamp('20110101'), Timestamp('20120101'),
+ Timestamp('20130101')])
+ s = Series([Timestamp('20110101'), Timestamp('20120101'),
+ Timestamp('20130101')])
+ result = klass(s)
+ tm.assert_index_equal(result, expected)
+
+ def test_constructor_from_series_freq(self):
+ # GH 6273
+ # create from a series, passing a freq
+ dts = ['1-1-1990', '2-1-1990', '3-1-1990', '4-1-1990', '5-1-1990']
+ expected = DatetimeIndex(dts, freq='MS')
+
+ s = Series(pd.to_datetime(dts))
+ result = DatetimeIndex(s, freq='MS')
+
+ tm.assert_index_equal(result, expected)
+
+ def test_constructor_from_frame_series_freq(self):
+ # GH 6273
+ # create from a series, passing a freq
+ dts = ['1-1-1990', '2-1-1990', '3-1-1990', '4-1-1990', '5-1-1990']
+ expected = DatetimeIndex(dts, freq='MS')
+
+ df = pd.DataFrame(np.random.rand(5, 3))
+ df['date'] = dts
+ result = DatetimeIndex(df['date'], freq='MS')
+
+ assert df['date'].dtype == object
+ expected.name = 'date'
+ tm.assert_index_equal(result, expected)
+
+ expected = pd.Series(dts, name='date')
+ tm.assert_series_equal(df['date'], expected)
+
+ # GH 6274
+ # infer freq of same
+ freq = pd.infer_freq(df['date'])
+ assert freq == 'MS'
+
+ @pytest.mark.parametrize("array", [
+ np.arange(5), np.array(['a', 'b', 'c']), date_range(
+ '2000-01-01', periods=3).values
+ ])
+ def test_constructor_ndarray_like(self, array):
+ # GH 5460#issuecomment-44474502
+ # it should be possible to convert any object that satisfies the numpy
+ # ndarray interface directly into an Index
+ class ArrayLike(object):
+ def __init__(self, array):
+ self.array = array
+
+ def __array__(self, dtype=None):
+ return self.array
+
+ expected = pd.Index(array)
+ result = pd.Index(ArrayLike(array))
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('dtype', [
+ int, 'int64', 'int32', 'int16', 'int8', 'uint64', 'uint32',
+ 'uint16', 'uint8'])
+ def test_constructor_int_dtype_float(self, dtype):
+ # GH 18400
+ if is_unsigned_integer_dtype(dtype):
+ index_type = UInt64Index
+ else:
+ index_type = Int64Index
+
+ expected = index_type([0, 1, 2, 3])
+ result = Index([0., 1., 2., 3.], dtype=dtype)
+ tm.assert_index_equal(result, expected)
+
+ def test_constructor_int_dtype_nan(self):
+ # see gh-15187
+ data = [np.nan]
+ expected = Float64Index(data)
+ result = Index(data, dtype='float')
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("dtype", ['int64', 'uint64'])
+ def test_constructor_int_dtype_nan_raises(self, dtype):
+ # see gh-15187
+ data = [np.nan]
+ msg = "cannot convert"
+ with pytest.raises(ValueError, match=msg):
+ Index(data, dtype=dtype)
+
+ def test_constructor_no_pandas_array(self):
+ ser = pd.Series([1, 2, 3])
+ result = pd.Index(ser.array)
+ expected = pd.Index([1, 2, 3])
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("klass,dtype,na_val", [
+ (pd.Float64Index, np.float64, np.nan),
+ (pd.DatetimeIndex, 'datetime64[ns]', pd.NaT)
+ ])
+ def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val):
+ # GH 13467
+ na_list = [na_val, na_val]
+ expected = klass(na_list)
+ assert expected.dtype == dtype
+
+ result = Index(na_list)
+ tm.assert_index_equal(result, expected)
+
+ result = Index(np.array(na_list))
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("pos", [0, 1])
+ @pytest.mark.parametrize("klass,dtype,ctor", [
+ (pd.DatetimeIndex, 'datetime64[ns]', np.datetime64('nat')),
+ (pd.TimedeltaIndex, 'timedelta64[ns]', np.timedelta64('nat'))
+ ])
+ def test_index_ctor_infer_nat_dt_like(self, pos, klass, dtype, ctor,
+ nulls_fixture):
+ expected = klass([pd.NaT, pd.NaT])
+ assert expected.dtype == dtype
+ data = [ctor]
+ data.insert(pos, nulls_fixture)
+
+ result = Index(data)
+ tm.assert_index_equal(result, expected)
+
+ result = Index(np.array(data, dtype=object))
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("swap_objs", [True, False])
+ def test_index_ctor_nat_result(self, swap_objs):
+ # mixed np.datetime64/timedelta64 nat results in object
+ data = [np.datetime64('nat'), np.timedelta64('nat')]
+ if swap_objs:
+ data = data[::-1]
+
+ expected = pd.Index(data, dtype=object)
+ tm.assert_index_equal(Index(data), expected)
+ tm.assert_index_equal(Index(np.array(data, dtype=object)), expected)
+
+ def test_index_ctor_infer_periodindex(self):
+ xp = period_range('2012-1-1', freq='M', periods=3)
+ rs = Index(xp)
+ tm.assert_index_equal(rs, xp)
+ assert isinstance(rs, PeriodIndex)
+
+ @pytest.mark.parametrize("vals,dtype", [
+ ([1, 2, 3, 4, 5], 'int'), ([1.1, np.nan, 2.2, 3.0], 'float'),
+ (['A', 'B', 'C', np.nan], 'obj')
+ ])
+ def test_constructor_simple_new(self, vals, dtype):
+ index = Index(vals, name=dtype)
+ result = index._simple_new(index.values, dtype)
+ tm.assert_index_equal(result, index)
+
+ @pytest.mark.parametrize("vals", [
+ [1, 2, 3], np.array([1, 2, 3]), np.array([1, 2, 3], dtype=int),
+ # below should coerce
+ [1., 2., 3.], np.array([1., 2., 3.], dtype=float)
+ ])
+ def test_constructor_dtypes_to_int64(self, vals):
+ index = Index(vals, dtype=int)
+ assert isinstance(index, Int64Index)
+
+ @pytest.mark.parametrize("vals", [
+ [1, 2, 3], [1., 2., 3.], np.array([1., 2., 3.]),
+ np.array([1, 2, 3], dtype=int), np.array([1., 2., 3.], dtype=float)
+ ])
+ def test_constructor_dtypes_to_float64(self, vals):
+ index = Index(vals, dtype=float)
+ assert isinstance(index, Float64Index)
+
+ @pytest.mark.parametrize("cast_index", [True, False])
+ @pytest.mark.parametrize("vals", [
+ [True, False, True], np.array([True, False, True], dtype=bool)
+ ])
+ def test_constructor_dtypes_to_object(self, cast_index, vals):
+ if cast_index:
+ index = Index(vals, dtype=bool)
+ else:
+ index = Index(vals)
+
+ assert isinstance(index, Index)
+ assert index.dtype == object
+
+ @pytest.mark.parametrize("vals", [
+ [1, 2, 3], np.array([1, 2, 3], dtype=int),
+ np.array([np_datetime64_compat('2011-01-01'),
+ np_datetime64_compat('2011-01-02')]),
+ [datetime(2011, 1, 1), datetime(2011, 1, 2)]
+ ])
+ def test_constructor_dtypes_to_categorical(self, vals):
+ index = Index(vals, dtype='category')
+ assert isinstance(index, CategoricalIndex)
+
+ @pytest.mark.parametrize("cast_index", [True, False])
+ @pytest.mark.parametrize("vals", [
+ Index(np.array([np_datetime64_compat('2011-01-01'),
+ np_datetime64_compat('2011-01-02')])),
+ Index([datetime(2011, 1, 1), datetime(2011, 1, 2)])
+
+ ])
+ def test_constructor_dtypes_to_datetime(self, cast_index, vals):
+ if cast_index:
+ index = Index(vals, dtype=object)
+ assert isinstance(index, Index)
+ assert index.dtype == object
+ else:
+ index = Index(vals)
+ assert isinstance(index, DatetimeIndex)
+
+ @pytest.mark.parametrize("cast_index", [True, False])
+ @pytest.mark.parametrize("vals", [
+ np.array([np.timedelta64(1, 'D'), np.timedelta64(1, 'D')]),
+ [timedelta(1), timedelta(1)]
+ ])
+ def test_constructor_dtypes_to_timedelta(self, cast_index, vals):
+ if cast_index:
+ index = Index(vals, dtype=object)
+ assert isinstance(index, Index)
+ assert index.dtype == object
+ else:
+ index = Index(vals)
+ assert isinstance(index, TimedeltaIndex)
+
+ @pytest.mark.parametrize("attr, utc", [
+ ['values', False],
+ ['asi8', True]])
+ @pytest.mark.parametrize("klass", [pd.Index, pd.DatetimeIndex])
+ def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, utc,
+ klass):
+ # Test constructing with a datetimetz dtype
+ # .values produces numpy datetimes, so these are considered naive
+ # .asi8 produces integers, so these are considered epoch timestamps
+ # ^the above will be true in a later version. Right now we `.view`
+ # the i8 values as NS_DTYPE, effectively treating them as wall times.
+ index = pd.date_range('2011-01-01', periods=5)
+ arg = getattr(index, attr)
+ index = index.tz_localize(tz_naive_fixture)
+ dtype = index.dtype
+
+ # TODO(GH-24559): Remove the sys.modules and warnings
+ # not sure what this is from. It's Py2 only.
+ modules = [sys.modules['pandas.core.indexes.base']]
+
+ if (tz_naive_fixture and attr == "asi8" and
+ str(tz_naive_fixture) not in ('UTC', 'tzutc()')):
+ ex_warn = FutureWarning
+ else:
+ ex_warn = None
+
+ # stacklevel is checked elsewhere. We don't do it here since
+        # Index will have a frame, throwing off the expected stacklevel.
+ with tm.assert_produces_warning(ex_warn, check_stacklevel=False,
+ clear=modules):
+ result = klass(arg, tz=tz_naive_fixture)
+ tm.assert_index_equal(result, index)
+
+ with tm.assert_produces_warning(ex_warn, check_stacklevel=False):
+ result = klass(arg, dtype=dtype)
+ tm.assert_index_equal(result, index)
+
+ with tm.assert_produces_warning(ex_warn, check_stacklevel=False):
+ result = klass(list(arg), tz=tz_naive_fixture)
+ tm.assert_index_equal(result, index)
+
+ with tm.assert_produces_warning(ex_warn, check_stacklevel=False):
+ result = klass(list(arg), dtype=dtype)
+ tm.assert_index_equal(result, index)
+
+ @pytest.mark.parametrize("attr", ['values', 'asi8'])
+ @pytest.mark.parametrize("klass", [pd.Index, pd.TimedeltaIndex])
+ def test_constructor_dtypes_timedelta(self, attr, klass):
+ index = pd.timedelta_range('1 days', periods=5)
+ dtype = index.dtype
+
+ values = getattr(index, attr)
+
+ result = klass(values, dtype=dtype)
+ tm.assert_index_equal(result, index)
+
+ result = klass(list(values), dtype=dtype)
+ tm.assert_index_equal(result, index)
+
+ @pytest.mark.parametrize("value", [[], iter([]), (x for x in [])])
+ @pytest.mark.parametrize("klass",
+ [Index, Float64Index, Int64Index, UInt64Index,
+ CategoricalIndex, DatetimeIndex, TimedeltaIndex])
+ def test_constructor_empty(self, value, klass):
+ empty = klass(value)
+ assert isinstance(empty, klass)
+ assert not len(empty)
+
+ @pytest.mark.parametrize("empty,klass", [
+ (PeriodIndex([], freq='B'), PeriodIndex),
+ (PeriodIndex(iter([]), freq='B'), PeriodIndex),
+ (PeriodIndex((x for x in []), freq='B'), PeriodIndex),
+ (RangeIndex(step=1), pd.RangeIndex),
+ (MultiIndex(levels=[[1, 2], ['blue', 'red']],
+ codes=[[], []]), MultiIndex)
+ ])
+ def test_constructor_empty_special(self, empty, klass):
+ assert isinstance(empty, klass)
+ assert not len(empty)
+
+ def test_constructor_overflow_int64(self):
+ # see gh-15832
+ msg = ("The elements provided in the data cannot "
+ "all be casted to the dtype int64")
+ with pytest.raises(OverflowError, match=msg):
+ Index([np.iinfo(np.uint64).max - 1], dtype="int64")
+
+ @pytest.mark.xfail(reason="see GH#21311: Index "
+ "doesn't enforce dtype argument")
+ def test_constructor_cast(self):
+ msg = "could not convert string to float"
+ with pytest.raises(ValueError, match=msg):
+ Index(["a", "b", "c"], dtype=float)
+
+ def test_view_with_args(self):
+
+ restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex',
+ 'empty']
+
+ for i in restricted:
+ ind = self.indices[i]
+
+ # with arguments
+ pytest.raises(TypeError, lambda: ind.view('i8'))
+
+ # these are ok
+ for i in list(set(self.indices.keys()) - set(restricted)):
+ ind = self.indices[i]
+
+ # with arguments
+ ind.view('i8')
+
+ def test_astype(self):
+ casted = self.intIndex.astype('i8')
+
+ # it works!
+ casted.get_loc(5)
+
+ # pass on name
+ self.intIndex.name = 'foobar'
+ casted = self.intIndex.astype('i8')
+ assert casted.name == 'foobar'
+
+ def test_equals_object(self):
+ # same
+ assert Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c']))
+
+ @pytest.mark.parametrize("comp", [
+ Index(['a', 'b']), Index(['a', 'b', 'd']), ['a', 'b', 'c']])
+ def test_not_equals_object(self, comp):
+ assert not Index(['a', 'b', 'c']).equals(comp)
+
+ def test_insert(self):
+
+ # GH 7256
+ # validate neg/pos inserts
+ result = Index(['b', 'c', 'd'])
+
+ # test 0th element
+ tm.assert_index_equal(Index(['a', 'b', 'c', 'd']),
+ result.insert(0, 'a'))
+
+ # test Nth element that follows Python list behavior
+ tm.assert_index_equal(Index(['b', 'c', 'e', 'd']),
+ result.insert(-1, 'e'))
+
+ # test loc +/- neq (0, -1)
+ tm.assert_index_equal(result.insert(1, 'z'), result.insert(-2, 'z'))
+
+ # test empty
+ null_index = Index([])
+ tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a'))
+
+ def test_insert_missing(self, nulls_fixture):
+ # GH 22295
+ # test there is no mangling of NA values
+ expected = Index(['a', nulls_fixture, 'b', 'c'])
+ result = Index(list('abc')).insert(1, nulls_fixture)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("pos,expected", [
+ (0, Index(['b', 'c', 'd'], name='index')),
+ (-1, Index(['a', 'b', 'c'], name='index'))
+ ])
+ def test_delete(self, pos, expected):
+ index = Index(['a', 'b', 'c', 'd'], name='index')
+ result = index.delete(pos)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+
+ def test_delete_raises(self):
+ index = Index(['a', 'b', 'c', 'd'], name='index')
+ with pytest.raises((IndexError, ValueError)):
+ # either depending on numpy version
+ index.delete(5)
+
+ def test_identical(self):
+
+ # index
+ i1 = Index(['a', 'b', 'c'])
+ i2 = Index(['a', 'b', 'c'])
+
+ assert i1.identical(i2)
+
+ i1 = i1.rename('foo')
+ assert i1.equals(i2)
+ assert not i1.identical(i2)
+
+ i2 = i2.rename('foo')
+ assert i1.identical(i2)
+
+ i3 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')])
+ i4 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')], tupleize_cols=False)
+ assert not i3.identical(i4)
+
+ def test_is_(self):
+ ind = Index(range(10))
+ assert ind.is_(ind)
+ assert ind.is_(ind.view().view().view().view())
+ assert not ind.is_(Index(range(10)))
+ assert not ind.is_(ind.copy())
+ assert not ind.is_(ind.copy(deep=False))
+ assert not ind.is_(ind[:])
+ assert not ind.is_(np.array(range(10)))
+
+ # quasi-implementation dependent
+ assert ind.is_(ind.view())
+ ind2 = ind.view()
+ ind2.name = 'bob'
+ assert ind.is_(ind2)
+ assert ind2.is_(ind)
+ # doesn't matter if Indices are *actually* views of underlying data,
+ assert not ind.is_(Index(ind.values))
+ arr = np.array(range(1, 11))
+ ind1 = Index(arr, copy=False)
+ ind2 = Index(arr, copy=False)
+ assert not ind1.is_(ind2)
+
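+    def test_is_vs_equals_sketch(self):
+        # A sketch of the distinction exercised above: `is_` is an
+        # identity-style check on the underlying data, while `equals`
+        # compares element values, so a copy is equal but not `is_`.
+        ind = Index(range(3))
+        copied = ind.copy()
+        assert ind.equals(copied)
+        assert not ind.is_(copied)
+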
+ def test_asof(self):
+ d = self.dateIndex[0]
+ assert self.dateIndex.asof(d) == d
+ assert isna(self.dateIndex.asof(d - timedelta(1)))
+
+ d = self.dateIndex[-1]
+ assert self.dateIndex.asof(d + timedelta(1)) == d
+
+ d = self.dateIndex[0].to_pydatetime()
+ assert isinstance(self.dateIndex.asof(d), Timestamp)
+
+ def test_asof_datetime_partial(self):
+ index = pd.date_range('2010-01-01', periods=2, freq='m')
+ expected = Timestamp('2010-02-28')
+ result = index.asof('2010-02')
+ assert result == expected
+ assert not isinstance(result, Index)
+
+ def test_nanosecond_index_access(self):
+ s = Series([Timestamp('20130101')]).values.view('i8')[0]
+ r = DatetimeIndex([s + 50 + i for i in range(100)])
+ x = Series(np.random.randn(100), index=r)
+
+ first_value = x.asof(x.index[0])
+
+ # this does not yet work, as parsing strings is done via dateutil
+ # assert first_value == x['2013-01-01 00:00:00.000000050+0000']
+
+ expected_ts = np_datetime64_compat('2013-01-01 00:00:00.000000050+'
+ '0000', 'ns')
+ assert first_value == x[Timestamp(expected_ts)]
+
+ def test_booleanindex(self):
+ boolIndex = np.repeat(True, len(self.strIndex)).astype(bool)
+ boolIndex[5:30:2] = False
+
+ subIndex = self.strIndex[boolIndex]
+
+ for i, val in enumerate(subIndex):
+ assert subIndex.get_loc(val) == i
+
+ subIndex = self.strIndex[list(boolIndex)]
+ for i, val in enumerate(subIndex):
+ assert subIndex.get_loc(val) == i
+
+ def test_fancy(self):
+ sl = self.strIndex[[1, 2, 3]]
+ for i in sl:
+ assert i == sl[sl.get_loc(i)]
+
+ @pytest.mark.parametrize("attr", [
+ 'strIndex', 'intIndex', 'floatIndex'])
+ @pytest.mark.parametrize("dtype", [np.int_, np.bool_])
+ def test_empty_fancy(self, attr, dtype):
+ empty_arr = np.array([], dtype=dtype)
+ index = getattr(self, attr)
+ empty_index = index.__class__([])
+
+ assert index[[]].identical(empty_index)
+ assert index[empty_arr].identical(empty_index)
+
+ @pytest.mark.parametrize("attr", [
+ 'strIndex', 'intIndex', 'floatIndex'])
+ def test_empty_fancy_raises(self, attr):
+ # pd.DatetimeIndex is excluded, because it overrides getitem and should
+ # be tested separately.
+ empty_farr = np.array([], dtype=np.float_)
+ index = getattr(self, attr)
+ empty_index = index.__class__([])
+
+ assert index[[]].identical(empty_index)
+        # np.ndarray only accepts fancy indexing with ndarrays of int
+        # & bool dtype, so Index should behave the same way
+ pytest.raises(IndexError, index.__getitem__, empty_farr)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection(self, sort):
+ first = self.strIndex[:20]
+ second = self.strIndex[:10]
+ intersect = first.intersection(second, sort=sort)
+ if sort is None:
+ tm.assert_index_equal(intersect, second.sort_values())
+ assert tm.equalContents(intersect, second)
+
+ # Corner cases
+ inter = first.intersection(first, sort=sort)
+ assert inter is first
+
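+    def test_intersection_sort_semantics_sketch(self):
+        # A hedged sketch of the `sort` contract exercised across this
+        # class, assuming 0.24-era semantics: sort=None sorts the
+        # result when possible, while sort=False keeps the calling
+        # Index's order.
+        left = Index(['c', 'b', 'a'])
+        tm.assert_index_equal(left.intersection(['b', 'a'], sort=None),
+                              Index(['a', 'b']))
+        tm.assert_index_equal(left.intersection(['b', 'a'], sort=False),
+                              Index(['b', 'a']))
+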
+ @pytest.mark.parametrize("index2,keeps_name", [
+ (Index([3, 4, 5, 6, 7], name="index"), True), # preserve same name
+ (Index([3, 4, 5, 6, 7], name="other"), False), # drop diff names
+ (Index([3, 4, 5, 6, 7]), False)])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection_name_preservation(self, index2, keeps_name, sort):
+ index1 = Index([1, 2, 3, 4, 5], name='index')
+ expected = Index([3, 4, 5])
+ result = index1.intersection(index2, sort)
+
+ if keeps_name:
+ expected.name = 'index'
+
+ assert result.name == expected.name
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("first_name,second_name,expected_name", [
+ ('A', 'A', 'A'), ('A', 'B', None), (None, 'B', None)])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection_name_preservation2(self, first_name, second_name,
+ expected_name, sort):
+ first = self.strIndex[5:20]
+ second = self.strIndex[:10]
+ first.name = first_name
+ second.name = second_name
+ intersect = first.intersection(second, sort=sort)
+ assert intersect.name == expected_name
+
+ @pytest.mark.parametrize("index2,keeps_name", [
+ (Index([4, 7, 6, 5, 3], name='index'), True),
+ (Index([4, 7, 6, 5, 3], name='other'), False)])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection_monotonic(self, index2, keeps_name, sort):
+ index1 = Index([5, 3, 2, 4, 1], name='index')
+ expected = Index([5, 3, 4])
+
+ if keeps_name:
+ expected.name = "index"
+
+ result = index1.intersection(index2, sort=sort)
+ if sort is None:
+ expected = expected.sort_values()
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("index2,expected_arr", [
+ (Index(['B', 'D']), ['B']),
+ (Index(['B', 'D', 'A']), ['A', 'B', 'A'])])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection_non_monotonic_non_unique(self, index2, expected_arr,
+ sort):
+ # non-monotonic non-unique
+ index1 = Index(['A', 'B', 'A', 'C'])
+ expected = Index(expected_arr, dtype='object')
+ result = index1.intersection(index2, sort=sort)
+ if sort is None:
+ expected = expected.sort_values()
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersect_str_dates(self, sort):
+ dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
+
+ i1 = Index(dt_dates, dtype=object)
+ i2 = Index(['aa'], dtype=object)
+ result = i2.intersection(i1, sort=sort)
+
+ assert len(result) == 0
+
+ def test_intersect_nosort(self):
+ result = pd.Index(['c', 'b', 'a']).intersection(['b', 'a'])
+ expected = pd.Index(['b', 'a'])
+ tm.assert_index_equal(result, expected)
+
+ def test_intersection_equal_sort(self):
+ idx = pd.Index(['c', 'a', 'b'])
+ tm.assert_index_equal(idx.intersection(idx, sort=False), idx)
+ tm.assert_index_equal(idx.intersection(idx, sort=None), idx)
+
+ @pytest.mark.xfail(reason="Not implemented")
+ def test_intersection_equal_sort_true(self):
+ # TODO decide on True behaviour
+ idx = pd.Index(['c', 'a', 'b'])
+ sorted_ = pd.Index(['a', 'b', 'c'])
+ tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_chained_union(self, sort):
+ # Chained unions handles names correctly
+ i1 = Index([1, 2], name='i1')
+ i2 = Index([5, 6], name='i2')
+ i3 = Index([3, 4], name='i3')
+ union = i1.union(i2.union(i3, sort=sort), sort=sort)
+ expected = i1.union(i2, sort=sort).union(i3, sort=sort)
+ tm.assert_index_equal(union, expected)
+
+ j1 = Index([1, 2], name='j1')
+ j2 = Index([], name='j2')
+ j3 = Index([], name='j3')
+ union = j1.union(j2.union(j3, sort=sort), sort=sort)
+ expected = j1.union(j2, sort=sort).union(j3, sort=sort)
+ tm.assert_index_equal(union, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_union(self, sort):
+        # TODO: Replace with fixture
+ first = self.strIndex[5:20]
+ second = self.strIndex[:10]
+ everything = self.strIndex[:20]
+
+ union = first.union(second, sort=sort)
+ if sort is None:
+ tm.assert_index_equal(union, everything.sort_values())
+ assert tm.equalContents(union, everything)
+
+ @pytest.mark.parametrize('slice_', [slice(None), slice(0)])
+ def test_union_sort_other_special(self, slice_):
+ # https://github.com/pandas-dev/pandas/issues/24959
+
+ idx = pd.Index([1, 0, 2])
+ # default, sort=None
+ other = idx[slice_]
+ tm.assert_index_equal(idx.union(other), idx)
+ tm.assert_index_equal(other.union(idx), idx)
+
+ # sort=False
+ tm.assert_index_equal(idx.union(other, sort=False), idx)
+
+ @pytest.mark.xfail(reason="Not implemented")
+ @pytest.mark.parametrize('slice_', [slice(None), slice(0)])
+ def test_union_sort_special_true(self, slice_):
+ # TODO decide on True behaviour
+ # sort=True
+ idx = pd.Index([1, 0, 2])
+ # default, sort=None
+ other = idx[slice_]
+
+ result = idx.union(other, sort=True)
+ expected = pd.Index([0, 1, 2])
+ tm.assert_index_equal(result, expected)
+
+ def test_union_sort_other_incomparable(self):
+ # https://github.com/pandas-dev/pandas/issues/24959
+ idx = pd.Index([1, pd.Timestamp('2000')])
+ # default (sort=None)
+ with tm.assert_produces_warning(RuntimeWarning):
+ result = idx.union(idx[:1])
+
+ tm.assert_index_equal(result, idx)
+
+ # sort=None
+ with tm.assert_produces_warning(RuntimeWarning):
+ result = idx.union(idx[:1], sort=None)
+ tm.assert_index_equal(result, idx)
+
+ # sort=False
+ result = idx.union(idx[:1], sort=False)
+ tm.assert_index_equal(result, idx)
+
+ @pytest.mark.xfail(reason="Not implemented")
+ def test_union_sort_other_incomparable_true(self):
+ # TODO decide on True behaviour
+ # sort=True
+ idx = pd.Index([1, pd.Timestamp('2000')])
+ with pytest.raises(TypeError, match='.*'):
+ idx.union(idx[:1], sort=True)
+
+ @pytest.mark.parametrize("klass", [
+ np.array, Series, list])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_union_from_iterables(self, klass, sort):
+ # GH 10149
+        # TODO: Replace with fixture
+ first = self.strIndex[5:20]
+ second = self.strIndex[:10]
+ everything = self.strIndex[:20]
+
+ case = klass(second.values)
+ result = first.union(case, sort=sort)
+ if sort is None:
+ tm.assert_index_equal(result, everything.sort_values())
+ assert tm.equalContents(result, everything)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_union_identity(self, sort):
+        # TODO: replace with fixture
+ first = self.strIndex[5:20]
+
+ union = first.union(first, sort=sort)
+ # i.e. identity is not preserved when sort is True
+ assert (union is first) is (not sort)
+
+ union = first.union([], sort=sort)
+ assert (union is first) is (not sort)
+
+ union = Index([]).union(first, sort=sort)
+ assert (union is first) is (not sort)
+
+ @pytest.mark.parametrize("first_list", [list('ba'), list()])
+ @pytest.mark.parametrize("second_list", [list('ab'), list()])
+ @pytest.mark.parametrize("first_name, second_name, expected_name", [
+ ('A', 'B', None), (None, 'B', None), ('A', None, None)])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_union_name_preservation(self, first_list, second_list, first_name,
+ second_name, expected_name, sort):
+ first = Index(first_list, name=first_name)
+ second = Index(second_list, name=second_name)
+ union = first.union(second, sort=sort)
+
+ vals = set(first_list).union(second_list)
+
+ if sort is None and len(first_list) > 0 and len(second_list) > 0:
+ expected = Index(sorted(vals), name=expected_name)
+ tm.assert_index_equal(union, expected)
+ else:
+ expected = Index(vals, name=expected_name)
+ assert tm.equalContents(union, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_union_dt_as_obj(self, sort):
+        # TODO: Replace with fixture
+ firstCat = self.strIndex.union(self.dateIndex)
+ secondCat = self.strIndex.union(self.strIndex)
+
+ if self.dateIndex.dtype == np.object_:
+ appended = np.append(self.strIndex, self.dateIndex)
+ else:
+ appended = np.append(self.strIndex, self.dateIndex.astype('O'))
+
+ assert tm.equalContents(firstCat, appended)
+ assert tm.equalContents(secondCat, self.strIndex)
+ tm.assert_contains_all(self.strIndex, firstCat)
+ tm.assert_contains_all(self.strIndex, secondCat)
+ tm.assert_contains_all(self.dateIndex, firstCat)
+
+ @pytest.mark.parametrize("method", ['union', 'intersection', 'difference',
+ 'symmetric_difference'])
+ def test_setops_disallow_true(self, method):
+ idx1 = pd.Index(['a', 'b'])
+ idx2 = pd.Index(['b', 'c'])
+
+ with pytest.raises(ValueError, match="The 'sort' keyword only takes"):
+ getattr(idx1, method)(idx2, sort=True)
+
+ def test_map_identity_mapping(self):
+ # GH 12766
+ # TODO: replace with fixture
+ for name, cur_index in self.indices.items():
+ tm.assert_index_equal(cur_index, cur_index.map(lambda x: x))
+
+ def test_map_with_tuples(self):
+ # GH 12766
+
+ # Test that returning a single tuple from an Index
+ # returns an Index.
+ index = tm.makeIntIndex(3)
+ result = tm.makeIntIndex(3).map(lambda x: (x,))
+ expected = Index([(i,) for i in index])
+ tm.assert_index_equal(result, expected)
+
+ # Test that returning a tuple from a map of a single index
+ # returns a MultiIndex object.
+ result = index.map(lambda x: (x, x == 1))
+ expected = MultiIndex.from_tuples([(i, i == 1) for i in index])
+ tm.assert_index_equal(result, expected)
+
+ def test_map_with_tuples_mi(self):
+ # Test that returning a single object from a MultiIndex
+ # returns an Index.
+ first_level = ['foo', 'bar', 'baz']
+ multi_index = MultiIndex.from_tuples(lzip(first_level, [1, 2, 3]))
+ reduced_index = multi_index.map(lambda x: x[0])
+ tm.assert_index_equal(reduced_index, Index(first_level))
+
+ @pytest.mark.parametrize("attr", [
+ 'makeDateIndex', 'makePeriodIndex', 'makeTimedeltaIndex'])
+ def test_map_tseries_indices_return_index(self, attr):
+ index = getattr(tm, attr)(10)
+ expected = Index([1] * 10)
+ result = index.map(lambda x: 1)
+ tm.assert_index_equal(expected, result)
+
+ def test_map_tseries_indices_accsr_return_index(self):
+ date_index = tm.makeDateIndex(24, freq='h', name='hourly')
+ expected = Index(range(24), name='hourly')
+ tm.assert_index_equal(expected, date_index.map(lambda x: x.hour))
+
+ @pytest.mark.parametrize(
+ "mapper",
+ [
+ lambda values, index: {i: e for e, i in zip(values, index)},
+ lambda values, index: pd.Series(values, index)])
+ def test_map_dictlike(self, mapper):
+ # GH 12756
+ expected = Index(['foo', 'bar', 'baz'])
+ index = tm.makeIntIndex(3)
+ result = index.map(mapper(expected.values, index))
+ tm.assert_index_equal(result, expected)
+
+ # TODO: replace with fixture
+ for name in self.indices.keys():
+ if name == 'catIndex':
+ # Tested in test_categorical
+ continue
+ elif name == 'repeats':
+ # Cannot map duplicated index
+ continue
+
+ index = self.indices[name]
+ expected = Index(np.arange(len(index), 0, -1))
+
+ # to match proper result coercion for uints
+ if name == 'empty':
+ expected = Index([])
+
+ result = index.map(mapper(expected, index))
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("mapper", [
+ Series(['foo', 2., 'baz'], index=[0, 2, -1]),
+ {0: 'foo', 2: 2.0, -1: 'baz'}])
+ def test_map_with_non_function_missing_values(self, mapper):
+ # GH 12756
+ expected = Index([2., np.nan, 'foo'])
+ result = Index([2, 1, 0]).map(mapper)
+
+ tm.assert_index_equal(expected, result)
+
+ def test_map_na_exclusion(self):
+ index = Index([1.5, np.nan, 3, np.nan, 5])
+
+ result = index.map(lambda x: x * 2, na_action='ignore')
+ expected = index * 2
+ tm.assert_index_equal(result, expected)
+
+ def test_map_defaultdict(self):
+ index = Index([1, 2, 3])
+ default_dict = defaultdict(lambda: 'blank')
+ default_dict[1] = 'stuff'
+ result = index.map(default_dict)
+ expected = Index(['stuff', 'blank', 'blank'])
+ tm.assert_index_equal(result, expected)
+
+ def test_append_multiple(self):
+ index = Index(['a', 'b', 'c', 'd', 'e', 'f'])
+
+ foos = [index[:2], index[2:4], index[4:]]
+ result = foos[0].append(foos[1:])
+ tm.assert_index_equal(result, index)
+
+ # empty
+ result = index.append([])
+ tm.assert_index_equal(result, index)
+
+ @pytest.mark.parametrize("name,expected", [
+ ('foo', 'foo'), ('bar', None)])
+ def test_append_empty_preserve_name(self, name, expected):
+ left = Index([], name='foo')
+ right = Index([1, 2, 3], name=name)
+
+ result = left.append(right)
+ assert result.name == expected
+
+ @pytest.mark.parametrize("second_name,expected", [
+ (None, None), ('name', 'name')])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_name_preservation(self, second_name, expected, sort):
+        # TODO: replace with fixture
+ first = self.strIndex[5:20]
+ second = self.strIndex[:10]
+ answer = self.strIndex[10:20]
+
+ first.name = 'name'
+ second.name = second_name
+ result = first.difference(second, sort=sort)
+
+ assert tm.equalContents(result, answer)
+
+ if expected is None:
+ assert result.name is None
+ else:
+ assert result.name == expected
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_empty_arg(self, sort):
+ first = self.strIndex[5:20]
+        first.name = 'name'
+ result = first.difference([], sort)
+
+ assert tm.equalContents(result, first)
+ assert result.name == first.name
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_identity(self, sort):
+ first = self.strIndex[5:20]
+        first.name = 'name'
+ result = first.difference(first, sort)
+
+ assert len(result) == 0
+ assert result.name == first.name
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_sort(self, sort):
+ first = self.strIndex[5:20]
+ second = self.strIndex[:10]
+
+ result = first.difference(second, sort)
+ expected = self.strIndex[10:20]
+
+ if sort is None:
+ expected = expected.sort_values()
+
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_symmetric_difference(self, sort):
+ # smoke
+ index1 = Index([5, 2, 3, 4], name='index1')
+ index2 = Index([2, 3, 4, 1])
+ result = index1.symmetric_difference(index2, sort=sort)
+ expected = Index([5, 1])
+ assert tm.equalContents(result, expected)
+ assert result.name is None
+ if sort is None:
+ expected = expected.sort_values()
+ tm.assert_index_equal(result, expected)
+
+ # __xor__ syntax
+ expected = index1 ^ index2
+ assert tm.equalContents(result, expected)
+ assert result.name is None
+
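+    def test_symmetric_difference_definition_sketch(self):
+        # A sketch of the identity behind these tests: the symmetric
+        # difference is the union minus the intersection.
+        a = Index([1, 2, 3])
+        b = Index([2, 3, 4])
+        expected = a.union(b).difference(a.intersection(b))
+        tm.assert_index_equal(a.symmetric_difference(b), expected)
+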
+ @pytest.mark.parametrize('opname', ['difference', 'symmetric_difference'])
+ def test_difference_incomparable(self, opname):
+ a = pd.Index([3, pd.Timestamp('2000'), 1])
+ b = pd.Index([2, pd.Timestamp('1999'), 1])
+ op = operator.methodcaller(opname, b)
+
+ # sort=None, the default
+ result = op(a)
+ expected = pd.Index([3, pd.Timestamp('2000'), 2, pd.Timestamp('1999')])
+ if opname == 'difference':
+ expected = expected[:2]
+ tm.assert_index_equal(result, expected)
+
+ # sort=False
+ op = operator.methodcaller(opname, b, sort=False)
+ result = op(a)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.xfail(reason="Not implemented")
+ @pytest.mark.parametrize('opname', ['difference', 'symmetric_difference'])
+ def test_difference_incomparable_true(self, opname):
+ # TODO decide on True behaviour
+ # # sort=True, raises
+ a = pd.Index([3, pd.Timestamp('2000'), 1])
+ b = pd.Index([2, pd.Timestamp('1999'), 1])
+ op = operator.methodcaller(opname, b, sort=True)
+
+ with pytest.raises(TypeError, match='Cannot compare'):
+ op(a)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_symmetric_difference_mi(self, sort):
+ index1 = MultiIndex.from_tuples(self.tuples)
+ index2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)])
+ result = index1.symmetric_difference(index2, sort=sort)
+ expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)])
+ if sort is None:
+ expected = expected.sort_values()
+ tm.assert_index_equal(result, expected)
+ assert tm.equalContents(result, expected)
+
+ @pytest.mark.parametrize("index2,expected", [
+ (Index([0, 1, np.nan]), Index([2.0, 3.0, 0.0])),
+ (Index([0, 1]), Index([np.nan, 2.0, 3.0, 0.0]))])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_symmetric_difference_missing(self, index2, expected, sort):
+ # GH 13514 change: {nan} - {nan} == {}
+ # (GH 6444, sorting of nans, is no longer an issue)
+ index1 = Index([1, np.nan, 2, 3])
+
+ result = index1.symmetric_difference(index2, sort=sort)
+ if sort is None:
+ expected = expected.sort_values()
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_symmetric_difference_non_index(self, sort):
+ index1 = Index([1, 2, 3, 4], name='index1')
+ index2 = np.array([2, 3, 4, 5])
+ expected = Index([1, 5])
+ result = index1.symmetric_difference(index2, sort=sort)
+ assert tm.equalContents(result, expected)
+ assert result.name == 'index1'
+
+ result = index1.symmetric_difference(index2, result_name='new_name',
+ sort=sort)
+ assert tm.equalContents(result, expected)
+ assert result.name == 'new_name'
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_type(self, sort):
+ # GH 20040
+ # If taking difference of a set and itself, it
+ # needs to preserve the type of the index
+ skip_index_keys = ['repeats']
+ for key, index in self.generate_index_types(skip_index_keys):
+ result = index.difference(index, sort=sort)
+ expected = index.drop(index)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection_difference(self, sort):
+ # GH 20040
+ # Test that the intersection of an index with an
+ # empty index produces the same index as the difference
+ # of an index with itself. Test for all types
+ skip_index_keys = ['repeats']
+ for key, index in self.generate_index_types(skip_index_keys):
+ inter = index.intersection(index.drop(index))
+ diff = index.difference(index, sort=sort)
+ tm.assert_index_equal(inter, diff)
+
+ @pytest.mark.parametrize("attr,expected", [
+ ('strIndex', False), ('boolIndex', False), ('catIndex', False),
+ ('intIndex', True), ('dateIndex', False), ('floatIndex', True)])
+ def test_is_numeric(self, attr, expected):
+ assert getattr(self, attr).is_numeric() == expected
+
+ @pytest.mark.parametrize("attr,expected", [
+ ('strIndex', True), ('boolIndex', True), ('catIndex', False),
+ ('intIndex', False), ('dateIndex', False), ('floatIndex', False)])
+ def test_is_object(self, attr, expected):
+ assert getattr(self, attr).is_object() == expected
+
+ @pytest.mark.parametrize("attr,expected", [
+ ('strIndex', False), ('boolIndex', False), ('catIndex', False),
+ ('intIndex', False), ('dateIndex', True), ('floatIndex', False)])
+ def test_is_all_dates(self, attr, expected):
+ assert getattr(self, attr).is_all_dates == expected
+
+ def test_summary(self):
+ self._check_method_works(Index._summary)
+ # GH3869
+ ind = Index(['{other}%s', "~:{range}:0"], name='A')
+ result = ind._summary()
+ # shouldn't be formatted accidentally.
+ assert '~:{range}:0' in result
+ assert '{other}%s' in result
+
+ # GH18217
+ def test_summary_deprecated(self):
+ ind = Index(['{other}%s', "~:{range}:0"], name='A')
+
+ with tm.assert_produces_warning(FutureWarning):
+ ind.summary()
+
+ def test_format(self):
+ self._check_method_works(Index.format)
+
+ # GH 14626
+        # windows has different precision on datetime.datetime.now (it
+        # doesn't include microseconds); the default Timestamp repr
+        # shows these but Index formatting does not, so we skip then
+ now = datetime.now()
+ if not str(now).endswith("000"):
+ index = Index([now])
+ formatted = index.format()
+ expected = [str(index[0])]
+ assert formatted == expected
+
+ self.strIndex[:0].format()
+
+ @pytest.mark.parametrize("vals", [
+ [1, 2.0 + 3.0j, 4.], ['a', 'b', 'c']])
+ def test_format_missing(self, vals, nulls_fixture):
+ # 2845
+ vals = list(vals) # Copy for each iteration
+ vals.append(nulls_fixture)
+ index = Index(vals)
+
+ formatted = index.format()
+ expected = [str(index[0]), str(index[1]), str(index[2]), u('NaN')]
+
+ assert formatted == expected
+ assert index[3] is nulls_fixture
+
+ def test_format_with_name_time_info(self):
+ # bug I fixed 12/20/2011
+ inc = timedelta(hours=4)
+ dates = Index([dt + inc for dt in self.dateIndex], name='something')
+
+ formatted = dates.format(name=True)
+ assert formatted[0] == 'something'
+
+ def test_format_datetime_with_time(self):
+ t = Index([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)])
+
+ result = t.format()
+ expected = ['2012-02-07 00:00:00', '2012-02-07 23:00:00']
+ assert len(result) == 2
+ assert result == expected
+
+ @pytest.mark.parametrize("op", ['any', 'all'])
+ def test_logical_compat(self, op):
+ index = self.create_index()
+ assert getattr(index, op)() == getattr(index.values, op)()
+
+ def _check_method_works(self, method):
+ # TODO: make this a dedicated test with parametrized methods
+ method(self.empty)
+ method(self.dateIndex)
+ method(self.unicodeIndex)
+ method(self.strIndex)
+ method(self.intIndex)
+ method(self.tuples)
+ method(self.catIndex)
+
+ def test_get_indexer(self):
+ index1 = Index([1, 2, 3, 4, 5])
+ index2 = Index([2, 4, 6])
+
+ r1 = index1.get_indexer(index2)
+ e1 = np.array([1, 3, -1], dtype=np.intp)
+ assert_almost_equal(r1, e1)
+
+ @pytest.mark.parametrize("reverse", [True, False])
+ @pytest.mark.parametrize("expected,method", [
+ (np.array([-1, 0, 0, 1, 1], dtype=np.intp), 'pad'),
+ (np.array([-1, 0, 0, 1, 1], dtype=np.intp), 'ffill'),
+ (np.array([0, 0, 1, 1, 2], dtype=np.intp), 'backfill'),
+ (np.array([0, 0, 1, 1, 2], dtype=np.intp), 'bfill')])
+ def test_get_indexer_methods(self, reverse, expected, method):
+ index1 = Index([1, 2, 3, 4, 5])
+ index2 = Index([2, 4, 6])
+
+ if reverse:
+ index1 = index1[::-1]
+ expected = expected[::-1]
+
+ result = index2.get_indexer(index1, method=method)
+ assert_almost_equal(result, expected)
+
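+    def test_get_indexer_fill_direction_sketch(self):
+        # A sketch of the method aliases tabulated above: 'pad'/'ffill'
+        # match each target to the previous label, 'backfill'/'bfill'
+        # to the next one.
+        index = Index([10, 20, 30])
+        tm.assert_numpy_array_equal(
+            index.get_indexer([15], method='pad'),
+            np.array([0], dtype=np.intp))
+        tm.assert_numpy_array_equal(
+            index.get_indexer([15], method='backfill'),
+            np.array([1], dtype=np.intp))
+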
+ def test_get_indexer_invalid(self):
+ # GH10411
+ index = Index(np.arange(10))
+
+ with pytest.raises(ValueError, match='tolerance argument'):
+ index.get_indexer([1, 0], tolerance=1)
+
+ with pytest.raises(ValueError, match='limit argument'):
+ index.get_indexer([1, 0], limit=1)
+
+ @pytest.mark.parametrize(
+ 'method, tolerance, indexer, expected',
+ [
+ ('pad', None, [0, 5, 9], [0, 5, 9]),
+ ('backfill', None, [0, 5, 9], [0, 5, 9]),
+ ('nearest', None, [0, 5, 9], [0, 5, 9]),
+ ('pad', 0, [0, 5, 9], [0, 5, 9]),
+ ('backfill', 0, [0, 5, 9], [0, 5, 9]),
+ ('nearest', 0, [0, 5, 9], [0, 5, 9]),
+
+ ('pad', None, [0.2, 1.8, 8.5], [0, 1, 8]),
+ ('backfill', None, [0.2, 1.8, 8.5], [1, 2, 9]),
+ ('nearest', None, [0.2, 1.8, 8.5], [0, 2, 9]),
+ ('pad', 1, [0.2, 1.8, 8.5], [0, 1, 8]),
+ ('backfill', 1, [0.2, 1.8, 8.5], [1, 2, 9]),
+ ('nearest', 1, [0.2, 1.8, 8.5], [0, 2, 9]),
+
+ ('pad', 0.2, [0.2, 1.8, 8.5], [0, -1, -1]),
+ ('backfill', 0.2, [0.2, 1.8, 8.5], [-1, 2, -1]),
+ ('nearest', 0.2, [0.2, 1.8, 8.5], [0, 2, -1])])
+ def test_get_indexer_nearest(self, method, tolerance, indexer, expected):
+ index = Index(np.arange(10))
+
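+        # a scalar tolerance caps |target - matched label|: with
+        # tolerance=0.2, targets farther than 0.2 from every label map to -1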
+ actual = index.get_indexer(indexer, method=method, tolerance=tolerance)
+ tm.assert_numpy_array_equal(actual, np.array(expected,
+ dtype=np.intp))
+
+ @pytest.mark.parametrize('listtype', [list, tuple, Series, np.array])
+ @pytest.mark.parametrize(
+ 'tolerance, expected',
+ list(zip([[0.3, 0.3, 0.1], [0.2, 0.1, 0.1],
+ [0.1, 0.5, 0.5]],
+ [[0, 2, -1], [0, -1, -1],
+ [-1, 2, 9]])))
+ def test_get_indexer_nearest_listlike_tolerance(self, tolerance,
+ expected, listtype):
+ index = Index(np.arange(10))
+
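+        # a list-like tolerance supplies one cap per target and must match
+        # the target's length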
+ actual = index.get_indexer([0.2, 1.8, 8.5], method='nearest',
+ tolerance=listtype(tolerance))
+ tm.assert_numpy_array_equal(actual, np.array(expected,
+ dtype=np.intp))
+
+ def test_get_indexer_nearest_error(self):
+ index = Index(np.arange(10))
+ with pytest.raises(ValueError, match='limit argument'):
+ index.get_indexer([1, 0], method='nearest', limit=1)
+
+ with pytest.raises(ValueError, match='tolerance size must match'):
+ index.get_indexer([1, 0], method='nearest',
+ tolerance=[1, 2, 3])
+
+ @pytest.mark.parametrize("method,expected", [
+ ('pad', [8, 7, 0]), ('backfill', [9, 8, 1]), ('nearest', [9, 7, 0])])
+ def test_get_indexer_nearest_decreasing(self, method, expected):
+ index = Index(np.arange(10))[::-1]
+
+ actual = index.get_indexer([0, 5, 9], method=method)
+ tm.assert_numpy_array_equal(actual, np.array([9, 4, 0], dtype=np.intp))
+
+ actual = index.get_indexer([0.2, 1.8, 8.5], method=method)
+ tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp))
+
+ @pytest.mark.parametrize("method,expected", [
+ ('pad', np.array([-1, 0, 1, 1], dtype=np.intp)),
+ ('backfill', np.array([0, 0, 1, -1], dtype=np.intp))])
+ def test_get_indexer_strings(self, method, expected):
+ index = pd.Index(['b', 'c'])
+ actual = index.get_indexer(['a', 'b', 'c', 'd'], method=method)
+
+ tm.assert_numpy_array_equal(actual, expected)
+
+ def test_get_indexer_strings_raises(self):
+ index = pd.Index(['b', 'c'])
+
+ with pytest.raises(TypeError):
+ index.get_indexer(['a', 'b', 'c', 'd'], method='nearest')
+
+ with pytest.raises(TypeError):
+ index.get_indexer(['a', 'b', 'c', 'd'], method='pad', tolerance=2)
+
+ with pytest.raises(TypeError):
+ index.get_indexer(['a', 'b', 'c', 'd'], method='pad',
+ tolerance=[2, 2, 2, 2])
+
+ def test_get_indexer_numeric_index_boolean_target(self):
+ # GH 16877
+ numeric_index = pd.Index(range(4))
+ result = numeric_index.get_indexer([True, False, True])
+ expected = np.array([-1, -1, -1], dtype=np.intp)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_get_indexer_with_NA_values(self, unique_nulls_fixture,
+ unique_nulls_fixture2):
+ # GH 22332
+        # check pairwise that no pair of NA values gets mangled
+ if unique_nulls_fixture is unique_nulls_fixture2:
+ return # skip it, values are not unique
+ arr = np.array([unique_nulls_fixture,
+ unique_nulls_fixture2], dtype=np.object)
+ index = pd.Index(arr, dtype=np.object)
+ result = index.get_indexer([unique_nulls_fixture,
+ unique_nulls_fixture2, 'Unknown'])
+ expected = np.array([0, 1, -1], dtype=np.intp)
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize("method", [None, 'pad', 'backfill', 'nearest'])
+ def test_get_loc(self, method):
+ index = pd.Index([0, 1, 2])
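+        # get_loc returns the integer position of a single label; passing a
+        # method additionally allows inexact matches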
+ assert index.get_loc(1, method=method) == 1
+
+ if method:
+ assert index.get_loc(1, method=method, tolerance=0) == 1
+
+ @pytest.mark.parametrize("method", [None, 'pad', 'backfill', 'nearest'])
+ def test_get_loc_raises_bad_label(self, method):
+ index = pd.Index([0, 1, 2])
+ if method:
+ # Messages vary across versions
+ if PY36:
+ msg = 'not supported between'
+ elif PY35:
+ msg = 'unorderable types'
+ else:
+ if method == 'nearest':
+ msg = 'unsupported operand'
+ else:
+ msg = 'requires scalar valued input'
+ else:
+ msg = 'invalid key'
+
+ with pytest.raises(TypeError, match=msg):
+ index.get_loc([1, 2], method=method)
+
+ @pytest.mark.parametrize("method,loc", [
+ ('pad', 1), ('backfill', 2), ('nearest', 1)])
+ def test_get_loc_tolerance(self, method, loc):
+ index = pd.Index([0, 1, 2])
+ assert index.get_loc(1.1, method) == loc
+ assert index.get_loc(1.1, method, tolerance=1) == loc
+
+ @pytest.mark.parametrize("method", ['pad', 'backfill', 'nearest'])
+ def test_get_loc_outside_tolerance_raises(self, method):
+ index = pd.Index([0, 1, 2])
+ with pytest.raises(KeyError, match='1.1'):
+ index.get_loc(1.1, method, tolerance=0.05)
+
+ def test_get_loc_bad_tolerance_raises(self):
+ index = pd.Index([0, 1, 2])
+ with pytest.raises(ValueError, match='must be numeric'):
+ index.get_loc(1.1, 'nearest', tolerance='invalid')
+
+ def test_get_loc_tolerance_no_method_raises(self):
+ index = pd.Index([0, 1, 2])
+ with pytest.raises(ValueError, match='tolerance .* valid if'):
+ index.get_loc(1.1, tolerance=1)
+
+ def test_get_loc_raises_missized_tolerance(self):
+ index = pd.Index([0, 1, 2])
+ with pytest.raises(ValueError, match='tolerance size must match'):
+ index.get_loc(1.1, 'nearest', tolerance=[1, 1])
+
+ def test_get_loc_raises_object_nearest(self):
+ index = pd.Index(['a', 'c'])
+ with pytest.raises(TypeError, match='unsupported operand type'):
+ index.get_loc('a', method='nearest')
+
+ def test_get_loc_raises_object_tolerance(self):
+ index = pd.Index(['a', 'c'])
+ with pytest.raises(TypeError, match='unsupported operand type'):
+ index.get_loc('a', method='pad', tolerance='invalid')
+
+ @pytest.mark.parametrize("dtype", [int, float])
+ def test_slice_locs(self, dtype):
+ index = Index(np.array([0, 1, 2, 5, 6, 7, 9, 10], dtype=dtype))
+ n = len(index)
+
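+        # slice_locs translates label bounds into positional (start, stop);
+        # on a monotonic index, absent bounds snap to the nearest position
+        # that keeps the slice well-defined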
+ assert index.slice_locs(start=2) == (2, n)
+ assert index.slice_locs(start=3) == (3, n)
+ assert index.slice_locs(3, 8) == (3, 6)
+ assert index.slice_locs(5, 10) == (3, n)
+ assert index.slice_locs(end=8) == (0, 6)
+ assert index.slice_locs(end=9) == (0, 7)
+
+ # reversed
+ index2 = index[::-1]
+ assert index2.slice_locs(8, 2) == (2, 6)
+ assert index2.slice_locs(7, 3) == (2, 5)
+
+ def test_slice_float_locs(self):
+ index = Index(np.array([0, 1, 2, 5, 6, 7, 9, 10], dtype=float))
+ n = len(index)
+ assert index.slice_locs(5.0, 10.0) == (3, n)
+ assert index.slice_locs(4.5, 10.5) == (3, 8)
+
+ index2 = index[::-1]
+ assert index2.slice_locs(8.5, 1.5) == (2, 6)
+ assert index2.slice_locs(10.5, -1) == (0, n)
+
+ @pytest.mark.xfail(reason="Assertions were not correct - see GH#20915")
+ def test_slice_ints_with_floats_raises(self):
+ # int slicing with floats
+ # GH 4892, these are all TypeErrors
+        index = Index(np.array([0, 1, 2, 5, 6, 7, 9, 10], dtype=int))
+
+        pytest.raises(TypeError,
+                      lambda: index.slice_locs(5.0, 10.0))
+        pytest.raises(TypeError,
+                      lambda: index.slice_locs(4.5, 10.5))
+
+        index2 = index[::-1]
+        pytest.raises(TypeError,
+                      lambda: index2.slice_locs(8.5, 1.5))
+        pytest.raises(TypeError,
+                      lambda: index2.slice_locs(10.5, -1))
+
+ def test_slice_locs_dup(self):
+ index = Index(['a', 'a', 'b', 'c', 'd', 'd'])
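+        # with duplicate labels, the slice expands to cover every occurrence
+        # of the boundary labels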
+ assert index.slice_locs('a', 'd') == (0, 6)
+ assert index.slice_locs(end='d') == (0, 6)
+ assert index.slice_locs('a', 'c') == (0, 4)
+ assert index.slice_locs('b', 'd') == (2, 6)
+
+ index2 = index[::-1]
+ assert index2.slice_locs('d', 'a') == (0, 6)
+ assert index2.slice_locs(end='a') == (0, 6)
+ assert index2.slice_locs('d', 'b') == (0, 4)
+ assert index2.slice_locs('c', 'a') == (2, 6)
+
+ @pytest.mark.parametrize("dtype", [int, float])
+ def test_slice_locs_dup_numeric(self, dtype):
+ index = Index(np.array([10, 12, 12, 14], dtype=dtype))
+ assert index.slice_locs(12, 12) == (1, 3)
+ assert index.slice_locs(11, 13) == (1, 3)
+
+ index2 = index[::-1]
+ assert index2.slice_locs(12, 12) == (1, 3)
+ assert index2.slice_locs(13, 11) == (1, 3)
+
+ def test_slice_locs_na(self):
+ index = Index([np.nan, 1, 2])
+ assert index.slice_locs(1) == (1, 3)
+ assert index.slice_locs(np.nan) == (0, 3)
+
+ index = Index([0, np.nan, np.nan, 1, 2])
+ assert index.slice_locs(np.nan) == (1, 5)
+
+ def test_slice_locs_na_raises(self):
+ index = Index([np.nan, 1, 2])
+ with pytest.raises(KeyError, match=''):
+ index.slice_locs(start=1.5)
+
+ with pytest.raises(KeyError, match=''):
+ index.slice_locs(end=1.5)
+
+ @pytest.mark.parametrize("in_slice,expected", [
+ (pd.IndexSlice[::-1], 'yxdcb'), (pd.IndexSlice['b':'y':-1], ''),
+ (pd.IndexSlice['b'::-1], 'b'), (pd.IndexSlice[:'b':-1], 'yxdcb'),
+ (pd.IndexSlice[:'y':-1], 'y'), (pd.IndexSlice['y'::-1], 'yxdcb'),
+ (pd.IndexSlice['y'::-4], 'yb'),
+ # absent labels
+ (pd.IndexSlice[:'a':-1], 'yxdcb'), (pd.IndexSlice[:'a':-2], 'ydb'),
+ (pd.IndexSlice['z'::-1], 'yxdcb'), (pd.IndexSlice['z'::-3], 'yc'),
+ (pd.IndexSlice['m'::-1], 'dcb'), (pd.IndexSlice[:'m':-1], 'yx'),
+ (pd.IndexSlice['a':'a':-1], ''), (pd.IndexSlice['z':'z':-1], ''),
+ (pd.IndexSlice['m':'m':-1], '')
+ ])
+ def test_slice_locs_negative_step(self, in_slice, expected):
+ index = Index(list('bcdxy'))
+
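+        # with a negative step the bounds are resolved right-to-left; absent
+        # labels fall back to the nearest valid position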
+ s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop,
+ in_slice.step)
+ result = index[s_start:s_stop:in_slice.step]
+ expected = pd.Index(list(expected))
+ tm.assert_index_equal(result, expected)
+
+ def test_drop_by_str_label(self):
+ # TODO: Parametrize these after replacing self.strIndex with fixture
+ n = len(self.strIndex)
+ drop = self.strIndex[lrange(5, 10)]
+ dropped = self.strIndex.drop(drop)
+
+ expected = self.strIndex[lrange(5) + lrange(10, n)]
+ tm.assert_index_equal(dropped, expected)
+
+ dropped = self.strIndex.drop(self.strIndex[0])
+ expected = self.strIndex[1:]
+ tm.assert_index_equal(dropped, expected)
+
+ @pytest.mark.parametrize("keys", [['foo', 'bar'], ['1', 'bar']])
+ def test_drop_by_str_label_raises_missing_keys(self, keys):
+ with pytest.raises(KeyError, match=''):
+ self.strIndex.drop(keys)
+
+ def test_drop_by_str_label_errors_ignore(self):
+ # TODO: Parametrize these after replacing self.strIndex with fixture
+
+ # errors='ignore'
+ n = len(self.strIndex)
+ drop = self.strIndex[lrange(5, 10)]
+ mixed = drop.tolist() + ['foo']
+ dropped = self.strIndex.drop(mixed, errors='ignore')
+
+ expected = self.strIndex[lrange(5) + lrange(10, n)]
+ tm.assert_index_equal(dropped, expected)
+
+ dropped = self.strIndex.drop(['foo', 'bar'], errors='ignore')
+ expected = self.strIndex[lrange(n)]
+ tm.assert_index_equal(dropped, expected)
+
+ def test_drop_by_numeric_label_loc(self):
+ # TODO: Parametrize numeric and str tests after self.strIndex fixture
+ index = Index([1, 2, 3])
+ dropped = index.drop(1)
+ expected = Index([2, 3])
+
+ tm.assert_index_equal(dropped, expected)
+
+ def test_drop_by_numeric_label_raises_missing_keys(self):
+ index = Index([1, 2, 3])
+ with pytest.raises(KeyError, match=''):
+ index.drop([3, 4])
+
+ @pytest.mark.parametrize("key,expected", [
+ (4, Index([1, 2, 3])), ([3, 4, 5], Index([1, 2]))])
+ def test_drop_by_numeric_label_errors_ignore(self, key, expected):
+ index = Index([1, 2, 3])
+ dropped = index.drop(key, errors='ignore')
+
+ tm.assert_index_equal(dropped, expected)
+
+ @pytest.mark.parametrize("values", [['a', 'b', ('c', 'd')],
+ ['a', ('c', 'd'), 'b'],
+ [('c', 'd'), 'a', 'b']])
+ @pytest.mark.parametrize("to_drop", [[('c', 'd'), 'a'], ['a', ('c', 'd')]])
+ def test_drop_tuple(self, values, to_drop):
+ # GH 18304
+ index = pd.Index(values)
+ expected = pd.Index(['b'])
+
+ result = index.drop(to_drop)
+ tm.assert_index_equal(result, expected)
+
+ removed = index.drop(to_drop[0])
+ for drop_me in to_drop[1], [to_drop[1]]:
+ result = removed.drop(drop_me)
+ tm.assert_index_equal(result, expected)
+
+ removed = index.drop(to_drop[1])
+ for drop_me in to_drop[1], [to_drop[1]]:
+ pytest.raises(KeyError, removed.drop, drop_me)
+
+ @pytest.mark.parametrize("method,expected,sort", [
+ ('intersection', np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')],
+ dtype=[('num', int), ('let', 'a1')]),
+ False),
+
+ ('intersection', np.array([(1, 'A'), (1, 'B'), (2, 'A'), (2, 'B')],
+ dtype=[('num', int), ('let', 'a1')]),
+ None),
+
+ ('union', np.array([(1, 'A'), (1, 'B'), (1, 'C'), (2, 'A'), (2, 'B'),
+ (2, 'C')], dtype=[('num', int), ('let', 'a1')]),
+ None)
+ ])
+ def test_tuple_union_bug(self, method, expected, sort):
+ index1 = Index(np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')],
+ dtype=[('num', int), ('let', 'a1')]))
+ index2 = Index(np.array([(1, 'A'), (2, 'A'), (1, 'B'),
+ (2, 'B'), (1, 'C'), (2, 'C')],
+ dtype=[('num', int), ('let', 'a1')]))
+
+ result = getattr(index1, method)(index2, sort=sort)
+ assert result.ndim == 1
+
+ expected = Index(expected)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("attr", [
+ 'is_monotonic_increasing', 'is_monotonic_decreasing',
+ '_is_strictly_monotonic_increasing',
+ '_is_strictly_monotonic_decreasing'])
+ def test_is_monotonic_incomparable(self, attr):
+ index = Index([5, datetime.now(), 7])
+ assert not getattr(index, attr)
+
+ def test_get_set_value(self):
+ # TODO: Remove function? GH 19728
+ values = np.random.randn(100)
+ date = self.dateIndex[67]
+
+ assert_almost_equal(self.dateIndex.get_value(values, date), values[67])
+
+ self.dateIndex.set_value(values, date, 10)
+ assert values[67] == 10
+
+ @pytest.mark.parametrize("values", [
+ ['foo', 'bar', 'quux'], {'foo', 'bar', 'quux'}])
+ @pytest.mark.parametrize("index,expected", [
+ (Index(['qux', 'baz', 'foo', 'bar']),
+ np.array([False, False, True, True])),
+ (Index([]), np.array([], dtype=bool)) # empty
+ ])
+ def test_isin(self, values, index, expected):
+ result = index.isin(values)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_isin_nan_common_object(self, nulls_fixture, nulls_fixture2):
+ # Test cartesian product of null fixtures and ensure that we don't
+ # mangle the various types (save a corner case with PyPy)
+
+ # all nans are the same
+ if (isinstance(nulls_fixture, float) and
+ isinstance(nulls_fixture2, float) and
+ math.isnan(nulls_fixture) and
+ math.isnan(nulls_fixture2)):
+ tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin(
+ [nulls_fixture2]), np.array([False, True]))
+
+ elif nulls_fixture is nulls_fixture2: # should preserve NA type
+ tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin(
+ [nulls_fixture2]), np.array([False, True]))
+
+ else:
+ tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin(
+ [nulls_fixture2]), np.array([False, False]))
+
+ def test_isin_nan_common_float64(self, nulls_fixture):
+ if nulls_fixture is pd.NaT:
+ pytest.skip("pd.NaT not compatible with Float64Index")
+
+ # Float64Index overrides isin, so must be checked separately
+ tm.assert_numpy_array_equal(Float64Index([1.0, nulls_fixture]).isin(
+ [np.nan]), np.array([False, True]))
+
+ # we cannot compare NaT with NaN
+ tm.assert_numpy_array_equal(Float64Index([1.0, nulls_fixture]).isin(
+ [pd.NaT]), np.array([False, False]))
+
+ @pytest.mark.parametrize("level", [0, -1])
+ @pytest.mark.parametrize("index", [
+ Index(['qux', 'baz', 'foo', 'bar']),
+ # Float64Index overrides isin, so must be checked separately
+ Float64Index([1.0, 2.0, 3.0, 4.0])])
+ def test_isin_level_kwarg(self, level, index):
+ values = index.tolist()[-2:] + ['nonexisting']
+
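+        # for a flat Index the only positional levels are 0 and -1; the
+        # index's name is accepted as well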
+ expected = np.array([False, False, True, True])
+ tm.assert_numpy_array_equal(expected, index.isin(values, level=level))
+
+ index.name = 'foobar'
+ tm.assert_numpy_array_equal(expected,
+ index.isin(values, level='foobar'))
+
+ @pytest.mark.parametrize("level", [1, 10, -2])
+ @pytest.mark.parametrize("index", [
+ Index(['qux', 'baz', 'foo', 'bar']),
+ # Float64Index overrides isin, so must be checked separately
+ Float64Index([1.0, 2.0, 3.0, 4.0])])
+ def test_isin_level_kwarg_raises_bad_index(self, level, index):
+ with pytest.raises(IndexError, match='Too many levels'):
+ index.isin([], level=level)
+
+ @pytest.mark.parametrize("level", [1.0, 'foobar', 'xyzzy', np.nan])
+ @pytest.mark.parametrize("index", [
+ Index(['qux', 'baz', 'foo', 'bar']),
+ Float64Index([1.0, 2.0, 3.0, 4.0])])
+ def test_isin_level_kwarg_raises_key(self, level, index):
+ with pytest.raises(KeyError, match='must be same as name'):
+ index.isin([], level=level)
+
+ @pytest.mark.parametrize("empty", [[], Series(), np.array([])])
+ def test_isin_empty(self, empty):
+ # see gh-16991
+ index = Index(["a", "b"])
+ expected = np.array([False, False])
+
+ result = index.isin(empty)
+ tm.assert_numpy_array_equal(expected, result)
+
+ @pytest.mark.parametrize("values", [
+ [1, 2, 3, 4],
+ [1., 2., 3., 4.],
+ [True, True, True, True],
+ ["foo", "bar", "baz", "qux"],
+ pd.date_range('2018-01-01', freq='D', periods=4)])
+ def test_boolean_cmp(self, values):
+ index = Index(values)
+ result = (index == values)
+ expected = np.array([True, True, True, True], dtype=bool)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize("name,level", [
+ (None, 0), ('a', 'a')])
+ def test_get_level_values(self, name, level):
+ expected = self.strIndex.copy()
+ if name:
+ expected.name = name
+
+ result = expected.get_level_values(level)
+ tm.assert_index_equal(result, expected)
+
+ def test_slice_keep_name(self):
+ index = Index(['a', 'b'], name='asdf')
+ assert index.name == index[1:].name
+
+ # instance attributes of the form self.<name>Index
+ @pytest.mark.parametrize('index_kind',
+ ['unicode', 'str', 'date', 'int', 'float'])
+ def test_join_self(self, join_type, index_kind):
+
+ res = getattr(self, '{0}Index'.format(index_kind))
+
+ joined = res.join(res, how=join_type)
+ assert res is joined
+
+ @pytest.mark.parametrize("method", ['strip', 'rstrip', 'lstrip'])
+ def test_str_attribute(self, method):
+ # GH9068
+ index = Index([' jack', 'jill ', ' jesse ', 'frank'])
+ expected = Index([getattr(str, method)(x) for x in index.values])
+
+ result = getattr(index.str, method)()
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("index", [
+ Index(range(5)), tm.makeDateIndex(10),
+ MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]),
+ period_range(start='2000', end='2010', freq='A')])
+ def test_str_attribute_raises(self, index):
+ with pytest.raises(AttributeError, match='only use .str accessor'):
+ index.str.repeat(2)
+
+ @pytest.mark.parametrize("expand,expected", [
+ (None, Index([['a', 'b', 'c'], ['d', 'e'], ['f']])),
+ (False, Index([['a', 'b', 'c'], ['d', 'e'], ['f']])),
+ (True, MultiIndex.from_tuples([('a', 'b', 'c'), ('d', 'e', np.nan),
+ ('f', np.nan, np.nan)]))])
+ def test_str_split(self, expand, expected):
+ index = Index(['a b c', 'd e', 'f'])
+ if expand is not None:
+ result = index.str.split(expand=expand)
+ else:
+ result = index.str.split()
+
+ tm.assert_index_equal(result, expected)
+
+ def test_str_bool_return(self):
+ # test boolean case, should return np.array instead of boolean Index
+ index = Index(['a1', 'a2', 'b1', 'b2'])
+ result = index.str.startswith('a')
+ expected = np.array([True, True, False, False])
+
+ tm.assert_numpy_array_equal(result, expected)
+ assert isinstance(result, np.ndarray)
+
+ def test_str_bool_series_indexing(self):
+ index = Index(['a1', 'a2', 'b1', 'b2'])
+ s = Series(range(4), index=index)
+
+ result = s[s.index.str.startswith('a')]
+ expected = Series(range(2), index=['a1', 'a2'])
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("index,expected", [
+ (Index(list('abcd')), True), (Index(range(4)), False)])
+ def test_tab_completion(self, index, expected):
+ # GH 9910
+ result = 'str' in dir(index)
+ assert result == expected
+
+ def test_indexing_doesnt_change_class(self):
+ index = Index([1, 2, 3, 'a', 'b', 'c'])
+
+ assert index[1:3].identical(pd.Index([2, 3], dtype=np.object_))
+ assert index[[0, 1]].identical(pd.Index([1, 2], dtype=np.object_))
+
+ def test_outer_join_sort(self):
+ left_index = Index(np.random.permutation(15))
+ right_index = tm.makeDateIndex(10)
+
+ with tm.assert_produces_warning(RuntimeWarning):
+ result = left_index.join(right_index, how='outer')
+
+        # expected is built from right_index because DatetimeIndex has join
+        # precedence over Int64Index
+ with tm.assert_produces_warning(RuntimeWarning):
+ expected = right_index.astype(object).union(
+ left_index.astype(object))
+
+ tm.assert_index_equal(result, expected)
+
+ def test_nan_first_take_datetime(self):
+ index = Index([pd.NaT, Timestamp('20130101'), Timestamp('20130102')])
+ result = index.take([-1, 0, 1])
+ expected = Index([index[-1], index[0], index[1]])
+ tm.assert_index_equal(result, expected)
+
+ def test_take_fill_value(self):
+ # GH 12631
+ index = pd.Index(list('ABC'), name='xxx')
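+        # without a fill_value, -1 is an ordinary position (the last
+        # element); once fill_value is given and allow_fill is True, -1
+        # marks a missing slot to be filled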
+ result = index.take(np.array([1, 0, -1]))
+ expected = pd.Index(list('BAC'), name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ # fill_value
+ result = index.take(np.array([1, 0, -1]), fill_value=True)
+ expected = pd.Index(['B', 'A', np.nan], name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ # allow_fill=False
+ result = index.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ expected = pd.Index(['B', 'A', 'C'], name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ def test_take_fill_value_none_raises(self):
+ index = pd.Index(list('ABC'), name='xxx')
+ msg = ('When allow_fill=True and fill_value is not None, '
+ 'all indices must be >= -1')
+
+ with pytest.raises(ValueError, match=msg):
+ index.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ index.take(np.array([1, 0, -5]), fill_value=True)
+
+ def test_take_bad_bounds_raises(self):
+ index = pd.Index(list('ABC'), name='xxx')
+ with pytest.raises(IndexError, match='out of bounds'):
+ index.take(np.array([1, -5]))
+
+ @pytest.mark.parametrize("name", [None, 'foobar'])
+ @pytest.mark.parametrize("labels", [
+ [], np.array([]), ['A', 'B', 'C'], ['C', 'B', 'A'],
+ np.array(['A', 'B', 'C']), np.array(['C', 'B', 'A']),
+ # Must preserve name even if dtype changes
+ pd.date_range('20130101', periods=3).values,
+ pd.date_range('20130101', periods=3).tolist()])
+ def test_reindex_preserves_name_if_target_is_list_or_ndarray(self, name,
+ labels):
+ # GH6552
+ index = pd.Index([0, 1, 2])
+ index.name = name
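+        # reindex returns a (new_index, indexer) tuple, hence the [0]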
+ assert index.reindex(labels)[0].name == name
+
+ @pytest.mark.parametrize("labels", [
+ [], np.array([]), np.array([], dtype=np.int64)])
+ def test_reindex_preserves_type_if_target_is_empty_list_or_array(self,
+ labels):
+ # GH7774
+ index = pd.Index(list('abc'))
+ assert index.reindex(labels)[0].dtype.type == np.object_
+
+ @pytest.mark.parametrize("labels,dtype", [
+ (pd.Int64Index([]), np.int64),
+ (pd.Float64Index([]), np.float64),
+ (pd.DatetimeIndex([]), np.datetime64)])
+ def test_reindex_doesnt_preserve_type_if_target_is_empty_index(self,
+ labels,
+ dtype):
+ # GH7774
+ index = pd.Index(list('abc'))
+ assert index.reindex(labels)[0].dtype.type == dtype
+
+ def test_reindex_no_type_preserve_target_empty_mi(self):
+ index = pd.Index(list('abc'))
+ result = index.reindex(pd.MultiIndex(
+ [pd.Int64Index([]), pd.Float64Index([])], [[], []]))[0]
+ assert result.levels[0].dtype.type == np.int64
+ assert result.levels[1].dtype.type == np.float64
+
+ def test_groupby(self):
+ index = Index(range(5))
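+        # Index.groupby returns a plain dict mapping each grouping value to
+        # the index entries falling in that group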
+ result = index.groupby(np.array([1, 1, 2, 2, 2]))
+ expected = {1: pd.Index([0, 1]), 2: pd.Index([2, 3, 4])}
+
+ tm.assert_dict_equal(result, expected)
+
+ @pytest.mark.parametrize("mi,expected", [
+ (MultiIndex.from_tuples([(1, 2), (4, 5)]), np.array([True, True])),
+ (MultiIndex.from_tuples([(1, 2), (4, 6)]), np.array([True, False]))])
+ def test_equals_op_multiindex(self, mi, expected):
+ # GH9785
+ # test comparisons of multiindex
+ df = pd.read_csv(StringIO('a,b,c\n1,2,3\n4,5,6'), index_col=[0, 1])
+
+ result = df.index == mi
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_equals_op_multiindex_identify(self):
+ df = pd.read_csv(StringIO('a,b,c\n1,2,3\n4,5,6'), index_col=[0, 1])
+
+ result = df.index == df.index
+ expected = np.array([True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize("index", [
+ MultiIndex.from_tuples([(1, 2), (4, 5), (8, 9)]),
+ Index(['foo', 'bar', 'baz'])])
+ def test_equals_op_mismatched_multiindex_raises(self, index):
+ df = pd.read_csv(StringIO('a,b,c\n1,2,3\n4,5,6'), index_col=[0, 1])
+
+ with pytest.raises(ValueError, match="Lengths must match"):
+ df.index == index
+
+ def test_equals_op_index_vs_mi_same_length(self):
+ mi = MultiIndex.from_tuples([(1, 2), (4, 5), (8, 9)])
+ index = Index(['foo', 'bar', 'baz'])
+
+ result = mi == index
+ expected = np.array([False, False, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize("dt_conv", [
+ pd.to_datetime, pd.to_timedelta])
+ def test_dt_conversion_preserves_name(self, dt_conv):
+ # GH 10875
+ index = pd.Index(['01:02:03', '01:02:04'], name='label')
+ assert index.name == dt_conv(index).name
+
+ @pytest.mark.skipif(not PY3, reason="compat test")
+ @pytest.mark.parametrize("index,expected", [
+ # ASCII
+ # short
+ (pd.Index(['a', 'bb', 'ccc']),
+ u"""Index(['a', 'bb', 'ccc'], dtype='object')"""),
+ # multiple lines
+ (pd.Index(['a', 'bb', 'ccc'] * 10),
+ u"""\
+Index(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc',
+ 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc',
+ 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
+ dtype='object')"""),
+ # truncated
+ (pd.Index(['a', 'bb', 'ccc'] * 100),
+ u"""\
+Index(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
+ ...
+ 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
+ dtype='object', length=300)"""),
+
+ # Non-ASCII
+ # short
+ (pd.Index([u'あ', u'いい', u'ううう']),
+ u"""Index(['あ', 'いい', 'ううう'], dtype='object')"""),
+ # multiple lines
+ (pd.Index([u'あ', u'いい', u'ううう'] * 10),
+ (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', "
+ u"'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n"
+ u" 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', "
+ u"'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n"
+ u" 'あ', 'いい', 'ううう', 'あ', 'いい', "
+ u"'ううう'],\n"
+ u" dtype='object')")),
+ # truncated
+ (pd.Index([u'あ', u'いい', u'ううう'] * 100),
+ (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', "
+ u"'あ', 'いい', 'ううう', 'あ',\n"
+ u" ...\n"
+ u" 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', "
+ u"'ううう', 'あ', 'いい', 'ううう'],\n"
+ u" dtype='object', length=300)"))])
+ def test_string_index_repr(self, index, expected):
+ result = repr(index)
+ assert result == expected
+
+ @pytest.mark.skipif(PY3, reason="compat test")
+ @pytest.mark.parametrize("index,expected", [
+ # ASCII
+ # short
+ (pd.Index(['a', 'bb', 'ccc']),
+ u"""Index([u'a', u'bb', u'ccc'], dtype='object')"""),
+ # multiple lines
+ (pd.Index(['a', 'bb', 'ccc'] * 10),
+ u"""\
+Index([u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a',
+ u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb',
+ u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc'],
+ dtype='object')"""),
+ # truncated
+ (pd.Index(['a', 'bb', 'ccc'] * 100),
+ u"""\
+Index([u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a',
+ ...
+ u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc'],
+ dtype='object', length=300)"""),
+
+ # Non-ASCII
+ # short
+ (pd.Index([u'あ', u'いい', u'ううう']),
+ u"""Index([u'あ', u'いい', u'ううう'], dtype='object')"""),
+ # multiple lines
+ (pd.Index([u'あ', u'いい', u'ううう'] * 10),
+ (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', "
+ u"u'ううう', u'あ', u'いい', u'ううう', u'あ',\n"
+ u" u'いい', u'ううう', u'あ', u'いい', u'ううう', "
+ u"u'あ', u'いい', u'ううう', u'あ', u'いい',\n"
+ u" u'ううう', u'あ', u'いい', u'ううう', u'あ', "
+ u"u'いい', u'ううう', u'あ', u'いい', u'ううう'],\n"
+ u" dtype='object')")),
+ # truncated
+ (pd.Index([u'あ', u'いい', u'ううう'] * 100),
+ (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', "
+ u"u'ううう', u'あ', u'いい', u'ううう', u'あ',\n"
+ u" ...\n"
+ u" u'ううう', u'あ', u'いい', u'ううう', u'あ', "
+ u"u'いい', u'ううう', u'あ', u'いい', u'ううう'],\n"
+ u" dtype='object', length=300)"))])
+ def test_string_index_repr_compat(self, index, expected):
+ result = unicode(index) # noqa
+ assert result == expected
+
+ @pytest.mark.skipif(not PY3, reason="compat test")
+ @pytest.mark.parametrize("index,expected", [
+ # short
+ (pd.Index([u'あ', u'いい', u'ううう']),
+ (u"Index(['あ', 'いい', 'ううう'], "
+ u"dtype='object')")),
+ # multiple lines
+ (pd.Index([u'あ', u'いい', u'ううう'] * 10),
+ (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', "
+ u"'ううう', 'あ', 'いい', 'ううう',\n"
+ u" 'あ', 'いい', 'ううう', 'あ', 'いい', "
+ u"'ううう', 'あ', 'いい', 'ううう',\n"
+ u" 'あ', 'いい', 'ううう', 'あ', 'いい', "
+ u"'ううう', 'あ', 'いい', 'ううう',\n"
+ u" 'あ', 'いい', 'ううう'],\n"
+ u" dtype='object')""")),
+ # truncated
+ (pd.Index([u'あ', u'いい', u'ううう'] * 100),
+ (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', "
+ u"'ううう', 'あ', 'いい', 'ううう',\n"
+ u" 'あ',\n"
+ u" ...\n"
+ u" 'ううう', 'あ', 'いい', 'ううう', 'あ', "
+ u"'いい', 'ううう', 'あ', 'いい',\n"
+ u" 'ううう'],\n"
+ u" dtype='object', length=300)"))])
+ def test_string_index_repr_with_unicode_option(self, index, expected):
+ # Enable Unicode option -----------------------------------------
+ with cf.option_context('display.unicode.east_asian_width', True):
+ result = repr(index)
+ assert result == expected
+
+ @pytest.mark.skipif(PY3, reason="compat test")
+ @pytest.mark.parametrize("index,expected", [
+ # short
+ (pd.Index([u'あ', u'いい', u'ううう']),
+ (u"Index([u'あ', u'いい', u'ううう'], "
+ u"dtype='object')")),
+ # multiple lines
+ (pd.Index([u'あ', u'いい', u'ううう'] * 10),
+ (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', "
+ u"u'ううう', u'あ', u'いい',\n"
+ u" u'ううう', u'あ', u'いい', u'ううう', "
+ u"u'あ', u'いい', u'ううう', u'あ',\n"
+ u" u'いい', u'ううう', u'あ', u'いい', "
+ u"u'ううう', u'あ', u'いい',\n"
+ u" u'ううう', u'あ', u'いい', u'ううう', "
+ u"u'あ', u'いい', u'ううう'],\n"
+ u" dtype='object')")),
+ # truncated
+ (pd.Index([u'あ', u'いい', u'ううう'] * 100),
+ (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', "
+ u"u'ううう', u'あ', u'いい',\n"
+ u" u'ううう', u'あ',\n"
+ u" ...\n"
+ u" u'ううう', u'あ', u'いい', u'ううう', "
+ u"u'あ', u'いい', u'ううう', u'あ',\n"
+ u" u'いい', u'ううう'],\n"
+ u" dtype='object', length=300)"))])
+ def test_string_index_repr_with_unicode_option_compat(self, index,
+ expected):
+ # Enable Unicode option -----------------------------------------
+ with cf.option_context('display.unicode.east_asian_width', True):
+ result = unicode(index) # noqa
+ assert result == expected
+
+ def test_cached_properties_not_settable(self):
+ index = pd.Index([1, 2, 3])
+ with pytest.raises(AttributeError, match="Can't set attribute"):
+ index.is_unique = False
+
+ def test_get_duplicates_deprecated(self):
+ index = pd.Index([1, 2, 3])
+ with tm.assert_produces_warning(FutureWarning):
+ index.get_duplicates()
+
+ def test_tab_complete_warning(self, ip):
+ # https://github.com/pandas-dev/pandas/issues/16409
+ pytest.importorskip('IPython', minversion="6.0.0")
+ from IPython.core.completer import provisionalcompleter
+
+ code = "import pandas as pd; idx = pd.Index([1, 2])"
+ ip.run_code(code)
+ with tm.assert_produces_warning(None):
+ with provisionalcompleter('ignore'):
+ list(ip.Completer.completions('idx.', 4))
+
+
+class TestMixedIntIndex(Base):
+ # Mostly the tests from common.py for which the results differ
+    # in py2 and py3 because ints and strings are not comparable in py3
+ # (GH 13514)
+
+ _holder = Index
+
+ def setup_method(self, method):
+ self.indices = dict(mixedIndex=Index([0, 'a', 1, 'b', 2, 'c']))
+ self.setup_indices()
+
+ def create_index(self):
+ return self.mixedIndex
+
+ def test_argsort(self):
+ index = self.create_index()
+ if PY36:
+ with pytest.raises(TypeError, match="'>|<' not supported"):
+ result = index.argsort()
+ elif PY3:
+ with pytest.raises(TypeError, match="unorderable types"):
+ result = index.argsort()
+ else:
+ result = index.argsort()
+ expected = np.array(index).argsort()
+ tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
+ def test_numpy_argsort(self):
+ index = self.create_index()
+ if PY36:
+ with pytest.raises(TypeError, match="'>|<' not supported"):
+ result = np.argsort(index)
+ elif PY3:
+ with pytest.raises(TypeError, match="unorderable types"):
+ result = np.argsort(index)
+ else:
+ result = np.argsort(index)
+ expected = index.argsort()
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_copy_name(self):
+        # Check that the "name" argument passed at initialization is honored
+ # GH12309
+ index = self.create_index()
+
+ first = index.__class__(index, copy=True, name='mario')
+ second = first.__class__(first, copy=False)
+
+ # Even though "copy=False", we want a new object.
+ assert first is not second
+ tm.assert_index_equal(first, second)
+
+ assert first.name == 'mario'
+ assert second.name == 'mario'
+
+ s1 = Series(2, index=first)
+ s2 = Series(3, index=second[:-1])
+
+ s3 = s1 * s2
+
+ assert s3.index.name == 'mario'
+
+ def test_copy_name2(self):
+ # Check that adding a "name" parameter to the copy is honored
+ # GH14302
+ index = pd.Index([1, 2], name='MyName')
+ index1 = index.copy()
+
+ tm.assert_index_equal(index, index1)
+
+ index2 = index.copy(name='NewName')
+ tm.assert_index_equal(index, index2, check_names=False)
+ assert index.name == 'MyName'
+ assert index2.name == 'NewName'
+
+ index3 = index.copy(names=['NewName'])
+ tm.assert_index_equal(index, index3, check_names=False)
+ assert index.name == 'MyName'
+ assert index.names == ['MyName']
+ assert index3.name == 'NewName'
+ assert index3.names == ['NewName']
+
+ def test_union_base(self):
+ index = self.create_index()
+ first = index[3:]
+ second = index[:5]
+
+ result = first.union(second)
+
+ expected = Index([0, 1, 2, 'a', 'b', 'c'])
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("klass", [
+ np.array, Series, list])
+ def test_union_different_type_base(self, klass):
+ # GH 10149
+ index = self.create_index()
+ first = index[3:]
+ second = index[:5]
+
+ result = first.union(klass(second.values))
+
+ assert tm.equalContents(result, index)
+
+ def test_unique_na(self):
+ idx = pd.Index([2, np.nan, 2, 1], name='my_index')
+ expected = pd.Index([2, np.nan, 1], name='my_index')
+ result = idx.unique()
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection_base(self, sort):
+ # (same results for py2 and py3 but sortedness not tested elsewhere)
+ index = self.create_index()
+ first = index[:5]
+ second = index[:3]
+
+ expected = Index([0, 1, 'a']) if sort is None else Index([0, 'a', 1])
+ result = first.intersection(second, sort=sort)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("klass", [
+ np.array, Series, list])
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection_different_type_base(self, klass, sort):
+ # GH 10149
+ index = self.create_index()
+ first = index[:5]
+ second = index[:3]
+
+ result = first.intersection(klass(second.values), sort=sort)
+ assert tm.equalContents(result, second)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_base(self, sort):
+ # (same results for py2 and py3 but sortedness not tested elsewhere)
+ index = self.create_index()
+ first = index[:4]
+ second = index[3:]
+
+ result = first.difference(second, sort)
+ expected = Index([0, 'a', 1])
+ if sort is None:
+ expected = Index(safe_sort(expected))
+ tm.assert_index_equal(result, expected)
+
+ def test_symmetric_difference(self):
+ # (same results for py2 and py3 but sortedness not tested elsewhere)
+ index = self.create_index()
+ first = index[:4]
+ second = index[3:]
+
+ result = first.symmetric_difference(second)
+ expected = Index([0, 1, 2, 'a', 'c'])
+ tm.assert_index_equal(result, expected)
+
+ def test_logical_compat(self):
+ index = self.create_index()
+ assert index.all() == index.values.all()
+ assert index.any() == index.values.any()
+
+ @pytest.mark.parametrize("how", ['any', 'all'])
+ @pytest.mark.parametrize("dtype", [
+ None, object, 'category'])
+ @pytest.mark.parametrize("vals,expected", [
+ ([1, 2, 3], [1, 2, 3]), ([1., 2., 3.], [1., 2., 3.]),
+ ([1., 2., np.nan, 3.], [1., 2., 3.]),
+ (['A', 'B', 'C'], ['A', 'B', 'C']),
+ (['A', np.nan, 'B', 'C'], ['A', 'B', 'C'])])
+ def test_dropna(self, how, dtype, vals, expected):
+ # GH 6194
+ index = pd.Index(vals, dtype=dtype)
+ result = index.dropna(how=how)
+ expected = pd.Index(expected, dtype=dtype)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("how", ['any', 'all'])
+ @pytest.mark.parametrize("index,expected", [
+ (pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03']),
+ pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'])),
+ (pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', pd.NaT]),
+ pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'])),
+ (pd.TimedeltaIndex(['1 days', '2 days', '3 days']),
+ pd.TimedeltaIndex(['1 days', '2 days', '3 days'])),
+ (pd.TimedeltaIndex([pd.NaT, '1 days', '2 days', '3 days', pd.NaT]),
+ pd.TimedeltaIndex(['1 days', '2 days', '3 days'])),
+ (pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M'),
+ pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M')),
+ (pd.PeriodIndex(['2012-02', '2012-04', 'NaT', '2012-05'], freq='M'),
+ pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M'))])
+ def test_dropna_dt_like(self, how, index, expected):
+ result = index.dropna(how=how)
+ tm.assert_index_equal(result, expected)
+
+ def test_dropna_invalid_how_raises(self):
+ msg = "invalid how option: xxx"
+ with pytest.raises(ValueError, match=msg):
+ pd.Index([1, 2, 3]).dropna(how='xxx')
+
+ def test_get_combined_index(self):
+ result = _get_combined_index([])
+ expected = Index([])
+ tm.assert_index_equal(result, expected)
+
+ def test_repeat(self):
+ repeats = 2
+ index = pd.Index([1, 2, 3])
+ expected = pd.Index([1, 1, 2, 2, 3, 3])
+
+ result = index.repeat(repeats)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize("index", [
+ pd.Index([np.nan]), pd.Index([np.nan, 1]),
+ pd.Index([1, 2, np.nan]), pd.Index(['a', 'b', np.nan]),
+ pd.to_datetime(['NaT']), pd.to_datetime(['NaT', '2000-01-01']),
+ pd.to_datetime(['2000-01-01', 'NaT', '2000-01-02']),
+ pd.to_timedelta(['1 day', 'NaT'])])
+ def test_is_monotonic_na(self, index):
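+        # NaN/NaT compare False against everything, themselves included, so
+        # a single missing value defeats all four monotonicity properties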
+ assert index.is_monotonic_increasing is False
+ assert index.is_monotonic_decreasing is False
+ assert index._is_strictly_monotonic_increasing is False
+ assert index._is_strictly_monotonic_decreasing is False
+
+ def test_repr_summary(self):
+ with cf.option_context('display.max_seq_items', 10):
+ result = repr(pd.Index(np.arange(1000)))
+ assert len(result) < 200
+ assert "..." in result
+
+ @pytest.mark.parametrize("klass", [Series, DataFrame])
+ def test_int_name_format(self, klass):
+ index = Index(['a', 'b', 'c'], name=0)
+ result = klass(lrange(3), index=index)
+ assert '0' in repr(result)
+
+ def test_print_unicode_columns(self):
+ df = pd.DataFrame({u("\u05d0"): [1, 2, 3],
+ "\u05d1": [4, 5, 6],
+ "c": [7, 8, 9]})
+ repr(df.columns) # should not raise UnicodeDecodeError
+
+ @pytest.mark.parametrize("func,compat_func", [
+ (str, text_type), # unicode string
+ (bytes, str) # byte string
+ ])
+ def test_with_unicode(self, func, compat_func):
+ index = Index(lrange(1000))
+
+ if PY3:
+ func(index)
+ else:
+ compat_func(index)
+
+ def test_intersect_str_dates(self):
+ dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
+
+ index1 = Index(dt_dates, dtype=object)
+ index2 = Index(['aa'], dtype=object)
+ result = index2.intersection(index1)
+
+ expected = Index([], dtype=object)
+ tm.assert_index_equal(result, expected)
+
+
+class TestIndexUtils(object):
+
+ @pytest.mark.parametrize('data, names, expected', [
+ ([[1, 2, 3]], None, Index([1, 2, 3])),
+ ([[1, 2, 3]], ['name'], Index([1, 2, 3], name='name')),
+ ([['a', 'a'], ['c', 'd']], None,
+ MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]])),
+ ([['a', 'a'], ['c', 'd']], ['L1', 'L2'],
+ MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]],
+ names=['L1', 'L2'])),
+ ])
+ def test_ensure_index_from_sequences(self, data, names, expected):
+ result = ensure_index_from_sequences(data, names)
+ tm.assert_index_equal(result, expected)
+
+
[email protected]('opname', ['eq', 'ne', 'le', 'lt', 'ge', 'gt',
+ 'add', 'radd', 'sub', 'rsub',
+ 'mul', 'rmul', 'truediv', 'rtruediv',
+ 'floordiv', 'rfloordiv',
+ 'pow', 'rpow', 'mod', 'divmod'])
+def test_generated_op_names(opname, indices):
+ index = indices
+ if isinstance(index, ABCIndex) and opname == 'rsub':
+        # pd.Index.__rsub__ does not exist, though the method does exist
+        # for subclasses; see GH#19723
+ return
+ opname = '__{name}__'.format(name=opname)
+ method = getattr(index, opname)
+ assert method.__name__ == opname
+
+
[email protected]('index_maker', tm.index_subclass_makers_generator())
+def test_index_subclass_constructor_wrong_kwargs(index_maker):
+ # GH #19348
+ with pytest.raises(TypeError, match='unexpected keyword argument'):
+ index_maker(foo='bar')
+
+
+def test_deprecated_fastpath():
+
+ with tm.assert_produces_warning(FutureWarning):
+ idx = pd.Index(
+ np.array(['a', 'b'], dtype=object), name='test', fastpath=True)
+
+ expected = pd.Index(['a', 'b'], name='test')
+ tm.assert_index_equal(idx, expected)
+
+ with tm.assert_produces_warning(FutureWarning):
+ idx = pd.Int64Index(
+ np.array([1, 2, 3], dtype='int64'), name='test', fastpath=True)
+
+ expected = pd.Index([1, 2, 3], name='test', dtype='int64')
+ tm.assert_index_equal(idx, expected)
+
+ with tm.assert_produces_warning(FutureWarning):
+ idx = pd.RangeIndex(0, 5, 2, name='test', fastpath=True)
+
+ expected = pd.RangeIndex(0, 5, 2, name='test')
+ tm.assert_index_equal(idx, expected)
+
+ with tm.assert_produces_warning(FutureWarning):
+ idx = pd.CategoricalIndex(['a', 'b', 'c'], name='test', fastpath=True)
+
+ expected = pd.CategoricalIndex(['a', 'b', 'c'], name='test')
+ tm.assert_index_equal(idx, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/test_category.py b/contrib/python/pandas/py2/pandas/tests/indexes/test_category.py
new file mode 100644
index 00000000000..d889135160a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/test_category.py
@@ -0,0 +1,1161 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas._libs import index as libindex
+from pandas.compat import PY3, range
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import Categorical, IntervalIndex, compat
+import pandas.core.config as cf
+from pandas.core.indexes.api import CategoricalIndex, Index
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal
+
+from .common import Base
+
+if PY3:
+ unicode = lambda x: x
+
+
+class TestCategoricalIndex(Base):
+ _holder = CategoricalIndex
+
+ def setup_method(self, method):
+ self.indices = dict(catIndex=tm.makeCategoricalIndex(100))
+ self.setup_indices()
+
+ def create_index(self, categories=None, ordered=False):
+ if categories is None:
+ categories = list('cab')
+ return CategoricalIndex(
+ list('aabbca'), categories=categories, ordered=ordered)
+
+ def test_can_hold_identifiers(self):
+ idx = self.create_index(categories=list('abcd'))
+ key = idx[0]
+ assert idx._can_hold_identifiers_and_holds_name(key) is True
+
+ def test_construction(self):
+
+ ci = self.create_index(categories=list('abcd'))
+ categories = ci.categories
+
+ result = Index(ci)
+ tm.assert_index_equal(result, ci, exact=True)
+ assert not result.ordered
+
+ result = Index(ci.values)
+ tm.assert_index_equal(result, ci, exact=True)
+ assert not result.ordered
+
+ # empty
+ result = CategoricalIndex(categories=categories)
+ tm.assert_index_equal(result.categories, Index(categories))
+ tm.assert_numpy_array_equal(result.codes, np.array([], dtype='int8'))
+ assert not result.ordered
+
+ # passing categories
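+        # codes are int8 positions into the categories ('a' -> 0, 'b' -> 1,
+        # 'c' -> 2)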
+ result = CategoricalIndex(list('aabbca'), categories=categories)
+ tm.assert_index_equal(result.categories, Index(categories))
+ tm.assert_numpy_array_equal(result.codes,
+ np.array([0, 0, 1,
+ 1, 2, 0], dtype='int8'))
+
+ c = pd.Categorical(list('aabbca'))
+ result = CategoricalIndex(c)
+ tm.assert_index_equal(result.categories, Index(list('abc')))
+ tm.assert_numpy_array_equal(result.codes,
+ np.array([0, 0, 1,
+ 1, 2, 0], dtype='int8'))
+ assert not result.ordered
+
+ result = CategoricalIndex(c, categories=categories)
+ tm.assert_index_equal(result.categories, Index(categories))
+ tm.assert_numpy_array_equal(result.codes,
+ np.array([0, 0, 1,
+ 1, 2, 0], dtype='int8'))
+ assert not result.ordered
+
+ ci = CategoricalIndex(c, categories=list('abcd'))
+ result = CategoricalIndex(ci)
+ tm.assert_index_equal(result.categories, Index(categories))
+ tm.assert_numpy_array_equal(result.codes,
+ np.array([0, 0, 1,
+ 1, 2, 0], dtype='int8'))
+ assert not result.ordered
+
+ result = CategoricalIndex(ci, categories=list('ab'))
+ tm.assert_index_equal(result.categories, Index(list('ab')))
+ tm.assert_numpy_array_equal(result.codes,
+ np.array([0, 0, 1,
+ 1, -1, 0], dtype='int8'))
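+        # 'c' is not among the restricted categories, so its code is -1
+        # (missing)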
+ assert not result.ordered
+
+ result = CategoricalIndex(ci, categories=list('ab'), ordered=True)
+ tm.assert_index_equal(result.categories, Index(list('ab')))
+ tm.assert_numpy_array_equal(result.codes,
+ np.array([0, 0, 1,
+ 1, -1, 0], dtype='int8'))
+ assert result.ordered
+
+ result = pd.CategoricalIndex(ci, categories=list('ab'), ordered=True)
+ expected = pd.CategoricalIndex(ci, categories=list('ab'), ordered=True,
+ dtype='category')
+ tm.assert_index_equal(result, expected, exact=True)
+
+        # round-tripping through np.array yields a plain Index
+ result = Index(np.array(ci))
+ assert isinstance(result, Index)
+ assert not isinstance(result, CategoricalIndex)
+
+ def test_construction_with_dtype(self):
+
+ # specify dtype
+ ci = self.create_index(categories=list('abc'))
+
+ result = Index(np.array(ci), dtype='category')
+ tm.assert_index_equal(result, ci, exact=True)
+
+ result = Index(np.array(ci).tolist(), dtype='category')
+ tm.assert_index_equal(result, ci, exact=True)
+
+ # these are generally only equal when the categories are reordered
+ ci = self.create_index()
+
+ result = Index(
+ np.array(ci), dtype='category').reorder_categories(ci.categories)
+ tm.assert_index_equal(result, ci, exact=True)
+
+ # make sure indexes are handled
+ expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2],
+ ordered=True)
+ idx = Index(range(3))
+ result = CategoricalIndex(idx, categories=idx, ordered=True)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ def test_construction_empty_with_bool_categories(self):
+ # see gh-22702
+ cat = pd.CategoricalIndex([], categories=[True, False])
+ categories = sorted(cat.categories.tolist())
+ assert categories == [False, True]
+
+ def test_construction_with_categorical_dtype(self):
+ # construction with CategoricalDtype
+ # GH18109
+ data, cats, ordered = 'a a b b'.split(), 'c b a'.split(), True
+ dtype = CategoricalDtype(categories=cats, ordered=ordered)
+
+ result = CategoricalIndex(data, dtype=dtype)
+ expected = CategoricalIndex(data, categories=cats, ordered=ordered)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ # GH 19032
+ result = Index(data, dtype=dtype)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ # error when combining categories/ordered and dtype kwargs
+ msg = "Cannot specify `categories` or `ordered` together with `dtype`."
+ with pytest.raises(ValueError, match=msg):
+ CategoricalIndex(data, categories=cats, dtype=dtype)
+
+ with pytest.raises(ValueError, match=msg):
+ Index(data, categories=cats, dtype=dtype)
+
+ with pytest.raises(ValueError, match=msg):
+ CategoricalIndex(data, ordered=ordered, dtype=dtype)
+
+ with pytest.raises(ValueError, match=msg):
+ Index(data, ordered=ordered, dtype=dtype)
+
+ def test_create_categorical(self):
+ # https://github.com/pandas-dev/pandas/pull/17513
+ # The public CI constructor doesn't hit this code path with
+ # instances of CategoricalIndex, but we still want to test the code
+ ci = CategoricalIndex(['a', 'b', 'c'])
+ # First ci is self, second ci is data.
+ result = CategoricalIndex._create_categorical(ci, ci)
+ expected = Categorical(['a', 'b', 'c'])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_disallow_set_ops(self):
+
+ # GH 10039
+ # set ops (+/-) raise TypeError
+ idx = pd.Index(pd.Categorical(['a', 'b']))
+
+ pytest.raises(TypeError, lambda: idx - idx)
+ pytest.raises(TypeError, lambda: idx + idx)
+ pytest.raises(TypeError, lambda: idx - ['a', 'b'])
+ pytest.raises(TypeError, lambda: idx + ['a', 'b'])
+ pytest.raises(TypeError, lambda: ['a', 'b'] - idx)
+ pytest.raises(TypeError, lambda: ['a', 'b'] + idx)
+
+ def test_method_delegation(self):
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cabdef'))
+ result = ci.set_categories(list('cab'))
+ tm.assert_index_equal(result, CategoricalIndex(
+ list('aabbca'), categories=list('cab')))
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
+ result = ci.rename_categories(list('efg'))
+ tm.assert_index_equal(result, CategoricalIndex(
+ list('ffggef'), categories=list('efg')))
+
+ # GH18862 (let rename_categories take callables)
+ result = ci.rename_categories(lambda x: x.upper())
+ tm.assert_index_equal(result, CategoricalIndex(
+ list('AABBCA'), categories=list('CAB')))
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
+ result = ci.add_categories(['d'])
+ tm.assert_index_equal(result, CategoricalIndex(
+ list('aabbca'), categories=list('cabd')))
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
+ result = ci.remove_categories(['c'])
+ tm.assert_index_equal(result, CategoricalIndex(
+ list('aabb') + [np.nan] + ['a'], categories=list('ab')))
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cabdef'))
+ result = ci.as_unordered()
+ tm.assert_index_equal(result, ci)
+
+ ci = CategoricalIndex(list('aabbca'), categories=list('cabdef'))
+ result = ci.as_ordered()
+ tm.assert_index_equal(result, CategoricalIndex(
+ list('aabbca'), categories=list('cabdef'), ordered=True))
+
+ # invalid
+ pytest.raises(ValueError, lambda: ci.set_categories(
+ list('cab'), inplace=True))
+
+ def test_contains(self):
+
+ ci = self.create_index(categories=list('cabdef'))
+
+ assert 'a' in ci
+ assert 'z' not in ci
+ assert 'e' not in ci
+ assert np.nan not in ci
+
+ # assert codes NOT in index
+ assert 0 not in ci
+ assert 1 not in ci
+
+ ci = CategoricalIndex(
+ list('aabbca') + [np.nan], categories=list('cabdef'))
+ assert np.nan in ci
+
+ def test_map(self):
+ ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'),
+ ordered=True)
+ result = ci.map(lambda x: x.lower())
+ exp = pd.CategoricalIndex(list('ababc'), categories=list('cba'),
+ ordered=True)
+ tm.assert_index_equal(result, exp)
+
+ ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
+ ordered=False, name='XXX')
+ result = ci.map(lambda x: x.lower())
+ exp = pd.CategoricalIndex(list('ababc'), categories=list('bac'),
+ ordered=False, name='XXX')
+ tm.assert_index_equal(result, exp)
+
+ # GH 12766: Return an index not an array
+ tm.assert_index_equal(ci.map(lambda x: 1),
+ Index(np.array([1] * 5, dtype=np.int64),
+ name='XXX'))
+
+ # change categories dtype
+ ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
+ ordered=False)
+
+ def f(x):
+ return {'A': 10, 'B': 20, 'C': 30}.get(x)
+
+ result = ci.map(f)
+ exp = pd.CategoricalIndex([10, 20, 10, 20, 30],
+ categories=[20, 10, 30],
+ ordered=False)
+ tm.assert_index_equal(result, exp)
+
+ result = ci.map(pd.Series([10, 20, 30], index=['A', 'B', 'C']))
+ tm.assert_index_equal(result, exp)
+
+ result = ci.map({'A': 10, 'B': 20, 'C': 30})
+ tm.assert_index_equal(result, exp)
+
+ def test_map_with_categorical_series(self):
+ # GH 12756
+ a = pd.Index([1, 2, 3, 4])
+ b = pd.Series(["even", "odd", "even", "odd"],
+ dtype="category")
+ c = pd.Series(["even", "odd", "even", "odd"])
+
+ exp = CategoricalIndex(["odd", "even", "odd", np.nan])
+ tm.assert_index_equal(a.map(b), exp)
+ exp = pd.Index(["odd", "even", "odd", np.nan])
+ tm.assert_index_equal(a.map(c), exp)
+
+    @pytest.mark.parametrize('data, f', [
+        ([1, 1, np.nan], pd.isna),
+        ([1, 2, np.nan], pd.isna),
+        ([1, 1, np.nan], {1: False}),
+        ([1, 2, np.nan], {1: False, 2: False}),
+        ([1, 1, np.nan], pd.Series([False, False])),
+        ([1, 2, np.nan], pd.Series([False, False, False]))
+    ])
+ def test_map_with_nan(self, data, f): # GH 24241
+ values = pd.Categorical(data)
+ result = values.map(f)
+ if data[1] == 1:
+ expected = pd.Categorical([False, False, np.nan])
+ tm.assert_categorical_equal(result, expected)
+ else:
+ expected = pd.Index([False, False, np.nan])
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series])
+ def test_where(self, klass):
+ i = self.create_index()
+ cond = [True] * len(i)
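+        # where() keeps entries where cond is True and writes NaN elsewhere,
+        # preserving the existing categories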
+ expected = i
+ result = i.where(klass(cond))
+ tm.assert_index_equal(result, expected)
+
+ cond = [False] + [True] * (len(i) - 1)
+ expected = CategoricalIndex([np.nan] + i[1:].tolist(),
+ categories=i.categories)
+ result = i.where(klass(cond))
+ tm.assert_index_equal(result, expected)
+
+ def test_append(self):
+
+ ci = self.create_index()
+ categories = ci.categories
+
+ # append cats with the same categories
+ result = ci[:3].append(ci[3:])
+ tm.assert_index_equal(result, ci, exact=True)
+
+ foos = [ci[:1], ci[1:3], ci[3:]]
+ result = foos[0].append(foos[1:])
+ tm.assert_index_equal(result, ci, exact=True)
+
+ # empty
+ result = ci.append([])
+ tm.assert_index_equal(result, ci, exact=True)
+
+ # appending with different categories or reordered is not ok
+ pytest.raises(
+ TypeError,
+ lambda: ci.append(ci.values.set_categories(list('abcd'))))
+ pytest.raises(
+ TypeError,
+ lambda: ci.append(ci.values.reorder_categories(list('abc'))))
+
+ # with objects
+ result = ci.append(Index(['c', 'a']))
+ expected = CategoricalIndex(list('aabbcaca'), categories=categories)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ # invalid objects
+ pytest.raises(TypeError, lambda: ci.append(Index(['a', 'd'])))
+
+ # GH14298 - if base object is not categorical -> coerce to object
+ result = Index(['c', 'a']).append(ci)
+ expected = Index(list('caaabbca'))
+ tm.assert_index_equal(result, expected, exact=True)
+
+ def test_append_to_another(self):
+ # hits _concat_index_asobject
+ fst = Index(['a', 'b'])
+ snd = CategoricalIndex(['d', 'e'])
+ result = fst.append(snd)
+ expected = Index(['a', 'b', 'd', 'e'])
+ tm.assert_index_equal(result, expected)
+
+ def test_insert(self):
+
+ ci = self.create_index()
+ categories = ci.categories
+
+ # test 0th element
+ result = ci.insert(0, 'a')
+ expected = CategoricalIndex(list('aaabbca'), categories=categories)
+ tm.assert_index_equal(result, expected, exact=True)
+
+        # test insertion at -1, which follows Python list behavior
+ result = ci.insert(-1, 'a')
+ expected = CategoricalIndex(list('aabbcaa'), categories=categories)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ # test empty
+ result = CategoricalIndex(categories=categories).insert(0, 'a')
+ expected = CategoricalIndex(['a'], categories=categories)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ # invalid
+ pytest.raises(TypeError, lambda: ci.insert(0, 'd'))
+
+ # GH 18295 (test missing)
+ expected = CategoricalIndex(['a', np.nan, 'a', 'b', 'c', 'b'])
+ for na in (np.nan, pd.NaT, None):
+ result = CategoricalIndex(list('aabcb')).insert(1, na)
+ tm.assert_index_equal(result, expected)
+
+ def test_delete(self):
+
+ ci = self.create_index()
+ categories = ci.categories
+
+ result = ci.delete(0)
+ expected = CategoricalIndex(list('abbca'), categories=categories)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ result = ci.delete(-1)
+ expected = CategoricalIndex(list('aabbc'), categories=categories)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ with pytest.raises((IndexError, ValueError)):
+ # Either depending on NumPy version
+ ci.delete(10)
+
+ def test_astype(self):
+
+ ci = self.create_index()
+ result = ci.astype(object)
+ tm.assert_index_equal(result, Index(np.array(ci)))
+
+ # this IS equal, but not the same class
+ assert result.equals(ci)
+ assert isinstance(result, Index)
+ assert not isinstance(result, CategoricalIndex)
+
+ # interval
+ ii = IntervalIndex.from_arrays(left=[-0.001, 2.0],
+ right=[2, 4],
+ closed='right')
+
+ ci = CategoricalIndex(Categorical.from_codes(
+ [0, 1, -1], categories=ii, ordered=True))
+
+ result = ci.astype('interval')
+ expected = ii.take([0, 1, -1])
+ tm.assert_index_equal(result, expected)
+
+ result = IntervalIndex(result.values)
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('name', [None, 'foo'])
+ @pytest.mark.parametrize('dtype_ordered', [True, False])
+ @pytest.mark.parametrize('index_ordered', [True, False])
+ def test_astype_category(self, name, dtype_ordered, index_ordered):
+ # GH 18630
+ index = self.create_index(ordered=index_ordered)
+ if name:
+ index = index.rename(name)
+
+ # standard categories
+ dtype = CategoricalDtype(ordered=dtype_ordered)
+ result = index.astype(dtype)
+ expected = CategoricalIndex(index.tolist(),
+ name=name,
+ categories=index.categories,
+ ordered=dtype_ordered)
+ tm.assert_index_equal(result, expected)
+
+ # non-standard categories
+ dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered)
+ result = index.astype(dtype)
+ expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype)
+ tm.assert_index_equal(result, expected)
+
+ if dtype_ordered is False:
+ # dtype='category' can't specify ordered, so only test once
+ result = index.astype('category')
+ expected = index
+ tm.assert_index_equal(result, expected)
+
+ def test_reindex_base(self):
+ # Determined by cat ordering.
+ idx = CategoricalIndex(list("cab"), categories=list("cab"))
+ expected = np.arange(len(idx), dtype=np.intp)
+
+ actual = idx.get_indexer(idx)
+ tm.assert_numpy_array_equal(expected, actual)
+
+ with pytest.raises(ValueError, match="Invalid fill method"):
+ idx.get_indexer(idx, method="invalid")
+
+ def test_reindexing(self):
+ np.random.seed(123456789)
+
+ ci = self.create_index()
+ oidx = Index(np.array(ci))
+
+ for n in [1, 2, 5, len(ci)]:
+ finder = oidx[np.random.randint(0, len(ci), size=n)]
+ expected = oidx.get_indexer_non_unique(finder)[0]
+
+ actual = ci.get_indexer(finder)
+ tm.assert_numpy_array_equal(expected, actual)
+
+ # see gh-17323
+ #
+ # Even when indexer is equal to the
+ # members in the index, we should
+ # respect duplicates instead of taking
+ # the fast-track path.
+ for finder in [list("aabbca"), list("aababca")]:
+ expected = oidx.get_indexer_non_unique(finder)[0]
+
+ actual = ci.get_indexer(finder)
+ tm.assert_numpy_array_equal(expected, actual)
+
+ def test_reindex_dtype(self):
+ c = CategoricalIndex(['a', 'b', 'c', 'a'])
+ res, indexer = c.reindex(['a', 'c'])
+ tm.assert_index_equal(res, Index(['a', 'a', 'c']), exact=True)
+ tm.assert_numpy_array_equal(indexer,
+ np.array([0, 3, 2], dtype=np.intp))
+
+ c = CategoricalIndex(['a', 'b', 'c', 'a'])
+ res, indexer = c.reindex(Categorical(['a', 'c']))
+
+ exp = CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c'])
+ tm.assert_index_equal(res, exp, exact=True)
+ tm.assert_numpy_array_equal(indexer,
+ np.array([0, 3, 2], dtype=np.intp))
+
+ c = CategoricalIndex(['a', 'b', 'c', 'a'],
+ categories=['a', 'b', 'c', 'd'])
+ res, indexer = c.reindex(['a', 'c'])
+ exp = Index(['a', 'a', 'c'], dtype='object')
+ tm.assert_index_equal(res, exp, exact=True)
+ tm.assert_numpy_array_equal(indexer,
+ np.array([0, 3, 2], dtype=np.intp))
+
+ c = CategoricalIndex(['a', 'b', 'c', 'a'],
+ categories=['a', 'b', 'c', 'd'])
+ res, indexer = c.reindex(Categorical(['a', 'c']))
+ exp = CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c'])
+ tm.assert_index_equal(res, exp, exact=True)
+ tm.assert_numpy_array_equal(indexer,
+ np.array([0, 3, 2], dtype=np.intp))
+
+ def test_reindex_duplicate_target(self):
+ # See GH23963
+ c = CategoricalIndex(['a', 'b', 'c', 'a'],
+ categories=['a', 'b', 'c', 'd'])
+ with pytest.raises(ValueError, match='non-unique indexer'):
+ c.reindex(['a', 'a', 'c'])
+
+ with pytest.raises(ValueError, match='non-unique indexer'):
+ c.reindex(CategoricalIndex(['a', 'a', 'c'],
+ categories=['a', 'b', 'c', 'd']))
+
+ def test_reindex_empty_index(self):
+ # See GH16770
+ c = CategoricalIndex([])
+ res, indexer = c.reindex(['a', 'b'])
+ tm.assert_index_equal(res, Index(['a', 'b']), exact=True)
+ tm.assert_numpy_array_equal(indexer,
+ np.array([-1, -1], dtype=np.intp))
+
+ @pytest.mark.parametrize('data, non_lexsorted_data', [
+ [[1, 2, 3], [9, 0, 1, 2, 3]],
+ [list('abc'), list('fabcd')],
+ ])
+ def test_is_monotonic(self, data, non_lexsorted_data):
+ c = CategoricalIndex(data)
+ assert c.is_monotonic_increasing is True
+ assert c.is_monotonic_decreasing is False
+
+ c = CategoricalIndex(data, ordered=True)
+ assert c.is_monotonic_increasing is True
+ assert c.is_monotonic_decreasing is False
+
+ c = CategoricalIndex(data, categories=reversed(data))
+ assert c.is_monotonic_increasing is False
+ assert c.is_monotonic_decreasing is True
+
+ c = CategoricalIndex(data, categories=reversed(data), ordered=True)
+ assert c.is_monotonic_increasing is False
+ assert c.is_monotonic_decreasing is True
+
+ # test when data is neither monotonic increasing nor decreasing
+ reordered_data = [data[0], data[2], data[1]]
+ c = CategoricalIndex(reordered_data, categories=reversed(data))
+ assert c.is_monotonic_increasing is False
+ assert c.is_monotonic_decreasing is False
+
+ # non lexsorted categories
+ categories = non_lexsorted_data
+
+ c = CategoricalIndex(categories[:2], categories=categories)
+ assert c.is_monotonic_increasing is True
+ assert c.is_monotonic_decreasing is False
+
+ c = CategoricalIndex(categories[1:3], categories=categories)
+ assert c.is_monotonic_increasing is True
+ assert c.is_monotonic_decreasing is False
+
+ def test_has_duplicates(self):
+
+ idx = CategoricalIndex([0, 0, 0], name='foo')
+ assert idx.is_unique is False
+ assert idx.has_duplicates is True
+
+ def test_drop_duplicates(self):
+
+ idx = CategoricalIndex([0, 0, 0], name='foo')
+ expected = CategoricalIndex([0], name='foo')
+ tm.assert_index_equal(idx.drop_duplicates(), expected)
+ tm.assert_index_equal(idx.unique(), expected)
+
+ def test_get_indexer(self):
+
+ idx1 = CategoricalIndex(list('aabcde'), categories=list('edabc'))
+ idx2 = CategoricalIndex(list('abf'))
+
+ for indexer in [idx2, list('abf'), Index(list('abf'))]:
+            r1 = idx1.get_indexer(indexer)
+ assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp))
+
+ pytest.raises(NotImplementedError,
+ lambda: idx2.get_indexer(idx1, method='pad'))
+ pytest.raises(NotImplementedError,
+ lambda: idx2.get_indexer(idx1, method='backfill'))
+ pytest.raises(NotImplementedError,
+ lambda: idx2.get_indexer(idx1, method='nearest'))
+
+ def test_get_loc(self):
+ # GH 12531
+ cidx1 = CategoricalIndex(list('abcde'), categories=list('edabc'))
+ idx1 = Index(list('abcde'))
+ assert cidx1.get_loc('a') == idx1.get_loc('a')
+ assert cidx1.get_loc('e') == idx1.get_loc('e')
+
+ for i in [cidx1, idx1]:
+ with pytest.raises(KeyError):
+ i.get_loc('NOT-EXIST')
+
+ # non-unique
+ cidx2 = CategoricalIndex(list('aacded'), categories=list('edabc'))
+ idx2 = Index(list('aacded'))
+
+ # results in bool array
+ res = cidx2.get_loc('d')
+ tm.assert_numpy_array_equal(res, idx2.get_loc('d'))
+ tm.assert_numpy_array_equal(res, np.array([False, False, False,
+ True, False, True]))
+ # unique element results in scalar
+ res = cidx2.get_loc('e')
+ assert res == idx2.get_loc('e')
+ assert res == 4
+
+ for i in [cidx2, idx2]:
+ with pytest.raises(KeyError):
+ i.get_loc('NOT-EXIST')
+
+        # non-unique, sliceable
+ cidx3 = CategoricalIndex(list('aabbb'), categories=list('abc'))
+ idx3 = Index(list('aabbb'))
+
+ # results in slice
+ res = cidx3.get_loc('a')
+ assert res == idx3.get_loc('a')
+ assert res == slice(0, 2, None)
+
+ res = cidx3.get_loc('b')
+ assert res == idx3.get_loc('b')
+ assert res == slice(2, 5, None)
+
+ for i in [cidx3, idx3]:
+ with pytest.raises(KeyError):
+ i.get_loc('c')
+
+ def test_repr_roundtrip(self):
+
+ ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
+ str(ci)
+ tm.assert_index_equal(eval(repr(ci)), ci, exact=True)
+
+ # formatting
+ if PY3:
+ str(ci)
+ else:
+ compat.text_type(ci)
+
+        # long format
+        # too long to round-trip through eval(repr(...))
+ ci = CategoricalIndex(np.random.randint(0, 5, size=100))
+ if PY3:
+ str(ci)
+ else:
+ compat.text_type(ci)
+
+ def test_isin(self):
+
+ ci = CategoricalIndex(
+ list('aabca') + [np.nan], categories=['c', 'a', 'b'])
+ tm.assert_numpy_array_equal(
+ ci.isin(['c']),
+ np.array([False, False, False, True, False, False]))
+ tm.assert_numpy_array_equal(
+ ci.isin(['c', 'a', 'b']), np.array([True] * 5 + [False]))
+ tm.assert_numpy_array_equal(
+ ci.isin(['c', 'a', 'b', np.nan]), np.array([True] * 6))
+
+ # mismatched categorical -> coerced to ndarray so doesn't matter
+ result = ci.isin(ci.set_categories(list('abcdefghi')))
+ expected = np.array([True] * 6)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = ci.isin(ci.set_categories(list('defghi')))
+ expected = np.array([False] * 5 + [True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_identical(self):
+
+ ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
+ ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'],
+ ordered=True)
+ assert ci1.identical(ci1)
+ assert ci1.identical(ci1.copy())
+ assert not ci1.identical(ci2)
+
+ def test_ensure_copied_data(self):
+ # gh-12309: Check the "copy" argument of each
+ # Index.__new__ is honored.
+ #
+ # Must be tested separately from other indexes because
+ # self.value is not an ndarray.
+ _base = lambda ar: ar if ar.base is None else ar.base
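+        # _base resolves an ndarray to the object owning its memory
+        # (the array itself when .base is None): copy=True must yield
+        # a different owner, copy=False must share it.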
+
+ for index in self.indices.values():
+ result = CategoricalIndex(index.values, copy=True)
+ tm.assert_index_equal(index, result)
+ assert _base(index.values) is not _base(result.values)
+
+ result = CategoricalIndex(index.values, copy=False)
+ assert _base(index.values) is _base(result.values)
+
+ def test_equals_categorical(self):
+ ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
+ ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'],
+ ordered=True)
+
+ assert ci1.equals(ci1)
+ assert not ci1.equals(ci2)
+ assert ci1.equals(ci1.astype(object))
+ assert ci1.astype(object).equals(ci1)
+
+ assert (ci1 == ci1).all()
+ assert not (ci1 != ci1).all()
+ assert not (ci1 > ci1).all()
+ assert not (ci1 < ci1).all()
+ assert (ci1 <= ci1).all()
+ assert (ci1 >= ci1).all()
+
+ assert not (ci1 == 1).all()
+ assert (ci1 == Index(['a', 'b'])).all()
+ assert (ci1 == ci1.values).all()
+
+ # invalid comparisons
+ with pytest.raises(ValueError, match="Lengths must match"):
+ ci1 == Index(['a', 'b', 'c'])
+ pytest.raises(TypeError, lambda: ci1 == ci2)
+ pytest.raises(
+ TypeError, lambda: ci1 == Categorical(ci1.values, ordered=False))
+ pytest.raises(
+ TypeError,
+ lambda: ci1 == Categorical(ci1.values, categories=list('abc')))
+
+        # make sure that we are testing for category inclusion properly
+ ci = CategoricalIndex(list('aabca'), categories=['c', 'a', 'b'])
+ assert not ci.equals(list('aabca'))
+ # Same categories, but different order
+ # Unordered
+ assert ci.equals(CategoricalIndex(list('aabca')))
+ # Ordered
+ assert not ci.equals(CategoricalIndex(list('aabca'), ordered=True))
+ assert ci.equals(ci.copy())
+
+ ci = CategoricalIndex(list('aabca') + [np.nan],
+ categories=['c', 'a', 'b'])
+ assert not ci.equals(list('aabca'))
+ assert not ci.equals(CategoricalIndex(list('aabca')))
+ assert ci.equals(ci.copy())
+
+ ci = CategoricalIndex(list('aabca') + [np.nan],
+ categories=['c', 'a', 'b'])
+ assert not ci.equals(list('aabca') + [np.nan])
+ assert ci.equals(CategoricalIndex(list('aabca') + [np.nan]))
+ assert not ci.equals(CategoricalIndex(list('aabca') + [np.nan],
+ ordered=True))
+ assert ci.equals(ci.copy())
+
+    def test_equals_categorical_unordered(self):
+ # https://github.com/pandas-dev/pandas/issues/16603
+ a = pd.CategoricalIndex(['A'], categories=['A', 'B'])
+ b = pd.CategoricalIndex(['A'], categories=['B', 'A'])
+ c = pd.CategoricalIndex(['C'], categories=['B', 'A'])
+ assert a.equals(b)
+ assert not a.equals(c)
+ assert not b.equals(c)
+
+ def test_frame_repr(self):
+ df = pd.DataFrame({"A": [1, 2, 3]},
+ index=pd.CategoricalIndex(['a', 'b', 'c']))
+ result = repr(df)
+ expected = ' A\na 1\nb 2\nc 3'
+ assert result == expected
+
+ def test_string_categorical_index_repr(self):
+ # short
+ idx = pd.CategoricalIndex(['a', 'bb', 'ccc'])
+ if PY3:
+ expected = u"""CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'a', u'bb', u'ccc'], categories=[u'a', u'bb', u'ccc'], ordered=False, dtype='category')""" # noqa
+ assert unicode(idx) == expected
+
+ # multiple lines
+ idx = pd.CategoricalIndex(['a', 'bb', 'ccc'] * 10)
+ if PY3:
+ expected = u"""CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
+ 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb',
+ 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
+ categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa
+
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb',
+ u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a',
+ u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc',
+ u'a', u'bb', u'ccc', u'a', u'bb', u'ccc'],
+ categories=[u'a', u'bb', u'ccc'], ordered=False, dtype='category')""" # noqa
+
+ assert unicode(idx) == expected
+
+ # truncated
+ idx = pd.CategoricalIndex(['a', 'bb', 'ccc'] * 100)
+ if PY3:
+ expected = u"""CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
+ ...
+ 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
+ categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa
+
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb',
+ u'ccc', u'a',
+ ...
+ u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a',
+ u'bb', u'ccc'],
+ categories=[u'a', u'bb', u'ccc'], ordered=False, dtype='category', length=300)""" # noqa
+
+ assert unicode(idx) == expected
+
+ # larger categories
+ idx = pd.CategoricalIndex(list('abcdefghijklmmo'))
+ if PY3:
+ expected = u"""CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
+ 'm', 'm', 'o'],
+ categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')""" # noqa
+
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', u'i', u'j',
+ u'k', u'l', u'm', u'm', u'o'],
+ categories=[u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', ...], ordered=False, dtype='category')""" # noqa
+
+ assert unicode(idx) == expected
+
+ # short
+ idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう'])
+ if PY3:
+ expected = u"""CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'あ', u'いい', u'ううう'], categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category')""" # noqa
+ assert unicode(idx) == expected
+
+ # multiple lines
+ idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう'] * 10)
+ if PY3:
+ expected = u"""CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
+ 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
+ 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
+ categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa
+
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい',
+ u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ',
+ u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう',
+ u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう'],
+ categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category')""" # noqa
+
+ assert unicode(idx) == expected
+
+ # truncated
+ idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう'] * 100)
+ if PY3:
+ expected = u"""CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
+ ...
+ 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
+ categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa
+
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい',
+ u'ううう', u'あ',
+ ...
+ u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ',
+ u'いい', u'ううう'],
+ categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category', length=300)""" # noqa
+
+ assert unicode(idx) == expected
+
+ # larger categories
+ idx = pd.CategoricalIndex(list(u'あいうえおかきくけこさしすせそ'))
+ if PY3:
+ expected = u"""CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し',
+ 'す', 'せ', 'そ'],
+ categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa
+
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'あ', u'い', u'う', u'え', u'お', u'か', u'き', u'く', u'け', u'こ',
+ u'さ', u'し', u'す', u'せ', u'そ'],
+ categories=[u'あ', u'い', u'う', u'え', u'お', u'か', u'き', u'く', ...], ordered=False, dtype='category')""" # noqa
+
+ assert unicode(idx) == expected
+
+        # Enable Unicode option -----------------------------------------
+ with cf.option_context('display.unicode.east_asian_width', True):
+
+ # short
+ idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう'])
+ if PY3:
+ expected = u"""CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'あ', u'いい', u'ううう'], categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category')""" # noqa
+ assert unicode(idx) == expected
+
+ # multiple lines
+ idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう'] * 10)
+ if PY3:
+ expected = u"""CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
+ 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',
+ 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
+ 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
+ categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa
+
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ',
+ u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ',
+ u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ',
+ u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ',
+ u'いい', u'ううう', u'あ', u'いい', u'ううう'],
+ categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category')""" # noqa
+
+ assert unicode(idx) == expected
+
+ # truncated
+ idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう'] * 100)
+ if PY3:
+ expected = u"""CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
+ 'ううう', 'あ',
+ ...
+ 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',
+ 'あ', 'いい', 'ううう'],
+ categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa
+
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ',
+ u'いい', u'ううう', u'あ',
+ ...
+ u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい',
+ u'ううう', u'あ', u'いい', u'ううう'],
+ categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category', length=300)""" # noqa
+
+ assert unicode(idx) == expected
+
+ # larger categories
+ idx = pd.CategoricalIndex(list(u'あいうえおかきくけこさしすせそ'))
+ if PY3:
+ expected = u"""CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ',
+ 'さ', 'し', 'す', 'せ', 'そ'],
+ categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa
+
+ assert repr(idx) == expected
+ else:
+ expected = u"""CategoricalIndex([u'あ', u'い', u'う', u'え', u'お', u'か', u'き', u'く',
+ u'け', u'こ', u'さ', u'し', u'す', u'せ', u'そ'],
+ categories=[u'あ', u'い', u'う', u'え', u'お', u'か', u'き', u'く', ...], ordered=False, dtype='category')""" # noqa
+
+ assert unicode(idx) == expected
+
+ def test_fillna_categorical(self):
+ # GH 11343
+ idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name='x')
+ # fill by value in categories
+ exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name='x')
+ tm.assert_index_equal(idx.fillna(1.0), exp)
+
+ # fill by value not in categories raises ValueError
+ msg = 'fill value must be in categories'
+ with pytest.raises(ValueError, match=msg):
+ idx.fillna(2.0)
+
+ def test_take_fill_value(self):
+ # GH 12631
+
+ # numeric category
+ idx = pd.CategoricalIndex([1, 2, 3], name='xxx')
+ result = idx.take(np.array([1, 0, -1]))
+ expected = pd.CategoricalIndex([2, 1, 3], name='xxx')
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ # fill_value
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3],
+ name='xxx')
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ expected = pd.CategoricalIndex([2, 1, 3], name='xxx')
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ # object category
+ idx = pd.CategoricalIndex(list('CBA'), categories=list('ABC'),
+ ordered=True, name='xxx')
+ result = idx.take(np.array([1, 0, -1]))
+ expected = pd.CategoricalIndex(list('BCA'), categories=list('ABC'),
+ ordered=True, name='xxx')
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ # fill_value
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ expected = pd.CategoricalIndex(['B', 'C', np.nan],
+ categories=list('ABC'), ordered=True,
+ name='xxx')
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ expected = pd.CategoricalIndex(list('BCA'), categories=list('ABC'),
+ ordered=True, name='xxx')
+ tm.assert_index_equal(result, expected)
+ tm.assert_categorical_equal(result.values, expected.values)
+
+ msg = ('When allow_fill=True and fill_value is not None, '
+ 'all indices must be >= -1')
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+ def test_take_fill_value_datetime(self):
+
+ # datetime category
+ idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
+ name='xxx')
+ idx = pd.CategoricalIndex(idx)
+ result = idx.take(np.array([1, 0, -1]))
+ expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'],
+ name='xxx')
+ expected = pd.CategoricalIndex(expected)
+ tm.assert_index_equal(result, expected)
+
+ # fill_value
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'],
+ name='xxx')
+ exp_cats = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'])
+ expected = pd.CategoricalIndex(expected, categories=exp_cats)
+ tm.assert_index_equal(result, expected)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'],
+ name='xxx')
+ expected = pd.CategoricalIndex(expected)
+ tm.assert_index_equal(result, expected)
+
+ msg = ('When allow_fill=True and fill_value is not None, '
+ 'all indices must be >= -1')
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+ def test_take_invalid_kwargs(self):
+ idx = pd.CategoricalIndex([1, 2, 3], name='foo')
+ indices = [1, 0, -1]
+
+ msg = r"take\(\) got an unexpected keyword argument 'foo'"
+ with pytest.raises(TypeError, match=msg):
+ idx.take(indices, foo=2)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, out=indices)
+
+ msg = "the 'mode' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, mode='clip')
+
+ @pytest.mark.parametrize('dtype, engine_type', [
+ (np.int8, libindex.Int8Engine),
+ (np.int16, libindex.Int16Engine),
+ (np.int32, libindex.Int32Engine),
+ (np.int64, libindex.Int64Engine),
+ ])
+ def test_engine_type(self, dtype, engine_type):
+ if dtype != np.int64:
+ # num. of uniques required to push CategoricalIndex.codes to a
+ # dtype (128 categories required for .codes dtype to be int16 etc.)
+ num_uniques = {np.int8: 1, np.int16: 128, np.int32: 32768}[dtype]
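+            # e.g. for np.int16 we build 128 uniques: per the mapping
+            # above, that is the smallest count whose codes pandas
+            # stores as int16 rather than int8.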
+ ci = pd.CategoricalIndex(range(num_uniques))
+ else:
+ # having 2**32 - 2**31 categories would be very memory-intensive,
+ # so we cheat a bit with the dtype
+ ci = pd.CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1)
+ ci.values._codes = ci.values._codes.astype('int64')
+ assert np.issubdtype(ci.codes.dtype, dtype)
+ assert isinstance(ci._engine, engine_type)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/test_common.py b/contrib/python/pandas/py2/pandas/tests/indexes/test_common.py
new file mode 100644
index 00000000000..fd356202a8c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/test_common.py
@@ -0,0 +1,343 @@
+"""
+Collection of tests asserting things that should be true for
+any index subclass. Makes use of the `indices` fixture defined
+in pandas/tests/indexes/conftest.py.
+"""
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import iNaT
+
+from pandas.core.dtypes.common import needs_i8_conversion
+
+import pandas as pd
+from pandas import CategoricalIndex, MultiIndex, RangeIndex, compat
+import pandas.util.testing as tm
+
+
+class TestCommon(object):
+
+ def test_droplevel(self, indices):
+ # GH 21115
+ if isinstance(indices, MultiIndex):
+ # Tested separately in test_multi.py
+ return
+
+ assert indices.droplevel([]).equals(indices)
+
+ for level in indices.name, [indices.name]:
+ if isinstance(indices.name, tuple) and level is indices.name:
+ # GH 21121 : droplevel with tuple name
+ continue
+ with pytest.raises(ValueError):
+ indices.droplevel(level)
+
+ for level in 'wrong', ['wrong']:
+ with pytest.raises(KeyError):
+ indices.droplevel(level)
+
+ def test_constructor_non_hashable_name(self, indices):
+ # GH 20527
+
+ if isinstance(indices, MultiIndex):
+ pytest.skip("multiindex handled in test_multi.py")
+
+ message = "Index.name must be a hashable type"
+ renamed = [['1']]
+
+ # With .rename()
+ with pytest.raises(TypeError, match=message):
+ indices.rename(name=renamed)
+
+ # With .set_names()
+ with pytest.raises(TypeError, match=message):
+ indices.set_names(names=renamed)
+
+ def test_constructor_unwraps_index(self, indices):
+ if isinstance(indices, pd.MultiIndex):
+            pytest.skip("MultiIndex has no ._data")
+ a = indices
+ b = type(a)(a)
+ tm.assert_equal(a._data, b._data)
+
+ @pytest.mark.parametrize("itm", [101, 'no_int'])
+ # FutureWarning from non-tuple sequence of nd indexing
+ @pytest.mark.filterwarnings("ignore::FutureWarning")
+ def test_getitem_error(self, indices, itm):
+ with pytest.raises(IndexError):
+ indices[itm]
+
+ @pytest.mark.parametrize(
+ 'fname, sname, expected_name',
+ [
+ ('A', 'A', 'A'),
+ ('A', 'B', None),
+ ('A', None, None),
+ (None, 'B', None),
+ (None, None, None),
+ ])
+ def test_corner_union(self, indices, fname, sname, expected_name):
+ # GH 9943 9862
+ # Test unions with various name combinations
+ # Do not test MultiIndex or repeats
+
+ if isinstance(indices, MultiIndex) or not indices.is_unique:
+ pytest.skip("Not for MultiIndex or repeated indices")
+
+ # Test copy.union(copy)
+ first = indices.copy().set_names(fname)
+ second = indices.copy().set_names(sname)
+ union = first.union(second)
+ expected = indices.copy().set_names(expected_name)
+ tm.assert_index_equal(union, expected)
+
+ # Test copy.union(empty)
+ first = indices.copy().set_names(fname)
+ second = indices.drop(indices).set_names(sname)
+ union = first.union(second)
+ expected = indices.copy().set_names(expected_name)
+ tm.assert_index_equal(union, expected)
+
+ # Test empty.union(copy)
+ first = indices.drop(indices).set_names(fname)
+ second = indices.copy().set_names(sname)
+ union = first.union(second)
+ expected = indices.copy().set_names(expected_name)
+ tm.assert_index_equal(union, expected)
+
+ # Test empty.union(empty)
+ first = indices.drop(indices).set_names(fname)
+ second = indices.drop(indices).set_names(sname)
+ union = first.union(second)
+ expected = indices.drop(indices).set_names(expected_name)
+ tm.assert_index_equal(union, expected)
+
+ def test_to_flat_index(self, indices):
+ # 22866
+ if isinstance(indices, MultiIndex):
+ pytest.skip("Separate expectation for MultiIndex")
+
+ result = indices.to_flat_index()
+ tm.assert_index_equal(result, indices)
+
+ def test_wrong_number_names(self, indices):
+ with pytest.raises(ValueError, match="^Length"):
+ indices.names = ["apple", "banana", "carrot"]
+
+ def test_set_name_methods(self, indices):
+ new_name = "This is the new name for this index"
+
+        # don't test a MultiIndex here (as it's tested separately)
+ if isinstance(indices, MultiIndex):
+ pytest.skip('Skip check for MultiIndex')
+ original_name = indices.name
+ new_ind = indices.set_names([new_name])
+ assert new_ind.name == new_name
+ assert indices.name == original_name
+ res = indices.rename(new_name, inplace=True)
+
+ # should return None
+ assert res is None
+ assert indices.name == new_name
+ assert indices.names == [new_name]
+ # with pytest.raises(TypeError, match="list-like"):
+ # # should still fail even if it would be the right length
+ # ind.set_names("a")
+ with pytest.raises(ValueError, match="Level must be None"):
+ indices.set_names("a", level=0)
+
+ # rename in place just leaves tuples and other containers alone
+ name = ('A', 'B')
+ indices.rename(name, inplace=True)
+ assert indices.name == name
+ assert indices.names == [name]
+
+ def test_dtype_str(self, indices):
+ dtype = indices.dtype_str
+ assert isinstance(dtype, compat.string_types)
+ assert dtype == str(indices.dtype)
+
+ def test_hash_error(self, indices):
+ index = indices
+ with pytest.raises(TypeError, match=("unhashable type: %r" %
+ type(index).__name__)):
+ hash(indices)
+
+ def test_copy_and_deepcopy(self, indices):
+ from copy import copy, deepcopy
+
+ if isinstance(indices, MultiIndex):
+ pytest.skip('Skip check for MultiIndex')
+
+ for func in (copy, deepcopy):
+ idx_copy = func(indices)
+ assert idx_copy is not indices
+ assert idx_copy.equals(indices)
+
+ new_copy = indices.copy(deep=True, name="banana")
+ assert new_copy.name == "banana"
+
+ def test_unique(self, indices):
+        # don't test a MultiIndex here (as it's tested separately)
+ # don't test a CategoricalIndex because categories change (GH 18291)
+ if isinstance(indices, (MultiIndex, CategoricalIndex)):
+ pytest.skip('Skip check for MultiIndex/CategoricalIndex')
+
+ # GH 17896
+ expected = indices.drop_duplicates()
+ for level in 0, indices.name, None:
+ result = indices.unique(level=level)
+ tm.assert_index_equal(result, expected)
+
+ for level in 3, 'wrong':
+ pytest.raises((IndexError, KeyError), indices.unique, level=level)
+
+ def test_get_unique_index(self, indices):
+ # MultiIndex tested separately
+ if not len(indices) or isinstance(indices, MultiIndex):
+ pytest.skip('Skip check for empty Index and MultiIndex')
+
+ idx = indices[[0] * 5]
+ idx_unique = indices[[0]]
+
+ # We test against `idx_unique`, so first we make sure it's unique
+ # and doesn't contain nans.
+ assert idx_unique.is_unique is True
+ try:
+ assert idx_unique.hasnans is False
+ except NotImplementedError:
+ pass
+
+ for dropna in [False, True]:
+ result = idx._get_unique_index(dropna=dropna)
+ tm.assert_index_equal(result, idx_unique)
+
+ # nans:
+ if not indices._can_hold_na:
+ pytest.skip('Skip na-check if index cannot hold na')
+
+ if needs_i8_conversion(indices):
+ vals = indices.asi8[[0] * 5]
+ vals[0] = iNaT
+ else:
+ vals = indices.values[[0] * 5]
+ vals[0] = np.nan
+
+ vals_unique = vals[:2]
+ idx_nan = indices._shallow_copy(vals)
+ idx_unique_nan = indices._shallow_copy(vals_unique)
+ assert idx_unique_nan.is_unique is True
+
+ assert idx_nan.dtype == indices.dtype
+ assert idx_unique_nan.dtype == indices.dtype
+
+ for dropna, expected in zip([False, True],
+ [idx_unique_nan,
+ idx_unique]):
+ for i in [idx_nan, idx_unique_nan]:
+ result = i._get_unique_index(dropna=dropna)
+ tm.assert_index_equal(result, expected)
+
+ def test_sort(self, indices):
+ pytest.raises(TypeError, indices.sort)
+
+ def test_mutability(self, indices):
+ if not len(indices):
+ pytest.skip('Skip check for empty Index')
+ pytest.raises(TypeError, indices.__setitem__, 0, indices[0])
+
+ def test_view(self, indices):
+ assert indices.view().name == indices.name
+
+ def test_compat(self, indices):
+ assert indices.tolist() == list(indices)
+
+ def test_searchsorted_monotonic(self, indices):
+ # GH17271
+ # not implemented for tuple searches in MultiIndex
+ # or Intervals searches in IntervalIndex
+ if isinstance(indices, (MultiIndex, pd.IntervalIndex)):
+ pytest.skip('Skip check for MultiIndex/IntervalIndex')
+
+ # nothing to test if the index is empty
+ if indices.empty:
+ pytest.skip('Skip check for empty Index')
+ value = indices[0]
+
+ # determine the expected results (handle dupes for 'right')
+ expected_left, expected_right = 0, (indices == value).argmin()
+ if expected_right == 0:
+ # all values are the same, expected_right should be length
+ expected_right = len(indices)
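+        # worked example: for values ['a', 'a', 'b'] and value 'a',
+        # (indices == value) is [True, True, False] and argmin() gives
+        # 2, the first position past the run of matches; if everything
+        # matches, argmin() is 0 and len(indices) is used instead.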
+
+ # test _searchsorted_monotonic in all cases
+ # test searchsorted only for increasing
+ if indices.is_monotonic_increasing:
+ ssm_left = indices._searchsorted_monotonic(value, side='left')
+ assert expected_left == ssm_left
+
+ ssm_right = indices._searchsorted_monotonic(value, side='right')
+ assert expected_right == ssm_right
+
+ ss_left = indices.searchsorted(value, side='left')
+ assert expected_left == ss_left
+
+ ss_right = indices.searchsorted(value, side='right')
+ assert expected_right == ss_right
+
+ elif indices.is_monotonic_decreasing:
+ ssm_left = indices._searchsorted_monotonic(value, side='left')
+ assert expected_left == ssm_left
+
+ ssm_right = indices._searchsorted_monotonic(value, side='right')
+ assert expected_right == ssm_right
+ else:
+ # non-monotonic should raise.
+ with pytest.raises(ValueError):
+ indices._searchsorted_monotonic(value, side='left')
+
+ def test_pickle(self, indices):
+ original_name, indices.name = indices.name, 'foo'
+ unpickled = tm.round_trip_pickle(indices)
+ assert indices.equals(unpickled)
+ indices.name = original_name
+
+ @pytest.mark.parametrize('keep', ['first', 'last', False])
+ def test_duplicated(self, indices, keep):
+ if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)):
+ # MultiIndex tested separately in:
+ # tests/indexes/multi/test_unique_and_duplicates
+ pytest.skip('Skip check for empty Index, MultiIndex, RangeIndex')
+
+ holder = type(indices)
+
+ idx = holder(indices)
+ if idx.has_duplicates:
+ # We are testing the duplicated-method here, so we need to know
+ # exactly which indices are duplicate and how (for the result).
+ # This is not possible if "idx" has duplicates already, which we
+ # therefore remove. This is seemingly circular, as drop_duplicates
+ # invokes duplicated, but in the end, it all works out because we
+ # cross-check with Series.duplicated, which is tested separately.
+ idx = idx.drop_duplicates()
+
+ n, k = len(idx), 10
+ duplicated_selection = np.random.choice(n, k * n)
+ expected = pd.Series(duplicated_selection).duplicated(keep=keep).values
+ idx = holder(idx.values[duplicated_selection])
+
+ result = idx.duplicated(keep=keep)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_has_duplicates(self, indices):
+ holder = type(indices)
+ if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)):
+ # MultiIndex tested separately in:
+ # tests/indexes/multi/test_unique_and_duplicates.
+ # RangeIndex is unique by definition.
+ pytest.skip('Skip check for empty Index, MultiIndex, '
+ 'and RangeIndex')
+
+ idx = holder([indices[0]] * 5)
+ assert idx.is_unique is False
+ assert idx.has_duplicates is True
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/test_frozen.py b/contrib/python/pandas/py2/pandas/tests/indexes/test_frozen.py
new file mode 100644
index 00000000000..c2931b10233
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/test_frozen.py
@@ -0,0 +1,109 @@
+import warnings
+
+import numpy as np
+
+from pandas.compat import u
+
+from pandas.core.indexes.frozen import FrozenList, FrozenNDArray
+from pandas.tests.test_base import CheckImmutable, CheckStringMixin
+from pandas.util import testing as tm
+
+
+class TestFrozenList(CheckImmutable, CheckStringMixin):
+ mutable_methods = ('extend', 'pop', 'remove', 'insert')
+ unicode_container = FrozenList([u("\u05d0"), u("\u05d1"), "c"])
+
+ def setup_method(self, _):
+ self.lst = [1, 2, 3, 4, 5]
+ self.container = FrozenList(self.lst)
+ self.klass = FrozenList
+
+ def test_add(self):
+ result = self.container + (1, 2, 3)
+ expected = FrozenList(self.lst + [1, 2, 3])
+ self.check_result(result, expected)
+
+ result = (1, 2, 3) + self.container
+ expected = FrozenList([1, 2, 3] + self.lst)
+ self.check_result(result, expected)
+
+ def test_iadd(self):
+ q = r = self.container
+
+ q += [5]
+ self.check_result(q, self.lst + [5])
+
+ # Other shouldn't be mutated.
+ self.check_result(r, self.lst)
+
+ def test_union(self):
+ result = self.container.union((1, 2, 3))
+ expected = FrozenList(self.lst + [1, 2, 3])
+ self.check_result(result, expected)
+
+ def test_difference(self):
+ result = self.container.difference([2])
+ expected = FrozenList([1, 3, 4, 5])
+ self.check_result(result, expected)
+
+ def test_difference_dupe(self):
+ result = FrozenList([1, 2, 3, 2]).difference([2])
+ expected = FrozenList([1, 3])
+ self.check_result(result, expected)
+
+
+class TestFrozenNDArray(CheckImmutable, CheckStringMixin):
+ mutable_methods = ('put', 'itemset', 'fill')
+
+ def setup_method(self, _):
+ self.lst = [3, 5, 7, -2]
+ self.klass = FrozenNDArray
+
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", FutureWarning)
+
+ self.container = FrozenNDArray(self.lst)
+ self.unicode_container = FrozenNDArray(
+ [u("\u05d0"), u("\u05d1"), "c"])
+
+ def test_constructor_warns(self):
+ # see gh-9031
+ with tm.assert_produces_warning(FutureWarning):
+ FrozenNDArray([1, 2, 3])
+
+ def test_shallow_copying(self):
+ original = self.container.copy()
+ assert isinstance(self.container.view(), FrozenNDArray)
+ assert not isinstance(self.container.view(np.ndarray), FrozenNDArray)
+ assert self.container.view() is not self.container
+ tm.assert_numpy_array_equal(self.container, original)
+
+ # Shallow copy should be the same too
+ assert isinstance(self.container._shallow_copy(), FrozenNDArray)
+
+ # setting should not be allowed
+ def testit(container):
+ container[0] = 16
+
+ self.check_mutable_error(testit, self.container)
+
+ def test_values(self):
+ original = self.container.view(np.ndarray).copy()
+ n = original[0] + 15
+
+ vals = self.container.values()
+ tm.assert_numpy_array_equal(original, vals)
+
+ assert original is not vals
+ vals[0] = n
+
+ assert isinstance(self.container, FrozenNDArray)
+ tm.assert_numpy_array_equal(self.container.values(), original)
+ assert vals[0] == n
+
+ def test_searchsorted(self):
+ expected = 2
+ assert self.container.searchsorted(7) == expected
+
+ with tm.assert_produces_warning(FutureWarning):
+ assert self.container.searchsorted(v=7) == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/test_numeric.py b/contrib/python/pandas/py2/pandas/tests/indexes/test_numeric.py
new file mode 100644
index 00000000000..a64340c02cd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/test_numeric.py
@@ -0,0 +1,1091 @@
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import Timestamp
+from pandas.compat import range
+
+import pandas as pd
+from pandas import Float64Index, Index, Int64Index, Series, UInt64Index
+from pandas.tests.indexes.common import Base
+import pandas.util.testing as tm
+
+
+class Numeric(Base):
+
+ def test_can_hold_identifiers(self):
+ idx = self.create_index()
+ key = idx[0]
+ assert idx._can_hold_identifiers_and_holds_name(key) is False
+
+ def test_numeric_compat(self):
+ pass # override Base method
+
+ def test_explicit_conversions(self):
+
+ # GH 8608
+ # add/sub are overridden explicitly for Float/Int Index
+ idx = self._holder(np.arange(5, dtype='int64'))
+
+ # float conversions
+ arr = np.arange(5, dtype='int64') * 3.2
+ expected = Float64Index(arr)
+ fidx = idx * 3.2
+ tm.assert_index_equal(fidx, expected)
+ fidx = 3.2 * idx
+ tm.assert_index_equal(fidx, expected)
+
+ # interops with numpy arrays
+ expected = Float64Index(arr)
+ a = np.zeros(5, dtype='float64')
+ result = fidx - a
+ tm.assert_index_equal(result, expected)
+
+ expected = Float64Index(-arr)
+ a = np.zeros(5, dtype='float64')
+ result = a - fidx
+ tm.assert_index_equal(result, expected)
+
+ def test_index_groupby(self):
+ int_idx = Index(range(6))
+ float_idx = Index(np.arange(0, 0.6, 0.1))
+ obj_idx = Index('A B C D E F'.split())
+ dt_idx = pd.date_range('2013-01-01', freq='M', periods=6)
+
+ for idx in [int_idx, float_idx, obj_idx, dt_idx]:
+ to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1])
+ tm.assert_dict_equal(idx.groupby(to_groupby),
+ {1.0: idx[[0, 5]], 2.0: idx[[1, 4]]})
+
+ to_groupby = Index([datetime(2011, 11, 1),
+ datetime(2011, 12, 1),
+ pd.NaT,
+ pd.NaT,
+ datetime(2011, 12, 1),
+ datetime(2011, 11, 1)],
+ tz='UTC').values
+
+ ex_keys = [Timestamp('2011-11-01'), Timestamp('2011-12-01')]
+ expected = {ex_keys[0]: idx[[0, 5]],
+ ex_keys[1]: idx[[1, 4]]}
+ tm.assert_dict_equal(idx.groupby(to_groupby), expected)
+
+ @pytest.mark.parametrize('klass', [list, tuple, np.array, Series])
+ def test_where(self, klass):
+ i = self.create_index()
+ cond = [True] * len(i)
+ expected = i
+        result = i.where(klass(cond))
+        tm.assert_index_equal(result, expected)
+
+ cond = [False] + [True] * (len(i) - 1)
+ expected = Float64Index([i._na_value] + i[1:].tolist())
+ result = i.where(klass(cond))
+ tm.assert_index_equal(result, expected)
+
+ def test_insert(self):
+        # GH 18295 (test inserting missing values)
+ expected = Float64Index([0, np.nan, 1, 2, 3, 4])
+ for na in (np.nan, pd.NaT, None):
+ result = self.create_index().insert(1, na)
+ tm.assert_index_equal(result, expected)
+
+
+class TestFloat64Index(Numeric):
+ _holder = Float64Index
+
+ def setup_method(self, method):
+ self.indices = dict(mixed=Float64Index([1.5, 2, 3, 4, 5]),
+ float=Float64Index(np.arange(5) * 2.5),
+ mixed_dec=Float64Index([5, 4, 3, 2, 1.5]),
+ float_dec=Float64Index(np.arange(4, -1, -1) * 2.5))
+ self.setup_indices()
+
+ def create_index(self):
+ return Float64Index(np.arange(5, dtype='float64'))
+
+ def test_repr_roundtrip(self):
+ for ind in (self.mixed, self.float):
+ tm.assert_index_equal(eval(repr(ind)), ind)
+
+ def check_is_index(self, i):
+ assert isinstance(i, Index)
+ assert not isinstance(i, Float64Index)
+
+ def check_coerce(self, a, b, is_float_index=True):
+ assert a.equals(b)
+ tm.assert_index_equal(a, b, exact=False)
+ if is_float_index:
+ assert isinstance(b, Float64Index)
+ else:
+ self.check_is_index(b)
+
+ def test_constructor(self):
+
+ # explicit construction
+ index = Float64Index([1, 2, 3, 4, 5])
+ assert isinstance(index, Float64Index)
+ expected = np.array([1, 2, 3, 4, 5], dtype='float64')
+ tm.assert_numpy_array_equal(index.values, expected)
+ index = Float64Index(np.array([1, 2, 3, 4, 5]))
+ assert isinstance(index, Float64Index)
+ index = Float64Index([1., 2, 3, 4, 5])
+ assert isinstance(index, Float64Index)
+ index = Float64Index(np.array([1., 2, 3, 4, 5]))
+ assert isinstance(index, Float64Index)
+ assert index.dtype == float
+
+ index = Float64Index(np.array([1., 2, 3, 4, 5]), dtype=np.float32)
+ assert isinstance(index, Float64Index)
+ assert index.dtype == np.float64
+
+ index = Float64Index(np.array([1, 2, 3, 4, 5]), dtype=np.float32)
+ assert isinstance(index, Float64Index)
+ assert index.dtype == np.float64
+
+ # nan handling
+ result = Float64Index([np.nan, np.nan])
+ assert pd.isna(result.values).all()
+ result = Float64Index(np.array([np.nan]))
+ assert pd.isna(result.values).all()
+ result = Index(np.array([np.nan]))
+ assert pd.isna(result.values).all()
+
+ def test_constructor_invalid(self):
+
+ # invalid
+ pytest.raises(TypeError, Float64Index, 0.)
+ pytest.raises(TypeError, Float64Index, ['a', 'b', 0.])
+ pytest.raises(TypeError, Float64Index, [Timestamp('20130101')])
+
+ def test_constructor_coerce(self):
+
+ self.check_coerce(self.mixed, Index([1.5, 2, 3, 4, 5]))
+ self.check_coerce(self.float, Index(np.arange(5) * 2.5))
+ self.check_coerce(self.float, Index(np.array(
+ np.arange(5) * 2.5, dtype=object)))
+
+ def test_constructor_explicit(self):
+
+ # these don't auto convert
+ self.check_coerce(self.float,
+ Index((np.arange(5) * 2.5), dtype=object),
+ is_float_index=False)
+ self.check_coerce(self.mixed, Index(
+ [1.5, 2, 3, 4, 5], dtype=object), is_float_index=False)
+
+ def test_astype(self):
+
+ result = self.float.astype(object)
+ assert result.equals(self.float)
+ assert self.float.equals(result)
+ self.check_is_index(result)
+
+ i = self.mixed.copy()
+ i.name = 'foo'
+ result = i.astype(object)
+ assert result.equals(i)
+ assert i.equals(result)
+ self.check_is_index(result)
+
+ # GH 12881
+ # a float astype int
+ for dtype in ['int16', 'int32', 'int64']:
+ i = Float64Index([0, 1, 2])
+ result = i.astype(dtype)
+ expected = Int64Index([0, 1, 2])
+ tm.assert_index_equal(result, expected)
+
+ i = Float64Index([0, 1.1, 2])
+ result = i.astype(dtype)
+ expected = Int64Index([0, 1, 2])
+ tm.assert_index_equal(result, expected)
+
+ for dtype in ['float32', 'float64']:
+ i = Float64Index([0, 1, 2])
+ result = i.astype(dtype)
+ expected = i
+ tm.assert_index_equal(result, expected)
+
+ i = Float64Index([0, 1.1, 2])
+ result = i.astype(dtype)
+ expected = Index(i.values.astype(dtype))
+ tm.assert_index_equal(result, expected)
+
+ # invalid
+ for dtype in ['M8[ns]', 'm8[ns]']:
+ pytest.raises(TypeError, lambda: i.astype(dtype))
+
+ # GH 13149
+ for dtype in ['int16', 'int32', 'int64']:
+ i = Float64Index([0, 1.1, np.NAN])
+ pytest.raises(ValueError, lambda: i.astype(dtype))
+
+ def test_type_coercion_fail(self, any_int_dtype):
+ # see gh-15832
+ msg = "Trying to coerce float values to integers"
+ with pytest.raises(ValueError, match=msg):
+ Index([1, 2, 3.5], dtype=any_int_dtype)
+
+ def test_type_coercion_valid(self, float_dtype):
+ # There is no Float32Index, so we always
+ # generate Float64Index.
+ i = Index([1, 2, 3.5], dtype=float_dtype)
+ tm.assert_index_equal(i, Index([1, 2, 3.5]))
+
+ def test_equals_numeric(self):
+
+ i = Float64Index([1.0, 2.0])
+ assert i.equals(i)
+ assert i.identical(i)
+
+ i2 = Float64Index([1.0, 2.0])
+ assert i.equals(i2)
+
+ i = Float64Index([1.0, np.nan])
+ assert i.equals(i)
+ assert i.identical(i)
+
+ i2 = Float64Index([1.0, np.nan])
+ assert i.equals(i2)
+
+ def test_get_indexer(self):
+ idx = Float64Index([0.0, 1.0, 2.0])
+ tm.assert_numpy_array_equal(idx.get_indexer(idx),
+ np.array([0, 1, 2], dtype=np.intp))
+
+ target = [-0.1, 0.5, 1.1]
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'),
+ np.array([-1, 0, 1], dtype=np.intp))
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'),
+ np.array([0, 1, 2], dtype=np.intp))
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'),
+ np.array([0, 1, 1], dtype=np.intp))
+
+ def test_get_loc(self):
+ idx = Float64Index([0.0, 1.0, 2.0])
+ for method in [None, 'pad', 'backfill', 'nearest']:
+ assert idx.get_loc(1, method) == 1
+ if method is not None:
+ assert idx.get_loc(1, method, tolerance=0) == 1
+
+ for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]:
+ assert idx.get_loc(1.1, method) == loc
+ assert idx.get_loc(1.1, method, tolerance=0.9) == loc
+
+ pytest.raises(KeyError, idx.get_loc, 'foo')
+ pytest.raises(KeyError, idx.get_loc, 1.5)
+ pytest.raises(KeyError, idx.get_loc, 1.5, method='pad',
+ tolerance=0.1)
+ pytest.raises(KeyError, idx.get_loc, True)
+ pytest.raises(KeyError, idx.get_loc, False)
+
+ with pytest.raises(ValueError, match='must be numeric'):
+ idx.get_loc(1.4, method='nearest', tolerance='foo')
+
+ with pytest.raises(ValueError, match='must contain numeric elements'):
+ idx.get_loc(1.4, method='nearest', tolerance=np.array(['foo']))
+
+ with pytest.raises(
+ ValueError,
+ match='tolerance size must match target index size'):
+ idx.get_loc(1.4, method='nearest', tolerance=np.array([1, 2]))
+
+ def test_get_loc_na(self):
+ idx = Float64Index([np.nan, 1, 2])
+ assert idx.get_loc(1) == 1
+ assert idx.get_loc(np.nan) == 0
+
+ idx = Float64Index([np.nan, 1, np.nan])
+ assert idx.get_loc(1) == 1
+
+        # representable by slice [0:3:2]
+ # pytest.raises(KeyError, idx.slice_locs, np.nan)
+ sliced = idx.slice_locs(np.nan)
+ assert isinstance(sliced, tuple)
+ assert sliced == (0, 3)
+
+ # not representable by slice
+ idx = Float64Index([np.nan, 1, np.nan, np.nan])
+ assert idx.get_loc(1) == 1
+ pytest.raises(KeyError, idx.slice_locs, np.nan)
+
+ def test_get_loc_missing_nan(self):
+ # GH 8569
+ idx = Float64Index([1, 2])
+ assert idx.get_loc(1) == 0
+ pytest.raises(KeyError, idx.get_loc, 3)
+ pytest.raises(KeyError, idx.get_loc, np.nan)
+ pytest.raises(KeyError, idx.get_loc, [np.nan])
+
+ def test_contains_nans(self):
+ i = Float64Index([1.0, 2.0, np.nan])
+ assert np.nan in i
+
+ def test_contains_not_nans(self):
+ i = Float64Index([1.0, 2.0, np.nan])
+ assert 1.0 in i
+
+ def test_doesnt_contain_all_the_things(self):
+ i = Float64Index([np.nan])
+ assert not i.isin([0]).item()
+ assert not i.isin([1]).item()
+ assert i.isin([np.nan]).item()
+
+ def test_nan_multiple_containment(self):
+ i = Float64Index([1.0, np.nan])
+ tm.assert_numpy_array_equal(i.isin([1.0]), np.array([True, False]))
+ tm.assert_numpy_array_equal(i.isin([2.0, np.pi]),
+ np.array([False, False]))
+ tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, True]))
+ tm.assert_numpy_array_equal(i.isin([1.0, np.nan]),
+ np.array([True, True]))
+ i = Float64Index([1.0, 2.0])
+ tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, False]))
+
+ def test_astype_from_object(self):
+ index = Index([1.0, np.nan, 0.2], dtype='object')
+ result = index.astype(float)
+ expected = Float64Index([1.0, np.nan, 0.2])
+ assert result.dtype == expected.dtype
+ tm.assert_index_equal(result, expected)
+
+ def test_fillna_float64(self):
+ # GH 11343
+ idx = Index([1.0, np.nan, 3.0], dtype=float, name='x')
+ # can't downcast
+ exp = Index([1.0, 0.1, 3.0], name='x')
+ tm.assert_index_equal(idx.fillna(0.1), exp)
+
+ # downcast
+ exp = Float64Index([1.0, 2.0, 3.0], name='x')
+ tm.assert_index_equal(idx.fillna(2), exp)
+
+ # object
+ exp = Index([1.0, 'obj', 3.0], name='x')
+ tm.assert_index_equal(idx.fillna('obj'), exp)
+
+ def test_take_fill_value(self):
+ # GH 12631
+ idx = pd.Float64Index([1., 2., 3.], name='xxx')
+ result = idx.take(np.array([1, 0, -1]))
+ expected = pd.Float64Index([2., 1., 3.], name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ # fill_value
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ expected = pd.Float64Index([2., 1., np.nan], name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ expected = pd.Float64Index([2., 1., 3.], name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ msg = ('When allow_fill=True and fill_value is not None, '
+ 'all indices must be >= -1')
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+
+class NumericInt(Numeric):
+
+ def test_view(self):
+ i = self._holder([], name='Foo')
+ i_view = i.view()
+ assert i_view.name == 'Foo'
+
+ i_view = i.view(self._dtype)
+ tm.assert_index_equal(i, self._holder(i_view, name='Foo'))
+
+ i_view = i.view(self._holder)
+ tm.assert_index_equal(i, self._holder(i_view, name='Foo'))
+
+ def test_is_monotonic(self):
+ assert self.index.is_monotonic is True
+ assert self.index.is_monotonic_increasing is True
+ assert self.index._is_strictly_monotonic_increasing is True
+ assert self.index.is_monotonic_decreasing is False
+ assert self.index._is_strictly_monotonic_decreasing is False
+
+ index = self._holder([4, 3, 2, 1])
+ assert index.is_monotonic is False
+ assert index._is_strictly_monotonic_increasing is False
+ assert index._is_strictly_monotonic_decreasing is True
+
+ index = self._holder([1])
+ assert index.is_monotonic is True
+ assert index.is_monotonic_increasing is True
+ assert index.is_monotonic_decreasing is True
+ assert index._is_strictly_monotonic_increasing is True
+ assert index._is_strictly_monotonic_decreasing is True
+
+ def test_is_strictly_monotonic(self):
+ index = self._holder([1, 1, 2, 3])
+ assert index.is_monotonic_increasing is True
+ assert index._is_strictly_monotonic_increasing is False
+
+ index = self._holder([3, 2, 1, 1])
+ assert index.is_monotonic_decreasing is True
+ assert index._is_strictly_monotonic_decreasing is False
+
+ index = self._holder([1, 1])
+ assert index.is_monotonic_increasing
+ assert index.is_monotonic_decreasing
+ assert not index._is_strictly_monotonic_increasing
+ assert not index._is_strictly_monotonic_decreasing
+
+ def test_logical_compat(self):
+ idx = self.create_index()
+ assert idx.all() == idx.values.all()
+ assert idx.any() == idx.values.any()
+
+ def test_identical(self):
+ i = Index(self.index.copy())
+ assert i.identical(self.index)
+
+ same_values_different_type = Index(i, dtype=object)
+ assert not i.identical(same_values_different_type)
+
+ i = self.index.copy(dtype=object)
+ i = i.rename('foo')
+ same_values = Index(i, dtype=object)
+ assert same_values.identical(i)
+
+ assert not i.identical(self.index)
+ assert Index(same_values, name='foo', dtype=object).identical(i)
+
+ assert not self.index.copy(dtype=object).identical(
+ self.index.copy(dtype=self._dtype))
+
+ def test_join_non_unique(self):
+ left = Index([4, 4, 3, 3])
+
+ joined, lidx, ridx = left.join(left, return_indexers=True)
+
+ exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4])
+ tm.assert_index_equal(joined, exp_joined)
+
+ exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp)
+ tm.assert_numpy_array_equal(lidx, exp_lidx)
+
+ exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp)
+ tm.assert_numpy_array_equal(ridx, exp_ridx)
+
+ @pytest.mark.parametrize('kind', ['outer', 'inner', 'left', 'right'])
+ def test_join_self(self, kind):
+ joined = self.index.join(self.index, how=kind)
+ assert self.index is joined
+
+ def test_union_noncomparable(self):
+ from datetime import datetime, timedelta
+ # corner case, non-Int64Index
+ now = datetime.now()
+ other = Index([now + timedelta(i) for i in range(4)], dtype=object)
+ result = self.index.union(other)
+ expected = Index(np.concatenate((self.index, other)))
+ tm.assert_index_equal(result, expected)
+
+ result = other.union(self.index)
+ expected = Index(np.concatenate((other, self.index)))
+ tm.assert_index_equal(result, expected)
+
+ def test_cant_or_shouldnt_cast(self):
+ # can't
+ data = ['foo', 'bar', 'baz']
+ pytest.raises(TypeError, self._holder, data)
+
+ # shouldn't
+ data = ['0', '1', '2']
+ pytest.raises(TypeError, self._holder, data)
+
+ def test_view_index(self):
+ self.index.view(Index)
+
+ def test_prevent_casting(self):
+ result = self.index.astype('O')
+ assert result.dtype == np.object_
+
+ def test_take_preserve_name(self):
+ index = self._holder([1, 2, 3, 4], name='foo')
+ taken = index.take([3, 0, 1])
+ assert index.name == taken.name
+
+ def test_take_fill_value(self):
+ # see gh-12631
+ idx = self._holder([1, 2, 3], name='xxx')
+ result = idx.take(np.array([1, 0, -1]))
+ expected = self._holder([2, 1, 3], name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ name = self._holder.__name__
+ msg = ("Unable to fill values because "
+ "{name} cannot contain NA").format(name=name)
+
+ # fill_value=True
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -1]), fill_value=True)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ expected = self._holder([2, 1, 3], name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+ def test_slice_keep_name(self):
+ idx = self._holder([1, 2], name='asdf')
+ assert idx.name == idx[1:].name
+
+
+class TestInt64Index(NumericInt):
+ _dtype = 'int64'
+ _holder = Int64Index
+
+ def setup_method(self, method):
+ self.indices = dict(index=Int64Index(np.arange(0, 20, 2)),
+ index_dec=Int64Index(np.arange(19, -1, -1)))
+ self.setup_indices()
+
+ def create_index(self):
+ return Int64Index(np.arange(5, dtype='int64'))
+
+ def test_constructor(self):
+ # pass list, coerce fine
+ index = Int64Index([-5, 0, 1, 2])
+ expected = Index([-5, 0, 1, 2], dtype=np.int64)
+ tm.assert_index_equal(index, expected)
+
+ # from iterable
+ index = Int64Index(iter([-5, 0, 1, 2]))
+ tm.assert_index_equal(index, expected)
+
+        # a scalar raises TypeError
+ pytest.raises(TypeError, Int64Index, 5)
+
+ # copy
+ arr = self.index.values
+ new_index = Int64Index(arr, copy=True)
+ tm.assert_index_equal(new_index, self.index)
+ val = arr[0] + 3000
+
+ # this should not change index
+ arr[0] = val
+ assert new_index[0] != val
+
+ # interpret list-like
+ expected = Int64Index([5, 0])
+ for cls in [Index, Int64Index]:
+ for idx in [cls([5, 0], dtype='int64'),
+ cls(np.array([5, 0]), dtype='int64'),
+ cls(Series([5, 0]), dtype='int64')]:
+ tm.assert_index_equal(idx, expected)
+
+ def test_constructor_corner(self):
+ arr = np.array([1, 2, 3, 4], dtype=object)
+ index = Int64Index(arr)
+ assert index.values.dtype == np.int64
+ tm.assert_index_equal(index, Index(arr))
+
+ # preventing casting
+ arr = np.array([1, '2', 3, '4'], dtype=object)
+ with pytest.raises(TypeError, match='casting'):
+ Int64Index(arr)
+
+ arr_with_floats = [0, 2, 3, 4, 5, 1.25, 3, -1]
+ with pytest.raises(TypeError, match='casting'):
+ Int64Index(arr_with_floats)
+
+ def test_constructor_coercion_signed_to_unsigned(self, uint_dtype):
+
+ # see gh-15832
+ msg = "Trying to coerce negative values to unsigned integers"
+
+ with pytest.raises(OverflowError, match=msg):
+ Index([-1], dtype=uint_dtype)
+
+ def test_constructor_unwraps_index(self):
+ idx = pd.Index([1, 2])
+ result = pd.Int64Index(idx)
+ expected = np.array([1, 2], dtype='int64')
+ tm.assert_numpy_array_equal(result._data, expected)
+
+ def test_coerce_list(self):
+        # a plain list of ints is coerced to Int64Index
+ arr = Index([1, 2, 3, 4])
+ assert isinstance(arr, Int64Index)
+
+ # but not if explicit dtype passed
+ arr = Index([1, 2, 3, 4], dtype=object)
+ assert isinstance(arr, Index)
+
+ def test_get_indexer(self):
+ target = Int64Index(np.arange(10))
+ indexer = self.index.get_indexer(target)
+ expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp)
+ tm.assert_numpy_array_equal(indexer, expected)
+
+ target = Int64Index(np.arange(10))
+ indexer = self.index.get_indexer(target, method='pad')
+ expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp)
+ tm.assert_numpy_array_equal(indexer, expected)
+
+ target = Int64Index(np.arange(10))
+ indexer = self.index.get_indexer(target, method='backfill')
+ expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp)
+ tm.assert_numpy_array_equal(indexer, expected)
+
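+    # Roughly: method='pad' (ffill) maps each target value to the last
+    # index position whose value is <= target, while method='backfill'
+    # (bfill) maps to the first position whose value is >= target.  With
+    # self.index == [0, 2, ..., 18], target 3 pads to position 1 (value
+    # 2) and backfills to position 2 (value 4).
+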
+ def test_intersection(self):
+ other = Index([1, 2, 3, 4, 5])
+ result = self.index.intersection(other)
+ expected = Index(np.sort(np.intersect1d(self.index.values,
+ other.values)))
+ tm.assert_index_equal(result, expected)
+
+ result = other.intersection(self.index)
+ expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values,
+ other.values))))
+ tm.assert_index_equal(result, expected)
+
+ def test_join_inner(self):
+ other = Int64Index([7, 12, 25, 1, 2, 5])
+ other_mono = Int64Index([1, 2, 5, 7, 12, 25])
+
+ # not monotonic
+ res, lidx, ridx = self.index.join(other, how='inner',
+ return_indexers=True)
+
+ # no guarantee of sortedness, so sort for comparison purposes
+ ind = res.argsort()
+ res = res.take(ind)
+ lidx = lidx.take(ind)
+ ridx = ridx.take(ind)
+
+ eres = Int64Index([2, 12])
+ elidx = np.array([1, 6], dtype=np.intp)
+ eridx = np.array([4, 1], dtype=np.intp)
+
+ assert isinstance(res, Int64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ # monotonic
+ res, lidx, ridx = self.index.join(other_mono, how='inner',
+ return_indexers=True)
+
+ res2 = self.index.intersection(other_mono)
+ tm.assert_index_equal(res, res2)
+
+ elidx = np.array([1, 6], dtype=np.intp)
+ eridx = np.array([1, 4], dtype=np.intp)
+ assert isinstance(res, Int64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ def test_join_left(self):
+ other = Int64Index([7, 12, 25, 1, 2, 5])
+ other_mono = Int64Index([1, 2, 5, 7, 12, 25])
+
+ # not monotonic
+ res, lidx, ridx = self.index.join(other, how='left',
+ return_indexers=True)
+ eres = self.index
+ eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1],
+ dtype=np.intp)
+
+ assert isinstance(res, Int64Index)
+ tm.assert_index_equal(res, eres)
+ assert lidx is None
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ # monotonic
+ res, lidx, ridx = self.index.join(other_mono, how='left',
+ return_indexers=True)
+ eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1],
+ dtype=np.intp)
+ assert isinstance(res, Int64Index)
+ tm.assert_index_equal(res, eres)
+ assert lidx is None
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ # non-unique
+ idx = Index([1, 1, 2, 5])
+ idx2 = Index([1, 2, 5, 7, 9])
+ res, lidx, ridx = idx2.join(idx, how='left', return_indexers=True)
+        eres = Index([1, 1, 2, 5, 7, 9])  # 1 is duplicated in idx, so it appears twice
+ eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
+ elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ def test_join_right(self):
+ other = Int64Index([7, 12, 25, 1, 2, 5])
+ other_mono = Int64Index([1, 2, 5, 7, 12, 25])
+
+ # not monotonic
+ res, lidx, ridx = self.index.join(other, how='right',
+ return_indexers=True)
+ eres = other
+ elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.intp)
+
+ assert isinstance(other, Int64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ assert ridx is None
+
+ # monotonic
+ res, lidx, ridx = self.index.join(other_mono, how='right',
+ return_indexers=True)
+ eres = other_mono
+ elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.intp)
+ assert isinstance(other, Int64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ assert ridx is None
+
+ # non-unique
+ idx = Index([1, 1, 2, 5])
+ idx2 = Index([1, 2, 5, 7, 9])
+ res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True)
+        eres = Index([1, 1, 2, 5, 7, 9])  # 1 is duplicated in idx, so it appears twice
+ elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
+ eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ def test_join_non_int_index(self):
+ other = Index([3, 6, 7, 8, 10], dtype=object)
+
+ outer = self.index.join(other, how='outer')
+ outer2 = other.join(self.index, how='outer')
+ expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18])
+ tm.assert_index_equal(outer, outer2)
+ tm.assert_index_equal(outer, expected)
+
+ inner = self.index.join(other, how='inner')
+ inner2 = other.join(self.index, how='inner')
+ expected = Index([6, 8, 10])
+ tm.assert_index_equal(inner, inner2)
+ tm.assert_index_equal(inner, expected)
+
+ left = self.index.join(other, how='left')
+ tm.assert_index_equal(left, self.index.astype(object))
+
+ left2 = other.join(self.index, how='left')
+ tm.assert_index_equal(left2, other)
+
+ right = self.index.join(other, how='right')
+ tm.assert_index_equal(right, other)
+
+ right2 = other.join(self.index, how='right')
+ tm.assert_index_equal(right2, self.index.astype(object))
+
+ def test_join_outer(self):
+ other = Int64Index([7, 12, 25, 1, 2, 5])
+ other_mono = Int64Index([1, 2, 5, 7, 12, 25])
+
+ # not monotonic
+        # outer join result is guaranteed to be sorted
+ res, lidx, ridx = self.index.join(other, how='outer',
+ return_indexers=True)
+ noidx_res = self.index.join(other, how='outer')
+ tm.assert_index_equal(res, noidx_res)
+
+ eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25])
+ elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1],
+ dtype=np.intp)
+ eridx = np.array([-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2],
+ dtype=np.intp)
+
+ assert isinstance(res, Int64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ # monotonic
+ res, lidx, ridx = self.index.join(other_mono, how='outer',
+ return_indexers=True)
+ noidx_res = self.index.join(other_mono, how='outer')
+ tm.assert_index_equal(res, noidx_res)
+
+ elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1],
+ dtype=np.intp)
+ eridx = np.array([-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5],
+ dtype=np.intp)
+ assert isinstance(res, Int64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+
+class TestUInt64Index(NumericInt):
+
+ _dtype = 'uint64'
+ _holder = UInt64Index
+
+ def setup_method(self, method):
+ vals = [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25]
+ self.indices = dict(index=UInt64Index(vals),
+ index_dec=UInt64Index(reversed(vals)))
+ self.setup_indices()
+
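+    # All of these values exceed np.iinfo('int64').max == 2**63 - 1, so
+    # they can only be held losslessly by a uint64-backed index; an
+    # Int64Index would overflow.
+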
+ def create_index(self):
+ return UInt64Index(np.arange(5, dtype='uint64'))
+
+ def test_constructor(self):
+ idx = UInt64Index([1, 2, 3])
+ res = Index([1, 2, 3], dtype=np.uint64)
+ tm.assert_index_equal(res, idx)
+
+ idx = UInt64Index([1, 2**63])
+ res = Index([1, 2**63], dtype=np.uint64)
+ tm.assert_index_equal(res, idx)
+
+ idx = UInt64Index([1, 2**63])
+ res = Index([1, 2**63])
+ tm.assert_index_equal(res, idx)
+
+ idx = Index([-1, 2**63], dtype=object)
+ res = Index(np.array([-1, 2**63], dtype=object))
+ tm.assert_index_equal(res, idx)
+
+ def test_get_indexer(self):
+ target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63)
+ indexer = self.index.get_indexer(target)
+ expected = np.array([0, -1, 1, 2, 3, 4,
+ -1, -1, -1, -1], dtype=np.intp)
+ tm.assert_numpy_array_equal(indexer, expected)
+
+ target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63)
+ indexer = self.index.get_indexer(target, method='pad')
+ expected = np.array([0, 0, 1, 2, 3, 4,
+ 4, 4, 4, 4], dtype=np.intp)
+ tm.assert_numpy_array_equal(indexer, expected)
+
+ target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63)
+ indexer = self.index.get_indexer(target, method='backfill')
+ expected = np.array([0, 1, 1, 2, 3, 4,
+ -1, -1, -1, -1], dtype=np.intp)
+ tm.assert_numpy_array_equal(indexer, expected)
+
+ def test_intersection(self):
+ other = Index([2**63, 2**63 + 5, 2**63 + 10, 2**63 + 15, 2**63 + 20])
+ result = self.index.intersection(other)
+ expected = Index(np.sort(np.intersect1d(self.index.values,
+ other.values)))
+ tm.assert_index_equal(result, expected)
+
+ result = other.intersection(self.index)
+ expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values,
+ other.values))))
+ tm.assert_index_equal(result, expected)
+
+ def test_join_inner(self):
+ other = UInt64Index(2**63 + np.array(
+ [7, 12, 25, 1, 2, 10], dtype='uint64'))
+ other_mono = UInt64Index(2**63 + np.array(
+ [1, 2, 7, 10, 12, 25], dtype='uint64'))
+
+ # not monotonic
+ res, lidx, ridx = self.index.join(other, how='inner',
+ return_indexers=True)
+
+ # no guarantee of sortedness, so sort for comparison purposes
+ ind = res.argsort()
+ res = res.take(ind)
+ lidx = lidx.take(ind)
+ ridx = ridx.take(ind)
+
+ eres = UInt64Index(2**63 + np.array([10, 25], dtype='uint64'))
+ elidx = np.array([1, 4], dtype=np.intp)
+ eridx = np.array([5, 2], dtype=np.intp)
+
+ assert isinstance(res, UInt64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ # monotonic
+ res, lidx, ridx = self.index.join(other_mono, how='inner',
+ return_indexers=True)
+
+ res2 = self.index.intersection(other_mono)
+ tm.assert_index_equal(res, res2)
+
+ elidx = np.array([1, 4], dtype=np.intp)
+ eridx = np.array([3, 5], dtype=np.intp)
+
+ assert isinstance(res, UInt64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ def test_join_left(self):
+ other = UInt64Index(2**63 + np.array(
+ [7, 12, 25, 1, 2, 10], dtype='uint64'))
+ other_mono = UInt64Index(2**63 + np.array(
+ [1, 2, 7, 10, 12, 25], dtype='uint64'))
+
+ # not monotonic
+ res, lidx, ridx = self.index.join(other, how='left',
+ return_indexers=True)
+ eres = self.index
+ eridx = np.array([-1, 5, -1, -1, 2], dtype=np.intp)
+
+ assert isinstance(res, UInt64Index)
+ tm.assert_index_equal(res, eres)
+ assert lidx is None
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ # monotonic
+ res, lidx, ridx = self.index.join(other_mono, how='left',
+ return_indexers=True)
+ eridx = np.array([-1, 3, -1, -1, 5], dtype=np.intp)
+
+ assert isinstance(res, UInt64Index)
+ tm.assert_index_equal(res, eres)
+ assert lidx is None
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ # non-unique
+ idx = UInt64Index(2**63 + np.array([1, 1, 2, 5], dtype='uint64'))
+ idx2 = UInt64Index(2**63 + np.array([1, 2, 5, 7, 9], dtype='uint64'))
+ res, lidx, ridx = idx2.join(idx, how='left', return_indexers=True)
+
+        # 1 is duplicated in idx, so it appears twice in the result
+ eres = UInt64Index(2**63 + np.array(
+ [1, 1, 2, 5, 7, 9], dtype='uint64'))
+ eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
+ elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
+
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ def test_join_right(self):
+ other = UInt64Index(2**63 + np.array(
+ [7, 12, 25, 1, 2, 10], dtype='uint64'))
+ other_mono = UInt64Index(2**63 + np.array(
+ [1, 2, 7, 10, 12, 25], dtype='uint64'))
+
+ # not monotonic
+ res, lidx, ridx = self.index.join(other, how='right',
+ return_indexers=True)
+ eres = other
+ elidx = np.array([-1, -1, 4, -1, -1, 1], dtype=np.intp)
+
+ tm.assert_numpy_array_equal(lidx, elidx)
+ assert isinstance(other, UInt64Index)
+ tm.assert_index_equal(res, eres)
+ assert ridx is None
+
+ # monotonic
+ res, lidx, ridx = self.index.join(other_mono, how='right',
+ return_indexers=True)
+ eres = other_mono
+ elidx = np.array([-1, -1, -1, 1, -1, 4], dtype=np.intp)
+
+ assert isinstance(other, UInt64Index)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_index_equal(res, eres)
+ assert ridx is None
+
+ # non-unique
+ idx = UInt64Index(2**63 + np.array([1, 1, 2, 5], dtype='uint64'))
+ idx2 = UInt64Index(2**63 + np.array([1, 2, 5, 7, 9], dtype='uint64'))
+ res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True)
+
+        # 1 is duplicated in idx, so it appears twice in the result
+ eres = UInt64Index(2**63 + np.array(
+ [1, 1, 2, 5, 7, 9], dtype='uint64'))
+ elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
+ eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
+
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ def test_join_non_int_index(self):
+ other = Index(2**63 + np.array(
+ [1, 5, 7, 10, 20], dtype='uint64'), dtype=object)
+
+ outer = self.index.join(other, how='outer')
+ outer2 = other.join(self.index, how='outer')
+ expected = Index(2**63 + np.array(
+ [0, 1, 5, 7, 10, 15, 20, 25], dtype='uint64'))
+ tm.assert_index_equal(outer, outer2)
+ tm.assert_index_equal(outer, expected)
+
+ inner = self.index.join(other, how='inner')
+ inner2 = other.join(self.index, how='inner')
+ expected = Index(2**63 + np.array([10, 20], dtype='uint64'))
+ tm.assert_index_equal(inner, inner2)
+ tm.assert_index_equal(inner, expected)
+
+ left = self.index.join(other, how='left')
+ tm.assert_index_equal(left, self.index.astype(object))
+
+ left2 = other.join(self.index, how='left')
+ tm.assert_index_equal(left2, other)
+
+ right = self.index.join(other, how='right')
+ tm.assert_index_equal(right, other)
+
+ right2 = other.join(self.index, how='right')
+ tm.assert_index_equal(right2, self.index.astype(object))
+
+ def test_join_outer(self):
+ other = UInt64Index(2**63 + np.array(
+ [7, 12, 25, 1, 2, 10], dtype='uint64'))
+ other_mono = UInt64Index(2**63 + np.array(
+ [1, 2, 7, 10, 12, 25], dtype='uint64'))
+
+ # not monotonic
+        # outer join result is guaranteed to be sorted
+ res, lidx, ridx = self.index.join(other, how='outer',
+ return_indexers=True)
+ noidx_res = self.index.join(other, how='outer')
+ tm.assert_index_equal(res, noidx_res)
+
+ eres = UInt64Index(2**63 + np.array(
+ [0, 1, 2, 7, 10, 12, 15, 20, 25], dtype='uint64'))
+ elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp)
+ eridx = np.array([-1, 3, 4, 0, 5, 1, -1, -1, 2], dtype=np.intp)
+
+ assert isinstance(res, UInt64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ # monotonic
+ res, lidx, ridx = self.index.join(other_mono, how='outer',
+ return_indexers=True)
+ noidx_res = self.index.join(other_mono, how='outer')
+ tm.assert_index_equal(res, noidx_res)
+
+ elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp)
+ eridx = np.array([-1, 0, 1, 2, 3, 4, -1, -1, 5], dtype=np.intp)
+
+ assert isinstance(res, UInt64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/test_range.py b/contrib/python/pandas/py2/pandas/tests/indexes/test_range.py
new file mode 100644
index 00000000000..96cf83d4773
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/test_range.py
@@ -0,0 +1,887 @@
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY3, range, u
+
+import pandas as pd
+from pandas import Float64Index, Index, Int64Index, RangeIndex, Series
+import pandas.util.testing as tm
+
+from .test_numeric import Numeric
+
+
+class TestRangeIndex(Numeric):
+ _holder = RangeIndex
+ _compat_props = ['shape', 'ndim', 'size']
+
+ def setup_method(self, method):
+ self.indices = dict(index=RangeIndex(0, 20, 2, name='foo'),
+ index_dec=RangeIndex(18, -1, -2, name='bar'))
+ self.setup_indices()
+
+ def create_index(self):
+ return RangeIndex(5)
+
+ def test_can_hold_identifiers(self):
+ idx = self.create_index()
+ key = idx[0]
+ assert idx._can_hold_identifiers_and_holds_name(key) is False
+
+ def test_too_many_names(self):
+ with pytest.raises(ValueError, match="^Length"):
+ self.index.names = ["roger", "harold"]
+
+ def test_constructor(self):
+ index = RangeIndex(5)
+ expected = np.arange(5, dtype=np.int64)
+ assert isinstance(index, RangeIndex)
+ assert index._start == 0
+ assert index._stop == 5
+ assert index._step == 1
+ assert index.name is None
+ tm.assert_index_equal(Index(expected), index)
+
+ index = RangeIndex(1, 5)
+ expected = np.arange(1, 5, dtype=np.int64)
+ assert isinstance(index, RangeIndex)
+ assert index._start == 1
+ tm.assert_index_equal(Index(expected), index)
+
+ index = RangeIndex(1, 5, 2)
+ expected = np.arange(1, 5, 2, dtype=np.int64)
+ assert isinstance(index, RangeIndex)
+ assert index._step == 2
+ tm.assert_index_equal(Index(expected), index)
+
+ for index in [RangeIndex(0), RangeIndex(start=0), RangeIndex(stop=0),
+ RangeIndex(0, 0)]:
+ expected = np.empty(0, dtype=np.int64)
+ assert isinstance(index, RangeIndex)
+ assert index._start == 0
+ assert index._stop == 0
+ assert index._step == 1
+ tm.assert_index_equal(Index(expected), index)
+
+ for index in [RangeIndex(0, name='Foo'),
+ RangeIndex(start=0, name='Foo'),
+ RangeIndex(stop=0, name='Foo'),
+ RangeIndex(0, 0, name='Foo')]:
+ assert isinstance(index, RangeIndex)
+ assert index.name == 'Foo'
+
+        # range-style (start, stop) arguments are not allowed on a bare Index
+ with pytest.raises(TypeError):
+ Index(0, 1000)
+
+ def test_constructor_invalid_args(self):
+ msg = "RangeIndex\\(\\.\\.\\.\\) must be called with integers"
+ with pytest.raises(TypeError, match=msg):
+ RangeIndex()
+
+ with pytest.raises(TypeError, match=msg):
+ RangeIndex(name='Foo')
+
+ # invalid args
+ for i in [Index(['a', 'b']), Series(['a', 'b']), np.array(['a', 'b']),
+ [], 'foo', datetime(2000, 1, 1, 0, 0), np.arange(0, 10),
+ np.array([1]), [1]]:
+ with pytest.raises(TypeError):
+ RangeIndex(i)
+
+ def test_constructor_same(self):
+
+        # pass-through, with and without copy
+ index = RangeIndex(1, 5, 2)
+ result = RangeIndex(index, copy=False)
+ assert result.identical(index)
+
+ result = RangeIndex(index, copy=True)
+ tm.assert_index_equal(result, index, exact=True)
+
+ result = RangeIndex(index)
+ tm.assert_index_equal(result, index, exact=True)
+
+ with pytest.raises(TypeError):
+ RangeIndex(index, dtype='float64')
+
+ def test_constructor_range(self):
+
+ with pytest.raises(TypeError):
+ RangeIndex(range(1, 5, 2))
+
+ result = RangeIndex.from_range(range(1, 5, 2))
+ expected = RangeIndex(1, 5, 2)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ result = RangeIndex.from_range(range(5, 6))
+ expected = RangeIndex(5, 6, 1)
+ tm.assert_index_equal(result, expected, exact=True)
+
+        # an empty range
+ result = RangeIndex.from_range(range(5, 1))
+ expected = RangeIndex(0, 0, 1)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ result = RangeIndex.from_range(range(5))
+ expected = RangeIndex(0, 5, 1)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ result = Index(range(1, 5, 2))
+ expected = RangeIndex(1, 5, 2)
+ tm.assert_index_equal(result, expected, exact=True)
+
+ with pytest.raises(TypeError):
+ Index(range(1, 5, 2), dtype='float64')
+
+ def test_constructor_name(self):
+ # GH12288
+ orig = RangeIndex(10)
+ orig.name = 'original'
+
+ copy = RangeIndex(orig)
+ copy.name = 'copy'
+
+ assert orig.name == 'original'
+ assert copy.name == 'copy'
+
+ new = Index(copy)
+ assert new.name == 'copy'
+
+ new.name = 'new'
+ assert orig.name == 'original'
+ assert copy.name == 'copy'
+ assert new.name == 'new'
+
+ def test_constructor_corner(self):
+ arr = np.array([1, 2, 3, 4], dtype=object)
+ index = RangeIndex(1, 5)
+ assert index.values.dtype == np.int64
+ tm.assert_index_equal(index, Index(arr))
+
+        # non-integer args raise TypeError
+ with pytest.raises(TypeError):
+ RangeIndex('1', '10', '1')
+ with pytest.raises(TypeError):
+ RangeIndex(1.1, 10.2, 1.3)
+
+ # invalid passed type
+ with pytest.raises(TypeError):
+ RangeIndex(1, 5, dtype='float64')
+
+ def test_copy(self):
+ i = RangeIndex(5, name='Foo')
+ i_copy = i.copy()
+ assert i_copy is not i
+ assert i_copy.identical(i)
+ assert i_copy._start == 0
+ assert i_copy._stop == 5
+ assert i_copy._step == 1
+ assert i_copy.name == 'Foo'
+
+ def test_repr(self):
+ i = RangeIndex(5, name='Foo')
+ result = repr(i)
+ if PY3:
+ expected = "RangeIndex(start=0, stop=5, step=1, name='Foo')"
+ else:
+ expected = "RangeIndex(start=0, stop=5, step=1, name=u'Foo')"
+ assert result == expected
+
+ result = eval(result)
+ tm.assert_index_equal(result, i, exact=True)
+
+ i = RangeIndex(5, 0, -1)
+ result = repr(i)
+ expected = "RangeIndex(start=5, stop=0, step=-1)"
+ assert result == expected
+
+ result = eval(result)
+ tm.assert_index_equal(result, i, exact=True)
+
+ def test_insert(self):
+
+ idx = RangeIndex(5, name='Foo')
+ result = idx[1:4]
+
+ # test 0th element
+ tm.assert_index_equal(idx[0:4], result.insert(0, idx[0]))
+
+        # GH 18295 (test missing values)
+ expected = Float64Index([0, np.nan, 1, 2, 3, 4])
+ for na in (np.nan, pd.NaT, None):
+ result = RangeIndex(5).insert(1, na)
+ tm.assert_index_equal(result, expected)
+
+ def test_delete(self):
+
+ idx = RangeIndex(5, name='Foo')
+ expected = idx[1:].astype(int)
+ result = idx.delete(0)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+
+ expected = idx[:-1].astype(int)
+ result = idx.delete(-1)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+
+ with pytest.raises((IndexError, ValueError)):
+ # either depending on numpy version
+ result = idx.delete(len(idx))
+
+ def test_view(self):
+ i = RangeIndex(0, name='Foo')
+ i_view = i.view()
+ assert i_view.name == 'Foo'
+
+ i_view = i.view('i8')
+ tm.assert_numpy_array_equal(i.values, i_view)
+
+ i_view = i.view(RangeIndex)
+ tm.assert_index_equal(i, i_view)
+
+ def test_dtype(self):
+ assert self.index.dtype == np.int64
+
+ def test_is_monotonic(self):
+ assert self.index.is_monotonic is True
+ assert self.index.is_monotonic_increasing is True
+ assert self.index.is_monotonic_decreasing is False
+ assert self.index._is_strictly_monotonic_increasing is True
+ assert self.index._is_strictly_monotonic_decreasing is False
+
+ index = RangeIndex(4, 0, -1)
+ assert index.is_monotonic is False
+ assert index._is_strictly_monotonic_increasing is False
+ assert index.is_monotonic_decreasing is True
+ assert index._is_strictly_monotonic_decreasing is True
+
+ index = RangeIndex(1, 2)
+ assert index.is_monotonic is True
+ assert index.is_monotonic_increasing is True
+ assert index.is_monotonic_decreasing is True
+ assert index._is_strictly_monotonic_increasing is True
+ assert index._is_strictly_monotonic_decreasing is True
+
+ index = RangeIndex(2, 1)
+ assert index.is_monotonic is True
+ assert index.is_monotonic_increasing is True
+ assert index.is_monotonic_decreasing is True
+ assert index._is_strictly_monotonic_increasing is True
+ assert index._is_strictly_monotonic_decreasing is True
+
+ index = RangeIndex(1, 1)
+ assert index.is_monotonic is True
+ assert index.is_monotonic_increasing is True
+ assert index.is_monotonic_decreasing is True
+ assert index._is_strictly_monotonic_increasing is True
+ assert index._is_strictly_monotonic_decreasing is True
+
+ def test_equals_range(self):
+ equiv_pairs = [(RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)),
+ (RangeIndex(0), RangeIndex(1, -1, 3)),
+ (RangeIndex(1, 2, 3), RangeIndex(1, 3, 4)),
+ (RangeIndex(0, -9, -2), RangeIndex(0, -10, -2))]
+ for left, right in equiv_pairs:
+ assert left.equals(right)
+ assert right.equals(left)
+
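+    # equals() compares the materialized values, not the range
+    # parameters: RangeIndex(0, 9, 2) and RangeIndex(0, 10, 2) both
+    # yield [0, 2, 4, 6, 8], so they compare equal despite having
+    # different stops.
+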
+ def test_logical_compat(self):
+ idx = self.create_index()
+ assert idx.all() == idx.values.all()
+ assert idx.any() == idx.values.any()
+
+ def test_identical(self):
+ i = Index(self.index.copy())
+ assert i.identical(self.index)
+
+ # we don't allow object dtype for RangeIndex
+ if isinstance(self.index, RangeIndex):
+ return
+
+ same_values_different_type = Index(i, dtype=object)
+ assert not i.identical(same_values_different_type)
+
+ i = self.index.copy(dtype=object)
+ i = i.rename('foo')
+ same_values = Index(i, dtype=object)
+ assert same_values.identical(self.index.copy(dtype=object))
+
+ assert not i.identical(self.index)
+ assert Index(same_values, name='foo', dtype=object).identical(i)
+
+ assert not self.index.copy(dtype=object).identical(
+ self.index.copy(dtype='int64'))
+
+ def test_get_indexer(self):
+ target = RangeIndex(10)
+ indexer = self.index.get_indexer(target)
+ expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp)
+ tm.assert_numpy_array_equal(indexer, expected)
+
+ def test_get_indexer_pad(self):
+ target = RangeIndex(10)
+ indexer = self.index.get_indexer(target, method='pad')
+ expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp)
+ tm.assert_numpy_array_equal(indexer, expected)
+
+ def test_get_indexer_backfill(self):
+ target = RangeIndex(10)
+ indexer = self.index.get_indexer(target, method='backfill')
+ expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp)
+ tm.assert_numpy_array_equal(indexer, expected)
+
+ def test_join_outer(self):
+ # join with Int64Index
+ other = Int64Index(np.arange(25, 14, -1))
+
+ res, lidx, ridx = self.index.join(other, how='outer',
+ return_indexers=True)
+ noidx_res = self.index.join(other, how='outer')
+ tm.assert_index_equal(res, noidx_res)
+
+ eres = Int64Index([0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25])
+ elidx = np.array([0, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, 9,
+ -1, -1, -1, -1, -1, -1, -1], dtype=np.intp)
+ eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 10, 9, 8, 7, 6,
+ 5, 4, 3, 2, 1, 0], dtype=np.intp)
+
+ assert isinstance(res, Int64Index)
+ assert not isinstance(res, RangeIndex)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ # join with RangeIndex
+ other = RangeIndex(25, 14, -1)
+
+ res, lidx, ridx = self.index.join(other, how='outer',
+ return_indexers=True)
+ noidx_res = self.index.join(other, how='outer')
+ tm.assert_index_equal(res, noidx_res)
+
+ assert isinstance(res, Int64Index)
+ assert not isinstance(res, RangeIndex)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ def test_join_inner(self):
+ # Join with non-RangeIndex
+ other = Int64Index(np.arange(25, 14, -1))
+
+ res, lidx, ridx = self.index.join(other, how='inner',
+ return_indexers=True)
+
+ # no guarantee of sortedness, so sort for comparison purposes
+ ind = res.argsort()
+ res = res.take(ind)
+ lidx = lidx.take(ind)
+ ridx = ridx.take(ind)
+
+ eres = Int64Index([16, 18])
+ elidx = np.array([8, 9], dtype=np.intp)
+ eridx = np.array([9, 7], dtype=np.intp)
+
+ assert isinstance(res, Int64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ # Join two RangeIndex
+ other = RangeIndex(25, 14, -1)
+
+ res, lidx, ridx = self.index.join(other, how='inner',
+ return_indexers=True)
+
+ assert isinstance(res, RangeIndex)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ def test_join_left(self):
+ # Join with Int64Index
+ other = Int64Index(np.arange(25, 14, -1))
+
+ res, lidx, ridx = self.index.join(other, how='left',
+ return_indexers=True)
+ eres = self.index
+ eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], dtype=np.intp)
+
+ assert isinstance(res, RangeIndex)
+ tm.assert_index_equal(res, eres)
+ assert lidx is None
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+        # Join with RangeIndex
+        other = RangeIndex(25, 14, -1)
+
+ res, lidx, ridx = self.index.join(other, how='left',
+ return_indexers=True)
+
+ assert isinstance(res, RangeIndex)
+ tm.assert_index_equal(res, eres)
+ assert lidx is None
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ def test_join_right(self):
+ # Join with Int64Index
+ other = Int64Index(np.arange(25, 14, -1))
+
+ res, lidx, ridx = self.index.join(other, how='right',
+ return_indexers=True)
+ eres = other
+ elidx = np.array([-1, -1, -1, -1, -1, -1, -1, 9, -1, 8, -1],
+ dtype=np.intp)
+
+ assert isinstance(other, Int64Index)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ assert ridx is None
+
+        # Join with RangeIndex
+ other = RangeIndex(25, 14, -1)
+
+ res, lidx, ridx = self.index.join(other, how='right',
+ return_indexers=True)
+ eres = other
+
+ assert isinstance(other, RangeIndex)
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ assert ridx is None
+
+ def test_join_non_int_index(self):
+ other = Index([3, 6, 7, 8, 10], dtype=object)
+
+ outer = self.index.join(other, how='outer')
+ outer2 = other.join(self.index, how='outer')
+ expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18])
+ tm.assert_index_equal(outer, outer2)
+ tm.assert_index_equal(outer, expected)
+
+ inner = self.index.join(other, how='inner')
+ inner2 = other.join(self.index, how='inner')
+ expected = Index([6, 8, 10])
+ tm.assert_index_equal(inner, inner2)
+ tm.assert_index_equal(inner, expected)
+
+ left = self.index.join(other, how='left')
+ tm.assert_index_equal(left, self.index.astype(object))
+
+ left2 = other.join(self.index, how='left')
+ tm.assert_index_equal(left2, other)
+
+ right = self.index.join(other, how='right')
+ tm.assert_index_equal(right, other)
+
+ right2 = other.join(self.index, how='right')
+ tm.assert_index_equal(right2, self.index.astype(object))
+
+ def test_join_non_unique(self):
+ other = Index([4, 4, 3, 3])
+
+ res, lidx, ridx = self.index.join(other, return_indexers=True)
+
+ eres = Int64Index([0, 2, 4, 4, 6, 8, 10, 12, 14, 16, 18])
+ elidx = np.array([0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp)
+ eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1],
+ dtype=np.intp)
+
+ tm.assert_index_equal(res, eres)
+ tm.assert_numpy_array_equal(lidx, elidx)
+ tm.assert_numpy_array_equal(ridx, eridx)
+
+ def test_join_self(self):
+ kinds = 'outer', 'inner', 'left', 'right'
+ for kind in kinds:
+ joined = self.index.join(self.index, how=kind)
+ assert self.index is joined
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_intersection(self, sort):
+ # intersect with Int64Index
+ other = Index(np.arange(1, 6))
+ result = self.index.intersection(other, sort=sort)
+ expected = Index(np.sort(np.intersect1d(self.index.values,
+ other.values)))
+ tm.assert_index_equal(result, expected)
+
+ result = other.intersection(self.index, sort=sort)
+ expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values,
+ other.values))))
+ tm.assert_index_equal(result, expected)
+
+ # intersect with increasing RangeIndex
+ other = RangeIndex(1, 6)
+ result = self.index.intersection(other, sort=sort)
+ expected = Index(np.sort(np.intersect1d(self.index.values,
+ other.values)))
+ tm.assert_index_equal(result, expected)
+
+ # intersect with decreasing RangeIndex
+ other = RangeIndex(5, 0, -1)
+ result = self.index.intersection(other, sort=sort)
+ expected = Index(np.sort(np.intersect1d(self.index.values,
+ other.values)))
+ tm.assert_index_equal(result, expected)
+
+ # reversed (GH 17296)
+ result = other.intersection(self.index, sort=sort)
+ tm.assert_index_equal(result, expected)
+
+ # GH 17296: intersect two decreasing RangeIndexes
+ first = RangeIndex(10, -2, -2)
+ other = RangeIndex(5, -4, -1)
+ expected = first.astype(int).intersection(other.astype(int), sort=sort)
+ result = first.intersection(other, sort=sort).astype(int)
+ tm.assert_index_equal(result, expected)
+
+ # reversed
+ result = other.intersection(first, sort=sort).astype(int)
+ tm.assert_index_equal(result, expected)
+
+ index = RangeIndex(5)
+
+ # intersect of non-overlapping indices
+ other = RangeIndex(5, 10, 1)
+ result = index.intersection(other, sort=sort)
+ expected = RangeIndex(0, 0, 1)
+ tm.assert_index_equal(result, expected)
+
+ other = RangeIndex(-1, -5, -1)
+ result = index.intersection(other, sort=sort)
+ expected = RangeIndex(0, 0, 1)
+ tm.assert_index_equal(result, expected)
+
+ # intersection of empty indices
+ other = RangeIndex(0, 0, 1)
+ result = index.intersection(other, sort=sort)
+ expected = RangeIndex(0, 0, 1)
+ tm.assert_index_equal(result, expected)
+
+ result = other.intersection(index, sort=sort)
+ tm.assert_index_equal(result, expected)
+
+ # intersection of non-overlapping values based on start value and gcd
+ index = RangeIndex(1, 10, 2)
+ other = RangeIndex(0, 10, 4)
+ result = index.intersection(other, sort=sort)
+ expected = RangeIndex(0, 0, 1)
+ tm.assert_index_equal(result, expected)
+
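+    # Roughly: two arithmetic progressions with steps s and t can only
+    # share values if the difference of their starts is divisible by
+    # gcd(s, t).  Above, gcd(2, 4) == 2 while the starts differ by 1, so
+    # the intersection is provably empty without materializing values.
+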
+ def test_union_noncomparable(self):
+ from datetime import datetime, timedelta
+ # corner case, non-Int64Index
+ now = datetime.now()
+ other = Index([now + timedelta(i) for i in range(4)], dtype=object)
+ result = self.index.union(other)
+ expected = Index(np.concatenate((self.index, other)))
+ tm.assert_index_equal(result, expected)
+
+ result = other.union(self.index)
+ expected = Index(np.concatenate((other, self.index)))
+ tm.assert_index_equal(result, expected)
+
+ def test_union(self):
+ RI = RangeIndex
+ I64 = Int64Index
+ cases = [(RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1)),
+ (RI(0, 10, 1), RI(5, 20, 1), RI(0, 20, 1)),
+ (RI(0, 10, 1), RI(10, 20, 1), RI(0, 20, 1)),
+ (RI(0, -10, -1), RI(0, -10, -1), RI(0, -10, -1)),
+ (RI(0, -10, -1), RI(-10, -20, -1), RI(-19, 1, 1)),
+ (RI(0, 10, 2), RI(1, 10, 2), RI(0, 10, 1)),
+ (RI(0, 11, 2), RI(1, 12, 2), RI(0, 12, 1)),
+ (RI(0, 21, 4), RI(-2, 24, 4), RI(-2, 24, 2)),
+ (RI(0, -20, -2), RI(-1, -21, -2), RI(-19, 1, 1)),
+ (RI(0, 100, 5), RI(0, 100, 20), RI(0, 100, 5)),
+ (RI(0, -100, -5), RI(5, -100, -20), RI(-95, 10, 5)),
+ (RI(0, -11, -1), RI(1, -12, -4), RI(-11, 2, 1)),
+ (RI(0), RI(0), RI(0)),
+ (RI(0, -10, -2), RI(0), RI(0, -10, -2)),
+ (RI(0, 100, 2), RI(100, 150, 200), RI(0, 102, 2)),
+ (RI(0, -100, -2), RI(-100, 50, 102), RI(-100, 4, 2)),
+ (RI(0, -100, -1), RI(0, -50, -3), RI(-99, 1, 1)),
+ (RI(0, 1, 1), RI(5, 6, 10), RI(0, 6, 5)),
+ (RI(0, 10, 5), RI(-5, -6, -20), RI(-5, 10, 5)),
+ (RI(0, 3, 1), RI(4, 5, 1), I64([0, 1, 2, 4])),
+ (RI(0, 10, 1), I64([]), RI(0, 10, 1)),
+ (RI(0), I64([1, 5, 6]), I64([1, 5, 6]))]
+ for idx1, idx2, expected in cases:
+ res1 = idx1.union(idx2)
+ res2 = idx2.union(idx1)
+ res3 = idx1._int64index.union(idx2)
+ tm.assert_index_equal(res1, expected, exact=True)
+ tm.assert_index_equal(res2, expected, exact=True)
+ tm.assert_index_equal(res3, expected)
+
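+    # The cases above encode when the union of two RangeIndexes can stay
+    # a RangeIndex: the combined values must again form one arithmetic
+    # progression (e.g. RI(0, 10, 2) and RI(1, 10, 2) interleave into
+    # step 1).  Otherwise the result falls back to Int64Index, as in the
+    # (RI(0, 3, 1), RI(4, 5, 1)) case, where 3 is absent.
+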
+ def test_nbytes(self):
+
+ # memory savings vs int index
+ i = RangeIndex(0, 1000)
+ assert i.nbytes < i._int64index.nbytes / 10
+
+ # constant memory usage
+ i2 = RangeIndex(0, 10)
+ assert i.nbytes == i2.nbytes
+
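+    # RangeIndex stores only start/stop/step, so its footprint is O(1)
+    # and independent of length, whereas Int64Index materializes every
+    # value at 8 bytes apiece; hence both assertions above.
+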
+ def test_cant_or_shouldnt_cast(self):
+ # can't
+ with pytest.raises(TypeError):
+ RangeIndex('foo', 'bar', 'baz')
+
+ # shouldn't
+ with pytest.raises(TypeError):
+ RangeIndex('0', '1', '2')
+
+ def test_view_Index(self):
+ self.index.view(Index)
+
+ def test_prevent_casting(self):
+ result = self.index.astype('O')
+ assert result.dtype == np.object_
+
+ def test_take_preserve_name(self):
+ index = RangeIndex(1, 5, name='foo')
+ taken = index.take([3, 0, 1])
+ assert index.name == taken.name
+
+ def test_take_fill_value(self):
+ # GH 12631
+ idx = pd.RangeIndex(1, 4, name='xxx')
+ result = idx.take(np.array([1, 0, -1]))
+ expected = pd.Int64Index([2, 1, 3], name='xxx')
+ tm.assert_index_equal(result, expected)
+
+        # fill_value=True
+ msg = "Unable to fill values because RangeIndex cannot contain NA"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -1]), fill_value=True)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ expected = pd.Int64Index([2, 1, 3], name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ msg = "Unable to fill values because RangeIndex cannot contain NA"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+ def test_print_unicode_columns(self):
+ df = pd.DataFrame({u("\u05d0"): [1, 2, 3],
+ "\u05d1": [4, 5, 6],
+ "c": [7, 8, 9]})
+ repr(df.columns) # should not raise UnicodeDecodeError
+
+ def test_repr_roundtrip(self):
+ tm.assert_index_equal(eval(repr(self.index)), self.index)
+
+ def test_slice_keep_name(self):
+ idx = RangeIndex(1, 2, name='asdf')
+ assert idx.name == idx[1:].name
+
+ def test_explicit_conversions(self):
+
+ # GH 8608
+ # add/sub are overridden explicitly for Float/Int Index
+ idx = RangeIndex(5)
+
+ # float conversions
+ arr = np.arange(5, dtype='int64') * 3.2
+ expected = Float64Index(arr)
+ fidx = idx * 3.2
+ tm.assert_index_equal(fidx, expected)
+ fidx = 3.2 * idx
+ tm.assert_index_equal(fidx, expected)
+
+ # interops with numpy arrays
+ expected = Float64Index(arr)
+ a = np.zeros(5, dtype='float64')
+ result = fidx - a
+ tm.assert_index_equal(result, expected)
+
+ expected = Float64Index(-arr)
+ a = np.zeros(5, dtype='float64')
+ result = a - fidx
+ tm.assert_index_equal(result, expected)
+
+ def test_has_duplicates(self):
+ for ind in self.indices:
+ if not len(ind):
+ continue
+ idx = self.indices[ind]
+ assert idx.is_unique
+ assert not idx.has_duplicates
+
+ def test_extended_gcd(self):
+ result = self.index._extended_gcd(6, 10)
+ assert result[0] == result[1] * 6 + result[2] * 10
+ assert 2 == result[0]
+
+ result = self.index._extended_gcd(10, 6)
+ assert 2 == result[1] * 10 + result[2] * 6
+ assert 2 == result[0]
+
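+    # _extended_gcd implements the extended Euclidean algorithm: it
+    # returns (g, s, t) such that g == gcd(a, b) == s*a + t*b.  For
+    # (6, 10) one valid answer is (2, 2, -1), since 2*6 + (-1)*10 == 2;
+    # the coefficients are not unique, which is why the test checks the
+    # identity rather than fixed values.
+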
+ def test_min_fitting_element(self):
+ result = RangeIndex(0, 20, 2)._min_fitting_element(1)
+ assert 2 == result
+
+ result = RangeIndex(1, 6)._min_fitting_element(1)
+ assert 1 == result
+
+ result = RangeIndex(18, -2, -2)._min_fitting_element(1)
+ assert 2 == result
+
+ result = RangeIndex(5, 0, -1)._min_fitting_element(1)
+ assert 1 == result
+
+ big_num = 500000000000000000000000
+
+ result = RangeIndex(5, big_num * 2, 1)._min_fitting_element(big_num)
+ assert big_num == result
+
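+    # _min_fitting_element(lower) returns the smallest range element that
+    # is >= lower (and _max_fitting_element the largest <= its bound);
+    # RangeIndex.intersection appears to use these to clip one range
+    # against the other without materializing any values.
+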
+ def test_max_fitting_element(self):
+ result = RangeIndex(0, 20, 2)._max_fitting_element(17)
+ assert 16 == result
+
+ result = RangeIndex(1, 6)._max_fitting_element(4)
+ assert 4 == result
+
+ result = RangeIndex(18, -2, -2)._max_fitting_element(17)
+ assert 16 == result
+
+ result = RangeIndex(5, 0, -1)._max_fitting_element(4)
+ assert 4 == result
+
+ big_num = 500000000000000000000000
+
+ result = RangeIndex(5, big_num * 2, 1)._max_fitting_element(big_num)
+ assert big_num == result
+
+ def test_pickle_compat_construction(self):
+ # RangeIndex() is a valid constructor
+ pass
+
+ def test_slice_specialised(self):
+
+ # scalar indexing
+ res = self.index[1]
+ expected = 2
+ assert res == expected
+
+ res = self.index[-1]
+ expected = 18
+ assert res == expected
+
+ # slicing
+ # slice value completion
+ index = self.index[:]
+ expected = self.index
+ tm.assert_index_equal(index, expected)
+
+ # positive slice values
+ index = self.index[7:10:2]
+ expected = Index(np.array([14, 18]), name='foo')
+ tm.assert_index_equal(index, expected)
+
+ # negative slice values
+ index = self.index[-1:-5:-2]
+ expected = Index(np.array([18, 14]), name='foo')
+ tm.assert_index_equal(index, expected)
+
+ # stop overshoot
+ index = self.index[2:100:4]
+ expected = Index(np.array([4, 12]), name='foo')
+ tm.assert_index_equal(index, expected)
+
+ # reverse
+ index = self.index[::-1]
+ expected = Index(self.index.values[::-1], name='foo')
+ tm.assert_index_equal(index, expected)
+
+ index = self.index[-8::-1]
+ expected = Index(np.array([4, 2, 0]), name='foo')
+ tm.assert_index_equal(index, expected)
+
+ index = self.index[-40::-1]
+ expected = Index(np.array([], dtype=np.int64), name='foo')
+ tm.assert_index_equal(index, expected)
+
+ index = self.index[40::-1]
+ expected = Index(self.index.values[40::-1], name='foo')
+ tm.assert_index_equal(index, expected)
+
+ index = self.index[10::-1]
+ expected = Index(self.index.values[::-1], name='foo')
+ tm.assert_index_equal(index, expected)
+
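+    # Slicing a RangeIndex composes the slice arithmetically with the
+    # existing start/stop/step, so e.g. self.index[7:10:2] is equivalent
+    # to range(14, 20, 4) and no ndarray ever needs to be allocated.
+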
+ def test_len_specialised(self):
+
+        # make sure our len matches the equivalent np.arange calculation
+
+ for step in np.arange(1, 6, 1):
+
+ arr = np.arange(0, 5, step)
+ i = RangeIndex(0, 5, step)
+ assert len(i) == len(arr)
+
+ i = RangeIndex(5, 0, step)
+ assert len(i) == 0
+
+ for step in np.arange(-6, -1, 1):
+
+ arr = np.arange(5, 0, step)
+ i = RangeIndex(5, 0, step)
+ assert len(i) == len(arr)
+
+ i = RangeIndex(0, 5, step)
+ assert len(i) == 0
+
+ def test_append(self):
+ # GH16212
+ RI = RangeIndex
+ I64 = Int64Index
+ F64 = Float64Index
+ OI = Index
+ cases = [([RI(1, 12, 5)], RI(1, 12, 5)),
+ ([RI(0, 6, 4)], RI(0, 6, 4)),
+ ([RI(1, 3), RI(3, 7)], RI(1, 7)),
+ ([RI(1, 5, 2), RI(5, 6)], RI(1, 6, 2)),
+ ([RI(1, 3, 2), RI(4, 7, 3)], RI(1, 7, 3)),
+ ([RI(-4, 3, 2), RI(4, 7, 2)], RI(-4, 7, 2)),
+ ([RI(-4, -8), RI(-8, -12)], RI(0, 0)),
+ ([RI(-4, -8), RI(3, -4)], RI(0, 0)),
+ ([RI(-4, -8), RI(3, 5)], RI(3, 5)),
+ ([RI(-4, -2), RI(3, 5)], I64([-4, -3, 3, 4])),
+ ([RI(-2,), RI(3, 5)], RI(3, 5)),
+ ([RI(2,), RI(2)], I64([0, 1, 0, 1])),
+ ([RI(2,), RI(2, 5), RI(5, 8, 4)], RI(0, 6)),
+ ([RI(2,), RI(3, 5), RI(5, 8, 4)], I64([0, 1, 3, 4, 5])),
+ ([RI(-2, 2), RI(2, 5), RI(5, 8, 4)], RI(-2, 6)),
+ ([RI(3,), I64([-1, 3, 15])], I64([0, 1, 2, -1, 3, 15])),
+ ([RI(3,), F64([-1, 3.1, 15.])], F64([0, 1, 2, -1, 3.1, 15.])),
+ ([RI(3,), OI(['a', None, 14])], OI([0, 1, 2, 'a', None, 14])),
+ ([RI(3, 1), OI(['a', None, 14])], OI(['a', None, 14]))
+ ]
+
+ for indices, expected in cases:
+ result = indices[0].append(indices[1:])
+ tm.assert_index_equal(result, expected, exact=True)
+
+ if len(indices) == 2:
+ # Append single item rather than list
+ result2 = indices[0].append(indices[1])
+ tm.assert_index_equal(result2, expected, exact=True)
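+
+    # An append stays a RangeIndex only when the pieces chain into a
+    # single arithmetic progression (consistent step, no gap between
+    # them); anything else falls back to Int64Index, Float64Index or an
+    # object Index, depending on the concatenated values.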
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/__init__.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_arithmetic.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_arithmetic.py
new file mode 100644
index 00000000000..04977023d7c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_arithmetic.py
@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
+
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+from pandas.errors import NullFrequencyError
+
+import pandas as pd
+from pandas import Timedelta, TimedeltaIndex, timedelta_range
+import pandas.util.testing as tm
+
+
[email protected](params=[pd.offsets.Hour(2), timedelta(hours=2),
+ np.timedelta64(2, 'h'), Timedelta(hours=2)],
+ ids=str)
+def delta(request):
+ # Several ways of representing two hours
+ return request.param
+
+
[email protected](params=['B', 'D'])
+def freq(request):
+ return request.param
+
+
+class TestTimedeltaIndexArithmetic(object):
+ # Addition and Subtraction Operations
+
+ # -------------------------------------------------------------
+ # TimedeltaIndex.shift is used by __add__/__sub__
+
+ def test_tdi_shift_empty(self):
+ # GH#9903
+ idx = pd.TimedeltaIndex([], name='xxx')
+ tm.assert_index_equal(idx.shift(0, freq='H'), idx)
+ tm.assert_index_equal(idx.shift(3, freq='H'), idx)
+
+ def test_tdi_shift_hours(self):
+ # GH#9903
+ idx = pd.TimedeltaIndex(['5 hours', '6 hours', '9 hours'], name='xxx')
+ tm.assert_index_equal(idx.shift(0, freq='H'), idx)
+ exp = pd.TimedeltaIndex(['8 hours', '9 hours', '12 hours'], name='xxx')
+ tm.assert_index_equal(idx.shift(3, freq='H'), exp)
+ exp = pd.TimedeltaIndex(['2 hours', '3 hours', '6 hours'], name='xxx')
+ tm.assert_index_equal(idx.shift(-3, freq='H'), exp)
+
+ def test_tdi_shift_minutes(self):
+ # GH#9903
+ idx = pd.TimedeltaIndex(['5 hours', '6 hours', '9 hours'], name='xxx')
+ tm.assert_index_equal(idx.shift(0, freq='T'), idx)
+        exp = pd.TimedeltaIndex(['05:03:00', '06:03:00', '09:03:00'],
+ name='xxx')
+ tm.assert_index_equal(idx.shift(3, freq='T'), exp)
+        exp = pd.TimedeltaIndex(['04:57:00', '05:57:00', '08:57:00'],
+ name='xxx')
+ tm.assert_index_equal(idx.shift(-3, freq='T'), exp)
+
+ def test_tdi_shift_int(self):
+ # GH#8083
+ trange = pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1)
+ result = trange.shift(1)
+ expected = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00',
+ '3 days 01:00:00',
+ '4 days 01:00:00', '5 days 01:00:00'],
+ freq='D')
+ tm.assert_index_equal(result, expected)
+
+ def test_tdi_shift_nonstandard_freq(self):
+ # GH#8083
+ trange = pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1)
+ result = trange.shift(3, freq='2D 1s')
+ expected = TimedeltaIndex(['6 days 01:00:03', '7 days 01:00:03',
+ '8 days 01:00:03', '9 days 01:00:03',
+ '10 days 01:00:03'], freq='D')
+ tm.assert_index_equal(result, expected)
+
+ def test_shift_no_freq(self):
+ # GH#19147
+ tdi = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00'], freq=None)
+ with pytest.raises(NullFrequencyError):
+ tdi.shift(2)
+
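+    # shift(n) without an explicit freq means "move by n steps of the
+    # index's own freq"; a freq-less TimedeltaIndex has no well-defined
+    # step to move by, hence NullFrequencyError.
+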
+ # -------------------------------------------------------------
+ # Binary operations TimedeltaIndex and integer
+
+ def test_tdi_add_int(self, one):
+ # Variants of `one` for #19012
+ rng = timedelta_range('1 days 09:00:00', freq='H', periods=10)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # GH#22535
+ result = rng + one
+ expected = timedelta_range('1 days 10:00:00', freq='H', periods=10)
+ tm.assert_index_equal(result, expected)
+
+ def test_tdi_iadd_int(self, one):
+ rng = timedelta_range('1 days 09:00:00', freq='H', periods=10)
+ expected = timedelta_range('1 days 10:00:00', freq='H', periods=10)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # GH#22535
+ rng += one
+ tm.assert_index_equal(rng, expected)
+
+ def test_tdi_sub_int(self, one):
+ rng = timedelta_range('1 days 09:00:00', freq='H', periods=10)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # GH#22535
+ result = rng - one
+ expected = timedelta_range('1 days 08:00:00', freq='H', periods=10)
+ tm.assert_index_equal(result, expected)
+
+ def test_tdi_isub_int(self, one):
+ rng = timedelta_range('1 days 09:00:00', freq='H', periods=10)
+ expected = timedelta_range('1 days 08:00:00', freq='H', periods=10)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # GH#22535
+ rng -= one
+ tm.assert_index_equal(rng, expected)
+
+ # -------------------------------------------------------------
+ # __add__/__sub__ with integer arrays
+
+ @pytest.mark.parametrize('box', [np.array, pd.Index])
+ def test_tdi_add_integer_array(self, box):
+ # GH#19959
+ rng = timedelta_range('1 days 09:00:00', freq='H', periods=3)
+ other = box([4, 3, 2])
+ expected = TimedeltaIndex(['1 day 13:00:00'] * 3)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # GH#22535
+ result = rng + other
+ tm.assert_index_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # GH#22535
+ result = other + rng
+ tm.assert_index_equal(result, expected)
+
+ @pytest.mark.parametrize('box', [np.array, pd.Index])
+ def test_tdi_sub_integer_array(self, box):
+ # GH#19959
+ rng = timedelta_range('9H', freq='H', periods=3)
+ other = box([4, 3, 2])
+ expected = TimedeltaIndex(['5H', '7H', '9H'])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # GH#22535
+ result = rng - other
+ tm.assert_index_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # GH#22535
+ result = other - rng
+ tm.assert_index_equal(result, -expected)
+
+ @pytest.mark.parametrize('box', [np.array, pd.Index])
+ def test_tdi_addsub_integer_array_no_freq(self, box):
+ # GH#19959
+ tdi = TimedeltaIndex(['1 Day', 'NaT', '3 Hours'])
+ other = box([14, -1, 16])
+ with pytest.raises(NullFrequencyError):
+ tdi + other
+ with pytest.raises(NullFrequencyError):
+ other + tdi
+ with pytest.raises(NullFrequencyError):
+ tdi - other
+ with pytest.raises(NullFrequencyError):
+ other - tdi
+
+ # -------------------------------------------------------------
+ # Binary operations TimedeltaIndex and timedelta-like
+ # Note: add and sub are tested in tests.test_arithmetic, in-place
+ # tests are kept here because their behavior is Index-specific
+
+ def test_tdi_iadd_timedeltalike(self, delta):
+        # only test in-place offset addition; integer + is now numeric (tested above)
+ rng = timedelta_range('1 days', '10 days')
+ expected = timedelta_range('1 days 02:00:00', '10 days 02:00:00',
+ freq='D')
+ rng += delta
+ tm.assert_index_equal(rng, expected)
+
+ def test_tdi_isub_timedeltalike(self, delta):
+        # only test in-place offset subtraction; integer - is now numeric (tested above)
+ rng = timedelta_range('1 days', '10 days')
+ expected = timedelta_range('0 days 22:00:00', '9 days 22:00:00')
+ rng -= delta
+ tm.assert_index_equal(rng, expected)
+
+ # -------------------------------------------------------------
+
+ # TODO: after #24365 this probably belongs in scalar tests
+ def test_ops_ndarray(self):
+ td = Timedelta('1 day')
+
+ # timedelta, timedelta
+ other = pd.to_timedelta(['1 day']).values
+ expected = pd.to_timedelta(['2 days']).values
+ tm.assert_numpy_array_equal(td + other, expected)
+ tm.assert_numpy_array_equal(other + td, expected)
+ pytest.raises(TypeError, lambda: td + np.array([1]))
+ pytest.raises(TypeError, lambda: np.array([1]) + td)
+
+ expected = pd.to_timedelta(['0 days']).values
+ tm.assert_numpy_array_equal(td - other, expected)
+ tm.assert_numpy_array_equal(-other + td, expected)
+ pytest.raises(TypeError, lambda: td - np.array([1]))
+ pytest.raises(TypeError, lambda: np.array([1]) - td)
+
+ expected = pd.to_timedelta(['2 days']).values
+ tm.assert_numpy_array_equal(td * np.array([2]), expected)
+ tm.assert_numpy_array_equal(np.array([2]) * td, expected)
+ pytest.raises(TypeError, lambda: td * other)
+ pytest.raises(TypeError, lambda: other * td)
+
+ tm.assert_numpy_array_equal(td / other,
+ np.array([1], dtype=np.float64))
+ tm.assert_numpy_array_equal(other / td,
+ np.array([1], dtype=np.float64))
+
+ # timedelta, datetime
+ other = pd.to_datetime(['2000-01-01']).values
+ expected = pd.to_datetime(['2000-01-02']).values
+ tm.assert_numpy_array_equal(td + other, expected)
+ tm.assert_numpy_array_equal(other + td, expected)
+
+ expected = pd.to_datetime(['1999-12-31']).values
+ tm.assert_numpy_array_equal(-td + other, expected)
+ tm.assert_numpy_array_equal(other - td, expected)
+
+ def test_tdi_ops_attributes(self):
+ rng = timedelta_range('2 days', periods=5, freq='2D', name='x')
+
+ result = rng + 1 * rng.freq
+ exp = timedelta_range('4 days', periods=5, freq='2D', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == '2D'
+
+ result = rng - 2 * rng.freq
+ exp = timedelta_range('-2 days', periods=5, freq='2D', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == '2D'
+
+ result = rng * 2
+ exp = timedelta_range('4 days', periods=5, freq='4D', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == '4D'
+
+ result = rng / 2
+ exp = timedelta_range('1 days', periods=5, freq='D', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == 'D'
+
+ result = -rng
+ exp = timedelta_range('-2 days', periods=5, freq='-2D', name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq == '-2D'
+
+ rng = pd.timedelta_range('-2 days', periods=5, freq='D', name='x')
+
+ result = abs(rng)
+ exp = TimedeltaIndex(['2 days', '1 days', '0 days', '1 days',
+ '2 days'], name='x')
+ tm.assert_index_equal(result, exp)
+ assert result.freq is None
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_astype.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_astype.py
new file mode 100644
index 00000000000..23e96dbc3d6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_astype.py
@@ -0,0 +1,110 @@
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+ Float64Index, Index, Int64Index, NaT, Timedelta, TimedeltaIndex,
+ timedelta_range)
+import pandas.util.testing as tm
+
+
+class TestTimedeltaIndex(object):
+ def test_astype_object(self):
+ idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx')
+ expected_list = [Timedelta('1 days'), Timedelta('2 days'),
+ Timedelta('3 days'), Timedelta('4 days')]
+ result = idx.astype(object)
+ expected = Index(expected_list, dtype=object, name='idx')
+ tm.assert_index_equal(result, expected)
+ assert idx.tolist() == expected_list
+
+ def test_astype_object_with_nat(self):
+ idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), NaT,
+ timedelta(days=4)], name='idx')
+ expected_list = [Timedelta('1 days'), Timedelta('2 days'), NaT,
+ Timedelta('4 days')]
+ result = idx.astype(object)
+ expected = Index(expected_list, dtype=object, name='idx')
+ tm.assert_index_equal(result, expected)
+ assert idx.tolist() == expected_list
+
+ def test_astype(self):
+ # GH 13149, GH 13209
+ idx = TimedeltaIndex([1e14, 'NaT', NaT, np.NaN])
+
+ result = idx.astype(object)
+ expected = Index([Timedelta('1 days 03:46:40')] + [NaT] * 3,
+ dtype=object)
+ tm.assert_index_equal(result, expected)
+
+ result = idx.astype(int)
+ expected = Int64Index([100000000000000] + [-9223372036854775808] * 3,
+ dtype=np.int64)
+ tm.assert_index_equal(result, expected)
+
+ result = idx.astype(str)
+ expected = Index(str(x) for x in idx)
+ tm.assert_index_equal(result, expected)
+
+ rng = timedelta_range('1 days', periods=10)
+ result = rng.astype('i8')
+ tm.assert_index_equal(result, Index(rng.asi8))
+ tm.assert_numpy_array_equal(rng.asi8, result.values)
+
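+    # astype(int) exposes the underlying int64 nanosecond values; NaT is
+    # stored as the int64 sentinel -9223372036854775808 (iNaT), which is
+    # why that value appears verbatim in the expected Int64Index above.
+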
+ def test_astype_uint(self):
+ arr = timedelta_range('1H', periods=2)
+ expected = pd.UInt64Index(
+ np.array([3600000000000, 90000000000000], dtype="uint64")
+ )
+
+ tm.assert_index_equal(arr.astype("uint64"), expected)
+ tm.assert_index_equal(arr.astype("uint32"), expected)
+
+ def test_astype_timedelta64(self):
+ # GH 13149, GH 13209
+ idx = TimedeltaIndex([1e14, 'NaT', NaT, np.NaN])
+
+ result = idx.astype('timedelta64')
+ expected = Float64Index([1e+14] + [np.NaN] * 3, dtype='float64')
+ tm.assert_index_equal(result, expected)
+
+ result = idx.astype('timedelta64[ns]')
+ tm.assert_index_equal(result, idx)
+ assert result is not idx
+
+ result = idx.astype('timedelta64[ns]', copy=False)
+ tm.assert_index_equal(result, idx)
+ assert result is idx
+
+ @pytest.mark.parametrize('dtype', [
+ float, 'datetime64', 'datetime64[ns]'])
+ def test_astype_raises(self, dtype):
+ # GH 13149, GH 13209
+ idx = TimedeltaIndex([1e14, 'NaT', NaT, np.NaN])
+ msg = 'Cannot cast TimedeltaArray to dtype'
+ with pytest.raises(TypeError, match=msg):
+ idx.astype(dtype)
+
+ def test_astype_category(self):
+ obj = pd.timedelta_range("1H", periods=2, freq='H')
+
+ result = obj.astype('category')
+ expected = pd.CategoricalIndex([pd.Timedelta('1H'),
+ pd.Timedelta('2H')])
+ tm.assert_index_equal(result, expected)
+
+ result = obj._data.astype('category')
+ expected = expected.values
+ tm.assert_categorical_equal(result, expected)
+
+ def test_astype_array_fallback(self):
+ obj = pd.timedelta_range("1H", periods=2)
+ result = obj.astype(bool)
+ expected = pd.Index(np.array([True, True]))
+ tm.assert_index_equal(result, expected)
+
+ result = obj._data.astype(bool)
+ expected = np.array([True, True])
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_construction.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_construction.py
new file mode 100644
index 00000000000..3938d6acad2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_construction.py
@@ -0,0 +1,199 @@
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Timedelta, TimedeltaIndex, timedelta_range, to_timedelta
+from pandas.core.arrays import TimedeltaArray
+import pandas.util.testing as tm
+
+
+class TestTimedeltaIndex(object):
+
+ def test_verify_integrity_deprecated(self):
+ # GH#23919
+ with tm.assert_produces_warning(FutureWarning):
+ TimedeltaIndex(['1 Day'], verify_integrity=False)
+
+ def test_range_kwargs_deprecated(self):
+ # GH#23919
+ with tm.assert_produces_warning(FutureWarning):
+ TimedeltaIndex(start='1 Day', end='3 Days', freq='D')
+
+ def test_int64_nocopy(self):
+ # GH#23539 check that a copy isn't made when we pass int64 data
+ # and copy=False
+ arr = np.arange(10, dtype=np.int64)
+ tdi = TimedeltaIndex(arr, copy=False)
+ assert tdi._data._data.base is arr
+
+ def test_infer_from_tdi(self):
+ # GH#23539
+ # fast-path for inferring a frequency if the passed data already
+ # has one
+ tdi = pd.timedelta_range('1 second', periods=10**7, freq='1s')
+
+ result = pd.TimedeltaIndex(tdi, freq='infer')
+ assert result.freq == tdi.freq
+
+ # check that inferred_freq was not called by checking that the
+ # value has not been cached
+ assert "inferred_freq" not in getattr(result, "_cache", {})
+
+ def test_infer_from_tdi_mismatch(self):
+ # GH#23539
+ # fast-path for invalidating a frequency if the passed data already
+ # has one and it does not match the `freq` input
+ tdi = pd.timedelta_range('1 second', periods=100, freq='1s')
+
+ msg = ("Inferred frequency .* from passed values does "
+ "not conform to passed frequency")
+ with pytest.raises(ValueError, match=msg):
+ TimedeltaIndex(tdi, freq='D')
+
+ with pytest.raises(ValueError, match=msg):
+ # GH#23789
+ TimedeltaArray(tdi, freq='D')
+
+ def test_dt64_data_invalid(self):
+ # GH#23539
+        # passing a tz-aware DatetimeIndex raises; a naive DatetimeIndex or
+        # ndarray[datetime64] only warns for now, but will raise in the future
+ dti = pd.date_range('2016-01-01', periods=3)
+
+ msg = "cannot be converted to timedelta64"
+ with pytest.raises(TypeError, match=msg):
+ TimedeltaIndex(dti.tz_localize('Europe/Brussels'))
+
+ with tm.assert_produces_warning(FutureWarning):
+ TimedeltaIndex(dti)
+
+ with tm.assert_produces_warning(FutureWarning):
+ TimedeltaIndex(np.asarray(dti))
+
+ def test_float64_ns_rounded(self):
+ # GH#23539 without specifying a unit, floats are regarded as nanos,
+ # and fractional portions are truncated
+ tdi = TimedeltaIndex([2.3, 9.7])
+ expected = TimedeltaIndex([2, 9])
+ tm.assert_index_equal(tdi, expected)
+
+ # integral floats are non-lossy
+ tdi = TimedeltaIndex([2.0, 9.0])
+ expected = TimedeltaIndex([2, 9])
+ tm.assert_index_equal(tdi, expected)
+
+ # NaNs get converted to NaT
+ tdi = TimedeltaIndex([2.0, np.nan])
+ expected = TimedeltaIndex([pd.Timedelta(nanoseconds=2), pd.NaT])
+ tm.assert_index_equal(tdi, expected)
+
+ def test_float64_unit_conversion(self):
+ # GH#23539
+ tdi = TimedeltaIndex([1.5, 2.25], unit='D')
+ expected = TimedeltaIndex([Timedelta(days=1.5), Timedelta(days=2.25)])
+ tm.assert_index_equal(tdi, expected)
+
+ def test_construction_base_constructor(self):
+ arr = [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')]
+ tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr))
+ tm.assert_index_equal(pd.Index(np.array(arr)),
+ pd.TimedeltaIndex(np.array(arr)))
+
+ arr = [np.nan, pd.NaT, pd.Timedelta('1 days')]
+ tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr))
+ tm.assert_index_equal(pd.Index(np.array(arr)),
+ pd.TimedeltaIndex(np.array(arr)))
+
+ def test_constructor(self):
+ expected = TimedeltaIndex(['1 days', '1 days 00:00:05', '2 days',
+ '2 days 00:00:02', '0 days 00:00:03'])
+ result = TimedeltaIndex(['1 days', '1 days, 00:00:05', np.timedelta64(
+ 2, 'D'), timedelta(days=2, seconds=2), pd.offsets.Second(3)])
+ tm.assert_index_equal(result, expected)
+
+ # unicode
+        result = TimedeltaIndex([u'1 days', '1 days, 00:00:05', np.timedelta64(
+            2, 'D'), timedelta(days=2, seconds=2), pd.offsets.Second(3)])
+        tm.assert_index_equal(result, expected)
+
+ expected = TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01',
+ '0 days 00:00:02'])
+ tm.assert_index_equal(TimedeltaIndex(range(3), unit='s'), expected)
+ expected = TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:05',
+ '0 days 00:00:09'])
+ tm.assert_index_equal(TimedeltaIndex([0, 5, 9], unit='s'), expected)
+ expected = TimedeltaIndex(
+ ['0 days 00:00:00.400', '0 days 00:00:00.450',
+ '0 days 00:00:01.200'])
+ tm.assert_index_equal(TimedeltaIndex([400, 450, 1200], unit='ms'),
+ expected)
+
+ def test_constructor_iso(self):
+ # GH #21877
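+        # ISO 8601 duration strings are accepted by to_timedelta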
+ expected = timedelta_range('1s', periods=9, freq='s')
+ durations = ['P0DT0H0M{}S'.format(i) for i in range(1, 10)]
+ result = to_timedelta(durations)
+ tm.assert_index_equal(result, expected)
+
+ def test_constructor_coverage(self):
+ rng = timedelta_range('1 days', periods=10.5)
+ exp = timedelta_range('1 days', periods=10)
+ tm.assert_index_equal(rng, exp)
+
+ msg = 'periods must be a number, got foo'
+ with pytest.raises(TypeError, match=msg):
+ timedelta_range(start='1 days', periods='foo', freq='D')
+
+ with pytest.raises(ValueError):
+ with tm.assert_produces_warning(FutureWarning):
+ TimedeltaIndex(start='1 days', end='10 days')
+
+ with pytest.raises(TypeError):
+ TimedeltaIndex('1 days')
+
+ # generator expression
+ gen = (timedelta(i) for i in range(10))
+ result = TimedeltaIndex(gen)
+ expected = TimedeltaIndex([timedelta(i) for i in range(10)])
+ tm.assert_index_equal(result, expected)
+
+ # NumPy string array
+ strings = np.array(['1 days', '2 days', '3 days'])
+ result = TimedeltaIndex(strings)
+ expected = to_timedelta([1, 2, 3], unit='d')
+ tm.assert_index_equal(result, expected)
+
+ from_ints = TimedeltaIndex(expected.asi8)
+ tm.assert_index_equal(from_ints, expected)
+
+ # non-conforming freq
+ pytest.raises(ValueError, TimedeltaIndex,
+ ['1 days', '2 days', '4 days'], freq='D')
+
+ pytest.raises(ValueError, timedelta_range, periods=10, freq='D')
+
+ def test_constructor_name(self):
+ idx = timedelta_range(start='1 days', periods=1, freq='D', name='TEST')
+ assert idx.name == 'TEST'
+
+ # GH10025
+ idx2 = TimedeltaIndex(idx, name='something else')
+ assert idx2.name == 'something else'
+
+ def test_constructor_no_precision_warns(self):
+ # GH-24753, GH-24739
+ expected = pd.TimedeltaIndex(['2000'], dtype='timedelta64[ns]')
+
+        # the warning stacklevel is set for the DatetimeIndex case, so it
+        # is not checked for the pd.Index path below
+ with tm.assert_produces_warning(FutureWarning):
+ result = pd.TimedeltaIndex(['2000'], dtype='timedelta64')
+ tm.assert_index_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = pd.Index(['2000'], dtype='timedelta64')
+ tm.assert_index_equal(result, expected)
+
+ def test_constructor_wrong_precision_raises(self):
+ with pytest.raises(ValueError):
+ pd.TimedeltaIndex(['2000'], dtype='timedelta64[us]')
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_formats.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_formats.py
new file mode 100644
index 00000000000..09921fac80d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_formats.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import pandas as pd
+from pandas import TimedeltaIndex
+
+
+class TestTimedeltaIndexRendering(object):
+ @pytest.mark.parametrize('method', ['__repr__', '__unicode__', '__str__'])
+ def test_representation(self, method):
+ idx1 = TimedeltaIndex([], freq='D')
+ idx2 = TimedeltaIndex(['1 days'], freq='D')
+ idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D')
+ idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D')
+ idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days'])
+
+ exp1 = """TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')"""
+
+ exp2 = ("TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', "
+ "freq='D')")
+
+ exp3 = ("TimedeltaIndex(['1 days', '2 days'], "
+ "dtype='timedelta64[ns]', freq='D')")
+
+ exp4 = ("TimedeltaIndex(['1 days', '2 days', '3 days'], "
+ "dtype='timedelta64[ns]', freq='D')")
+
+ exp5 = ("TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', "
+ "'3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)")
+
+ with pd.option_context('display.width', 300):
+ for idx, expected in zip([idx1, idx2, idx3, idx4, idx5],
+ [exp1, exp2, exp3, exp4, exp5]):
+ result = getattr(idx, method)()
+ assert result == expected
+
+ def test_representation_to_series(self):
+ idx1 = TimedeltaIndex([], freq='D')
+ idx2 = TimedeltaIndex(['1 days'], freq='D')
+ idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D')
+ idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D')
+ idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days'])
+
+ exp1 = """Series([], dtype: timedelta64[ns])"""
+
+ exp2 = ("0 1 days\n"
+ "dtype: timedelta64[ns]")
+
+ exp3 = ("0 1 days\n"
+ "1 2 days\n"
+ "dtype: timedelta64[ns]")
+
+ exp4 = ("0 1 days\n"
+ "1 2 days\n"
+ "2 3 days\n"
+ "dtype: timedelta64[ns]")
+
+ exp5 = ("0 1 days 00:00:01\n"
+ "1 2 days 00:00:00\n"
+ "2 3 days 00:00:00\n"
+ "dtype: timedelta64[ns]")
+
+ with pd.option_context('display.width', 300):
+ for idx, expected in zip([idx1, idx2, idx3, idx4, idx5],
+ [exp1, exp2, exp3, exp4, exp5]):
+ result = repr(pd.Series(idx))
+ assert result == expected
+
+ def test_summary(self):
+ # GH#9116
+ idx1 = TimedeltaIndex([], freq='D')
+ idx2 = TimedeltaIndex(['1 days'], freq='D')
+ idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D')
+ idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D')
+ idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days'])
+
+ exp1 = ("TimedeltaIndex: 0 entries\n"
+ "Freq: D")
+
+ exp2 = ("TimedeltaIndex: 1 entries, 1 days to 1 days\n"
+ "Freq: D")
+
+ exp3 = ("TimedeltaIndex: 2 entries, 1 days to 2 days\n"
+ "Freq: D")
+
+ exp4 = ("TimedeltaIndex: 3 entries, 1 days to 3 days\n"
+ "Freq: D")
+
+ exp5 = ("TimedeltaIndex: 3 entries, 1 days 00:00:01 to 3 days "
+ "00:00:00")
+
+ for idx, expected in zip([idx1, idx2, idx3, idx4, idx5],
+ [exp1, exp2, exp3, exp4, exp5]):
+ result = idx._summary()
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_indexing.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_indexing.py
new file mode 100644
index 00000000000..a6264e4dad4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_indexing.py
@@ -0,0 +1,338 @@
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Index, Timedelta, TimedeltaIndex, compat, timedelta_range
+import pandas.util.testing as tm
+
+
+class TestGetItem(object):
+ def test_ellipsis(self):
+ # GH#21282
+ idx = timedelta_range('1 day', '31 day', freq='D', name='idx')
+
+ result = idx[...]
+ assert result.equals(idx)
+ assert result is not idx
+
+ def test_getitem(self):
+ idx1 = timedelta_range('1 day', '31 day', freq='D', name='idx')
+
+ for idx in [idx1]:
+ result = idx[0]
+ assert result == Timedelta('1 day')
+
+ result = idx[0:5]
+ expected = timedelta_range('1 day', '5 day', freq='D',
+ name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx[0:10:2]
+ expected = timedelta_range('1 day', '9 day', freq='2D',
+ name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx[-20:-5:3]
+ expected = timedelta_range('12 day', '24 day', freq='3D',
+ name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx[4::-1]
+ expected = TimedeltaIndex(['5 day', '4 day', '3 day',
+ '2 day', '1 day'],
+ freq='-1D', name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ @pytest.mark.parametrize('key', [pd.Timestamp('1970-01-01'),
+ pd.Timestamp('1970-01-02'),
+ datetime(1970, 1, 1)])
+ def test_timestamp_invalid_key(self, key):
+ # GH#20464
+ tdi = pd.timedelta_range(0, periods=10)
+ with pytest.raises(TypeError):
+ tdi.get_loc(key)
+
+
+class TestWhere(object):
+ # placeholder for symmetry with DatetimeIndex and PeriodIndex tests
+ pass
+
+
+class TestTake(object):
+ def test_take(self):
+ # GH 10295
+ idx1 = timedelta_range('1 day', '31 day', freq='D', name='idx')
+
+ for idx in [idx1]:
+ result = idx.take([0])
+ assert result == Timedelta('1 day')
+
+ result = idx.take([-1])
+ assert result == Timedelta('31 day')
+
+ result = idx.take([0, 1, 2])
+ expected = timedelta_range('1 day', '3 day', freq='D',
+ name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx.take([0, 2, 4])
+ expected = timedelta_range('1 day', '5 day', freq='2D',
+ name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx.take([7, 4, 1])
+ expected = timedelta_range('8 day', '2 day', freq='-3D',
+ name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq == expected.freq
+
+ result = idx.take([3, 2, 5])
+ expected = TimedeltaIndex(['4 day', '3 day', '6 day'], name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq is None
+
+ result = idx.take([-3, 2, 5])
+ expected = TimedeltaIndex(['29 day', '3 day', '6 day'], name='idx')
+ tm.assert_index_equal(result, expected)
+ assert result.freq is None
+
+ def test_take_invalid_kwargs(self):
+ idx = timedelta_range('1 day', '31 day', freq='D', name='idx')
+ indices = [1, 6, 5, 9, 10, 13, 15, 3]
+
+ msg = r"take\(\) got an unexpected keyword argument 'foo'"
+ with pytest.raises(TypeError, match=msg):
+ idx.take(indices, foo=2)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, out=indices)
+
+ msg = "the 'mode' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ idx.take(indices, mode='clip')
+
+ # TODO: This method came from test_timedelta; de-dup with version above
+ def test_take2(self):
+ tds = ['1day 02:00:00', '1 day 04:00:00', '1 day 10:00:00']
+ idx = timedelta_range(start='1d', end='2d', freq='H', name='idx')
+ expected = TimedeltaIndex(tds, freq=None, name='idx')
+
+ taken1 = idx.take([2, 4, 10])
+ taken2 = idx[[2, 4, 10]]
+
+ for taken in [taken1, taken2]:
+ tm.assert_index_equal(taken, expected)
+ assert isinstance(taken, TimedeltaIndex)
+ assert taken.freq is None
+ assert taken.name == expected.name
+
+ def test_take_fill_value(self):
+ # GH 12631
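+        # -1 means missing only when allow_fill=True and fill_value is set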
+ idx = TimedeltaIndex(['1 days', '2 days', '3 days'],
+ name='xxx')
+ result = idx.take(np.array([1, 0, -1]))
+ expected = TimedeltaIndex(['2 days', '1 days', '3 days'],
+ name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ # fill_value
+ result = idx.take(np.array([1, 0, -1]), fill_value=True)
+ expected = TimedeltaIndex(['2 days', '1 days', 'NaT'],
+ name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ # allow_fill=False
+ result = idx.take(np.array([1, 0, -1]), allow_fill=False,
+ fill_value=True)
+ expected = TimedeltaIndex(['2 days', '1 days', '3 days'],
+ name='xxx')
+ tm.assert_index_equal(result, expected)
+
+ msg = ('When allow_fill=True and fill_value is not None, '
+ 'all indices must be >= -1')
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -2]), fill_value=True)
+ with pytest.raises(ValueError, match=msg):
+ idx.take(np.array([1, 0, -5]), fill_value=True)
+
+ with pytest.raises(IndexError):
+ idx.take(np.array([1, -5]))
+
+
+class TestTimedeltaIndex(object):
+
+ def test_insert(self):
+
+ idx = TimedeltaIndex(['4day', '1day', '2day'], name='idx')
+
+ result = idx.insert(2, timedelta(days=5))
+ exp = TimedeltaIndex(['4day', '1day', '5day', '2day'], name='idx')
+ tm.assert_index_equal(result, exp)
+
+        # insertion of a non-timedelta should coerce to an object Index
+ result = idx.insert(1, 'inserted')
+ expected = Index([Timedelta('4day'), 'inserted', Timedelta('1day'),
+ Timedelta('2day')], name='idx')
+ assert not isinstance(result, TimedeltaIndex)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+
+ idx = timedelta_range('1day 00:00:01', periods=3, freq='s', name='idx')
+
+ # preserve freq
+ expected_0 = TimedeltaIndex(['1day', '1day 00:00:01', '1day 00:00:02',
+ '1day 00:00:03'],
+ name='idx', freq='s')
+ expected_3 = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02',
+ '1day 00:00:03', '1day 00:00:04'],
+ name='idx', freq='s')
+
+ # reset freq to None
+ expected_1_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:01',
+ '1day 00:00:02', '1day 00:00:03'],
+ name='idx', freq=None)
+ expected_3_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02',
+ '1day 00:00:03', '1day 00:00:05'],
+ name='idx', freq=None)
+
+ cases = [(0, Timedelta('1day'), expected_0),
+ (-3, Timedelta('1day'), expected_0),
+ (3, Timedelta('1day 00:00:04'), expected_3),
+ (1, Timedelta('1day 00:00:01'), expected_1_nofreq),
+ (3, Timedelta('1day 00:00:05'), expected_3_nofreq)]
+
+ for n, d, expected in cases:
+ result = idx.insert(n, d)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+
+ # GH 18295 (test missing)
+ expected = TimedeltaIndex(['1day', pd.NaT, '2day', '3day'])
+ for na in (np.nan, pd.NaT, None):
+ result = timedelta_range('1day', '3day').insert(1, na)
+ tm.assert_index_equal(result, expected)
+
+ def test_delete(self):
+ idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx')
+
+        # preserve freq
+ expected_0 = timedelta_range(start='2 Days', periods=4, freq='D',
+ name='idx')
+ expected_4 = timedelta_range(start='1 Days', periods=4, freq='D',
+ name='idx')
+
+ # reset freq to None
+ expected_1 = TimedeltaIndex(
+ ['1 day', '3 day', '4 day', '5 day'], freq=None, name='idx')
+
+ cases = {0: expected_0,
+ -5: expected_0,
+ -1: expected_4,
+ 4: expected_4,
+ 1: expected_1}
+ for n, expected in compat.iteritems(cases):
+ result = idx.delete(n)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+
+ with pytest.raises((IndexError, ValueError)):
+ # either depending on numpy version
+ idx.delete(5)
+
+ def test_delete_slice(self):
+ idx = timedelta_range(start='1 days', periods=10, freq='D', name='idx')
+
+        # preserve freq
+ expected_0_2 = timedelta_range(start='4 days', periods=7, freq='D',
+ name='idx')
+ expected_7_9 = timedelta_range(start='1 days', periods=7, freq='D',
+ name='idx')
+
+ # reset freq to None
+ expected_3_5 = TimedeltaIndex(['1 d', '2 d', '3 d',
+ '7 d', '8 d', '9 d', '10d'],
+ freq=None, name='idx')
+
+ cases = {(0, 1, 2): expected_0_2,
+ (7, 8, 9): expected_7_9,
+ (3, 4, 5): expected_3_5}
+ for n, expected in compat.iteritems(cases):
+ result = idx.delete(n)
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+
+ result = idx.delete(slice(n[0], n[-1] + 1))
+ tm.assert_index_equal(result, expected)
+ assert result.name == expected.name
+ assert result.freq == expected.freq
+
+ def test_get_loc(self):
+ idx = pd.to_timedelta(['0 days', '1 days', '2 days'])
+
+ for method in [None, 'pad', 'backfill', 'nearest']:
+ assert idx.get_loc(idx[1], method) == 1
+ assert idx.get_loc(idx[1].to_pytimedelta(), method) == 1
+ assert idx.get_loc(str(idx[1]), method) == 1
+
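+        # tolerance accepts any timedelta-like scalar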
+ assert idx.get_loc(idx[1], 'pad',
+ tolerance=Timedelta(0)) == 1
+ assert idx.get_loc(idx[1], 'pad',
+ tolerance=np.timedelta64(0, 's')) == 1
+ assert idx.get_loc(idx[1], 'pad',
+ tolerance=timedelta(0)) == 1
+
+ with pytest.raises(ValueError, match='unit abbreviation w/o a number'):
+ idx.get_loc(idx[1], method='nearest', tolerance='foo')
+
+ with pytest.raises(
+ ValueError,
+ match='tolerance size must match'):
+ idx.get_loc(idx[1], method='nearest',
+ tolerance=[Timedelta(0).to_timedelta64(),
+ Timedelta(0).to_timedelta64()])
+
+ for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]:
+ assert idx.get_loc('1 day 1 hour', method) == loc
+
+ # GH 16909
+ assert idx.get_loc(idx[1].to_timedelta64()) == 1
+
+ # GH 16896
+ assert idx.get_loc('0 days') == 0
+
+ def test_get_loc_nat(self):
+ tidx = TimedeltaIndex(['1 days 01:00:00', 'NaT', '2 days 01:00:00'])
+
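+        # all NaT-like scalars (NaT, None, float nan) locate the NaT slot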
+ assert tidx.get_loc(pd.NaT) == 1
+ assert tidx.get_loc(None) == 1
+ assert tidx.get_loc(float('nan')) == 1
+ assert tidx.get_loc(np.nan) == 1
+
+ def test_get_indexer(self):
+ idx = pd.to_timedelta(['0 days', '1 days', '2 days'])
+ tm.assert_numpy_array_equal(idx.get_indexer(idx),
+ np.array([0, 1, 2], dtype=np.intp))
+
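+        # 'pad' = previous label, 'backfill' = next, 'nearest' = closest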
+ target = pd.to_timedelta(['-1 hour', '12 hours', '1 day 1 hour'])
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'),
+ np.array([-1, 0, 1], dtype=np.intp))
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'),
+ np.array([0, 1, 2], dtype=np.intp))
+ tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'),
+ np.array([0, 1, 1], dtype=np.intp))
+
+ res = idx.get_indexer(target, 'nearest',
+ tolerance=Timedelta('1 hour'))
+ tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp))
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_ops.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_ops.py
new file mode 100644
index 00000000000..40377e4362b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_ops.py
@@ -0,0 +1,281 @@
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.generic import ABCDateOffset
+
+import pandas as pd
+from pandas import Series, TimedeltaIndex, timedelta_range
+from pandas.tests.test_base import Ops
+import pandas.util.testing as tm
+
+from pandas.tseries.offsets import Day, Hour
+
+
+class TestTimedeltaIndexOps(Ops):
+ def setup_method(self, method):
+ super(TestTimedeltaIndexOps, self).setup_method(method)
+ mask = lambda x: isinstance(x, TimedeltaIndex)
+ self.is_valid_objs = [o for o in self.objs if mask(o)]
+ self.not_valid_objs = []
+
+ def test_ops_properties(self):
+ f = lambda x: isinstance(x, TimedeltaIndex)
+ self.check_ops_properties(TimedeltaIndex._field_ops, f)
+ self.check_ops_properties(TimedeltaIndex._object_ops, f)
+
+ def test_value_counts_unique(self):
+ # GH 7735
+
+ idx = timedelta_range('1 days 09:00:00', freq='H', periods=10)
+        # create repeated values: the n-th element is repeated n + 1 times
+ idx = TimedeltaIndex(np.repeat(idx.values, range(1, len(idx) + 1)))
+
+ exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10)
+ expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64')
+
+ for obj in [idx, Series(idx)]:
+ tm.assert_series_equal(obj.value_counts(), expected)
+
+ expected = timedelta_range('1 days 09:00:00', freq='H', periods=10)
+ tm.assert_index_equal(idx.unique(), expected)
+
+ idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00',
+ '1 days 09:00:00', '1 days 08:00:00',
+ '1 days 08:00:00', pd.NaT])
+
+ exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00'])
+ expected = Series([3, 2], index=exp_idx)
+
+ for obj in [idx, Series(idx)]:
+ tm.assert_series_equal(obj.value_counts(), expected)
+
+ exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00',
+ pd.NaT])
+ expected = Series([3, 2, 1], index=exp_idx)
+
+ for obj in [idx, Series(idx)]:
+ tm.assert_series_equal(obj.value_counts(dropna=False), expected)
+
+ tm.assert_index_equal(idx.unique(), exp_idx)
+
+ def test_nonunique_contains(self):
+ # GH 9512
+ for idx in map(TimedeltaIndex, ([0, 1, 0], [0, 0, -1], [0, -1, -1],
+ ['00:01:00', '00:01:00', '00:02:00'],
+ ['00:01:00', '00:01:00', '00:00:01'])):
+ assert idx[0] in idx
+
+ def test_unknown_attribute(self):
+ # see gh-9680
+ tdi = pd.timedelta_range(start=0, periods=10, freq='1s')
+ ts = pd.Series(np.random.normal(size=10), index=tdi)
+ assert 'foo' not in ts.__dict__.keys()
+ pytest.raises(AttributeError, lambda: ts.foo)
+
+ def test_order(self):
+ # GH 10295
+ idx1 = TimedeltaIndex(['1 day', '2 day', '3 day'], freq='D',
+ name='idx')
+ idx2 = TimedeltaIndex(
+ ['1 hour', '2 hour', '3 hour'], freq='H', name='idx')
+
+ for idx in [idx1, idx2]:
+ ordered = idx.sort_values()
+ tm.assert_index_equal(ordered, idx)
+ assert ordered.freq == idx.freq
+
+ ordered = idx.sort_values(ascending=False)
+ expected = idx[::-1]
+ tm.assert_index_equal(ordered, expected)
+ assert ordered.freq == expected.freq
+ assert ordered.freq.n == -1
+
+ ordered, indexer = idx.sort_values(return_indexer=True)
+ tm.assert_index_equal(ordered, idx)
+ tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]),
+ check_dtype=False)
+ assert ordered.freq == idx.freq
+
+ ordered, indexer = idx.sort_values(return_indexer=True,
+ ascending=False)
+ tm.assert_index_equal(ordered, idx[::-1])
+ assert ordered.freq == expected.freq
+ assert ordered.freq.n == -1
+
+ idx1 = TimedeltaIndex(['1 hour', '3 hour', '5 hour',
+ '2 hour ', '1 hour'], name='idx1')
+ exp1 = TimedeltaIndex(['1 hour', '1 hour', '2 hour',
+ '3 hour', '5 hour'], name='idx1')
+
+ idx2 = TimedeltaIndex(['1 day', '3 day', '5 day',
+ '2 day', '1 day'], name='idx2')
+
+ # TODO(wesm): unused?
+ # exp2 = TimedeltaIndex(['1 day', '1 day', '2 day',
+ # '3 day', '5 day'], name='idx2')
+
+ # idx3 = TimedeltaIndex([pd.NaT, '3 minute', '5 minute',
+ # '2 minute', pd.NaT], name='idx3')
+ # exp3 = TimedeltaIndex([pd.NaT, pd.NaT, '2 minute', '3 minute',
+ # '5 minute'], name='idx3')
+
+ for idx, expected in [(idx1, exp1), (idx1, exp1), (idx1, exp1)]:
+ ordered = idx.sort_values()
+ tm.assert_index_equal(ordered, expected)
+ assert ordered.freq is None
+
+ ordered = idx.sort_values(ascending=False)
+ tm.assert_index_equal(ordered, expected[::-1])
+ assert ordered.freq is None
+
+ ordered, indexer = idx.sort_values(return_indexer=True)
+ tm.assert_index_equal(ordered, expected)
+
+ exp = np.array([0, 4, 3, 1, 2])
+ tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
+ assert ordered.freq is None
+
+ ordered, indexer = idx.sort_values(return_indexer=True,
+ ascending=False)
+ tm.assert_index_equal(ordered, expected[::-1])
+
+ exp = np.array([2, 1, 3, 4, 0])
+ tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
+ assert ordered.freq is None
+
+ def test_drop_duplicates_metadata(self):
+ # GH 10115
+ idx = pd.timedelta_range('1 day', '31 day', freq='D', name='idx')
+ result = idx.drop_duplicates()
+ tm.assert_index_equal(idx, result)
+ assert idx.freq == result.freq
+
+ idx_dup = idx.append(idx)
+ assert idx_dup.freq is None # freq is reset
+ result = idx_dup.drop_duplicates()
+ tm.assert_index_equal(idx, result)
+ assert result.freq is None
+
+ def test_drop_duplicates(self):
+ # to check Index/Series compat
+ base = pd.timedelta_range('1 day', '31 day', freq='D', name='idx')
+ idx = base.append(base[:5])
+
+ res = idx.drop_duplicates()
+ tm.assert_index_equal(res, base)
+ res = Series(idx).drop_duplicates()
+ tm.assert_series_equal(res, Series(base))
+
+ res = idx.drop_duplicates(keep='last')
+ exp = base[5:].append(base[:5])
+ tm.assert_index_equal(res, exp)
+ res = Series(idx).drop_duplicates(keep='last')
+ tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
+
+ res = idx.drop_duplicates(keep=False)
+ tm.assert_index_equal(res, base[5:])
+ res = Series(idx).drop_duplicates(keep=False)
+ tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
+
+ @pytest.mark.parametrize('freq', ['D', '3D', '-3D',
+ 'H', '2H', '-2H',
+ 'T', '2T', 'S', '-3S'])
+ def test_infer_freq(self, freq):
+ # GH#11018
+ idx = pd.timedelta_range('1', freq=freq, periods=10)
+ result = pd.TimedeltaIndex(idx.asi8, freq='infer')
+ tm.assert_index_equal(idx, result)
+ assert result.freq == freq
+
+ def test_shift(self):
+ pass # handled in test_arithmetic.py
+
+ def test_repeat(self):
+ index = pd.timedelta_range('1 days', periods=2, freq='D')
+ exp = pd.TimedeltaIndex(['1 days', '1 days', '2 days', '2 days'])
+ for res in [index.repeat(2), np.repeat(index, 2)]:
+ tm.assert_index_equal(res, exp)
+ assert res.freq is None
+
+ index = TimedeltaIndex(['1 days', 'NaT', '3 days'])
+ exp = TimedeltaIndex(['1 days', '1 days', '1 days',
+ 'NaT', 'NaT', 'NaT',
+ '3 days', '3 days', '3 days'])
+ for res in [index.repeat(3), np.repeat(index, 3)]:
+ tm.assert_index_equal(res, exp)
+ assert res.freq is None
+
+ def test_nat(self):
+ assert pd.TimedeltaIndex._na_value is pd.NaT
+ assert pd.TimedeltaIndex([])._na_value is pd.NaT
+
+ idx = pd.TimedeltaIndex(['1 days', '2 days'])
+ assert idx._can_hold_na
+
+ tm.assert_numpy_array_equal(idx._isnan, np.array([False, False]))
+ assert idx.hasnans is False
+ tm.assert_numpy_array_equal(idx._nan_idxs,
+ np.array([], dtype=np.intp))
+
+ idx = pd.TimedeltaIndex(['1 days', 'NaT'])
+ assert idx._can_hold_na
+
+ tm.assert_numpy_array_equal(idx._isnan, np.array([False, True]))
+ assert idx.hasnans is True
+ tm.assert_numpy_array_equal(idx._nan_idxs,
+ np.array([1], dtype=np.intp))
+
+ def test_equals(self):
+ # GH 13107
+ idx = pd.TimedeltaIndex(['1 days', '2 days', 'NaT'])
+ assert idx.equals(idx)
+ assert idx.equals(idx.copy())
+ assert idx.equals(idx.astype(object))
+ assert idx.astype(object).equals(idx)
+ assert idx.astype(object).equals(idx.astype(object))
+ assert not idx.equals(list(idx))
+ assert not idx.equals(pd.Series(idx))
+
+ idx2 = pd.TimedeltaIndex(['2 days', '1 days', 'NaT'])
+ assert not idx.equals(idx2)
+ assert not idx.equals(idx2.copy())
+ assert not idx.equals(idx2.astype(object))
+ assert not idx.astype(object).equals(idx2)
+ assert not idx.astype(object).equals(idx2.astype(object))
+ assert not idx.equals(list(idx2))
+ assert not idx.equals(pd.Series(idx2))
+
+ @pytest.mark.parametrize('values', [['0 days', '2 days', '4 days'], []])
+ @pytest.mark.parametrize('freq', ['2D', Day(2), '48H', Hour(48)])
+ def test_freq_setter(self, values, freq):
+ # GH 20678
+ idx = TimedeltaIndex(values)
+
+ # can set to an offset, converting from string if necessary
+ idx.freq = freq
+ assert idx.freq == freq
+ assert isinstance(idx.freq, ABCDateOffset)
+
+ # can reset to None
+ idx.freq = None
+ assert idx.freq is None
+
+ def test_freq_setter_errors(self):
+ # GH 20678
+ idx = TimedeltaIndex(['0 days', '2 days', '4 days'])
+
+ # setting with an incompatible freq
+ msg = ('Inferred frequency 2D from passed values does not conform to '
+ 'passed frequency 5D')
+ with pytest.raises(ValueError, match=msg):
+ idx.freq = '5D'
+
+ # setting with a non-fixed frequency
+ msg = r'<2 \* BusinessDays> is a non-fixed frequency'
+ with pytest.raises(ValueError, match=msg):
+ idx.freq = '2B'
+
+ # setting with non-freq string
+ with pytest.raises(ValueError, match='Invalid frequency'):
+ idx.freq = 'foo'
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_partial_slicing.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_partial_slicing.py
new file mode 100644
index 00000000000..62bf2a0b4a1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_partial_slicing.py
@@ -0,0 +1,85 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Series, Timedelta, timedelta_range
+from pandas.util.testing import assert_series_equal
+
+
+class TestSlicing(object):
+ def test_slice_keeps_name(self):
+ # GH4226
+ dr = pd.timedelta_range('1d', '5d', freq='H', name='timebucket')
+ assert dr[1:].name == dr.name
+
+ def test_partial_slice(self):
+ rng = timedelta_range('1 day 10:11:12', freq='h', periods=500)
+ s = Series(np.arange(len(rng)), index=rng)
+
+ result = s['5 day':'6 day']
+ expected = s.iloc[86:134]
+ assert_series_equal(result, expected)
+
+ result = s['5 day':]
+ expected = s.iloc[86:]
+ assert_series_equal(result, expected)
+
+ result = s[:'6 day']
+ expected = s.iloc[:134]
+ assert_series_equal(result, expected)
+
+ result = s['6 days, 23:11:12']
+ assert result == s.iloc[133]
+
+ pytest.raises(KeyError, s.__getitem__, '50 days')
+
+ def test_partial_slice_high_reso(self):
+
+ # higher reso
+ rng = timedelta_range('1 day 10:11:12', freq='us', periods=2000)
+ s = Series(np.arange(len(rng)), index=rng)
+
+ result = s['1 day 10:11:12':]
+ expected = s.iloc[0:]
+ assert_series_equal(result, expected)
+
+ result = s['1 day 10:11:12.001':]
+ expected = s.iloc[1000:]
+ assert_series_equal(result, expected)
+
+ result = s['1 days, 10:11:12.001001']
+ assert result == s.iloc[1001]
+
+ def test_slice_with_negative_step(self):
+ ts = Series(np.arange(20), timedelta_range('0', periods=20, freq='H'))
+ SLC = pd.IndexSlice
+
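+        # each label-based slice must match its positional equivalent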
+ def assert_slices_equivalent(l_slc, i_slc):
+ assert_series_equal(ts[l_slc], ts.iloc[i_slc])
+ assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc])
+
+ assert_slices_equivalent(SLC[Timedelta(hours=7)::-1], SLC[7::-1])
+ assert_slices_equivalent(SLC['7 hours'::-1], SLC[7::-1])
+
+ assert_slices_equivalent(SLC[:Timedelta(hours=7):-1], SLC[:6:-1])
+ assert_slices_equivalent(SLC[:'7 hours':-1], SLC[:6:-1])
+
+ assert_slices_equivalent(SLC['15 hours':'7 hours':-1], SLC[15:6:-1])
+        assert_slices_equivalent(
+            SLC[Timedelta(hours=15):Timedelta(hours=7):-1], SLC[15:6:-1])
+ assert_slices_equivalent(SLC['15 hours':Timedelta(hours=7):-1],
+ SLC[15:6:-1])
+ assert_slices_equivalent(SLC[Timedelta(hours=15):'7 hours':-1],
+ SLC[15:6:-1])
+
+ assert_slices_equivalent(SLC['7 hours':'15 hours':-1], SLC[:0])
+
+ def test_slice_with_zero_step_raises(self):
+ ts = Series(np.arange(20), timedelta_range('0', periods=20, freq='H'))
+ with pytest.raises(ValueError, match='slice step cannot be zero'):
+ ts[::0]
+ with pytest.raises(ValueError, match='slice step cannot be zero'):
+ ts.loc[::0]
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_scalar_compat.py
new file mode 100644
index 00000000000..788d27eb8ab
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_scalar_compat.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for TimedeltaIndex methods behaving like their Timedelta counterparts
+"""
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Index, Series, Timedelta, TimedeltaIndex, timedelta_range
+import pandas.util.testing as tm
+
+
+class TestVectorizedTimedelta(object):
+ def test_tdi_total_seconds(self):
+ # GH#10939
+ # test index
+ rng = timedelta_range('1 days, 10:11:12.100123456', periods=2,
+ freq='s')
+ expt = [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9,
+ 1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456. / 1e9]
+ tm.assert_almost_equal(rng.total_seconds(), Index(expt))
+
+ # test Series
+ ser = Series(rng)
+ s_expt = Series(expt, index=[0, 1])
+ tm.assert_series_equal(ser.dt.total_seconds(), s_expt)
+
+ # with nat
+ ser[1] = np.nan
+ s_expt = Series([1 * 86400 + 10 * 3600 + 11 * 60 +
+ 12 + 100123456. / 1e9, np.nan], index=[0, 1])
+ tm.assert_series_equal(ser.dt.total_seconds(), s_expt)
+
+ # with both nat
+ ser = Series([np.nan, np.nan], dtype='timedelta64[ns]')
+ tm.assert_series_equal(ser.dt.total_seconds(),
+ Series([np.nan, np.nan], index=[0, 1]))
+
+ def test_tdi_round(self):
+ td = pd.timedelta_range(start='16801 days', periods=5, freq='30Min')
+ elt = td[1]
+
+ expected_rng = TimedeltaIndex([Timedelta('16801 days 00:00:00'),
+ Timedelta('16801 days 00:00:00'),
+ Timedelta('16801 days 01:00:00'),
+ Timedelta('16801 days 02:00:00'),
+ Timedelta('16801 days 02:00:00')])
+ expected_elt = expected_rng[1]
+
+ tm.assert_index_equal(td.round(freq='H'), expected_rng)
+ assert elt.round(freq='H') == expected_elt
+
+ msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG
+ with pytest.raises(ValueError, match=msg):
+ td.round(freq='foo')
+ with pytest.raises(ValueError, match=msg):
+ elt.round(freq='foo')
+
+ msg = "<MonthEnd> is a non-fixed frequency"
+ with pytest.raises(ValueError, match=msg):
+ td.round(freq='M')
+ with pytest.raises(ValueError, match=msg):
+ elt.round(freq='M')
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_setops.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_setops.py
new file mode 100644
index 00000000000..f7c3f764df0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_setops.py
@@ -0,0 +1,75 @@
+import numpy as np
+
+import pandas as pd
+from pandas import Int64Index, TimedeltaIndex, timedelta_range
+import pandas.util.testing as tm
+
+
+class TestTimedeltaIndex(object):
+
+ def test_union(self):
+
+ i1 = timedelta_range('1day', periods=5)
+ i2 = timedelta_range('3day', periods=5)
+ result = i1.union(i2)
+ expected = timedelta_range('1day', periods=7)
+ tm.assert_index_equal(result, expected)
+
+ i1 = Int64Index(np.arange(0, 20, 2))
+ i2 = timedelta_range(start='1 day', periods=10, freq='D')
+        i1.union(i2)  # Works
+        # previously failed with "AttributeError: can't set attribute"
+        i2.union(i1)
+
+ def test_union_coverage(self):
+
+ idx = TimedeltaIndex(['3d', '1d', '2d'])
+ ordered = TimedeltaIndex(idx.sort_values(), freq='infer')
+ result = ordered.union(idx)
+ tm.assert_index_equal(result, ordered)
+
+ result = ordered[:0].union(ordered)
+ tm.assert_index_equal(result, ordered)
+ assert result.freq == ordered.freq
+
+ def test_union_bug_1730(self):
+
+ rng_a = timedelta_range('1 day', periods=4, freq='3H')
+ rng_b = timedelta_range('1 day', periods=4, freq='4H')
+
+ result = rng_a.union(rng_b)
+ exp = TimedeltaIndex(sorted(set(list(rng_a)) | set(list(rng_b))))
+ tm.assert_index_equal(result, exp)
+
+ def test_union_bug_1745(self):
+
+ left = TimedeltaIndex(['1 day 15:19:49.695000'])
+ right = TimedeltaIndex(['2 day 13:04:21.322000',
+ '1 day 15:27:24.873000',
+ '1 day 15:31:05.350000'])
+
+ result = left.union(right)
+ exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right))))
+ tm.assert_index_equal(result, exp)
+
+ def test_union_bug_4564(self):
+
+ left = timedelta_range("1 day", "30d")
+ right = left + pd.offsets.Minute(15)
+
+ result = left.union(right)
+ exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right))))
+ tm.assert_index_equal(result, exp)
+
+ def test_intersection_bug_1708(self):
+ index_1 = timedelta_range('1 day', periods=4, freq='h')
+ index_2 = index_1 + pd.offsets.Hour(5)
+
+ result = index_1 & index_2
+ assert len(result) == 0
+
+ index_1 = timedelta_range('1 day', periods=4, freq='h')
+ index_2 = index_1 + pd.offsets.Hour(1)
+
+ result = index_1 & index_2
+ expected = timedelta_range('1 day 01:00:00', periods=3, freq='h')
+ tm.assert_index_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_timedelta.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_timedelta.py
new file mode 100644
index 00000000000..79210705103
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_timedelta.py
@@ -0,0 +1,335 @@
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+ DataFrame, Index, Int64Index, Series, Timedelta, TimedeltaIndex,
+ date_range, timedelta_range)
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_index_equal, assert_series_equal)
+
+from ..datetimelike import DatetimeLike
+
+randn = np.random.randn
+
+
+class TestTimedeltaIndex(DatetimeLike):
+ _holder = TimedeltaIndex
+
+ def setup_method(self, method):
+ self.indices = dict(index=tm.makeTimedeltaIndex(10))
+ self.setup_indices()
+
+ def create_index(self):
+ return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1)
+
+ def test_numeric_compat(self):
+ # Dummy method to override super's version; this test is now done
+ # in test_arithmetic.py
+ pass
+
+ def test_shift(self):
+ pass # this is handled in test_arithmetic.py
+
+ def test_pickle_compat_construction(self):
+ pass
+
+ def test_fillna_timedelta(self):
+ # GH 11343
+ idx = pd.TimedeltaIndex(['1 day', pd.NaT, '3 day'])
+
+ exp = pd.TimedeltaIndex(['1 day', '2 day', '3 day'])
+ tm.assert_index_equal(idx.fillna(pd.Timedelta('2 day')), exp)
+
+ exp = pd.TimedeltaIndex(['1 day', '3 hour', '3 day'])
+ idx.fillna(pd.Timedelta('3 hour'))
+
+ exp = pd.Index(
+ [pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object)
+ tm.assert_index_equal(idx.fillna('x'), exp)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_freq(self, sort):
+ # GH14323: Difference of TimedeltaIndex should not preserve frequency
+
+ index = timedelta_range("0 days", "5 days", freq="D")
+
+ other = timedelta_range("1 days", "4 days", freq="D")
+ expected = TimedeltaIndex(["0 days", "5 days"], freq=None)
+ idx_diff = index.difference(other, sort)
+ tm.assert_index_equal(idx_diff, expected)
+ tm.assert_attr_equal('freq', idx_diff, expected)
+
+ other = timedelta_range("2 days", "5 days", freq="D")
+ idx_diff = index.difference(other, sort)
+ expected = TimedeltaIndex(["0 days", "1 days"], freq=None)
+ tm.assert_index_equal(idx_diff, expected)
+ tm.assert_attr_equal('freq', idx_diff, expected)
+
+ @pytest.mark.parametrize("sort", [None, False])
+ def test_difference_sort(self, sort):
+
+ index = pd.TimedeltaIndex(["5 days", "3 days", "2 days", "4 days",
+ "1 days", "0 days"])
+
+ other = timedelta_range("1 days", "4 days", freq="D")
+ idx_diff = index.difference(other, sort)
+
+ expected = TimedeltaIndex(["5 days", "0 days"], freq=None)
+
+ if sort is None:
+ expected = expected.sort_values()
+
+ tm.assert_index_equal(idx_diff, expected)
+ tm.assert_attr_equal('freq', idx_diff, expected)
+
+ other = timedelta_range("2 days", "5 days", freq="D")
+ idx_diff = index.difference(other, sort)
+ expected = TimedeltaIndex(["1 days", "0 days"], freq=None)
+
+ if sort is None:
+ expected = expected.sort_values()
+
+ tm.assert_index_equal(idx_diff, expected)
+ tm.assert_attr_equal('freq', idx_diff, expected)
+
+ def test_isin(self):
+
+ index = tm.makeTimedeltaIndex(4)
+ result = index.isin(index)
+ assert result.all()
+
+ result = index.isin(list(index))
+ assert result.all()
+
+ assert_almost_equal(index.isin([index[2], 5]),
+ np.array([False, False, True, False]))
+
+ def test_factorize(self):
+ idx1 = TimedeltaIndex(['1 day', '1 day', '2 day', '2 day', '3 day',
+ '3 day'])
+
+ exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp)
+ exp_idx = TimedeltaIndex(['1 day', '2 day', '3 day'])
+
+ arr, idx = idx1.factorize()
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, exp_idx)
+
+ arr, idx = idx1.factorize(sort=True)
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, exp_idx)
+
+ # freq must be preserved
+ idx3 = timedelta_range('1 day', periods=4, freq='s')
+ exp_arr = np.array([0, 1, 2, 3], dtype=np.intp)
+ arr, idx = idx3.factorize()
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ tm.assert_index_equal(idx, idx3)
+
+ def test_join_self(self, join_type):
+ index = timedelta_range('1 day', periods=10)
+ joined = index.join(index, how=join_type)
+ tm.assert_index_equal(index, joined)
+
+ def test_does_not_convert_mixed_integer(self):
+ df = tm.makeCustomDataframe(10, 10,
+ data_gen_f=lambda *args, **kwargs: randn(),
+ r_idx_type='i', c_idx_type='td')
+ str(df)
+
+ cols = df.columns.join(df.index, how='outer')
+ joined = cols.join(df.columns)
+ assert cols.dtype == np.dtype('O')
+ assert cols.dtype == joined.dtype
+ tm.assert_index_equal(cols, joined)
+
+ def test_sort_values(self):
+
+ idx = TimedeltaIndex(['4d', '1d', '2d'])
+
+ ordered = idx.sort_values()
+ assert ordered.is_monotonic
+
+ ordered = idx.sort_values(ascending=False)
+ assert ordered[::-1].is_monotonic
+
+ ordered, dexer = idx.sort_values(return_indexer=True)
+ assert ordered.is_monotonic
+
+ tm.assert_numpy_array_equal(dexer, np.array([1, 2, 0]),
+ check_dtype=False)
+
+ ordered, dexer = idx.sort_values(return_indexer=True, ascending=False)
+ assert ordered[::-1].is_monotonic
+
+ tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1]),
+ check_dtype=False)
+
+ def test_get_duplicates(self):
+ idx = TimedeltaIndex(['1 day', '2 day', '2 day', '3 day', '3day',
+ '4day'])
+
+ with tm.assert_produces_warning(FutureWarning):
+ # Deprecated - see GH20239
+ result = idx.get_duplicates()
+
+ ex = TimedeltaIndex(['2 day', '3day'])
+ tm.assert_index_equal(result, ex)
+
+ def test_argmin_argmax(self):
+ idx = TimedeltaIndex(['1 day 00:00:05', '1 day 00:00:01',
+ '1 day 00:00:02'])
+ assert idx.argmin() == 1
+ assert idx.argmax() == 0
+
+ def test_misc_coverage(self):
+
+ rng = timedelta_range('1 day', periods=5)
+ result = rng.groupby(rng.days)
+ assert isinstance(list(result.values())[0][0], Timedelta)
+
+ idx = TimedeltaIndex(['3d', '1d', '2d'])
+ assert not idx.equals(list(idx))
+
+ non_td = Index(list('abc'))
+ assert not idx.equals(list(non_td))
+
+ def test_map(self):
+        # dict-like inputs are covered by test_map_dictlike
+
+ rng = timedelta_range('1 day', periods=10)
+
+ f = lambda x: x.days
+ result = rng.map(f)
+ exp = Int64Index([f(x) for x in rng])
+ tm.assert_index_equal(result, exp)
+
+ def test_pass_TimedeltaIndex_to_index(self):
+
+ rng = timedelta_range('1 days', '10 days')
+ idx = Index(rng, dtype=object)
+
+ expected = Index(rng.to_pytimedelta(), dtype=object)
+
+ tm.assert_numpy_array_equal(idx.values, expected.values)
+
+ def test_pickle(self):
+
+ rng = timedelta_range('1 days', periods=10)
+ rng_p = tm.round_trip_pickle(rng)
+ tm.assert_index_equal(rng, rng_p)
+
+ def test_hash_error(self):
+ index = timedelta_range('1 days', periods=10)
+ with pytest.raises(TypeError, match=("unhashable type: %r" %
+ type(index).__name__)):
+ hash(index)
+
+ def test_append_join_nondatetimeindex(self):
+ rng = timedelta_range('1 days', periods=10)
+ idx = Index(['a', 'b', 'c', 'd'])
+
+ result = rng.append(idx)
+ assert isinstance(result[0], Timedelta)
+
+ # it works
+ rng.join(idx, how='outer')
+
+ def test_append_numpy_bug_1681(self):
+
+ td = timedelta_range('1 days', '10 days', freq='2D')
+ a = DataFrame()
+ c = DataFrame({'A': 'foo', 'B': td}, index=td)
+ str(c)
+
+ result = a.append(c)
+ assert (result['B'] == td).all()
+
+ def test_fields(self):
+ rng = timedelta_range('1 days, 10:11:12.100123456', periods=2,
+ freq='s')
+ tm.assert_index_equal(rng.days, Index([1, 1], dtype='int64'))
+ tm.assert_index_equal(
+ rng.seconds,
+ Index([10 * 3600 + 11 * 60 + 12, 10 * 3600 + 11 * 60 + 13],
+ dtype='int64'))
+ tm.assert_index_equal(
+ rng.microseconds,
+ Index([100 * 1000 + 123, 100 * 1000 + 123], dtype='int64'))
+ tm.assert_index_equal(rng.nanoseconds,
+ Index([456, 456], dtype='int64'))
+
+ pytest.raises(AttributeError, lambda: rng.hours)
+ pytest.raises(AttributeError, lambda: rng.minutes)
+ pytest.raises(AttributeError, lambda: rng.milliseconds)
+
+ # with nat
+ s = Series(rng)
+ s[1] = np.nan
+
+ tm.assert_series_equal(s.dt.days, Series([1, np.nan], index=[0, 1]))
+ tm.assert_series_equal(s.dt.seconds, Series(
+ [10 * 3600 + 11 * 60 + 12, np.nan], index=[0, 1]))
+
+ # preserve name (GH15589)
+ rng.name = 'name'
+ assert rng.days.name == 'name'
+
+ def test_freq_conversion(self):
+
+ # doc example
+
+ # series
+ td = Series(date_range('20130101', periods=4)) - \
+ Series(date_range('20121201', periods=4))
+ td[2] += timedelta(minutes=5, seconds=3)
+ td[3] = np.nan
+
+ result = td / np.timedelta64(1, 'D')
+ expected = Series([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan
+ ])
+ assert_series_equal(result, expected)
+
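+        # casting to a coarser unit truncates the fractional part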
+ result = td.astype('timedelta64[D]')
+ expected = Series([31, 31, 31, np.nan])
+ assert_series_equal(result, expected)
+
+ result = td / np.timedelta64(1, 's')
+ expected = Series([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3,
+ np.nan])
+ assert_series_equal(result, expected)
+
+ result = td.astype('timedelta64[s]')
+ assert_series_equal(result, expected)
+
+ # tdi
+ td = TimedeltaIndex(td)
+
+ result = td / np.timedelta64(1, 'D')
+ expected = Index([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan])
+ assert_index_equal(result, expected)
+
+ result = td.astype('timedelta64[D]')
+ expected = Index([31, 31, 31, np.nan])
+ assert_index_equal(result, expected)
+
+ result = td / np.timedelta64(1, 's')
+ expected = Index([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3,
+ np.nan])
+ assert_index_equal(result, expected)
+
+ result = td.astype('timedelta64[s]')
+ assert_index_equal(result, expected)
+
+
+class TestTimeSeries(object):
+
+ def test_series_box_timedelta(self):
+ rng = timedelta_range('1 day 1 s', periods=5, freq='h')
+ s = Series(rng)
+ assert isinstance(s[1], Timedelta)
+ assert isinstance(s.iat[2], Timedelta)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_timedelta_range.py
new file mode 100644
index 00000000000..1c06abad1ab
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_timedelta_range.py
@@ -0,0 +1,79 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import timedelta_range, to_timedelta
+import pandas.util.testing as tm
+
+from pandas.tseries.offsets import Day, Second
+
+
+class TestTimedeltas(object):
+
+ def test_timedelta_range(self):
+
+ expected = to_timedelta(np.arange(5), unit='D')
+ result = timedelta_range('0 days', periods=5, freq='D')
+ tm.assert_index_equal(result, expected)
+
+ expected = to_timedelta(np.arange(11), unit='D')
+ result = timedelta_range('0 days', '10 days', freq='D')
+ tm.assert_index_equal(result, expected)
+
+ expected = to_timedelta(np.arange(5), unit='D') + Second(2) + Day()
+ result = timedelta_range('1 days, 00:00:02', '5 days, 00:00:02',
+ freq='D')
+ tm.assert_index_equal(result, expected)
+
+ expected = to_timedelta([1, 3, 5, 7, 9], unit='D') + Second(2)
+ result = timedelta_range('1 days, 00:00:02', periods=5, freq='2D')
+ tm.assert_index_equal(result, expected)
+
+ expected = to_timedelta(np.arange(50), unit='T') * 30
+ result = timedelta_range('0 days', freq='30T', periods=50)
+ tm.assert_index_equal(result, expected)
+
+ # GH 11776
+ arr = np.arange(10).reshape(2, 5)
+ df = pd.DataFrame(np.arange(10).reshape(2, 5))
+ for arg in (arr, df):
+ with pytest.raises(TypeError, match="1-d array"):
+ to_timedelta(arg)
+ for errors in ['ignore', 'raise', 'coerce']:
+ with pytest.raises(TypeError, match="1-d array"):
+ to_timedelta(arg, errors=errors)
+
+ # issue10583
+ df = pd.DataFrame(np.random.normal(size=(10, 4)))
+ df.index = pd.timedelta_range(start='0s', periods=10, freq='s')
+ expected = df.loc[pd.Timedelta('0s'):, :]
+ result = df.loc['0s':, :]
+ tm.assert_frame_equal(expected, result)
+
+ @pytest.mark.parametrize('periods, freq', [
+ (3, '2D'), (5, 'D'), (6, '19H12T'), (7, '16H'), (9, '12H')])
+ def test_linspace_behavior(self, periods, freq):
+ # GH 20976
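+        # with no freq, the periods are spaced evenly from start to end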
+ result = timedelta_range(start='0 days', end='4 days', periods=periods)
+ expected = timedelta_range(start='0 days', end='4 days', freq=freq)
+ tm.assert_index_equal(result, expected)
+
+ def test_errors(self):
+ # not enough params
+ msg = ('Of the four parameters: start, end, periods, and freq, '
+ 'exactly three must be specified')
+ with pytest.raises(ValueError, match=msg):
+ timedelta_range(start='0 days')
+
+ with pytest.raises(ValueError, match=msg):
+ timedelta_range(end='5 days')
+
+ with pytest.raises(ValueError, match=msg):
+ timedelta_range(periods=2)
+
+ with pytest.raises(ValueError, match=msg):
+ timedelta_range()
+
+ # too many params
+ with pytest.raises(ValueError, match=msg):
+ timedelta_range(start='0 days', end='5 days', periods=10, freq='H')
diff --git a/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_tools.py b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_tools.py
new file mode 100644
index 00000000000..45a6e2e7a8a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexes/timedeltas/test_tools.py
@@ -0,0 +1,182 @@
+from datetime import time, timedelta
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import iNaT
+
+import pandas as pd
+from pandas import Series, TimedeltaIndex, isna, to_timedelta
+import pandas.util.testing as tm
+from pandas.util.testing import assert_series_equal
+
+
+class TestTimedeltas(object):
+
+ def test_to_timedelta(self):
+ def conv(v):
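+            # normalize to a nanosecond-resolution timedelta64 for comparison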
+ return v.astype('m8[ns]')
+
+ d1 = np.timedelta64(1, 'D')
+
+ assert (to_timedelta('1 days 06:05:01.00003', box=False) ==
+ conv(d1 + np.timedelta64(6 * 3600 + 5 * 60 + 1, 's') +
+ np.timedelta64(30, 'us')))
+ assert (to_timedelta('15.5us', box=False) ==
+ conv(np.timedelta64(15500, 'ns')))
+
+ # empty string
+ result = to_timedelta('', box=False)
+ assert result.astype('int64') == iNaT
+
+ result = to_timedelta(['', ''])
+ assert isna(result).all()
+
+ # pass thru
+ result = to_timedelta(np.array([np.timedelta64(1, 's')]))
+ expected = pd.Index(np.array([np.timedelta64(1, 's')]))
+ tm.assert_index_equal(result, expected)
+
+ # ints
+ result = np.timedelta64(0, 'ns')
+ expected = to_timedelta(0, box=False)
+ assert result == expected
+
+ # Series
+ expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)])
+ result = to_timedelta(Series(['1d', '1days 00:00:01']))
+ tm.assert_series_equal(result, expected)
+
+ # with units
+ result = TimedeltaIndex([np.timedelta64(0, 'ns'), np.timedelta64(
+ 10, 's').astype('m8[ns]')])
+ expected = to_timedelta([0, 10], unit='s')
+ tm.assert_index_equal(result, expected)
+
+ # single element conversion
+ v = timedelta(seconds=1)
+ result = to_timedelta(v, box=False)
+ expected = np.timedelta64(timedelta(seconds=1))
+ assert result == expected
+
+ v = np.timedelta64(timedelta(seconds=1))
+ result = to_timedelta(v, box=False)
+ expected = np.timedelta64(timedelta(seconds=1))
+ assert result == expected
+
+ # arrays of various dtypes
+ arr = np.array([1] * 5, dtype='int64')
+ result = to_timedelta(arr, unit='s')
+ expected = TimedeltaIndex([np.timedelta64(1, 's')] * 5)
+ tm.assert_index_equal(result, expected)
+
+ arr = np.array([1] * 5, dtype='int64')
+ result = to_timedelta(arr, unit='m')
+ expected = TimedeltaIndex([np.timedelta64(1, 'm')] * 5)
+ tm.assert_index_equal(result, expected)
+
+ arr = np.array([1] * 5, dtype='int64')
+ result = to_timedelta(arr, unit='h')
+ expected = TimedeltaIndex([np.timedelta64(1, 'h')] * 5)
+ tm.assert_index_equal(result, expected)
+
+ arr = np.array([1] * 5, dtype='timedelta64[s]')
+ result = to_timedelta(arr)
+ expected = TimedeltaIndex([np.timedelta64(1, 's')] * 5)
+ tm.assert_index_equal(result, expected)
+
+ arr = np.array([1] * 5, dtype='timedelta64[D]')
+ result = to_timedelta(arr)
+ expected = TimedeltaIndex([np.timedelta64(1, 'D')] * 5)
+ tm.assert_index_equal(result, expected)
+
+        # Test with list-like inputs when box=False
+ expected = np.array(np.arange(3) * 1000000000, dtype='timedelta64[ns]')
+ result = to_timedelta(range(3), unit='s', box=False)
+ tm.assert_numpy_array_equal(expected, result)
+
+ result = to_timedelta(np.arange(3), unit='s', box=False)
+ tm.assert_numpy_array_equal(expected, result)
+
+ result = to_timedelta([0, 1, 2], unit='s', box=False)
+ tm.assert_numpy_array_equal(expected, result)
+
+ # Tests with fractional seconds as input:
+ expected = np.array(
+ [0, 500000000, 800000000, 1200000000], dtype='timedelta64[ns]')
+ result = to_timedelta([0., 0.5, 0.8, 1.2], unit='s', box=False)
+ tm.assert_numpy_array_equal(expected, result)
+
+ def test_to_timedelta_invalid(self):
+
+ # bad value for errors parameter
+ msg = "errors must be one of"
+ with pytest.raises(ValueError, match=msg):
+ to_timedelta(['foo'], errors='never')
+
+ # these will error
+ pytest.raises(ValueError, lambda: to_timedelta([1, 2], unit='foo'))
+ pytest.raises(ValueError, lambda: to_timedelta(1, unit='foo'))
+
+ # time not supported ATM
+ pytest.raises(ValueError, lambda: to_timedelta(time(second=1)))
+ assert to_timedelta(time(second=1), errors='coerce') is pd.NaT
+
+ pytest.raises(ValueError, lambda: to_timedelta(['foo', 'bar']))
+ tm.assert_index_equal(TimedeltaIndex([pd.NaT, pd.NaT]),
+ to_timedelta(['foo', 'bar'], errors='coerce'))
+
+ tm.assert_index_equal(TimedeltaIndex(['1 day', pd.NaT, '1 min']),
+ to_timedelta(['1 day', 'bar', '1 min'],
+ errors='coerce'))
+
+ # gh-13613: these should not error because errors='ignore'
+ invalid_data = 'apple'
+ assert invalid_data == to_timedelta(invalid_data, errors='ignore')
+
+ invalid_data = ['apple', '1 days']
+ tm.assert_numpy_array_equal(
+ np.array(invalid_data, dtype=object),
+ to_timedelta(invalid_data, errors='ignore'))
+
+ invalid_data = pd.Index(['apple', '1 days'])
+ tm.assert_index_equal(invalid_data, to_timedelta(
+ invalid_data, errors='ignore'))
+
+ invalid_data = Series(['apple', '1 days'])
+ tm.assert_series_equal(invalid_data, to_timedelta(
+ invalid_data, errors='ignore'))
+
+ def test_to_timedelta_via_apply(self):
+ # GH 5458
+ expected = Series([np.timedelta64(1, 's')])
+ result = Series(['00:00:01']).apply(to_timedelta)
+ tm.assert_series_equal(result, expected)
+
+ result = Series([to_timedelta('00:00:01')])
+ tm.assert_series_equal(result, expected)
+
+ def test_to_timedelta_on_missing_values(self):
+ # GH5438
+ timedelta_NaT = np.timedelta64('NaT')
+
+ actual = pd.to_timedelta(Series(['00:00:01', np.nan]))
+ expected = Series([np.timedelta64(1000000000, 'ns'),
+ timedelta_NaT], dtype='<m8[ns]')
+ assert_series_equal(actual, expected)
+
+ actual = pd.to_timedelta(Series(['00:00:01', pd.NaT]))
+ assert_series_equal(actual, expected)
+
+ actual = pd.to_timedelta(np.nan)
+ assert actual.value == timedelta_NaT.astype('int64')
+
+ actual = pd.to_timedelta(pd.NaT)
+ assert actual.value == timedelta_NaT.astype('int64')
+
+ def test_to_timedelta_float(self):
+ # https://github.com/pandas-dev/pandas/issues/25077
+ arr = np.arange(0, 1, 1e-6)[-10:]
+ result = pd.to_timedelta(arr, unit='s')
+ expected_asi8 = np.arange(999990000, int(1e9), 1000, dtype='int64')
+ tm.assert_numpy_array_equal(result.asi8, expected_asi8)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/__init__.py b/contrib/python/pandas/py2/pandas/tests/indexing/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/common.py b/contrib/python/pandas/py2/pandas/tests/indexing/common.py
new file mode 100644
index 00000000000..f4d6fe42851
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/common.py
@@ -0,0 +1,307 @@
+""" common utilities """
+
+import itertools
+from warnings import catch_warnings, filterwarnings
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+
+from pandas.core.dtypes.common import is_scalar
+
+from pandas import (
+ DataFrame, Float64Index, MultiIndex, Panel, Series, UInt64Index,
+ date_range)
+from pandas.util import testing as tm
+
+from pandas.io.formats.printing import pprint_thing
+
+_verbose = False
+
+
+def _mklbl(prefix, n):
+ return ["%s%s" % (prefix, i) for i in range(n)]
+
+
+def _axify(obj, key, axis):
+ # create a tuple accessor
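+    # e.g. key k on axis=1 of a 2-D object -> (slice(None), k)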
+ axes = [slice(None)] * obj.ndim
+ axes[axis] = key
+ return tuple(axes)
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class Base(object):
+ """ indexing comprehensive base class """
+
+ _objs = {'series', 'frame', 'panel'}
+ _typs = {'ints', 'uints', 'labels', 'mixed', 'ts', 'floats', 'empty',
+ 'ts_rev', 'multi'}
+
+ def setup_method(self, method):
+
+ self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2))
+ self.frame_ints = DataFrame(np.random.randn(4, 4),
+ index=lrange(0, 8, 2),
+ columns=lrange(0, 12, 3))
+ with catch_warnings(record=True):
+ self.panel_ints = Panel(np.random.rand(4, 4, 4),
+ items=lrange(0, 8, 2),
+ major_axis=lrange(0, 12, 3),
+ minor_axis=lrange(0, 16, 4))
+
+ self.series_uints = Series(np.random.rand(4),
+ index=UInt64Index(lrange(0, 8, 2)))
+ self.frame_uints = DataFrame(np.random.randn(4, 4),
+ index=UInt64Index(lrange(0, 8, 2)),
+ columns=UInt64Index(lrange(0, 12, 3)))
+ self.panel_uints = Panel(np.random.rand(4, 4, 4),
+ items=UInt64Index(lrange(0, 8, 2)),
+ major_axis=UInt64Index(lrange(0, 12, 3)),
+ minor_axis=UInt64Index(lrange(0, 16, 4)))
+
+ self.series_floats = Series(np.random.rand(4),
+ index=Float64Index(range(0, 8, 2)))
+ self.frame_floats = DataFrame(np.random.randn(4, 4),
+ index=Float64Index(range(0, 8, 2)),
+ columns=Float64Index(range(0, 12, 3)))
+ self.panel_floats = Panel(np.random.rand(4, 4, 4),
+ items=Float64Index(range(0, 8, 2)),
+ major_axis=Float64Index(range(0, 12, 3)),
+ minor_axis=Float64Index(range(0, 16, 4)))
+
+ m_idces = [MultiIndex.from_product([[1, 2], [3, 4]]),
+ MultiIndex.from_product([[5, 6], [7, 8]]),
+ MultiIndex.from_product([[9, 10], [11, 12]])]
+
+ self.series_multi = Series(np.random.rand(4),
+ index=m_idces[0])
+ self.frame_multi = DataFrame(np.random.randn(4, 4),
+ index=m_idces[0],
+ columns=m_idces[1])
+ self.panel_multi = Panel(np.random.rand(4, 4, 4),
+ items=m_idces[0],
+ major_axis=m_idces[1],
+ minor_axis=m_idces[2])
+
+ self.series_labels = Series(np.random.randn(4), index=list('abcd'))
+ self.frame_labels = DataFrame(np.random.randn(4, 4),
+ index=list('abcd'), columns=list('ABCD'))
+ self.panel_labels = Panel(np.random.randn(4, 4, 4),
+ items=list('abcd'),
+ major_axis=list('ABCD'),
+ minor_axis=list('ZYXW'))
+
+ self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8])
+ self.frame_mixed = DataFrame(np.random.randn(4, 4),
+ index=[2, 4, 'null', 8])
+ self.panel_mixed = Panel(np.random.randn(4, 4, 4),
+ items=[2, 4, 'null', 8])
+
+ self.series_ts = Series(np.random.randn(4),
+ index=date_range('20130101', periods=4))
+ self.frame_ts = DataFrame(np.random.randn(4, 4),
+ index=date_range('20130101', periods=4))
+ self.panel_ts = Panel(np.random.randn(4, 4, 4),
+ items=date_range('20130101', periods=4))
+
+ dates_rev = (date_range('20130101', periods=4)
+ .sort_values(ascending=False))
+ self.series_ts_rev = Series(np.random.randn(4),
+ index=dates_rev)
+ self.frame_ts_rev = DataFrame(np.random.randn(4, 4),
+ index=dates_rev)
+ self.panel_ts_rev = Panel(np.random.randn(4, 4, 4),
+ items=dates_rev)
+
+ self.frame_empty = DataFrame({})
+ self.series_empty = Series({})
+ self.panel_empty = Panel({})
+
+ # form agglomerates
+ for o in self._objs:
+
+ d = dict()
+ for t in self._typs:
+ d[t] = getattr(self, '%s_%s' % (o, t), None)
+
+ setattr(self, o, d)
+
+ def generate_indices(self, f, values=False):
+ """ generate the indices
+        if values is True, use the axis values;
+        if False, use the range
+ """
+
+ axes = f.axes
+ if values:
+ axes = [lrange(len(a)) for a in axes]
+
+ return itertools.product(*axes)
+
+ def get_result(self, obj, method, key, axis):
+ """ return the result for this obj with this key and this axis """
+
+ if isinstance(key, dict):
+ key = key[axis]
+
+ # use an artificial conversion to map the key as integers to the labels
+ # so ix can work for comparisons
+ if method == 'indexer':
+ method = 'ix'
+ key = obj._get_axis(axis)[key]
+
+ # in case we actually want 0 index slicing
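+        # (fall back to plain-key access when tuple access is unsupported)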
+ with catch_warnings(record=True):
+ try:
+ xp = getattr(obj, method).__getitem__(_axify(obj, key, axis))
+ except AttributeError:
+ xp = getattr(obj, method).__getitem__(key)
+
+ return xp
+
+ def get_value(self, f, i, values=False):
+ """ return the value for the location i """
+
+ # check against values
+ if values:
+ return f.values[i]
+
+ # this is equiv of f[col][row].....
+ # v = f
+ # for a in reversed(i):
+ # v = v.__getitem__(a)
+ # return v
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\n.ix", DeprecationWarning)
+ return f.ix[i]
+
+ def check_values(self, f, func, values=False):
+
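+        # walk every coordinate, comparing indexed access against the raw
+        # values (values=True) or a chain of nested __getitem__ calls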
+ if f is None:
+ return
+ axes = f.axes
+        indices = itertools.product(*axes)
+
+        for i in indices:
+ result = getattr(f, func)[i]
+
+ # check against values
+ if values:
+ expected = f.values[i]
+ else:
+ expected = f
+ for a in reversed(i):
+ expected = expected.__getitem__(a)
+
+ tm.assert_almost_equal(result, expected)
+
+ def check_result(self, name, method1, key1, method2, key2, typs=None,
+ objs=None, axes=None, fails=None):
+ def _eq(t, o, a, obj, k1, k2):
+ """ compare equal for these 2 keys """
+
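+            # skip axes the object does not actually have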
+ if a is not None and a > obj.ndim - 1:
+ return
+
+ def _print(result, error=None):
+ if error is not None:
+ error = str(error)
+ v = ("%-16.16s [%-16.16s]: [typ->%-8.8s,obj->%-8.8s,"
+ "key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" %
+ (name, result, t, o, method1, method2, a, error or ''))
+ if _verbose:
+ pprint_thing(v)
+
+ try:
+ rs = getattr(obj, method1).__getitem__(_axify(obj, k1, a))
+
+ try:
+ xp = self.get_result(obj, method2, k2, a)
+ except Exception:
+ result = 'no comp'
+ _print(result)
+ return
+
+ detail = None
+
+ try:
+ if is_scalar(rs) and is_scalar(xp):
+ assert rs == xp
+ elif xp.ndim == 1:
+ tm.assert_series_equal(rs, xp)
+ elif xp.ndim == 2:
+ tm.assert_frame_equal(rs, xp)
+ elif xp.ndim == 3:
+ tm.assert_panel_equal(rs, xp)
+ result = 'ok'
+ except AssertionError as e:
+ detail = str(e)
+ result = 'fail'
+
+ # reverse the checks
+ if fails is True:
+ if result == 'fail':
+ result = 'ok (fail)'
+
+ _print(result)
+ if not result.startswith('ok'):
+ raise AssertionError(detail)
+
+ except AssertionError:
+ raise
+ except Exception as detail:
+
+            # if we are in fails, it's ok, otherwise raise it
+ if fails is not None:
+ if isinstance(detail, fails):
+ result = 'ok (%s)' % type(detail).__name__
+ _print(result)
+ return
+
+ result = type(detail).__name__
+ raise AssertionError(_print(result, error=detail))
+
+ if typs is None:
+ typs = self._typs
+
+ if objs is None:
+ objs = self._objs
+
+ if axes is not None:
+ if not isinstance(axes, (tuple, list)):
+ axes = [axes]
+ else:
+ axes = list(axes)
+ else:
+ axes = [0, 1, 2]
+
+ # check
+ for o in objs:
+ if o not in self._objs:
+ continue
+
+ d = getattr(self, o)
+ for a in axes:
+ for t in typs:
+ if t not in self._typs:
+ continue
+
+ obj = d[t]
+ if obj is None:
+ continue
+
+ def _call(obj=obj):
+ obj = obj.copy()
+
+ k2 = key2
+ _eq(t, o, a, obj, key1, k2)
+
+ # Panel deprecations
+ if isinstance(obj, Panel):
+ with catch_warnings():
+ filterwarnings("ignore", "\nPanel*", FutureWarning)
+ _call()
+ else:
+ _call()
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/conftest.py b/contrib/python/pandas/py2/pandas/tests/indexing/conftest.py
new file mode 100644
index 00000000000..be1cf4800a2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/conftest.py
@@ -0,0 +1,20 @@
+import numpy as np
+import pytest
+
+from pandas._libs import index as libindex
+
+
[email protected](params=[
+    (libindex.Int64Engine, np.int64),
+ (libindex.Int32Engine, np.int32),
+ (libindex.Int16Engine, np.int16),
+ (libindex.Int8Engine, np.int8),
+ (libindex.UInt64Engine, np.uint64),
+ (libindex.UInt32Engine, np.uint32),
+ (libindex.UInt16Engine, np.uint16),
+ (libindex.UInt8Engine, np.uint8),
+ (libindex.Float64Engine, np.float64),
+ (libindex.Float32Engine, np.float32),
+], ids=lambda x: x[0].__name__)
+def numeric_indexing_engine_type_and_dtype(request):
+ return request.param
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/interval/__init__.py b/contrib/python/pandas/py2/pandas/tests/indexing/interval/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/interval/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/interval/test_interval.py b/contrib/python/pandas/py2/pandas/tests/indexing/interval/test_interval.py
new file mode 100644
index 00000000000..938caec006f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/interval/test_interval.py
@@ -0,0 +1,267 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Interval, IntervalIndex, Series
+import pandas.util.testing as tm
+
+
+class TestIntervalIndex(object):
+
+ def setup_method(self, method):
+ self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6)))
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_loc_with_scalar(self):
+
+ s = self.s
+
+ expected = s.iloc[:3]
+ tm.assert_series_equal(expected, s.loc[:3])
+ tm.assert_series_equal(expected, s.loc[:2.5])
+ tm.assert_series_equal(expected, s.loc[0.1:2.5])
+ tm.assert_series_equal(expected, s.loc[-1:3])
+
+ expected = s.iloc[1:4]
+ tm.assert_series_equal(expected, s.loc[[1.5, 2.5, 3.5]])
+ tm.assert_series_equal(expected, s.loc[[2, 3, 4]])
+ tm.assert_series_equal(expected, s.loc[[1.5, 3, 4]])
+
+ expected = s.iloc[2:5]
+ tm.assert_series_equal(expected, s.loc[s >= 2])
+
+ # TODO: check this behavior is consistent with test_interval_new.py
+ def test_getitem_with_scalar(self):
+
+ s = self.s
+
+ expected = s.iloc[:3]
+ tm.assert_series_equal(expected, s[:3])
+ tm.assert_series_equal(expected, s[:2.5])
+ tm.assert_series_equal(expected, s[0.1:2.5])
+ tm.assert_series_equal(expected, s[-1:3])
+
+ expected = s.iloc[1:4]
+ tm.assert_series_equal(expected, s[[1.5, 2.5, 3.5]])
+ tm.assert_series_equal(expected, s[[2, 3, 4]])
+ tm.assert_series_equal(expected, s[[1.5, 3, 4]])
+
+ expected = s.iloc[2:5]
+ tm.assert_series_equal(expected, s[s >= 2])
+
+ # TODO: check this behavior is consistent with test_interval_new.py
+ @pytest.mark.parametrize('direction', ['increasing', 'decreasing'])
+ def test_nonoverlapping_monotonic(self, direction, closed):
+ tpls = [(0, 1), (2, 3), (4, 5)]
+ if direction == 'decreasing':
+ tpls = tpls[::-1]
+
+ idx = IntervalIndex.from_tuples(tpls, closed=closed)
+ s = Series(list('abc'), idx)
+
+ for key, expected in zip(idx.left, s):
+ if idx.closed_left:
+ assert s[key] == expected
+ assert s.loc[key] == expected
+ else:
+ with pytest.raises(KeyError):
+ s[key]
+ with pytest.raises(KeyError):
+ s.loc[key]
+
+ for key, expected in zip(idx.right, s):
+ if idx.closed_right:
+ assert s[key] == expected
+ assert s.loc[key] == expected
+ else:
+ with pytest.raises(KeyError):
+ s[key]
+ with pytest.raises(KeyError):
+ s.loc[key]
+
+ for key, expected in zip(idx.mid, s):
+ assert s[key] == expected
+ assert s.loc[key] == expected
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_with_interval(self):
+
+ s = self.s
+ expected = 0
+
+ result = s.loc[Interval(0, 1)]
+ assert result == expected
+
+ result = s[Interval(0, 1)]
+ assert result == expected
+
+ expected = s.iloc[3:5]
+ result = s.loc[Interval(3, 6)]
+ tm.assert_series_equal(expected, result)
+
+ expected = s.iloc[3:5]
+ result = s.loc[[Interval(3, 6)]]
+ tm.assert_series_equal(expected, result)
+
+ expected = s.iloc[3:5]
+ result = s.loc[[Interval(3, 5)]]
+ tm.assert_series_equal(expected, result)
+
+ # missing
+ with pytest.raises(KeyError):
+ s.loc[Interval(-2, 0)]
+
+ with pytest.raises(KeyError):
+ s[Interval(-2, 0)]
+
+ with pytest.raises(KeyError):
+ s.loc[Interval(5, 6)]
+
+ with pytest.raises(KeyError):
+ s[Interval(5, 6)]
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_with_slices(self):
+
+ s = self.s
+
+ # slice of interval
+ with pytest.raises(NotImplementedError):
+ s.loc[Interval(3, 6):]
+
+ with pytest.raises(NotImplementedError):
+ s[Interval(3, 6):]
+
+ expected = s.iloc[3:5]
+ result = s[[Interval(3, 6)]]
+ tm.assert_series_equal(expected, result)
+
+ # slice of scalar with step != 1
+ with pytest.raises(ValueError):
+ s[0:4:2]
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_with_overlaps(self):
+
+ s = self.s
+ expected = s.iloc[[3, 4, 3, 4]]
+ result = s.loc[[Interval(3, 6), Interval(3, 6)]]
+ tm.assert_series_equal(expected, result)
+
+ idx = IntervalIndex.from_tuples([(1, 5), (3, 7)])
+ s = Series(range(len(idx)), index=idx)
+
+ result = s[4]
+ expected = s
+ tm.assert_series_equal(expected, result)
+
+ result = s[[4]]
+ expected = s
+ tm.assert_series_equal(expected, result)
+
+ result = s.loc[[4]]
+ expected = s
+ tm.assert_series_equal(expected, result)
+
+ result = s[Interval(3, 5)]
+ expected = s
+ tm.assert_series_equal(expected, result)
+
+ result = s.loc[Interval(3, 5)]
+ expected = s
+ tm.assert_series_equal(expected, result)
+
+ # doesn't intersect unique set of intervals
+ with pytest.raises(KeyError):
+ s[[Interval(3, 5)]]
+
+ with pytest.raises(KeyError):
+ s.loc[[Interval(3, 5)]]
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_non_unique(self):
+
+ idx = IntervalIndex.from_tuples([(1, 3), (3, 7)])
+
+ s = Series(range(len(idx)), index=idx)
+
+ result = s.loc[Interval(1, 3)]
+ assert result == 0
+
+ result = s.loc[[Interval(1, 3)]]
+ expected = s.iloc[0:1]
+ tm.assert_series_equal(expected, result)
+
+ # To be removed, replaced by test_interval_new.py (see #16316, #16386)
+ def test_non_unique_moar(self):
+
+ idx = IntervalIndex.from_tuples([(1, 3), (1, 3), (3, 7)])
+ s = Series(range(len(idx)), index=idx)
+
+ result = s.loc[Interval(1, 3)]
+ expected = s.iloc[[0, 1]]
+ tm.assert_series_equal(expected, result)
+
+ # non-unique index and slices not allowed
+ with pytest.raises(ValueError):
+ s.loc[Interval(1, 3):]
+
+ with pytest.raises(ValueError):
+ s[Interval(1, 3):]
+
+ # non-unique
+ with pytest.raises(ValueError):
+ s[[Interval(1, 3)]]
+
+ # TODO: check this behavior is consistent with test_interval_new.py
+ def test_non_matching(self):
+ s = self.s
+
+ # this is a departure from our current
+        # indexing scheme, but simpler
+ with pytest.raises(KeyError):
+ s.loc[[-1, 3, 4, 5]]
+
+ with pytest.raises(KeyError):
+ s.loc[[-1, 3]]
+
+ def test_large_series(self):
+ s = Series(np.arange(1000000),
+ index=IntervalIndex.from_breaks(np.arange(1000001)))
+
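+        # open-ended, explicit-start and explicit-step spellings of the
+        # same slice should all resolve to the same rows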
+ result1 = s.loc[:80000]
+ result2 = s.loc[0:80000]
+ result3 = s.loc[0:80000:1]
+ tm.assert_series_equal(result1, result2)
+ tm.assert_series_equal(result1, result3)
+
+ def test_loc_getitem_frame(self):
+
+ df = DataFrame({'A': range(10)})
+ s = pd.cut(df.A, 5)
+ df['B'] = s
+ df = df.set_index('B')
+
+ result = df.loc[4]
+ expected = df.iloc[4:6]
+ tm.assert_frame_equal(result, expected)
+
+ with pytest.raises(KeyError):
+ df.loc[10]
+
+ # single list-like
+ result = df.loc[[4]]
+ expected = df.iloc[4:6]
+ tm.assert_frame_equal(result, expected)
+
+ # non-unique
+ result = df.loc[[4, 5]]
+ expected = df.take([4, 5, 4, 5])
+ tm.assert_frame_equal(result, expected)
+
+ with pytest.raises(KeyError):
+ df.loc[[10]]
+
+ # partial missing
+ with pytest.raises(KeyError):
+ df.loc[[10, 4]]
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/interval/test_interval_new.py b/contrib/python/pandas/py2/pandas/tests/indexing/interval/test_interval_new.py
new file mode 100644
index 00000000000..4b2ec0c4d17
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/interval/test_interval_new.py
@@ -0,0 +1,246 @@
+import numpy as np
+import pytest
+
+from pandas import Interval, IntervalIndex, Series
+import pandas.util.testing as tm
+
+pytestmark = pytest.mark.skip(reason="new indexing tests for issue 16316")
+
+
+class TestIntervalIndex(object):
+
+ def setup_method(self, method):
+ self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6)))
+
+ def test_loc_with_interval(self):
+
+ # loc with single label / list of labels:
+ # - Intervals: only exact matches
+ # - scalars: those that contain it
+
+ s = self.s
+
+ expected = 0
+ result = s.loc[Interval(0, 1)]
+ assert result == expected
+ result = s[Interval(0, 1)]
+ assert result == expected
+
+ expected = s.iloc[3:5]
+ result = s.loc[[Interval(3, 4), Interval(4, 5)]]
+ tm.assert_series_equal(expected, result)
+ result = s[[Interval(3, 4), Interval(4, 5)]]
+ tm.assert_series_equal(expected, result)
+
+ # missing or not exact
+ with pytest.raises(KeyError):
+ s.loc[Interval(3, 5, closed='left')]
+
+ with pytest.raises(KeyError):
+ s[Interval(3, 5, closed='left')]
+
+ with pytest.raises(KeyError):
+ s[Interval(3, 5)]
+
+ with pytest.raises(KeyError):
+ s.loc[Interval(3, 5)]
+
+ with pytest.raises(KeyError):
+ s[Interval(3, 5)]
+
+ with pytest.raises(KeyError):
+ s.loc[Interval(-2, 0)]
+
+ with pytest.raises(KeyError):
+ s[Interval(-2, 0)]
+
+ with pytest.raises(KeyError):
+ s.loc[Interval(5, 6)]
+
+ with pytest.raises(KeyError):
+ s[Interval(5, 6)]
+
+ def test_loc_with_scalar(self):
+
+ # loc with single label / list of labels:
+ # - Intervals: only exact matches
+ # - scalars: those that contain it
+
+ s = self.s
+
+ assert s.loc[1] == 0
+ assert s.loc[1.5] == 1
+ assert s.loc[2] == 1
+
+ # TODO with __getitem__ same rules as loc, or positional ?
+ # assert s[1] == 0
+ # assert s[1.5] == 1
+ # assert s[2] == 1
+
+ expected = s.iloc[1:4]
+ tm.assert_series_equal(expected, s.loc[[1.5, 2.5, 3.5]])
+ tm.assert_series_equal(expected, s.loc[[2, 3, 4]])
+ tm.assert_series_equal(expected, s.loc[[1.5, 3, 4]])
+
+ expected = s.iloc[[1, 1, 2, 1]]
+ tm.assert_series_equal(expected, s.loc[[1.5, 2, 2.5, 1.5]])
+
+ expected = s.iloc[2:5]
+ tm.assert_series_equal(expected, s.loc[s >= 2])
+
+ def test_loc_with_slices(self):
+
+ # loc with slices:
+ # - Interval objects: only works with exact matches
+ # - scalars: only works for non-overlapping, monotonic intervals,
+ # and start/stop select location based on the interval that
+ # contains them:
+        #   slice_loc(start, stop) == (idx.get_loc(start), idx.get_loc(stop))
+
+ s = self.s
+
+ # slice of interval
+
+ expected = s.iloc[:3]
+ result = s.loc[Interval(0, 1):Interval(2, 3)]
+ tm.assert_series_equal(expected, result)
+ result = s[Interval(0, 1):Interval(2, 3)]
+ tm.assert_series_equal(expected, result)
+
+ expected = s.iloc[4:]
+ result = s.loc[Interval(3, 4):]
+ tm.assert_series_equal(expected, result)
+ result = s[Interval(3, 4):]
+ tm.assert_series_equal(expected, result)
+
+ with pytest.raises(KeyError):
+ s.loc[Interval(3, 6):]
+
+ with pytest.raises(KeyError):
+ s[Interval(3, 6):]
+
+ with pytest.raises(KeyError):
+ s.loc[Interval(3, 4, closed='left'):]
+
+ with pytest.raises(KeyError):
+ s[Interval(3, 4, closed='left'):]
+
+ # TODO with non-existing intervals ?
+ # s.loc[Interval(-1, 0):Interval(2, 3)]
+
+ # slice of scalar
+
+ expected = s.iloc[:3]
+ tm.assert_series_equal(expected, s.loc[:3])
+ tm.assert_series_equal(expected, s.loc[:2.5])
+ tm.assert_series_equal(expected, s.loc[0.1:2.5])
+
+ # TODO should this work? (-1 is not contained in any of the Intervals)
+ # tm.assert_series_equal(expected, s.loc[-1:3])
+
+ # TODO with __getitem__ same rules as loc, or positional ?
+ # tm.assert_series_equal(expected, s[:3])
+ # tm.assert_series_equal(expected, s[:2.5])
+ # tm.assert_series_equal(expected, s[0.1:2.5])
+
+ # slice of scalar with step != 1
+ with pytest.raises(NotImplementedError):
+ s[0:4:2]
+
+ def test_loc_with_overlap(self):
+
+ idx = IntervalIndex.from_tuples([(1, 5), (3, 7)])
+ s = Series(range(len(idx)), index=idx)
+
+ # scalar
+ expected = s
+ result = s.loc[4]
+ tm.assert_series_equal(expected, result)
+
+ result = s[4]
+ tm.assert_series_equal(expected, result)
+
+ result = s.loc[[4]]
+ tm.assert_series_equal(expected, result)
+
+ result = s[[4]]
+ tm.assert_series_equal(expected, result)
+
+ # interval
+ expected = 0
+ result = s.loc[Interval(1, 5)]
+        assert result == expected
+
+        result = s[Interval(1, 5)]
+        assert result == expected
+
+ expected = s
+ result = s.loc[[Interval(1, 5), Interval(3, 7)]]
+ tm.assert_series_equal(expected, result)
+
+ result = s[[Interval(1, 5), Interval(3, 7)]]
+ tm.assert_series_equal(expected, result)
+
+ with pytest.raises(KeyError):
+ s.loc[Interval(3, 5)]
+
+ with pytest.raises(KeyError):
+ s.loc[[Interval(3, 5)]]
+
+ with pytest.raises(KeyError):
+ s[Interval(3, 5)]
+
+ with pytest.raises(KeyError):
+ s[[Interval(3, 5)]]
+
+ # slices with interval (only exact matches)
+ expected = s
+ result = s.loc[Interval(1, 5):Interval(3, 7)]
+ tm.assert_series_equal(expected, result)
+
+ result = s[Interval(1, 5):Interval(3, 7)]
+ tm.assert_series_equal(expected, result)
+
+ with pytest.raises(KeyError):
+ s.loc[Interval(1, 6):Interval(3, 8)]
+
+ with pytest.raises(KeyError):
+ s[Interval(1, 6):Interval(3, 8)]
+
+ # slices with scalar raise for overlapping intervals
+ # TODO KeyError is the appropriate error?
+ with pytest.raises(KeyError):
+ s.loc[1:4]
+
+ def test_non_unique(self):
+
+ idx = IntervalIndex.from_tuples([(1, 3), (3, 7)])
+ s = Series(range(len(idx)), index=idx)
+
+ result = s.loc[Interval(1, 3)]
+ assert result == 0
+
+ result = s.loc[[Interval(1, 3)]]
+ expected = s.iloc[0:1]
+ tm.assert_series_equal(expected, result)
+
+ def test_non_unique_moar(self):
+
+ idx = IntervalIndex.from_tuples([(1, 3), (1, 3), (3, 7)])
+ s = Series(range(len(idx)), index=idx)
+
+ expected = s.iloc[[0, 1]]
+ result = s.loc[Interval(1, 3)]
+ tm.assert_series_equal(expected, result)
+
+ expected = s
+ result = s.loc[Interval(1, 3):]
+ tm.assert_series_equal(expected, result)
+
+ expected = s
+ result = s[Interval(1, 3):]
+ tm.assert_series_equal(expected, result)
+
+ expected = s.iloc[[0, 1]]
+ result = s[[Interval(1, 3)]]
+ tm.assert_series_equal(expected, result)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/__init__.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/conftest.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/conftest.py
new file mode 100644
index 00000000000..545e092d9ce
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/conftest.py
@@ -0,0 +1,31 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Index, MultiIndex
+from pandas.util import testing as tm
+
+
+def multiindex_dataframe_random_data():
+ """DataFrame with 2 level MultiIndex with random data"""
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
+ 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ return DataFrame(np.random.randn(10, 3), index=index,
+ columns=Index(['A', 'B', 'C'], name='exp'))
+
+
+def multiindex_year_month_day_dataframe_random_data():
+ """DataFrame with 3 level MultiIndex (year, month, day) covering
+ first 100 business days from 2000-01-01 with random data"""
+ tdf = tm.makeTimeDataFrame(100)
+ ymd = tdf.groupby([lambda x: x.year, lambda x: x.month,
+ lambda x: x.day]).sum()
+ # use Int64Index, to make sure things work
+ ymd.index.set_levels([lev.astype('i8') for lev in ymd.index.levels],
+ inplace=True)
+ ymd.index.set_names(['year', 'month', 'day'], inplace=True)
+ return ymd
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_chaining_and_caching.py
new file mode 100644
index 00000000000..0ff499155f0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_chaining_and_caching.py
@@ -0,0 +1,65 @@
+import numpy as np
+import pytest
+
+from pandas.compat import lrange, lzip, range
+
+from pandas import DataFrame, MultiIndex, Series
+from pandas.core import common as com
+import pandas.util.testing as tm
+
+
+def test_detect_chained_assignment():
+ # Inplace ops, originally from:
+ # http://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug
+ a = [12, 23]
+ b = [123, None]
+ c = [1234, 2345]
+ d = [12345, 23456]
+ tuples = [('eyes', 'left'), ('eyes', 'right'), ('ears', 'left'),
+ ('ears', 'right')]
+ events = {('eyes', 'left'): a,
+ ('eyes', 'right'): b,
+ ('ears', 'left'): c,
+ ('ears', 'right'): d}
+ multiind = MultiIndex.from_tuples(tuples, names=['part', 'side'])
+ zed = DataFrame(events, index=['a', 'b'], columns=multiind)
+
+ with pytest.raises(com.SettingWithCopyError):
+ zed['eyes']['right'].fillna(value=555, inplace=True)
+
+
+def test_cache_updating():
+ # 5216
+ # make sure that we don't try to set a dead cache
+ a = np.random.rand(10, 3)
+ df = DataFrame(a, columns=['x', 'y', 'z'])
+ tuples = [(i, j) for i in range(5) for j in range(2)]
+ index = MultiIndex.from_tuples(tuples)
+ df.index = index
+
+ # setting via chained assignment
+ # but actually works, since everything is a view
+ df.loc[0]['z'].iloc[0] = 1.
+ result = df.loc[(0, 0), 'z']
+ assert result == 1
+
+ # correct setting
+ df.loc[(0, 0), 'z'] = 2
+ result = df.loc[(0, 0), 'z']
+ assert result == 2
+
+
+def test_indexer_caching():
+ # GH5727
+ # make sure that indexers are in the _internal_names_set
+ n = 1000001
+ arrays = [lrange(n), lrange(n)]
+ index = MultiIndex.from_tuples(lzip(*arrays))
+ s = Series(np.zeros(n), index=index)
+ str(s)
+
+ # setitem
+ expected = Series(np.ones(n), index=index)
+ s = Series(np.zeros(n), index=index)
+ s[s == 0] = 1
+ tm.assert_series_equal(s, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_datetime.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_datetime.py
new file mode 100644
index 00000000000..a270ab32e9b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_datetime.py
@@ -0,0 +1,22 @@
+from datetime import datetime
+
+import numpy as np
+
+from pandas import Index, Period, Series, period_range
+
+
+def test_multiindex_period_datetime():
+ # GH4861, using datetime in period of multiindex raises exception
+
+ idx1 = Index(['a', 'a', 'a', 'b', 'b'])
+ idx2 = period_range('2012-01', periods=len(idx1), freq='M')
+ s = Series(np.random.randn(len(idx1)), [idx1, idx2])
+
+ # try Period as index
+ expected = s.iloc[0]
+ result = s.loc['a', Period('2012-01')]
+ assert result == expected
+
+ # try datetime as index
+ result = s.loc['a', datetime(2012, 1, 1)]
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_getitem.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_getitem.py
new file mode 100644
index 00000000000..b7fdbee0b71
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_getitem.py
@@ -0,0 +1,237 @@
+import numpy as np
+import pytest
+
+from pandas.compat import u, zip
+
+from pandas import DataFrame, Index, MultiIndex, Series
+from pandas.core.indexing import IndexingError
+from pandas.util import testing as tm
+
+# ----------------------------------------------------------------------------
+# test indexing of Series with multi-level Index
+# ----------------------------------------------------------------------------
+
+
[email protected]('access_method', [lambda s, x: s[:, x],
+ lambda s, x: s.loc[:, x],
+ lambda s, x: s.xs(x, level=1)])
[email protected]('level1_value, expected', [
+ (0, Series([1], index=[0])),
+ (1, Series([2, 3], index=[1, 2]))
+])
+def test_series_getitem_multiindex(access_method, level1_value, expected):
+
+ # GH 6018
+ # series regression getitem with a multi-index
+
+ s = Series([1, 2, 3])
+ s.index = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)])
+ result = access_method(s, level1_value)
+ tm.assert_series_equal(result, expected)
+
+
[email protected]('level0_value', ['D', 'A'])
+def test_series_getitem_duplicates_multiindex(level0_value):
+    # GH 5725 the 'A' happens to be a valid Timestamp so it doesn't raise
+ # the appropriate error, only in PY3 of course!
+
+ index = MultiIndex(levels=[[level0_value, 'B', 'C'],
+ [0, 26, 27, 37, 57, 67, 75, 82]],
+ codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2],
+ [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
+ names=['tag', 'day'])
+ arr = np.random.randn(len(index), 1)
+ df = DataFrame(arr, index=index, columns=['val'])
+
+ # confirm indexing on missing value raises KeyError
+ if level0_value != 'A':
+ with pytest.raises(KeyError, match=r"^'A'$"):
+ df.val['A']
+
+ with pytest.raises(KeyError, match=r"^'X'$"):
+ df.val['X']
+
+ result = df.val[level0_value]
+ expected = Series(arr.ravel()[0:3], name='val', index=Index(
+ [26, 37, 57], name='day'))
+ tm.assert_series_equal(result, expected)
+
+
[email protected]('indexer', [
+ lambda s: s[2000, 3],
+ lambda s: s.loc[2000, 3]
+])
+def test_series_getitem(
+ multiindex_year_month_day_dataframe_random_data, indexer):
+ s = multiindex_year_month_day_dataframe_random_data['A']
+ expected = s.reindex(s.index[42:65])
+ expected.index = expected.index.droplevel(0).droplevel(0)
+
+ result = indexer(s)
+ tm.assert_series_equal(result, expected)
+
+
[email protected]('indexer', [
+ lambda s: s[2000, 3, 10],
+ lambda s: s.loc[2000, 3, 10]
+])
+def test_series_getitem_returns_scalar(
+ multiindex_year_month_day_dataframe_random_data, indexer):
+ s = multiindex_year_month_day_dataframe_random_data['A']
+ expected = s.iloc[49]
+
+ result = indexer(s)
+ assert result == expected
+
+
[email protected]('indexer,expected_error,expected_error_msg', [
+ (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^356L?$"),
+ (lambda s: s[(2000, 3, 4)], KeyError, r"^356L?$"),
+ (lambda s: s.loc[(2000, 3, 4)], IndexingError, 'Too many indexers'),
+ (lambda s: s.__getitem__(len(s)), IndexError, 'index out of bounds'),
+ (lambda s: s[len(s)], IndexError, 'index out of bounds'),
+ (lambda s: s.iloc[len(s)], IndexError,
+ 'single positional indexer is out-of-bounds')
+])
+def test_series_getitem_indexing_errors(
+ multiindex_year_month_day_dataframe_random_data, indexer,
+ expected_error, expected_error_msg):
+ s = multiindex_year_month_day_dataframe_random_data['A']
+ with pytest.raises(expected_error, match=expected_error_msg):
+ indexer(s)
+
+
+def test_series_getitem_corner_generator(
+ multiindex_year_month_day_dataframe_random_data):
+ s = multiindex_year_month_day_dataframe_random_data['A']
+ result = s[(x > 0 for x in s)]
+ expected = s[s > 0]
+ tm.assert_series_equal(result, expected)
+
+
+# ----------------------------------------------------------------------------
+# test indexing of DataFrame with multi-level Index
+# ----------------------------------------------------------------------------
+
+def test_getitem_simple(multiindex_dataframe_random_data):
+ df = multiindex_dataframe_random_data.T
+ expected = df.values[:, 0]
+ result = df['foo', 'one'].values
+ tm.assert_almost_equal(result, expected)
+
+
[email protected]('indexer,expected_error_msg', [
+ (lambda df: df[('foo', 'four')], r"^\('foo', 'four'\)$"),
+ (lambda df: df['foobar'], r"^'foobar'$")
+])
+def test_frame_getitem_simple_key_error(
+ multiindex_dataframe_random_data, indexer, expected_error_msg):
+ df = multiindex_dataframe_random_data.T
+ with pytest.raises(KeyError, match=expected_error_msg):
+ indexer(df)
+
+
+def test_frame_getitem_multicolumn_empty_level():
+ df = DataFrame({'a': ['1', '2', '3'], 'b': ['2', '3', '4']})
+ df.columns = [['level1 item1', 'level1 item2'], ['', 'level2 item2'],
+ ['level3 item1', 'level3 item2']]
+
+ result = df['level1 item1']
+ expected = DataFrame([['1'], ['2'], ['3']], index=df.index,
+ columns=['level3 item1'])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]('indexer,expected_slice', [
+ (lambda df: df['foo'], slice(3)),
+ (lambda df: df['bar'], slice(3, 5)),
+ (lambda df: df.loc[:, 'bar'], slice(3, 5))
+])
+def test_frame_getitem_toplevel(
+ multiindex_dataframe_random_data, indexer, expected_slice):
+ df = multiindex_dataframe_random_data.T
+ expected = df.reindex(columns=df.columns[expected_slice])
+ expected.columns = expected.columns.droplevel(0)
+ result = indexer(df)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]('unicode_strings', [True, False])
+def test_frame_mixed_depth_get(unicode_strings):
+ # If unicode_strings is True, the column labels in dataframe
+ # construction will use unicode strings in Python 2 (pull request
+ # #17099).
+
+ arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
+ ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
+ ['', 'wx', 'wy', '', '', '']]
+
+ if unicode_strings:
+ arrays = [[u(s) for s in arr] for arr in arrays]
+
+ tuples = sorted(zip(*arrays))
+ index = MultiIndex.from_tuples(tuples)
+ df = DataFrame(np.random.randn(4, 6), columns=index)
+
+ result = df['a']
+ expected = df['a', '', ''].rename('a')
+ tm.assert_series_equal(result, expected)
+
+ result = df['routine1', 'result1']
+ expected = df['routine1', 'result1', '']
+ expected = expected.rename(('routine1', 'result1'))
+ tm.assert_series_equal(result, expected)
+
+
+# ----------------------------------------------------------------------------
+# test indexing of DataFrame with multi-level Index with duplicates
+# ----------------------------------------------------------------------------
+
[email protected]
+def dataframe_with_duplicate_index():
+ """Fixture for DataFrame used in tests for gh-4145 and gh-4146"""
+ data = [['a', 'd', 'e', 'c', 'f', 'b'],
+ [1, 4, 5, 3, 6, 2],
+ [1, 4, 5, 3, 6, 2]]
+ index = ['h1', 'h3', 'h5']
+ columns = MultiIndex(
+ levels=[['A', 'B'], ['A1', 'A2', 'B1', 'B2']],
+ codes=[[0, 0, 0, 1, 1, 1], [0, 3, 3, 0, 1, 2]],
+ names=['main', 'sub'])
+ return DataFrame(data, index=index, columns=columns)
+
+
[email protected]('indexer', [
+ lambda df: df[('A', 'A1')],
+ lambda df: df.loc[:, ('A', 'A1')]
+])
+def test_frame_mi_access(dataframe_with_duplicate_index, indexer):
+ # GH 4145
+ df = dataframe_with_duplicate_index
+ index = Index(['h1', 'h3', 'h5'])
+ columns = MultiIndex.from_tuples([('A', 'A1')], names=['main', 'sub'])
+ expected = DataFrame([['a', 1, 1]], index=columns, columns=index).T
+
+ result = indexer(df)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_frame_mi_access_returns_series(dataframe_with_duplicate_index):
+ # GH 4146, not returning a block manager when selecting a unique index
+ # from a duplicate index
+ # as of 4879, this returns a Series (which is similar to what happens
+ # with a non-unique)
+ df = dataframe_with_duplicate_index
+ expected = Series(['a', 1, 1], index=['h1', 'h3', 'h5'], name='A1')
+ result = df['A']['A1']
+ tm.assert_series_equal(result, expected)
+
+
+def test_frame_mi_access_returns_frame(dataframe_with_duplicate_index):
+ # selecting a non_unique from the 2nd level
+ df = dataframe_with_duplicate_index
+ expected = DataFrame([['d', 4, 4], ['e', 5, 5]],
+ index=Index(['B2', 'B2'], name='sub'),
+ columns=['h1', 'h3', 'h5'], ).T
+ result = df['A']['B2']
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_iloc.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_iloc.py
new file mode 100644
index 00000000000..bdd505804c8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_iloc.py
@@ -0,0 +1,151 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, MultiIndex, Series
+from pandas.util import testing as tm
+
+
[email protected]
+def simple_multiindex_dataframe():
+ """
+ Factory function to create simple 3 x 3 dataframe with
+ both columns and row MultiIndex using supplied data or
+ random data by default.
+ """
+ def _simple_multiindex_dataframe(data=None):
+ if data is None:
+ data = np.random.randn(3, 3)
+ return DataFrame(data, columns=[[2, 2, 4], [6, 8, 10]],
+ index=[[4, 4, 8], [8, 10, 12]])
+ return _simple_multiindex_dataframe
+
+
[email protected]('indexer, expected', [
+ (lambda df: df.iloc[0],
+ lambda arr: Series(arr[0], index=[[2, 2, 4], [6, 8, 10]], name=(4, 8))),
+ (lambda df: df.iloc[2],
+ lambda arr: Series(arr[2], index=[[2, 2, 4], [6, 8, 10]], name=(8, 12))),
+ (lambda df: df.iloc[:, 2],
+ lambda arr: Series(
+ arr[:, 2], index=[[4, 4, 8], [8, 10, 12]], name=(4, 10)))
+])
+def test_iloc_returns_series(indexer, expected, simple_multiindex_dataframe):
+ arr = np.random.randn(3, 3)
+ df = simple_multiindex_dataframe(arr)
+ result = indexer(df)
+ expected = expected(arr)
+ tm.assert_series_equal(result, expected)
+
+
+def test_iloc_returns_dataframe(simple_multiindex_dataframe):
+ df = simple_multiindex_dataframe()
+ result = df.iloc[[0, 1]]
+ expected = df.xs(4, drop_level=False)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_iloc_returns_scalar(simple_multiindex_dataframe):
+ arr = np.random.randn(3, 3)
+ df = simple_multiindex_dataframe(arr)
+ result = df.iloc[2, 2]
+ expected = arr[2, 2]
+ assert result == expected
+
+
+def test_iloc_getitem_multiple_items():
+ # GH 5528
+ tup = zip(*[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']])
+ index = MultiIndex.from_tuples(tup)
+ df = DataFrame(np.random.randn(4, 4), index=index)
+ result = df.iloc[[2, 3]]
+ expected = df.xs('b', drop_level=False)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_iloc_getitem_labels():
+ # this is basically regular indexing
+ arr = np.random.randn(4, 3)
+ df = DataFrame(arr,
+ columns=[['i', 'i', 'j'], ['A', 'A', 'B']],
+ index=[['i', 'i', 'j', 'k'], ['X', 'X', 'Y', 'Y']])
+ result = df.iloc[2, 2]
+ expected = arr[2, 2]
+ assert result == expected
+
+
+def test_frame_getitem_slice(multiindex_dataframe_random_data):
+ df = multiindex_dataframe_random_data
+ result = df.iloc[:4]
+ expected = df[:4]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_frame_setitem_slice(multiindex_dataframe_random_data):
+ df = multiindex_dataframe_random_data
+ df.iloc[:4] = 0
+
+ assert (df.values[:4] == 0).all()
+ assert (df.values[4:] != 0).all()
+
+
+def test_indexing_ambiguity_bug_1678():
+ # GH 1678
+ columns = MultiIndex.from_tuples(
+ [('Ohio', 'Green'), ('Ohio', 'Red'), ('Colorado', 'Green')])
+ index = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])
+
+ df = DataFrame(np.arange(12).reshape((4, 3)), index=index, columns=columns)
+
+ result = df.iloc[:, 1]
+ expected = df.loc[:, ('Ohio', 'Red')]
+ tm.assert_series_equal(result, expected)
+
+
+def test_iloc_integer_locations():
+ # GH 13797
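+    # iloc is purely positional, so the unsorted MultiIndex labels must not
+    # change which values are returned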
+ data = [['str00', 'str01'], ['str10', 'str11'], ['str20', 'srt21'],
+ ['str30', 'str31'], ['str40', 'str41']]
+
+ index = MultiIndex.from_tuples(
+ [('CC', 'A'), ('CC', 'B'), ('CC', 'B'), ('BB', 'a'), ('BB', 'b')])
+
+ expected = DataFrame(data)
+ df = DataFrame(data, index=index)
+
+ result = DataFrame([[df.iloc[r, c] for c in range(2)] for r in range(5)])
+
+ tm.assert_frame_equal(result, expected)
+
+
[email protected](
+    'data, indexes, values, expected_k', [
+ # test without indexer value in first level of MultiIndex
+ ([[2, 22, 5], [2, 33, 6]], [0, -1, 1], [2, 3, 1], [7, 10]),
+ # test like code sample 1 in the issue
+ ([[1, 22, 555], [1, 33, 666]], [0, -1, 1], [200, 300, 100],
+ [755, 1066]),
+ # test like code sample 2 in the issue
+ ([[1, 3, 7], [2, 4, 8]], [0, -1, 1], [10, 10, 1000], [17, 1018]),
+ # test like code sample 3 in the issue
+ ([[1, 11, 4], [2, 22, 5], [3, 33, 6]], [0, -1, 1], [4, 7, 10],
+ [8, 15, 13])
+ ])
+def test_iloc_setitem_int_multiindex_series(data, indexes, values, expected_k):
+ # GH17148
+ df = DataFrame(data=data, columns=['i', 'j', 'k'])
+ df = df.set_index(['i', 'j'])
+
+ series = df.k.copy()
+ for i, v in zip(indexes, values):
+ series.iloc[i] += v
+
+ df['k'] = expected_k
+ expected = df.k
+ tm.assert_series_equal(series, expected)
+
+
+def test_getitem_iloc(multiindex_dataframe_random_data):
+ df = multiindex_dataframe_random_data
+ result = df.iloc[2]
+ expected = df.xs(df.index[2])
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_indexing_slow.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_indexing_slow.py
new file mode 100644
index 00000000000..1fdd42e3077
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_indexing_slow.py
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+
+import warnings
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, MultiIndex, Series
+import pandas.util.testing as tm
+
+
[email protected]
[email protected]("ignore::pandas.errors.PerformanceWarning")
+def test_multiindex_get_loc(): # GH7724, GH2646
+
+ with warnings.catch_warnings(record=True):
+
+ # test indexing into a multi-index before & past the lexsort depth
+ from numpy.random import randint, choice, randn
+ cols = ['jim', 'joe', 'jolie', 'joline', 'jolia']
+
+ def validate(mi, df, key):
+ mask = np.ones(len(df)).astype('bool')
+
+ # test for all partials of this key
+ for i, k in enumerate(key):
+ mask &= df.iloc[:, i] == k
+
+ if not mask.any():
+ assert key[:i + 1] not in mi.index
+ continue
+
+ assert key[:i + 1] in mi.index
+ right = df[mask].copy()
+
+ if i + 1 != len(key): # partial key
+ right.drop(cols[:i + 1], axis=1, inplace=True)
+ right.set_index(cols[i + 1:-1], inplace=True)
+ tm.assert_frame_equal(mi.loc[key[:i + 1]], right)
+
+ else: # full key
+ right.set_index(cols[:-1], inplace=True)
+ if len(right) == 1: # single hit
+ right = Series(right['jolia'].values,
+ name=right.index[0],
+ index=['jolia'])
+ tm.assert_series_equal(mi.loc[key[:i + 1]], right)
+ else: # multi hit
+ tm.assert_frame_equal(mi.loc[key[:i + 1]], right)
+
+ def loop(mi, df, keys):
+ for key in keys:
+ validate(mi, df, key)
+
+ n, m = 1000, 50
+
+ vals = [randint(0, 10, n), choice(
+ list('abcdefghij'), n), choice(
+ pd.date_range('20141009', periods=10).tolist(), n), choice(
+ list('ZYXWVUTSRQ'), n), randn(n)]
+ vals = list(map(tuple, zip(*vals)))
+
+ # bunch of keys for testing
+ keys = [randint(0, 11, m), choice(
+ list('abcdefghijk'), m), choice(
+ pd.date_range('20141009', periods=11).tolist(), m), choice(
+ list('ZYXWVUTSRQP'), m)]
+ keys = list(map(tuple, zip(*keys)))
+ keys += list(map(lambda t: t[:-1], vals[::n // m]))
+
+ # covers both unique index and non-unique index
+ df = DataFrame(vals, columns=cols)
+ a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1])
+
+ for frame in a, b:
+ for i in range(5): # lexsort depth
+ df = frame.copy() if i == 0 else frame.sort_values(
+ by=cols[:i])
+ mi = df.set_index(cols[:-1])
+ assert not mi.index.lexsort_depth < i
+ loop(mi, df, keys)
+
+
[email protected]
+def test_large_mi_dataframe_indexing():
+ # GH10645
+ result = MultiIndex.from_arrays([range(10 ** 6), range(10 ** 6)])
+    assert (10 ** 6, 0) not in result
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_ix.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_ix.py
new file mode 100644
index 00000000000..4970190252e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_ix.py
@@ -0,0 +1,56 @@
+from warnings import catch_warnings, simplefilter
+
+import pytest
+
+from pandas.compat import lrange
+from pandas.errors import PerformanceWarning
+
+from pandas import DataFrame, MultiIndex
+from pandas.util import testing as tm
+
+
[email protected]("ignore:\\n.ix:DeprecationWarning")
+class TestMultiIndexIx(object):
+
+ def test_frame_setitem_ix(self, multiindex_dataframe_random_data):
+ frame = multiindex_dataframe_random_data
+ frame.loc[('bar', 'two'), 'B'] = 5
+ assert frame.loc[('bar', 'two'), 'B'] == 5
+
+ # with integer labels
+ df = frame.copy()
+ df.columns = lrange(3)
+ df.loc[('bar', 'two'), 1] = 7
+ assert df.loc[('bar', 'two'), 1] == 7
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ df = frame.copy()
+ df.columns = lrange(3)
+ df.ix[('bar', 'two'), 1] = 7
+ assert df.loc[('bar', 'two'), 1] == 7
+
+ def test_ix_general(self):
+
+ # ix general issues
+
+ # GH 2817
+ data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444},
+ 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0},
+ 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}}
+ df = DataFrame(data).set_index(keys=['col', 'year'])
+ key = 4.0, 2012
+
+ # emits a PerformanceWarning, ok
+ with tm.assert_produces_warning(PerformanceWarning):
+ tm.assert_frame_equal(df.loc[key], df.iloc[2:])
+
+ # this is ok
+ df.sort_index(inplace=True)
+ res = df.loc[key]
+
+ # col has float dtype, result should be Float64Index
+ index = MultiIndex.from_arrays([[4.] * 3, [2012] * 3],
+ names=['col', 'year'])
+ expected = DataFrame({'amount': [222, 333, 444]}, index=index)
+ tm.assert_frame_equal(res, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_loc.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_loc.py
new file mode 100644
index 00000000000..ea451d40eb5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_loc.py
@@ -0,0 +1,378 @@
+import itertools
+from warnings import catch_warnings
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series
+from pandas.util import testing as tm
+
+
[email protected]
+def single_level_multiindex():
+ """single level MultiIndex"""
+ return MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
+ codes=[[0, 1, 2, 3]], names=['first'])
+
+
[email protected]
+def frame_random_data_integer_multi_index():
+ levels = [[0, 1], [0, 1, 2]]
+ codes = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
+ index = MultiIndex(levels=levels, codes=codes)
+ return DataFrame(np.random.randn(6, 2), index=index)
+
+
[email protected]("ignore:\\n.ix:DeprecationWarning")
+class TestMultiIndexLoc(object):
+
+ def test_loc_getitem_series(self):
+ # GH14730
+ # passing a series as a key with a MultiIndex
+ index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']])
+ x = Series(index=index, data=range(9), dtype=np.float64)
+ y = Series([1, 3])
+ expected = Series(
+ data=[0, 1, 2, 6, 7, 8],
+ index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]),
+ dtype=np.float64)
+ result = x.loc[y]
+ tm.assert_series_equal(result, expected)
+
+ result = x.loc[[1, 3]]
+ tm.assert_series_equal(result, expected)
+
+ # GH15424
+ y1 = Series([1, 3], index=[1, 2])
+ result = x.loc[y1]
+ tm.assert_series_equal(result, expected)
+
+ empty = Series(data=[], dtype=np.float64)
+ expected = Series([], index=MultiIndex(
+ levels=index.levels, codes=[[], []], dtype=np.float64))
+ result = x.loc[empty]
+ tm.assert_series_equal(result, expected)
+
+ def test_loc_getitem_array(self):
+ # GH15434
+ # passing an array as a key with a MultiIndex
+ index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']])
+ x = Series(index=index, data=range(9), dtype=np.float64)
+ y = np.array([1, 3])
+ expected = Series(
+ data=[0, 1, 2, 6, 7, 8],
+ index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]),
+ dtype=np.float64)
+ result = x.loc[y]
+ tm.assert_series_equal(result, expected)
+
+ # empty array:
+ empty = np.array([])
+ expected = Series([], index=MultiIndex(
+ levels=index.levels, codes=[[], []], dtype=np.float64))
+ result = x.loc[empty]
+ tm.assert_series_equal(result, expected)
+
+ # 0-dim array (scalar):
+ scalar = np.int64(1)
+ expected = Series(
+ data=[0, 1, 2],
+ index=['A', 'B', 'C'],
+ dtype=np.float64)
+ result = x.loc[scalar]
+ tm.assert_series_equal(result, expected)
+
+ def test_loc_multiindex(self):
+
+ mi_labels = DataFrame(np.random.randn(3, 3),
+ columns=[['i', 'i', 'j'], ['A', 'A', 'B']],
+ index=[['i', 'i', 'j'], ['X', 'X', 'Y']])
+
+ mi_int = DataFrame(np.random.randn(3, 3),
+ columns=[[2, 2, 4], [6, 8, 10]],
+ index=[[4, 4, 8], [8, 10, 12]])
+
+ # the first row
+ rs = mi_labels.loc['i']
+ with catch_warnings(record=True):
+ xp = mi_labels.ix['i']
+ tm.assert_frame_equal(rs, xp)
+
+ # 2nd (last) columns
+ rs = mi_labels.loc[:, 'j']
+ with catch_warnings(record=True):
+ xp = mi_labels.ix[:, 'j']
+ tm.assert_frame_equal(rs, xp)
+
+ # corner column
+ rs = mi_labels.loc['j'].loc[:, 'j']
+ with catch_warnings(record=True):
+ xp = mi_labels.ix['j'].ix[:, 'j']
+ tm.assert_frame_equal(rs, xp)
+
+ # with a tuple
+ rs = mi_labels.loc[('i', 'X')]
+ with catch_warnings(record=True):
+ xp = mi_labels.ix[('i', 'X')]
+ tm.assert_frame_equal(rs, xp)
+
+ rs = mi_int.loc[4]
+ with catch_warnings(record=True):
+ xp = mi_int.ix[4]
+ tm.assert_frame_equal(rs, xp)
+
+ # missing label
+ pytest.raises(KeyError, lambda: mi_int.loc[2])
+ with catch_warnings(record=True):
+ # GH 21593
+ pytest.raises(KeyError, lambda: mi_int.ix[2])
+
+ def test_loc_multiindex_indexer_none(self):
+
+ # GH6788
+ # multi-index indexer is None (meaning take all)
+ attributes = ['Attribute' + str(i) for i in range(1)]
+ attribute_values = ['Value' + str(i) for i in range(5)]
+
+ index = MultiIndex.from_product([attributes, attribute_values])
+ df = 0.1 * np.random.randn(10, 1 * 5) + 0.5
+ df = DataFrame(df, columns=index)
+ result = df[attributes]
+ tm.assert_frame_equal(result, df)
+
+ # GH 7349
+ # loc with a multi-index seems to be doing fallback
+ df = DataFrame(np.arange(12).reshape(-1, 1),
+ index=MultiIndex.from_product([[1, 2, 3, 4],
+ [1, 2, 3]]))
+
+ expected = df.loc[([1, 2], ), :]
+ result = df.loc[[1, 2]]
+ tm.assert_frame_equal(result, expected)
+
+ def test_loc_multiindex_incomplete(self):
+
+ # GH 7399
+ # incomplete indexers
+ s = Series(np.arange(15, dtype='int64'),
+ MultiIndex.from_product([range(5), ['a', 'b', 'c']]))
+ expected = s.loc[:, 'a':'c']
+
+ result = s.loc[0:4, 'a':'c']
+ tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result, expected)
+
+ result = s.loc[:4, 'a':'c']
+ tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result, expected)
+
+ result = s.loc[0:, 'a':'c']
+ tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result, expected)
+
+ # GH 7400
+        # multiindexer getitem with list of indexers skips wrong element
+ s = Series(np.arange(15, dtype='int64'),
+ MultiIndex.from_product([range(5), ['a', 'b', 'c']]))
+ expected = s.iloc[[6, 7, 8, 12, 13, 14]]
+ result = s.loc[2:4:2, 'a':'c']
+ tm.assert_series_equal(result, expected)
+
+ def test_get_loc_single_level(self, single_level_multiindex):
+ single_level = single_level_multiindex
+ s = Series(np.random.randn(len(single_level)),
+ index=single_level)
+ for k in single_level.values:
+ s[k]
+
+ def test_loc_getitem_int_slice(self):
+ # GH 3053
+ # loc should treat integer slices like label slices
+
+ index = MultiIndex.from_tuples([t for t in itertools.product(
+ [6, 7, 8], ['a', 'b'])])
+ df = DataFrame(np.random.randn(6, 6), index, index)
+ result = df.loc[6:8, :]
+ expected = df
+ tm.assert_frame_equal(result, expected)
+
+ index = MultiIndex.from_tuples([t
+ for t in itertools.product(
+ [10, 20, 30], ['a', 'b'])])
+ df = DataFrame(np.random.randn(6, 6), index, index)
+ result = df.loc[20:30, :]
+ expected = df.iloc[2:]
+ tm.assert_frame_equal(result, expected)
+
+ # doc examples
+ result = df.loc[10, :]
+ expected = df.iloc[0:2]
+ expected.index = ['a', 'b']
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[:, 10]
+ # expected = df.ix[:,10] (this fails)
+ expected = df[10]
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ 'indexer_type_1',
+ (list, tuple, set, slice, np.ndarray, Series, Index))
+ @pytest.mark.parametrize(
+ 'indexer_type_2',
+ (list, tuple, set, slice, np.ndarray, Series, Index))
+ def test_loc_getitem_nested_indexer(self, indexer_type_1, indexer_type_2):
+ # GH #19686
+ # .loc should work with nested indexers which can be
+ # any list-like objects (see `pandas.api.types.is_list_like`) or slices
+
+ def convert_nested_indexer(indexer_type, keys):
+ if indexer_type == np.ndarray:
+ return np.array(keys)
+ if indexer_type == slice:
+ return slice(*keys)
+ return indexer_type(keys)
+
+ a = [10, 20, 30]
+ b = [1, 2, 3]
+ index = MultiIndex.from_product([a, b])
+ df = DataFrame(
+ np.arange(len(index), dtype='int64'),
+ index=index, columns=['Data'])
+
+ keys = ([10, 20], [2, 3])
+ types = (indexer_type_1, indexer_type_2)
+
+ # check indexers with all the combinations of nested objects
+ # of all the valid types
+ indexer = tuple(
+ convert_nested_indexer(indexer_type, k)
+ for indexer_type, k in zip(types, keys))
+
+ result = df.loc[indexer, 'Data']
+ expected = Series(
+ [1, 2, 4, 5], name='Data',
+ index=MultiIndex.from_product(keys))
+
+ tm.assert_series_equal(result, expected)
+
+
[email protected]('indexer, is_level1, expected_error', [
+ ([], False, None), # empty ok
+ (['A'], False, None),
+ (['A', 'D'], False, None),
+ (['D'], False, r"\['D'\] not in index"), # not any values found
+ (pd.IndexSlice[:, ['foo']], True, None),
+ (pd.IndexSlice[:, ['foo', 'bah']], True, None)
+])
+def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, is_level1,
+ expected_error):
+ # GH 7866
+ # multi-index slicing with missing indexers
+ idx = MultiIndex.from_product([['A', 'B', 'C'],
+ ['foo', 'bar', 'baz']],
+ names=['one', 'two'])
+ s = Series(np.arange(9, dtype='int64'), index=idx).sort_index()
+
+ if indexer == []:
+ expected = s.iloc[[]]
+ elif is_level1:
+ expected = Series([0, 3, 6], index=MultiIndex.from_product(
+ [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index()
+ else:
+ exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']],
+ names=['one', 'two'])
+ expected = Series(np.arange(3, dtype='int64'),
+ index=exp_idx).sort_index()
+
+ if expected_error is not None:
+ with pytest.raises(KeyError, match=expected_error):
+ s.loc[indexer]
+ else:
+ result = s.loc[indexer]
+ tm.assert_series_equal(result, expected)
+
+
[email protected]("ignore:\\n.ix:DeprecationWarning")
[email protected]('indexer', [
+ lambda s: s.loc[[(2000, 3, 10), (2000, 3, 13)]],
+ lambda s: s.ix[[(2000, 3, 10), (2000, 3, 13)]]
+])
+def test_series_loc_getitem_fancy(
+ multiindex_year_month_day_dataframe_random_data, indexer):
+ s = multiindex_year_month_day_dataframe_random_data['A']
+ expected = s.reindex(s.index[49:51])
+
+ result = indexer(s)
+ tm.assert_series_equal(result, expected)
+
+
[email protected]('columns_indexer', [
+ ([], slice(None)),
+ (['foo'], [])
+])
+def test_loc_getitem_duplicates_multiindex_empty_indexer(columns_indexer):
+ # GH 8737
+ # empty indexer
+ multi_index = MultiIndex.from_product((['foo', 'bar', 'baz'],
+ ['alpha', 'beta']))
+ df = DataFrame(np.random.randn(5, 6), index=range(5), columns=multi_index)
+ df = df.sort_index(level=0, axis=1)
+
+ expected = DataFrame(index=range(5), columns=multi_index.reindex([])[0])
+ result = df.loc[:, columns_indexer]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_loc_getitem_duplicates_multiindex_non_scalar_type_object():
+ # regression from < 0.14.0
+ # GH 7914
+ df = DataFrame([[np.mean, np.median], ['mean', 'median']],
+ columns=MultiIndex.from_tuples([('functs', 'mean'),
+ ('functs', 'median')]),
+ index=['function', 'name'])
+ result = df.loc['function', ('functs', 'mean')]
+ expected = np.mean
+ assert result == expected
+
+
+def test_loc_getitem_tuple_plus_slice():
+ # GH 671
+ df = DataFrame({'a': np.arange(10),
+ 'b': np.arange(10),
+ 'c': np.random.randn(10),
+ 'd': np.random.randn(10)}
+ ).set_index(['a', 'b'])
+ expected = df.loc[0, 0]
+ result = df.loc[(0, 0), :]
+ tm.assert_series_equal(result, expected)
+
+
+def test_loc_getitem_int(frame_random_data_integer_multi_index):
+ df = frame_random_data_integer_multi_index
+ result = df.loc[1]
+ expected = df[-3:]
+ expected.index = expected.index.droplevel(0)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_loc_getitem_int_raises_exception(
+ frame_random_data_integer_multi_index):
+ df = frame_random_data_integer_multi_index
+ with pytest.raises(KeyError, match=r"^3L?$"):
+ df.loc[3]
+
+
+def test_loc_getitem_lowerdim_corner(multiindex_dataframe_random_data):
+ df = multiindex_dataframe_random_data
+
+ # test setup - check key not in dataframe
+ with pytest.raises(KeyError, match=r"^11L?$"):
+ df.loc[('bar', 'three'), 'B']
+
+    # in theory this should insert into a sorted space
+ df.loc[('bar', 'three'), 'B'] = 0
+ expected = 0
+ result = df.sort_index().loc[('bar', 'three'), 'B']
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_multiindex.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_multiindex.py
new file mode 100644
index 00000000000..4f5517f89e8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -0,0 +1,86 @@
+
+import numpy as np
+import pytest
+
+import pandas._libs.index as _index
+from pandas.errors import PerformanceWarning
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series
+from pandas.util import testing as tm
+
+
+class TestMultiIndexBasic(object):
+
+ def test_multiindex_perf_warn(self):
+
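+        # indexing past the lexsort depth of a MultiIndex takes a slower
+        # path and is expected to emit a PerformanceWarning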
+ df = DataFrame({'jim': [0, 0, 1, 1],
+ 'joe': ['x', 'x', 'z', 'y'],
+ 'jolie': np.random.rand(4)}).set_index(['jim', 'joe'])
+
+ with tm.assert_produces_warning(PerformanceWarning,
+ clear=[pd.core.index]):
+ df.loc[(1, 'z')]
+
+ df = df.iloc[[2, 1, 3, 0]]
+ with tm.assert_produces_warning(PerformanceWarning):
+ df.loc[(0, )]
+
+ def test_multiindex_contains_dropped(self):
+ # GH 19027
+ # test that dropped MultiIndex levels are not in the MultiIndex
+ # despite continuing to be in the MultiIndex's levels
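+        # sketch: membership (`2 in idx`) is computed from the codes in
+        # use, whereas `idx.levels` keeps the full set of labels the
+        # index was built with, even after drop()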
+ idx = MultiIndex.from_product([[1, 2], [3, 4]])
+ assert 2 in idx
+ idx = idx.drop(2)
+
+ # drop implementation keeps 2 in the levels
+ assert 2 in idx.levels[0]
+ # but it should no longer be in the index itself
+ assert 2 not in idx
+
+ # also applies to strings
+ idx = MultiIndex.from_product([['a', 'b'], ['c', 'd']])
+ assert 'a' in idx
+ idx = idx.drop('a')
+ assert 'a' in idx.levels[0]
+ assert 'a' not in idx
+
+ @pytest.mark.parametrize("data, expected", [
+ (MultiIndex.from_product([(), ()]), True),
+ (MultiIndex.from_product([(1, 2), (3, 4)]), True),
+ (MultiIndex.from_product([('a', 'b'), (1, 2)]), False),
+ ])
+ def test_multiindex_is_homogeneous_type(self, data, expected):
+ assert data._is_homogeneous_type is expected
+
+ def test_indexing_over_hashtable_size_cutoff(self):
+ n = 10000
+
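+        # _SIZE_CUTOFF is the module-level threshold pandas uses to pick
+        # an index-engine implementation; pinning it (and restoring it
+        # below) keeps the lookup path under test deterministic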
+        old_cutoff = _index._SIZE_CUTOFF
+        _index._SIZE_CUTOFF = 20000
+
+        s = Series(np.arange(n),
+                   MultiIndex.from_arrays((["a"] * n, np.arange(n))))
+
+        try:
+            # tuple lookups still resolve through the engine under test
+            assert s[("a", 5)] == 5
+            assert s[("a", 6)] == 6
+            assert s[("a", 7)] == 7
+        finally:
+            # restore the global cutoff even if an assertion fails
+            _index._SIZE_CUTOFF = old_cutoff
+
+ def test_multi_nan_indexing(self):
+
+ # GH 3588
+ df = DataFrame({"a": ['R1', 'R2', np.nan, 'R4'],
+ 'b': ["C1", "C2", "C3", "C4"],
+ "c": [10, 15, np.nan, 20]})
+ result = df.set_index(['a', 'b'], drop=False)
+ expected = DataFrame({"a": ['R1', 'R2', np.nan, 'R4'],
+ 'b': ["C1", "C2", "C3", "C4"],
+ "c": [10, 15, np.nan, 20]},
+ index=[Index(['R1', 'R2', np.nan, 'R4'],
+ name='a'),
+ Index(['C1', 'C2', 'C3', 'C4'], name='b')])
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_panel.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_panel.py
new file mode 100644
index 00000000000..68c8fadd2f0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_panel.py
@@ -0,0 +1,103 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, MultiIndex, Panel, Series
+from pandas.util import testing as tm
+
+
[email protected]('ignore:\\nPanel:FutureWarning')
+class TestMultiIndexPanel(object):
+
+ def test_iloc_getitem_panel_multiindex(self):
+
+ # GH 7199
+ # Panel with multi-index
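+        # Panel (long since deprecated) is a 3D container with axes
+        # (items, major_axis, minor_axis); .iloc takes one indexer per axis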
+ multi_index = MultiIndex.from_tuples([('ONE', 'one'),
+ ('TWO', 'two'),
+ ('THREE', 'three')],
+ names=['UPPER', 'lower'])
+
+ simple_index = [x[0] for x in multi_index]
+ wd1 = Panel(items=['First', 'Second'],
+ major_axis=['a', 'b', 'c', 'd'],
+ minor_axis=multi_index)
+
+ wd2 = Panel(items=['First', 'Second'],
+ major_axis=['a', 'b', 'c', 'd'],
+ minor_axis=simple_index)
+
+ expected1 = wd1['First'].iloc[[True, True, True, False], [0, 2]]
+        # boolean lists with .iloc are dubious, but match expected1 here
+        result1 = wd1.iloc[0, [True, True, True, False], [0, 2]]
+ tm.assert_frame_equal(result1, expected1)
+
+ expected2 = wd2['First'].iloc[[True, True, True, False], [0, 2]]
+ result2 = wd2.iloc[0, [True, True, True, False], [0, 2]]
+ tm.assert_frame_equal(result2, expected2)
+
+ expected1 = DataFrame(index=['a'], columns=multi_index,
+ dtype='float64')
+ result1 = wd1.iloc[0, [0], [0, 1, 2]]
+ tm.assert_frame_equal(result1, expected1)
+
+ expected2 = DataFrame(index=['a'], columns=simple_index,
+ dtype='float64')
+ result2 = wd2.iloc[0, [0], [0, 1, 2]]
+ tm.assert_frame_equal(result2, expected2)
+
+ # GH 7516
+ mi = MultiIndex.from_tuples([(0, 'x'), (1, 'y'), (2, 'z')])
+ p = Panel(np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3),
+ items=['a', 'b', 'c'], major_axis=mi,
+ minor_axis=['u', 'v', 'w'])
+ result = p.iloc[:, 1, 0]
+ expected = Series([3, 12, 21], index=['a', 'b', 'c'], name='u')
+ tm.assert_series_equal(result, expected)
+
+ result = p.loc[:, (1, 'y'), 'u']
+ tm.assert_series_equal(result, expected)
+
+ def test_panel_setitem_with_multiindex(self):
+
+ # 10360
+ # failing with a multi-index
+ arr = np.array([[[1, 2, 3], [0, 0, 0]],
+ [[0, 0, 0], [0, 0, 0]]],
+ dtype=np.float64)
+
+ # reg index
+ axes = dict(items=['A', 'B'], major_axis=[0, 1],
+ minor_axis=['X', 'Y', 'Z'])
+ p1 = Panel(0., **axes)
+ p1.iloc[0, 0, :] = [1, 2, 3]
+ expected = Panel(arr, **axes)
+ tm.assert_panel_equal(p1, expected)
+
+ # multi-indexes
+ axes['items'] = MultiIndex.from_tuples(
+ [('A', 'a'), ('B', 'b')])
+ p2 = Panel(0., **axes)
+ p2.iloc[0, 0, :] = [1, 2, 3]
+ expected = Panel(arr, **axes)
+ tm.assert_panel_equal(p2, expected)
+
+ axes['major_axis'] = MultiIndex.from_tuples(
+ [('A', 1), ('A', 2)])
+ p3 = Panel(0., **axes)
+ p3.iloc[0, 0, :] = [1, 2, 3]
+ expected = Panel(arr, **axes)
+ tm.assert_panel_equal(p3, expected)
+
+ axes['minor_axis'] = MultiIndex.from_product(
+ [['X'], range(3)])
+ p4 = Panel(0., **axes)
+ p4.iloc[0, 0, :] = [1, 2, 3]
+ expected = Panel(arr, **axes)
+ tm.assert_panel_equal(p4, expected)
+
+ arr = np.array(
+ [[[1, 0, 0], [2, 0, 0]], [[0, 0, 0], [0, 0, 0]]],
+ dtype=np.float64)
+ p5 = Panel(0., **axes)
+ p5.iloc[0, :, 0] = [1, 2]
+ expected = Panel(arr, **axes)
+ tm.assert_panel_equal(p5, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_partial.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_partial.py
new file mode 100644
index 00000000000..2e37ebe4a06
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_partial.py
@@ -0,0 +1,183 @@
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, MultiIndex
+from pandas.util import testing as tm
+
+
+class TestMultiIndexPartial(object):
+
+ def test_getitem_partial_int(self):
+ # GH 12416
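+        # partial indexing on columns: selecting a level-0 label returns
+        # the sub-frame keyed by the remaining level(s)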
+ # with single item
+ l1 = [10, 20]
+ l2 = ['a', 'b']
+ df = DataFrame(index=range(2),
+ columns=MultiIndex.from_product([l1, l2]))
+ expected = DataFrame(index=range(2),
+ columns=l2)
+ result = df[20]
+ tm.assert_frame_equal(result, expected)
+
+ # with list
+ expected = DataFrame(index=range(2),
+ columns=MultiIndex.from_product([l1[1:], l2]))
+ result = df[[20]]
+ tm.assert_frame_equal(result, expected)
+
+ # missing item:
+ with pytest.raises(KeyError, match='1'):
+ df[1]
+ with pytest.raises(KeyError, match=r"'\[1\] not in index'"):
+ df[[1]]
+
+ def test_series_slice_partial(self):
+ pass
+
+ def test_xs_partial(self, multiindex_dataframe_random_data,
+ multiindex_year_month_day_dataframe_random_data):
+ frame = multiindex_dataframe_random_data
+ ymd = multiindex_year_month_day_dataframe_random_data
+ result = frame.xs('foo')
+ result2 = frame.loc['foo']
+ expected = frame.T['foo'].T
+ tm.assert_frame_equal(result, expected)
+ tm.assert_frame_equal(result, result2)
+
+ result = ymd.xs((2000, 4))
+ expected = ymd.loc[2000, 4]
+ tm.assert_frame_equal(result, expected)
+
+ # ex from #1796
+ index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]],
+ codes=[[0, 0, 0, 0, 1, 1, 1, 1],
+ [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1,
+ 0, 1]])
+ df = DataFrame(np.random.randn(8, 4), index=index,
+ columns=list('abcd'))
+
+ result = df.xs(['foo', 'one'])
+ expected = df.loc['foo', 'one']
+ tm.assert_frame_equal(result, expected)
+
+ def test_getitem_partial(
+ self, multiindex_year_month_day_dataframe_random_data):
+ ymd = multiindex_year_month_day_dataframe_random_data
+ ymd = ymd.T
+ result = ymd[2000, 2]
+
+ expected = ymd.reindex(columns=ymd.columns[ymd.columns.codes[1] == 1])
+ expected.columns = expected.columns.droplevel(0).droplevel(0)
+ tm.assert_frame_equal(result, expected)
+
+ def test_fancy_slice_partial(
+ self, multiindex_dataframe_random_data,
+ multiindex_year_month_day_dataframe_random_data):
+ frame = multiindex_dataframe_random_data
+ result = frame.loc['bar':'baz']
+ expected = frame[3:7]
+ tm.assert_frame_equal(result, expected)
+
+ ymd = multiindex_year_month_day_dataframe_random_data
+ result = ymd.loc[(2000, 2):(2000, 4)]
+ lev = ymd.index.codes[1]
+ expected = ymd[(lev >= 1) & (lev <= 3)]
+ tm.assert_frame_equal(result, expected)
+
+ def test_getitem_partial_column_select(self):
+ idx = MultiIndex(codes=[[0, 0, 0], [0, 1, 1], [1, 0, 1]],
+ levels=[['a', 'b'], ['x', 'y'], ['p', 'q']])
+ df = DataFrame(np.random.rand(3, 2), index=idx)
+
+ result = df.loc[('a', 'y'), :]
+ expected = df.loc[('a', 'y')]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[('a', 'y'), [1, 0]]
+ expected = df.loc[('a', 'y')][[1, 0]]
+ tm.assert_frame_equal(result, expected)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = df.ix[('a', 'y'), [1, 0]]
+ tm.assert_frame_equal(result, expected)
+
+ pytest.raises(KeyError, df.loc.__getitem__,
+ (('a', 'foo'), slice(None, None)))
+
+ def test_partial_set(
+ self, multiindex_year_month_day_dataframe_random_data):
+ # GH #397
+ ymd = multiindex_year_month_day_dataframe_random_data
+ df = ymd.copy()
+ exp = ymd.copy()
+ df.loc[2000, 4] = 0
+ exp.loc[2000, 4].values[:] = 0
+ tm.assert_frame_equal(df, exp)
+
+ df['A'].loc[2000, 4] = 1
+ exp['A'].loc[2000, 4].values[:] = 1
+ tm.assert_frame_equal(df, exp)
+
+ df.loc[2000] = 5
+ exp.loc[2000].values[:] = 5
+ tm.assert_frame_equal(df, exp)
+
+ # this works...for now
+ df['A'].iloc[14] = 5
+ assert df['A'][14] == 5
+
+ # ---------------------------------------------------------------------
+ # AMBIGUOUS CASES!
+
+ def test_partial_ix_missing(
+ self, multiindex_year_month_day_dataframe_random_data):
+ pytest.skip("skipping for now")
+
+ ymd = multiindex_year_month_day_dataframe_random_data
+ result = ymd.loc[2000, 0]
+ expected = ymd.loc[2000]['A']
+ tm.assert_series_equal(result, expected)
+
+ # need to put in some work here
+
+ # self.ymd.loc[2000, 0] = 0
+ # assert (self.ymd.loc[2000]['A'] == 0).all()
+
+ # Pretty sure the second (and maybe even the first) is already wrong.
+ pytest.raises(Exception, ymd.loc.__getitem__, (2000, 6))
+ pytest.raises(Exception, ymd.loc.__getitem__, (2000, 6), 0)
+
+ # ---------------------------------------------------------------------
+
+ def test_setitem_multiple_partial(self, multiindex_dataframe_random_data):
+ frame = multiindex_dataframe_random_data
+ expected = frame.copy()
+ result = frame.copy()
+ result.loc[['foo', 'bar']] = 0
+ expected.loc['foo'] = 0
+ expected.loc['bar'] = 0
+ tm.assert_frame_equal(result, expected)
+
+ expected = frame.copy()
+ result = frame.copy()
+ result.loc['foo':'bar'] = 0
+ expected.loc['foo'] = 0
+ expected.loc['bar'] = 0
+ tm.assert_frame_equal(result, expected)
+
+ expected = frame['A'].copy()
+ result = frame['A'].copy()
+ result.loc[['foo', 'bar']] = 0
+ expected.loc['foo'] = 0
+ expected.loc['bar'] = 0
+ tm.assert_series_equal(result, expected)
+
+ expected = frame['A'].copy()
+ result = frame['A'].copy()
+ result.loc['foo':'bar'] = 0
+ expected.loc['foo'] = 0
+ expected.loc['bar'] = 0
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_set_ops.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_set_ops.py
new file mode 100644
index 00000000000..1f864de2dac
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_set_ops.py
@@ -0,0 +1,42 @@
+from numpy.random import randn
+
+from pandas import DataFrame, MultiIndex, Series
+from pandas.util import testing as tm
+
+
+class TestMultiIndexSetOps(object):
+
+ def test_multiindex_symmetric_difference(self):
+ # GH 13490
+ idx = MultiIndex.from_product([['a', 'b'], ['A', 'B']],
+ names=['a', 'b'])
+ result = idx ^ idx
+ assert result.names == idx.names
+
+ idx2 = idx.copy().rename(['A', 'B'])
+ result = idx ^ idx2
+ assert result.names == [None, None]
+
+ def test_mixed_depth_insert(self):
+ arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
+ ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
+ ['', 'wx', 'wy', '', '', '']]
+
+ tuples = sorted(zip(*arrays))
+ index = MultiIndex.from_tuples(tuples)
+ df = DataFrame(randn(4, 6), columns=index)
+
+ result = df.copy()
+ expected = df.copy()
+ result['b'] = [1, 2, 3, 4]
+ expected['b', '', ''] = [1, 2, 3, 4]
+ tm.assert_frame_equal(result, expected)
+
+ def test_dataframe_insert_column_all_na(self):
+ # GH #1534
+        mix = MultiIndex.from_tuples([('1a', '2a'), ('1a', '2b'),
+                                      ('1a', '2c')])
+ df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mix)
+ s = Series({(1, 1): 1, (1, 2): 2})
+ df['new'] = s
+ assert df['new'].isna().all()
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_setitem.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_setitem.py
new file mode 100644
index 00000000000..f8f037dbda4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_setitem.py
@@ -0,0 +1,439 @@
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+from numpy.random import randn
+import pytest
+
+import pandas as pd
+from pandas import (
+ DataFrame, MultiIndex, Series, Timestamp, date_range, isna, notna)
+import pandas.core.common as com
+from pandas.util import testing as tm
+
+
[email protected]("ignore:\\n.ix:DeprecationWarning")
+class TestMultiIndexSetItem(object):
+
+ def test_setitem_multiindex(self):
+ with catch_warnings(record=True):
+
+ for index_fn in ('ix', 'loc'):
+
+ def assert_equal(a, b):
+ assert a == b
+
+ def check(target, indexers, value, compare_fn, expected=None):
+ fn = getattr(target, index_fn)
+ fn.__setitem__(indexers, value)
+ result = fn.__getitem__(indexers)
+ if expected is None:
+ expected = value
+ compare_fn(result, expected)
+ # GH7190
+ index = MultiIndex.from_product([np.arange(0, 100),
+ np.arange(0, 80)],
+ names=['time', 'firm'])
+ t, n = 0, 2
+ df = DataFrame(np.nan, columns=['A', 'w', 'l', 'a', 'x',
+ 'X', 'd', 'profit'],
+ index=index)
+ check(target=df, indexers=((t, n), 'X'), value=0,
+ compare_fn=assert_equal)
+
+ df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x',
+ 'X', 'd', 'profit'],
+ index=index)
+ check(target=df, indexers=((t, n), 'X'), value=1,
+ compare_fn=assert_equal)
+
+ df = DataFrame(columns=['A', 'w', 'l', 'a', 'x',
+ 'X', 'd', 'profit'],
+ index=index)
+ check(target=df, indexers=((t, n), 'X'), value=2,
+ compare_fn=assert_equal)
+
+ # gh-7218: assigning with 0-dim arrays
+ df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x',
+ 'X', 'd', 'profit'],
+ index=index)
+ check(target=df,
+ indexers=((t, n), 'X'),
+ value=np.array(3),
+ compare_fn=assert_equal,
+ expected=3, )
+
+ # GH5206
+ df = DataFrame(np.arange(25).reshape(5, 5),
+ columns='A,B,C,D,E'.split(','), dtype=float)
+ df['F'] = 99
+ row_selection = df['A'] % 2 == 0
+ col_selection = ['B', 'C']
+ with catch_warnings(record=True):
+ df.ix[row_selection, col_selection] = df['F']
+ output = DataFrame(99., index=[0, 2, 4], columns=['B', 'C'])
+ with catch_warnings(record=True):
+ tm.assert_frame_equal(df.ix[row_selection, col_selection],
+ output)
+ check(target=df,
+ indexers=(row_selection, col_selection),
+ value=df['F'],
+ compare_fn=tm.assert_frame_equal,
+ expected=output, )
+
+ # GH11372
+ idx = MultiIndex.from_product([
+ ['A', 'B', 'C'],
+ date_range('2015-01-01', '2015-04-01', freq='MS')])
+ cols = MultiIndex.from_product([
+ ['foo', 'bar'],
+ date_range('2016-01-01', '2016-02-01', freq='MS')])
+
+ df = DataFrame(np.random.random((12, 4)),
+ index=idx, columns=cols)
+
+ subidx = MultiIndex.from_tuples(
+ [('A', Timestamp('2015-01-01')),
+ ('A', Timestamp('2015-02-01'))])
+ subcols = MultiIndex.from_tuples(
+ [('foo', Timestamp('2016-01-01')),
+ ('foo', Timestamp('2016-02-01'))])
+
+ vals = DataFrame(np.random.random((2, 2)),
+ index=subidx, columns=subcols)
+ check(target=df,
+ indexers=(subidx, subcols),
+ value=vals,
+ compare_fn=tm.assert_frame_equal, )
+ # set all columns
+ vals = DataFrame(
+ np.random.random((2, 4)), index=subidx, columns=cols)
+ check(target=df,
+ indexers=(subidx, slice(None, None, None)),
+ value=vals,
+ compare_fn=tm.assert_frame_equal, )
+ # identity
+ copy = df.copy()
+ check(target=df, indexers=(df.index, df.columns), value=df,
+ compare_fn=tm.assert_frame_equal, expected=copy)
+
+ def test_multiindex_setitem(self):
+
+ # GH 3738
+ # setting with a multi-index right hand side
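+        # sketch: df.loc[['bar']] keeps every index level, so the rhs can
+        # align level-for-level; df.loc['bar'] drops level 0, which is why
+        # the in-place op on it raises below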
+ arrays = [np.array(['bar', 'bar', 'baz', 'qux', 'qux', 'bar']),
+ np.array(['one', 'two', 'one', 'one', 'two', 'one']),
+ np.arange(0, 6, 1)]
+
+ df_orig = DataFrame(np.random.randn(6, 3), index=arrays,
+ columns=['A', 'B', 'C']).sort_index()
+
+ expected = df_orig.loc[['bar']] * 2
+ df = df_orig.copy()
+ df.loc[['bar']] *= 2
+ tm.assert_frame_equal(df.loc[['bar']], expected)
+
+ # raise because these have differing levels
+ with pytest.raises(TypeError):
+ df.loc['bar'] *= 2
+
+ # from SO
+ # http://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation
+ df_orig = DataFrame.from_dict({'price': {
+ ('DE', 'Coal', 'Stock'): 2,
+ ('DE', 'Gas', 'Stock'): 4,
+ ('DE', 'Elec', 'Demand'): 1,
+ ('FR', 'Gas', 'Stock'): 5,
+ ('FR', 'Solar', 'SupIm'): 0,
+ ('FR', 'Wind', 'SupIm'): 0
+ }})
+ df_orig.index = MultiIndex.from_tuples(df_orig.index,
+ names=['Sit', 'Com', 'Type'])
+
+ expected = df_orig.copy()
+ expected.iloc[[0, 2, 3]] *= 2
+
+ idx = pd.IndexSlice
+ df = df_orig.copy()
+ df.loc[idx[:, :, 'Stock'], :] *= 2
+ tm.assert_frame_equal(df, expected)
+
+ df = df_orig.copy()
+ df.loc[idx[:, :, 'Stock'], 'price'] *= 2
+ tm.assert_frame_equal(df, expected)
+
+ def test_multiindex_assignment(self):
+
+ # GH3777 part 2
+
+ # mixed dtype
+ df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3),
+ columns=list('abc'),
+ index=[[4, 4, 8], [8, 10, 12]])
+ df['d'] = np.nan
+ arr = np.array([0., 1.])
+
+ with catch_warnings(record=True):
+ df.ix[4, 'd'] = arr
+ tm.assert_series_equal(df.ix[4, 'd'],
+ Series(arr, index=[8, 10], name='d'))
+
+ # single dtype
+ df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3),
+ columns=list('abc'),
+ index=[[4, 4, 8], [8, 10, 12]])
+
+ with catch_warnings(record=True):
+ df.ix[4, 'c'] = arr
+ exp = Series(arr, index=[8, 10], name='c', dtype='float64')
+ tm.assert_series_equal(df.ix[4, 'c'], exp)
+
+ # scalar ok
+ with catch_warnings(record=True):
+ df.ix[4, 'c'] = 10
+ exp = Series(10, index=[8, 10], name='c', dtype='float64')
+ tm.assert_series_equal(df.ix[4, 'c'], exp)
+
+ # invalid assignments
+ with pytest.raises(ValueError):
+ with catch_warnings(record=True):
+ df.ix[4, 'c'] = [0, 1, 2, 3]
+
+ with pytest.raises(ValueError):
+ with catch_warnings(record=True):
+ df.ix[4, 'c'] = [0]
+
+ # groupby example
+ NUM_ROWS = 100
+ NUM_COLS = 10
+ col_names = ['A' + num for num in
+ map(str, np.arange(NUM_COLS).tolist())]
+ index_cols = col_names[:5]
+
+ df = DataFrame(np.random.randint(5, size=(NUM_ROWS, NUM_COLS)),
+ dtype=np.int64, columns=col_names)
+ df = df.set_index(index_cols).sort_index()
+ grp = df.groupby(level=index_cols[:4])
+ df['new_col'] = np.nan
+
+ f_index = np.arange(5)
+
+ def f(name, df2):
+ return Series(np.arange(df2.shape[0]),
+ name=df2.index.values[0]).reindex(f_index)
+
+ # TODO(wesm): unused?
+ # new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T
+
+ # we are actually operating on a copy here
+ # but in this case, that's ok
+ for name, df2 in grp:
+ new_vals = np.arange(df2.shape[0])
+ with catch_warnings(record=True):
+ df.ix[name, 'new_col'] = new_vals
+
+ def test_series_setitem(
+ self, multiindex_year_month_day_dataframe_random_data):
+ ymd = multiindex_year_month_day_dataframe_random_data
+ s = ymd['A']
+
+ s[2000, 3] = np.nan
+ assert isna(s.values[42:65]).all()
+ assert notna(s.values[:42]).all()
+ assert notna(s.values[65:]).all()
+
+ s[2000, 3, 10] = np.nan
+ assert isna(s[49])
+
+ def test_frame_getitem_setitem_boolean(
+ self, multiindex_dataframe_random_data):
+ frame = multiindex_dataframe_random_data
+ df = frame.T.copy()
+ values = df.values
+
+ result = df[df > 0]
+ expected = df.where(df > 0)
+ tm.assert_frame_equal(result, expected)
+
+ df[df > 0] = 5
+ values[values > 0] = 5
+ tm.assert_almost_equal(df.values, values)
+
+ df[df == 5] = 0
+ values[values == 5] = 0
+ tm.assert_almost_equal(df.values, values)
+
+ # a df that needs alignment first
+ df[df[:-1] < 0] = 2
+ np.putmask(values[:-1], values[:-1] < 0, 2)
+ tm.assert_almost_equal(df.values, values)
+
+ with pytest.raises(TypeError, match='boolean values only'):
+ df[df * 0] = 2
+
+ def test_frame_getitem_setitem_multislice(self):
+ levels = [['t1', 't2'], ['a', 'b', 'c']]
+ codes = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]]
+ midx = MultiIndex(codes=codes, levels=levels, names=[None, 'id'])
+ df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx)
+
+ result = df.loc[:, 'value']
+ tm.assert_series_equal(df['value'], result)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ result = df.ix[:, 'value']
+ tm.assert_series_equal(df['value'], result)
+
+ result = df.loc[df.index[1:3], 'value']
+ tm.assert_series_equal(df['value'][1:3], result)
+
+ result = df.loc[:, :]
+ tm.assert_frame_equal(df, result)
+
+ result = df
+ df.loc[:, 'value'] = 10
+ result['value'] = 10
+ tm.assert_frame_equal(df, result)
+
+ df.loc[:, :] = 10
+ tm.assert_frame_equal(df, result)
+
+ def test_frame_setitem_multi_column(self):
+ df = DataFrame(randn(10, 4), columns=[['a', 'a', 'b', 'b'],
+ [0, 1, 0, 1]])
+
+ cp = df.copy()
+ cp['a'] = cp['b']
+ tm.assert_frame_equal(cp['a'], cp['b'])
+
+ # set with ndarray
+ cp = df.copy()
+ cp['a'] = cp['b'].values
+ tm.assert_frame_equal(cp['a'], cp['b'])
+
+ # ---------------------------------------
+ # #1803
+ columns = MultiIndex.from_tuples([('A', '1'), ('A', '2'), ('B', '1')])
+ df = DataFrame(index=[1, 3, 5], columns=columns)
+
+        # per #1803 this used to add a new column instead of updating the
+        # two existing ('A', ...) columns
+        df['A'] = 0.0
+ assert (df['A'].values == 0).all()
+
+ # it broadcasts
+ df['B', '1'] = [1, 2, 3]
+ df['A'] = df['B', '1']
+
+ sliced_a1 = df['A', '1']
+ sliced_a2 = df['A', '2']
+ sliced_b1 = df['B', '1']
+ tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False)
+ tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False)
+ assert sliced_a1.name == ('A', '1')
+ assert sliced_a2.name == ('A', '2')
+ assert sliced_b1.name == ('B', '1')
+
+ def test_getitem_setitem_tuple_plus_columns(
+ self, multiindex_year_month_day_dataframe_random_data):
+ # GH #1013
+ ymd = multiindex_year_month_day_dataframe_random_data
+ df = ymd[:5]
+
+ result = df.loc[(2000, 1, 6), ['A', 'B', 'C']]
+ expected = df.loc[2000, 1, 6][['A', 'B', 'C']]
+ tm.assert_series_equal(result, expected)
+
+ def test_getitem_setitem_slice_integers(self):
+ index = MultiIndex(levels=[[0, 1, 2], [0, 2]],
+ codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])
+
+ frame = DataFrame(np.random.randn(len(index), 4), index=index,
+ columns=['a', 'b', 'c', 'd'])
+ res = frame.loc[1:2]
+ exp = frame.reindex(frame.index[2:])
+ tm.assert_frame_equal(res, exp)
+
+ frame.loc[1:2] = 7
+ assert (frame.loc[1:2] == 7).values.all()
+
+ series = Series(np.random.randn(len(index)), index=index)
+
+ res = series.loc[1:2]
+ exp = series.reindex(series.index[2:])
+ tm.assert_series_equal(res, exp)
+
+ series.loc[1:2] = 7
+ assert (series.loc[1:2] == 7).values.all()
+
+ def test_setitem_change_dtype(self, multiindex_dataframe_random_data):
+ frame = multiindex_dataframe_random_data
+ dft = frame.T
+ s = dft['foo', 'two']
+ dft['foo', 'two'] = s > s.median()
+ tm.assert_series_equal(dft['foo', 'two'], s > s.median())
+ # assert isinstance(dft._data.blocks[1].items, MultiIndex)
+
+ reindexed = dft.reindex(columns=[('foo', 'two')])
+ tm.assert_series_equal(reindexed['foo', 'two'], s > s.median())
+
+ def test_set_column_scalar_with_ix(self, multiindex_dataframe_random_data):
+ frame = multiindex_dataframe_random_data
+ subset = frame.index[[1, 4, 5]]
+
+ frame.loc[subset] = 99
+ assert (frame.loc[subset].values == 99).all()
+
+ col = frame['B']
+ col[subset] = 97
+ assert (frame.loc[subset, 'B'] == 97).all()
+
+ def test_nonunique_assignment_1750(self):
+ df = DataFrame([[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]],
+ columns=list("ABCD"))
+
+ df = df.set_index(['A', 'B'])
+ ix = MultiIndex.from_tuples([(1, 1)])
+
+ df.loc[ix, "C"] = '_'
+
+ assert (df.xs((1, 1))['C'] == '_').all()
+
+ def test_astype_assignment_with_dups(self):
+
+ # GH 4686
+ # assignment with dups that has a dtype change
+ cols = MultiIndex.from_tuples([('A', '1'), ('B', '1'), ('A', '2')])
+ df = DataFrame(np.arange(3).reshape((1, 3)),
+ columns=cols, dtype=object)
+ index = df.index.copy()
+
+ df['A'] = df['A'].astype(np.float64)
+ tm.assert_index_equal(df.index, index)
+
+
+def test_frame_setitem_view_direct(multiindex_dataframe_random_data):
+ # this works because we are modifying the underlying array
+ # really a no-no
+ df = multiindex_dataframe_random_data.T
+ df['foo'].values[:] = 0
+ assert (df['foo'].values == 0).all()
+
+
+def test_frame_setitem_copy_raises(multiindex_dataframe_random_data):
+    # will raise/warn as it's a chained assignment
+ df = multiindex_dataframe_random_data.T
+ msg = "A value is trying to be set on a copy of a slice from a DataFrame"
+ with pytest.raises(com.SettingWithCopyError, match=msg):
+ df['foo']['one'] = 2
+
+
+def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data):
+ frame = multiindex_dataframe_random_data.T
+ expected = frame
+ df = frame.copy()
+ msg = "A value is trying to be set on a copy of a slice from a DataFrame"
+ with pytest.raises(com.SettingWithCopyError, match=msg):
+ df['foo']['one'] = 2
+
+ result = df
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_slice.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_slice.py
new file mode 100644
index 00000000000..fcecb2b454e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_slice.py
@@ -0,0 +1,576 @@
+from warnings import catch_warnings
+
+import numpy as np
+import pytest
+
+from pandas.errors import UnsortedIndexError
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series, Timestamp
+from pandas.core.indexing import _non_reducing_slice
+from pandas.tests.indexing.common import _mklbl
+from pandas.util import testing as tm
+
+
[email protected]("ignore:\\n.ix:DeprecationWarning")
+class TestMultiIndexSlicers(object):
+
+ def test_per_axis_per_level_getitem(self):
+
+ # GH6134
+ # example test case
+ ix = MultiIndex.from_product([_mklbl('A', 5), _mklbl('B', 7), _mklbl(
+ 'C', 4), _mklbl('D', 2)])
+ df = DataFrame(np.arange(len(ix.get_values())), index=ix)
+
+ result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :]
+ expected = df.loc[[tuple([a, b, c, d])
+ for a, b, c, d in df.index.values
+ if (a == 'A1' or a == 'A2' or a == 'A3') and (
+ c == 'C1' or c == 'C3')]]
+ tm.assert_frame_equal(result, expected)
+
+ expected = df.loc[[tuple([a, b, c, d])
+ for a, b, c, d in df.index.values
+ if (a == 'A1' or a == 'A2' or a == 'A3') and (
+ c == 'C1' or c == 'C2' or c == 'C3')]]
+ result = df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :]
+ tm.assert_frame_equal(result, expected)
+
+ # test multi-index slicing with per axis and per index controls
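+        # each element of the row-indexer tuple addresses one index
+        # level; slice(None) means "all labels at that level"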
+ index = MultiIndex.from_tuples([('A', 1), ('A', 2),
+ ('A', 3), ('B', 1)],
+ names=['one', 'two'])
+ columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
+ ('b', 'foo'), ('b', 'bah')],
+ names=['lvl0', 'lvl1'])
+
+ df = DataFrame(
+ np.arange(16, dtype='int64').reshape(
+ 4, 4), index=index, columns=columns)
+ df = df.sort_index(axis=0).sort_index(axis=1)
+
+ # identity
+ result = df.loc[(slice(None), slice(None)), :]
+ tm.assert_frame_equal(result, df)
+ result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))]
+ tm.assert_frame_equal(result, df)
+ result = df.loc[:, (slice(None), slice(None))]
+ tm.assert_frame_equal(result, df)
+
+ # index
+ result = df.loc[(slice(None), [1]), :]
+ expected = df.iloc[[0, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[(slice(None), 1), :]
+ expected = df.iloc[[0, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ # columns
+ result = df.loc[:, (slice(None), ['foo'])]
+ expected = df.iloc[:, [1, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ # both
+ result = df.loc[(slice(None), 1), (slice(None), ['foo'])]
+ expected = df.iloc[[0, 3], [1, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc['A', 'a']
+ expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]),
+ index=Index([1, 2, 3], name='two'),
+ columns=Index(['bar', 'foo'], name='lvl1'))
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[(slice(None), [1, 2]), :]
+ expected = df.iloc[[0, 1, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ # multi-level series
+ s = Series(np.arange(len(ix.get_values())), index=ix)
+ result = s.loc['A1':'A3', :, ['C1', 'C3']]
+ expected = s.loc[[tuple([a, b, c, d])
+ for a, b, c, d in s.index.values
+ if (a == 'A1' or a == 'A2' or a == 'A3') and (
+ c == 'C1' or c == 'C3')]]
+ tm.assert_series_equal(result, expected)
+
+ # boolean indexers
+ result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :]
+ expected = df.iloc[[2, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ with pytest.raises(ValueError):
+ df.loc[(slice(None), np.array([True, False])), :]
+
+ # ambiguous cases
+        # these can be interpreted in more than one way (e.g. in this
+        # case as df.loc[slice(None), [1]] as well)
+ pytest.raises(KeyError, lambda: df.loc[slice(None), [1]])
+
+ result = df.loc[(slice(None), [1]), :]
+ expected = df.iloc[[0, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ # not lexsorted
+ assert df.index.lexsort_depth == 2
+ df = df.sort_index(level=1, axis=0)
+ assert df.index.lexsort_depth == 0
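+        # lexsort_depth is the number of leading levels that are
+        # lexically sorted; label-slicing a level beyond that depth
+        # raises UnsortedIndexError (checked next)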
+
+ msg = ('MultiIndex slicing requires the index to be '
+ r'lexsorted: slicing on levels \[1\], lexsort depth 0')
+ with pytest.raises(UnsortedIndexError, match=msg):
+ df.loc[(slice(None), slice('bar')), :]
+
+ # GH 16734: not sorted, but no real slicing
+ result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :]
+ tm.assert_frame_equal(result, df.iloc[[1, 3], :])
+
+ def test_multiindex_slicers_non_unique(self):
+
+ # GH 7106
+ # non-unique mi index support
+ df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'],
+ B=['a', 'a', 'a', 'a'],
+ C=[1, 2, 1, 3],
+ D=[1, 2, 3, 4]))
+ .set_index(['A', 'B', 'C']).sort_index())
+ assert not df.index.is_unique
+ expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'],
+ C=[1, 1], D=[1, 3]))
+ .set_index(['A', 'B', 'C']).sort_index())
+ result = df.loc[(slice(None), slice(None), 1), :]
+ tm.assert_frame_equal(result, expected)
+
+ # this is equivalent of an xs expression
+ result = df.xs(1, level=2, drop_level=False)
+ tm.assert_frame_equal(result, expected)
+
+ df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'],
+ B=['a', 'a', 'a', 'a'],
+ C=[1, 2, 1, 2],
+ D=[1, 2, 3, 4]))
+ .set_index(['A', 'B', 'C']).sort_index())
+ assert not df.index.is_unique
+ expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'],
+ C=[1, 1], D=[1, 3]))
+ .set_index(['A', 'B', 'C']).sort_index())
+ result = df.loc[(slice(None), slice(None), 1), :]
+ assert not result.index.is_unique
+ tm.assert_frame_equal(result, expected)
+
+ # GH12896
+ # numpy-implementation dependent bug
+ ints = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 16,
+ 17, 18, 19, 200000, 200000]
+ n = len(ints)
+ idx = MultiIndex.from_arrays([['a'] * n, ints])
+ result = Series([1] * n, index=idx)
+ result = result.sort_index()
+ result = result.loc[(slice(None), slice(100000))]
+ expected = Series([1] * (n - 2), index=idx[:-2]).sort_index()
+ tm.assert_series_equal(result, expected)
+
+ def test_multiindex_slicers_datetimelike(self):
+
+ # GH 7429
+ # buggy/inconsistent behavior when slicing with datetime-like
+ import datetime
+ dates = [datetime.datetime(2012, 1, 1, 12, 12, 12) +
+ datetime.timedelta(days=i) for i in range(6)]
+ freq = [1, 2]
+ index = MultiIndex.from_product(
+ [dates, freq], names=['date', 'frequency'])
+
+ df = DataFrame(
+ np.arange(6 * 2 * 4, dtype='int64').reshape(
+ -1, 4), index=index, columns=list('ABCD'))
+
+ # multi-axis slicing
+ idx = pd.IndexSlice
+ expected = df.iloc[[0, 2, 4], [0, 1]]
+ result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'),
+ Timestamp('2012-01-03 12:12:12')),
+ slice(1, 1)), slice('A', 'B')]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[(idx[Timestamp('2012-01-01 12:12:12'):Timestamp(
+ '2012-01-03 12:12:12')], idx[1:1]), slice('A', 'B')]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'),
+ Timestamp('2012-01-03 12:12:12')), 1),
+ slice('A', 'B')]
+ tm.assert_frame_equal(result, expected)
+
+ # with strings
+ result = df.loc[(slice('2012-01-01 12:12:12', '2012-01-03 12:12:12'),
+ slice(1, 1)), slice('A', 'B')]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[(idx['2012-01-01 12:12:12':'2012-01-03 12:12:12'], 1),
+ idx['A', 'B']]
+ tm.assert_frame_equal(result, expected)
+
+ def test_multiindex_slicers_edges(self):
+ # GH 8132
+ # various edge cases
+ df = DataFrame(
+ {'A': ['A0'] * 5 + ['A1'] * 5 + ['A2'] * 5,
+ 'B': ['B0', 'B0', 'B1', 'B1', 'B2'] * 3,
+ 'DATE': ["2013-06-11", "2013-07-02", "2013-07-09", "2013-07-30",
+ "2013-08-06", "2013-06-11", "2013-07-02", "2013-07-09",
+ "2013-07-30", "2013-08-06", "2013-09-03", "2013-10-01",
+ "2013-07-09", "2013-08-06", "2013-09-03"],
+ 'VALUES': [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2]})
+
+ df['DATE'] = pd.to_datetime(df['DATE'])
+ df1 = df.set_index(['A', 'B', 'DATE'])
+ df1 = df1.sort_index()
+
+ # A1 - Get all values under "A0" and "A1"
+ result = df1.loc[(slice('A1')), :]
+ expected = df1.iloc[0:10]
+ tm.assert_frame_equal(result, expected)
+
+ # A2 - Get all values from the start to "A2"
+ result = df1.loc[(slice('A2')), :]
+ expected = df1
+ tm.assert_frame_equal(result, expected)
+
+ # A3 - Get all values under "B1" or "B2"
+ result = df1.loc[(slice(None), slice('B1', 'B2')), :]
+ expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]]
+ tm.assert_frame_equal(result, expected)
+
+ # A4 - Get all values between 2013-07-02 and 2013-07-09
+ result = df1.loc[(slice(None), slice(None),
+ slice('20130702', '20130709')), :]
+ expected = df1.iloc[[1, 2, 6, 7, 12]]
+ tm.assert_frame_equal(result, expected)
+
+ # B1 - Get all values in B0 that are also under A0, A1 and A2
+ result = df1.loc[(slice('A2'), slice('B0')), :]
+ expected = df1.iloc[[0, 1, 5, 6, 10, 11]]
+ tm.assert_frame_equal(result, expected)
+
+        # B2 - Get all values in B0, B1 and B2 (similar to what A2 does
+        # for the A level)
+ result = df1.loc[(slice(None), slice('B2')), :]
+ expected = df1
+ tm.assert_frame_equal(result, expected)
+
+ # B3 - Get all values from B1 to B2 and up to 2013-08-06
+ result = df1.loc[(slice(None), slice('B1', 'B2'),
+ slice('2013-08-06')), :]
+ expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]]
+ tm.assert_frame_equal(result, expected)
+
+ # B4 - Same as A4 but the start of the date slice is not a key.
+ # shows indexing on a partial selection slice
+ result = df1.loc[(slice(None), slice(None),
+ slice('20130701', '20130709')), :]
+ expected = df1.iloc[[1, 2, 6, 7, 12]]
+ tm.assert_frame_equal(result, expected)
+
+ def test_per_axis_per_level_doc_examples(self):
+
+ # test index maker
+ idx = pd.IndexSlice
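+        # pd.IndexSlice is sugar for building tuples of slices:
+        # idx['A1':'A3', :, ['C1', 'C3']] is equivalent to
+        # (slice('A1', 'A3'), slice(None), ['C1', 'C3'])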
+
+ # from indexing.rst / advanced
+ index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2),
+ _mklbl('C', 4), _mklbl('D', 2)])
+ columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
+ ('b', 'foo'), ('b', 'bah')],
+ names=['lvl0', 'lvl1'])
+ df = DataFrame(np.arange(len(index) * len(columns), dtype='int64')
+ .reshape((len(index), len(columns))),
+ index=index, columns=columns)
+ result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :]
+ expected = df.loc[[tuple([a, b, c, d])
+ for a, b, c, d in df.index.values
+ if (a == 'A1' or a == 'A2' or a == 'A3') and (
+ c == 'C1' or c == 'C3')]]
+ tm.assert_frame_equal(result, expected)
+ result = df.loc[idx['A1':'A3', :, ['C1', 'C3']], :]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[(slice(None), slice(None), ['C1', 'C3']), :]
+ expected = df.loc[[tuple([a, b, c, d])
+ for a, b, c, d in df.index.values
+ if (c == 'C1' or c == 'C3')]]
+ tm.assert_frame_equal(result, expected)
+ result = df.loc[idx[:, :, ['C1', 'C3']], :]
+ tm.assert_frame_equal(result, expected)
+
+ # not sorted
+ with pytest.raises(UnsortedIndexError):
+ df.loc['A1', ('a', slice('foo'))]
+
+ # GH 16734: not sorted, but no real slicing
+ tm.assert_frame_equal(df.loc['A1', (slice(None), 'foo')],
+ df.loc['A1'].iloc[:, [0, 2]])
+
+ df = df.sort_index(axis=1)
+
+ # slicing
+ df.loc['A1', (slice(None), 'foo')]
+ df.loc[(slice(None), slice(None), ['C1', 'C3']), (slice(None), 'foo')]
+
+ # setitem
+ df.loc(axis=0)[:, :, ['C1', 'C3']] = -10
+
+ def test_loc_axis_arguments(self):
+
+ index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2),
+ _mklbl('C', 4), _mklbl('D', 2)])
+ columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
+ ('b', 'foo'), ('b', 'bah')],
+ names=['lvl0', 'lvl1'])
+ df = DataFrame(np.arange(len(index) * len(columns), dtype='int64')
+ .reshape((len(index), len(columns))),
+ index=index,
+ columns=columns).sort_index().sort_index(axis=1)
+
+ # axis 0
+ result = df.loc(axis=0)['A1':'A3', :, ['C1', 'C3']]
+ expected = df.loc[[tuple([a, b, c, d])
+ for a, b, c, d in df.index.values
+ if (a == 'A1' or a == 'A2' or a == 'A3') and (
+ c == 'C1' or c == 'C3')]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc(axis='index')[:, :, ['C1', 'C3']]
+ expected = df.loc[[tuple([a, b, c, d])
+ for a, b, c, d in df.index.values
+ if (c == 'C1' or c == 'C3')]]
+ tm.assert_frame_equal(result, expected)
+
+ # axis 1
+ result = df.loc(axis=1)[:, 'foo']
+ expected = df.loc[:, (slice(None), 'foo')]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc(axis='columns')[:, 'foo']
+ expected = df.loc[:, (slice(None), 'foo')]
+ tm.assert_frame_equal(result, expected)
+
+ # invalid axis
+ with pytest.raises(ValueError):
+ df.loc(axis=-1)[:, :, ['C1', 'C3']]
+
+ with pytest.raises(ValueError):
+ df.loc(axis=2)[:, :, ['C1', 'C3']]
+
+ with pytest.raises(ValueError):
+ df.loc(axis='foo')[:, :, ['C1', 'C3']]
+
+ def test_per_axis_per_level_setitem(self):
+
+ # test index maker
+ idx = pd.IndexSlice
+
+ # test multi-index slicing with per axis and per index controls
+ index = MultiIndex.from_tuples([('A', 1), ('A', 2),
+ ('A', 3), ('B', 1)],
+ names=['one', 'two'])
+ columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
+ ('b', 'foo'), ('b', 'bah')],
+ names=['lvl0', 'lvl1'])
+
+ df_orig = DataFrame(
+ np.arange(16, dtype='int64').reshape(
+ 4, 4), index=index, columns=columns)
+ df_orig = df_orig.sort_index(axis=0).sort_index(axis=1)
+
+ # identity
+ df = df_orig.copy()
+ df.loc[(slice(None), slice(None)), :] = 100
+ expected = df_orig.copy()
+ expected.iloc[:, :] = 100
+ tm.assert_frame_equal(df, expected)
+
+ df = df_orig.copy()
+ df.loc(axis=0)[:, :] = 100
+ expected = df_orig.copy()
+ expected.iloc[:, :] = 100
+ tm.assert_frame_equal(df, expected)
+
+ df = df_orig.copy()
+ df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100
+ expected = df_orig.copy()
+ expected.iloc[:, :] = 100
+ tm.assert_frame_equal(df, expected)
+
+ df = df_orig.copy()
+ df.loc[:, (slice(None), slice(None))] = 100
+ expected = df_orig.copy()
+ expected.iloc[:, :] = 100
+ tm.assert_frame_equal(df, expected)
+
+ # index
+ df = df_orig.copy()
+ df.loc[(slice(None), [1]), :] = 100
+ expected = df_orig.copy()
+ expected.iloc[[0, 3]] = 100
+ tm.assert_frame_equal(df, expected)
+
+ df = df_orig.copy()
+ df.loc[(slice(None), 1), :] = 100
+ expected = df_orig.copy()
+ expected.iloc[[0, 3]] = 100
+ tm.assert_frame_equal(df, expected)
+
+ df = df_orig.copy()
+ df.loc(axis=0)[:, 1] = 100
+ expected = df_orig.copy()
+ expected.iloc[[0, 3]] = 100
+ tm.assert_frame_equal(df, expected)
+
+ # columns
+ df = df_orig.copy()
+ df.loc[:, (slice(None), ['foo'])] = 100
+ expected = df_orig.copy()
+ expected.iloc[:, [1, 3]] = 100
+ tm.assert_frame_equal(df, expected)
+
+ # both
+ df = df_orig.copy()
+ df.loc[(slice(None), 1), (slice(None), ['foo'])] = 100
+ expected = df_orig.copy()
+ expected.iloc[[0, 3], [1, 3]] = 100
+ tm.assert_frame_equal(df, expected)
+
+ df = df_orig.copy()
+ df.loc[idx[:, 1], idx[:, ['foo']]] = 100
+ expected = df_orig.copy()
+ expected.iloc[[0, 3], [1, 3]] = 100
+ tm.assert_frame_equal(df, expected)
+
+ df = df_orig.copy()
+ df.loc['A', 'a'] = 100
+ expected = df_orig.copy()
+ expected.iloc[0:3, 0:2] = 100
+ tm.assert_frame_equal(df, expected)
+
+ # setting with a list-like
+ df = df_orig.copy()
+ df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array(
+ [[100, 100], [100, 100]], dtype='int64')
+ expected = df_orig.copy()
+ expected.iloc[[0, 3], [1, 3]] = 100
+ tm.assert_frame_equal(df, expected)
+
+ # not enough values
+ df = df_orig.copy()
+
+ with pytest.raises(ValueError):
+ df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array(
+ [[100], [100, 100]], dtype='int64')
+
+ with pytest.raises(ValueError):
+ df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array(
+ [100, 100, 100, 100], dtype='int64')
+
+ # with an alignable rhs
+ df = df_orig.copy()
+ df.loc[(slice(None), 1), (slice(None), ['foo'])] = df.loc[(slice(
+ None), 1), (slice(None), ['foo'])] * 5
+ expected = df_orig.copy()
+ expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5
+ tm.assert_frame_equal(df, expected)
+
+ df = df_orig.copy()
+ df.loc[(slice(None), 1), (slice(None), ['foo'])] *= df.loc[(slice(
+ None), 1), (slice(None), ['foo'])]
+ expected = df_orig.copy()
+ expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
+ tm.assert_frame_equal(df, expected)
+
+ rhs = df_orig.loc[(slice(None), 1), (slice(None), ['foo'])].copy()
+ rhs.loc[:, ('c', 'bah')] = 10
+ df = df_orig.copy()
+ df.loc[(slice(None), 1), (slice(None), ['foo'])] *= rhs
+ expected = df_orig.copy()
+ expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
+ tm.assert_frame_equal(df, expected)
+
+ def test_multiindex_label_slicing_with_negative_step(self):
+ s = Series(np.arange(20),
+ MultiIndex.from_product([list('abcde'), np.arange(4)]))
+ SLC = pd.IndexSlice
+
+ def assert_slices_equivalent(l_slc, i_slc):
+ tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc])
+ tm.assert_series_equal(s[l_slc], s.iloc[i_slc])
+ with catch_warnings(record=True):
+ tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc])
+
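+        # each pair asserts that a label slice matches its positional
+        # slice; with 5 outer labels x 4 inner keys, 'd'::-1 starts at
+        # the last 'd' row (position 15) and walks backwards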
+ assert_slices_equivalent(SLC[::-1], SLC[::-1])
+
+ assert_slices_equivalent(SLC['d'::-1], SLC[15::-1])
+ assert_slices_equivalent(SLC[('d', )::-1], SLC[15::-1])
+
+ assert_slices_equivalent(SLC[:'d':-1], SLC[:11:-1])
+ assert_slices_equivalent(SLC[:('d', ):-1], SLC[:11:-1])
+
+ assert_slices_equivalent(SLC['d':'b':-1], SLC[15:3:-1])
+ assert_slices_equivalent(SLC[('d', ):'b':-1], SLC[15:3:-1])
+ assert_slices_equivalent(SLC['d':('b', ):-1], SLC[15:3:-1])
+ assert_slices_equivalent(SLC[('d', ):('b', ):-1], SLC[15:3:-1])
+ assert_slices_equivalent(SLC['b':'d':-1], SLC[:0])
+
+ assert_slices_equivalent(SLC[('c', 2)::-1], SLC[10::-1])
+ assert_slices_equivalent(SLC[:('c', 2):-1], SLC[:9:-1])
+ assert_slices_equivalent(SLC[('e', 0):('c', 2):-1], SLC[16:9:-1])
+
+ def test_multiindex_slice_first_level(self):
+ # GH 12697
+ freq = ['a', 'b', 'c', 'd']
+ idx = MultiIndex.from_product([freq, np.arange(500)])
+ df = DataFrame(list(range(2000)), index=idx, columns=['Test'])
+ df_slice = df.loc[pd.IndexSlice[:, 30:70], :]
+ result = df_slice.loc['a']
+ expected = DataFrame(list(range(30, 71)),
+ columns=['Test'], index=range(30, 71))
+ tm.assert_frame_equal(result, expected)
+ result = df_slice.loc['d']
+ expected = DataFrame(list(range(1530, 1571)),
+ columns=['Test'], index=range(30, 71))
+ tm.assert_frame_equal(result, expected)
+
+ def test_int_series_slicing(
+ self, multiindex_year_month_day_dataframe_random_data):
+ ymd = multiindex_year_month_day_dataframe_random_data
+ s = ymd['A']
+ result = s[5:]
+ expected = s.reindex(s.index[5:])
+ tm.assert_series_equal(result, expected)
+
+ exp = ymd['A'].copy()
+ s[5:] = 0
+ exp.values[5:] = 0
+ tm.assert_numpy_array_equal(s.values, exp.values)
+
+ result = ymd[5:]
+ expected = ymd.reindex(s.index[5:])
+ tm.assert_frame_equal(result, expected)
+
+ def test_non_reducing_slice_on_multiindex(self):
+ # GH 19861
+ dic = {
+ ('a', 'd'): [1, 4],
+ ('a', 'c'): [2, 3],
+ ('b', 'c'): [3, 2],
+ ('b', 'd'): [4, 1]
+ }
+ df = pd.DataFrame(dic, index=[0, 1])
+ idx = pd.IndexSlice
+ slice_ = idx[:, idx['b', 'd']]
+ tslice_ = _non_reducing_slice(slice_)
+
+ result = df.loc[tslice_]
+ expected = pd.DataFrame({('b', 'd'): [4, 1]})
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_sorted.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_sorted.py
new file mode 100644
index 00000000000..f565c30fc3e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_sorted.py
@@ -0,0 +1,92 @@
+import numpy as np
+from numpy.random import randn
+
+from pandas.compat import lzip
+
+from pandas import DataFrame, MultiIndex, Series
+from pandas.util import testing as tm
+
+
+class TestMultiIndexSorted(object):
+ def test_getitem_multilevel_index_tuple_not_sorted(self):
+ index_columns = list("abc")
+ df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]],
+ columns=index_columns + ["data"])
+ df = df.set_index(index_columns)
+ query_index = df.index[:1]
+ rs = df.loc[query_index, "data"]
+
+ xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c'])
+ xp = Series(['x'], index=xp_idx, name='data')
+ tm.assert_series_equal(rs, xp)
+
+ def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data):
+ frame = multiindex_dataframe_random_data
+ df = frame.sort_index(level=1).T
+
+ # buglet with int typechecking
+ result = df.iloc[:, :np.int32(3)]
+ expected = df.reindex(columns=df.columns[:3])
+ tm.assert_frame_equal(result, expected)
+
+ def test_frame_getitem_not_sorted2(self):
+ # 13431
+ df = DataFrame({'col1': ['b', 'd', 'b', 'a'],
+ 'col2': [3, 1, 1, 2],
+ 'data': ['one', 'two', 'three', 'four']})
+
+ df2 = df.set_index(['col1', 'col2'])
+ df2_original = df2.copy()
+
+ df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True)
+ df2.index.set_codes([0, 1, 0, 2], level='col1', inplace=True)
+ assert not df2.index.is_lexsorted()
+ assert not df2.index.is_monotonic
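+        # sketch: is_lexsorted reflects the physical order of the codes
+        # and is_monotonic the order of the labels; reshuffling them in
+        # place breaks both invariants without changing index equality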
+
+ assert df2_original.index.equals(df2.index)
+ expected = df2.sort_index()
+ assert expected.index.is_lexsorted()
+ assert expected.index.is_monotonic
+
+ result = df2.sort_index(level=0)
+ assert result.index.is_lexsorted()
+ assert result.index.is_monotonic
+ tm.assert_frame_equal(result, expected)
+
+ def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data):
+ frame = multiindex_dataframe_random_data
+ df = frame.T
+ df['foo', 'four'] = 'foo'
+
+ arrays = [np.array(x) for x in zip(*df.columns.values)]
+
+ result = df['foo']
+ result2 = df.loc[:, 'foo']
+ expected = df.reindex(columns=df.columns[arrays[0] == 'foo'])
+ expected.columns = expected.columns.droplevel(0)
+ tm.assert_frame_equal(result, expected)
+ tm.assert_frame_equal(result2, expected)
+
+ df = df.T
+ result = df.xs('foo')
+ result2 = df.loc['foo']
+ expected = df.reindex(df.index[arrays[0] == 'foo'])
+ expected.index = expected.index.droplevel(0)
+ tm.assert_frame_equal(result, expected)
+ tm.assert_frame_equal(result2, expected)
+
+ def test_series_getitem_not_sorted(self):
+ arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
+ ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+ tuples = lzip(*arrays)
+ index = MultiIndex.from_tuples(tuples)
+ s = Series(randn(8), index=index)
+
+ arrays = [np.array(x) for x in zip(*index.values)]
+
+ result = s['qux']
+ result2 = s.loc['qux']
+ expected = s[arrays[0] == 'qux']
+ expected.index = expected.index.droplevel(0)
+ tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result2, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_xs.py b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_xs.py
new file mode 100644
index 00000000000..fb6d763cfcf
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/multiindex/test_xs.py
@@ -0,0 +1,237 @@
+import numpy as np
+import pytest
+
+from pandas.compat import lrange, product as cart_product
+
+from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range
+import pandas.core.common as com
+from pandas.util import testing as tm
+
+
+def four_level_index_dataframe():
+ arr = np.array([[-0.5109, -2.3358, -0.4645, 0.05076, 0.364],
+ [0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
+ [-0.6662, -0.5243, -0.358, 0.89145, 2.5838]])
+ index = MultiIndex(
+ levels=[['a', 'x'], ['b', 'q'], [10.0032, 20.0, 30.0], [3, 4, 5]],
+ codes=[[0, 0, 1], [0, 1, 1], [0, 1, 2], [2, 1, 0]],
+ names=['one', 'two', 'three', 'four'])
+ return DataFrame(arr, index=index, columns=list('ABCDE'))
+
+
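+# .xs(key, level=..., axis=...) cross-sections a single label out of one
+# (or more) levels and, by default, drops that level from the result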
[email protected]('key, level, exp_arr, exp_index', [
+ ('a', 'lvl0', lambda x: x[:, 0:2], Index(['bar', 'foo'], name='lvl1')),
+ ('foo', 'lvl1', lambda x: x[:, 1:2], Index(['a'], name='lvl0'))
+])
+def test_xs_named_levels_axis_eq_1(key, level, exp_arr, exp_index):
+ # see gh-2903
+ arr = np.random.randn(4, 4)
+ index = MultiIndex(levels=[['a', 'b'], ['bar', 'foo', 'hello', 'world']],
+ codes=[[0, 0, 1, 1], [0, 1, 2, 3]],
+ names=['lvl0', 'lvl1'])
+ df = DataFrame(arr, columns=index)
+ result = df.xs(key, level=level, axis=1)
+ expected = DataFrame(exp_arr(arr), columns=exp_index)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_xs_values(multiindex_dataframe_random_data):
+ df = multiindex_dataframe_random_data
+ result = df.xs(('bar', 'two')).values
+ expected = df.values[4]
+ tm.assert_almost_equal(result, expected)
+
+
+def test_xs_loc_equality(multiindex_dataframe_random_data):
+ df = multiindex_dataframe_random_data
+ result = df.xs(('bar', 'two'))
+ expected = df.loc[('bar', 'two')]
+ tm.assert_series_equal(result, expected)
+
+
+def test_xs_missing_values_in_index():
+ # see gh-6574
+    # missing values in the returned index should be preserved
+ acc = [
+ ('a', 'abcde', 1),
+ ('b', 'bbcde', 2),
+ ('y', 'yzcde', 25),
+ ('z', 'xbcde', 24),
+ ('z', None, 26),
+ ('z', 'zbcde', 25),
+ ('z', 'ybcde', 26),
+ ]
+ df = DataFrame(acc,
+ columns=['a1', 'a2', 'cnt']).set_index(['a1', 'a2'])
+ expected = DataFrame({'cnt': [24, 26, 25, 26]}, index=Index(
+ ['xbcde', np.nan, 'zbcde', 'ybcde'], name='a2'))
+
+ result = df.xs('z', level='a1')
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]('key, level', [
+ ('one', 'second'),
+ (['one'], ['second'])
+])
+def test_xs_with_duplicates(key, level, multiindex_dataframe_random_data):
+ # see gh-13719
+ frame = multiindex_dataframe_random_data
+ df = concat([frame] * 2)
+ assert df.index.is_unique is False
+ expected = concat([frame.xs('one', level='second')] * 2)
+
+ result = df.xs(key, level=level)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_xs_level(multiindex_dataframe_random_data):
+ df = multiindex_dataframe_random_data
+ result = df.xs('two', level='second')
+ expected = df[df.index.get_level_values(1) == 'two']
+ expected.index = Index(['foo', 'bar', 'baz', 'qux'], name='first')
+ tm.assert_frame_equal(result, expected)
+
+
+def test_xs_level_eq_2():
+ arr = np.random.randn(3, 5)
+ index = MultiIndex(
+ levels=[['a', 'p', 'x'], ['b', 'q', 'y'], ['c', 'r', 'z']],
+ codes=[[2, 0, 1], [2, 0, 1], [2, 0, 1]])
+ df = DataFrame(arr, index=index)
+ expected = DataFrame(arr[1:2], index=[['a'], ['b']])
+ result = df.xs('c', level=2)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]('indexer', [
+ lambda df: df.xs(('a', 4), level=['one', 'four']),
+ lambda df: df.xs('a').xs(4, level='four')
+])
+def test_xs_level_multiple(indexer, four_level_index_dataframe):
+ df = four_level_index_dataframe
+ expected_values = [[0.4473, 1.4152, 0.2834, 1.00661, 0.1744]]
+ expected_index = MultiIndex(
+ levels=[['q'], [20.0]],
+ codes=[[0], [0]],
+ names=['two', 'three'])
+ expected = DataFrame(
+ expected_values, index=expected_index, columns=list('ABCDE'))
+ result = indexer(df)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_xs_setting_with_copy_error(multiindex_dataframe_random_data):
+ # this is a copy in 0.14
+ df = multiindex_dataframe_random_data
+ result = df.xs('two', level='second')
+
+ # setting this will give a SettingWithCopyError
+ # as we are trying to write a view
+ msg = 'A value is trying to be set on a copy of a slice from a DataFrame'
+ with pytest.raises(com.SettingWithCopyError, match=msg):
+ result[:] = 10
+
+
+def test_xs_setting_with_copy_error_multiple(four_level_index_dataframe):
+ # this is a copy in 0.14
+ df = four_level_index_dataframe
+ result = df.xs(('a', 4), level=['one', 'four'])
+
+ # setting this will give a SettingWithCopyError
+ # as we are trying to write a view
+ msg = 'A value is trying to be set on a copy of a slice from a DataFrame'
+ with pytest.raises(com.SettingWithCopyError, match=msg):
+ result[:] = 10
+
+
+def test_xs_integer_key():
+ # see gh-2107
+ dates = lrange(20111201, 20111205)
+ ids = 'abcde'
+ index = MultiIndex.from_tuples(
+ [x for x in cart_product(dates, ids)],
+ names=['date', 'secid'])
+ df = DataFrame(
+ np.random.randn(len(index), 3), index, ['X', 'Y', 'Z'])
+
+ result = df.xs(20111201, level='date')
+ expected = df.loc[20111201, :]
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]('indexer', [
+ lambda df: df.xs('a', level=0),
+ lambda df: df.xs('a')
+])
+def test_xs_level0(indexer, four_level_index_dataframe):
+ df = four_level_index_dataframe
+ expected_values = [[-0.5109, -2.3358, -0.4645, 0.05076, 0.364],
+ [0.4473, 1.4152, 0.2834, 1.00661, 0.1744]]
+ expected_index = MultiIndex(
+ levels=[['b', 'q'], [10.0032, 20.0], [4, 5]],
+ codes=[[0, 1], [0, 1], [1, 0]],
+ names=['two', 'three', 'four'])
+ expected = DataFrame(
+ expected_values, index=expected_index, columns=list('ABCDE'))
+
+ result = indexer(df)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_xs_level_series(multiindex_dataframe_random_data):
+ # this test is not explicitly testing .xs functionality
+ # TODO: move to another module or refactor
+ df = multiindex_dataframe_random_data
+ s = df['A']
+ result = s[:, 'two']
+ expected = df.xs('two', level=1)['A']
+ tm.assert_series_equal(result, expected)
+
+
+def test_xs_level_series_ymd(multiindex_year_month_day_dataframe_random_data):
+ # this test is not explicitly testing .xs functionality
+ # TODO: move to another module or refactor
+ df = multiindex_year_month_day_dataframe_random_data
+ s = df['A']
+ result = s[2000, 5]
+ expected = df.loc[2000, 5]['A']
+ tm.assert_series_equal(result, expected)
+
+
+def test_xs_level_series_slice_not_implemented(
+ multiindex_year_month_day_dataframe_random_data):
+ # this test is not explicitly testing .xs functionality
+ # TODO: move to another module or refactor
+ # not implementing this for now
+ df = multiindex_year_month_day_dataframe_random_data
+ s = df['A']
+
+ msg = r'\(2000, slice\(3, 4, None\)\)'
+ with pytest.raises(TypeError, match=msg):
+ s[2000, 3:4]
+
+
+def test_series_getitem_multiindex_xs():
+ # GH6258
+ dt = list(date_range('20130903', periods=3))
+ idx = MultiIndex.from_product([list('AB'), dt])
+ s = Series([1, 3, 4, 1, 3, 4], index=idx)
+ expected = Series([1, 1], index=list('AB'))
+
+ result = s.xs('20130903', level=1)
+ tm.assert_series_equal(result, expected)
+
+
+def test_series_getitem_multiindex_xs_by_label():
+ # GH5684
+ idx = MultiIndex.from_tuples([('a', 'one'), ('a', 'two'), ('b', 'one'),
+ ('b', 'two')])
+ s = Series([1, 2, 3, 4], index=idx)
+ s.index.set_names(['L1', 'L2'], inplace=True)
+ expected = Series([1, 3], index=['a', 'b'])
+ expected.index.set_names(['L1'], inplace=True)
+
+ result = s.xs('one', level='L2')
+ tm.assert_series_equal(result, expected)
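+
+
+def _xs_series_partial_indexing_sketch():
+    # Illustrative sketch (not collected by pytest): for a MultiIndex
+    # Series, .xs(label, level=...) is the explicit spelling of partial
+    # indexing with s[:, label]; both drop the selected level.
+    idx = MultiIndex.from_tuples([('a', 'one'), ('a', 'two'),
+                                  ('b', 'one'), ('b', 'two')])
+    s = Series([1, 2, 3, 4], index=idx)
+    tm.assert_series_equal(s.xs('one', level=1), s[:, 'one'])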
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_callable.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_callable.py
new file mode 100644
index 00000000000..d8f65c211a1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_callable.py
@@ -0,0 +1,268 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=W0612,E1101
+
+import numpy as np
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+class TestIndexingCallable(object):
+
+ def test_frame_loc_ix_callable(self):
+ # GH 11485
+ df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': list('aabb'),
+ 'C': [1, 2, 3, 4]})
+ # iloc cannot use boolean Series (see GH3635)
+
+ # return bool indexer
+ res = df.loc[lambda x: x.A > 2]
+ tm.assert_frame_equal(res, df.loc[df.A > 2])
+
+ res = df.loc[lambda x: x.A > 2, ]
+ tm.assert_frame_equal(res, df.loc[df.A > 2, ])
+
+ res = df.loc[lambda x: x.B == 'b', :]
+ tm.assert_frame_equal(res, df.loc[df.B == 'b', :])
+
+ res = df.loc[lambda x: x.A > 2, lambda x: x.columns == 'B']
+ tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]])
+
+ res = df.loc[lambda x: x.A > 2, lambda x: 'B']
+ tm.assert_series_equal(res, df.loc[df.A > 2, 'B'])
+
+ res = df.loc[lambda x: x.A > 2, lambda x: ['A', 'B']]
+ tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']])
+
+ res = df.loc[lambda x: x.A == 2, lambda x: ['A', 'B']]
+ tm.assert_frame_equal(res, df.loc[df.A == 2, ['A', 'B']])
+
+ # scalar
+ res = df.loc[lambda x: 1, lambda x: 'A']
+ assert res == df.loc[1, 'A']
+
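+    def _loc_callable_semantics_sketch(self):
+        # Illustrative sketch (not collected by pytest): a callable passed
+        # to .loc is simply called with the object being indexed, and its
+        # return value is used as the indexer -- df.loc[f] is equivalent
+        # to df.loc[f(df)].
+        df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': list('aabb'),
+                           'C': [1, 2, 3, 4]})
+
+        def f(x):
+            return x.A > 2
+
+        tm.assert_frame_equal(df.loc[f], df.loc[f(df)])
+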
+ def test_frame_loc_ix_callable_mixture(self):
+ # GH 11485
+ df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': list('aabb'),
+ 'C': [1, 2, 3, 4]})
+
+ res = df.loc[lambda x: x.A > 2, ['A', 'B']]
+ tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']])
+
+ res = df.loc[[2, 3], lambda x: ['A', 'B']]
+ tm.assert_frame_equal(res, df.loc[[2, 3], ['A', 'B']])
+
+ res = df.loc[3, lambda x: ['A', 'B']]
+ tm.assert_series_equal(res, df.loc[3, ['A', 'B']])
+
+ def test_frame_loc_callable(self):
+ # GH 11485
+ df = pd.DataFrame({'X': [1, 2, 3, 4],
+ 'Y': list('aabb')},
+ index=list('ABCD'))
+
+ # return label
+ res = df.loc[lambda x: ['A', 'C']]
+ tm.assert_frame_equal(res, df.loc[['A', 'C']])
+
+ res = df.loc[lambda x: ['A', 'C'], ]
+ tm.assert_frame_equal(res, df.loc[['A', 'C'], ])
+
+ res = df.loc[lambda x: ['A', 'C'], :]
+ tm.assert_frame_equal(res, df.loc[['A', 'C'], :])
+
+ res = df.loc[lambda x: ['A', 'C'], lambda x: 'X']
+ tm.assert_series_equal(res, df.loc[['A', 'C'], 'X'])
+
+ res = df.loc[lambda x: ['A', 'C'], lambda x: ['X']]
+ tm.assert_frame_equal(res, df.loc[['A', 'C'], ['X']])
+
+ # mixture
+ res = df.loc[['A', 'C'], lambda x: 'X']
+ tm.assert_series_equal(res, df.loc[['A', 'C'], 'X'])
+
+ res = df.loc[['A', 'C'], lambda x: ['X']]
+ tm.assert_frame_equal(res, df.loc[['A', 'C'], ['X']])
+
+ res = df.loc[lambda x: ['A', 'C'], 'X']
+ tm.assert_series_equal(res, df.loc[['A', 'C'], 'X'])
+
+ res = df.loc[lambda x: ['A', 'C'], ['X']]
+ tm.assert_frame_equal(res, df.loc[['A', 'C'], ['X']])
+
+ def test_frame_loc_callable_setitem(self):
+ # GH 11485
+ df = pd.DataFrame({'X': [1, 2, 3, 4],
+ 'Y': list('aabb')},
+ index=list('ABCD'))
+
+ # return label
+ res = df.copy()
+ res.loc[lambda x: ['A', 'C']] = -20
+ exp = df.copy()
+ exp.loc[['A', 'C']] = -20
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.loc[lambda x: ['A', 'C'], :] = 20
+ exp = df.copy()
+ exp.loc[['A', 'C'], :] = 20
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.loc[lambda x: ['A', 'C'], lambda x: 'X'] = -1
+ exp = df.copy()
+ exp.loc[['A', 'C'], 'X'] = -1
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.loc[lambda x: ['A', 'C'], lambda x: ['X']] = [5, 10]
+ exp = df.copy()
+ exp.loc[['A', 'C'], ['X']] = [5, 10]
+ tm.assert_frame_equal(res, exp)
+
+ # mixture
+ res = df.copy()
+ res.loc[['A', 'C'], lambda x: 'X'] = np.array([-1, -2])
+ exp = df.copy()
+ exp.loc[['A', 'C'], 'X'] = np.array([-1, -2])
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.loc[['A', 'C'], lambda x: ['X']] = 10
+ exp = df.copy()
+ exp.loc[['A', 'C'], ['X']] = 10
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.loc[lambda x: ['A', 'C'], 'X'] = -2
+ exp = df.copy()
+ exp.loc[['A', 'C'], 'X'] = -2
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.loc[lambda x: ['A', 'C'], ['X']] = -4
+ exp = df.copy()
+ exp.loc[['A', 'C'], ['X']] = -4
+ tm.assert_frame_equal(res, exp)
+
+ def test_frame_iloc_callable(self):
+ # GH 11485
+ df = pd.DataFrame({'X': [1, 2, 3, 4],
+ 'Y': list('aabb')},
+ index=list('ABCD'))
+
+ # return location
+ res = df.iloc[lambda x: [1, 3]]
+ tm.assert_frame_equal(res, df.iloc[[1, 3]])
+
+ res = df.iloc[lambda x: [1, 3], :]
+ tm.assert_frame_equal(res, df.iloc[[1, 3], :])
+
+ res = df.iloc[lambda x: [1, 3], lambda x: 0]
+ tm.assert_series_equal(res, df.iloc[[1, 3], 0])
+
+ res = df.iloc[lambda x: [1, 3], lambda x: [0]]
+ tm.assert_frame_equal(res, df.iloc[[1, 3], [0]])
+
+ # mixture
+ res = df.iloc[[1, 3], lambda x: 0]
+ tm.assert_series_equal(res, df.iloc[[1, 3], 0])
+
+ res = df.iloc[[1, 3], lambda x: [0]]
+ tm.assert_frame_equal(res, df.iloc[[1, 3], [0]])
+
+ res = df.iloc[lambda x: [1, 3], 0]
+ tm.assert_series_equal(res, df.iloc[[1, 3], 0])
+
+ res = df.iloc[lambda x: [1, 3], [0]]
+ tm.assert_frame_equal(res, df.iloc[[1, 3], [0]])
+
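+    def _iloc_positional_indexer_sketch(self):
+        # Illustrative sketch (not collected by pytest): callables under
+        # .iloc must return positional indexers; a plain numpy boolean
+        # array works, while a boolean Series does not (see the GH 3635
+        # note above -- its index would have to be interpreted).
+        df = pd.DataFrame({'X': [1, 2, 3, 4]})
+        res = df.iloc[lambda x: (x.X > 2).values]
+        tm.assert_frame_equal(res, df.iloc[[2, 3]])
+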
+ def test_frame_iloc_callable_setitem(self):
+ # GH 11485
+ df = pd.DataFrame({'X': [1, 2, 3, 4],
+ 'Y': list('aabb')},
+ index=list('ABCD'))
+
+ # return location
+ res = df.copy()
+ res.iloc[lambda x: [1, 3]] = 0
+ exp = df.copy()
+ exp.iloc[[1, 3]] = 0
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.iloc[lambda x: [1, 3], :] = -1
+ exp = df.copy()
+ exp.iloc[[1, 3], :] = -1
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.iloc[lambda x: [1, 3], lambda x: 0] = 5
+ exp = df.copy()
+ exp.iloc[[1, 3], 0] = 5
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.iloc[lambda x: [1, 3], lambda x: [0]] = 25
+ exp = df.copy()
+ exp.iloc[[1, 3], [0]] = 25
+ tm.assert_frame_equal(res, exp)
+
+ # mixture
+ res = df.copy()
+ res.iloc[[1, 3], lambda x: 0] = -3
+ exp = df.copy()
+ exp.iloc[[1, 3], 0] = -3
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.iloc[[1, 3], lambda x: [0]] = -5
+ exp = df.copy()
+ exp.iloc[[1, 3], [0]] = -5
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.iloc[lambda x: [1, 3], 0] = 10
+ exp = df.copy()
+ exp.iloc[[1, 3], 0] = 10
+ tm.assert_frame_equal(res, exp)
+
+ res = df.copy()
+ res.iloc[lambda x: [1, 3], [0]] = [-5, -5]
+ exp = df.copy()
+ exp.iloc[[1, 3], [0]] = [-5, -5]
+ tm.assert_frame_equal(res, exp)
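+
+
+def _callable_chaining_sketch():
+    # Illustrative sketch (not collected by pytest): the point of callable
+    # indexers (GH 11485) is method chaining -- selecting on a derived
+    # frame without binding it to an intermediate variable first.
+    df = pd.DataFrame({'X': [1, 2, 3, 4], 'Y': list('aabb')})
+    res = df.assign(Z=df.X * 2).loc[lambda d: d.Z > 4]
+    tm.assert_frame_equal(res, df.assign(Z=df.X * 2)[df.X * 2 > 4])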
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_categorical.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_categorical.py
new file mode 100644
index 00000000000..b7443e24213
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_categorical.py
@@ -0,0 +1,717 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+import pandas.compat as compat
+
+from pandas.core.dtypes.common import is_categorical_dtype
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import (
+ Categorical, CategoricalIndex, DataFrame, Index, Interval, Series,
+ Timestamp)
+from pandas.api.types import CategoricalDtype as CDT
+from pandas.util import testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestCategoricalIndex(object):
+
+ def setup_method(self, method):
+
+ self.df = DataFrame({'A': np.arange(6, dtype='int64'),
+ 'B': Series(list('aabbca')).astype(
+ CDT(list('cab')))}).set_index('B')
+ self.df2 = DataFrame({'A': np.arange(6, dtype='int64'),
+ 'B': Series(list('aabbca')).astype(
+ CDT(list('cabe')))}).set_index('B')
+ self.df3 = DataFrame({'A': np.arange(6, dtype='int64'),
+ 'B': (Series([1, 1, 2, 1, 3, 2])
+ .astype(CDT([3, 2, 1], ordered=True)))
+ }).set_index('B')
+ self.df4 = DataFrame({'A': np.arange(6, dtype='int64'),
+ 'B': (Series([1, 1, 2, 1, 3, 2])
+ .astype(CDT([3, 2, 1], ordered=False)))
+ }).set_index('B')
+
+ def test_loc_scalar(self):
+ result = self.df.loc['a']
+ expected = (DataFrame({'A': [0, 1, 5],
+ 'B': (Series(list('aaa'))
+ .astype(CDT(list('cab'))))})
+ .set_index('B'))
+ assert_frame_equal(result, expected)
+
+ df = self.df.copy()
+ df.loc['a'] = 20
+ expected = (DataFrame({'A': [20, 20, 2, 3, 4, 20],
+ 'B': (Series(list('aabbca'))
+ .astype(CDT(list('cab'))))})
+ .set_index('B'))
+ assert_frame_equal(df, expected)
+
+ # value not in the categories
+ pytest.raises(KeyError, lambda: df.loc['d'])
+
+ def f():
+ df.loc['d'] = 10
+
+ pytest.raises(TypeError, f)
+
+ def f():
+ df.loc['d', 'A'] = 10
+
+ pytest.raises(TypeError, f)
+
+ def f():
+ df.loc['d', 'C'] = 10
+
+ pytest.raises(TypeError, f)
+
+ def test_getitem_scalar(self):
+
+ cats = Categorical([Timestamp('12-31-1999'),
+ Timestamp('12-31-2000')])
+
+ s = Series([1, 2], index=cats)
+
+ expected = s.iloc[0]
+ result = s[cats[0]]
+ assert result == expected
+
+ def test_slicing_directly(self):
+ cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
+ sliced = cat[3]
+ assert sliced == "d"
+ sliced = cat[3:5]
+ expected = Categorical(["d", "a"], categories=['a', 'b', 'c', 'd'])
+ tm.assert_numpy_array_equal(sliced._codes, expected._codes)
+ tm.assert_index_equal(sliced.categories, expected.categories)
+
+ def test_slicing(self):
+ cat = Series(Categorical([1, 2, 3, 4]))
+        rev = cat[::-1]  # avoid shadowing the builtin ``reversed``
+        exp = np.array([4, 3, 2, 1], dtype=np.int64)
+        tm.assert_numpy_array_equal(rev.__array__(), exp)
+
+ df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
+ df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])
+
+ expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10)
+ result = df.iloc[10]
+ tm.assert_series_equal(result, expected)
+
+ expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
+ index=np.arange(10, 20).astype('int64'))
+ expected['D'] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
+ result = df.iloc[10:20]
+ tm.assert_frame_equal(result, expected)
+
+ expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8)
+ result = df.loc[8]
+ tm.assert_series_equal(result, expected)
+
+ def test_slicing_and_getting_ops(self):
+
+ # systematically test the slicing operations:
+ # for all slicing ops:
+ # - returning a dataframe
+ # - returning a column
+ # - returning a row
+ # - returning a single value
+
+ cats = Categorical(
+ ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"])
+ idx = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values = [1, 2, 3, 4, 5, 6, 7]
+ df = DataFrame({"cats": cats, "values": values}, index=idx)
+
+ # the expected values
+ cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
+ idx2 = Index(["j", "k"])
+ values2 = [3, 4]
+
+ # 2:4,: | "j":"k",:
+ exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)
+
+ # :,"cats" | :,0
+ exp_col = Series(cats, index=idx, name='cats')
+
+ # "j",: | 2,:
+ exp_row = Series(["b", 3], index=["cats", "values"], dtype="object",
+ name="j")
+
+ # "j","cats | 2,0
+ exp_val = "b"
+
+ # iloc
+ # frame
+ res_df = df.iloc[2:4, :]
+ tm.assert_frame_equal(res_df, exp_df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ # row
+ res_row = df.iloc[2, :]
+ tm.assert_series_equal(res_row, exp_row)
+ assert isinstance(res_row["cats"], compat.string_types)
+
+ # col
+ res_col = df.iloc[:, 0]
+ tm.assert_series_equal(res_col, exp_col)
+ assert is_categorical_dtype(res_col)
+
+ # single value
+ res_val = df.iloc[2, 0]
+ assert res_val == exp_val
+
+ # loc
+ # frame
+ res_df = df.loc["j":"k", :]
+ tm.assert_frame_equal(res_df, exp_df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ # row
+ res_row = df.loc["j", :]
+ tm.assert_series_equal(res_row, exp_row)
+ assert isinstance(res_row["cats"], compat.string_types)
+
+ # col
+ res_col = df.loc[:, "cats"]
+ tm.assert_series_equal(res_col, exp_col)
+ assert is_categorical_dtype(res_col)
+
+ # single value
+ res_val = df.loc["j", "cats"]
+ assert res_val == exp_val
+
+        # ix (deprecated; the same checks are exercised via .loc)
+ # frame
+ # res_df = df.loc["j":"k",[0,1]] # doesn't work?
+ res_df = df.loc["j":"k", :]
+ tm.assert_frame_equal(res_df, exp_df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ # row
+ res_row = df.loc["j", :]
+ tm.assert_series_equal(res_row, exp_row)
+ assert isinstance(res_row["cats"], compat.string_types)
+
+ # col
+ res_col = df.loc[:, "cats"]
+ tm.assert_series_equal(res_col, exp_col)
+ assert is_categorical_dtype(res_col)
+
+ # single value
+ res_val = df.loc["j", df.columns[0]]
+ assert res_val == exp_val
+
+ # iat
+ res_val = df.iat[2, 0]
+ assert res_val == exp_val
+
+ # at
+ res_val = df.at["j", "cats"]
+ assert res_val == exp_val
+
+ # fancy indexing
+ exp_fancy = df.iloc[[2]]
+
+ res_fancy = df[df["cats"] == "b"]
+ tm.assert_frame_equal(res_fancy, exp_fancy)
+ res_fancy = df[df["values"] == 3]
+ tm.assert_frame_equal(res_fancy, exp_fancy)
+
+ # get_value
+ res_val = df.at["j", "cats"]
+ assert res_val == exp_val
+
+ # i : int, slice, or sequence of integers
+ res_row = df.iloc[2]
+ tm.assert_series_equal(res_row, exp_row)
+ assert isinstance(res_row["cats"], compat.string_types)
+
+ res_df = df.iloc[slice(2, 4)]
+ tm.assert_frame_equal(res_df, exp_df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ res_df = df.iloc[[2, 3]]
+ tm.assert_frame_equal(res_df, exp_df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ res_col = df.iloc[:, 0]
+ tm.assert_series_equal(res_col, exp_col)
+ assert is_categorical_dtype(res_col)
+
+ res_df = df.iloc[:, slice(0, 2)]
+ tm.assert_frame_equal(res_df, df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ res_df = df.iloc[:, [0, 1]]
+ tm.assert_frame_equal(res_df, df)
+ assert is_categorical_dtype(res_df["cats"])
+
+ def test_slicing_doc_examples(self):
+
+ # GH 7918
+ cats = Categorical(["a", "b", "b", "b", "c", "c", "c"],
+ categories=["a", "b", "c"])
+        idx = Index(["h", "i", "j", "k", "l", "m", "n"])
+ values = [1, 2, 2, 2, 3, 4, 5]
+ df = DataFrame({"cats": cats, "values": values}, index=idx)
+
+ result = df.iloc[2:4, :]
+ expected = DataFrame(
+ {"cats": Categorical(['b', 'b'], categories=['a', 'b', 'c']),
+ "values": [2, 2]}, index=['j', 'k'])
+ tm.assert_frame_equal(result, expected)
+
+ result = df.iloc[2:4, :].dtypes
+ expected = Series(['category', 'int64'], ['cats', 'values'])
+ tm.assert_series_equal(result, expected)
+
+ result = df.loc["h":"j", "cats"]
+ expected = Series(Categorical(['a', 'b', 'b'],
+ categories=['a', 'b', 'c']),
+ index=['h', 'i', 'j'], name='cats')
+ tm.assert_series_equal(result, expected)
+
+ result = df.loc["h":"j", df.columns[0:1]]
+ expected = DataFrame({'cats': Categorical(['a', 'b', 'b'],
+ categories=['a', 'b', 'c'])},
+ index=['h', 'i', 'j'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_getitem_category_type(self):
+ # GH 14580
+ # test iloc() on Series with Categorical data
+
+ s = Series([1, 2, 3]).astype('category')
+
+ # get slice
+ result = s.iloc[0:2]
+ expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3]))
+ tm.assert_series_equal(result, expected)
+
+ # get list of indexes
+ result = s.iloc[[0, 1]]
+ expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3]))
+ tm.assert_series_equal(result, expected)
+
+ # get boolean array
+ result = s.iloc[[True, False, False]]
+ expected = Series([1]).astype(CategoricalDtype([1, 2, 3]))
+ tm.assert_series_equal(result, expected)
+
+ def test_loc_listlike(self):
+
+ # list of labels
+ result = self.df.loc[['c', 'a']]
+ expected = self.df.iloc[[4, 0, 1, 5]]
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ result = self.df2.loc[['a', 'b', 'e']]
+ exp_index = CategoricalIndex(
+ list('aaabbe'), categories=list('cabe'), name='B')
+ expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ # element in the categories but not in the values
+ pytest.raises(KeyError, lambda: self.df2.loc['e'])
+
+ # assign is ok
+ df = self.df2.copy()
+ df.loc['e'] = 20
+ result = df.loc[['a', 'b', 'e']]
+ exp_index = CategoricalIndex(
+ list('aaabbe'), categories=list('cabe'), name='B')
+ expected = DataFrame({'A': [0, 1, 5, 2, 3, 20]}, index=exp_index)
+ assert_frame_equal(result, expected)
+
+ df = self.df2.copy()
+ result = df.loc[['a', 'b', 'e']]
+ exp_index = CategoricalIndex(
+ list('aaabbe'), categories=list('cabe'), name='B')
+ expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ # not all labels in the categories
+ with pytest.raises(KeyError):
+ self.df2.loc[['a', 'd']]
+
+ def test_loc_listlike_dtypes(self):
+ # GH 11586
+
+ # unique categories and codes
+ index = CategoricalIndex(['a', 'b', 'c'])
+ df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)
+
+ # unique slice
+ res = df.loc[['a', 'b']]
+ exp_index = CategoricalIndex(['a', 'b'],
+ categories=index.categories)
+ exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index)
+ tm.assert_frame_equal(res, exp, check_index_type=True)
+
+ # duplicated slice
+ res = df.loc[['a', 'a', 'b']]
+
+ exp_index = CategoricalIndex(['a', 'a', 'b'],
+ categories=index.categories)
+ exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index)
+ tm.assert_frame_equal(res, exp, check_index_type=True)
+
+ msg = ('a list-indexer must only include '
+ 'values that are in the categories')
+ with pytest.raises(KeyError, match=msg):
+ df.loc[['a', 'x']]
+
+ # duplicated categories and codes
+ index = CategoricalIndex(['a', 'b', 'a'])
+ df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)
+
+ # unique slice
+ res = df.loc[['a', 'b']]
+ exp = DataFrame({'A': [1, 3, 2],
+ 'B': [4, 6, 5]},
+ index=CategoricalIndex(['a', 'a', 'b']))
+ tm.assert_frame_equal(res, exp, check_index_type=True)
+
+ # duplicated slice
+ res = df.loc[['a', 'a', 'b']]
+        exp = DataFrame({'A': [1, 3, 1, 3, 2],
+                         'B': [4, 6, 4, 6, 5]},
+                        index=CategoricalIndex(['a', 'a', 'a', 'a', 'b']))
+ tm.assert_frame_equal(res, exp, check_index_type=True)
+
+ msg = ('a list-indexer must only include values '
+ 'that are in the categories')
+ with pytest.raises(KeyError, match=msg):
+ df.loc[['a', 'x']]
+
+ # contains unused category
+ index = CategoricalIndex(
+ ['a', 'b', 'a', 'c'], categories=list('abcde'))
+ df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index)
+
+ res = df.loc[['a', 'b']]
+ exp = DataFrame({'A': [1, 3, 2], 'B': [5, 7, 6]},
+ index=CategoricalIndex(['a', 'a', 'b'],
+ categories=list('abcde')))
+ tm.assert_frame_equal(res, exp, check_index_type=True)
+
+ res = df.loc[['a', 'e']]
+ exp = DataFrame({'A': [1, 3, np.nan], 'B': [5, 7, np.nan]},
+ index=CategoricalIndex(['a', 'a', 'e'],
+ categories=list('abcde')))
+ tm.assert_frame_equal(res, exp, check_index_type=True)
+
+ # duplicated slice
+ res = df.loc[['a', 'a', 'b']]
+ exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6]},
+ index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'],
+ categories=list('abcde')))
+ tm.assert_frame_equal(res, exp, check_index_type=True)
+
+ msg = ('a list-indexer must only include values '
+ 'that are in the categories')
+ with pytest.raises(KeyError, match=msg):
+ df.loc[['a', 'x']]
+
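+    def _listlike_category_membership_sketch(self):
+        # Illustrative sketch (not collected by pytest): with a
+        # CategoricalIndex a list indexer distinguishes two cases -- a
+        # declared-but-unused category reindexes to NaN, while a label
+        # outside the categories raises KeyError.
+        index = CategoricalIndex(['a', 'b'], categories=list('abc'))
+        df = DataFrame({'A': [1, 2]}, index=index)
+        assert df.loc[['a', 'c']]['A'].isna().tolist() == [False, True]
+        with pytest.raises(KeyError):
+            df.loc[['a', 'x']]  # 'x' is not among the categories
+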
+ def test_get_indexer_array(self):
+ arr = np.array([Timestamp('1999-12-31 00:00:00'),
+ Timestamp('2000-12-31 00:00:00')], dtype=object)
+ cats = [Timestamp('1999-12-31 00:00:00'),
+ Timestamp('2000-12-31 00:00:00')]
+ ci = CategoricalIndex(cats,
+ categories=cats,
+ ordered=False, dtype='category')
+ result = ci.get_indexer(arr)
+ expected = np.array([0, 1], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_get_indexer_same_categories_same_order(self):
+ ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'])
+
+ result = ci.get_indexer(CategoricalIndex(['b', 'b'],
+ categories=['a', 'b']))
+ expected = np.array([1, 1], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_get_indexer_same_categories_different_order(self):
+ # https://github.com/pandas-dev/pandas/issues/19551
+ ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'])
+
+ result = ci.get_indexer(CategoricalIndex(['b', 'b'],
+ categories=['b', 'a']))
+ expected = np.array([1, 1], dtype='intp')
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_getitem_with_listlike(self):
+ # GH 16115
+ cats = Categorical([Timestamp('12-31-1999'),
+ Timestamp('12-31-2000')])
+
+ expected = DataFrame([[1, 0], [0, 1]], dtype='uint8',
+ index=[0, 1], columns=cats)
+ dummies = pd.get_dummies(cats)
+ result = dummies[[c for c in dummies.columns]]
+ assert_frame_equal(result, expected)
+
+ def test_setitem_listlike(self):
+
+ # GH 9469
+ # properly coerce the input indexers
+ np.random.seed(1)
+ c = Categorical(np.random.randint(0, 5, size=150000).astype(
+ np.int8)).add_categories([-1000])
+ indexer = np.array([100000]).astype(np.int64)
+ c[indexer] = -1000
+
+        # we are asserting the codes here: code 5
+        # maps to the appended -1000 category
+ result = c.codes[np.array([100000]).astype(np.int64)]
+ tm.assert_numpy_array_equal(result, np.array([5], dtype='int8'))
+
+ def test_ix_categorical_index(self):
+ # GH 12531
+ df = DataFrame(np.random.randn(3, 3),
+ index=list('ABC'), columns=list('XYZ'))
+ cdf = df.copy()
+ cdf.index = CategoricalIndex(df.index)
+ cdf.columns = CategoricalIndex(df.columns)
+
+ expect = Series(df.loc['A', :], index=cdf.columns, name='A')
+ assert_series_equal(cdf.loc['A', :], expect)
+
+ expect = Series(df.loc[:, 'X'], index=cdf.index, name='X')
+ assert_series_equal(cdf.loc[:, 'X'], expect)
+
+ exp_index = CategoricalIndex(list('AB'), categories=['A', 'B', 'C'])
+ expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
+ index=exp_index)
+ assert_frame_equal(cdf.loc[['A', 'B'], :], expect)
+
+ exp_columns = CategoricalIndex(list('XY'),
+ categories=['X', 'Y', 'Z'])
+ expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
+ columns=exp_columns)
+ assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect)
+
+ # non-unique
+ df = DataFrame(np.random.randn(3, 3),
+ index=list('ABA'), columns=list('XYX'))
+ cdf = df.copy()
+ cdf.index = CategoricalIndex(df.index)
+ cdf.columns = CategoricalIndex(df.columns)
+
+ exp_index = CategoricalIndex(list('AA'), categories=['A', 'B'])
+ expect = DataFrame(df.loc['A', :], columns=cdf.columns,
+ index=exp_index)
+ assert_frame_equal(cdf.loc['A', :], expect)
+
+ exp_columns = CategoricalIndex(list('XX'), categories=['X', 'Y'])
+ expect = DataFrame(df.loc[:, 'X'], index=cdf.index,
+ columns=exp_columns)
+ assert_frame_equal(cdf.loc[:, 'X'], expect)
+
+ expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
+ index=CategoricalIndex(list('AAB')))
+ assert_frame_equal(cdf.loc[['A', 'B'], :], expect)
+
+ expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
+ columns=CategoricalIndex(list('XXY')))
+ assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect)
+
+ def test_read_only_source(self):
+ # GH 10043
+ rw_array = np.eye(10)
+ rw_df = DataFrame(rw_array)
+
+ ro_array = np.eye(10)
+ ro_array.setflags(write=False)
+ ro_df = DataFrame(ro_array)
+
+ assert_frame_equal(rw_df.iloc[[1, 2, 3]], ro_df.iloc[[1, 2, 3]])
+ assert_frame_equal(rw_df.iloc[[1]], ro_df.iloc[[1]])
+ assert_series_equal(rw_df.iloc[1], ro_df.iloc[1])
+ assert_frame_equal(rw_df.iloc[1:3], ro_df.iloc[1:3])
+
+ assert_frame_equal(rw_df.loc[[1, 2, 3]], ro_df.loc[[1, 2, 3]])
+ assert_frame_equal(rw_df.loc[[1]], ro_df.loc[[1]])
+ assert_series_equal(rw_df.loc[1], ro_df.loc[1])
+ assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3])
+
+ def test_reindexing(self):
+
+ # reindexing
+ # convert to a regular index
+ result = self.df2.reindex(['a', 'b', 'e'])
+ expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan],
+ 'B': Series(list('aaabbe'))}).set_index('B')
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ result = self.df2.reindex(['a', 'b'])
+ expected = DataFrame({'A': [0, 1, 5, 2, 3],
+ 'B': Series(list('aaabb'))}).set_index('B')
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ result = self.df2.reindex(['e'])
+ expected = DataFrame({'A': [np.nan],
+ 'B': Series(['e'])}).set_index('B')
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ result = self.df2.reindex(['d'])
+ expected = DataFrame({'A': [np.nan],
+ 'B': Series(['d'])}).set_index('B')
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ # since we are actually reindexing with a Categorical
+ # then return a Categorical
+ cats = list('cabe')
+
+ result = self.df2.reindex(Categorical(['a', 'd'], categories=cats))
+ expected = DataFrame({'A': [0, 1, 5, np.nan],
+ 'B': Series(list('aaad')).astype(
+ CDT(cats))}).set_index('B')
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ result = self.df2.reindex(Categorical(['a'], categories=cats))
+ expected = DataFrame({'A': [0, 1, 5],
+ 'B': Series(list('aaa')).astype(
+ CDT(cats))}).set_index('B')
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ result = self.df2.reindex(['a', 'b', 'e'])
+ expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan],
+ 'B': Series(list('aaabbe'))}).set_index('B')
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ result = self.df2.reindex(['a', 'b'])
+ expected = DataFrame({'A': [0, 1, 5, 2, 3],
+ 'B': Series(list('aaabb'))}).set_index('B')
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ result = self.df2.reindex(['e'])
+ expected = DataFrame({'A': [np.nan],
+ 'B': Series(['e'])}).set_index('B')
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ # give back the type of categorical that we received
+ result = self.df2.reindex(Categorical(
+ ['a', 'd'], categories=cats, ordered=True))
+ expected = DataFrame(
+ {'A': [0, 1, 5, np.nan],
+ 'B': Series(list('aaad')).astype(
+ CDT(cats, ordered=True))}).set_index('B')
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ result = self.df2.reindex(Categorical(
+ ['a', 'd'], categories=['a', 'd']))
+ expected = DataFrame({'A': [0, 1, 5, np.nan],
+ 'B': Series(list('aaad')).astype(
+ CDT(['a', 'd']))}).set_index('B')
+ assert_frame_equal(result, expected, check_index_type=True)
+
+ # passed duplicate indexers are not allowed
+ pytest.raises(ValueError, lambda: self.df2.reindex(['a', 'a']))
+
+ # args NotImplemented ATM
+ pytest.raises(NotImplementedError,
+ lambda: self.df2.reindex(['a'], method='ffill'))
+ pytest.raises(NotImplementedError,
+ lambda: self.df2.reindex(['a'], level=1))
+ pytest.raises(NotImplementedError,
+ lambda: self.df2.reindex(['a'], limit=2))
+
+ def test_loc_slice(self):
+ # slicing
+ # not implemented ATM
+ # GH9748
+
+ pytest.raises(TypeError, lambda: self.df.loc[1:5])
+
+ # result = df.loc[1:5]
+ # expected = df.iloc[[1,2,3,4]]
+ # assert_frame_equal(result, expected)
+
+ def test_boolean_selection(self):
+
+ df3 = self.df3
+ df4 = self.df4
+
+ result = df3[df3.index == 'a']
+ expected = df3.iloc[[]]
+ assert_frame_equal(result, expected)
+
+ result = df4[df4.index == 'a']
+ expected = df4.iloc[[]]
+ assert_frame_equal(result, expected)
+
+ result = df3[df3.index == 1]
+ expected = df3.iloc[[0, 1, 3]]
+ assert_frame_equal(result, expected)
+
+ result = df4[df4.index == 1]
+ expected = df4.iloc[[0, 1, 3]]
+ assert_frame_equal(result, expected)
+
+ # since we have an ordered categorical
+
+ # CategoricalIndex([1, 1, 2, 1, 3, 2],
+ # categories=[3, 2, 1],
+ # ordered=True,
+ # name=u'B')
+ result = df3[df3.index < 2]
+ expected = df3.iloc[[4]]
+ assert_frame_equal(result, expected)
+
+ result = df3[df3.index > 1]
+ expected = df3.iloc[[]]
+ assert_frame_equal(result, expected)
+
+ # unordered
+ # cannot be compared
+
+ # CategoricalIndex([1, 1, 2, 1, 3, 2],
+ # categories=[3, 2, 1],
+ # ordered=False,
+ # name=u'B')
+ pytest.raises(TypeError, lambda: df4[df4.index < 2])
+ pytest.raises(TypeError, lambda: df4[df4.index > 1])
+
+ def test_indexing_with_category(self):
+
+ # https://github.com/pandas-dev/pandas/issues/12564
+ # consistent result if comparing as Dataframe
+
+ cat = DataFrame({'A': ['foo', 'bar', 'baz']})
+ exp = DataFrame({'A': [True, False, False]})
+
+ res = (cat[['A']] == 'foo')
+ tm.assert_frame_equal(res, exp)
+
+ cat['A'] = cat['A'].astype('category')
+
+ res = (cat[['A']] == 'foo')
+ tm.assert_frame_equal(res, exp)
+
+ def test_map_with_dict_or_series(self):
+ orig_values = ['a', 'B', 1, 'a']
+ new_values = ['one', 2, 3.0, 'one']
+ cur_index = pd.CategoricalIndex(orig_values, name='XXX')
+ expected = pd.CategoricalIndex(new_values,
+ name='XXX', categories=[3.0, 2, 'one'])
+
+ mapper = pd.Series(new_values[:-1], index=orig_values[:-1])
+ output = cur_index.map(mapper)
+ # Order of categories in output can be different
+ tm.assert_index_equal(expected, output)
+
+ mapper = {o: n for o, n in
+ zip(orig_values[:-1], new_values[:-1])}
+ output = cur_index.map(mapper)
+ # Order of categories in output can be different
+ tm.assert_index_equal(expected, output)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_chaining_and_caching.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_chaining_and_caching.py
new file mode 100644
index 00000000000..e38c1b16b3b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_chaining_and_caching.py
@@ -0,0 +1,402 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+ DataFrame, Series, Timestamp, compat, date_range, option_context)
+from pandas.core import common as com
+from pandas.util import testing as tm
+
+
+class TestCaching(object):
+
+ def test_slice_consolidate_invalidate_item_cache(self):
+
+ # this is chained assignment, but will 'work'
+ with option_context('chained_assignment', None):
+
+ # #3970
+ df = DataFrame({"aa": compat.lrange(5), "bb": [2.2] * 5})
+
+ # Creates a second float block
+ df["cc"] = 0.0
+
+ # caches a reference to the 'bb' series
+ df["bb"]
+
+ # repr machinery triggers consolidation
+ repr(df)
+
+ # Assignment to wrong series
+ df['bb'].iloc[0] = 0.17
+ df._clear_item_cache()
+ tm.assert_almost_equal(df['bb'][0], 0.17)
+
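+    def _single_step_loc_assignment_sketch(self):
+        # Illustrative sketch (not collected by pytest): the reliable way
+        # to set a value is a single .loc call on the frame itself; it
+        # neither depends on the item cache nor goes through a chained
+        # lookup.
+        df = DataFrame({"aa": compat.lrange(5), "bb": [2.2] * 5})
+        df.loc[0, 'bb'] = 0.17
+        tm.assert_almost_equal(df['bb'][0], 0.17)
+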
+ def test_setitem_cache_updating(self):
+ # GH 5424
+ cont = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']
+
+        for do_ref in [False, True]:
+ df = DataFrame({'a': cont,
+ "b": cont[3:] + cont[:3],
+ 'c': np.arange(7)})
+
+ # ref the cache
+ if do_ref:
+ df.loc[0, "c"]
+
+ # set it
+ df.loc[7, 'c'] = 1
+
+ assert df.loc[0, 'c'] == 0.0
+ assert df.loc[7, 'c'] == 1.0
+
+ # GH 7084
+ # not updating cache on series setting with slices
+ expected = DataFrame({'A': [600, 600, 600]},
+ index=date_range('5/7/2014', '5/9/2014'))
+ out = DataFrame({'A': [0, 0, 0]},
+ index=date_range('5/7/2014', '5/9/2014'))
+ df = DataFrame({'C': ['A', 'A', 'A'], 'D': [100, 200, 300]})
+
+ # loop through df to update out
+ six = Timestamp('5/7/2014')
+ eix = Timestamp('5/9/2014')
+ for ix, row in df.iterrows():
+ out.loc[six:eix, row['C']] = out.loc[six:eix, row['C']] + row['D']
+
+ tm.assert_frame_equal(out, expected)
+ tm.assert_series_equal(out['A'], expected['A'])
+
+ # try via a chain indexing
+ # this actually works
+ out = DataFrame({'A': [0, 0, 0]},
+ index=date_range('5/7/2014', '5/9/2014'))
+ for ix, row in df.iterrows():
+ v = out[row['C']][six:eix] + row['D']
+ out[row['C']][six:eix] = v
+
+ tm.assert_frame_equal(out, expected)
+ tm.assert_series_equal(out['A'], expected['A'])
+
+ out = DataFrame({'A': [0, 0, 0]},
+ index=date_range('5/7/2014', '5/9/2014'))
+ for ix, row in df.iterrows():
+ out.loc[six:eix, row['C']] += row['D']
+
+ tm.assert_frame_equal(out, expected)
+ tm.assert_series_equal(out['A'], expected['A'])
+
+
+class TestChaining(object):
+
+ def test_setitem_chained_setfault(self):
+
+ # GH6026
+ data = ['right', 'left', 'left', 'left', 'right', 'left', 'timeout']
+ mdata = ['right', 'left', 'left', 'left', 'right', 'left', 'none']
+
+ df = DataFrame({'response': np.array(data)})
+ mask = df.response == 'timeout'
+ df.response[mask] = 'none'
+ tm.assert_frame_equal(df, DataFrame({'response': mdata}))
+
+ recarray = np.rec.fromarrays([data], names=['response'])
+ df = DataFrame(recarray)
+ mask = df.response == 'timeout'
+ df.response[mask] = 'none'
+ tm.assert_frame_equal(df, DataFrame({'response': mdata}))
+
+ df = DataFrame({'response': data, 'response1': data})
+ mask = df.response == 'timeout'
+ df.response[mask] = 'none'
+ tm.assert_frame_equal(df, DataFrame({'response': mdata,
+ 'response1': data}))
+
+ # GH 6056
+ expected = DataFrame(dict(A=[np.nan, 'bar', 'bah', 'foo', 'bar']))
+ df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar'])))
+ df['A'].iloc[0] = np.nan
+ result = df.head()
+ tm.assert_frame_equal(result, expected)
+
+ df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar'])))
+ df.A.iloc[0] = np.nan
+ result = df.head()
+ tm.assert_frame_equal(result, expected)
+
+ def test_detect_chained_assignment(self):
+
+ pd.set_option('chained_assignment', 'raise')
+
+ # work with the chain
+ expected = DataFrame([[-5, 1], [-6, 3]], columns=list('AB'))
+ df = DataFrame(np.arange(4).reshape(2, 2),
+ columns=list('AB'), dtype='int64')
+ assert df._is_copy is None
+
+ df['A'][0] = -5
+ df['A'][1] = -6
+ tm.assert_frame_equal(df, expected)
+
+ # test with the chaining
+ df = DataFrame({'A': Series(range(2), dtype='int64'),
+ 'B': np.array(np.arange(2, 4), dtype=np.float64)})
+ assert df._is_copy is None
+
+ with pytest.raises(com.SettingWithCopyError):
+ df['A'][0] = -5
+
+ with pytest.raises(com.SettingWithCopyError):
+ df['A'][1] = np.nan
+
+ assert df['A']._is_copy is None
+
+ # Using a copy (the chain), fails
+ df = DataFrame({'A': Series(range(2), dtype='int64'),
+ 'B': np.array(np.arange(2, 4), dtype=np.float64)})
+
+ with pytest.raises(com.SettingWithCopyError):
+ df.loc[0]['A'] = -5
+
+ # Doc example
+ df = DataFrame({'a': ['one', 'one', 'two', 'three',
+ 'two', 'one', 'six'],
+ 'c': Series(range(7), dtype='int64')})
+ assert df._is_copy is None
+
+ with pytest.raises(com.SettingWithCopyError):
+ indexer = df.a.str.startswith('o')
+ df[indexer]['c'] = 42
+
+ expected = DataFrame({'A': [111, 'bbb', 'ccc'], 'B': [1, 2, 3]})
+ df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})
+
+ with pytest.raises(com.SettingWithCopyError):
+ df['A'][0] = 111
+
+ with pytest.raises(com.SettingWithCopyError):
+ df.loc[0]['A'] = 111
+
+ df.loc[0, 'A'] = 111
+ tm.assert_frame_equal(df, expected)
+
+        # gh-5475: Make sure that is_copy is picked up on reconstruction
+ df = DataFrame({"A": [1, 2]})
+ assert df._is_copy is None
+
+ with tm.ensure_clean('__tmp__pickle') as path:
+ df.to_pickle(path)
+ df2 = pd.read_pickle(path)
+ df2["B"] = df2["A"]
+ df2["B"] = df2["A"]
+
+ # gh-5597: a spurious raise as we are setting the entire column here
+ from string import ascii_letters as letters
+
+ def random_text(nobs=100):
+ df = []
+ for i in range(nobs):
+ idx = np.random.randint(len(letters), size=2)
+ idx.sort()
+
+ df.append([letters[idx[0]:idx[1]]])
+
+ return DataFrame(df, columns=['letters'])
+
+ df = random_text(100000)
+
+ # Always a copy
+ x = df.iloc[[0, 1, 2]]
+ assert x._is_copy is not None
+
+ x = df.iloc[[0, 1, 2, 4]]
+ assert x._is_copy is not None
+
+ # Explicitly copy
+ indexer = df.letters.apply(lambda x: len(x) > 10)
+ df = df.loc[indexer].copy()
+
+ assert df._is_copy is None
+ df['letters'] = df['letters'].apply(str.lower)
+
+ # Implicitly take
+ df = random_text(100000)
+ indexer = df.letters.apply(lambda x: len(x) > 10)
+ df = df.loc[indexer]
+
+ assert df._is_copy is not None
+ df['letters'] = df['letters'].apply(str.lower)
+
+ # Implicitly take 2
+ df = random_text(100000)
+ indexer = df.letters.apply(lambda x: len(x) > 10)
+
+ df = df.loc[indexer]
+ assert df._is_copy is not None
+ df.loc[:, 'letters'] = df['letters'].apply(str.lower)
+
+ # Should be ok even though it's a copy!
+ assert df._is_copy is None
+
+ df['letters'] = df['letters'].apply(str.lower)
+ assert df._is_copy is None
+
+ df = random_text(100000)
+ indexer = df.letters.apply(lambda x: len(x) > 10)
+ df.loc[indexer, 'letters'] = (
+ df.loc[indexer, 'letters'].apply(str.lower))
+
+ # an identical take, so no copy
+ df = DataFrame({'a': [1]}).dropna()
+ assert df._is_copy is None
+ df['a'] += 1
+
+ df = DataFrame(np.random.randn(10, 4))
+ s = df.iloc[:, 0].sort_values()
+
+ tm.assert_series_equal(s, df.iloc[:, 0].sort_values())
+ tm.assert_series_equal(s, df[0].sort_values())
+
+ # see gh-6025: false positives
+ df = DataFrame({'column1': ['a', 'a', 'a'], 'column2': [4, 8, 9]})
+ str(df)
+
+ df['column1'] = df['column1'] + 'b'
+ str(df)
+
+ df = df[df['column2'] != 8]
+ str(df)
+
+ df['column1'] = df['column1'] + 'c'
+ str(df)
+
+ # from SO:
+ # http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc
+ df = DataFrame(np.arange(0, 9), columns=['count'])
+ df['group'] = 'b'
+
+ with pytest.raises(com.SettingWithCopyError):
+ df.iloc[0:5]['group'] = 'a'
+
+ # Mixed type setting but same dtype & changing dtype
+ df = DataFrame(dict(A=date_range('20130101', periods=5),
+ B=np.random.randn(5),
+ C=np.arange(5, dtype='int64'),
+ D=list('abcde')))
+
+ with pytest.raises(com.SettingWithCopyError):
+ df.loc[2]['D'] = 'foo'
+
+ with pytest.raises(com.SettingWithCopyError):
+ df.loc[2]['C'] = 'foo'
+
+ with pytest.raises(com.SettingWithCopyError):
+ df['C'][2] = 'foo'
+
+ def test_setting_with_copy_bug(self):
+
+ # operating on a copy
+ df = DataFrame({'a': list(range(4)),
+ 'b': list('ab..'),
+ 'c': ['a', 'b', np.nan, 'd']})
+ mask = pd.isna(df.c)
+
+ def f():
+ df[['c']][mask] = df[['b']][mask]
+
+ pytest.raises(com.SettingWithCopyError, f)
+
+ # invalid warning as we are returning a new object
+ # GH 8730
+ df1 = DataFrame({'x': Series(['a', 'b', 'c']),
+ 'y': Series(['d', 'e', 'f'])})
+ df2 = df1[['x']]
+
+ # this should not raise
+ df2['y'] = ['g', 'h', 'i']
+
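+    def _masked_loc_assignment_sketch(self):
+        # Illustrative sketch (not collected by pytest): the supported way
+        # to write the masked update above is a single .loc call, which
+        # operates on the original frame instead of a temporary copy.
+        df = DataFrame({'a': list(range(4)),
+                        'b': list('ab..'),
+                        'c': ['a', 'b', np.nan, 'd']})
+        mask = pd.isna(df.c)
+        df.loc[mask, 'c'] = df.loc[mask, 'b']
+        assert df.loc[2, 'c'] == '.'
+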
+ def test_detect_chained_assignment_warnings(self):
+ with option_context("chained_assignment", "warn"):
+ df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})
+
+ with tm.assert_produces_warning(com.SettingWithCopyWarning):
+ df.loc[0]["A"] = 111
+
+ def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self):
+ # xref gh-13017.
+ with option_context("chained_assignment", "warn"):
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]],
+ columns=["a", "a", "c"])
+
+ with tm.assert_produces_warning(com.SettingWithCopyWarning):
+ df.c.loc[df.c > 0] = None
+
+ expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]],
+ columns=["a", "a", "c"])
+ tm.assert_frame_equal(df, expected)
+
+ def test_chained_getitem_with_lists(self):
+
+ # GH6394
+ # Regression in chained getitem indexing with embedded list-like from
+ # 0.12
+ def check(result, expected):
+ tm.assert_numpy_array_equal(result, expected)
+ assert isinstance(result, np.ndarray)
+
+ df = DataFrame({'A': 5 * [np.zeros(3)], 'B': 5 * [np.ones(3)]})
+ expected = df['A'].iloc[2]
+ result = df.loc[2, 'A']
+ check(result, expected)
+ result2 = df.iloc[2]['A']
+ check(result2, expected)
+ result3 = df['A'].loc[2]
+ check(result3, expected)
+ result4 = df['A'].iloc[2]
+ check(result4, expected)
+
+ @pytest.mark.filterwarnings("ignore::DeprecationWarning")
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_cache_updating(self):
+ # GH 4939, make sure to update the cache on setitem
+
+ df = tm.makeDataFrame()
+ df['A'] # cache series
+ df.ix["Hello Friend"] = df.ix[0]
+ assert "Hello Friend" in df['A'].index
+ assert "Hello Friend" in df['B'].index
+
+ panel = tm.makePanel()
+ panel.ix[0] # get first item into cache
+ panel.ix[:, :, 'A+1'] = panel.ix[:, :, 'A'] + 1
+ assert "A+1" in panel.ix[0].columns
+ assert "A+1" in panel.ix[1].columns
+
+ # 10264
+ df = DataFrame(np.zeros((5, 5), dtype='int64'), columns=[
+ 'a', 'b', 'c', 'd', 'e'], index=range(5))
+ df['f'] = 0
+ df.f.values[3] = 1
+
+ # TODO(wesm): unused?
+ # y = df.iloc[np.arange(2, len(df))]
+
+ df.f.values[3] = 2
+ expected = DataFrame(np.zeros((5, 6), dtype='int64'), columns=[
+ 'a', 'b', 'c', 'd', 'e', 'f'], index=range(5))
+ expected.at[3, 'f'] = 2
+ tm.assert_frame_equal(df, expected)
+ expected = Series([0, 0, 0, 2, 0], name='f')
+ tm.assert_series_equal(df.f, expected)
+
+ def test_deprecate_is_copy(self):
+ # GH18801
+ df = DataFrame({"A": [1, 2, 3]})
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # getter
+ df.is_copy
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # setter
+ df.is_copy = "test deprecated is_copy"
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_coercion.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_coercion.py
new file mode 100644
index 00000000000..280db3b2b30
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_coercion.py
@@ -0,0 +1,939 @@
+# -*- coding: utf-8 -*-
+
+import itertools
+
+import numpy as np
+import pytest
+
+import pandas.compat as compat
+
+import pandas as pd
+import pandas.util.testing as tm
+
+###############################################################
+# Index / Series common tests which may trigger dtype coercions
+###############################################################
+
+
[email protected](autouse=True, scope='class')
+def check_comprehensiveness(request):
+ # Iterate over combination of dtype, method and klass
+ # and ensure that each are contained within a collected test
+ cls = request.cls
+ combos = itertools.product(cls.klasses, cls.dtypes, [cls.method])
+
+ def has_test(combo):
+ klass, dtype, method = combo
+ cls_funcs = request.node.session.items
+ return any(klass in x.name and dtype in x.name and
+ method in x.name for x in cls_funcs)
+
+ for combo in combos:
+ if not has_test(combo):
+ msg = 'test method is not defined: {0}, {1}'
+ raise AssertionError(msg.format(cls.__name__, combo))
+
+ yield
+
+
+class CoercionBase(object):
+
+ klasses = ['index', 'series']
+ dtypes = ['object', 'int64', 'float64', 'complex128', 'bool',
+ 'datetime64', 'datetime64tz', 'timedelta64', 'period']
+
+ @property
+ def method(self):
+ raise NotImplementedError(self)
+
+ def _assert(self, left, right, dtype):
+ # explicitly check dtype to avoid any unexpected result
+ if isinstance(left, pd.Series):
+ tm.assert_series_equal(left, right)
+ elif isinstance(left, pd.Index):
+ tm.assert_index_equal(left, right)
+ else:
+ raise NotImplementedError
+ assert left.dtype == dtype
+ assert right.dtype == dtype
+
+
+class TestSetitemCoercion(CoercionBase):
+
+ method = 'setitem'
+
+ def _assert_setitem_series_conversion(self, original_series, loc_value,
+ expected_series, expected_dtype):
+ """ test series value's coercion triggered by assignment """
+ temp = original_series.copy()
+ temp[1] = loc_value
+ tm.assert_series_equal(temp, expected_series)
+ # check dtype explicitly for sure
+ assert temp.dtype == expected_dtype
+
+ # .loc works different rule, temporary disable
+ # temp = original_series.copy()
+ # temp.loc[1] = loc_value
+ # tm.assert_series_equal(temp, expected_series)
+
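+    def _setitem_upcast_sketch(self):
+        # Illustrative sketch (not collected by pytest): when the assigned
+        # value cannot be held by the current dtype the Series is upcast,
+        # e.g. int64 + complex -> complex128.  (The int64 + float case is
+        # xfailed below: GH 12747, the value is currently truncated.)
+        s = pd.Series([1, 2, 3, 4])
+        s[1] = 1 + 1j
+        assert s.dtype == np.complex128
+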
+ @pytest.mark.parametrize("val,exp_dtype", [
+ (1, np.object),
+ (1.1, np.object),
+ (1 + 1j, np.object),
+ (True, np.object)])
+ def test_setitem_series_object(self, val, exp_dtype):
+ obj = pd.Series(list('abcd'))
+ assert obj.dtype == np.object
+
+ exp = pd.Series(['a', val, 'c', 'd'])
+ self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)
+
+ @pytest.mark.parametrize("val,exp_dtype", [
+ (1, np.int64),
+ (1.1, np.float64),
+ (1 + 1j, np.complex128),
+ (True, np.object)])
+ def test_setitem_series_int64(self, val, exp_dtype):
+ obj = pd.Series([1, 2, 3, 4])
+ assert obj.dtype == np.int64
+
+ if exp_dtype is np.float64:
+ exp = pd.Series([1, 1, 3, 4])
+ self._assert_setitem_series_conversion(obj, 1.1, exp, np.int64)
+ pytest.xfail("GH12747 The result must be float")
+
+ exp = pd.Series([1, val, 3, 4])
+ self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)
+
+ @pytest.mark.parametrize("val,exp_dtype", [
+ (np.int32(1), np.int8),
+ (np.int16(2**9), np.int16)])
+ def test_setitem_series_int8(self, val, exp_dtype):
+ obj = pd.Series([1, 2, 3, 4], dtype=np.int8)
+ assert obj.dtype == np.int8
+
+ if exp_dtype is np.int16:
+ exp = pd.Series([1, 0, 3, 4], dtype=np.int8)
+ self._assert_setitem_series_conversion(obj, val, exp, np.int8)
+ pytest.xfail("BUG: it must be Series([1, 1, 3, 4], dtype=np.int16")
+
+ exp = pd.Series([1, val, 3, 4], dtype=np.int8)
+ self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)
+
+ @pytest.mark.parametrize("val,exp_dtype", [
+ (1, np.float64),
+ (1.1, np.float64),
+ (1 + 1j, np.complex128),
+ (True, np.object)])
+ def test_setitem_series_float64(self, val, exp_dtype):
+ obj = pd.Series([1.1, 2.2, 3.3, 4.4])
+ assert obj.dtype == np.float64
+
+ exp = pd.Series([1.1, val, 3.3, 4.4])
+ self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)
+
+ @pytest.mark.parametrize("val,exp_dtype", [
+ (1, np.complex128),
+ (1.1, np.complex128),
+ (1 + 1j, np.complex128),
+ (True, np.object)])
+ def test_setitem_series_complex128(self, val, exp_dtype):
+ obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j])
+ assert obj.dtype == np.complex128
+
+ exp = pd.Series([1 + 1j, val, 3 + 3j, 4 + 4j])
+ self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)
+
+ @pytest.mark.parametrize("val,exp_dtype", [
+ (1, np.int64),
+ (3, np.int64),
+ (1.1, np.float64),
+ (1 + 1j, np.complex128),
+ (True, np.bool)])
+ def test_setitem_series_bool(self, val, exp_dtype):
+ obj = pd.Series([True, False, True, False])
+ assert obj.dtype == np.bool
+
+ if exp_dtype is np.int64:
+ exp = pd.Series([True, True, True, False])
+ self._assert_setitem_series_conversion(obj, val, exp, np.bool)
+ pytest.xfail("TODO_GH12747 The result must be int")
+ elif exp_dtype is np.float64:
+ exp = pd.Series([True, True, True, False])
+ self._assert_setitem_series_conversion(obj, val, exp, np.bool)
+ pytest.xfail("TODO_GH12747 The result must be float")
+ elif exp_dtype is np.complex128:
+ exp = pd.Series([True, True, True, False])
+ self._assert_setitem_series_conversion(obj, val, exp, np.bool)
+ pytest.xfail("TODO_GH12747 The result must be complex")
+
+ exp = pd.Series([True, val, True, False])
+ self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)
+
+ @pytest.mark.parametrize("val,exp_dtype", [
+ (pd.Timestamp('2012-01-01'), 'datetime64[ns]'),
+ (1, np.object),
+ ('x', np.object)])
+ def test_setitem_series_datetime64(self, val, exp_dtype):
+ obj = pd.Series([pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-03'),
+ pd.Timestamp('2011-01-04')])
+ assert obj.dtype == 'datetime64[ns]'
+
+ exp = pd.Series([pd.Timestamp('2011-01-01'),
+ val,
+ pd.Timestamp('2011-01-03'),
+ pd.Timestamp('2011-01-04')])
+ self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)
+
+ @pytest.mark.parametrize("val,exp_dtype", [
+ (pd.Timestamp('2012-01-01', tz='US/Eastern'),
+ 'datetime64[ns, US/Eastern]'),
+ (pd.Timestamp('2012-01-01', tz='US/Pacific'), np.object),
+ (pd.Timestamp('2012-01-01'), np.object),
+ (1, np.object)])
+ def test_setitem_series_datetime64tz(self, val, exp_dtype):
+ tz = 'US/Eastern'
+ obj = pd.Series([pd.Timestamp('2011-01-01', tz=tz),
+ pd.Timestamp('2011-01-02', tz=tz),
+ pd.Timestamp('2011-01-03', tz=tz),
+ pd.Timestamp('2011-01-04', tz=tz)])
+ assert obj.dtype == 'datetime64[ns, US/Eastern]'
+
+ exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz),
+ val,
+ pd.Timestamp('2011-01-03', tz=tz),
+ pd.Timestamp('2011-01-04', tz=tz)])
+ self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)
+
+ @pytest.mark.parametrize("val,exp_dtype", [
+ (pd.Timedelta('12 day'), 'timedelta64[ns]'),
+ (1, np.object),
+ ('x', np.object)])
+ def test_setitem_series_timedelta64(self, val, exp_dtype):
+ obj = pd.Series([pd.Timedelta('1 day'),
+ pd.Timedelta('2 day'),
+ pd.Timedelta('3 day'),
+ pd.Timedelta('4 day')])
+ assert obj.dtype == 'timedelta64[ns]'
+
+ exp = pd.Series([pd.Timedelta('1 day'),
+ val,
+ pd.Timedelta('3 day'),
+ pd.Timedelta('4 day')])
+ self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)
+
+ def _assert_setitem_index_conversion(self, original_series, loc_key,
+ expected_index, expected_dtype):
+ """ test index's coercion triggered by assign key """
+ temp = original_series.copy()
+ temp[loc_key] = 5
+ exp = pd.Series([1, 2, 3, 4, 5], index=expected_index)
+ tm.assert_series_equal(temp, exp)
+ # check dtype explicitly for sure
+ assert temp.index.dtype == expected_dtype
+
+ temp = original_series.copy()
+ temp.loc[loc_key] = 5
+ exp = pd.Series([1, 2, 3, 4, 5], index=expected_index)
+ tm.assert_series_equal(temp, exp)
+ # check dtype explicitly for sure
+ assert temp.index.dtype == expected_dtype
+
+ @pytest.mark.parametrize("val,exp_dtype", [
+ ('x', np.object),
+ (5, IndexError),
+ (1.1, np.object)])
+ def test_setitem_index_object(self, val, exp_dtype):
+ obj = pd.Series([1, 2, 3, 4], index=list('abcd'))
+ assert obj.index.dtype == np.object
+
+ if exp_dtype is IndexError:
+ temp = obj.copy()
+ with pytest.raises(exp_dtype):
+ temp[5] = 5
+ else:
+ exp_index = pd.Index(list('abcd') + [val])
+ self._assert_setitem_index_conversion(obj, val, exp_index,
+ exp_dtype)
+
+ @pytest.mark.parametrize("val,exp_dtype", [
+ (5, np.int64),
+ (1.1, np.float64),
+ ('x', np.object)])
+ def test_setitem_index_int64(self, val, exp_dtype):
+ obj = pd.Series([1, 2, 3, 4])
+ assert obj.index.dtype == np.int64
+
+ exp_index = pd.Index([0, 1, 2, 3, val])
+ self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype)
+
+ @pytest.mark.parametrize("val,exp_dtype", [
+ (5, IndexError),
+ (5.1, np.float64),
+ ('x', np.object)])
+ def test_setitem_index_float64(self, val, exp_dtype):
+ obj = pd.Series([1, 2, 3, 4], index=[1.1, 2.1, 3.1, 4.1])
+ assert obj.index.dtype == np.float64
+
+ if exp_dtype is IndexError:
+            # int key on a float64 index currently raises; should coerce to float
+ temp = obj.copy()
+ with pytest.raises(exp_dtype):
+ temp[5] = 5
+ pytest.xfail("TODO_GH12747 The result must be float")
+
+ exp_index = pd.Index([1.1, 2.1, 3.1, 4.1, val])
+ self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype)
+
+ def test_setitem_series_period(self):
+ pass
+
+ def test_setitem_index_complex128(self):
+ pass
+
+ def test_setitem_index_bool(self):
+ pass
+
+ def test_setitem_index_datetime64(self):
+ pass
+
+ def test_setitem_index_datetime64tz(self):
+ pass
+
+ def test_setitem_index_timedelta64(self):
+ pass
+
+ def test_setitem_index_period(self):
+ pass
+
+
+class TestInsertIndexCoercion(CoercionBase):
+
+ klasses = ['index']
+ method = 'insert'
+
+ def _assert_insert_conversion(self, original, value,
+ expected, expected_dtype):
+ """ test coercion triggered by insert """
+ target = original.copy()
+ res = target.insert(1, value)
+ tm.assert_index_equal(res, expected)
+ assert res.dtype == expected_dtype
+
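+    def _insert_returns_new_index_sketch(self):
+        # Illustrative sketch (not collected by pytest): Index objects are
+        # immutable, so .insert() always returns a new Index and may
+        # coerce the dtype; the original index is left untouched.
+        idx = pd.Int64Index([1, 2, 3])
+        res = idx.insert(1, 1.5)          # int64 + float -> float64
+        assert res.dtype == np.float64
+        assert idx.dtype == np.int64      # original unchanged
+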
+ @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [
+ (1, 1, np.object),
+ (1.1, 1.1, np.object),
+ (False, False, np.object),
+ ('x', 'x', np.object)])
+ def test_insert_index_object(self, insert, coerced_val, coerced_dtype):
+ obj = pd.Index(list('abcd'))
+ assert obj.dtype == np.object
+
+ exp = pd.Index(['a', coerced_val, 'b', 'c', 'd'])
+ self._assert_insert_conversion(obj, insert, exp, coerced_dtype)
+
+ @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [
+ (1, 1, np.int64),
+ (1.1, 1.1, np.float64),
+ (False, 0, np.int64),
+ ('x', 'x', np.object)])
+ def test_insert_index_int64(self, insert, coerced_val, coerced_dtype):
+ obj = pd.Int64Index([1, 2, 3, 4])
+ assert obj.dtype == np.int64
+
+ exp = pd.Index([1, coerced_val, 2, 3, 4])
+ self._assert_insert_conversion(obj, insert, exp, coerced_dtype)
+
+ @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [
+ (1, 1., np.float64),
+ (1.1, 1.1, np.float64),
+ (False, 0., np.float64),
+ ('x', 'x', np.object)])
+ def test_insert_index_float64(self, insert, coerced_val, coerced_dtype):
+ obj = pd.Float64Index([1., 2., 3., 4.])
+ assert obj.dtype == np.float64
+
+ exp = pd.Index([1., coerced_val, 2., 3., 4.])
+ self._assert_insert_conversion(obj, insert, exp, coerced_dtype)
+
+ @pytest.mark.parametrize('fill_val,exp_dtype', [
+ (pd.Timestamp('2012-01-01'), 'datetime64[ns]'),
+ (pd.Timestamp('2012-01-01', tz='US/Eastern'),
+ 'datetime64[ns, US/Eastern]')],
+ ids=['datetime64', 'datetime64tz'])
+ def test_insert_index_datetimes(self, fill_val, exp_dtype):
+ obj = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03',
+ '2011-01-04'], tz=fill_val.tz)
+ assert obj.dtype == exp_dtype
+
+ exp = pd.DatetimeIndex(['2011-01-01', fill_val.date(), '2011-01-02',
+ '2011-01-03', '2011-01-04'], tz=fill_val.tz)
+ self._assert_insert_conversion(obj, fill_val, exp, exp_dtype)
+
+ msg = "Passed item and index have different timezone"
+ if fill_val.tz:
+ with pytest.raises(ValueError, match=msg):
+ obj.insert(1, pd.Timestamp('2012-01-01'))
+
+ with pytest.raises(ValueError, match=msg):
+ obj.insert(1, pd.Timestamp('2012-01-01', tz='Asia/Tokyo'))
+
+ msg = "cannot insert DatetimeIndex with incompatible label"
+ with pytest.raises(TypeError, match=msg):
+ obj.insert(1, 1)
+
+ pytest.xfail("ToDo: must coerce to object")
+
+ def test_insert_index_timedelta64(self):
+ obj = pd.TimedeltaIndex(['1 day', '2 day', '3 day', '4 day'])
+ assert obj.dtype == 'timedelta64[ns]'
+
+ # timedelta64 + timedelta64 => timedelta64
+ exp = pd.TimedeltaIndex(['1 day', '10 day', '2 day', '3 day', '4 day'])
+ self._assert_insert_conversion(obj, pd.Timedelta('10 day'),
+ exp, 'timedelta64[ns]')
+
+ # ToDo: must coerce to object
+ msg = "cannot insert TimedeltaIndex with incompatible label"
+ with pytest.raises(TypeError, match=msg):
+ obj.insert(1, pd.Timestamp('2012-01-01'))
+
+ # ToDo: must coerce to object
+ msg = "cannot insert TimedeltaIndex with incompatible label"
+ with pytest.raises(TypeError, match=msg):
+ obj.insert(1, 1)
+
+ @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [
+ (pd.Period('2012-01', freq='M'), '2012-01', 'period[M]'),
+ (pd.Timestamp('2012-01-01'), pd.Timestamp('2012-01-01'), np.object),
+ (1, 1, np.object),
+ ('x', 'x', np.object)])
+ def test_insert_index_period(self, insert, coerced_val, coerced_dtype):
+ obj = pd.PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'],
+ freq='M')
+ assert obj.dtype == 'period[M]'
+
+ if isinstance(insert, pd.Period):
+ index_type = pd.PeriodIndex
+ else:
+ index_type = pd.Index
+
+ exp = index_type([pd.Period('2011-01', freq='M'),
+ coerced_val,
+ pd.Period('2011-02', freq='M'),
+ pd.Period('2011-03', freq='M'),
+ pd.Period('2011-04', freq='M')], freq='M')
+ self._assert_insert_conversion(obj, insert, exp, coerced_dtype)
+
+ def test_insert_index_complex128(self):
+ pass
+
+ def test_insert_index_bool(self):
+ pass
+
+
+class TestWhereCoercion(CoercionBase):
+
+ method = 'where'
+
+ def _assert_where_conversion(self, original, cond, values,
+ expected, expected_dtype):
+ """ test coercion triggered by where """
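+        # e.g. (illustrative, not executed): entries are kept where `cond`
+        # is True and taken from `values` elsewhere:
+        #   pd.Series([1, 2]).where([True, False], 9) -> pd.Series([1, 9])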
+ target = original.copy()
+ res = target.where(cond, values)
+ self._assert(res, expected, expected_dtype)
+
+ @pytest.mark.parametrize("klass", [pd.Series, pd.Index],
+ ids=['series', 'index'])
+ @pytest.mark.parametrize("fill_val,exp_dtype", [
+ (1, np.object),
+ (1.1, np.object),
+ (1 + 1j, np.object),
+ (True, np.object)])
+ def test_where_object(self, klass, fill_val, exp_dtype):
+ obj = klass(list('abcd'))
+ assert obj.dtype == np.object
+ cond = klass([True, False, True, False])
+
+ if fill_val is True and klass is pd.Series:
+ ret_val = 1
+ else:
+ ret_val = fill_val
+
+ exp = klass(['a', ret_val, 'c', ret_val])
+ self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype)
+
+ if fill_val is True:
+ values = klass([True, False, True, True])
+ else:
+ values = klass(fill_val * x for x in [5, 6, 7, 8])
+
+ exp = klass(['a', values[1], 'c', values[3]])
+ self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
+
+ @pytest.mark.parametrize("klass", [pd.Series, pd.Index],
+ ids=['series', 'index'])
+ @pytest.mark.parametrize("fill_val,exp_dtype", [
+ (1, np.int64),
+ (1.1, np.float64),
+ (1 + 1j, np.complex128),
+ (True, np.object)])
+ def test_where_int64(self, klass, fill_val, exp_dtype):
+ if klass is pd.Index and exp_dtype is np.complex128:
+ pytest.skip("Complex Index not supported")
+ obj = klass([1, 2, 3, 4])
+ assert obj.dtype == np.int64
+ cond = klass([True, False, True, False])
+
+ exp = klass([1, fill_val, 3, fill_val])
+ self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype)
+
+ if fill_val is True:
+ values = klass([True, False, True, True])
+ else:
+ values = klass(x * fill_val for x in [5, 6, 7, 8])
+ exp = klass([1, values[1], 3, values[3]])
+ self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
+
+ @pytest.mark.parametrize("klass", [pd.Series, pd.Index],
+ ids=['series', 'index'])
+ @pytest.mark.parametrize("fill_val, exp_dtype", [
+ (1, np.float64),
+ (1.1, np.float64),
+ (1 + 1j, np.complex128),
+ (True, np.object)])
+ def test_where_float64(self, klass, fill_val, exp_dtype):
+ if klass is pd.Index and exp_dtype is np.complex128:
+ pytest.skip("Complex Index not supported")
+ obj = klass([1.1, 2.2, 3.3, 4.4])
+ assert obj.dtype == np.float64
+ cond = klass([True, False, True, False])
+
+ exp = klass([1.1, fill_val, 3.3, fill_val])
+ self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype)
+
+ if fill_val is True:
+ values = klass([True, False, True, True])
+ else:
+ values = klass(x * fill_val for x in [5, 6, 7, 8])
+ exp = klass([1.1, values[1], 3.3, values[3]])
+ self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
+
+ @pytest.mark.parametrize("fill_val,exp_dtype", [
+ (1, np.complex128),
+ (1.1, np.complex128),
+ (1 + 1j, np.complex128),
+ (True, np.object)])
+ def test_where_series_complex128(self, fill_val, exp_dtype):
+ obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j])
+ assert obj.dtype == np.complex128
+ cond = pd.Series([True, False, True, False])
+
+ exp = pd.Series([1 + 1j, fill_val, 3 + 3j, fill_val])
+ self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype)
+
+ if fill_val is True:
+ values = pd.Series([True, False, True, True])
+ else:
+ values = pd.Series(x * fill_val for x in [5, 6, 7, 8])
+ exp = pd.Series([1 + 1j, values[1], 3 + 3j, values[3]])
+ self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
+
+ @pytest.mark.parametrize("fill_val,exp_dtype", [
+ (1, np.object),
+ (1.1, np.object),
+ (1 + 1j, np.object),
+ (True, np.bool)])
+ def test_where_series_bool(self, fill_val, exp_dtype):
+
+ obj = pd.Series([True, False, True, False])
+ assert obj.dtype == np.bool
+ cond = pd.Series([True, False, True, False])
+
+ exp = pd.Series([True, fill_val, True, fill_val])
+ self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype)
+
+ if fill_val is True:
+ values = pd.Series([True, False, True, True])
+ else:
+ values = pd.Series(x * fill_val for x in [5, 6, 7, 8])
+ exp = pd.Series([True, values[1], True, values[3]])
+ self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
+
+ @pytest.mark.parametrize("fill_val,exp_dtype", [
+ (pd.Timestamp('2012-01-01'), 'datetime64[ns]'),
+ (pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)],
+ ids=['datetime64', 'datetime64tz'])
+ def test_where_series_datetime64(self, fill_val, exp_dtype):
+ obj = pd.Series([pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-03'),
+ pd.Timestamp('2011-01-04')])
+ assert obj.dtype == 'datetime64[ns]'
+ cond = pd.Series([True, False, True, False])
+
+ exp = pd.Series([pd.Timestamp('2011-01-01'), fill_val,
+ pd.Timestamp('2011-01-03'), fill_val])
+ self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype)
+
+ values = pd.Series(pd.date_range(fill_val, periods=4))
+ if fill_val.tz:
+ exp = pd.Series([pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2012-01-02 00:00', tz='US/Eastern'),
+ pd.Timestamp('2011-01-03'),
+ pd.Timestamp('2012-01-04 00:00',
+ tz='US/Eastern')])
+ self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
+
+ exp = pd.Series([pd.Timestamp('2011-01-01'), values[1],
+ pd.Timestamp('2011-01-03'), values[3]])
+ self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
+
+ def test_where_index_datetime(self):
+ fill_val = pd.Timestamp('2012-01-01')
+ exp_dtype = 'datetime64[ns]'
+ obj = pd.Index([pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-03'),
+ pd.Timestamp('2011-01-04')])
+ assert obj.dtype == 'datetime64[ns]'
+ cond = pd.Index([True, False, True, False])
+
+ msg = ("Index\\(\\.\\.\\.\\) must be called with a collection "
+ "of some kind")
+ with pytest.raises(TypeError, match=msg):
+ obj.where(cond, fill_val)
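+        # in this pandas version Index.where ends up constructing an Index
+        # from `other`, hence the TypeError above for a scalar fill value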
+
+ values = pd.Index(pd.date_range(fill_val, periods=4))
+ exp = pd.Index([pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2012-01-02'),
+ pd.Timestamp('2011-01-03'),
+ pd.Timestamp('2012-01-04')])
+
+ self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
+
+ @pytest.mark.xfail(
+ reason="GH 22839: do not ignore timezone, must be object")
+ def test_where_index_datetimetz(self):
+ fill_val = pd.Timestamp('2012-01-01', tz='US/Eastern')
+ exp_dtype = np.object
+ obj = pd.Index([pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-03'),
+ pd.Timestamp('2011-01-04')])
+ assert obj.dtype == 'datetime64[ns]'
+ cond = pd.Index([True, False, True, False])
+
+ msg = ("Index\\(\\.\\.\\.\\) must be called with a collection "
+ "of some kind")
+ with pytest.raises(TypeError, match=msg):
+ obj.where(cond, fill_val)
+
+ values = pd.Index(pd.date_range(fill_val, periods=4))
+ exp = pd.Index([pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2012-01-02', tz='US/Eastern'),
+ pd.Timestamp('2011-01-03'),
+ pd.Timestamp('2012-01-04', tz='US/Eastern')],
+ dtype=exp_dtype)
+
+ self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
+
+ def test_where_index_complex128(self):
+ pass
+
+ def test_where_index_bool(self):
+ pass
+
+ def test_where_series_datetime64tz(self):
+ pass
+
+ def test_where_series_timedelta64(self):
+ pass
+
+ def test_where_series_period(self):
+ pass
+
+ def test_where_index_datetime64tz(self):
+ pass
+
+ def test_where_index_timedelta64(self):
+ pass
+
+ def test_where_index_period(self):
+ pass
+
+
+class TestFillnaSeriesCoercion(CoercionBase):
+
+    # not indexing, but placed here for consistency
+
+ method = 'fillna'
+
+ def test_has_comprehensive_tests(self):
+ pass
+
+ def _assert_fillna_conversion(self, original, value,
+ expected, expected_dtype):
+ """ test coercion triggered by fillna """
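+        # e.g. (illustrative, not executed): filling a float64 Series with a
+        # complex value is expected to upcast:
+        #   pd.Series([1.1, np.nan]).fillna(1 + 1j) -> dtype np.complex128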
+ target = original.copy()
+ res = target.fillna(value)
+ self._assert(res, expected, expected_dtype)
+
+ @pytest.mark.parametrize("klass", [pd.Series, pd.Index],
+ ids=['series', 'index'])
+ @pytest.mark.parametrize("fill_val, fill_dtype", [
+ (1, np.object),
+ (1.1, np.object),
+ (1 + 1j, np.object),
+ (True, np.object)])
+ def test_fillna_object(self, klass, fill_val, fill_dtype):
+ obj = klass(['a', np.nan, 'c', 'd'])
+ assert obj.dtype == np.object
+
+ exp = klass(['a', fill_val, 'c', 'd'])
+ self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)
+
+ @pytest.mark.parametrize("klass", [pd.Series, pd.Index],
+ ids=['series', 'index'])
+ @pytest.mark.parametrize("fill_val,fill_dtype", [
+ (1, np.float64),
+ (1.1, np.float64),
+ (1 + 1j, np.complex128),
+ (True, np.object)])
+ def test_fillna_float64(self, klass, fill_val, fill_dtype):
+ obj = klass([1.1, np.nan, 3.3, 4.4])
+ assert obj.dtype == np.float64
+
+ exp = klass([1.1, fill_val, 3.3, 4.4])
+        # float + complex -> complex for Series,
+        #                    object for Index (complex Index not supported)
+ if fill_dtype == np.complex128 and klass == pd.Index:
+ fill_dtype = np.object
+ self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)
+
+ @pytest.mark.parametrize("fill_val,fill_dtype", [
+ (1, np.complex128),
+ (1.1, np.complex128),
+ (1 + 1j, np.complex128),
+ (True, np.object)])
+ def test_fillna_series_complex128(self, fill_val, fill_dtype):
+ obj = pd.Series([1 + 1j, np.nan, 3 + 3j, 4 + 4j])
+ assert obj.dtype == np.complex128
+
+ exp = pd.Series([1 + 1j, fill_val, 3 + 3j, 4 + 4j])
+ self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)
+
+ @pytest.mark.parametrize("klass", [pd.Series, pd.Index],
+ ids=['series', 'index'])
+ @pytest.mark.parametrize("fill_val,fill_dtype", [
+ (pd.Timestamp('2012-01-01'), 'datetime64[ns]'),
+ (pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object),
+ (1, np.object), ('x', np.object)],
+ ids=['datetime64', 'datetime64tz', 'object', 'object'])
+ def test_fillna_datetime(self, klass, fill_val, fill_dtype):
+ obj = klass([pd.Timestamp('2011-01-01'),
+ pd.NaT,
+ pd.Timestamp('2011-01-03'),
+ pd.Timestamp('2011-01-04')])
+ assert obj.dtype == 'datetime64[ns]'
+
+ exp = klass([pd.Timestamp('2011-01-01'),
+ fill_val,
+ pd.Timestamp('2011-01-03'),
+ pd.Timestamp('2011-01-04')])
+ self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)
+
+ @pytest.mark.parametrize("klass", [pd.Series, pd.Index])
+ @pytest.mark.parametrize("fill_val,fill_dtype", [
+ (pd.Timestamp('2012-01-01', tz='US/Eastern'),
+ 'datetime64[ns, US/Eastern]'),
+ (pd.Timestamp('2012-01-01'), np.object),
+ (pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), np.object),
+ (1, np.object),
+ ('x', np.object)])
+ def test_fillna_datetime64tz(self, klass, fill_val, fill_dtype):
+ tz = 'US/Eastern'
+
+ obj = klass([pd.Timestamp('2011-01-01', tz=tz),
+ pd.NaT,
+ pd.Timestamp('2011-01-03', tz=tz),
+ pd.Timestamp('2011-01-04', tz=tz)])
+ assert obj.dtype == 'datetime64[ns, US/Eastern]'
+
+ exp = klass([pd.Timestamp('2011-01-01', tz=tz),
+ fill_val,
+ pd.Timestamp('2011-01-03', tz=tz),
+ pd.Timestamp('2011-01-04', tz=tz)])
+ self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)
+
+ def test_fillna_series_int64(self):
+ pass
+
+ def test_fillna_index_int64(self):
+ pass
+
+ def test_fillna_series_bool(self):
+ pass
+
+ def test_fillna_index_bool(self):
+ pass
+
+ def test_fillna_series_timedelta64(self):
+ pass
+
+ def test_fillna_series_period(self):
+ pass
+
+ def test_fillna_index_timedelta64(self):
+ pass
+
+ def test_fillna_index_period(self):
+ pass
+
+
+class TestReplaceSeriesCoercion(CoercionBase):
+
+ klasses = ['series']
+ method = 'replace'
+
+ rep = {}
+ rep['object'] = ['a', 'b']
+ rep['int64'] = [4, 5]
+ rep['float64'] = [1.1, 2.2]
+ rep['complex128'] = [1 + 1j, 2 + 2j]
+ rep['bool'] = [True, False]
+ rep['datetime64[ns]'] = [pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-01-03')]
+
+ for tz in ['UTC', 'US/Eastern']:
+ # to test tz => different tz replacement
+ key = 'datetime64[ns, {0}]'.format(tz)
+ rep[key] = [pd.Timestamp('2011-01-01', tz=tz),
+ pd.Timestamp('2011-01-03', tz=tz)]
+
+ rep['timedelta64[ns]'] = [pd.Timedelta('1 day'),
+ pd.Timedelta('2 day')]
+
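+    # In the tests below, `replacer` maps old -> new values either as a dict
+    # or as a Series indexed by the old values, e.g. (illustrative):
+    #   {4: 1.1, 5: 2.2}  or  pd.Series([1.1, 2.2], index=[4, 5])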
+ @pytest.mark.parametrize('how', ['dict', 'series'])
+ @pytest.mark.parametrize('to_key', [
+ 'object', 'int64', 'float64', 'complex128', 'bool', 'datetime64[ns]',
+ 'datetime64[ns, UTC]', 'datetime64[ns, US/Eastern]', 'timedelta64[ns]'
+ ], ids=['object', 'int64', 'float64', 'complex128', 'bool',
+ 'datetime64', 'datetime64tz', 'datetime64tz', 'timedelta64'])
+ @pytest.mark.parametrize('from_key', [
+ 'object', 'int64', 'float64', 'complex128', 'bool', 'datetime64[ns]',
+ 'datetime64[ns, UTC]', 'datetime64[ns, US/Eastern]', 'timedelta64[ns]']
+ )
+ def test_replace_series(self, how, to_key, from_key):
+ if from_key == 'bool' and how == 'series' and compat.PY3:
+ # doesn't work in PY3, though ...dict_from_bool works fine
+            pytest.skip("doesn't work in PY3")
+
+ index = pd.Index([3, 4], name='xxx')
+ obj = pd.Series(self.rep[from_key], index=index, name='yyy')
+ assert obj.dtype == from_key
+
+ if (from_key.startswith('datetime') and to_key.startswith('datetime')):
+ # tested below
+ return
+ elif from_key in ['datetime64[ns, US/Eastern]', 'datetime64[ns, UTC]']:
+ # tested below
+ return
+
+ if how == 'dict':
+ replacer = dict(zip(self.rep[from_key], self.rep[to_key]))
+ elif how == 'series':
+ replacer = pd.Series(self.rep[to_key], index=self.rep[from_key])
+ else:
+ raise ValueError
+
+ result = obj.replace(replacer)
+
+        if ((from_key == 'float64' and to_key == 'int64') or
+            (from_key == 'complex128' and
+             to_key in ('int64', 'float64'))):
+
+ if compat.is_platform_32bit() or compat.is_platform_windows():
+                pytest.skip("32-bit platform buggy: {0} -> {1}".format(
+                    from_key, to_key))
+
+ # Expected: do not downcast by replacement
+ exp = pd.Series(self.rep[to_key], index=index,
+ name='yyy', dtype=from_key)
+
+ else:
+ exp = pd.Series(self.rep[to_key], index=index, name='yyy')
+ assert exp.dtype == to_key
+
+ tm.assert_series_equal(result, exp)
+
+ # TODO(jbrockmendel) commented out to only have a single xfail printed
+ @pytest.mark.xfail(reason='GH #18376, tzawareness-compat bug '
+ 'in BlockManager.replace_list')
+ # @pytest.mark.parametrize('how', ['dict', 'series'])
+ # @pytest.mark.parametrize('to_key', ['timedelta64[ns]', 'bool', 'object',
+ # 'complex128', 'float64', 'int64'])
+ # @pytest.mark.parametrize('from_key', ['datetime64[ns, UTC]',
+ # 'datetime64[ns, US/Eastern]'])
+ # def test_replace_series_datetime_tz(self, how, to_key, from_key):
+ def test_replace_series_datetime_tz(self):
+ how = 'series'
+ from_key = 'datetime64[ns, US/Eastern]'
+ to_key = 'timedelta64[ns]'
+
+ index = pd.Index([3, 4], name='xxx')
+ obj = pd.Series(self.rep[from_key], index=index, name='yyy')
+ assert obj.dtype == from_key
+
+ if how == 'dict':
+ replacer = dict(zip(self.rep[from_key], self.rep[to_key]))
+ elif how == 'series':
+ replacer = pd.Series(self.rep[to_key], index=self.rep[from_key])
+ else:
+ raise ValueError
+
+ result = obj.replace(replacer)
+ exp = pd.Series(self.rep[to_key], index=index, name='yyy')
+ assert exp.dtype == to_key
+
+ tm.assert_series_equal(result, exp)
+
+ # TODO(jreback) commented out to only have a single xfail printed
+ @pytest.mark.xfail(reason="different tz, "
+ "currently mask_missing raises SystemError",
+ strict=False)
+ # @pytest.mark.parametrize('how', ['dict', 'series'])
+ # @pytest.mark.parametrize('to_key', [
+ # 'datetime64[ns]', 'datetime64[ns, UTC]',
+ # 'datetime64[ns, US/Eastern]'])
+ # @pytest.mark.parametrize('from_key', [
+ # 'datetime64[ns]', 'datetime64[ns, UTC]',
+ # 'datetime64[ns, US/Eastern]'])
+ # def test_replace_series_datetime_datetime(self, how, to_key, from_key):
+ def test_replace_series_datetime_datetime(self):
+ how = 'dict'
+ to_key = 'datetime64[ns]'
+ from_key = 'datetime64[ns]'
+
+ index = pd.Index([3, 4], name='xxx')
+ obj = pd.Series(self.rep[from_key], index=index, name='yyy')
+ assert obj.dtype == from_key
+
+ if how == 'dict':
+ replacer = dict(zip(self.rep[from_key], self.rep[to_key]))
+ elif how == 'series':
+ replacer = pd.Series(self.rep[to_key], index=self.rep[from_key])
+ else:
+ raise ValueError
+
+ result = obj.replace(replacer)
+ exp = pd.Series(self.rep[to_key], index=index, name='yyy')
+ assert exp.dtype == to_key
+
+ tm.assert_series_equal(result, exp)
+
+ def test_replace_series_period(self):
+ pass
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_datetime.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_datetime.py
new file mode 100644
index 00000000000..11fb90ebd9b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_datetime.py
@@ -0,0 +1,315 @@
+from datetime import datetime, timedelta
+
+from dateutil import tz
+import numpy as np
+
+import pandas as pd
+from pandas import DataFrame, Index, Series, Timestamp, date_range
+from pandas.util import testing as tm
+
+
+class TestDatetimeIndex(object):
+
+ def test_setitem_with_datetime_tz(self):
+ # 16889
+ # support .loc with alignment and tz-aware DatetimeIndex
+ mask = np.array([True, False, True, False])
+
+ idx = date_range('20010101', periods=4, tz='UTC')
+ df = DataFrame({'a': np.arange(4)}, index=idx).astype('float64')
+
+ result = df.copy()
+ result.loc[mask, :] = df.loc[mask, :]
+ tm.assert_frame_equal(result, df)
+
+ result = df.copy()
+ result.loc[mask] = df.loc[mask]
+ tm.assert_frame_equal(result, df)
+
+ idx = date_range('20010101', periods=4)
+ df = DataFrame({'a': np.arange(4)}, index=idx).astype('float64')
+
+ result = df.copy()
+ result.loc[mask, :] = df.loc[mask, :]
+ tm.assert_frame_equal(result, df)
+
+ result = df.copy()
+ result.loc[mask] = df.loc[mask]
+ tm.assert_frame_equal(result, df)
+
+ def test_indexing_with_datetime_tz(self):
+
+ # 8260
+ # support datetime64 with tz
+
+ idx = Index(date_range('20130101', periods=3, tz='US/Eastern'),
+ name='foo')
+ dr = date_range('20130110', periods=3)
+ df = DataFrame({'A': idx, 'B': dr})
+ df['C'] = idx
+ df.iloc[1, 1] = pd.NaT
+ df.iloc[1, 2] = pd.NaT
+
+ # indexing
+ result = df.iloc[1]
+ expected = Series([Timestamp('2013-01-02 00:00:00-0500',
+ tz='US/Eastern'), np.nan, np.nan],
+ index=list('ABC'), dtype='object', name=1)
+ tm.assert_series_equal(result, expected)
+ result = df.loc[1]
+ expected = Series([Timestamp('2013-01-02 00:00:00-0500',
+ tz='US/Eastern'), np.nan, np.nan],
+ index=list('ABC'), dtype='object', name=1)
+ tm.assert_series_equal(result, expected)
+
+ # indexing - fast_xs
+ df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')})
+ result = df.iloc[5]
+ expected = Timestamp('2014-01-06 00:00:00+0000', tz='UTC', freq='D')
+ assert result == expected
+
+ result = df.loc[5]
+ assert result == expected
+
+ # indexing - boolean
+ result = df[df.a > df.a[3]]
+ expected = df.iloc[4:]
+ tm.assert_frame_equal(result, expected)
+
+ # indexing - setting an element
+ df = DataFrame(data=pd.to_datetime(
+ ['2015-03-30 20:12:32', '2015-03-12 00:11:11']), columns=['time'])
+ df['new_col'] = ['new', 'old']
+ df.time = df.set_index('time').index.tz_localize('UTC')
+ v = df[df.new_col == 'new'].set_index('time').index.tz_convert(
+ 'US/Pacific')
+
+ # trying to set a single element on a part of a different timezone
+ # this converts to object
+ df2 = df.copy()
+ df2.loc[df2.new_col == 'new', 'time'] = v
+
+ expected = Series([v[0], df.loc[1, 'time']], name='time')
+ tm.assert_series_equal(df2.time, expected)
+
+ v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s')
+ df.loc[df.new_col == 'new', 'time'] = v
+ tm.assert_series_equal(df.loc[df.new_col == 'new', 'time'], v)
+
+ def test_consistency_with_tz_aware_scalar(self):
+        # xref gh-12938
+ # various ways of indexing the same tz-aware scalar
+ df = Series([Timestamp('2016-03-30 14:35:25',
+ tz='Europe/Brussels')]).to_frame()
+
+ df = pd.concat([df, df]).reset_index(drop=True)
+ expected = Timestamp('2016-03-30 14:35:25+0200',
+ tz='Europe/Brussels')
+
+ result = df[0][0]
+ assert result == expected
+
+ result = df.iloc[0, 0]
+ assert result == expected
+
+ result = df.loc[0, 0]
+ assert result == expected
+
+ result = df.iat[0, 0]
+ assert result == expected
+
+ result = df.at[0, 0]
+ assert result == expected
+
+ result = df[0].loc[0]
+ assert result == expected
+
+ result = df[0].at[0]
+ assert result == expected
+
+ def test_indexing_with_datetimeindex_tz(self):
+
+ # GH 12050
+ # indexing on a series with a datetimeindex with tz
+ index = date_range('2015-01-01', periods=2, tz='utc')
+
+ ser = Series(range(2), index=index, dtype='int64')
+
+ # list-like indexing
+
+ for sel in (index, list(index)):
+ # getitem
+ tm.assert_series_equal(ser[sel], ser)
+
+ # setitem
+ result = ser.copy()
+ result[sel] = 1
+ expected = Series(1, index=index)
+ tm.assert_series_equal(result, expected)
+
+ # .loc getitem
+ tm.assert_series_equal(ser.loc[sel], ser)
+
+ # .loc setitem
+ result = ser.copy()
+ result.loc[sel] = 1
+ expected = Series(1, index=index)
+ tm.assert_series_equal(result, expected)
+
+ # single element indexing
+
+ # getitem
+ assert ser[index[1]] == 1
+
+ # setitem
+ result = ser.copy()
+ result[index[1]] = 5
+ expected = Series([0, 5], index=index)
+ tm.assert_series_equal(result, expected)
+
+ # .loc getitem
+ assert ser.loc[index[1]] == 1
+
+ # .loc setitem
+ result = ser.copy()
+ result.loc[index[1]] = 5
+ expected = Series([0, 5], index=index)
+ tm.assert_series_equal(result, expected)
+
+ def test_partial_setting_with_datetimelike_dtype(self):
+
+ # GH9478
+ # a datetimeindex alignment issue with partial setting
+ df = DataFrame(np.arange(6.).reshape(3, 2), columns=list('AB'),
+ index=date_range('1/1/2000', periods=3, freq='1H'))
+ expected = df.copy()
+ expected['C'] = [expected.index[0]] + [pd.NaT, pd.NaT]
+
+ mask = df.A < 1
+ df.loc[mask, 'C'] = df.loc[mask].index
+ tm.assert_frame_equal(df, expected)
+
+ def test_loc_setitem_datetime(self):
+
+ # GH 9516
+ dt1 = Timestamp('20130101 09:00:00')
+ dt2 = Timestamp('20130101 10:00:00')
+
+ for conv in [lambda x: x, lambda x: x.to_datetime64(),
+ lambda x: x.to_pydatetime(), lambda x: np.datetime64(x)]:
+
+ df = DataFrame()
+ df.loc[conv(dt1), 'one'] = 100
+ df.loc[conv(dt2), 'one'] = 200
+
+ expected = DataFrame({'one': [100.0, 200.0]}, index=[dt1, dt2])
+ tm.assert_frame_equal(df, expected)
+
+ def test_series_partial_set_datetime(self):
+ # GH 11497
+
+ idx = date_range('2011-01-01', '2011-01-02', freq='D', name='idx')
+ ser = Series([0.1, 0.2], index=idx, name='s')
+
+ result = ser.loc[[Timestamp('2011-01-01'), Timestamp('2011-01-02')]]
+ exp = Series([0.1, 0.2], index=idx, name='s')
+ tm.assert_series_equal(result, exp, check_index_type=True)
+
+ keys = [Timestamp('2011-01-02'), Timestamp('2011-01-02'),
+ Timestamp('2011-01-01')]
+ exp = Series([0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name='idx'),
+ name='s')
+ tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True)
+
+ keys = [Timestamp('2011-01-03'), Timestamp('2011-01-02'),
+ Timestamp('2011-01-03')]
+ exp = Series([np.nan, 0.2, np.nan],
+ index=pd.DatetimeIndex(keys, name='idx'), name='s')
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True)
+
+ def test_series_partial_set_period(self):
+ # GH 11497
+
+ idx = pd.period_range('2011-01-01', '2011-01-02', freq='D', name='idx')
+ ser = Series([0.1, 0.2], index=idx, name='s')
+
+ result = ser.loc[[pd.Period('2011-01-01', freq='D'),
+ pd.Period('2011-01-02', freq='D')]]
+ exp = Series([0.1, 0.2], index=idx, name='s')
+ tm.assert_series_equal(result, exp, check_index_type=True)
+
+ keys = [pd.Period('2011-01-02', freq='D'),
+ pd.Period('2011-01-02', freq='D'),
+ pd.Period('2011-01-01', freq='D')]
+ exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name='idx'),
+ name='s')
+ tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True)
+
+ keys = [pd.Period('2011-01-03', freq='D'),
+ pd.Period('2011-01-02', freq='D'),
+ pd.Period('2011-01-03', freq='D')]
+ exp = Series([np.nan, 0.2, np.nan],
+ index=pd.PeriodIndex(keys, name='idx'), name='s')
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = ser.loc[keys]
+ tm.assert_series_equal(result, exp)
+
+ def test_nanosecond_getitem_setitem_with_tz(self):
+ # GH 11679
+ data = ['2016-06-28 08:30:00.123456789']
+ index = pd.DatetimeIndex(data, dtype='datetime64[ns, America/Chicago]')
+ df = DataFrame({'a': [10]}, index=index)
+ result = df.loc[df.index[0]]
+ expected = Series(10, index=['a'], name=df.index[0])
+ tm.assert_series_equal(result, expected)
+
+ result = df.copy()
+ result.loc[df.index[0], 'a'] = -1
+ expected = DataFrame(-1, index=index, columns=['a'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_loc_getitem_across_dst(self):
+ # GH 21846
+ idx = pd.date_range('2017-10-29 01:30:00',
+ tz='Europe/Berlin', periods=5, freq='30 min')
+ series2 = pd.Series([0, 1, 2, 3, 4],
+ index=idx)
+
+ t_1 = pd.Timestamp('2017-10-29 02:30:00+02:00', tz='Europe/Berlin',
+ freq='30min')
+ t_2 = pd.Timestamp('2017-10-29 02:00:00+01:00', tz='Europe/Berlin',
+ freq='30min')
+ result = series2.loc[t_1:t_2]
+ expected = pd.Series([2, 3], index=idx[2:4])
+ tm.assert_series_equal(result, expected)
+
+ result = series2[t_1]
+ expected = 2
+ assert result == expected
+
+ def test_loc_incremental_setitem_with_dst(self):
+ # GH 20724
+ base = datetime(2015, 11, 1, tzinfo=tz.gettz("US/Pacific"))
+ idxs = [base + timedelta(seconds=i * 900) for i in range(16)]
+ result = pd.Series([0], index=[idxs[0]])
+ for ts in idxs:
+ result.loc[ts] = 1
+ expected = pd.Series(1, index=idxs)
+ tm.assert_series_equal(result, expected)
+
+ def test_loc_setitem_with_existing_dst(self):
+ # GH 18308
+ start = pd.Timestamp('2017-10-29 00:00:00+0200', tz='Europe/Madrid')
+ end = pd.Timestamp('2017-10-29 03:00:00+0100', tz='Europe/Madrid')
+ ts = pd.Timestamp('2016-10-10 03:00:00', tz='Europe/Madrid')
+ idx = pd.date_range(start, end, closed='left', freq="H")
+ result = pd.DataFrame(index=idx, columns=['value'])
+ result.loc[ts, 'value'] = 12
+ expected = pd.DataFrame([np.nan] * len(idx) + [12],
+ index=idx.append(pd.DatetimeIndex([ts])),
+ columns=['value'],
+ dtype=object)
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_floats.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_floats.py
new file mode 100644
index 00000000000..de91b8f4a79
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_floats.py
@@ -0,0 +1,898 @@
+# -*- coding: utf-8 -*-
+
+from warnings import catch_warnings
+
+import numpy as np
+import pytest
+
+from pandas import (
+ DataFrame, Float64Index, Index, Int64Index, RangeIndex, Series)
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal, assert_series_equal
+
+ignore_ix = pytest.mark.filterwarnings("ignore:\\n.ix:DeprecationWarning")
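+# NOTE: `.ix` is deprecated; this mark silences the DeprecationWarning the
+# legacy `.ix` accessor emits so the tests below can still exercise it.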
+
+
+class TestFloatIndexers(object):
+
+ def check(self, result, original, indexer, getitem):
+ """
+        comparator for results;
+        we need to take care whether we are indexing
+        on a Series or a DataFrame
+ """
+ if isinstance(original, Series):
+ expected = original.iloc[indexer]
+ else:
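+            # plain __getitem__ on a DataFrame selects columns, so the
+            # expected values are taken along axis 1 in that case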
+ if getitem:
+ expected = original.iloc[:, indexer]
+ else:
+ expected = original.iloc[indexer]
+
+ assert_almost_equal(result, expected)
+
+ def test_scalar_error(self):
+
+ # GH 4892
+ # float_indexers should raise exceptions
+ # on appropriate Index types & accessors
+ # this duplicates the code below
+        # but is specifically testing for the error
+ # message
+
+ for index in [tm.makeStringIndex, tm.makeUnicodeIndex,
+ tm.makeCategoricalIndex,
+ tm.makeDateIndex, tm.makeTimedeltaIndex,
+ tm.makePeriodIndex, tm.makeIntIndex,
+ tm.makeRangeIndex]:
+
+ i = index(5)
+
+ s = Series(np.arange(len(i)), index=i)
+
+ msg = 'Cannot index by location index'
+ with pytest.raises(TypeError, match=msg):
+ s.iloc[3.0]
+
+ def f():
+ s.iloc[3.0] = 0
+ pytest.raises(TypeError, f)
+
+ @ignore_ix
+ def test_scalar_non_numeric(self):
+
+ # GH 4892
+ # float_indexers should raise exceptions
+ # on appropriate Index types & accessors
+
+ for index in [tm.makeStringIndex, tm.makeUnicodeIndex,
+ tm.makeCategoricalIndex,
+ tm.makeDateIndex, tm.makeTimedeltaIndex,
+ tm.makePeriodIndex]:
+
+ i = index(5)
+
+ for s in [Series(
+ np.arange(len(i)), index=i), DataFrame(
+ np.random.randn(
+ len(i), len(i)), index=i, columns=i)]:
+
+ # getting
+ for idxr, getitem in [(lambda x: x.ix, False),
+ (lambda x: x.iloc, False),
+ (lambda x: x, True)]:
+
+ def f():
+ with catch_warnings(record=True):
+ idxr(s)[3.0]
+
+                    # getitem on a DataFrame is a KeyError as it is indexing
+ # via labels on the columns
+ if getitem and isinstance(s, DataFrame):
+ error = KeyError
+ else:
+ error = TypeError
+ pytest.raises(error, f)
+
+ # label based can be a TypeError or KeyError
+ def f():
+ s.loc[3.0]
+
+ if s.index.inferred_type in ['string', 'unicode', 'mixed']:
+ error = KeyError
+ else:
+ error = TypeError
+ pytest.raises(error, f)
+
+ # contains
+ assert 3.0 not in s
+
+ # setting with a float fails with iloc
+ def f():
+ s.iloc[3.0] = 0
+ pytest.raises(TypeError, f)
+
+ # setting with an indexer
+ if s.index.inferred_type in ['categorical']:
+ # Value or Type Error
+ pass
+ elif s.index.inferred_type in ['datetime64', 'timedelta64',
+ 'period']:
+
+                    # these should probably work
+                    # but are inconsistent between series/dataframe ATM
+ # for idxr in [lambda x: x.ix,
+ # lambda x: x]:
+ # s2 = s.copy()
+ # def f():
+ # idxr(s2)[3.0] = 0
+ # pytest.raises(TypeError, f)
+ pass
+
+ else:
+
+ s2 = s.copy()
+ s2.loc[3.0] = 10
+ assert s2.index.is_object()
+
+ for idxr in [lambda x: x.ix,
+ lambda x: x]:
+ s2 = s.copy()
+ with catch_warnings(record=True):
+ idxr(s2)[3.0] = 0
+ assert s2.index.is_object()
+
+            # falls back to position selection, series only
+ s = Series(np.arange(len(i)), index=i)
+ s[3]
+ pytest.raises(TypeError, lambda: s[3.0])
+
+ @ignore_ix
+ def test_scalar_with_mixed(self):
+
+ s2 = Series([1, 2, 3], index=['a', 'b', 'c'])
+ s3 = Series([1, 2, 3], index=['a', 'b', 1.5])
+
+ # lookup in a pure string index
+ # with an invalid indexer
+ for idxr in [lambda x: x.ix,
+ lambda x: x,
+ lambda x: x.iloc]:
+
+ def f():
+ with catch_warnings(record=True):
+ idxr(s2)[1.0]
+
+ pytest.raises(TypeError, f)
+
+ pytest.raises(KeyError, lambda: s2.loc[1.0])
+
+ result = s2.loc['b']
+ expected = 2
+ assert result == expected
+
+ # mixed index so we have label
+ # indexing
+ for idxr in [lambda x: x]:
+
+ def f():
+ idxr(s3)[1.0]
+
+ pytest.raises(TypeError, f)
+
+ result = idxr(s3)[1]
+ expected = 2
+ assert result == expected
+
+ # mixed index so we have label
+ # indexing
+ for idxr in [lambda x: x.ix]:
+ with catch_warnings(record=True):
+
+ def f():
+ idxr(s3)[1.0]
+
+ pytest.raises(TypeError, f)
+
+ result = idxr(s3)[1]
+ expected = 2
+ assert result == expected
+
+ pytest.raises(TypeError, lambda: s3.iloc[1.0])
+ pytest.raises(KeyError, lambda: s3.loc[1.0])
+
+ result = s3.loc[1.5]
+ expected = 3
+ assert result == expected
+
+ @ignore_ix
+ def test_scalar_integer(self):
+
+ # test how scalar float indexers work on int indexes
+
+ # integer index
+ for i in [Int64Index(range(5)), RangeIndex(5)]:
+
+ for s in [Series(np.arange(len(i))),
+ DataFrame(np.random.randn(len(i), len(i)),
+ index=i, columns=i)]:
+
+ # coerce to equal int
+ for idxr, getitem in [(lambda x: x.ix, False),
+ (lambda x: x.loc, False),
+ (lambda x: x, True)]:
+
+ with catch_warnings(record=True):
+ result = idxr(s)[3.0]
+ self.check(result, s, 3, getitem)
+
+ # coerce to equal int
+ for idxr, getitem in [(lambda x: x.ix, False),
+ (lambda x: x.loc, False),
+ (lambda x: x, True)]:
+
+ if isinstance(s, Series):
+ def compare(x, y):
+ assert x == y
+ expected = 100
+ else:
+ compare = tm.assert_series_equal
+ if getitem:
+ expected = Series(100,
+ index=range(len(s)), name=3)
+ else:
+ expected = Series(100.,
+ index=range(len(s)), name=3)
+
+ s2 = s.copy()
+ with catch_warnings(record=True):
+ idxr(s2)[3.0] = 100
+
+ result = idxr(s2)[3.0]
+ compare(result, expected)
+
+ result = idxr(s2)[3]
+ compare(result, expected)
+
+ # contains
+ # coerce to equal int
+ assert 3.0 in s
+
+ @ignore_ix
+ def test_scalar_float(self):
+
+ # scalar float indexers work on a float index
+ index = Index(np.arange(5.))
+ for s in [Series(np.arange(len(index)), index=index),
+ DataFrame(np.random.randn(len(index), len(index)),
+ index=index, columns=index)]:
+
+ # assert all operations except for iloc are ok
+ indexer = index[3]
+ for idxr, getitem in [(lambda x: x.ix, False),
+ (lambda x: x.loc, False),
+ (lambda x: x, True)]:
+
+ # getting
+ result = idxr(s)[indexer]
+ self.check(result, s, 3, getitem)
+
+ # setting
+ s2 = s.copy()
+
+ def f():
+ with catch_warnings(record=True):
+ idxr(s2)[indexer] = expected
+ with catch_warnings(record=True):
+ result = idxr(s2)[indexer]
+ self.check(result, s, 3, getitem)
+
+ # random integer is a KeyError
+ with catch_warnings(record=True):
+ pytest.raises(KeyError, lambda: idxr(s)[3.5])
+
+ # contains
+ assert 3.0 in s
+
+ # iloc succeeds with an integer
+ expected = s.iloc[3]
+ s2 = s.copy()
+
+ s2.iloc[3] = expected
+ result = s2.iloc[3]
+ self.check(result, s, 3, False)
+
+ # iloc raises with a float
+ pytest.raises(TypeError, lambda: s.iloc[3.0])
+
+ def g():
+ s2.iloc[3.0] = 0
+ pytest.raises(TypeError, g)
+
+ @ignore_ix
+ def test_slice_non_numeric(self):
+
+ # GH 4892
+ # float_indexers should raise exceptions
+ # on appropriate Index types & accessors
+
+ for index in [tm.makeStringIndex, tm.makeUnicodeIndex,
+ tm.makeDateIndex, tm.makeTimedeltaIndex,
+ tm.makePeriodIndex]:
+
+ index = index(5)
+ for s in [Series(range(5), index=index),
+ DataFrame(np.random.randn(5, 2), index=index)]:
+
+ # getitem
+ for l in [slice(3.0, 4),
+ slice(3, 4.0),
+ slice(3.0, 4.0)]:
+
+ def f():
+ s.iloc[l]
+ pytest.raises(TypeError, f)
+
+ for idxr in [lambda x: x.ix,
+ lambda x: x.loc,
+ lambda x: x.iloc,
+ lambda x: x]:
+
+ def f():
+ with catch_warnings(record=True):
+ idxr(s)[l]
+ pytest.raises(TypeError, f)
+
+ # setitem
+ for l in [slice(3.0, 4),
+ slice(3, 4.0),
+ slice(3.0, 4.0)]:
+
+ def f():
+ s.iloc[l] = 0
+ pytest.raises(TypeError, f)
+
+ for idxr in [lambda x: x.ix,
+ lambda x: x.loc,
+ lambda x: x.iloc,
+ lambda x: x]:
+ def f():
+ with catch_warnings(record=True):
+ idxr(s)[l] = 0
+ pytest.raises(TypeError, f)
+
+ @ignore_ix
+ def test_slice_integer(self):
+
+ # same as above, but for Integer based indexes
+ # these coerce to a like integer
+ # oob indicates if we are out of bounds
+ # of positional indexing
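+        # e.g. Int64Index(range(5)) + 10 has labels 10..14, so a label-based
+        # slice such as .loc[3:4] matches nothing (oob -> expected slice(0, 0))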
+ for index, oob in [(Int64Index(range(5)), False),
+ (RangeIndex(5), False),
+ (Int64Index(range(5)) + 10, True)]:
+
+ # s is an in-range index
+ s = Series(range(5), index=index)
+
+ # getitem
+ for l in [slice(3.0, 4),
+ slice(3, 4.0),
+ slice(3.0, 4.0)]:
+
+ for idxr in [lambda x: x.loc,
+ lambda x: x.ix]:
+
+ with catch_warnings(record=True):
+ result = idxr(s)[l]
+
+ # these are all label indexing
+ # except getitem which is positional
+ # empty
+ if oob:
+ indexer = slice(0, 0)
+ else:
+ indexer = slice(3, 5)
+ self.check(result, s, indexer, False)
+
+ # positional indexing
+ def f():
+ s[l]
+
+ pytest.raises(TypeError, f)
+
+ # getitem out-of-bounds
+ for l in [slice(-6, 6),
+ slice(-6.0, 6.0)]:
+
+ for idxr in [lambda x: x.loc,
+ lambda x: x.ix]:
+ with catch_warnings(record=True):
+ result = idxr(s)[l]
+
+ # these are all label indexing
+ # except getitem which is positional
+ # empty
+ if oob:
+ indexer = slice(0, 0)
+ else:
+ indexer = slice(-6, 6)
+ self.check(result, s, indexer, False)
+
+ # positional indexing
+ def f():
+ s[slice(-6.0, 6.0)]
+
+ pytest.raises(TypeError, f)
+
+ # getitem odd floats
+ for l, res1 in [(slice(2.5, 4), slice(3, 5)),
+ (slice(2, 3.5), slice(2, 4)),
+ (slice(2.5, 3.5), slice(3, 4))]:
+
+ for idxr in [lambda x: x.loc,
+ lambda x: x.ix]:
+
+ with catch_warnings(record=True):
+ result = idxr(s)[l]
+ if oob:
+ res = slice(0, 0)
+ else:
+ res = res1
+
+ self.check(result, s, res, False)
+
+ # positional indexing
+ def f():
+ s[l]
+
+ pytest.raises(TypeError, f)
+
+ # setitem
+ for l in [slice(3.0, 4),
+ slice(3, 4.0),
+ slice(3.0, 4.0)]:
+
+ for idxr in [lambda x: x.loc,
+ lambda x: x.ix]:
+ sc = s.copy()
+ with catch_warnings(record=True):
+ idxr(sc)[l] = 0
+ result = idxr(sc)[l].values.ravel()
+ assert (result == 0).all()
+
+ # positional indexing
+ def f():
+ s[l] = 0
+
+ pytest.raises(TypeError, f)
+
+ def test_integer_positional_indexing(self):
+ """ make sure that we are raising on positional indexing
+ w.r.t. an integer index """
+
+ s = Series(range(2, 6), index=range(2, 6))
+
+ result = s[2:4]
+ expected = s.iloc[2:4]
+ assert_series_equal(result, expected)
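+        # with an integer index, plain __getitem__ slicing is positional,
+        # which is why s[2:4] equals s.iloc[2:4] above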
+
+ for idxr in [lambda x: x,
+ lambda x: x.iloc]:
+
+ for l in [slice(2, 4.0),
+ slice(2.0, 4),
+ slice(2.0, 4.0)]:
+
+ def f():
+ idxr(s)[l]
+
+ pytest.raises(TypeError, f)
+
+ @ignore_ix
+ def test_slice_integer_frame_getitem(self):
+
+ # similar to above, but on the getitem dim (of a DataFrame)
+ for index in [Int64Index(range(5)), RangeIndex(5)]:
+
+ s = DataFrame(np.random.randn(5, 2), index=index)
+
+ def f(idxr):
+
+ # getitem
+ for l in [slice(0.0, 1),
+ slice(0, 1.0),
+ slice(0.0, 1.0)]:
+
+ result = idxr(s)[l]
+ indexer = slice(0, 2)
+ self.check(result, s, indexer, False)
+
+ # positional indexing
+ def f():
+ s[l]
+
+ pytest.raises(TypeError, f)
+
+ # getitem out-of-bounds
+ for l in [slice(-10, 10),
+ slice(-10.0, 10.0)]:
+
+ result = idxr(s)[l]
+ self.check(result, s, slice(-10, 10), True)
+
+ # positional indexing
+ def f():
+ s[slice(-10.0, 10.0)]
+
+ pytest.raises(TypeError, f)
+
+ # getitem odd floats
+ for l, res in [(slice(0.5, 1), slice(1, 2)),
+ (slice(0, 0.5), slice(0, 1)),
+ (slice(0.5, 1.5), slice(1, 2))]:
+
+ result = idxr(s)[l]
+ self.check(result, s, res, False)
+
+ # positional indexing
+ def f():
+ s[l]
+
+ pytest.raises(TypeError, f)
+
+ # setitem
+ for l in [slice(3.0, 4),
+ slice(3, 4.0),
+ slice(3.0, 4.0)]:
+
+ sc = s.copy()
+ idxr(sc)[l] = 0
+ result = idxr(sc)[l].values.ravel()
+ assert (result == 0).all()
+
+ # positional indexing
+ def f():
+ s[l] = 0
+
+ pytest.raises(TypeError, f)
+
+ f(lambda x: x.loc)
+ with catch_warnings(record=True):
+ f(lambda x: x.ix)
+
+ @ignore_ix
+ def test_slice_float(self):
+
+ # same as above, but for floats
+ index = Index(np.arange(5.)) + 0.1
+ for s in [Series(range(5), index=index),
+ DataFrame(np.random.randn(5, 2), index=index)]:
+
+ for l in [slice(3.0, 4),
+ slice(3, 4.0),
+ slice(3.0, 4.0)]:
+
+ expected = s.iloc[3:4]
+ for idxr in [lambda x: x.ix,
+ lambda x: x.loc,
+ lambda x: x]:
+
+ # getitem
+ with catch_warnings(record=True):
+ result = idxr(s)[l]
+ if isinstance(s, Series):
+ tm.assert_series_equal(result, expected)
+ else:
+ tm.assert_frame_equal(result, expected)
+ # setitem
+ s2 = s.copy()
+ with catch_warnings(record=True):
+ idxr(s2)[l] = 0
+ result = idxr(s2)[l].values.ravel()
+ assert (result == 0).all()
+
+ def test_floating_index_doc_example(self):
+
+ index = Index([1.5, 2, 3, 4.5, 5])
+ s = Series(range(5), index=index)
+ assert s[3] == 2
+ assert s.loc[3] == 2
+ assert s.loc[3] == 2
+ assert s.iloc[3] == 3
+
+ def test_floating_misc(self):
+
+ # related 236
+ # scalar/slicing of a float index
+ s = Series(np.arange(5), index=np.arange(5) * 2.5, dtype=np.int64)
+
+ # label based slicing
+ result1 = s[1.0:3.0]
+ result2 = s.loc[1.0:3.0]
+ result3 = s.loc[1.0:3.0]
+ assert_series_equal(result1, result2)
+ assert_series_equal(result1, result3)
+
+ # exact indexing when found
+ result1 = s[5.0]
+ result2 = s.loc[5.0]
+ result3 = s.loc[5.0]
+ assert result1 == result2
+ assert result1 == result3
+
+ result1 = s[5]
+ result2 = s.loc[5]
+ result3 = s.loc[5]
+ assert result1 == result2
+ assert result1 == result3
+
+ assert s[5.0] == s[5]
+
+        # value not found (and no fallback at all)
+
+ # scalar integers
+ pytest.raises(KeyError, lambda: s.loc[4])
+ pytest.raises(KeyError, lambda: s.loc[4])
+ pytest.raises(KeyError, lambda: s[4])
+
+ # fancy floats/integers create the correct entry (as nan)
+ # fancy tests
+ expected = Series([2, 0], index=Float64Index([5.0, 0.0]))
+ for fancy_idx in [[5.0, 0.0], np.array([5.0, 0.0])]: # float
+ assert_series_equal(s[fancy_idx], expected)
+ assert_series_equal(s.loc[fancy_idx], expected)
+ assert_series_equal(s.loc[fancy_idx], expected)
+
+ expected = Series([2, 0], index=Index([5, 0], dtype='int64'))
+ for fancy_idx in [[5, 0], np.array([5, 0])]: # int
+ assert_series_equal(s[fancy_idx], expected)
+ assert_series_equal(s.loc[fancy_idx], expected)
+ assert_series_equal(s.loc[fancy_idx], expected)
+
+ # all should return the same as we are slicing 'the same'
+ result1 = s.loc[2:5]
+ result2 = s.loc[2.0:5.0]
+ result3 = s.loc[2.0:5]
+ result4 = s.loc[2.1:5]
+ assert_series_equal(result1, result2)
+ assert_series_equal(result1, result3)
+ assert_series_equal(result1, result4)
+
+ # previously this did fallback indexing
+ result1 = s[2:5]
+ result2 = s[2.0:5.0]
+ result3 = s[2.0:5]
+ result4 = s[2.1:5]
+ assert_series_equal(result1, result2)
+ assert_series_equal(result1, result3)
+ assert_series_equal(result1, result4)
+
+ result1 = s.loc[2:5]
+ result2 = s.loc[2.0:5.0]
+ result3 = s.loc[2.0:5]
+ result4 = s.loc[2.1:5]
+ assert_series_equal(result1, result2)
+ assert_series_equal(result1, result3)
+ assert_series_equal(result1, result4)
+
+ # combined test
+ result1 = s.loc[2:5]
+ result2 = s.loc[2:5]
+ result3 = s[2:5]
+
+ assert_series_equal(result1, result2)
+ assert_series_equal(result1, result3)
+
+ # list selection
+ result1 = s[[0.0, 5, 10]]
+ result2 = s.loc[[0.0, 5, 10]]
+ result3 = s.loc[[0.0, 5, 10]]
+ result4 = s.iloc[[0, 2, 4]]
+ assert_series_equal(result1, result2)
+ assert_series_equal(result1, result3)
+ assert_series_equal(result1, result4)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result1 = s[[1.6, 5, 10]]
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result2 = s.loc[[1.6, 5, 10]]
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result3 = s.loc[[1.6, 5, 10]]
+ assert_series_equal(result1, result2)
+ assert_series_equal(result1, result3)
+ assert_series_equal(result1, Series(
+ [np.nan, 2, 4], index=[1.6, 5, 10]))
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result1 = s[[0, 1, 2]]
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result2 = s.loc[[0, 1, 2]]
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result3 = s.loc[[0, 1, 2]]
+ assert_series_equal(result1, result2)
+ assert_series_equal(result1, result3)
+ assert_series_equal(result1, Series(
+ [0.0, np.nan, np.nan], index=[0, 1, 2]))
+
+ result1 = s.loc[[2.5, 5]]
+ result2 = s.loc[[2.5, 5]]
+ assert_series_equal(result1, result2)
+ assert_series_equal(result1, Series([1, 2], index=[2.5, 5.0]))
+
+ result1 = s[[2.5]]
+ result2 = s.loc[[2.5]]
+ result3 = s.loc[[2.5]]
+ assert_series_equal(result1, result2)
+ assert_series_equal(result1, result3)
+ assert_series_equal(result1, Series([1], index=[2.5]))
+
+ def test_floating_tuples(self):
+ # see gh-13509
+ s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.1, 0.2], name='foo')
+
+ result = s[0.0]
+ assert result == (1, 1)
+
+ expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name='foo')
+ s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.0, 0.2], name='foo')
+
+ result = s[0.0]
+ tm.assert_series_equal(result, expected)
+
+ def test_float64index_slicing_bug(self):
+ # GH 5557, related to slicing a float index
+ ser = {256: 2321.0,
+ 1: 78.0,
+ 2: 2716.0,
+ 3: 0.0,
+ 4: 369.0,
+ 5: 0.0,
+ 6: 269.0,
+ 7: 0.0,
+ 8: 0.0,
+ 9: 0.0,
+ 10: 3536.0,
+ 11: 0.0,
+ 12: 24.0,
+ 13: 0.0,
+ 14: 931.0,
+ 15: 0.0,
+ 16: 101.0,
+ 17: 78.0,
+ 18: 9643.0,
+ 19: 0.0,
+ 20: 0.0,
+ 21: 0.0,
+ 22: 63761.0,
+ 23: 0.0,
+ 24: 446.0,
+ 25: 0.0,
+ 26: 34773.0,
+ 27: 0.0,
+ 28: 729.0,
+ 29: 78.0,
+ 30: 0.0,
+ 31: 0.0,
+ 32: 3374.0,
+ 33: 0.0,
+ 34: 1391.0,
+ 35: 0.0,
+ 36: 361.0,
+ 37: 0.0,
+ 38: 61808.0,
+ 39: 0.0,
+ 40: 0.0,
+ 41: 0.0,
+ 42: 6677.0,
+ 43: 0.0,
+ 44: 802.0,
+ 45: 0.0,
+ 46: 2691.0,
+ 47: 0.0,
+ 48: 3582.0,
+ 49: 0.0,
+ 50: 734.0,
+ 51: 0.0,
+ 52: 627.0,
+ 53: 70.0,
+ 54: 2584.0,
+ 55: 0.0,
+ 56: 324.0,
+ 57: 0.0,
+ 58: 605.0,
+ 59: 0.0,
+ 60: 0.0,
+ 61: 0.0,
+ 62: 3989.0,
+ 63: 10.0,
+ 64: 42.0,
+ 65: 0.0,
+ 66: 904.0,
+ 67: 0.0,
+ 68: 88.0,
+ 69: 70.0,
+ 70: 8172.0,
+ 71: 0.0,
+ 72: 0.0,
+ 73: 0.0,
+ 74: 64902.0,
+ 75: 0.0,
+ 76: 347.0,
+ 77: 0.0,
+ 78: 36605.0,
+ 79: 0.0,
+ 80: 379.0,
+ 81: 70.0,
+ 82: 0.0,
+ 83: 0.0,
+ 84: 3001.0,
+ 85: 0.0,
+ 86: 1630.0,
+ 87: 7.0,
+ 88: 364.0,
+ 89: 0.0,
+ 90: 67404.0,
+ 91: 9.0,
+ 92: 0.0,
+ 93: 0.0,
+ 94: 7685.0,
+ 95: 0.0,
+ 96: 1017.0,
+ 97: 0.0,
+ 98: 2831.0,
+ 99: 0.0,
+ 100: 2963.0,
+ 101: 0.0,
+ 102: 854.0,
+ 103: 0.0,
+ 104: 0.0,
+ 105: 0.0,
+ 106: 0.0,
+ 107: 0.0,
+ 108: 0.0,
+ 109: 0.0,
+ 110: 0.0,
+ 111: 0.0,
+ 112: 0.0,
+ 113: 0.0,
+ 114: 0.0,
+ 115: 0.0,
+ 116: 0.0,
+ 117: 0.0,
+ 118: 0.0,
+ 119: 0.0,
+ 120: 0.0,
+ 121: 0.0,
+ 122: 0.0,
+ 123: 0.0,
+ 124: 0.0,
+ 125: 0.0,
+ 126: 67744.0,
+ 127: 22.0,
+ 128: 264.0,
+ 129: 0.0,
+ 260: 197.0,
+ 268: 0.0,
+ 265: 0.0,
+ 269: 0.0,
+ 261: 0.0,
+ 266: 1198.0,
+ 267: 0.0,
+ 262: 2629.0,
+ 258: 775.0,
+ 257: 0.0,
+ 263: 0.0,
+ 259: 0.0,
+ 264: 163.0,
+ 250: 10326.0,
+ 251: 0.0,
+ 252: 1228.0,
+ 253: 0.0,
+ 254: 2769.0,
+ 255: 0.0}
+
+ # smoke test for the repr
+ s = Series(ser)
+ result = s.value_counts()
+ str(result)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_iloc.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_iloc.py
new file mode 100644
index 00000000000..a867387db4b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_iloc.py
@@ -0,0 +1,677 @@
+""" test positional based indexing with iloc """
+
+from warnings import catch_warnings, filterwarnings, simplefilter
+
+import numpy as np
+import pytest
+
+from pandas.compat import lmap, lrange
+
+import pandas as pd
+from pandas import DataFrame, Series, concat, date_range, isna
+from pandas.api.types import is_scalar
+from pandas.tests.indexing.common import Base
+from pandas.util import testing as tm
+
+
+class TestiLoc(Base):
+
+ def test_iloc_exceeds_bounds(self):
+
+ # GH6296
+        # iloc should allow slice indexers that exceed the bounds;
+        # list-like and scalar indexers that do so must raise IndexError
+ df = DataFrame(np.random.random_sample((20, 5)), columns=list('ABCDE'))
+
+        # lists of positions should raise IndexError!
+ msg = 'positional indexers are out-of-bounds'
+ with pytest.raises(IndexError, match=msg):
+ df.iloc[:, [0, 1, 2, 3, 4, 5]]
+ pytest.raises(IndexError, lambda: df.iloc[[1, 30]])
+ pytest.raises(IndexError, lambda: df.iloc[[1, -30]])
+ pytest.raises(IndexError, lambda: df.iloc[[100]])
+
+ s = df['A']
+ pytest.raises(IndexError, lambda: s.iloc[[100]])
+ pytest.raises(IndexError, lambda: s.iloc[[-100]])
+
+ # still raise on a single indexer
+ msg = 'single positional indexer is out-of-bounds'
+ with pytest.raises(IndexError, match=msg):
+ df.iloc[30]
+ pytest.raises(IndexError, lambda: df.iloc[-30])
+
+ # GH10779
+ # single positive/negative indexer exceeding Series bounds should raise
+ # an IndexError
+ with pytest.raises(IndexError, match=msg):
+ s.iloc[30]
+ pytest.raises(IndexError, lambda: s.iloc[-30])
+
+ # slices are ok
+ result = df.iloc[:, 4:10] # 0 < start < len < stop
+ expected = df.iloc[:, 4:]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.iloc[:, -4:-10] # stop < 0 < start < len
+ expected = df.iloc[:, :0]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.iloc[:, 10:4:-1] # 0 < stop < len < start (down)
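+        # t_1 (02:30+02:00) is the last tick before the clocks-back
+        # transition and t_2 (02:00+01:00) the first tick at/after it,
+        # so the label slice between them selects idx[2:4]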
+ expected = df.iloc[:, :4:-1]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.iloc[:, 4:-10:-1] # stop < 0 < start < len (down)
+ expected = df.iloc[:, 4::-1]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.iloc[:, -10:4] # start < 0 < stop < len
+ expected = df.iloc[:, :4]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.iloc[:, 10:4] # 0 < stop < len < start
+ expected = df.iloc[:, :0]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.iloc[:, -10:-11:-1] # stop < start < 0 < len (down)
+ expected = df.iloc[:, :0]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.iloc[:, 10:11] # 0 < len < start < stop
+ expected = df.iloc[:, :0]
+ tm.assert_frame_equal(result, expected)
+
+ # slice bounds exceeding is ok
+ result = s.iloc[18:30]
+ expected = s.iloc[18:]
+ tm.assert_series_equal(result, expected)
+
+ result = s.iloc[30:]
+ expected = s.iloc[:0]
+ tm.assert_series_equal(result, expected)
+
+ result = s.iloc[30::-1]
+ expected = s.iloc[::-1]
+ tm.assert_series_equal(result, expected)
+
+ # doc example
+ def check(result, expected):
+ str(result)
+ result.dtypes
+ tm.assert_frame_equal(result, expected)
+
+ dfl = DataFrame(np.random.randn(5, 2), columns=list('AB'))
+ check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index))
+ check(dfl.iloc[:, 1:3], dfl.iloc[:, [1]])
+ check(dfl.iloc[4:6], dfl.iloc[[4]])
+
+ pytest.raises(IndexError, lambda: dfl.iloc[[4, 5, 6]])
+ pytest.raises(IndexError, lambda: dfl.iloc[:, 4])
+
+ def test_iloc_getitem_int(self):
+
+ # integer
+ self.check_result('integer', 'iloc', 2, 'ix',
+ {0: 4, 1: 6, 2: 8}, typs=['ints', 'uints'])
+ self.check_result('integer', 'iloc', 2, 'indexer', 2,
+ typs=['labels', 'mixed', 'ts', 'floats', 'empty'],
+ fails=IndexError)
+
+ def test_iloc_getitem_neg_int(self):
+
+ # neg integer
+ self.check_result('neg int', 'iloc', -1, 'ix',
+ {0: 6, 1: 9, 2: 12}, typs=['ints', 'uints'])
+ self.check_result('neg int', 'iloc', -1, 'indexer', -1,
+ typs=['labels', 'mixed', 'ts', 'floats', 'empty'],
+ fails=IndexError)
+
+ @pytest.mark.parametrize('dims', [1, 2])
+ def test_iloc_getitem_invalid_scalar(self, dims):
+ # GH 21982
+
+ if dims == 1:
+ s = Series(np.arange(10))
+ else:
+ s = DataFrame(np.arange(100).reshape(10, 10))
+
+ with pytest.raises(TypeError, match='Cannot index by location index'):
+ s.iloc['a']
+
+ def test_iloc_array_not_mutating_negative_indices(self):
+
+ # GH 21867
+ array_with_neg_numbers = np.array([1, 2, -1])
+ array_copy = array_with_neg_numbers.copy()
+ df = pd.DataFrame({
+ 'A': [100, 101, 102],
+ 'B': [103, 104, 105],
+ 'C': [106, 107, 108]},
+ index=[1, 2, 3])
+ df.iloc[array_with_neg_numbers]
+ tm.assert_numpy_array_equal(array_with_neg_numbers, array_copy)
+ df.iloc[:, array_with_neg_numbers]
+ tm.assert_numpy_array_equal(array_with_neg_numbers, array_copy)
+
+ def test_iloc_getitem_list_int(self):
+
+ # list of ints
+ self.check_result('list int', 'iloc', [0, 1, 2], 'ix',
+ {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]},
+ typs=['ints', 'uints'])
+ self.check_result('list int', 'iloc', [2], 'ix',
+ {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints'])
+ self.check_result('list int', 'iloc', [0, 1, 2], 'indexer', [0, 1, 2],
+ typs=['labels', 'mixed', 'ts', 'floats', 'empty'],
+ fails=IndexError)
+
+ # array of ints (GH5006), make sure that a single indexer is returning
+ # the correct type
+ self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'ix',
+ {0: [0, 2, 4],
+ 1: [0, 3, 6],
+ 2: [0, 4, 8]}, typs=['ints', 'uints'])
+ self.check_result('array int', 'iloc', np.array([2]), 'ix',
+ {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints'])
+ self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'indexer',
+ [0, 1, 2],
+ typs=['labels', 'mixed', 'ts', 'floats', 'empty'],
+ fails=IndexError)
+
+ def test_iloc_getitem_neg_int_can_reach_first_index(self):
+ # GH10547 and GH10779
+ # negative integers should be able to reach index 0
+ df = DataFrame({'A': [2, 3, 5], 'B': [7, 11, 13]})
+ s = df['A']
+
+ expected = df.iloc[0]
+ result = df.iloc[-3]
+ tm.assert_series_equal(result, expected)
+
+ expected = df.iloc[[0]]
+ result = df.iloc[[-3]]
+ tm.assert_frame_equal(result, expected)
+
+ expected = s.iloc[0]
+ result = s.iloc[-3]
+ assert result == expected
+
+ expected = s.iloc[[0]]
+ result = s.iloc[[-3]]
+ tm.assert_series_equal(result, expected)
+
+ # check the length 1 Series case highlighted in GH10547
+ expected = Series(['a'], index=['A'])
+ result = expected.iloc[[-1]]
+ tm.assert_series_equal(result, expected)
+
+ def test_iloc_getitem_dups(self):
+
+ # no dups in panel (bug?)
+ self.check_result('list int (dups)', 'iloc', [0, 1, 1, 3], 'ix',
+ {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]},
+ objs=['series', 'frame'], typs=['ints', 'uints'])
+
+ # GH 6766
+ df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}])
+ df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}])
+ df = concat([df1, df2], axis=1)
+
+ # cross-sectional indexing
+ result = df.iloc[0, 0]
+ assert isna(result)
+
+ result = df.iloc[0, :]
+ expected = Series([np.nan, 1, 3, 3], index=['A', 'B', 'A', 'B'],
+ name=0)
+ tm.assert_series_equal(result, expected)
+
+ def test_iloc_getitem_array(self):
+
+ # array like
+ s = Series(index=lrange(1, 4))
+ self.check_result('array like', 'iloc', s.index, 'ix',
+ {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]},
+ typs=['ints', 'uints'])
+
+ def test_iloc_getitem_bool(self):
+
+ # boolean indexers
+ b = [True, False, True, False, ]
+ self.check_result('bool', 'iloc', b, 'ix', b, typs=['ints', 'uints'])
+ self.check_result('bool', 'iloc', b, 'ix', b,
+ typs=['labels', 'mixed', 'ts', 'floats', 'empty'],
+ fails=IndexError)
+
+ def test_iloc_getitem_slice(self):
+
+ # slices
+ self.check_result('slice', 'iloc', slice(1, 3), 'ix',
+ {0: [2, 4], 1: [3, 6], 2: [4, 8]},
+ typs=['ints', 'uints'])
+ self.check_result('slice', 'iloc', slice(1, 3), 'indexer',
+ slice(1, 3),
+ typs=['labels', 'mixed', 'ts', 'floats', 'empty'],
+ fails=IndexError)
+
+ def test_iloc_getitem_slice_dups(self):
+
+ df1 = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B'])
+ df2 = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
+ columns=['A', 'C'])
+
+ # axis=1
+ df = concat([df1, df2], axis=1)
+ tm.assert_frame_equal(df.iloc[:, :4], df1)
+ tm.assert_frame_equal(df.iloc[:, 4:], df2)
+
+ df = concat([df2, df1], axis=1)
+ tm.assert_frame_equal(df.iloc[:, :2], df2)
+ tm.assert_frame_equal(df.iloc[:, 2:], df1)
+
+ exp = concat([df2, df1.iloc[:, [0]]], axis=1)
+ tm.assert_frame_equal(df.iloc[:, 0:3], exp)
+
+ # axis=0
+ df = concat([df, df], axis=0)
+ tm.assert_frame_equal(df.iloc[0:10, :2], df2)
+ tm.assert_frame_equal(df.iloc[0:10, 2:], df1)
+ tm.assert_frame_equal(df.iloc[10:, :2], df2)
+ tm.assert_frame_equal(df.iloc[10:, 2:], df1)
+
+ def test_iloc_setitem(self):
+ df = self.frame_ints
+
+ df.iloc[1, 1] = 1
+ result = df.iloc[1, 1]
+ assert result == 1
+
+ df.iloc[:, 2:3] = 0
+ expected = df.iloc[:, 2:3]
+ result = df.iloc[:, 2:3]
+ tm.assert_frame_equal(result, expected)
+
+ # GH5771
+ s = Series(0, index=[4, 5, 6])
+ s.iloc[1:2] += 1
+ expected = Series([0, 1, 0], index=[4, 5, 6])
+ tm.assert_series_equal(s, expected)
+
+ def test_iloc_setitem_list(self):
+
+ # setitem with an iloc list
+ df = DataFrame(np.arange(9).reshape((3, 3)), index=["A", "B", "C"],
+ columns=["A", "B", "C"])
+ df.iloc[[0, 1], [1, 2]]
+ df.iloc[[0, 1], [1, 2]] += 100
+
+ expected = DataFrame(
+ np.array([0, 101, 102, 3, 104, 105, 6, 7, 8]).reshape((3, 3)),
+ index=["A", "B", "C"], columns=["A", "B", "C"])
+ tm.assert_frame_equal(df, expected)
+
+ def test_iloc_setitem_pandas_object(self):
+ # GH 17193
+ s_orig = Series([0, 1, 2, 3])
+ expected = Series([0, -1, -2, 3])
+
+ s = s_orig.copy()
+ s.iloc[Series([1, 2])] = [-1, -2]
+ tm.assert_series_equal(s, expected)
+
+ s = s_orig.copy()
+ s.iloc[pd.Index([1, 2])] = [-1, -2]
+ tm.assert_series_equal(s, expected)
+
+ def test_iloc_setitem_dups(self):
+
+ # GH 6766
+ # iloc with a mask aligning from another iloc
+ df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}])
+ df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}])
+ df = concat([df1, df2], axis=1)
+
+ expected = df.fillna(3)
+ expected['A'] = expected['A'].astype('float64')
+ inds = np.isnan(df.iloc[:, 0])
+ mask = inds[inds].index
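+ # mask holds the row labels where column 0 is NaN; iloc accepts
+ # them here only because the labels coincide with the positions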
+ df.iloc[mask, 0] = df.iloc[mask, 2]
+ tm.assert_frame_equal(df, expected)
+
+ # del a dup column across blocks
+ expected = DataFrame({0: [1, 2], 1: [3, 4]})
+ expected.columns = ['B', 'B']
+ del df['A']
+ tm.assert_frame_equal(df, expected)
+
+ # assign back to self
+ df.iloc[[0, 1], [0, 1]] = df.iloc[[0, 1], [0, 1]]
+ tm.assert_frame_equal(df, expected)
+
+ # reversed x 2
+ df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(
+ drop=True)
+ df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(
+ drop=True)
+ tm.assert_frame_equal(df, expected)
+
+ def test_iloc_getitem_frame(self):
+ df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2),
+ columns=lrange(0, 8, 2))
+
+ result = df.iloc[2]
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\n.ix", DeprecationWarning)
+ exp = df.ix[4]
+ tm.assert_series_equal(result, exp)
+
+ result = df.iloc[2, 2]
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\n.ix", DeprecationWarning)
+ exp = df.ix[4, 4]
+ assert result == exp
+
+ # slice
+ result = df.iloc[4:8]
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\n.ix", DeprecationWarning)
+ expected = df.ix[8:14]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.iloc[:, 2:3]
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\n.ix", DeprecationWarning)
+ expected = df.ix[:, 4:5]
+ tm.assert_frame_equal(result, expected)
+
+ # list of integers
+ result = df.iloc[[0, 1, 3]]
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\n.ix", DeprecationWarning)
+ expected = df.ix[[0, 2, 6]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.iloc[[0, 1, 3], [0, 1]]
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\n.ix", DeprecationWarning)
+ expected = df.ix[[0, 2, 6], [0, 2]]
+ tm.assert_frame_equal(result, expected)
+
+ # neg indices
+ result = df.iloc[[-1, 1, 3], [-1, 1]]
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\n.ix", DeprecationWarning)
+ expected = df.ix[[18, 2, 6], [6, 2]]
+ tm.assert_frame_equal(result, expected)
+
+ # dups indices
+ result = df.iloc[[-1, -1, 1, 3], [-1, 1]]
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\n.ix", DeprecationWarning)
+ expected = df.ix[[18, 18, 2, 6], [6, 2]]
+ tm.assert_frame_equal(result, expected)
+
+ # with index-like
+ s = Series(index=lrange(1, 5))
+ result = df.iloc[s.index]
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\n.ix", DeprecationWarning)
+ expected = df.ix[[2, 4, 6, 8]]
+ tm.assert_frame_equal(result, expected)
+
+ def test_iloc_getitem_labelled_frame(self):
+ # try with labelled frame
+ df = DataFrame(np.random.randn(10, 4),
+ index=list('abcdefghij'), columns=list('ABCD'))
+
+ result = df.iloc[1, 1]
+ exp = df.loc['b', 'B']
+ assert result == exp
+
+ result = df.iloc[:, 2:3]
+ expected = df.loc[:, ['C']]
+ tm.assert_frame_equal(result, expected)
+
+ # negative indexing
+ result = df.iloc[-1, -1]
+ exp = df.loc['j', 'D']
+ assert result == exp
+
+ # out-of-bounds exception
+ pytest.raises(IndexError, df.iloc.__getitem__, tuple([10, 5]))
+
+ # trying to use a label
+ pytest.raises(ValueError, df.iloc.__getitem__, tuple(['j', 'D']))
+
+ def test_iloc_getitem_doc_issue(self):
+
+ # multi axis slicing issue with single block
+ # surfaced in GH 6059
+
+ arr = np.random.randn(6, 4)
+ index = date_range('20130101', periods=6)
+ columns = list('ABCD')
+ df = DataFrame(arr, index=index, columns=columns)
+
+ # defines ref_locs
+ df.describe()
+
+ result = df.iloc[3:5, 0:2]
+ str(result)
+ result.dtypes
+
+ expected = DataFrame(arr[3:5, 0:2], index=index[3:5],
+ columns=columns[0:2])
+ tm.assert_frame_equal(result, expected)
+
+ # for dups
+ df.columns = list('aaaa')
+ result = df.iloc[3:5, 0:2]
+ str(result)
+ result.dtypes
+
+ expected = DataFrame(arr[3:5, 0:2], index=index[3:5],
+ columns=list('aa'))
+ tm.assert_frame_equal(result, expected)
+
+ # related
+ arr = np.random.randn(6, 4)
+ index = list(range(0, 12, 2))
+ columns = list(range(0, 8, 2))
+ df = DataFrame(arr, index=index, columns=columns)
+
+ df._data.blocks[0].mgr_locs
+ result = df.iloc[1:5, 2:4]
+ str(result)
+ result.dtypes
+ expected = DataFrame(arr[1:5, 2:4], index=index[1:5],
+ columns=columns[2:4])
+ tm.assert_frame_equal(result, expected)
+
+ def test_iloc_setitem_series(self):
+ df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'),
+ columns=list('ABCD'))
+
+ df.iloc[1, 1] = 1
+ result = df.iloc[1, 1]
+ assert result == 1
+
+ df.iloc[:, 2:3] = 0
+ expected = df.iloc[:, 2:3]
+ result = df.iloc[:, 2:3]
+ tm.assert_frame_equal(result, expected)
+
+ s = Series(np.random.randn(10), index=lrange(0, 20, 2))
+
+ s.iloc[1] = 1
+ result = s.iloc[1]
+ assert result == 1
+
+ s.iloc[:4] = 0
+ expected = s.iloc[:4]
+ result = s.iloc[:4]
+ tm.assert_series_equal(result, expected)
+
+ s = Series([-1] * 6)
+ s.iloc[0::2] = [0, 2, 4]
+ s.iloc[1::2] = [1, 3, 5]
+ result = s
+ expected = Series([0, 1, 2, 3, 4, 5])
+ tm.assert_series_equal(result, expected)
+
+ def test_iloc_setitem_list_of_lists(self):
+
+ # GH 7551
+ # a list-of-lists was set incorrectly in mixed vs. single-dtype frames
+ df = DataFrame(dict(A=np.arange(5, dtype='int64'),
+ B=np.arange(5, 10, dtype='int64')))
+ df.iloc[2:4] = [[10, 11], [12, 13]]
+ expected = DataFrame(dict(A=[0, 1, 10, 12, 4], B=[5, 6, 11, 13, 9]))
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame(
+ dict(A=list('abcde'), B=np.arange(5, 10, dtype='int64')))
+ df.iloc[2:4] = [['x', 11], ['y', 13]]
+ expected = DataFrame(dict(A=['a', 'b', 'x', 'y', 'e'],
+ B=[5, 6, 11, 13, 9]))
+ tm.assert_frame_equal(df, expected)
+
+ @pytest.mark.parametrize(
+ 'indexer', [[0], slice(None, 1, None), np.array([0])])
+ @pytest.mark.parametrize(
+ 'value', [['Z'], np.array(['Z'])])
+ def test_iloc_setitem_with_scalar_index(self, indexer, value):
+ # GH #19474
+ # assigning like "df.iloc[0, [0]] = ['Z']" should be evaluated
+ # element-wise, not via "setter('A', ['Z'])".
+
+ df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
+ df.iloc[0, indexer] = value
+ result = df.iloc[0, 0]
+
+ assert is_scalar(result) and result == 'Z'
+
+ def test_iloc_mask(self):
+
+ # GH 3631, iloc with a mask (of a series) should raise
+ df = DataFrame(lrange(5), list('ABCDE'), columns=['a'])
+ mask = (df.a % 2 == 0)
+ pytest.raises(ValueError, df.iloc.__getitem__, tuple([mask]))
+ mask.index = lrange(len(mask))
+ pytest.raises(NotImplementedError, df.iloc.__getitem__,
+ tuple([mask]))
+
+ # ndarray ok
+ result = df.iloc[np.array([True] * len(mask), dtype=bool)]
+ tm.assert_frame_equal(result, df)
+
+ # the possible mask/accessor combinations; each row's num is a
+ # distinct power of two, so bin() of a selection's sum identifies
+ # exactly which rows a given mask picked out
+ locs = np.arange(4)
+ nums = 2 ** locs
+ reps = lmap(bin, nums)
+ df = DataFrame({'locs': locs, 'nums': nums}, reps)
+
+ expected = {
+ (None, ''): '0b1100',
+ (None, '.loc'): '0b1100',
+ (None, '.iloc'): '0b1100',
+ ('index', ''): '0b11',
+ ('index', '.loc'): '0b11',
+ ('index', '.iloc'): ('iLocation based boolean indexing '
+ 'cannot use an indexable as a mask'),
+ ('locs', ''): 'Unalignable boolean Series provided as indexer '
+ '(index of the boolean Series and of the indexed '
+ 'object do not match',
+ ('locs', '.loc'): 'Unalignable boolean Series provided as indexer '
+ '(index of the boolean Series and of the '
+ 'indexed object do not match',
+ ('locs', '.iloc'): ('iLocation based boolean indexing on an '
+ 'integer type is not available'),
+ }
+
+ # UserWarnings from reindex of a boolean mask
+ with catch_warnings(record=True):
+ simplefilter("ignore", UserWarning)
+ result = dict()
+ for idx in [None, 'index', 'locs']:
+ mask = (df.nums > 2).values
+ if idx:
+ mask = Series(mask, list(reversed(getattr(df, idx))))
+ for method in ['', '.loc', '.iloc']:
+ try:
+ if method:
+ accessor = getattr(df, method[1:])
+ else:
+ accessor = df
+ ans = str(bin(accessor[mask]['nums'].sum()))
+ except Exception as e:
+ ans = str(e)
+
+ key = tuple([idx, method])
+ r = expected.get(key)
+ if r != ans:
+ raise AssertionError(
+ "[%s] expected [%s], received [%s]"
+ % (key, r, ans))
+
+ def test_iloc_non_unique_indexing(self):
+
+ # GH 4017, non-unique indexing (on the axis)
+ df = DataFrame({'A': [0.1] * 3000, 'B': [1] * 3000})
+ idx = np.array(lrange(30)) * 99
+ expected = df.iloc[idx]
+
+ df3 = concat([df, 2 * df, 3 * df])
+ result = df3.iloc[idx]
+
+ tm.assert_frame_equal(result, expected)
+
+ df2 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000})
+ df2 = concat([df2, 2 * df2, 3 * df2])
+
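+ # df2's index is now 0..999 repeated three times, so labels in idx
+ # above 999 are missing (hence the FutureWarning below) and each
+ # present label matches one row from each of the three copies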
+ sidx = df2.index.to_series()
+ expected = df2.iloc[idx[idx <= sidx.max()]]
+
+ new_list = []
+ for r, s in expected.iterrows():
+ new_list.append(s)
+ new_list.append(s * 2)
+ new_list.append(s * 3)
+
+ expected = DataFrame(new_list)
+ expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])],
+ sort=True)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = df2.loc[idx]
+ tm.assert_frame_equal(result, expected, check_index_type=False)
+
+ def test_iloc_empty_list_indexer_is_ok(self):
+ from pandas.util.testing import makeCustomDataframe as mkdf
+ df = mkdf(5, 2)
+ # vertical empty
+ tm.assert_frame_equal(df.iloc[:, []], df.iloc[:, :0],
+ check_index_type=True, check_column_type=True)
+ # horizontal empty
+ tm.assert_frame_equal(df.iloc[[], :], df.iloc[:0, :],
+ check_index_type=True, check_column_type=True)
+ # horizontal empty
+ tm.assert_frame_equal(df.iloc[[]], df.iloc[:0, :],
+ check_index_type=True,
+ check_column_type=True)
+
+ def test_identity_slice_returns_new_object(self):
+ # GH13873
+ original_df = DataFrame({'a': [1, 2, 3]})
+ sliced_df = original_df.iloc[:]
+ assert sliced_df is not original_df
+
+ # should be a shallow copy
+ original_df['a'] = [4, 4, 4]
+ assert (sliced_df['a'] == 4).all()
+
+ original_series = Series([1, 2, 3, 4, 5, 6])
+ sliced_series = original_series.iloc[:]
+ assert sliced_series is not original_series
+
+ # should also be a shallow copy
+ original_series[:3] = [7, 8, 9]
+ assert all(sliced_series[:3] == [7, 8, 9])
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_indexing.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_indexing.py
new file mode 100644
index 00000000000..03f1975c50d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_indexing.py
@@ -0,0 +1,1015 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=W0612,E1101
+
+""" test fancy indexing & misc """
+
+from datetime import datetime
+from warnings import catch_warnings, simplefilter
+import weakref
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY2, lrange, range
+
+from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
+
+import pandas as pd
+from pandas import DataFrame, Index, NaT, Series
+from pandas.core.indexing import (
+ _maybe_numeric_slice, _non_reducing_slice, validate_indices)
+from pandas.tests.indexing.common import Base, _mklbl
+import pandas.util.testing as tm
+
+# ------------------------------------------------------------------------
+# Indexing test cases
+
+
+class TestFancy(Base):
+ """ pure get/set item & fancy indexing """
+
+ def test_setitem_ndarray_1d(self):
+ # GH5508
+
+ # len of indexer vs length of the 1d ndarray
+ df = DataFrame(index=Index(lrange(1, 11)))
+ df['foo'] = np.zeros(10, dtype=np.float64)
+ df['bar'] = np.zeros(10, dtype=np.complex)
+
+ # invalid
+ with pytest.raises(ValueError):
+ df.loc[df.index[2:5], 'bar'] = np.array([2.33j, 1.23 + 0.1j,
+ 2.2, 1.0])
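+ # df.index[2:5] selects only three rows while the rhs has four
+ # values, hence the ValueError; the 2:6 slice below matches lengths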
+
+ # valid
+ df.loc[df.index[2:6], 'bar'] = np.array([2.33j, 1.23 + 0.1j,
+ 2.2, 1.0])
+
+ result = df.loc[df.index[2:6], 'bar']
+ expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6],
+ name='bar')
+ tm.assert_series_equal(result, expected)
+
+ # dtype getting changed?
+ df = DataFrame(index=Index(lrange(1, 11)))
+ df['foo'] = np.zeros(10, dtype=np.float64)
+ df['bar'] = np.zeros(10, dtype=np.complex)
+
+ with pytest.raises(ValueError):
+ df[2:5] = np.arange(1, 4) * 1j
+
+ def test_inf_upcast(self):
+ # GH 16957
+ # We should be able to use np.inf as a key
+ # np.inf should cause an index to convert to float
+
+ # Test with np.inf in rows
+ df = DataFrame(columns=[0])
+ df.loc[1] = 1
+ df.loc[2] = 2
+ df.loc[np.inf] = 3
+
+ # make sure we can look up the value
+ assert df.loc[np.inf, 0] == 3
+
+ result = df.index
+ expected = pd.Float64Index([1, 2, np.inf])
+ tm.assert_index_equal(result, expected)
+
+ # Test with np.inf in columns
+ df = DataFrame()
+ df.loc[0, 0] = 1
+ df.loc[1, 1] = 2
+ df.loc[0, np.inf] = 3
+
+ result = df.columns
+ expected = pd.Float64Index([0, 1, np.inf])
+ tm.assert_index_equal(result, expected)
+
+ def test_setitem_dtype_upcast(self):
+
+ # GH3216
+ df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
+ df['c'] = np.nan
+ assert df['c'].dtype == np.float64
+
+ df.loc[0, 'c'] = 'foo'
+ expected = DataFrame([{"a": 1, "c": 'foo'},
+ {"a": 3, "b": 2, "c": np.nan}])
+ tm.assert_frame_equal(df, expected)
+
+ # GH10280
+ df = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
+ index=list('ab'),
+ columns=['foo', 'bar', 'baz'])
+
+ for val in [3.14, 'wxyz']:
+ left = df.copy()
+ left.loc['a', 'bar'] = val
+ right = DataFrame([[0, val, 2], [3, 4, 5]], index=list('ab'),
+ columns=['foo', 'bar', 'baz'])
+
+ tm.assert_frame_equal(left, right)
+ assert is_integer_dtype(left['foo'])
+ assert is_integer_dtype(left['baz'])
+
+ left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0,
+ index=list('ab'),
+ columns=['foo', 'bar', 'baz'])
+ left.loc['a', 'bar'] = 'wxyz'
+
+ right = DataFrame([[0, 'wxyz', .2], [.3, .4, .5]], index=list('ab'),
+ columns=['foo', 'bar', 'baz'])
+
+ tm.assert_frame_equal(left, right)
+ assert is_float_dtype(left['foo'])
+ assert is_float_dtype(left['baz'])
+
+ def test_dups_fancy_indexing(self):
+
+ # GH 3455
+ from pandas.util.testing import makeCustomDataframe as mkdf
+ df = mkdf(10, 3)
+ df.columns = ['a', 'a', 'b']
+ result = df[['b', 'a']].columns
+ expected = Index(['b', 'a', 'a'])
+ tm.assert_index_equal(result, expected)
+
+ # across dtypes
+ df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
+ columns=list('aaaaaaa'))
+ df.head()
+ str(df)
+ result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
+ result.columns = list('aaaaaaa')
+
+ # TODO(wesm): unused?
+ df_v = df.iloc[:, 4] # noqa
+ res_v = result.iloc[:, 4] # noqa
+
+ tm.assert_frame_equal(df, result)
+
+ # GH 3561, dups not in selected order
+ df = DataFrame(
+ {'test': [5, 7, 9, 11],
+ 'test1': [4., 5, 6, 7],
+ 'other': list('abcd')}, index=['A', 'A', 'B', 'C'])
+ rows = ['C', 'B']
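+ # .loc returns rows in selector order, not index order, so 'C'
+ # comes back before 'B'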
+ expected = DataFrame(
+ {'test': [11, 9],
+ 'test1': [7., 6],
+ 'other': ['d', 'c']}, index=rows)
+ result = df.loc[rows]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[Index(rows)]
+ tm.assert_frame_equal(result, expected)
+
+ rows = ['C', 'B', 'E']
+ expected = DataFrame(
+ {'test': [11, 9, np.nan],
+ 'test1': [7., 6, np.nan],
+ 'other': ['d', 'c', np.nan]}, index=rows)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = df.loc[rows]
+ tm.assert_frame_equal(result, expected)
+
+ # see GH5553, make sure we use the right indexer
+ rows = ['F', 'G', 'H', 'C', 'B', 'E']
+ expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
+ 'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
+ 'other': [np.nan, np.nan, np.nan,
+ 'd', 'c', np.nan]},
+ index=rows)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = df.loc[rows]
+ tm.assert_frame_equal(result, expected)
+
+ # List containing only missing label
+ dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
+ with pytest.raises(KeyError):
+ dfnu.loc[['E']]
+
+ # ToDo: check_index_type can be True after GH 11497
+
+ # GH 4619; duplicate indexer with missing label
+ df = DataFrame({"A": [0, 1, 2]})
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = df.loc[[0, 8, 0]]
+ expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
+ tm.assert_frame_equal(result, expected, check_index_type=False)
+
+ df = DataFrame({"A": list('abc')})
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = df.loc[[0, 8, 0]]
+ expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
+ tm.assert_frame_equal(result, expected, check_index_type=False)
+
+ # non unique with non unique selector
+ df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
+ expected = DataFrame(
+ {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E'])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = df.loc[['A', 'A', 'E']]
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.skipif(PY2,
+ reason="GH-20770. Py2 unreliable warnings catching.")
+ def test_dups_fancy_indexing2(self):
+ # GH 5835
+ # dups on index and missing values
+ df = DataFrame(
+ np.random.randn(5, 5), columns=['A', 'B', 'B', 'B', 'A'])
+
+ expected = pd.concat(
+ [df.loc[:, ['A', 'B']], DataFrame(np.nan, columns=['C'],
+ index=df.index)], axis=1)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = df.loc[:, ['A', 'B', 'C']]
+ tm.assert_frame_equal(result, expected)
+
+ # GH 6504, multi-axis indexing
+ df = DataFrame(np.random.randn(9, 2),
+ index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b'])
+
+ expected = df.iloc[0:6]
+ result = df.loc[[1, 2]]
+ tm.assert_frame_equal(result, expected)
+
+ expected = df
+ result = df.loc[:, ['a', 'b']]
+ tm.assert_frame_equal(result, expected)
+
+ expected = df.iloc[0:6, :]
+ result = df.loc[[1, 2], ['a', 'b']]
+ tm.assert_frame_equal(result, expected)
+
+ def test_indexing_mixed_frame_bug(self):
+
+ # GH3492
+ df = DataFrame({'a': {1: 'aaa', 2: 'bbb', 3: 'ccc'},
+ 'b': {1: 111, 2: 222, 3: 333}})
+
+ # this works, new column is created correctly
+ df['test'] = df['a'].apply(lambda x: '_' if x == 'aaa' else x)
+
+ # this previously did not work, i.e. column 'test' was not changed
+ idx = df['test'] == '_'
+ temp = df.loc[idx, 'a'].apply(lambda x: '-----' if x == 'aaa' else x)
+ df.loc[idx, 'test'] = temp
+ assert df.iloc[0, 2] == '-----'
+
+ # from the original report: viewing df showed element [0, 2] as
+ # '_', while df.ix[idx, 'test'] returned '-----' and df.iloc[0, 2]
+ # returned '_'.
+
+ def test_multitype_list_index_access(self):
+ # GH 10610
+ df = DataFrame(np.random.random((10, 5)),
+ columns=["a"] + [20, 21, 22, 23])
+
+ with pytest.raises(KeyError):
+ df[[22, 26, -8]]
+ assert df[21].shape[0] == df.shape[0]
+
+ def test_set_index_nan(self):
+
+ # GH 3586
+ df = DataFrame({'PRuid': {17: 'nonQC',
+ 18: 'nonQC',
+ 19: 'nonQC',
+ 20: '10',
+ 21: '11',
+ 22: '12',
+ 23: '13',
+ 24: '24',
+ 25: '35',
+ 26: '46',
+ 27: '47',
+ 28: '48',
+ 29: '59',
+ 30: '10'},
+ 'QC': {17: 0.0,
+ 18: 0.0,
+ 19: 0.0,
+ 20: np.nan,
+ 21: np.nan,
+ 22: np.nan,
+ 23: np.nan,
+ 24: 1.0,
+ 25: np.nan,
+ 26: np.nan,
+ 27: np.nan,
+ 28: np.nan,
+ 29: np.nan,
+ 30: np.nan},
+ 'data': {17: 7.9544899999999998,
+ 18: 8.0142609999999994,
+ 19: 7.8591520000000008,
+ 20: 0.86140349999999999,
+ 21: 0.87853110000000001,
+ 22: 0.8427041999999999,
+ 23: 0.78587700000000005,
+ 24: 0.73062459999999996,
+ 25: 0.81668560000000001,
+ 26: 0.81927080000000008,
+ 27: 0.80705009999999999,
+ 28: 0.81440240000000008,
+ 29: 0.80140849999999997,
+ 30: 0.81307740000000006},
+ 'year': {17: 2006,
+ 18: 2007,
+ 19: 2008,
+ 20: 1985,
+ 21: 1985,
+ 22: 1985,
+ 23: 1985,
+ 24: 1985,
+ 25: 1985,
+ 26: 1985,
+ 27: 1985,
+ 28: 1985,
+ 29: 1985,
+ 30: 1986}}).reset_index()
+
+ result = df.set_index(['year', 'PRuid', 'QC']).reset_index().reindex(
+ columns=df.columns)
+ tm.assert_frame_equal(result, df)
+
+ def test_multi_assign(self):
+
+ # GH 3626, an assignment of a sub-df to a df
+ df = DataFrame({'FC': ['a', 'b', 'a', 'b', 'a', 'b'],
+ 'PF': [0, 0, 0, 0, 1, 1],
+ 'col1': lrange(6),
+ 'col2': lrange(6, 12)})
+ df.iloc[1, 0] = np.nan
+ df2 = df.copy()
+
+ mask = ~df2.FC.isna()
+ cols = ['col1', 'col2']
+
+ dft = df2 * 2
+ dft.iloc[3, 3] = np.nan
+
+ expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
+ 'PF': [0, 0, 0, 0, 1, 1],
+ 'col1': Series([0, 1, 4, 6, 8, 10]),
+ 'col2': [12, 7, 16, np.nan, 20, 22]})
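+ # row 1 is masked out (its FC is NaN), so col1/col2 keep their
+ # original 1 and 7; the other rows take the doubled values from
+ # dft, including the NaN planted at dft.iloc[3, 3]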
+
+ # frame on rhs
+ df2.loc[mask, cols] = dft.loc[mask, cols]
+ tm.assert_frame_equal(df2, expected)
+
+ df2.loc[mask, cols] = dft.loc[mask, cols]
+ tm.assert_frame_equal(df2, expected)
+
+ # with an ndarray on rhs
+ # coerces to float64 because values has float64 dtype
+ # GH 14001
+ expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
+ 'PF': [0, 0, 0, 0, 1, 1],
+ 'col1': [0., 1., 4., 6., 8., 10.],
+ 'col2': [12, 7, 16, np.nan, 20, 22]})
+ df2 = df.copy()
+ df2.loc[mask, cols] = dft.loc[mask, cols].values
+ tm.assert_frame_equal(df2, expected)
+ df2.loc[mask, cols] = dft.loc[mask, cols].values
+ tm.assert_frame_equal(df2, expected)
+
+ # broadcasting on the rhs is required
+ df = DataFrame(dict(A=[1, 2, 0, 0, 0], B=[0, 0, 0, 10, 11], C=[
+ 0, 0, 0, 10, 11], D=[3, 4, 5, 6, 7]))
+
+ expected = df.copy()
+ mask = expected['A'] == 0
+ for col in ['A', 'B']:
+ expected.loc[mask, col] = df['D']
+
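+ # assigning a single Series to two target columns broadcasts it
+ # across both, matching the per-column loop above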
+ df.loc[df['A'] == 0, ['A', 'B']] = df['D']
+ tm.assert_frame_equal(df, expected)
+
+ def test_setitem_list(self):
+
+ # GH 6043
+ # ix with a list
+ df = DataFrame(index=[0, 1], columns=[0])
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ df.ix[1, 0] = [1, 2, 3]
+ df.ix[1, 0] = [1, 2]
+
+ result = DataFrame(index=[0, 1], columns=[0])
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ result.ix[1, 0] = [1, 2]
+
+ tm.assert_frame_equal(result, df)
+
+ # ix with an object
+ class TO(object):
+
+ def __init__(self, value):
+ self.value = value
+
+ def __str__(self):
+ return "[{0}]".format(self.value)
+
+ __repr__ = __str__
+
+ def __eq__(self, other):
+ return self.value == other.value
+
+ def view(self):
+ return self
+
+ df = DataFrame(index=[0, 1], columns=[0])
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ df.ix[1, 0] = TO(1)
+ df.ix[1, 0] = TO(2)
+
+ result = DataFrame(index=[0, 1], columns=[0])
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ result.ix[1, 0] = TO(2)
+
+ tm.assert_frame_equal(result, df)
+
+ # remains object dtype even after setting it back
+ df = DataFrame(index=[0, 1], columns=[0])
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ df.ix[1, 0] = TO(1)
+ df.ix[1, 0] = np.nan
+ result = DataFrame(index=[0, 1], columns=[0])
+
+ tm.assert_frame_equal(result, df)
+
+ def test_string_slice(self):
+ # GH 14424
+ # string indexing against a datetimelike with object
+ # dtype should properly raise KeyError
+ df = DataFrame([1], Index([pd.Timestamp('2011-01-01')], dtype=object))
+ assert df.index.is_all_dates
+ with pytest.raises(KeyError):
+ df['2011']
+
+ with pytest.raises(KeyError):
+ df.loc['2011', 0]
+
+ df = DataFrame()
+ assert not df.index.is_all_dates
+ with pytest.raises(KeyError):
+ df['2011']
+
+ with pytest.raises(KeyError):
+ df.loc['2011', 0]
+
+ def test_astype_assignment(self):
+
+ # GH4312 (iloc)
+ df_orig = DataFrame([['1', '2', '3', '.4', 5, 6., 'foo']],
+ columns=list('ABCDEFG'))
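+ # A and B hold the strings '1' and '2'; assigning the astype'd
+ # block back through iloc should replace just those two columns
+ # with int64 values and leave the rest untouched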
+
+ df = df_orig.copy()
+ df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
+ expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']],
+ columns=list('ABCDEFG'))
+ tm.assert_frame_equal(df, expected)
+
+ df = df_orig.copy()
+ df.iloc[:, 0:2] = df.iloc[:, 0:2]._convert(datetime=True, numeric=True)
+ expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']],
+ columns=list('ABCDEFG'))
+ tm.assert_frame_equal(df, expected)
+
+ # GH5702 (loc)
+ df = df_orig.copy()
+ df.loc[:, 'A'] = df.loc[:, 'A'].astype(np.int64)
+ expected = DataFrame([[1, '2', '3', '.4', 5, 6., 'foo']],
+ columns=list('ABCDEFG'))
+ tm.assert_frame_equal(df, expected)
+
+ df = df_orig.copy()
+ df.loc[:, ['B', 'C']] = df.loc[:, ['B', 'C']].astype(np.int64)
+ expected = DataFrame([['1', 2, 3, '.4', 5, 6., 'foo']],
+ columns=list('ABCDEFG'))
+ tm.assert_frame_equal(df, expected)
+
+ # full replacements / no nans
+ df = DataFrame({'A': [1., 2., 3., 4.]})
+ df.iloc[:, 0] = df['A'].astype(np.int64)
+ expected = DataFrame({'A': [1, 2, 3, 4]})
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame({'A': [1., 2., 3., 4.]})
+ df.loc[:, 'A'] = df['A'].astype(np.int64)
+ expected = DataFrame({'A': [1, 2, 3, 4]})
+ tm.assert_frame_equal(df, expected)
+
+ @pytest.mark.parametrize("index,val", [
+ (Index([0, 1, 2]), 2),
+ (Index([0, 1, '2']), '2'),
+ (Index([0, 1, 2, np.inf, 4]), 4),
+ (Index([0, 1, 2, np.nan, 4]), 4),
+ (Index([0, 1, 2, np.inf]), np.inf),
+ (Index([0, 1, 2, np.nan]), np.nan),
+ ])
+ def test_index_contains(self, index, val):
+ assert val in index
+
+ @pytest.mark.parametrize("index,val", [
+ (Index([0, 1, 2]), '2'),
+ (Index([0, 1, '2']), 2),
+ (Index([0, 1, 2, np.inf]), 4),
+ (Index([0, 1, 2, np.nan]), 4),
+ (Index([0, 1, 2, np.inf]), np.nan),
+ (Index([0, 1, 2, np.nan]), np.inf),
+ # Checking if np.inf in Int64Index should not cause an OverflowError
+ # Related to GH 16957
+ (pd.Int64Index([0, 1, 2]), np.inf),
+ (pd.Int64Index([0, 1, 2]), np.nan),
+ (pd.UInt64Index([0, 1, 2]), np.inf),
+ (pd.UInt64Index([0, 1, 2]), np.nan),
+ ])
+ def test_index_not_contains(self, index, val):
+ assert val not in index
+
+ @pytest.mark.parametrize("index,val", [
+ (Index([0, 1, '2']), 0),
+ (Index([0, 1, '2']), '2'),
+ ])
+ def test_mixed_index_contains(self, index, val):
+ # GH 19860
+ assert val in index
+
+ @pytest.mark.parametrize("index,val", [
+ (Index([0, 1, '2']), '1'),
+ (Index([0, 1, '2']), 2),
+ ])
+ def test_mixed_index_not_contains(self, index, val):
+ # GH 19860
+ assert val not in index
+
+ def test_contains_with_float_index(self):
+ # GH#22085
+ integer_index = pd.Int64Index([0, 1, 2, 3])
+ uinteger_index = pd.UInt64Index([0, 1, 2, 3])
+ float_index = pd.Float64Index([0.1, 1.1, 2.2, 3.3])
+
+ for index in (integer_index, uinteger_index):
+ assert 1.1 not in index
+ assert 1.0 in index
+ assert 1 in index
+
+ assert 1.1 in float_index
+ assert 1.0 not in float_index
+ assert 1 not in float_index
+
+ def test_index_type_coercion(self):
+
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+
+ # GH 11836
+ # if we have an index type and set it with something that looks
+ # the same to numpy but is actually a different type
+ # (e.g. setting with a float or the string '0'),
+ # then we need to coerce to object
+
+ # integer indexes
+ for s in [Series(range(5)),
+ Series(range(5), index=range(1, 6))]:
+
+ assert s.index.is_integer()
+
+ for indexer in [lambda x: x.ix,
+ lambda x: x.loc,
+ lambda x: x]:
+ s2 = s.copy()
+ indexer(s2)[0.1] = 0
+ assert s2.index.is_floating()
+ assert indexer(s2)[0.1] == 0
+
+ s2 = s.copy()
+ indexer(s2)[0.0] = 0
+ exp = s.index
+ if 0 not in s:
+ exp = Index(s.index.tolist() + [0])
+ tm.assert_index_equal(s2.index, exp)
+
+ s2 = s.copy()
+ indexer(s2)['0'] = 0
+ assert s2.index.is_object()
+
+ for s in [Series(range(5), index=np.arange(5.))]:
+
+ assert s.index.is_floating()
+
+ for idxr in [lambda x: x.ix,
+ lambda x: x.loc,
+ lambda x: x]:
+
+ s2 = s.copy()
+ idxr(s2)[0.1] = 0
+ assert s2.index.is_floating()
+ assert idxr(s2)[0.1] == 0
+
+ s2 = s.copy()
+ idxr(s2)[0.0] = 0
+ tm.assert_index_equal(s2.index, s.index)
+
+ s2 = s.copy()
+ idxr(s2)['0'] = 0
+ assert s2.index.is_object()
+
+
+class TestMisc(Base):
+
+ def test_float_index_to_mixed(self):
+ df = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)})
+ df['a'] = 10
+ tm.assert_frame_equal(DataFrame({0.0: df[0.0],
+ 1.0: df[1.0],
+ 'a': [10] * 10}),
+ df)
+
+ def test_float_index_non_scalar_assignment(self):
+ df = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}, index=[1., 2., 3.])
+ df.loc[df.index[:2]] = 1
+ expected = DataFrame({'a': [1, 1, 3], 'b': [1, 1, 5]}, index=df.index)
+ tm.assert_frame_equal(expected, df)
+
+ df = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}, index=[1., 2., 3.])
+ df2 = df.copy()
+ df.loc[df.index] = df.loc[df.index]
+ tm.assert_frame_equal(df, df2)
+
+ def test_float_index_at_iat(self):
+ s = Series([1, 2, 3], index=[0.1, 0.2, 0.3])
+ for el, item in s.iteritems():
+ assert s.at[el] == item
+ for i in range(len(s)):
+ assert s.iat[i] == i + 1
+
+ def test_mixed_index_assignment(self):
+ # GH 19860
+ s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2])
+ s.at['a'] = 11
+ assert s.iat[0] == 11
+ s.at[1] = 22
+ assert s.iat[3] == 22
+
+ def test_mixed_index_no_fallback(self):
+ # GH 19860
+ s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2])
+ with pytest.raises(KeyError):
+ s.at[0]
+ with pytest.raises(KeyError):
+ s.at[4]
+
+ def test_rhs_alignment(self):
+ # GH8258, tests that both rows & columns are aligned to what is
+ # being assigned; covers both uniform-dtype & mixed-dtype cases
+ def run_tests(df, rhs, right):
+ # label, index, slice
+ lbl_one, idx_one, slice_one = list('bcd'), [1, 2, 3], slice(1, 4)
+ lbl_two, idx_two, slice_two = ['joe', 'jolie'], [1, 2], slice(1, 3)
+
+ left = df.copy()
+ left.loc[lbl_one, lbl_two] = rhs
+ tm.assert_frame_equal(left, right)
+
+ left = df.copy()
+ left.iloc[idx_one, idx_two] = rhs
+ tm.assert_frame_equal(left, right)
+
+ left = df.copy()
+ with catch_warnings(record=True):
+ # XXX: a finer-grained warning filter would be better here.
+ simplefilter("ignore")
+ left.ix[slice_one, slice_two] = rhs
+ tm.assert_frame_equal(left, right)
+
+ left = df.copy()
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ left.ix[idx_one, idx_two] = rhs
+ tm.assert_frame_equal(left, right)
+
+ left = df.copy()
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ left.ix[lbl_one, lbl_two] = rhs
+ tm.assert_frame_equal(left, right)
+
+ xs = np.arange(20).reshape(5, 4)
+ cols = ['jim', 'joe', 'jolie', 'joline']
+ df = DataFrame(xs, columns=cols, index=list('abcde'))
+
+ # right hand side; permute the indices and multiply by -2
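+ # (rows come out as d, c, b and columns as jolie, joe; alignment
+ # in run_tests must map them back onto b..d / joe, jolie)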
+ rhs = -2 * df.iloc[3:0:-1, 2:0:-1]
+
+ # expected `right` result; just multiply by -2
+ right = df.copy()
+ right.iloc[1:4, 1:3] *= -2
+
+ # run tests with uniform dtypes
+ run_tests(df, rhs, right)
+
+ # make frames multi-type & re-run tests
+ for frame in [df, rhs, right]:
+ frame['joe'] = frame['joe'].astype('float64')
+ frame['jolie'] = frame['jolie'].map('@{0}'.format)
+
+ run_tests(df, rhs, right)
+
+ def test_str_label_slicing_with_negative_step(self):
+ SLC = pd.IndexSlice
+
+ def assert_slices_equivalent(l_slc, i_slc):
+ tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc])
+
+ # For integer indices, plain getitem is position-based, so
+ # only compare it against iloc on non-integer indices.
+ if not idx.is_integer():
+ tm.assert_series_equal(s[l_slc], s.iloc[i_slc])
+
+ for idx in [_mklbl('A', 20), np.arange(20) + 100,
+ np.linspace(100, 150, 20)]:
+ idx = Index(idx)
+ s = Series(np.arange(20), index=idx)
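+ # label slices include both endpoints, so idx[13]:idx[9]:-1 maps
+ # to positions 13:8:-1 (position 9 still included), while slicing
+ # backwards from idx[9] up to idx[13] selects nothing (SLC[:0])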
+ assert_slices_equivalent(SLC[idx[9]::-1], SLC[9::-1])
+ assert_slices_equivalent(SLC[:idx[9]:-1], SLC[:8:-1])
+ assert_slices_equivalent(SLC[idx[13]:idx[9]:-1], SLC[13:8:-1])
+ assert_slices_equivalent(SLC[idx[9]:idx[13]:-1], SLC[:0])
+
+ def test_slice_with_zero_step_raises(self):
+ s = Series(np.arange(20), index=_mklbl('A', 20))
+ with pytest.raises(ValueError, match='slice step cannot be zero'):
+ s[::0]
+ with pytest.raises(ValueError, match='slice step cannot be zero'):
+ s.loc[::0]
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ with pytest.raises(ValueError, match='slice step cannot be zero'):
+ s.ix[::0]
+
+ def test_indexing_assignment_dict_already_exists(self):
+ df = DataFrame({'x': [1, 2, 6],
+ 'y': [2, 2, 8],
+ 'z': [-5, 0, 5]}).set_index('z')
+ expected = df.copy()
+ rhs = dict(x=9, y=99)
+ df.loc[5] = rhs
+ expected.loc[5] = [9, 99]
+ tm.assert_frame_equal(df, expected)
+
+ def test_indexing_dtypes_on_empty(self):
+ # Check that .iloc and .ix return correct dtypes GH9983
+ df = DataFrame({'a': [1, 2, 3], 'b': ['b', 'b2', 'b3']})
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ df2 = df.ix[[], :]
+
+ assert df2.loc[:, 'a'].dtype == np.int64
+ tm.assert_series_equal(df2.loc[:, 'a'], df2.iloc[:, 0])
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ tm.assert_series_equal(df2.loc[:, 'a'], df2.ix[:, 0])
+
+ def test_range_in_series_indexing(self):
+ # range can cause an indexing error
+ # GH 11652
+ for x in [5, 999999, 1000000]:
+ s = Series(index=range(x))
+ s.loc[range(1)] = 42
+ tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0]))
+
+ s.loc[range(2)] = 43
+ tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1]))
+
+ def test_non_reducing_slice(self):
+ df = DataFrame([[0, 1], [2, 3]])
+
+ slices = [
+ # pd.IndexSlice[:, :],
+ pd.IndexSlice[:, 1],
+ pd.IndexSlice[1, :],
+ pd.IndexSlice[[1], [1]],
+ pd.IndexSlice[1, [1]],
+ pd.IndexSlice[[1], 1],
+ pd.IndexSlice[1],
+ pd.IndexSlice[1, 1],
+ slice(None, None, None),
+ [0, 1],
+ np.array([0, 1]),
+ Series([0, 1])
+ ]
+ for slice_ in slices:
+ tslice_ = _non_reducing_slice(slice_)
+ assert isinstance(df.loc[tslice_], DataFrame)
+
+ def test_list_slice(self):
+ # like dataframe getitem
+ slices = [['A'], Series(['A']), np.array(['A'])]
+ df = DataFrame({'A': [1, 2], 'B': [3, 4]}, index=['A', 'B'])
+ expected = pd.IndexSlice[:, ['A']]
+ for subset in slices:
+ result = _non_reducing_slice(subset)
+ tm.assert_frame_equal(df.loc[result], df.loc[expected])
+
+ def test_maybe_numeric_slice(self):
+ df = DataFrame({'A': [1, 2], 'B': ['c', 'd'], 'C': [True, False]})
+ result = _maybe_numeric_slice(df, slice_=None)
+ expected = pd.IndexSlice[:, ['A']]
+ assert result == expected
+
+ result = _maybe_numeric_slice(df, None, include_bool=True)
+ expected = pd.IndexSlice[:, ['A', 'C']]
+ # this case was previously never checked; compare the slice and the
+ # selected columns separately, since Index == list is elementwise
+ assert result[0] == expected[0]
+ assert list(result[1]) == list(expected[1])
+
+ result = _maybe_numeric_slice(df, [1])
+ expected = [1]
+ assert result == expected
+
+ def test_partial_boolean_frame_indexing(self):
+ # GH 17170
+ df = DataFrame(np.arange(9.).reshape(3, 3),
+ index=list('abc'), columns=list('ABC'))
+ index_df = DataFrame(1, index=list('ab'), columns=list('AB'))
+ result = df[index_df.notnull()]
+ expected = DataFrame(np.array([[0., 1., np.nan],
+ [3., 4., np.nan],
+ [np.nan] * 3]),
+ index=list('abc'),
+ columns=list('ABC'))
+ tm.assert_frame_equal(result, expected)
+
+ def test_no_reference_cycle(self):
+ df = DataFrame({'a': [0, 1], 'b': [2, 3]})
+ for name in ('loc', 'iloc', 'at', 'iat'):
+ getattr(df, name)
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ getattr(df, 'ix')
+ wr = weakref.ref(df)
+ del df
+ assert wr() is None
+
+
+class TestSeriesNoneCoercion(object):
+ EXPECTED_RESULTS = [
+ # For numeric series, we should coerce to NaN.
+ ([1, 2, 3], [np.nan, 2, 3]),
+ ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
+
+ # For datetime series, we should coerce to NaT.
+ ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
+ [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]),
+
+ # For objects, we should preserve the None value.
+ (["foo", "bar", "baz"], [None, "bar", "baz"]),
+ ]
+
+ def test_coercion_with_setitem(self):
+ for start_data, expected_result in self.EXPECTED_RESULTS:
+ start_series = Series(start_data)
+ start_series[0] = None
+
+ expected_series = Series(expected_result)
+ tm.assert_series_equal(start_series, expected_series)
+
+ def test_coercion_with_loc_setitem(self):
+ for start_data, expected_result in self.EXPECTED_RESULTS:
+ start_series = Series(start_data)
+ start_series.loc[0] = None
+
+ expected_series = Series(expected_result)
+ tm.assert_series_equal(start_series, expected_series)
+
+ def test_coercion_with_setitem_and_series(self):
+ for start_data, expected_result in self.EXPECTED_RESULTS:
+ start_series = Series(start_data)
+ start_series[start_series == start_series[0]] = None
+
+ expected_series = Series(expected_result)
+ tm.assert_series_equal(start_series, expected_series)
+
+ def test_coercion_with_loc_and_series(self):
+ for start_data, expected_result in self.EXPECTED_RESULTS:
+ start_series = Series(start_data)
+ start_series.loc[start_series == start_series[0]] = None
+
+ expected_series = Series(expected_result)
+ tm.assert_series_equal(start_series, expected_series)
+
+
+class TestDataframeNoneCoercion(object):
+ EXPECTED_SINGLE_ROW_RESULTS = [
+ # For numeric series, we should coerce to NaN.
+ ([1, 2, 3], [np.nan, 2, 3]),
+ ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
+
+ # For datetime series, we should coerce to NaT.
+ ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
+ [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]),
+
+ # For objects, we should preserve the None value.
+ (["foo", "bar", "baz"], [None, "bar", "baz"]),
+ ]
+
+ def test_coercion_with_loc(self):
+ for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
+ start_dataframe = DataFrame({'foo': start_data})
+ start_dataframe.loc[0, ['foo']] = None
+
+ expected_dataframe = DataFrame({'foo': expected_result})
+ tm.assert_frame_equal(start_dataframe, expected_dataframe)
+
+ def test_coercion_with_setitem_and_dataframe(self):
+ for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
+ start_dataframe = DataFrame({'foo': start_data})
+ start_dataframe[start_dataframe['foo'] == start_dataframe['foo'][
+ 0]] = None
+
+ expected_dataframe = DataFrame({'foo': expected_result})
+ tm.assert_frame_equal(start_dataframe, expected_dataframe)
+
+ def test_none_coercion_loc_and_dataframe(self):
+ for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
+ start_dataframe = DataFrame({'foo': start_data})
+ start_dataframe.loc[start_dataframe['foo'] == start_dataframe[
+ 'foo'][0]] = None
+
+ expected_dataframe = DataFrame({'foo': expected_result})
+ tm.assert_frame_equal(start_dataframe, expected_dataframe)
+
+ def test_none_coercion_mixed_dtypes(self):
+ start_dataframe = DataFrame({
+ 'a': [1, 2, 3],
+ 'b': [1.0, 2.0, 3.0],
+ 'c': [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1,
+ 3)],
+ 'd': ['a', 'b', 'c']
+ })
+ start_dataframe.iloc[0] = None
+
+ exp = DataFrame({'a': [np.nan, 2, 3],
+ 'b': [np.nan, 2.0, 3.0],
+ 'c': [NaT, datetime(2000, 1, 2),
+ datetime(2000, 1, 3)],
+ 'd': [None, 'b', 'c']})
+ tm.assert_frame_equal(start_dataframe, exp)
+
+
+def test_validate_indices_ok():
+ indices = np.asarray([0, 1])
+ validate_indices(indices, 2)
+ validate_indices(indices[:0], 0)
+ validate_indices(np.array([-1, -1]), 0)
+
+
+def test_validate_indices_low():
+ indices = np.asarray([0, -2])
+ with pytest.raises(ValueError, match="'indices' contains"):
+ validate_indices(indices, 2)
+
+
+def test_validate_indices_high():
+ indices = np.asarray([0, 1, 2])
+ with pytest.raises(IndexError, match="indices are out"):
+ validate_indices(indices, 2)
+
+
+def test_validate_indices_empty():
+ with pytest.raises(IndexError, match="indices are out"):
+ validate_indices(np.array([0, 1]), 0)
+
+
+def test_extension_array_cross_section():
+ # A cross-section of a homogeneous EA should be an EA
+ df = pd.DataFrame({
+ "A": pd.core.arrays.integer_array([1, 2]),
+ "B": pd.core.arrays.integer_array([3, 4])
+ }, index=['a', 'b'])
+ expected = pd.Series(pd.core.arrays.integer_array([1, 3]),
+ index=['A', 'B'], name='a')
+ result = df.loc['a']
+ tm.assert_series_equal(result, expected)
+
+ result = df.iloc[0]
+ tm.assert_series_equal(result, expected)
+
+
+def test_extension_array_cross_section_converts():
+ df = pd.DataFrame({
+ "A": pd.core.arrays.integer_array([1, 2]),
+ "B": np.array([1, 2]),
+ }, index=['a', 'b'])
+ result = df.loc['a']
+ expected = pd.Series([1, 1], dtype=object, index=['A', 'B'], name='a')
+ tm.assert_series_equal(result, expected)
+
+ result = df.iloc[0]
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_indexing_engines.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_indexing_engines.py
new file mode 100644
index 00000000000..57b85fd46a4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_indexing_engines.py
@@ -0,0 +1,169 @@
+import numpy as np
+
+from pandas._libs import algos as libalgos, index as libindex
+
+from pandas import compat
+import pandas.util.testing as tm
+
+
+class TestNumericEngine(object):
+ def test_is_monotonic(self, numeric_indexing_engine_type_and_dtype):
+ engine_type, dtype = numeric_indexing_engine_type_and_dtype
+ num = 1000
+ arr = np.array([1] * num + [2] * num + [3] * num, dtype=dtype)
+
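+ # each engine is constructed from a zero-argument callable that
+ # returns the values, plus their length, presumably so the data
+ # can be fetched lazily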
+ # monotonic increasing
+ engine = engine_type(lambda: arr, len(arr))
+ assert engine.is_monotonic_increasing is True
+ assert engine.is_monotonic_decreasing is False
+
+ # monotonic decreasing
+ engine = engine_type(lambda: arr[::-1], len(arr))
+ assert engine.is_monotonic_increasing is False
+ assert engine.is_monotonic_decreasing is True
+
+ # neither monotonic increasing nor decreasing
+ arr = np.array([1] * num + [2] * num + [1] * num, dtype=dtype)
+ engine = engine_type(lambda: arr[::-1], len(arr))
+ assert engine.is_monotonic_increasing is False
+ assert engine.is_monotonic_decreasing is False
+
+ def test_is_unique(self, numeric_indexing_engine_type_and_dtype):
+ engine_type, dtype = numeric_indexing_engine_type_and_dtype
+
+ # unique
+ arr = np.array([1, 3, 2], dtype=dtype)
+ engine = engine_type(lambda: arr, len(arr))
+ assert engine.is_unique is True
+
+ # not unique
+ arr = np.array([1, 2, 1], dtype=dtype)
+ engine = engine_type(lambda: arr, len(arr))
+ assert engine.is_unique is False
+
+ def test_get_loc(self, numeric_indexing_engine_type_and_dtype):
+ engine_type, dtype = numeric_indexing_engine_type_and_dtype
+
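+ # get_loc's return type depends on the layout of the values: an
+ # integer position when unique, a slice when duplicates are
+ # contiguous (monotonic), and a boolean mask otherwise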
+ # unique
+ arr = np.array([1, 2, 3], dtype=dtype)
+ engine = engine_type(lambda: arr, len(arr))
+ assert engine.get_loc(2) == 1
+
+ # monotonic
+ num = 1000
+ arr = np.array([1] * num + [2] * num + [3] * num, dtype=dtype)
+ engine = engine_type(lambda: arr, len(arr))
+ assert engine.get_loc(2) == slice(1000, 2000)
+
+ # not monotonic
+ arr = np.array([1, 2, 3] * num, dtype=dtype)
+ engine = engine_type(lambda: arr, len(arr))
+ expected = np.array([False, True, False] * num, dtype=bool)
+ result = engine.get_loc(2)
+ assert (result == expected).all()
+
+ def test_get_backfill_indexer(
+ self, numeric_indexing_engine_type_and_dtype):
+ engine_type, dtype = numeric_indexing_engine_type_and_dtype
+
+ arr = np.array([1, 5, 10], dtype=dtype)
+ engine = engine_type(lambda: arr, len(arr))
+
+ new = np.array(compat.range(12), dtype=dtype)
+ result = engine.get_backfill_indexer(new)
+
+ expected = libalgos.backfill(arr, new)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_get_pad_indexer(
+ self, numeric_indexing_engine_type_and_dtype):
+ engine_type, dtype = numeric_indexing_engine_type_and_dtype
+
+ arr = np.array([1, 5, 10], dtype=dtype)
+ engine = engine_type(lambda: arr, len(arr))
+
+ new = np.array(compat.range(12), dtype=dtype)
+ result = engine.get_pad_indexer(new)
+
+ expected = libalgos.pad(arr, new)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+class TestObjectEngine(object):
+ engine_type = libindex.ObjectEngine
+ dtype = np.object_
+ values = list('abc')
+
+ def test_is_monotonic(self):
+
+ num = 1000
+ arr = np.array(['a'] * num + ['a'] * num + ['c'] * num,
+ dtype=self.dtype)
+
+ # monotonic increasing
+ engine = self.engine_type(lambda: arr, len(arr))
+ assert engine.is_monotonic_increasing is True
+ assert engine.is_monotonic_decreasing is False
+
+ # monotonic decreasing
+ engine = self.engine_type(lambda: arr[::-1], len(arr))
+ assert engine.is_monotonic_increasing is False
+ assert engine.is_monotonic_decreasing is True
+
+ # neither monotonic increasing nor decreasing
+ arr = np.array(['a'] * num + ['b'] * num + ['a'] * num,
+ dtype=self.dtype)
+ engine = self.engine_type(lambda: arr[::-1], len(arr))
+ assert engine.is_monotonic_increasing is False
+ assert engine.is_monotonic_decreasing is False
+
+ def test_is_unique(self):
+ # unique
+ arr = np.array(self.values, dtype=self.dtype)
+ engine = self.engine_type(lambda: arr, len(arr))
+ assert engine.is_unique is True
+
+ # not unique
+ arr = np.array(['a', 'b', 'a'], dtype=self.dtype)
+ engine = self.engine_type(lambda: arr, len(arr))
+ assert engine.is_unique is False
+
+ def test_get_loc(self):
+ # unique
+ arr = np.array(self.values, dtype=self.dtype)
+ engine = self.engine_type(lambda: arr, len(arr))
+ assert engine.get_loc('b') == 1
+
+ # monotonic
+ num = 1000
+ arr = np.array(['a'] * num + ['b'] * num + ['c'] * num,
+ dtype=self.dtype)
+ engine = self.engine_type(lambda: arr, len(arr))
+ assert engine.get_loc('b') == slice(1000, 2000)
+
+ # not monotonic
+ arr = np.array(self.values * num, dtype=self.dtype)
+ engine = self.engine_type(lambda: arr, len(arr))
+ expected = np.array([False, True, False] * num, dtype=bool)
+ result = engine.get_loc('b')
+ assert (result == expected).all()
+
+ def test_get_backfill_indexer(self):
+ arr = np.array(['a', 'e', 'j'], dtype=self.dtype)
+ engine = self.engine_type(lambda: arr, len(arr))
+
+ new = np.array(list('abcdefghij'), dtype=self.dtype)
+ result = engine.get_backfill_indexer(new)
+
+ expected = libalgos.backfill["object"](arr, new)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_get_pad_indexer(self):
+ arr = np.array(['a', 'e', 'j'], dtype=self.dtype)
+ engine = self.engine_type(lambda: arr, len(arr))
+
+ new = np.array(list('abcdefghij'), dtype=self.dtype)
+ result = engine.get_pad_indexer(new)
+
+ expected = libalgos.pad["object"](arr, new)
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_indexing_slow.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_indexing_slow.py
new file mode 100644
index 00000000000..42263c813dd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_indexing_slow.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+from pandas import DataFrame
+import pandas.util.testing as tm
+
+
+class TestIndexingSlow(object):
+
+ @pytest.mark.slow
+ def test_large_dataframe_indexing(self):
+ # GH10692
+ result = DataFrame({'x': range(10 ** 6)}, dtype='int64')
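+ # .loc with a label one past the end enlarges the frame by one
+ # row instead of raising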
+ result.loc[len(result)] = len(result)  # value must match expected below
+ expected = DataFrame({'x': range(10 ** 6 + 1)}, dtype='int64')
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_ix.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_ix.py
new file mode 100644
index 00000000000..35805bce077
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_ix.py
@@ -0,0 +1,314 @@
+""" test indexing with ix """
+
+from warnings import catch_warnings
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+
+from pandas.core.dtypes.common import is_scalar
+
+import pandas as pd
+from pandas import DataFrame, Series, option_context
+from pandas.util import testing as tm
+
+
+def test_ix_deprecation():
+ # GH 15114
+
+ df = DataFrame({'A': [1, 2, 3]})
+ with tm.assert_produces_warning(DeprecationWarning,
+ check_stacklevel=False):
+ df.ix[1, 'A']
+
+
+@pytest.mark.filterwarnings("ignore:\\n.ix:DeprecationWarning")
+class TestIX(object):
+
+ def test_ix_loc_setitem_consistency(self):
+
+ # GH 5771
+ # loc with slice and series
+ s = Series(0, index=[4, 5, 6])
+ s.loc[4:5] += 1
+ expected = Series([1, 1, 0], index=[4, 5, 6])
+ tm.assert_series_equal(s, expected)
+
+ # GH 5928
+ # chained indexing assignment
+ df = DataFrame({'a': [0, 1, 2]})
+ expected = df.copy()
+ with catch_warnings(record=True):
+ expected.ix[[0, 1, 2], 'a'] = -expected.ix[[0, 1, 2], 'a']
+
+ with catch_warnings(record=True):
+ df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]]
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame({'a': [0, 1, 2], 'b': [0, 1, 2]})
+ with catch_warnings(record=True):
+ df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]].astype(
+ 'float64') + 0.5
+ expected = DataFrame({'a': [0.5, -0.5, -1.5], 'b': [0, 1, 2]})
+ tm.assert_frame_equal(df, expected)
+
+ # GH 8607
+ # ix setitem consistency
+ df = DataFrame({'delta': [1174, 904, 161],
+ 'elapsed': [7673, 9277, 1470],
+ 'timestamp': [1413840976, 1413842580, 1413760580]})
+ expected = DataFrame({'delta': [1174, 904, 161],
+ 'elapsed': [7673, 9277, 1470],
+ 'timestamp': pd.to_datetime(
+ [1413840976, 1413842580, 1413760580],
+ unit='s')
+ })
+
+ df2 = df.copy()
+ df2['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
+ tm.assert_frame_equal(df2, expected)
+
+ df2 = df.copy()
+ df2.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
+ tm.assert_frame_equal(df2, expected)
+
+ df2 = df.copy()
+ with catch_warnings(record=True):
+ df2.ix[:, 2] = pd.to_datetime(df['timestamp'], unit='s')
+ tm.assert_frame_equal(df2, expected)
+
+ def test_ix_loc_consistency(self):
+
+ # GH 8613
+ # some edge cases where ix/loc should return the same
+ # this is not an exhaustive case
+
+ def compare(result, expected):
+ if is_scalar(expected):
+ assert result == expected
+ else:
+ assert expected.equals(result)
+
+ # failure cases for .loc, but these work for .ix
+ df = DataFrame(np.random.randn(5, 4), columns=list('ABCD'))
+ for key in [slice(1, 3), tuple([slice(0, 2), slice(0, 2)]),
+ tuple([slice(0, 2), df.columns[0:2]])]:
+
+ for index in [tm.makeStringIndex, tm.makeUnicodeIndex,
+ tm.makeDateIndex, tm.makePeriodIndex,
+ tm.makeTimedeltaIndex]:
+ df.index = index(len(df.index))
+ with catch_warnings(record=True):
+ df.ix[key]
+
+ pytest.raises(TypeError, lambda: df.loc[key])
+
+ df = DataFrame(np.random.randn(5, 4), columns=list('ABCD'),
+ index=pd.date_range('2012-01-01', periods=5))
+
+ for key in ['2012-01-03',
+ '2012-01-31',
+ slice('2012-01-03', '2012-01-03'),
+ slice('2012-01-03', '2012-01-04'),
+ slice('2012-01-03', '2012-01-06', 2),
+ slice('2012-01-03', '2012-01-31'),
+ tuple([[True, True, True, False, True]]), ]:
+
+ # getitem
+
+ # if computing the expected via .ix raises, .loc should raise too
+ try:
+ with catch_warnings(record=True):
+ expected = df.ix[key]
+ except KeyError:
+ pytest.raises(KeyError, lambda: df.loc[key])
+ continue
+
+ result = df.loc[key]
+ compare(result, expected)
+
+ # setitem
+ df1 = df.copy()
+ df2 = df.copy()
+
+ with catch_warnings(record=True):
+ df1.ix[key] = 10
+ df2.loc[key] = 10
+ compare(df2, df1)
+
+ # edge cases
+ s = Series([1, 2, 3, 4], index=list('abde'))
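+ # 'c' is missing from the index, but since the index is sorted
+ # the label slice stops where 'c' would be inserted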
+
+ result1 = s['a':'c']
+ with catch_warnings(record=True):
+ result2 = s.ix['a':'c']
+ result3 = s.loc['a':'c']
+ tm.assert_series_equal(result1, result2)
+ tm.assert_series_equal(result1, result3)
+
+ # out-of-range slice bounds now work rather than raising KeyError
+ s = Series(range(5), [-2, -1, 1, 2, 3])
+
+ with catch_warnings(record=True):
+ result1 = s.ix[-10:3]
+ result2 = s.loc[-10:3]
+ tm.assert_series_equal(result1, result2)
+
+ with catch_warnings(record=True):
+ result1 = s.ix[0:3]
+ result2 = s.loc[0:3]
+ tm.assert_series_equal(result1, result2)
+
+ def test_ix_weird_slicing(self):
+ # http://stackoverflow.com/q/17056560/1240268
+ df = DataFrame({'one': [1, 2, 3, np.nan, np.nan],
+ 'two': [1, 2, 3, 4, 5]})
+ df.loc[df['one'] > 1, 'two'] = -df['two']
+
+ expected = DataFrame({'one': {0: 1.0,
+ 1: 2.0,
+ 2: 3.0,
+ 3: np.nan,
+ 4: np.nan},
+ 'two': {0: 1,
+ 1: -2,
+ 2: -3,
+ 3: 4,
+ 4: 5}})
+ tm.assert_frame_equal(df, expected)
+
+ def test_ix_assign_column_mixed(self):
+ # GH #1142
+ df = DataFrame(tm.getSeriesData())
+ df['foo'] = 'bar'
+
+ orig = df.loc[:, 'B'].copy()
+ df.loc[:, 'B'] = df.loc[:, 'B'] + 1
+ tm.assert_series_equal(df.B, orig + 1)
+
+ # GH 3668, mixed frame with series value
+ df = DataFrame({'x': lrange(10), 'y': lrange(10, 20), 'z': 'bar'})
+ expected = df.copy()
+
+ for i in range(5):
+ indexer = i * 2
+ v = 1000 + i * 200
+ expected.loc[indexer, 'y'] = v
+ assert expected.loc[indexer, 'y'] == v
+
+ df.loc[df.x % 2 == 0, 'y'] = df.loc[df.x % 2 == 0, 'y'] * 100
+ tm.assert_frame_equal(df, expected)
+
+ # GH 4508, making sure consistency of assignments
+ df = DataFrame({'a': [1, 2, 3], 'b': [0, 1, 2]})
+ df.loc[[0, 2, ], 'b'] = [100, -100]
+ expected = DataFrame({'a': [1, 2, 3], 'b': [100, 1, -100]})
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame({'a': lrange(4)})
+ df['b'] = np.nan
+ df.loc[[1, 3], 'b'] = [100, -100]
+ expected = DataFrame({'a': [0, 1, 2, 3],
+ 'b': [np.nan, 100, np.nan, -100]})
+ tm.assert_frame_equal(df, expected)
+
+ # ok, but chained assignments are dangerous
+ # if we turn off chained assignment it will work
+ with option_context('chained_assignment', None):
+ df = DataFrame({'a': lrange(4)})
+ df['b'] = np.nan
+ df['b'].loc[[1, 3]] = [100, -100]
+ tm.assert_frame_equal(df, expected)
+
+ def test_ix_get_set_consistency(self):
+
+ # GH 4544
+ # ix/loc get/set were not consistent when
+ # using a mixed int/string index
+ df = DataFrame(np.arange(16).reshape((4, 4)),
+ columns=['a', 'b', 8, 'c'],
+ index=['e', 7, 'f', 'g'])
+
+ with catch_warnings(record=True):
+ assert df.ix['e', 8] == 2
+ assert df.loc['e', 8] == 2
+
+ with catch_warnings(record=True):
+ df.ix['e', 8] = 42
+ assert df.ix['e', 8] == 42
+ assert df.loc['e', 8] == 42
+
+ df.loc['e', 8] = 45
+ with catch_warnings(record=True):
+ assert df.ix['e', 8] == 45
+ assert df.loc['e', 8] == 45
+
+ def test_ix_slicing_strings(self):
+ # see gh-3836
+ data = {'Classification':
+ ['SA EQUITY CFD', 'bbb', 'SA EQUITY', 'SA SSF', 'aaa'],
+ 'Random': [1, 2, 3, 4, 5],
+ 'X': ['correct', 'wrong', 'correct', 'correct', 'wrong']}
+ df = DataFrame(data)
+ x = df[~df.Classification.isin(['SA EQUITY CFD', 'SA EQUITY',
+ 'SA SSF'])]
+ with catch_warnings(record=True):
+ df.ix[x.index, 'X'] = df['Classification']
+
+ expected = DataFrame({'Classification': {0: 'SA EQUITY CFD',
+ 1: 'bbb',
+ 2: 'SA EQUITY',
+ 3: 'SA SSF',
+ 4: 'aaa'},
+ 'Random': {0: 1,
+ 1: 2,
+ 2: 3,
+ 3: 4,
+ 4: 5},
+ 'X': {0: 'correct',
+ 1: 'bbb',
+ 2: 'correct',
+ 3: 'correct',
+ 4: 'aaa'}}) # bug was 4: 'bbb'
+
+ tm.assert_frame_equal(df, expected)
+
+ def test_ix_setitem_out_of_bounds_axis_0(self):
+ df = DataFrame(
+ np.random.randn(2, 5), index=["row%s" % i for i in range(2)],
+ columns=["col%s" % i for i in range(5)])
+ with catch_warnings(record=True):
+ pytest.raises(ValueError, df.ix.__setitem__, (2, 0), 100)
+
+ def test_ix_setitem_out_of_bounds_axis_1(self):
+ df = DataFrame(
+ np.random.randn(5, 2), index=["row%s" % i for i in range(5)],
+ columns=["col%s" % i for i in range(2)])
+ with catch_warnings(record=True):
+ pytest.raises(ValueError, df.ix.__setitem__, (0, 2), 100)
+
+ def test_ix_empty_list_indexer_is_ok(self):
+ with catch_warnings(record=True):
+ from pandas.util.testing import makeCustomDataframe as mkdf
+ df = mkdf(5, 2)
+ # vertical empty
+ tm.assert_frame_equal(df.ix[:, []], df.iloc[:, :0],
+ check_index_type=True,
+ check_column_type=True)
+ # horizontal empty
+ tm.assert_frame_equal(df.ix[[], :], df.iloc[:0, :],
+ check_index_type=True,
+ check_column_type=True)
+ # horizontal empty
+ tm.assert_frame_equal(df.ix[[]], df.iloc[:0, :],
+ check_index_type=True,
+ check_column_type=True)
+
+ def test_ix_duplicate_returns_series(self):
+ df = DataFrame(np.random.randn(3, 3), index=[0.1, 0.2, 0.2],
+ columns=list('abc'))
+ with catch_warnings(record=True):
+ r = df.ix[0.2, 'a']
+ e = df.loc[0.2, 'a']
+ tm.assert_series_equal(r, e)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_loc.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_loc.py
new file mode 100644
index 00000000000..17e107c7a11
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_loc.py
@@ -0,0 +1,767 @@
+""" test label based indexing with loc """
+
+from warnings import catch_warnings, filterwarnings
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY2, StringIO, lrange
+
+import pandas as pd
+from pandas import DataFrame, Series, Timestamp, date_range
+from pandas.api.types import is_scalar
+from pandas.tests.indexing.common import Base
+from pandas.util import testing as tm
+
+
+class TestLoc(Base):
+
+ def test_loc_getitem_dups(self):
+ # GH 5678
+ # repeated getitems on a dup index returning an ndarray
+ df = DataFrame(
+ np.random.random_sample((20, 5)),
+ index=['ABCDE'[x % 5] for x in range(20)])
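+ # each label A-E appears four times, so both access paths below
+ # should yield the same four-element Series for 'A'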
+ expected = df.loc['A', 0]
+ result = df.loc[:, 0].loc['A']
+ tm.assert_series_equal(result, expected)
+
+ def test_loc_getitem_dups2(self):
+
+ # GH4726
+ # dup indexing with iloc/loc
+ df = DataFrame([[1, 2, 'foo', 'bar', Timestamp('20130101')]],
+ columns=['a', 'a', 'a', 'a', 'a'], index=[1])
+ expected = Series([1, 2, 'foo', 'bar', Timestamp('20130101')],
+ index=['a', 'a', 'a', 'a', 'a'], name=1)
+
+ result = df.iloc[0]
+ tm.assert_series_equal(result, expected)
+
+ result = df.loc[1]
+ tm.assert_series_equal(result, expected)
+
+ def test_loc_setitem_dups(self):
+
+ # GH 6541
+ df_orig = DataFrame(
+ {'me': list('rttti'),
+ 'foo': list('aaade'),
+ 'bar': np.arange(5, dtype='float64') * 1.34 + 2,
+ 'bar2': np.arange(5, dtype='float64') * -.34 + 2}).set_index('me')
+
+ indexer = tuple(['r', ['bar', 'bar2']])
+ df = df_orig.copy()
+ df.loc[indexer] *= 2.0
+ tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
+
+ indexer = tuple(['r', 'bar'])
+ df = df_orig.copy()
+ df.loc[indexer] *= 2.0
+ assert df.loc[indexer] == 2.0 * df_orig.loc[indexer]
+
+ indexer = tuple(['t', ['bar', 'bar2']])
+ df = df_orig.copy()
+ df.loc[indexer] *= 2.0
+ tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])
+
+ def test_loc_setitem_slice(self):
+ # GH10503
+
+ # assigning the same type should not change the type
+ df1 = DataFrame({'a': [0, 1, 1],
+ 'b': Series([100, 200, 300], dtype='uint32')})
+ ix = df1['a'] == 1
+ newb1 = df1.loc[ix, 'b'] + 1
+ df1.loc[ix, 'b'] = newb1
+ expected = DataFrame({'a': [0, 1, 1],
+ 'b': Series([100, 201, 301], dtype='uint32')})
+ tm.assert_frame_equal(df1, expected)
+
+ # assigning a new type should get the inferred type
+ df2 = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]},
+ dtype='uint64')
+ ix = df1['a'] == 1
+ newb2 = df2.loc[ix, 'b']
+ df1.loc[ix, 'b'] = newb2
+ expected = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]},
+ dtype='uint64')
+ tm.assert_frame_equal(df2, expected)
+
+ def test_loc_getitem_int(self):
+
+ # int label
+ self.check_result('int label', 'loc', 2, 'ix', 2,
+ typs=['ints', 'uints'], axes=0)
+ self.check_result('int label', 'loc', 3, 'ix', 3,
+ typs=['ints', 'uints'], axes=1)
+ self.check_result('int label', 'loc', 4, 'ix', 4,
+ typs=['ints', 'uints'], axes=2)
+ self.check_result('int label', 'loc', 2, 'ix', 2,
+ typs=['label'], fails=KeyError)
+
+ def test_loc_getitem_label(self):
+
+ # label
+ self.check_result('label', 'loc', 'c', 'ix', 'c', typs=['labels'],
+ axes=0)
+ self.check_result('label', 'loc', 'null', 'ix', 'null', typs=['mixed'],
+ axes=0)
+ self.check_result('label', 'loc', 8, 'ix', 8, typs=['mixed'], axes=0)
+ self.check_result('label', 'loc', Timestamp('20130102'), 'ix', 1,
+ typs=['ts'], axes=0)
+ self.check_result('label', 'loc', 'c', 'ix', 'c', typs=['empty'],
+ fails=KeyError)
+
+ def test_loc_getitem_label_out_of_range(self):
+
+ # out of range label
+ self.check_result('label range', 'loc', 'f', 'ix', 'f',
+ typs=['ints', 'uints', 'labels', 'mixed', 'ts'],
+ fails=KeyError)
+ self.check_result('label range', 'loc', 'f', 'ix', 'f',
+ typs=['floats'], fails=KeyError)
+ self.check_result('label range', 'loc', 20, 'ix', 20,
+ typs=['ints', 'uints', 'mixed'], fails=KeyError)
+ self.check_result('label range', 'loc', 20, 'ix', 20,
+ typs=['labels'], fails=TypeError)
+ self.check_result('label range', 'loc', 20, 'ix', 20, typs=['ts'],
+ axes=0, fails=TypeError)
+ self.check_result('label range', 'loc', 20, 'ix', 20, typs=['floats'],
+ axes=0, fails=KeyError)
+
+ def test_loc_getitem_label_list(self):
+
+ # list of labels
+ self.check_result('list lbl', 'loc', [0, 2, 4], 'ix', [0, 2, 4],
+ typs=['ints', 'uints'], axes=0)
+ self.check_result('list lbl', 'loc', [3, 6, 9], 'ix', [3, 6, 9],
+ typs=['ints', 'uints'], axes=1)
+ self.check_result('list lbl', 'loc', [4, 8, 12], 'ix', [4, 8, 12],
+ typs=['ints', 'uints'], axes=2)
+ self.check_result('list lbl', 'loc', ['a', 'b', 'd'], 'ix',
+ ['a', 'b', 'd'], typs=['labels'], axes=0)
+ self.check_result('list lbl', 'loc', ['A', 'B', 'C'], 'ix',
+ ['A', 'B', 'C'], typs=['labels'], axes=1)
+ self.check_result('list lbl', 'loc', ['Z', 'Y', 'W'], 'ix',
+ ['Z', 'Y', 'W'], typs=['labels'], axes=2)
+ self.check_result('list lbl', 'loc', [2, 8, 'null'], 'ix',
+ [2, 8, 'null'], typs=['mixed'], axes=0)
+ self.check_result('list lbl', 'loc',
+ [Timestamp('20130102'), Timestamp('20130103')], 'ix',
+ [Timestamp('20130102'), Timestamp('20130103')],
+ typs=['ts'], axes=0)
+
+ @pytest.mark.skipif(PY2, reason=("Catching warnings unreliable with "
+ "Python 2 (GH #20770)"))
+ def test_loc_getitem_label_list_with_missing(self):
+ self.check_result('list lbl', 'loc', [0, 1, 2], 'indexer', [0, 1, 2],
+ typs=['empty'], fails=KeyError)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ self.check_result('list lbl', 'loc', [0, 2, 10], 'ix', [0, 2, 10],
+ typs=['ints', 'uints', 'floats'],
+ axes=0, fails=KeyError)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ self.check_result('list lbl', 'loc', [3, 6, 7], 'ix', [3, 6, 7],
+ typs=['ints', 'uints', 'floats'],
+ axes=1, fails=KeyError)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ self.check_result('list lbl', 'loc', [4, 8, 10], 'ix', [4, 8, 10],
+ typs=['ints', 'uints', 'floats'],
+ axes=2, fails=KeyError)
+
+ # GH 17758 - MultiIndex and missing keys
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ self.check_result('list lbl', 'loc', [(1, 3), (1, 4), (2, 5)],
+ 'ix', [(1, 3), (1, 4), (2, 5)],
+ typs=['multi'],
+ axes=0)
+
+ def test_getitem_label_list_with_missing(self):
+ s = Series(range(3), index=['a', 'b', 'c'])
+
+ # consistency
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ s[['a', 'd']]
+
+ s = Series(range(3))
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ s[[0, 3]]
+
+ def test_loc_getitem_label_list_fails(self):
+ # fails
+ self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40],
+ typs=['ints', 'uints'], axes=1, fails=KeyError)
+ self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40],
+ typs=['ints', 'uints'], axes=2, fails=KeyError)
+
+ def test_loc_getitem_label_array_like(self):
+ # array like
+ self.check_result('array like', 'loc', Series(index=[0, 2, 4]).index,
+ 'ix', [0, 2, 4], typs=['ints', 'uints'], axes=0)
+ self.check_result('array like', 'loc', Series(index=[3, 6, 9]).index,
+ 'ix', [3, 6, 9], typs=['ints', 'uints'], axes=1)
+ self.check_result('array like', 'loc', Series(index=[4, 8, 12]).index,
+ 'ix', [4, 8, 12], typs=['ints', 'uints'], axes=2)
+
+ def test_loc_getitem_bool(self):
+ # boolean indexers
+ b = [True, False, True, False]
+ self.check_result('bool', 'loc', b, 'ix', b,
+ typs=['ints', 'uints', 'labels',
+ 'mixed', 'ts', 'floats'])
+ self.check_result('bool', 'loc', b, 'ix', b, typs=['empty'],
+ fails=KeyError)
+
+ def test_loc_getitem_int_slice(self):
+
+ # ok
+ self.check_result('int slice2', 'loc', slice(2, 4), 'ix', [2, 4],
+ typs=['ints', 'uints'], axes=0)
+ self.check_result('int slice2', 'loc', slice(3, 6), 'ix', [3, 6],
+ typs=['ints', 'uints'], axes=1)
+ self.check_result('int slice2', 'loc', slice(4, 8), 'ix', [4, 8],
+ typs=['ints', 'uints'], axes=2)
+
+ def test_loc_to_fail(self):
+
+ # GH3449
+ df = DataFrame(np.random.random((3, 3)),
+ index=['a', 'b', 'c'],
+ columns=['e', 'f', 'g'])
+
+        # should raise a KeyError
+ pytest.raises(KeyError, df.loc.__getitem__,
+ tuple([[1, 2], [1, 2]]))
+
+ # GH 7496
+        # loc should not fall back
+
+ s = Series()
+ s.loc[1] = 1
+ s.loc['a'] = 2
+
+ pytest.raises(KeyError, lambda: s.loc[-1])
+ pytest.raises(KeyError, lambda: s.loc[[-1, -2]])
+
+ pytest.raises(KeyError, lambda: s.loc[['4']])
+
+ s.loc[-1] = 3
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = s.loc[[-1, -2]]
+ expected = Series([3, np.nan], index=[-1, -2])
+ tm.assert_series_equal(result, expected)
+
+ s['a'] = 2
+ pytest.raises(KeyError, lambda: s.loc[[-2]])
+
+ del s['a']
+
+ def f():
+ s.loc[[-2]] = 0
+
+ pytest.raises(KeyError, f)
+
+ # inconsistency between .loc[values] and .loc[values,:]
+ # GH 7999
+ df = DataFrame([['a'], ['b']], index=[1, 2], columns=['value'])
+
+ def f():
+ df.loc[[3], :]
+
+ pytest.raises(KeyError, f)
+
+ def f():
+ df.loc[[3]]
+
+ pytest.raises(KeyError, f)
+
+ def test_loc_getitem_list_with_fail(self):
+        # GH 15747
+        # should raise KeyError if *any* labels are missing
+
+ s = Series([1, 2, 3])
+
+ s.loc[[2]]
+
+ with pytest.raises(KeyError):
+ s.loc[[3]]
+
+ # a non-match and a match
+ with tm.assert_produces_warning(FutureWarning):
+ expected = s.loc[[2, 3]]
+ result = s.reindex([2, 3])
+ tm.assert_series_equal(result, expected)
+
+ def test_loc_getitem_label_slice(self):
+
+ # label slices (with ints)
+ self.check_result('lab slice', 'loc', slice(1, 3),
+ 'ix', slice(1, 3),
+ typs=['labels', 'mixed', 'empty', 'ts', 'floats'],
+ fails=TypeError)
+
+ # real label slices
+ self.check_result('lab slice', 'loc', slice('a', 'c'),
+ 'ix', slice('a', 'c'), typs=['labels'], axes=0)
+ self.check_result('lab slice', 'loc', slice('A', 'C'),
+ 'ix', slice('A', 'C'), typs=['labels'], axes=1)
+ self.check_result('lab slice', 'loc', slice('W', 'Z'),
+ 'ix', slice('W', 'Z'), typs=['labels'], axes=2)
+
+ self.check_result('ts slice', 'loc', slice('20130102', '20130104'),
+ 'ix', slice('20130102', '20130104'),
+ typs=['ts'], axes=0)
+ self.check_result('ts slice', 'loc', slice('20130102', '20130104'),
+ 'ix', slice('20130102', '20130104'),
+ typs=['ts'], axes=1, fails=TypeError)
+ self.check_result('ts slice', 'loc', slice('20130102', '20130104'),
+ 'ix', slice('20130102', '20130104'),
+ typs=['ts'], axes=2, fails=TypeError)
+
+ # GH 14316
+ self.check_result('ts slice rev', 'loc', slice('20130104', '20130102'),
+ 'indexer', [0, 1, 2], typs=['ts_rev'], axes=0)
+
+ self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8),
+ typs=['mixed'], axes=0, fails=TypeError)
+ self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8),
+ typs=['mixed'], axes=1, fails=KeyError)
+ self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8),
+ typs=['mixed'], axes=2, fails=KeyError)
+
+        self.check_result('mixed slice', 'loc', slice(2, 4, 2),
+                          'ix', slice(2, 4, 2),
+                          typs=['mixed'], axes=0, fails=TypeError)
+
+ def test_loc_index(self):
+ # gh-17131
+ # a boolean index should index like a boolean numpy array
+
+ df = DataFrame(
+ np.random.random(size=(5, 10)),
+ index=["alpha_0", "alpha_1", "alpha_2", "beta_0", "beta_1"])
+
+ mask = df.index.map(lambda x: "alpha" in x)
+ expected = df.loc[np.array(mask)]
+
+ result = df.loc[mask]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[mask.values]
+ tm.assert_frame_equal(result, expected)
+
+ def test_loc_general(self):
+
+ df = DataFrame(
+ np.random.rand(4, 4), columns=['A', 'B', 'C', 'D'],
+ index=['A', 'B', 'C', 'D'])
+
+ # want this to work
+ result = df.loc[:, "A":"B"].iloc[0:2, :]
+ assert (result.columns == ['A', 'B']).all()
+ assert (result.index == ['A', 'B']).all()
+
+ # mixed type
+ result = DataFrame({'a': [Timestamp('20130101')], 'b': [1]}).iloc[0]
+ expected = Series([Timestamp('20130101'), 1], index=['a', 'b'], name=0)
+ tm.assert_series_equal(result, expected)
+ assert result.dtype == object
+
+ def test_loc_setitem_consistency(self):
+ # GH 6149
+ # coerce similarly for setitem and loc when rows have a null-slice
+ expected = DataFrame({'date': Series(0, index=range(5),
+ dtype=np.int64),
+ 'val': Series(range(5), dtype=np.int64)})
+
+ df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'),
+ 'val': Series(
+ range(5), dtype=np.int64)})
+ df.loc[:, 'date'] = 0
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'),
+ 'val': Series(range(5), dtype=np.int64)})
+ df.loc[:, 'date'] = np.array(0, dtype=np.int64)
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'),
+ 'val': Series(range(5), dtype=np.int64)})
+ df.loc[:, 'date'] = np.array([0, 0, 0, 0, 0], dtype=np.int64)
+ tm.assert_frame_equal(df, expected)
+
+ expected = DataFrame({'date': Series('foo', index=range(5)),
+ 'val': Series(range(5), dtype=np.int64)})
+ df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'),
+ 'val': Series(range(5), dtype=np.int64)})
+ df.loc[:, 'date'] = 'foo'
+ tm.assert_frame_equal(df, expected)
+
+ expected = DataFrame({'date': Series(1.0, index=range(5)),
+ 'val': Series(range(5), dtype=np.int64)})
+ df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'),
+ 'val': Series(range(5), dtype=np.int64)})
+ df.loc[:, 'date'] = 1.0
+ tm.assert_frame_equal(df, expected)
+
+ # GH 15494
+ # setting on frame with single row
+ df = DataFrame({'date': Series([Timestamp('20180101')])})
+ df.loc[:, 'date'] = 'string'
+ expected = DataFrame({'date': Series(['string'])})
+ tm.assert_frame_equal(df, expected)
+
+ def test_loc_setitem_consistency_empty(self):
+ # empty (essentially noops)
+ expected = DataFrame(columns=['x', 'y'])
+ expected['x'] = expected['x'].astype(np.int64)
+ df = DataFrame(columns=['x', 'y'])
+ df.loc[:, 'x'] = 1
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame(columns=['x', 'y'])
+ df['x'] = 1
+ tm.assert_frame_equal(df, expected)
+
+ def test_loc_setitem_consistency_slice_column_len(self):
+        # .loc[:, column] setting where the assigned values span the
+        # full length of the column
+ # GH10408
+ data = """Level_0,,,Respondent,Respondent,Respondent,OtherCat,OtherCat
+Level_1,,,Something,StartDate,EndDate,Yes/No,SomethingElse
+Region,Site,RespondentID,,,,,
+Region_1,Site_1,3987227376,A,5/25/2015 10:59,5/25/2015 11:22,Yes,
+Region_1,Site_1,3980680971,A,5/21/2015 9:40,5/21/2015 9:52,Yes,Yes
+Region_1,Site_2,3977723249,A,5/20/2015 8:27,5/20/2015 8:41,Yes,
+Region_1,Site_2,3977723089,A,5/20/2015 8:33,5/20/2015 9:09,Yes,No"""
+
+ df = pd.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1, 2])
+        df.loc[:, ('Respondent', 'StartDate')] = pd.to_datetime(
+            df.loc[:, ('Respondent', 'StartDate')])
+        df.loc[:, ('Respondent', 'EndDate')] = pd.to_datetime(
+            df.loc[:, ('Respondent', 'EndDate')])
+        df.loc[:, ('Respondent', 'Duration')] = (
+            df.loc[:, ('Respondent', 'EndDate')] -
+            df.loc[:, ('Respondent', 'StartDate')])
+
+        df.loc[:, ('Respondent', 'Duration')] = df.loc[
+            :, ('Respondent', 'Duration')].astype('timedelta64[s]')
+ expected = Series([1380, 720, 840, 2160.], index=df.index,
+ name=('Respondent', 'Duration'))
+ tm.assert_series_equal(df[('Respondent', 'Duration')], expected)
+
+ def test_loc_setitem_frame(self):
+ df = self.frame_labels
+
+ result = df.iloc[0, 0]
+
+ df.loc['a', 'A'] = 1
+ result = df.loc['a', 'A']
+ assert result == 1
+
+ result = df.iloc[0, 0]
+ assert result == 1
+
+ df.loc[:, 'B':'D'] = 0
+ expected = df.loc[:, 'B':'D']
+ result = df.iloc[:, 1:]
+ tm.assert_frame_equal(result, expected)
+
+ # GH 6254
+ # setting issue
+ df = DataFrame(index=[3, 5, 4], columns=['A'])
+ df.loc[[4, 3, 5], 'A'] = np.array([1, 2, 3], dtype='int64')
+        expected = DataFrame(
+            {'A': Series([1, 2, 3], index=[4, 3, 5])}).reindex(
+                index=[3, 5, 4])
+ tm.assert_frame_equal(df, expected)
+
+ # GH 6252
+ # setting with an empty frame
+ keys1 = ['@' + str(i) for i in range(5)]
+ val1 = np.arange(5, dtype='int64')
+
+ keys2 = ['@' + str(i) for i in range(4)]
+ val2 = np.arange(4, dtype='int64')
+
+ index = list(set(keys1).union(keys2))
+ df = DataFrame(index=index)
+ df['A'] = np.nan
+ df.loc[keys1, 'A'] = val1
+
+ df['B'] = np.nan
+ df.loc[keys2, 'B'] = val2
+
+        expected = DataFrame(
+            {'A': Series(val1, index=keys1),
+             'B': Series(val2, index=keys2)}).reindex(index=index)
+ tm.assert_frame_equal(df, expected)
+
+ # GH 8669
+ # invalid coercion of nan -> int
+ df = DataFrame({'A': [1, 2, 3], 'B': np.nan})
+ df.loc[df.B > df.A, 'B'] = df.A
+ expected = DataFrame({'A': [1, 2, 3], 'B': np.nan})
+ tm.assert_frame_equal(df, expected)
+
+ # GH 6546
+ # setting with mixed labels
+ df = DataFrame({1: [1, 2], 2: [3, 4], 'a': ['a', 'b']})
+
+ result = df.loc[0, [1, 2]]
+ expected = Series([1, 3], index=[1, 2], dtype=object, name=0)
+ tm.assert_series_equal(result, expected)
+
+ expected = DataFrame({1: [5, 2], 2: [6, 4], 'a': ['a', 'b']})
+ df.loc[0, [1, 2]] = [5, 6]
+ tm.assert_frame_equal(df, expected)
+
+ def test_loc_setitem_frame_multiples(self):
+ # multiple setting
+ df = DataFrame({'A': ['foo', 'bar', 'baz'],
+ 'B': Series(
+ range(3), dtype=np.int64)})
+ rhs = df.loc[1:2]
+ rhs.index = df.index[0:2]
+ df.loc[0:1] = rhs
+ expected = DataFrame({'A': ['bar', 'baz', 'baz'],
+ 'B': Series(
+ [1, 2, 2], dtype=np.int64)})
+ tm.assert_frame_equal(df, expected)
+
+ # multiple setting with frame on rhs (with M8)
+ df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'),
+ 'val': Series(
+ range(5), dtype=np.int64)})
+ expected = DataFrame({'date': [Timestamp('20000101'), Timestamp(
+ '20000102'), Timestamp('20000101'), Timestamp('20000102'),
+ Timestamp('20000103')],
+ 'val': Series(
+ [0, 1, 0, 1, 2], dtype=np.int64)})
+ rhs = df.loc[0:2]
+ rhs.index = df.index[2:5]
+ df.loc[2:4] = rhs
+ tm.assert_frame_equal(df, expected)
+
+ @pytest.mark.parametrize(
+ 'indexer', [['A'], slice(None, 'A', None), np.array(['A'])])
+ @pytest.mark.parametrize(
+ 'value', [['Z'], np.array(['Z'])])
+ def test_loc_setitem_with_scalar_index(self, indexer, value):
+ # GH #19474
+        # assigning like "df.loc[0, ['A']] = ['Z']" should be evaluated
+        # element-wise, not via "setter('A', ['Z'])".
+
+ df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
+ df.loc[0, indexer] = value
+ result = df.loc[0, 'A']
+
+ assert is_scalar(result) and result == 'Z'
+
+    def test_loc_coercion(self):
+
+ # 12411
+ df = DataFrame({'date': [Timestamp('20130101').tz_localize('UTC'),
+ pd.NaT]})
+ expected = df.dtypes
+
+ result = df.iloc[[0]]
+ tm.assert_series_equal(result.dtypes, expected)
+
+ result = df.iloc[[1]]
+ tm.assert_series_equal(result.dtypes, expected)
+
+ # 12045
+ import datetime
+ df = DataFrame({'date': [datetime.datetime(2012, 1, 1),
+ datetime.datetime(1012, 1, 2)]})
+ expected = df.dtypes
+
+ result = df.iloc[[0]]
+ tm.assert_series_equal(result.dtypes, expected)
+
+ result = df.iloc[[1]]
+ tm.assert_series_equal(result.dtypes, expected)
+
+ # 11594
+ df = DataFrame({'text': ['some words'] + [None] * 9})
+ expected = df.dtypes
+
+ result = df.iloc[0:2]
+ tm.assert_series_equal(result.dtypes, expected)
+
+ result = df.iloc[3:]
+ tm.assert_series_equal(result.dtypes, expected)
+
+ def test_loc_non_unique(self):
+ # GH3659
+ # non-unique indexer with loc slice
+ # https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs
+
+        # these are going to raise because we are non-monotonic
+ df = DataFrame({'A': [1, 2, 3, 4, 5, 6],
+ 'B': [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3])
+ pytest.raises(KeyError, df.loc.__getitem__,
+ tuple([slice(1, None)]))
+ pytest.raises(KeyError, df.loc.__getitem__,
+ tuple([slice(0, None)]))
+ pytest.raises(KeyError, df.loc.__getitem__, tuple([slice(1, 2)]))
+
+ # monotonic are ok
+ df = DataFrame({'A': [1, 2, 3, 4, 5, 6],
+ 'B': [3, 4, 5, 6, 7, 8]},
+ index=[0, 1, 0, 1, 2, 3]).sort_index(axis=0)
+ result = df.loc[1:]
+ expected = DataFrame({'A': [2, 4, 5, 6], 'B': [4, 6, 7, 8]},
+ index=[1, 1, 2, 3])
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[0:]
+ tm.assert_frame_equal(result, df)
+
+ result = df.loc[1:2]
+ expected = DataFrame({'A': [2, 4, 5], 'B': [4, 6, 7]},
+ index=[1, 1, 2])
+ tm.assert_frame_equal(result, expected)
+
+ def test_loc_non_unique_memory_error(self):
+
+ # GH 4280
+        # a non-unique index with a large selection used to trigger a
+        # memory error
+
+ columns = list('ABCDEFG')
+
+        def gen_test(nrows, ndups):
+            return pd.concat([
+                DataFrame(np.random.randn(nrows, len(columns)),
+                          index=lrange(nrows), columns=columns),
+                DataFrame(np.ones((ndups, len(columns))),
+                          index=[0] * ndups, columns=columns)])
+
+ def gen_expected(df, mask):
+ len_mask = len(mask)
+ return pd.concat([df.take([0]),
+ DataFrame(np.ones((len_mask, len(columns))),
+ index=[0] * len_mask,
+ columns=columns),
+ df.take(mask[1:])])
+
+ df = gen_test(900, 100)
+ assert df.index.is_unique is False
+
+ mask = np.arange(100)
+ result = df.loc[mask]
+ expected = gen_expected(df, mask)
+ tm.assert_frame_equal(result, expected)
+
+ df = gen_test(900000, 100000)
+ assert df.index.is_unique is False
+
+ mask = np.arange(100000)
+ result = df.loc[mask]
+ expected = gen_expected(df, mask)
+ tm.assert_frame_equal(result, expected)
+
+ def test_loc_name(self):
+ # GH 3880
+ df = DataFrame([[1, 1], [1, 1]])
+ df.index.name = 'index_name'
+ result = df.iloc[[0, 1]].index.name
+ assert result == 'index_name'
+
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\n.ix", DeprecationWarning)
+ result = df.ix[[0, 1]].index.name
+ assert result == 'index_name'
+
+ result = df.loc[[0, 1]].index.name
+ assert result == 'index_name'
+
+ def test_loc_empty_list_indexer_is_ok(self):
+ from pandas.util.testing import makeCustomDataframe as mkdf
+ df = mkdf(5, 2)
+ # vertical empty
+ tm.assert_frame_equal(df.loc[:, []], df.iloc[:, :0],
+ check_index_type=True, check_column_type=True)
+ # horizontal empty
+ tm.assert_frame_equal(df.loc[[], :], df.iloc[:0, :],
+ check_index_type=True, check_column_type=True)
+ # horizontal empty
+ tm.assert_frame_equal(df.loc[[]], df.iloc[:0, :],
+ check_index_type=True,
+ check_column_type=True)
+
+ def test_identity_slice_returns_new_object(self):
+ # GH13873
+ original_df = DataFrame({'a': [1, 2, 3]})
+ sliced_df = original_df.loc[:]
+ assert sliced_df is not original_df
+ assert original_df[:] is not original_df
+
+ # should be a shallow copy
+ original_df['a'] = [4, 4, 4]
+ assert (sliced_df['a'] == 4).all()
+
+ # These should not return copies
+ assert original_df is original_df.loc[:, :]
+ df = DataFrame(np.random.randn(10, 4))
+ assert df[0] is df.loc[:, 0]
+
+ # Same tests for Series
+ original_series = Series([1, 2, 3, 4, 5, 6])
+ sliced_series = original_series.loc[:]
+ assert sliced_series is not original_series
+ assert original_series[:] is not original_series
+
+ original_series[:3] = [7, 8, 9]
+ assert all(sliced_series[:3] == [7, 8, 9])
+
+ def test_loc_uint64(self):
+ # GH20722
+        # Test whether loc accepts the uint64 max value as an index.
+ s = pd.Series([1, 2],
+ index=[np.iinfo('uint64').max - 1,
+ np.iinfo('uint64').max])
+
+ result = s.loc[np.iinfo('uint64').max - 1]
+ expected = s.iloc[0]
+ assert result == expected
+
+ result = s.loc[[np.iinfo('uint64').max - 1]]
+ expected = s.iloc[[0]]
+ tm.assert_series_equal(result, expected)
+
+ result = s.loc[[np.iinfo('uint64').max - 1,
+ np.iinfo('uint64').max]]
+ tm.assert_series_equal(result, s)
+
+ def test_loc_setitem_empty_append(self):
+ # GH6173, various appends to an empty dataframe
+
+ data = [1, 2, 3]
+ expected = DataFrame({'x': data, 'y': [None] * len(data)})
+
+ # appends to fit length of data
+ df = DataFrame(columns=['x', 'y'])
+ df.loc[:, 'x'] = data
+ tm.assert_frame_equal(df, expected)
+
+ # only appends one value
+ expected = DataFrame({'x': [1.0], 'y': [np.nan]})
+ df = DataFrame(columns=['x', 'y'],
+ dtype=np.float)
+ df.loc[0, 'x'] = expected.loc[0, 'x']
+ tm.assert_frame_equal(df, expected)
+
+ def test_loc_setitem_empty_append_raises(self):
+ # GH6173, various appends to an empty dataframe
+
+ data = [1, 2]
+ df = DataFrame(columns=['x', 'y'])
+ msg = (r"None of \[Int64Index\(\[0, 1\], dtype='int64'\)\] "
+ r"are in the \[index\]")
+ with pytest.raises(KeyError, match=msg):
+ df.loc[[0, 1], 'x'] = data
+
+ msg = "cannot copy sequence with size 2 to array axis with dimension 0"
+ with pytest.raises(ValueError, match=msg):
+ df.loc[0:2, 'x'] = data
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_panel.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_panel.py
new file mode 100644
index 00000000000..34708e1148c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_panel.py
@@ -0,0 +1,214 @@
+from warnings import catch_warnings
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Panel, date_range
+from pandas.util import testing as tm
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class TestPanel(object):
+
+ def test_iloc_getitem_panel(self):
+
+ with catch_warnings(record=True):
+ # GH 7189
+ p = Panel(np.arange(4 * 3 * 2).reshape(4, 3, 2),
+ items=['A', 'B', 'C', 'D'],
+ major_axis=['a', 'b', 'c'],
+ minor_axis=['one', 'two'])
+
+ result = p.iloc[1]
+ expected = p.loc['B']
+ tm.assert_frame_equal(result, expected)
+
+ result = p.iloc[1, 1]
+ expected = p.loc['B', 'b']
+ tm.assert_series_equal(result, expected)
+
+ result = p.iloc[1, 1, 1]
+ expected = p.loc['B', 'b', 'two']
+ assert result == expected
+
+ # slice
+ result = p.iloc[1:3]
+ expected = p.loc[['B', 'C']]
+ tm.assert_panel_equal(result, expected)
+
+ result = p.iloc[:, 0:2]
+ expected = p.loc[:, ['a', 'b']]
+ tm.assert_panel_equal(result, expected)
+
+ # list of integers
+ result = p.iloc[[0, 2]]
+ expected = p.loc[['A', 'C']]
+ tm.assert_panel_equal(result, expected)
+
+ # neg indices
+ result = p.iloc[[-1, 1], [-1, 1]]
+ expected = p.loc[['D', 'B'], ['c', 'b']]
+ tm.assert_panel_equal(result, expected)
+
+ # dups indices
+ result = p.iloc[[-1, -1, 1], [-1, 1]]
+ expected = p.loc[['D', 'D', 'B'], ['c', 'b']]
+ tm.assert_panel_equal(result, expected)
+
+ # combined
+ result = p.iloc[0, [True, True], [0, 1]]
+ expected = p.loc['A', ['a', 'b'], ['one', 'two']]
+ tm.assert_frame_equal(result, expected)
+
+ # out-of-bounds exception
+ with pytest.raises(IndexError):
+ p.iloc[tuple([10, 5])]
+
+ with pytest.raises(IndexError):
+ p.iloc[0, [True, True], [0, 1, 2]]
+
+ # trying to use a label
+ with pytest.raises(ValueError):
+ p.iloc[tuple(['j', 'D'])]
+
+ # GH
+ p = Panel(
+ np.random.rand(4, 3, 2), items=['A', 'B', 'C', 'D'],
+ major_axis=['U', 'V', 'W'], minor_axis=['X', 'Y'])
+ expected = p['A']
+
+ result = p.iloc[0, :, :]
+ tm.assert_frame_equal(result, expected)
+
+ result = p.iloc[0, [True, True, True], :]
+ tm.assert_frame_equal(result, expected)
+
+ result = p.iloc[0, [True, True, True], [0, 1]]
+ tm.assert_frame_equal(result, expected)
+
+ with pytest.raises(IndexError):
+ p.iloc[0, [True, True, True], [0, 1, 2]]
+
+ with pytest.raises(IndexError):
+ p.iloc[0, [True, True, True], [2]]
+
+ def test_iloc_panel_issue(self):
+
+ with catch_warnings(record=True):
+ # see gh-3617
+ p = Panel(np.random.randn(4, 4, 4))
+
+ assert p.iloc[:3, :3, :3].shape == (3, 3, 3)
+ assert p.iloc[1, :3, :3].shape == (3, 3)
+ assert p.iloc[:3, 1, :3].shape == (3, 3)
+ assert p.iloc[:3, :3, 1].shape == (3, 3)
+ assert p.iloc[1, 1, :3].shape == (3, )
+ assert p.iloc[1, :3, 1].shape == (3, )
+ assert p.iloc[:3, 1, 1].shape == (3, )
+
+ @pytest.mark.filterwarnings("ignore:\\n.ix:DeprecationWarning")
+ def test_panel_getitem(self):
+
+ with catch_warnings(record=True):
+            # GH4016, date selection returns a frame when using a
+            # partial-string selection
+ ind = date_range(start="2000", freq="D", periods=1000)
+ df = DataFrame(
+ np.random.randn(
+ len(ind), 5), index=ind, columns=list('ABCDE'))
+ panel = Panel({'frame_' + c: df for c in list('ABC')})
+
+ test2 = panel.loc[:, "2002":"2002-12-31"]
+ test1 = panel.loc[:, "2002"]
+ tm.assert_panel_equal(test1, test2)
+
+ # GH8710
+ # multi-element getting with a list
+ panel = tm.makePanel()
+
+ expected = panel.iloc[[0, 1]]
+
+ result = panel.loc[['ItemA', 'ItemB']]
+ tm.assert_panel_equal(result, expected)
+
+ result = panel.loc[['ItemA', 'ItemB'], :, :]
+ tm.assert_panel_equal(result, expected)
+
+ result = panel[['ItemA', 'ItemB']]
+ tm.assert_panel_equal(result, expected)
+
+ result = panel.loc['ItemA':'ItemB']
+ tm.assert_panel_equal(result, expected)
+
+ with catch_warnings(record=True):
+ result = panel.ix[['ItemA', 'ItemB']]
+ tm.assert_panel_equal(result, expected)
+
+ # with an object-like
+ # GH 9140
+ class TestObject(object):
+
+ def __str__(self):
+ return "TestObject"
+
+ obj = TestObject()
+
+ p = Panel(np.random.randn(1, 5, 4), items=[obj],
+ major_axis=date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B', 'C', 'D'])
+
+ expected = p.iloc[0]
+ result = p[obj]
+ tm.assert_frame_equal(result, expected)
+
+ def test_panel_setitem(self):
+
+ with catch_warnings(record=True):
+ # GH 7763
+ # loc and setitem have setting differences
+ np.random.seed(0)
+ index = range(3)
+ columns = list('abc')
+
+ panel = Panel({'A': DataFrame(np.random.randn(3, 3),
+ index=index, columns=columns),
+ 'B': DataFrame(np.random.randn(3, 3),
+ index=index, columns=columns),
+ 'C': DataFrame(np.random.randn(3, 3),
+ index=index, columns=columns)})
+
+ replace = DataFrame(np.eye(3, 3), index=range(3), columns=columns)
+ expected = Panel({'A': replace, 'B': replace, 'C': replace})
+
+ p = panel.copy()
+ for idx in list('ABC'):
+ p[idx] = replace
+ tm.assert_panel_equal(p, expected)
+
+ p = panel.copy()
+ for idx in list('ABC'):
+ p.loc[idx, :, :] = replace
+ tm.assert_panel_equal(p, expected)
+
+ def test_panel_assignment(self):
+
+ with catch_warnings(record=True):
+ # GH3777
+ wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
+ major_axis=date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B', 'C', 'D'])
+ wp2 = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
+ major_axis=date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B', 'C', 'D'])
+
+ # TODO: unused?
+ # expected = wp.loc[['Item1', 'Item2'], :, ['A', 'B']]
+
+ with pytest.raises(NotImplementedError):
+ wp.loc[['Item1', 'Item2'], :, ['A', 'B']] = wp2.loc[
+ ['Item1', 'Item2'], :, ['A', 'B']]
+
+ # to_assign = wp2.loc[['Item1', 'Item2'], :, ['A', 'B']]
+ # wp.loc[['Item1', 'Item2'], :, ['A', 'B']] = to_assign
+ # result = wp.loc[['Item1', 'Item2'], :, ['A', 'B']]
+ # tm.assert_panel_equal(result,expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_partial.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_partial.py
new file mode 100644
index 00000000000..b863afe02c2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_partial.py
@@ -0,0 +1,620 @@
+"""
+test setting *parts* of objects both positionally and label-based
+
+TODO: these should be split among the indexer tests
+"""
+
+from warnings import catch_warnings
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Index, Panel, Series, date_range
+from pandas.util import testing as tm
+
+
+class TestPartialSetting(object):
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ @pytest.mark.filterwarnings("ignore:\\n.ix:DeprecationWarning")
+ def test_partial_setting(self):
+
+ # GH2578, allow ix and friends to partially set
+
+ # series
+ s_orig = Series([1, 2, 3])
+
+ s = s_orig.copy()
+ s[5] = 5
+ expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
+ tm.assert_series_equal(s, expected)
+
+ s = s_orig.copy()
+ s.loc[5] = 5
+ expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
+ tm.assert_series_equal(s, expected)
+
+ s = s_orig.copy()
+ s[5] = 5.
+ expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
+ tm.assert_series_equal(s, expected)
+
+ s = s_orig.copy()
+ s.loc[5] = 5.
+ expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
+ tm.assert_series_equal(s, expected)
+
+ # iloc/iat raise
+ s = s_orig.copy()
+
+ with pytest.raises(IndexError):
+ s.iloc[3] = 5.
+
+ with pytest.raises(IndexError):
+ s.iat[3] = 5.
+
+ # ## frame ##
+
+ df_orig = DataFrame(
+ np.arange(6).reshape(3, 2), columns=['A', 'B'], dtype='int64')
+
+ # iloc/iat raise
+ df = df_orig.copy()
+
+ with pytest.raises(IndexError):
+ df.iloc[4, 2] = 5.
+
+ with pytest.raises(IndexError):
+ df.iat[4, 2] = 5.
+
+ # row setting where it exists
+        expected = DataFrame({'A': [0, 4, 4], 'B': [1, 5, 5]})
+ df = df_orig.copy()
+ df.iloc[1] = df.iloc[2]
+ tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame({'A': [0, 4, 4], 'B': [1, 5, 5]})
+ df = df_orig.copy()
+ df.loc[1] = df.loc[2]
+ tm.assert_frame_equal(df, expected)
+
+ # like 2578, partial setting with dtype preservation
+        expected = DataFrame({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]})
+ df = df_orig.copy()
+ df.loc[3] = df.loc[2]
+ tm.assert_frame_equal(df, expected)
+
+ # single dtype frame, overwrite
+        expected = DataFrame({'A': [0, 2, 4], 'B': [0, 2, 4]})
+ df = df_orig.copy()
+ with catch_warnings(record=True):
+ df.ix[:, 'B'] = df.ix[:, 'A']
+ tm.assert_frame_equal(df, expected)
+
+ # mixed dtype frame, overwrite
+        expected = DataFrame({'A': [0, 2, 4], 'B': Series([0, 2, 4])})
+ df = df_orig.copy()
+ df['B'] = df['B'].astype(np.float64)
+ with catch_warnings(record=True):
+ df.ix[:, 'B'] = df.ix[:, 'A']
+ tm.assert_frame_equal(df, expected)
+
+ # single dtype frame, partial setting
+ expected = df_orig.copy()
+ expected['C'] = df['A']
+ df = df_orig.copy()
+ with catch_warnings(record=True):
+ df.ix[:, 'C'] = df.ix[:, 'A']
+ tm.assert_frame_equal(df, expected)
+
+ # mixed frame, partial setting
+ expected = df_orig.copy()
+ expected['C'] = df['A']
+ df = df_orig.copy()
+ with catch_warnings(record=True):
+ df.ix[:, 'C'] = df.ix[:, 'A']
+ tm.assert_frame_equal(df, expected)
+
+ with catch_warnings(record=True):
+ # ## panel ##
+ # panel setting via item
+ p_orig = Panel(np.arange(16).reshape(2, 4, 2),
+ items=['Item1', 'Item2'],
+ major_axis=pd.date_range('2001/1/12', periods=4),
+ minor_axis=['A', 'B'], dtype='float64')
+ expected = p_orig.copy()
+ expected['Item3'] = expected['Item1']
+ p = p_orig.copy()
+ p.loc['Item3'] = p['Item1']
+ tm.assert_panel_equal(p, expected)
+
+ # panel with aligned series
+ expected = p_orig.copy()
+ expected = expected.transpose(2, 1, 0)
+ expected['C'] = DataFrame({'Item1': [30, 30, 30, 30],
+ 'Item2': [32, 32, 32, 32]},
+ index=p_orig.major_axis)
+ expected = expected.transpose(2, 1, 0)
+ p = p_orig.copy()
+ p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items)
+ tm.assert_panel_equal(p, expected)
+
+ # GH 8473
+ dates = date_range('1/1/2000', periods=8)
+ df_orig = DataFrame(np.random.randn(8, 4), index=dates,
+ columns=['A', 'B', 'C', 'D'])
+
+ expected = pd.concat([df_orig,
+ DataFrame({'A': 7},
+ index=[dates[-1] + dates.freq])],
+ sort=True)
+ df = df_orig.copy()
+ df.loc[dates[-1] + dates.freq, 'A'] = 7
+ tm.assert_frame_equal(df, expected)
+ df = df_orig.copy()
+ df.at[dates[-1] + dates.freq, 'A'] = 7
+ tm.assert_frame_equal(df, expected)
+
+ exp_other = DataFrame({0: 7}, index=[dates[-1] + dates.freq])
+ expected = pd.concat([df_orig, exp_other], axis=1)
+
+ df = df_orig.copy()
+ df.loc[dates[-1] + dates.freq, 0] = 7
+ tm.assert_frame_equal(df, expected)
+ df = df_orig.copy()
+ df.at[dates[-1] + dates.freq, 0] = 7
+ tm.assert_frame_equal(df, expected)
+
+ def test_partial_setting_mixed_dtype(self):
+
+ # in a mixed dtype environment, try to preserve dtypes
+ # by appending
+ df = DataFrame([[True, 1], [False, 2]], columns=["female", "fitness"])
+
+ s = df.loc[1].copy()
+ s.name = 2
+ expected = df.append(s)
+
+ df.loc[2] = df.loc[1]
+ tm.assert_frame_equal(df, expected)
+
+ # columns will align
+ df = DataFrame(columns=['A', 'B'])
+ df.loc[0] = Series(1, index=range(4))
+ tm.assert_frame_equal(df, DataFrame(columns=['A', 'B'], index=[0]))
+
+ # columns will align
+ df = DataFrame(columns=['A', 'B'])
+ df.loc[0] = Series(1, index=['B'])
+
+ exp = DataFrame([[np.nan, 1]], columns=['A', 'B'],
+ index=[0], dtype='float64')
+ tm.assert_frame_equal(df, exp)
+
+ # list-like must conform
+ df = DataFrame(columns=['A', 'B'])
+
+ with pytest.raises(ValueError):
+ df.loc[0] = [1, 2, 3]
+
+ # TODO: #15657, these are left as object and not coerced
+ df = DataFrame(columns=['A', 'B'])
+ df.loc[3] = [6, 7]
+
+ exp = DataFrame([[6, 7]], index=[3], columns=['A', 'B'],
+ dtype='object')
+ tm.assert_frame_equal(df, exp)
+
+ def test_series_partial_set(self):
+ # partial set with new index
+ # Regression from GH4825
+ ser = Series([0.1, 0.2], index=[1, 2])
+
+ # loc equiv to .reindex
+ expected = Series([np.nan, 0.2, np.nan], index=[3, 2, 3])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = ser.loc[[3, 2, 3]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ result = ser.reindex([3, 2, 3])
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, 'x'])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = ser.loc[[3, 2, 3, 'x']]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ result = ser.reindex([3, 2, 3, 'x'])
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ expected = Series([0.2, 0.2, 0.1], index=[2, 2, 1])
+ result = ser.loc[[2, 2, 1]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, 'x', 1])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = ser.loc[[2, 2, 'x', 1]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ result = ser.reindex([2, 2, 'x', 1])
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+        # raises as nothing is in the index
+ pytest.raises(KeyError, lambda: ser.loc[[3, 3, 3]])
+
+ expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = ser.loc[[2, 2, 3]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ result = ser.reindex([2, 2, 3])
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ s = Series([0.1, 0.2, 0.3], index=[1, 2, 3])
+ expected = Series([0.3, np.nan, np.nan], index=[3, 4, 4])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = s.loc[[3, 4, 4]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ result = s.reindex([3, 4, 4])
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ s = Series([0.1, 0.2, 0.3, 0.4],
+ index=[1, 2, 3, 4])
+ expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = s.loc[[5, 3, 3]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ result = s.reindex([5, 3, 3])
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ s = Series([0.1, 0.2, 0.3, 0.4],
+ index=[1, 2, 3, 4])
+ expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = s.loc[[5, 4, 4]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ result = s.reindex([5, 4, 4])
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ s = Series([0.1, 0.2, 0.3, 0.4],
+ index=[4, 5, 6, 7])
+ expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = s.loc[[7, 2, 2]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ result = s.reindex([7, 2, 2])
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ s = Series([0.1, 0.2, 0.3, 0.4],
+ index=[1, 2, 3, 4])
+ expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = s.loc[[4, 5, 5]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ result = s.reindex([4, 5, 5])
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ # iloc
+ expected = Series([0.2, 0.2, 0.1, 0.1], index=[2, 2, 1, 1])
+ result = ser.iloc[[1, 1, 0, 0]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ def test_series_partial_set_with_name(self):
+ # GH 11497
+
+ idx = Index([1, 2], dtype='int64', name='idx')
+ ser = Series([0.1, 0.2], index=idx, name='s')
+
+ # loc
+ exp_idx = Index([3, 2, 3], dtype='int64', name='idx')
+ expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name='s')
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = ser.loc[[3, 2, 3]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ exp_idx = Index([3, 2, 3, 'x'], dtype='object', name='idx')
+ expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx,
+ name='s')
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = ser.loc[[3, 2, 3, 'x']]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ exp_idx = Index([2, 2, 1], dtype='int64', name='idx')
+ expected = Series([0.2, 0.2, 0.1], index=exp_idx, name='s')
+ result = ser.loc[[2, 2, 1]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ exp_idx = Index([2, 2, 'x', 1], dtype='object', name='idx')
+ expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name='s')
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = ser.loc[[2, 2, 'x', 1]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+        # raises as nothing is in the index
+ pytest.raises(KeyError, lambda: ser.loc[[3, 3, 3]])
+
+ exp_idx = Index([2, 2, 3], dtype='int64', name='idx')
+ expected = Series([0.2, 0.2, np.nan], index=exp_idx, name='s')
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = ser.loc[[2, 2, 3]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ exp_idx = Index([3, 4, 4], dtype='int64', name='idx')
+ expected = Series([0.3, np.nan, np.nan], index=exp_idx, name='s')
+ idx = Index([1, 2, 3], dtype='int64', name='idx')
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = Series([0.1, 0.2, 0.3],
+ index=idx,
+ name='s').loc[[3, 4, 4]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ exp_idx = Index([5, 3, 3], dtype='int64', name='idx')
+ expected = Series([np.nan, 0.3, 0.3], index=exp_idx, name='s')
+ idx = Index([1, 2, 3, 4], dtype='int64', name='idx')
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = Series([0.1, 0.2, 0.3, 0.4], index=idx,
+ name='s').loc[[5, 3, 3]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ exp_idx = Index([5, 4, 4], dtype='int64', name='idx')
+ expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name='s')
+ idx = Index([1, 2, 3, 4], dtype='int64', name='idx')
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = Series([0.1, 0.2, 0.3, 0.4], index=idx,
+ name='s').loc[[5, 4, 4]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ exp_idx = Index([7, 2, 2], dtype='int64', name='idx')
+ expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s')
+ idx = Index([4, 5, 6, 7], dtype='int64', name='idx')
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = Series([0.1, 0.2, 0.3, 0.4], index=idx,
+ name='s').loc[[7, 2, 2]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ exp_idx = Index([4, 5, 5], dtype='int64', name='idx')
+ expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s')
+ idx = Index([1, 2, 3, 4], dtype='int64', name='idx')
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = Series([0.1, 0.2, 0.3, 0.4], index=idx,
+ name='s').loc[[4, 5, 5]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ # iloc
+ exp_idx = Index([2, 2, 1, 1], dtype='int64', name='idx')
+ expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name='s')
+ result = ser.iloc[[1, 1, 0, 0]]
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ @pytest.mark.filterwarnings("ignore:\\n.ix")
+ def test_partial_set_invalid(self):
+
+ # GH 4940
+ # allow only setting of 'valid' values
+
+ orig = tm.makeTimeDataFrame()
+ df = orig.copy()
+
+        # don't allow non-string inserts
+ with pytest.raises(TypeError):
+ with catch_warnings(record=True):
+ df.loc[100.0, :] = df.ix[0]
+
+ with pytest.raises(TypeError):
+ with catch_warnings(record=True):
+ df.loc[100, :] = df.ix[0]
+
+ with pytest.raises(TypeError):
+ with catch_warnings(record=True):
+ df.ix[100.0, :] = df.ix[0]
+
+ with pytest.raises(ValueError):
+ with catch_warnings(record=True):
+ df.ix[100, :] = df.ix[0]
+
+ # allow object conversion here
+ df = orig.copy()
+ with catch_warnings(record=True):
+ df.loc['a', :] = df.ix[0]
+ exp = orig.append(Series(df.ix[0], name='a'))
+ tm.assert_frame_equal(df, exp)
+ tm.assert_index_equal(df.index, Index(orig.index.tolist() + ['a']))
+ assert df.index.dtype == 'object'
+
+ def test_partial_set_empty_series(self):
+
+ # GH5226
+
+ # partially set with an empty object series
+ s = Series()
+ s.loc[1] = 1
+ tm.assert_series_equal(s, Series([1], index=[1]))
+ s.loc[3] = 3
+ tm.assert_series_equal(s, Series([1, 3], index=[1, 3]))
+
+ s = Series()
+ s.loc[1] = 1.
+ tm.assert_series_equal(s, Series([1.], index=[1]))
+ s.loc[3] = 3.
+ tm.assert_series_equal(s, Series([1., 3.], index=[1, 3]))
+
+ s = Series()
+ s.loc['foo'] = 1
+ tm.assert_series_equal(s, Series([1], index=['foo']))
+ s.loc['bar'] = 3
+ tm.assert_series_equal(s, Series([1, 3], index=['foo', 'bar']))
+ s.loc[3] = 4
+ tm.assert_series_equal(s, Series([1, 3, 4], index=['foo', 'bar', 3]))
+
+ def test_partial_set_empty_frame(self):
+
+ # partially set with an empty object
+ # frame
+ df = DataFrame()
+
+ with pytest.raises(ValueError):
+ df.loc[1] = 1
+
+ with pytest.raises(ValueError):
+ df.loc[1] = Series([1], index=['foo'])
+
+ with pytest.raises(ValueError):
+ df.loc[:, 1] = 1
+
+ # these work as they don't really change
+ # anything but the index
+ # GH5632
+ expected = DataFrame(columns=['foo'], index=Index([], dtype='int64'))
+
+ def f():
+ df = DataFrame()
+ df['foo'] = Series([], dtype='object')
+ return df
+
+ tm.assert_frame_equal(f(), expected)
+
+ def f():
+ df = DataFrame()
+ df['foo'] = Series(df.index)
+ return df
+
+ tm.assert_frame_equal(f(), expected)
+
+ def f():
+ df = DataFrame()
+ df['foo'] = df.index
+ return df
+
+ tm.assert_frame_equal(f(), expected)
+
+ expected = DataFrame(columns=['foo'], index=Index([], dtype='int64'))
+ expected['foo'] = expected['foo'].astype('float64')
+
+ def f():
+ df = DataFrame()
+ df['foo'] = []
+ return df
+
+ tm.assert_frame_equal(f(), expected)
+
+ def f():
+ df = DataFrame()
+ df['foo'] = Series(np.arange(len(df)), dtype='float64')
+ return df
+
+ tm.assert_frame_equal(f(), expected)
+
+ def f():
+ df = DataFrame()
+ tm.assert_index_equal(df.index, Index([], dtype='object'))
+ df['foo'] = range(len(df))
+ return df
+
+ expected = DataFrame(columns=['foo'], index=Index([], dtype='int64'))
+ expected['foo'] = expected['foo'].astype('float64')
+ tm.assert_frame_equal(f(), expected)
+
+ df = DataFrame()
+ tm.assert_index_equal(df.columns, Index([], dtype=object))
+ df2 = DataFrame()
+ df2[1] = Series([1], index=['foo'])
+ df.loc[:, 1] = Series([1], index=['foo'])
+ tm.assert_frame_equal(df, DataFrame([[1]], index=['foo'], columns=[1]))
+ tm.assert_frame_equal(df, df2)
+
+ # no index to start
+ expected = DataFrame({0: Series(1, index=range(4))},
+ columns=['A', 'B', 0])
+
+ df = DataFrame(columns=['A', 'B'])
+ df[0] = Series(1, index=range(4))
+ df.dtypes
+ str(df)
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame(columns=['A', 'B'])
+ df.loc[:, 0] = Series(1, index=range(4))
+ df.dtypes
+ str(df)
+ tm.assert_frame_equal(df, expected)
+
+ def test_partial_set_empty_frame_row(self):
+ # GH5720, GH5744
+ # don't create rows when empty
+ expected = DataFrame(columns=['A', 'B', 'New'],
+ index=Index([], dtype='int64'))
+ expected['A'] = expected['A'].astype('int64')
+ expected['B'] = expected['B'].astype('float64')
+ expected['New'] = expected['New'].astype('float64')
+
+ df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]})
+ y = df[df.A > 5]
+ y['New'] = np.nan
+ tm.assert_frame_equal(y, expected)
+
+ expected = DataFrame(columns=['a', 'b', 'c c', 'd'])
+ expected['d'] = expected['d'].astype('int64')
+ df = DataFrame(columns=['a', 'b', 'c c'])
+ df['d'] = 3
+ tm.assert_frame_equal(df, expected)
+ tm.assert_series_equal(df['c c'], Series(name='c c', dtype=object))
+
+ # reindex columns is ok
+ df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]})
+ y = df[df.A > 5]
+ result = y.reindex(columns=['A', 'B', 'C'])
+ expected = DataFrame(columns=['A', 'B', 'C'],
+ index=Index([], dtype='int64'))
+ expected['A'] = expected['A'].astype('int64')
+ expected['B'] = expected['B'].astype('float64')
+ expected['C'] = expected['C'].astype('float64')
+ tm.assert_frame_equal(result, expected)
+
+ def test_partial_set_empty_frame_set_series(self):
+ # GH 5756
+ # setting with empty Series
+ df = DataFrame(Series())
+ tm.assert_frame_equal(df, DataFrame({0: Series()}))
+
+ df = DataFrame(Series(name='foo'))
+ tm.assert_frame_equal(df, DataFrame({'foo': Series()}))
+
+ def test_partial_set_empty_frame_empty_copy_assignment(self):
+ # GH 5932
+ # copy on empty with assignment fails
+ df = DataFrame(index=[0])
+ df = df.copy()
+ df['a'] = 0
+ expected = DataFrame(0, index=[0], columns=['a'])
+ tm.assert_frame_equal(df, expected)
+
+ def test_partial_set_empty_frame_empty_consistencies(self):
+ # GH 6171
+ # consistency on empty frames
+ df = DataFrame(columns=['x', 'y'])
+ df['x'] = [1, 2]
+ expected = DataFrame(dict(x=[1, 2], y=[np.nan, np.nan]))
+ tm.assert_frame_equal(df, expected, check_dtype=False)
+
+ df = DataFrame(columns=['x', 'y'])
+ df['x'] = ['1', '2']
+ expected = DataFrame(
+ dict(x=['1', '2'], y=[np.nan, np.nan]), dtype=object)
+ tm.assert_frame_equal(df, expected)
+
+ df = DataFrame(columns=['x', 'y'])
+ df.loc[0, 'x'] = 1
+ expected = DataFrame(dict(x=[1], y=[np.nan]))
+ tm.assert_frame_equal(df, expected, check_dtype=False)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_scalar.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_scalar.py
new file mode 100644
index 00000000000..e4b8181a675
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_scalar.py
@@ -0,0 +1,207 @@
+""" test scalar indexing, including at and iat """
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Series, Timedelta, Timestamp, date_range
+from pandas.tests.indexing.common import Base
+from pandas.util import testing as tm
+
+
+class TestScalar(Base):
+
+ def test_at_and_iat_get(self):
+ def _check(f, func, values=False):
+
+ if f is not None:
+                indices = self.generate_indices(f, values)
+                for i in indices:
+ result = getattr(f, func)[i]
+ expected = self.get_value(f, i, values)
+ tm.assert_almost_equal(result, expected)
+
+ for o in self._objs:
+
+ d = getattr(self, o)
+
+ # iat
+ for f in [d['ints'], d['uints']]:
+ _check(f, 'iat', values=True)
+
+ for f in [d['labels'], d['ts'], d['floats']]:
+ if f is not None:
+ pytest.raises(ValueError, self.check_values, f, 'iat')
+
+ # at
+ for f in [d['ints'], d['uints'], d['labels'],
+ d['ts'], d['floats']]:
+ _check(f, 'at')
+
+ def test_at_and_iat_set(self):
+ def _check(f, func, values=False):
+
+ if f is not None:
+                indices = self.generate_indices(f, values)
+                for i in indices:
+ getattr(f, func)[i] = 1
+ expected = self.get_value(f, i, values)
+ tm.assert_almost_equal(expected, 1)
+
+ for t in self._objs:
+
+ d = getattr(self, t)
+
+ # iat
+ for f in [d['ints'], d['uints']]:
+ _check(f, 'iat', values=True)
+
+ for f in [d['labels'], d['ts'], d['floats']]:
+ if f is not None:
+ pytest.raises(ValueError, _check, f, 'iat')
+
+ # at
+ for f in [d['ints'], d['uints'], d['labels'],
+ d['ts'], d['floats']]:
+ _check(f, 'at')
+
+ def test_at_iat_coercion(self):
+
+        # a Timestamp key is a scalar label, not a tuple!
+ dates = date_range('1/1/2000', periods=8)
+ df = DataFrame(np.random.randn(8, 4),
+ index=dates,
+ columns=['A', 'B', 'C', 'D'])
+ s = df['A']
+
+ result = s.at[dates[5]]
+ xp = s.values[5]
+ assert result == xp
+
+ # GH 7729
+ # make sure we are boxing the returns
+ s = Series(['2014-01-01', '2014-02-02'], dtype='datetime64[ns]')
+ expected = Timestamp('2014-02-02')
+
+ for r in [lambda: s.iat[1], lambda: s.iloc[1]]:
+ result = r()
+ assert result == expected
+
+ s = Series(['1 days', '2 days'], dtype='timedelta64[ns]')
+ expected = Timedelta('2 days')
+
+ for r in [lambda: s.iat[1], lambda: s.iloc[1]]:
+ result = r()
+ assert result == expected
+
+ def test_iat_invalid_args(self):
+ pass
+
+ def test_imethods_with_dups(self):
+
+ # GH6493
+ # iat/iloc with dups
+
+ s = Series(range(5), index=[1, 1, 2, 2, 3], dtype='int64')
+ result = s.iloc[2]
+ assert result == 2
+ result = s.iat[2]
+ assert result == 2
+
+ pytest.raises(IndexError, lambda: s.iat[10])
+ pytest.raises(IndexError, lambda: s.iat[-10])
+
+ result = s.iloc[[2, 3]]
+        expected = Series([2, 3], index=[2, 2], dtype='int64')
+ tm.assert_series_equal(result, expected)
+
+ df = s.to_frame()
+ result = df.iloc[2]
+ expected = Series(2, index=[0], name=2)
+ tm.assert_series_equal(result, expected)
+
+ result = df.iat[2, 0]
+ assert result == 2
+
+ def test_at_to_fail(self):
+ # at should not fallback
+ # GH 7814
+ s = Series([1, 2, 3], index=list('abc'))
+ result = s.at['a']
+ assert result == 1
+ pytest.raises(ValueError, lambda: s.at[0])
+
+ df = DataFrame({'A': [1, 2, 3]}, index=list('abc'))
+ result = df.at['a', 'A']
+ assert result == 1
+ pytest.raises(ValueError, lambda: df.at['a', 0])
+
+ s = Series([1, 2, 3], index=[3, 2, 1])
+ result = s.at[1]
+ assert result == 3
+ pytest.raises(ValueError, lambda: s.at['a'])
+
+ df = DataFrame({0: [1, 2, 3]}, index=[3, 2, 1])
+ result = df.at[1, 0]
+ assert result == 3
+ pytest.raises(ValueError, lambda: df.at['a', 0])
+
+ # GH 13822, incorrect error string with non-unique columns when missing
+ # column is accessed
+ df = DataFrame({'x': [1.], 'y': [2.], 'z': [3.]})
+ df.columns = ['x', 'x', 'z']
+
+ # Check that we get the correct value in the KeyError
+ with pytest.raises(KeyError, match=r"\['y'\] not in index"):
+ df[['x', 'y', 'z']]
+
+ def test_at_with_tz(self):
+ # gh-15822
+ df = DataFrame({'name': ['John', 'Anderson'],
+ 'date': [Timestamp(2017, 3, 13, 13, 32, 56),
+ Timestamp(2017, 2, 16, 12, 10, 3)]})
+ df['date'] = df['date'].dt.tz_localize('Asia/Shanghai')
+
+ expected = Timestamp('2017-03-13 13:32:56+0800', tz='Asia/Shanghai')
+
+ result = df.loc[0, 'date']
+ assert result == expected
+
+ result = df.at[0, 'date']
+ assert result == expected
+
+ def test_mixed_index_at_iat_loc_iloc_series(self):
+ # GH 19860
+ s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2])
+ for el, item in s.iteritems():
+ assert s.at[el] == s.loc[el] == item
+ for i in range(len(s)):
+ assert s.iat[i] == s.iloc[i] == i + 1
+
+ with pytest.raises(KeyError):
+ s.at[4]
+ with pytest.raises(KeyError):
+ s.loc[4]
+
+ def test_mixed_index_at_iat_loc_iloc_dataframe(self):
+ # GH 19860
+ df = DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]],
+ columns=['a', 'b', 'c', 1, 2])
+ for rowIdx, row in df.iterrows():
+ for el, item in row.iteritems():
+ assert df.at[rowIdx, el] == df.loc[rowIdx, el] == item
+
+ for row in range(2):
+ for i in range(5):
+ assert df.iat[row, i] == df.iloc[row, i] == row * 5 + i
+
+ with pytest.raises(KeyError):
+ df.at[0, 3]
+ with pytest.raises(KeyError):
+ df.loc[0, 3]
+
+ def test_iat_setter_incompatible_assignment(self):
+ # GH 23236
+ result = DataFrame({'a': [0, 1], 'b': [4, 5]})
+ result.iat[0, 0] = None
+ expected = DataFrame({"a": [None, 1], "b": [4, 5]})
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/indexing/test_timedelta.py b/contrib/python/pandas/py2/pandas/tests/indexing/test_timedelta.py
new file mode 100644
index 00000000000..acd8bee3e56
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/indexing/test_timedelta.py
@@ -0,0 +1,97 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.util import testing as tm
+
+
+class TestTimedeltaIndexing(object):
+ def test_boolean_indexing(self):
+ # GH 14946
+ df = pd.DataFrame({'x': range(10)})
+ df.index = pd.to_timedelta(range(10), unit='s')
+ conditions = [df['x'] > 3, df['x'] == 3, df['x'] < 3]
+ expected_data = [[0, 1, 2, 3, 10, 10, 10, 10, 10, 10],
+ [0, 1, 2, 10, 4, 5, 6, 7, 8, 9],
+ [10, 10, 10, 3, 4, 5, 6, 7, 8, 9]]
+ for cond, data in zip(conditions, expected_data):
+ result = df.assign(x=df.mask(cond, 10).astype('int64'))
+ expected = pd.DataFrame(data,
+ index=pd.to_timedelta(range(10), unit='s'),
+ columns=['x'],
+ dtype='int64')
+ tm.assert_frame_equal(expected, result)
+
+ @pytest.mark.parametrize(
+ "indexer, expected",
+ [(0, [20, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+ (slice(4, 8), [0, 1, 2, 3, 20, 20, 20, 20, 8, 9]),
+ ([3, 5], [0, 1, 2, 20, 4, 20, 6, 7, 8, 9])])
+ def test_list_like_indexing(self, indexer, expected):
+ # GH 16637
+ df = pd.DataFrame({'x': range(10)}, dtype="int64")
+ df.index = pd.to_timedelta(range(10), unit='s')
+
+ df.loc[df.index[indexer], 'x'] = 20
+
+ expected = pd.DataFrame(expected,
+ index=pd.to_timedelta(range(10), unit='s'),
+ columns=['x'],
+ dtype="int64")
+
+ tm.assert_frame_equal(expected, df)
+
+ def test_string_indexing(self):
+ # GH 16896
+ df = pd.DataFrame({'x': range(3)},
+ index=pd.to_timedelta(range(3), unit='days'))
+ expected = df.iloc[0]
+ sliced = df.loc['0 days']
+ tm.assert_series_equal(sliced, expected)
+
+ @pytest.mark.parametrize(
+ "value",
+ [None, pd.NaT, np.nan])
+ def test_masked_setitem(self, value):
+ # issue (#18586)
+ series = pd.Series([0, 1, 2], dtype='timedelta64[ns]')
+ series[series == series[0]] = value
+ expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]')
+ tm.assert_series_equal(series, expected)
+
+ @pytest.mark.parametrize(
+ "value",
+ [None, pd.NaT, np.nan])
+ def test_listlike_setitem(self, value):
+ # issue (#18586)
+ series = pd.Series([0, 1, 2], dtype='timedelta64[ns]')
+ series.iloc[0] = value
+ expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]')
+ tm.assert_series_equal(series, expected)
+
+ @pytest.mark.parametrize('start,stop, expected_slice', [
+ [np.timedelta64(0, 'ns'), None, slice(0, 11)],
+ [np.timedelta64(1, 'D'), np.timedelta64(6, 'D'), slice(1, 7)],
+ [None, np.timedelta64(4, 'D'), slice(0, 5)]])
+ def test_numpy_timedelta_scalar_indexing(self, start, stop,
+ expected_slice):
+ # GH 20393
+ s = pd.Series(range(11), pd.timedelta_range('0 days', '10 days'))
+ result = s.loc[slice(start, stop)]
+ expected = s.iloc[expected_slice]
+ tm.assert_series_equal(result, expected)
+
+ def test_roundtrip_thru_setitem(self):
+ # PR 23462
+ dt1 = pd.Timedelta(0)
+ dt2 = pd.Timedelta(28767471428571405)
+ df = pd.DataFrame({'dt': pd.Series([dt1, dt2])})
+ df_copy = df.copy()
+ s = pd.Series([dt1])
+
+ expected = df['dt'].iloc[1].value
+ df.loc[[True, False]] = s
+ result = df['dt'].iloc[1].value
+
+ assert expected == result
+ tm.assert_frame_equal(df, df_copy)
diff --git a/contrib/python/pandas/py2/pandas/tests/internals/__init__.py b/contrib/python/pandas/py2/pandas/tests/internals/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/internals/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/internals/test_internals.py b/contrib/python/pandas/py2/pandas/tests/internals/test_internals.py
new file mode 100644
index 00000000000..fe0706efdc4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/internals/test_internals.py
@@ -0,0 +1,1296 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=W0102
+
+from datetime import date, datetime
+from distutils.version import LooseVersion
+import itertools
+import operator
+import re
+import sys
+
+import numpy as np
+import pytest
+
+from pandas._libs.internals import BlockPlacement
+from pandas.compat import OrderedDict, lrange, u, zip
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series,
+ SparseArray)
+import pandas.core.algorithms as algos
+from pandas.core.arrays import DatetimeArray, TimedeltaArray
+from pandas.core.internals import BlockManager, SingleBlockManager, make_block
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal, randn)
+
+# In 3.6.1 a C-API slicing function changed; see src/compat_helper.h.
+PY361 = LooseVersion(sys.version) >= LooseVersion('3.6.1')
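+# Under the old behaviour, e.g., BlockPlacement([2, 1, 0]).as_slice could be
+# slice(2, None, -1); from 3.6.1 onwards that conversion is skipped (see the
+# PY361 guards in TestBlockPlacement below).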
+
+
[email protected]
+def mgr():
+ return create_mgr(
+ 'a: f8; b: object; c: f8; d: object; e: f8;'
+ 'f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;'
+ 'k: M8[ns, US/Eastern]; l: M8[ns, CET];')
+
+
+def assert_block_equal(left, right):
+ tm.assert_numpy_array_equal(left.values, right.values)
+ assert left.dtype == right.dtype
+ assert isinstance(left.mgr_locs, BlockPlacement)
+ assert isinstance(right.mgr_locs, BlockPlacement)
+ tm.assert_numpy_array_equal(left.mgr_locs.as_array,
+ right.mgr_locs.as_array)
+
+
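+# Build an array of the given shape in which entry [i, ...] == i: np.arange
+# is broadcast over the trailing dimensions with zero strides, then copied.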
+def get_numeric_mat(shape):
+ arr = np.arange(shape[0])
+ return np.lib.stride_tricks.as_strided(x=arr, shape=shape, strides=(
+ arr.itemsize, ) + (0, ) * (len(shape) - 1)).copy()
+
+
+N = 10
+
+
+def create_block(typestr, placement, item_shape=None, num_offset=0):
+ """
+ Supported typestr:
+
+ * float, f8, f4, f2
+ * int, i8, i4, i2, i1
+ * uint, u8, u4, u2, u1
+ * complex, c16, c8
+ * bool
+ * object, string, O
+ * datetime, dt, M8[ns], M8[ns, tz]
+ * timedelta, td, m8[ns]
+ * sparse (SparseArray with fill_value=0.0)
+ * sparse_na (SparseArray with fill_value=np.nan)
+ * category, category2
+
+ """
+ placement = BlockPlacement(placement)
+ num_items = len(placement)
+
+ if item_shape is None:
+ item_shape = (N, )
+
+ shape = (num_items, ) + item_shape
+
+ mat = get_numeric_mat(shape)
+
+ if typestr in ('float', 'f8', 'f4', 'f2', 'int', 'i8', 'i4', 'i2', 'i1',
+ 'uint', 'u8', 'u4', 'u2', 'u1'):
+ values = mat.astype(typestr) + num_offset
+ elif typestr in ('complex', 'c16', 'c8'):
+ values = 1.j * (mat.astype(typestr) + num_offset)
+ elif typestr in ('object', 'string', 'O'):
+ values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset],
+ shape)
+ elif typestr in ('b', 'bool', ):
+ values = np.ones(shape, dtype=np.bool_)
+ elif typestr in ('datetime', 'dt', 'M8[ns]'):
+ values = (mat * 1e9).astype('M8[ns]')
+ elif typestr.startswith('M8[ns'):
+ # datetime with tz
+ m = re.search(r'M8\[ns,\s*(\w+\/?\w*)\]', typestr)
+ assert m is not None, "incompatible typestr -> {0}".format(typestr)
+ tz = m.groups()[0]
+ assert num_items == 1, "must have only 1 item for a tz-aware block"
+ values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)
+ elif typestr in ('timedelta', 'td', 'm8[ns]'):
+ values = (mat * 1).astype('m8[ns]')
+ elif typestr in ('category', ):
+ values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
+ elif typestr in ('category2', ):
+ values = Categorical(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'c', 'd'
+ ])
+ elif typestr in ('sparse', 'sparse_na'):
+ # FIXME: doesn't support num_rows != 10
+ assert shape[-1] == 10
+ assert all(s == 1 for s in shape[:-1])
+ if typestr.endswith('_na'):
+ fill_value = np.nan
+ else:
+ fill_value = 0.0
+ values = SparseArray([fill_value, fill_value, 1, 2, 3, fill_value,
+ 4, 5, fill_value, 6], fill_value=fill_value)
+ arr = values.sp_values.view()
+ arr += (num_offset - 1)
+ else:
+ raise ValueError('Unsupported typestr: "%s"' % typestr)
+
+ return make_block(values, placement=placement, ndim=len(shape))
+
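+# A minimal sketch of the helper above (the function name is ours, it is not
+# part of the suite): an f8 block spanning manager locations 0 and 2.
+def _example_create_block():
+    blk = create_block('f8', [0, 2])
+    assert blk.dtype == np.float64
+    assert list(blk.mgr_locs.as_array) == [0, 2]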
+
+def create_single_mgr(typestr, num_rows=None):
+ if num_rows is None:
+ num_rows = N
+
+ return SingleBlockManager(
+ create_block(typestr, placement=slice(0, num_rows), item_shape=()),
+ np.arange(num_rows))
+
+
+def create_mgr(descr, item_shape=None):
+ """
+ Construct BlockManager from string description.
+
+ String description syntax looks similar to np.matrix initializer. It looks
+ like this::
+
+ a,b,c: f8; d,e,f: i8
+
+ Rules are rather simple:
+
+ * see list of supported datatypes in `create_block` method
+ * components are semicolon-separated
+ * each component is `NAME,NAME,NAME: DTYPE_ID`
+ * whitespace around colons & semicolons are removed
+ * components with same DTYPE_ID are combined into single block
+ * to force multiple blocks with same dtype, use '-SUFFIX'::
+
+ 'a:f8-1; b:f8-2; c:f8-foobar'
+
+ """
+ if item_shape is None:
+ item_shape = (N, )
+
+ offset = 0
+ mgr_items = []
+ block_placements = OrderedDict()
+ for d in descr.split(';'):
+ d = d.strip()
+ if not len(d):
+ continue
+ names, blockstr = d.partition(':')[::2]
+ blockstr = blockstr.strip()
+ names = names.strip().split(',')
+
+ mgr_items.extend(names)
+ placement = list(np.arange(len(names)) + offset)
+ try:
+ block_placements[blockstr].extend(placement)
+ except KeyError:
+ block_placements[blockstr] = placement
+ offset += len(names)
+
+ mgr_items = Index(mgr_items)
+
+ blocks = []
+ num_offset = 0
+ for blockstr, placement in block_placements.items():
+ typestr = blockstr.split('-')[0]
+ blocks.append(create_block(typestr,
+ placement,
+ item_shape=item_shape,
+ num_offset=num_offset, ))
+ num_offset += len(placement)
+
+ return BlockManager(sorted(blocks, key=lambda b: b.mgr_locs[0]),
+ [mgr_items] + [np.arange(n) for n in item_shape])
+
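+# A minimal sketch of the description syntax (the function name is ours):
+# components sharing a dtype id consolidate into a single block.
+def _example_create_mgr():
+    mgr = create_mgr('a,b: f8; c,d: i8')
+    assert mgr.nblocks == 2
+    assert list(mgr.items) == ['a', 'b', 'c', 'd']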
+
+class TestBlock(object):
+
+ def setup_method(self, method):
+ # self.fblock = get_float_ex() # a,c,e
+ # self.cblock = get_complex_ex() #
+ # self.oblock = get_obj_ex()
+ # self.bool_block = get_bool_ex()
+ # self.int_block = get_int_ex()
+
+ self.fblock = create_block('float', [0, 2, 4])
+ self.cblock = create_block('complex', [7])
+ self.oblock = create_block('object', [1, 3])
+ self.bool_block = create_block('bool', [5])
+ self.int_block = create_block('int', [6])
+
+ def test_constructor(self):
+ int32block = create_block('i4', [0])
+ assert int32block.dtype == np.int32
+
+ def test_pickle(self):
+ def _check(blk):
+ assert_block_equal(tm.round_trip_pickle(blk), blk)
+
+ _check(self.fblock)
+ _check(self.cblock)
+ _check(self.oblock)
+ _check(self.bool_block)
+
+ def test_mgr_locs(self):
+ assert isinstance(self.fblock.mgr_locs, BlockPlacement)
+ tm.assert_numpy_array_equal(self.fblock.mgr_locs.as_array,
+ np.array([0, 2, 4], dtype=np.int64))
+
+ def test_attrs(self):
+ assert self.fblock.shape == self.fblock.values.shape
+ assert self.fblock.dtype == self.fblock.values.dtype
+ assert len(self.fblock) == len(self.fblock.values)
+
+ def test_merge(self):
+ avals = randn(2, 10)
+ bvals = randn(2, 10)
+
+ ref_cols = Index(['e', 'a', 'b', 'd', 'f'])
+
+ ablock = make_block(avals, ref_cols.get_indexer(['e', 'b']))
+ bblock = make_block(bvals, ref_cols.get_indexer(['a', 'd']))
+ merged = ablock.merge(bblock)
+ tm.assert_numpy_array_equal(merged.mgr_locs.as_array,
+ np.array([0, 1, 2, 3], dtype=np.int64))
+ tm.assert_numpy_array_equal(merged.values[[0, 2]], np.array(avals))
+ tm.assert_numpy_array_equal(merged.values[[1, 3]], np.array(bvals))
+
+ # TODO: merge with mixed type?
+
+ def test_copy(self):
+ cop = self.fblock.copy()
+ assert cop is not self.fblock
+ assert_block_equal(self.fblock, cop)
+
+ def test_reindex_index(self):
+ pass
+
+ def test_reindex_cast(self):
+ pass
+
+ def test_insert(self):
+ pass
+
+ def test_delete(self):
+ newb = self.fblock.copy()
+ newb.delete(0)
+ assert isinstance(newb.mgr_locs, BlockPlacement)
+ tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
+ np.array([2, 4], dtype=np.int64))
+ assert (newb.values[0] == 1).all()
+
+ newb = self.fblock.copy()
+ newb.delete(1)
+ assert isinstance(newb.mgr_locs, BlockPlacement)
+ tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
+ np.array([0, 4], dtype=np.int64))
+ assert (newb.values[1] == 2).all()
+
+ newb = self.fblock.copy()
+ newb.delete(2)
+ tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
+ np.array([0, 2], dtype=np.int64))
+ assert (newb.values[1] == 1).all()
+
+ newb = self.fblock.copy()
+ with pytest.raises(Exception):
+ newb.delete(3)
+
+ def test_make_block_same_class(self):
+ # issue 19431
+ block = create_block('M8[ns, US/Eastern]', [3])
+ with tm.assert_produces_warning(DeprecationWarning,
+ check_stacklevel=False):
+ block.make_block_same_class(block.values,
+ dtype=block.values.dtype)
+
+
+class TestDatetimeBlock(object):
+
+ def test_try_coerce_arg(self):
+ block = create_block('datetime', [0])
+
+ # coerce None
+ none_coerced = block._try_coerce_args(block.values, None)[1]
+ assert pd.Timestamp(none_coerced) is pd.NaT
+
+ # coerce different types of date objects
+ vals = (np.datetime64('2010-10-10'), datetime(2010, 10, 10),
+ date(2010, 10, 10))
+ for val in vals:
+ coerced = block._try_coerce_args(block.values, val)[1]
+ assert np.int64 == type(coerced)
+ assert pd.Timestamp('2010-10-10') == pd.Timestamp(coerced)
+
+
+class TestBlockManager(object):
+
+ def test_constructor_corner(self):
+ pass
+
+ def test_attrs(self):
+ mgr = create_mgr('a,b,c: f8-1; d,e,f: f8-2')
+ assert mgr.nblocks == 2
+ assert len(mgr) == 6
+
+ def test_is_mixed_dtype(self):
+ assert not create_mgr('a,b:f8').is_mixed_type
+ assert not create_mgr('a:f8-1; b:f8-2').is_mixed_type
+
+ assert create_mgr('a,b:f8; c,d: f4').is_mixed_type
+ assert create_mgr('a,b:f8; c,d: object').is_mixed_type
+
+ def test_duplicate_ref_loc_failure(self):
+ tmp_mgr = create_mgr('a:bool; a: f8')
+
+ axes, blocks = tmp_mgr.axes, tmp_mgr.blocks
+
+ blocks[0].mgr_locs = np.array([0])
+ blocks[1].mgr_locs = np.array([0])
+
+ # test trying to create block manager with overlapping ref locs
+ with pytest.raises(AssertionError):
+ BlockManager(blocks, axes)
+
+ blocks[0].mgr_locs = np.array([0])
+ blocks[1].mgr_locs = np.array([1])
+ mgr = BlockManager(blocks, axes)
+ mgr.iget(1)
+
+ def test_contains(self, mgr):
+ assert 'a' in mgr
+ assert 'baz' not in mgr
+
+ def test_pickle(self, mgr):
+
+ mgr2 = tm.round_trip_pickle(mgr)
+ assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
+
+ # share ref_items
+ # assert mgr2.blocks[0].ref_items is mgr2.blocks[1].ref_items
+
+ # GH2431
+ assert hasattr(mgr2, "_is_consolidated")
+ assert hasattr(mgr2, "_known_consolidated")
+
+ # reset to False on load
+ assert not mgr2._is_consolidated
+ assert not mgr2._known_consolidated
+
+ def test_non_unique_pickle(self):
+
+ mgr = create_mgr('a,a,a:f8')
+ mgr2 = tm.round_trip_pickle(mgr)
+ assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
+
+ mgr = create_mgr('a: f8; a: i8')
+ mgr2 = tm.round_trip_pickle(mgr)
+ assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
+
+ def test_categorical_block_pickle(self):
+ mgr = create_mgr('a: category')
+ mgr2 = tm.round_trip_pickle(mgr)
+ assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
+
+ smgr = create_single_mgr('category')
+ smgr2 = tm.round_trip_pickle(smgr)
+ assert_series_equal(Series(smgr), Series(smgr2))
+
+ def test_get(self):
+ cols = Index(list('abc'))
+ values = np.random.rand(3, 3)
+ block = make_block(values=values.copy(), placement=np.arange(3))
+ mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)])
+
+ assert_almost_equal(mgr.get('a', fastpath=False), values[0])
+ assert_almost_equal(mgr.get('b', fastpath=False), values[1])
+ assert_almost_equal(mgr.get('c', fastpath=False), values[2])
+ assert_almost_equal(mgr.get('a').internal_values(), values[0])
+ assert_almost_equal(mgr.get('b').internal_values(), values[1])
+ assert_almost_equal(mgr.get('c').internal_values(), values[2])
+
+ def test_set(self):
+ mgr = create_mgr('a,b,c: int', item_shape=(3, ))
+
+ mgr.set('d', np.array(['foo'] * 3))
+ mgr.set('b', np.array(['bar'] * 3))
+ tm.assert_numpy_array_equal(mgr.get('a').internal_values(),
+ np.array([0] * 3))
+ tm.assert_numpy_array_equal(mgr.get('b').internal_values(),
+ np.array(['bar'] * 3, dtype=np.object_))
+ tm.assert_numpy_array_equal(mgr.get('c').internal_values(),
+ np.array([2] * 3))
+ tm.assert_numpy_array_equal(mgr.get('d').internal_values(),
+ np.array(['foo'] * 3, dtype=np.object_))
+
+ def test_set_change_dtype(self, mgr):
+ mgr.set('baz', np.zeros(N, dtype=bool))
+
+ mgr.set('baz', np.repeat('foo', N))
+ assert mgr.get('baz').dtype == np.object_
+
+ mgr2 = mgr.consolidate()
+ mgr2.set('baz', np.repeat('foo', N))
+ assert mgr2.get('baz').dtype == np.object_
+
+ mgr2.set('quux', randn(N).astype(int))
+ assert mgr2.get('quux').dtype == np.int_
+
+ mgr2.set('quux', randn(N))
+ assert mgr2.get('quux').dtype == np.float_
+
+ def test_set_change_dtype_slice(self): # GH8850
+ cols = MultiIndex.from_tuples([('1st', 'a'), ('2nd', 'b'), ('3rd', 'c')
+ ])
+ df = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=cols)
+ df['2nd'] = df['2nd'] * 2.0
+
+ blocks = df._to_dict_of_blocks()
+ assert sorted(blocks.keys()) == ['float64', 'int64']
+ assert_frame_equal(blocks['float64'], DataFrame(
+ [[1.0, 4.0], [4.0, 10.0]], columns=cols[:2]))
+ assert_frame_equal(blocks['int64'], DataFrame(
+ [[3], [6]], columns=cols[2:]))
+
+ def test_copy(self, mgr):
+ cp = mgr.copy(deep=False)
+ for blk, cp_blk in zip(mgr.blocks, cp.blocks):
+
+ # view assertion
+ assert cp_blk.equals(blk)
+ if isinstance(blk.values, np.ndarray):
+ assert cp_blk.values.base is blk.values.base
+ else:
+ # DatetimeTZBlock has DatetimeIndex values
+ assert cp_blk.values._data.base is blk.values._data.base
+
+ cp = mgr.copy(deep=True)
+ for blk, cp_blk in zip(mgr.blocks, cp.blocks):
+
+ # copy assertion: we either have a None base, or for some blocks
+ # (e.g. datetimetz) an array base that was copied
+ assert cp_blk.equals(blk)
+ if not isinstance(cp_blk.values, np.ndarray):
+ assert cp_blk.values._data.base is not blk.values._data.base
+ else:
+ assert cp_blk.values.base is None and blk.values.base is None
+
+ def test_sparse(self):
+ mgr = create_mgr('a: sparse-1; b: sparse-2')
+ # what to test here?
+ assert mgr.as_array().dtype == np.float64
+
+ def test_sparse_mixed(self):
+ mgr = create_mgr('a: sparse-1; b: sparse-2; c: f8')
+ assert len(mgr.blocks) == 3
+ assert isinstance(mgr, BlockManager)
+
+ # what to test here?
+
+ def test_as_array_float(self):
+ mgr = create_mgr('c: f4; d: f2; e: f8')
+ assert mgr.as_array().dtype == np.float64
+
+ mgr = create_mgr('c: f4; d: f2')
+ assert mgr.as_array().dtype == np.float32
+
+ def test_as_array_int_bool(self):
+ mgr = create_mgr('a: bool-1; b: bool-2')
+ assert mgr.as_array().dtype == np.bool_
+
+ mgr = create_mgr('a: i8-1; b: i8-2; c: i4; d: i2; e: u1')
+ assert mgr.as_array().dtype == np.int64
+
+ mgr = create_mgr('c: i4; d: i2; e: u1')
+ assert mgr.as_array().dtype == np.int32
+
+ def test_as_array_datetime(self):
+ mgr = create_mgr('h: datetime-1; g: datetime-2')
+ assert mgr.as_array().dtype == 'M8[ns]'
+
+ def test_as_array_datetime_tz(self):
+ mgr = create_mgr('h: M8[ns, US/Eastern]; g: M8[ns, CET]')
+ assert mgr.get('h').dtype == 'datetime64[ns, US/Eastern]'
+ assert mgr.get('g').dtype == 'datetime64[ns, CET]'
+ assert mgr.as_array().dtype == 'object'
+
+ def test_astype(self):
+ # coerce all
+ mgr = create_mgr('c: f4; d: f2; e: f8')
+ for t in ['float16', 'float32', 'float64', 'int32', 'int64']:
+ t = np.dtype(t)
+ tmgr = mgr.astype(t)
+ assert tmgr.get('c').dtype.type == t
+ assert tmgr.get('d').dtype.type == t
+ assert tmgr.get('e').dtype.type == t
+
+ # mixed
+ mgr = create_mgr('a,b: object; c: bool; d: datetime;'
+ 'e: f4; f: f2; g: f8')
+ for t in ['float16', 'float32', 'float64', 'int32', 'int64']:
+ t = np.dtype(t)
+ tmgr = mgr.astype(t, errors='ignore')
+ assert tmgr.get('c').dtype.type == t
+ assert tmgr.get('e').dtype.type == t
+ assert tmgr.get('f').dtype.type == t
+ assert tmgr.get('g').dtype.type == t
+
+ assert tmgr.get('a').dtype.type == np.object_
+ assert tmgr.get('b').dtype.type == np.object_
+ if t != np.int64:
+ assert tmgr.get('d').dtype.type == np.datetime64
+ else:
+ assert tmgr.get('d').dtype.type == t
+
+ def test_convert(self):
+ def _compare(old_mgr, new_mgr):
+ """ compare the blocks, numeric compare ==, object don't """
+ old_blocks = set(old_mgr.blocks)
+ new_blocks = set(new_mgr.blocks)
+ assert len(old_blocks) == len(new_blocks)
+
+ # compare non-numeric
+ for b in old_blocks:
+ found = False
+ for nb in new_blocks:
+ if (b.values == nb.values).all():
+ found = True
+ break
+ assert found
+
+ for b in new_blocks:
+ found = False
+ for ob in old_blocks:
+ if (b.values == ob.values).all():
+ found = True
+ break
+ assert found
+
+ # noops
+ mgr = create_mgr('f: i8; g: f8')
+ new_mgr = mgr.convert()
+ _compare(mgr, new_mgr)
+
+ mgr = create_mgr('a, b: object; f: i8; g: f8')
+ new_mgr = mgr.convert()
+ _compare(mgr, new_mgr)
+
+ # convert
+ mgr = create_mgr('a,b,foo: object; f: i8; g: f8')
+ mgr.set('a', np.array(['1'] * N, dtype=np.object_))
+ mgr.set('b', np.array(['2.'] * N, dtype=np.object_))
+ mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_))
+ new_mgr = mgr.convert(numeric=True)
+ assert new_mgr.get('a').dtype == np.int64
+ assert new_mgr.get('b').dtype == np.float64
+ assert new_mgr.get('foo').dtype == np.object_
+ assert new_mgr.get('f').dtype == np.int64
+ assert new_mgr.get('g').dtype == np.float64
+
+ mgr = create_mgr('a,b,foo: object; f: i4; bool: bool; dt: datetime;'
+ 'i: i8; g: f8; h: f2')
+ mgr.set('a', np.array(['1'] * N, dtype=np.object_))
+ mgr.set('b', np.array(['2.'] * N, dtype=np.object_))
+ mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_))
+ new_mgr = mgr.convert(numeric=True)
+ assert new_mgr.get('a').dtype == np.int64
+ assert new_mgr.get('b').dtype == np.float64
+ assert new_mgr.get('foo').dtype == np.object_
+ assert new_mgr.get('f').dtype == np.int32
+ assert new_mgr.get('bool').dtype == np.bool_
+ assert new_mgr.get('dt').dtype.type == np.datetime64
+ assert new_mgr.get('i').dtype == np.int64
+ assert new_mgr.get('g').dtype == np.float64
+ assert new_mgr.get('h').dtype == np.float16
+
+ def test_interleave(self):
+
+ # self
+ for dtype in ['f8', 'i8', 'object', 'bool', 'complex', 'M8[ns]',
+ 'm8[ns]']:
+ mgr = create_mgr('a: {0}'.format(dtype))
+ assert mgr.as_array().dtype == dtype
+ mgr = create_mgr('a: {0}; b: {0}'.format(dtype))
+ assert mgr.as_array().dtype == dtype
+
+ # will be converted according to the actual dtype of the underlying
+ mgr = create_mgr('a: category')
+ assert mgr.as_array().dtype == 'i8'
+ mgr = create_mgr('a: category; b: category')
+ assert mgr.as_array().dtype == 'i8'
+ mgr = create_mgr('a: category; b: category2')
+ assert mgr.as_array().dtype == 'object'
+ mgr = create_mgr('a: category2')
+ assert mgr.as_array().dtype == 'object'
+ mgr = create_mgr('a: category2; b: category2')
+ assert mgr.as_array().dtype == 'object'
+
+ # combinations
+ mgr = create_mgr('a: f8')
+ assert mgr.as_array().dtype == 'f8'
+ mgr = create_mgr('a: f8; b: i8')
+ assert mgr.as_array().dtype == 'f8'
+ mgr = create_mgr('a: f4; b: i8')
+ assert mgr.as_array().dtype == 'f8'
+ mgr = create_mgr('a: f4; b: i8; d: object')
+ assert mgr.as_array().dtype == 'object'
+ mgr = create_mgr('a: bool; b: i8')
+ assert mgr.as_array().dtype == 'object'
+ mgr = create_mgr('a: complex')
+ assert mgr.as_array().dtype == 'complex'
+ mgr = create_mgr('a: f8; b: category')
+ assert mgr.as_array().dtype == 'object'
+ mgr = create_mgr('a: M8[ns]; b: category')
+ assert mgr.as_array().dtype == 'object'
+ mgr = create_mgr('a: M8[ns]; b: bool')
+ assert mgr.as_array().dtype == 'object'
+ mgr = create_mgr('a: M8[ns]; b: i8')
+ assert mgr.as_array().dtype == 'object'
+ mgr = create_mgr('a: m8[ns]; b: bool')
+ assert mgr.as_array().dtype == 'object'
+ mgr = create_mgr('a: m8[ns]; b: i8')
+ assert mgr.as_array().dtype == 'object'
+ mgr = create_mgr('a: M8[ns]; b: m8[ns]')
+ assert mgr.as_array().dtype == 'object'
+
+ def test_interleave_non_unique_cols(self):
+ df = DataFrame([
+ [pd.Timestamp('20130101'), 3.5],
+ [pd.Timestamp('20130102'), 4.5]],
+ columns=['x', 'x'],
+ index=[1, 2])
+
+ df_unique = df.copy()
+ df_unique.columns = ['x', 'y']
+ assert df_unique.values.shape == df.values.shape
+ tm.assert_numpy_array_equal(df_unique.values[0], df.values[0])
+ tm.assert_numpy_array_equal(df_unique.values[1], df.values[1])
+
+ def test_consolidate(self):
+ pass
+
+ def test_consolidate_ordering_issues(self, mgr):
+ mgr.set('f', randn(N))
+ mgr.set('d', randn(N))
+ mgr.set('b', randn(N))
+ mgr.set('g', randn(N))
+ mgr.set('h', randn(N))
+
+ # we have datetime/tz blocks in mgr
+ cons = mgr.consolidate()
+ assert cons.nblocks == 4
+ cons = mgr.consolidate().get_numeric_data()
+ assert cons.nblocks == 1
+ assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement)
+ tm.assert_numpy_array_equal(cons.blocks[0].mgr_locs.as_array,
+ np.arange(len(cons.items), dtype=np.int64))
+
+ def test_reindex_index(self):
+ pass
+
+ def test_reindex_items(self):
+ # mgr is not consolidated, f8 & f8-2 blocks
+ mgr = create_mgr('a: f8; b: i8; c: f8; d: i8; e: f8;'
+ 'f: bool; g: f8-2')
+
+ reindexed = mgr.reindex_axis(['g', 'c', 'a', 'd'], axis=0)
+ assert reindexed.nblocks == 2
+ tm.assert_index_equal(reindexed.items, pd.Index(['g', 'c', 'a', 'd']))
+ assert_almost_equal(
+ mgr.get('g', fastpath=False), reindexed.get('g', fastpath=False))
+ assert_almost_equal(
+ mgr.get('c', fastpath=False), reindexed.get('c', fastpath=False))
+ assert_almost_equal(
+ mgr.get('a', fastpath=False), reindexed.get('a', fastpath=False))
+ assert_almost_equal(
+ mgr.get('d', fastpath=False), reindexed.get('d', fastpath=False))
+ assert_almost_equal(
+ mgr.get('g').internal_values(),
+ reindexed.get('g').internal_values())
+ assert_almost_equal(
+ mgr.get('c').internal_values(),
+ reindexed.get('c').internal_values())
+ assert_almost_equal(
+ mgr.get('a').internal_values(),
+ reindexed.get('a').internal_values())
+ assert_almost_equal(
+ mgr.get('d').internal_values(),
+ reindexed.get('d').internal_values())
+
+ def test_multiindex_xs(self):
+ mgr = create_mgr('a,b,c: f8; d,e,f: i8')
+
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
+ 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+
+ mgr.set_axis(1, index)
+ result = mgr.xs('bar', axis=1)
+ assert result.shape == (6, 2)
+ assert result.axes[1][0] == ('bar', 'one')
+ assert result.axes[1][1] == ('bar', 'two')
+
+ def test_get_numeric_data(self):
+ mgr = create_mgr('int: int; float: float; complex: complex;'
+ 'str: object; bool: bool; obj: object; dt: datetime',
+ item_shape=(3, ))
+ mgr.set('obj', np.array([1, 2, 3], dtype=np.object_))
+
+ numeric = mgr.get_numeric_data()
+ tm.assert_index_equal(numeric.items,
+ pd.Index(['int', 'float', 'complex', 'bool']))
+ assert_almost_equal(
+ mgr.get('float', fastpath=False), numeric.get('float',
+ fastpath=False))
+ assert_almost_equal(
+ mgr.get('float').internal_values(),
+ numeric.get('float').internal_values())
+
+ # Check sharing
+ numeric.set('float', np.array([100., 200., 300.]))
+ assert_almost_equal(
+ mgr.get('float', fastpath=False), np.array([100., 200., 300.]))
+ assert_almost_equal(
+ mgr.get('float').internal_values(), np.array([100., 200., 300.]))
+
+ numeric2 = mgr.get_numeric_data(copy=True)
+ tm.assert_index_equal(numeric.items,
+ pd.Index(['int', 'float', 'complex', 'bool']))
+ numeric2.set('float', np.array([1000., 2000., 3000.]))
+ assert_almost_equal(
+ mgr.get('float', fastpath=False), np.array([100., 200., 300.]))
+ assert_almost_equal(
+ mgr.get('float').internal_values(), np.array([100., 200., 300.]))
+
+ def test_get_bool_data(self):
+ mgr = create_mgr('int: int; float: float; complex: complex;'
+ 'str: object; bool: bool; obj: object; dt: datetime',
+ item_shape=(3, ))
+ mgr.set('obj', np.array([True, False, True], dtype=np.object_))
+
+ bools = mgr.get_bool_data()
+ tm.assert_index_equal(bools.items, pd.Index(['bool']))
+ assert_almost_equal(mgr.get('bool', fastpath=False),
+ bools.get('bool', fastpath=False))
+ assert_almost_equal(
+ mgr.get('bool').internal_values(),
+ bools.get('bool').internal_values())
+
+ bools.set('bool', np.array([True, False, True]))
+ tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False),
+ np.array([True, False, True]))
+ tm.assert_numpy_array_equal(mgr.get('bool').internal_values(),
+ np.array([True, False, True]))
+
+ # Check sharing
+ bools2 = mgr.get_bool_data(copy=True)
+ bools2.set('bool', np.array([False, True, False]))
+ tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False),
+ np.array([True, False, True]))
+ tm.assert_numpy_array_equal(mgr.get('bool').internal_values(),
+ np.array([True, False, True]))
+
+ def test_unicode_repr_doesnt_raise(self):
+ repr(create_mgr(u('b,\u05d0: object')))
+
+ def test_missing_unicode_key(self):
+ df = DataFrame({"a": [1]})
+ try:
+ df.loc[:, u("\u05d0")] # should not raise UnicodeEncodeError
+ except KeyError:
+ pass # this is the expected exception
+
+ def test_equals(self):
+ # unique items
+ bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2')
+ bm2 = BlockManager(bm1.blocks[::-1], bm1.axes)
+ assert bm1.equals(bm2)
+
+ bm1 = create_mgr('a,a,a: i8-1; b,b,b: i8-2')
+ bm2 = BlockManager(bm1.blocks[::-1], bm1.axes)
+ assert bm1.equals(bm2)
+
+ def test_equals_block_order_different_dtypes(self):
+ # GH 9330
+
+ mgr_strings = [
+ "a:i8;b:f8", # basic case
+ "a:i8;b:f8;c:c8;d:b", # many types
+ "a:i8;e:dt;f:td;g:string", # more types
+ "a:i8;b:category;c:category2;d:category2", # categories
+ "c:sparse;d:sparse_na;b:f8", # sparse
+ ]
+
+ for mgr_string in mgr_strings:
+ bm = create_mgr(mgr_string)
+ block_perms = itertools.permutations(bm.blocks)
+ for bm_perm in block_perms:
+ bm_this = BlockManager(bm_perm, bm.axes)
+ assert bm.equals(bm_this)
+ assert bm_this.equals(bm)
+
+ def test_single_mgr_ctor(self):
+ mgr = create_single_mgr('f8', num_rows=5)
+ assert mgr.as_array().tolist() == [0., 1., 2., 3., 4.]
+
+ def test_validate_bool_args(self):
+ invalid_values = [1, "True", [1, 2, 3], 5.0]
+ bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2')
+
+ for value in invalid_values:
+ with pytest.raises(ValueError):
+ bm1.replace_list([1], [2], inplace=value)
+
+
+class TestIndexing(object):
+ # Nosetests-style data-driven tests.
+ #
+ # This test applies different indexing routines to block managers and
+ # compares the outcome to the result of same operations on np.ndarray.
+ #
+ # NOTE: sparse (SparseBlock with fill_value != np.nan) fail a lot of tests
+ # and are disabled.
+
+ MANAGERS = [
+ create_single_mgr('f8', N),
+ create_single_mgr('i8', N),
+
+ # 2-dim
+ create_mgr('a,b,c,d,e,f: f8', item_shape=(N,)),
+ create_mgr('a,b,c,d,e,f: i8', item_shape=(N,)),
+ create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N,)),
+ create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N,)),
+
+ # 3-dim
+ create_mgr('a,b,c,d,e,f: f8', item_shape=(N, N)),
+ create_mgr('a,b,c,d,e,f: i8', item_shape=(N, N)),
+ create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N, N)),
+ create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N, N)),
+ ]
+
+ # MANAGERS = [MANAGERS[6]]
+
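+    # A minimal sketch of the pattern described above (method name is ours;
+    # pytest does not collect it): a manager slice must agree with the same
+    # slice of its interleaved ndarray.
+    def _example_roundtrip(self):
+        mgr = self.MANAGERS[0]
+        tm.assert_numpy_array_equal(mgr.as_array()[:3],
+                                    mgr.get_slice(slice(0, 3)).as_array(),
+                                    check_dtype=False)
+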
+ def test_get_slice(self):
+ def assert_slice_ok(mgr, axis, slobj):
+ # import pudb; pudb.set_trace()
+ mat = mgr.as_array()
+
+ # we may be using an ndarray to test slicing and it
+ # might not be the full length of the axis
+ if isinstance(slobj, np.ndarray):
+ ax = mgr.axes[axis]
+ if len(ax) and len(slobj) and len(slobj) != len(ax):
+ slobj = np.concatenate([slobj, np.zeros(
+ len(ax) - len(slobj), dtype=bool)])
+ sliced = mgr.get_slice(slobj, axis=axis)
+ mat_slobj = (slice(None), ) * axis + (slobj, )
+ tm.assert_numpy_array_equal(mat[mat_slobj], sliced.as_array(),
+ check_dtype=False)
+ tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis])
+
+ for mgr in self.MANAGERS:
+ for ax in range(mgr.ndim):
+ # slice
+ assert_slice_ok(mgr, ax, slice(None))
+ assert_slice_ok(mgr, ax, slice(3))
+ assert_slice_ok(mgr, ax, slice(100))
+ assert_slice_ok(mgr, ax, slice(1, 4))
+ assert_slice_ok(mgr, ax, slice(3, 0, -2))
+
+ # boolean mask
+ assert_slice_ok(
+ mgr, ax, np.array([], dtype=np.bool_))
+ assert_slice_ok(
+ mgr, ax,
+ np.ones(mgr.shape[ax], dtype=np.bool_))
+ assert_slice_ok(
+ mgr, ax,
+ np.zeros(mgr.shape[ax], dtype=np.bool_))
+
+ if mgr.shape[ax] >= 3:
+ assert_slice_ok(
+ mgr, ax,
+ np.arange(mgr.shape[ax]) % 3 == 0)
+ assert_slice_ok(
+ mgr, ax, np.array(
+ [True, True, False], dtype=np.bool_))
+
+ # fancy indexer
+ assert_slice_ok(mgr, ax, [])
+ assert_slice_ok(mgr, ax, lrange(mgr.shape[ax]))
+
+ if mgr.shape[ax] >= 3:
+ assert_slice_ok(mgr, ax, [0, 1, 2])
+ assert_slice_ok(mgr, ax, [-1, -2, -3])
+
+ def test_take(self):
+ def assert_take_ok(mgr, axis, indexer):
+ mat = mgr.as_array()
+ taken = mgr.take(indexer, axis)
+ tm.assert_numpy_array_equal(np.take(mat, indexer, axis),
+ taken.as_array(), check_dtype=False)
+ tm.assert_index_equal(mgr.axes[axis].take(indexer),
+ taken.axes[axis])
+
+ for mgr in self.MANAGERS:
+ for ax in range(mgr.ndim):
+ # take/fancy indexer
+ assert_take_ok(mgr, ax, [])
+ assert_take_ok(mgr, ax, [0, 0, 0])
+ assert_take_ok(mgr, ax, lrange(mgr.shape[ax]))
+
+ if mgr.shape[ax] >= 3:
+ assert_take_ok(mgr, ax, [0, 1, 2])
+ assert_take_ok(mgr, ax, [-1, -2, -3])
+
+ def test_reindex_axis(self):
+ def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value):
+ mat = mgr.as_array()
+ indexer = mgr.axes[axis].get_indexer_for(new_labels)
+
+ reindexed = mgr.reindex_axis(new_labels, axis,
+ fill_value=fill_value)
+ tm.assert_numpy_array_equal(algos.take_nd(mat, indexer, axis,
+ fill_value=fill_value),
+ reindexed.as_array(),
+ check_dtype=False)
+ tm.assert_index_equal(reindexed.axes[axis], new_labels)
+
+ for mgr in self.MANAGERS:
+ for ax in range(mgr.ndim):
+ for fill_value in (None, np.nan, 100.):
+ assert_reindex_axis_is_ok(
+ mgr, ax,
+ pd.Index([]), fill_value)
+ assert_reindex_axis_is_ok(
+ mgr, ax, mgr.axes[ax],
+ fill_value)
+ assert_reindex_axis_is_ok(
+ mgr, ax,
+ mgr.axes[ax][[0, 0, 0]], fill_value)
+ assert_reindex_axis_is_ok(
+ mgr, ax,
+ pd.Index(['foo', 'bar', 'baz']), fill_value)
+ assert_reindex_axis_is_ok(
+ mgr, ax,
+ pd.Index(['foo', mgr.axes[ax][0], 'baz']),
+ fill_value)
+
+ if mgr.shape[ax] >= 3:
+ assert_reindex_axis_is_ok(
+ mgr, ax,
+ mgr.axes[ax][:-3], fill_value)
+ assert_reindex_axis_is_ok(
+ mgr, ax,
+ mgr.axes[ax][-3::-1], fill_value)
+ assert_reindex_axis_is_ok(
+ mgr, ax,
+ mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value)
+
+ def test_reindex_indexer(self):
+
+ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer,
+ fill_value):
+ mat = mgr.as_array()
+ reindexed_mat = algos.take_nd(mat, indexer, axis,
+ fill_value=fill_value)
+ reindexed = mgr.reindex_indexer(new_labels, indexer, axis,
+ fill_value=fill_value)
+ tm.assert_numpy_array_equal(reindexed_mat,
+ reindexed.as_array(),
+ check_dtype=False)
+ tm.assert_index_equal(reindexed.axes[axis], new_labels)
+
+ for mgr in self.MANAGERS:
+ for ax in range(mgr.ndim):
+ for fill_value in (None, np.nan, 100.):
+ assert_reindex_indexer_is_ok(
+ mgr, ax,
+ pd.Index([]), [], fill_value)
+ assert_reindex_indexer_is_ok(
+ mgr, ax,
+ mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value)
+ assert_reindex_indexer_is_ok(
+ mgr, ax,
+ pd.Index(['foo'] * mgr.shape[ax]),
+ np.arange(mgr.shape[ax]), fill_value)
+ assert_reindex_indexer_is_ok(
+ mgr, ax,
+ mgr.axes[ax][::-1], np.arange(mgr.shape[ax]),
+ fill_value)
+ assert_reindex_indexer_is_ok(
+ mgr, ax, mgr.axes[ax],
+ np.arange(mgr.shape[ax])[::-1], fill_value)
+ assert_reindex_indexer_is_ok(
+ mgr, ax,
+ pd.Index(['foo', 'bar', 'baz']),
+ [0, 0, 0], fill_value)
+ assert_reindex_indexer_is_ok(
+ mgr, ax,
+ pd.Index(['foo', 'bar', 'baz']),
+ [-1, 0, -1], fill_value)
+ assert_reindex_indexer_is_ok(
+ mgr, ax,
+ pd.Index(['foo', mgr.axes[ax][0], 'baz']),
+ [-1, -1, -1], fill_value)
+
+ if mgr.shape[ax] >= 3:
+ assert_reindex_indexer_is_ok(
+ mgr, ax,
+ pd.Index(['foo', 'bar', 'baz']),
+ [0, 1, 2], fill_value)
+
+ # test_get_slice(slice_like, axis)
+ # take(indexer, axis)
+ # reindex_axis(new_labels, axis)
+ # reindex_indexer(new_labels, indexer, axis)
+
+
+class TestBlockPlacement(object):
+
+ def test_slice_len(self):
+ assert len(BlockPlacement(slice(0, 4))) == 4
+ assert len(BlockPlacement(slice(0, 4, 2))) == 2
+ assert len(BlockPlacement(slice(0, 3, 2))) == 2
+
+ assert len(BlockPlacement(slice(0, 1, 2))) == 1
+ assert len(BlockPlacement(slice(1, 0, -1))) == 1
+
+ def test_zero_step_raises(self):
+ with pytest.raises(ValueError):
+ BlockPlacement(slice(1, 1, 0))
+ with pytest.raises(ValueError):
+ BlockPlacement(slice(1, 2, 0))
+
+ def test_unbounded_slice_raises(self):
+ def assert_unbounded_slice_error(slc):
+ with pytest.raises(ValueError, match="unbounded slice"):
+ BlockPlacement(slc)
+
+ assert_unbounded_slice_error(slice(None, None))
+ assert_unbounded_slice_error(slice(10, None))
+ assert_unbounded_slice_error(slice(None, None, -1))
+ assert_unbounded_slice_error(slice(None, 10, -1))
+
+ # These are "unbounded" because negative index will change depending on
+ # container shape.
+ assert_unbounded_slice_error(slice(-1, None))
+ assert_unbounded_slice_error(slice(None, -1))
+ assert_unbounded_slice_error(slice(-1, -1))
+ assert_unbounded_slice_error(slice(-1, None, -1))
+ assert_unbounded_slice_error(slice(None, -1, -1))
+ assert_unbounded_slice_error(slice(-1, -1, -1))
+
+ def test_not_slice_like_slices(self):
+ def assert_not_slice_like(slc):
+ assert not BlockPlacement(slc).is_slice_like
+
+ assert_not_slice_like(slice(0, 0))
+ assert_not_slice_like(slice(100, 0))
+
+ assert_not_slice_like(slice(100, 100, -1))
+ assert_not_slice_like(slice(0, 100, -1))
+
+ assert not BlockPlacement(slice(0, 0)).is_slice_like
+ assert not BlockPlacement(slice(100, 100)).is_slice_like
+
+ def test_array_to_slice_conversion(self):
+ def assert_as_slice_equals(arr, slc):
+ assert BlockPlacement(arr).as_slice == slc
+
+ assert_as_slice_equals([0], slice(0, 1, 1))
+ assert_as_slice_equals([100], slice(100, 101, 1))
+
+ assert_as_slice_equals([0, 1, 2], slice(0, 3, 1))
+ assert_as_slice_equals([0, 5, 10], slice(0, 15, 5))
+ assert_as_slice_equals([0, 100], slice(0, 200, 100))
+
+ assert_as_slice_equals([2, 1], slice(2, 0, -1))
+
+ if not PY361:
+ assert_as_slice_equals([2, 1, 0], slice(2, None, -1))
+ assert_as_slice_equals([100, 0], slice(100, None, -100))
+
+ def test_not_slice_like_arrays(self):
+ def assert_not_slice_like(arr):
+ assert not BlockPlacement(arr).is_slice_like
+
+ assert_not_slice_like([])
+ assert_not_slice_like([-1])
+ assert_not_slice_like([-1, -2, -3])
+ assert_not_slice_like([-10])
+ assert_not_slice_like([-1])
+ assert_not_slice_like([-1, 0, 1, 2])
+ assert_not_slice_like([-2, 0, 2, 4])
+ assert_not_slice_like([1, 0, -1])
+ assert_not_slice_like([1, 1, 1])
+
+ def test_slice_iter(self):
+ assert list(BlockPlacement(slice(0, 3))) == [0, 1, 2]
+ assert list(BlockPlacement(slice(0, 0))) == []
+ assert list(BlockPlacement(slice(3, 0))) == []
+
+ if not PY361:
+ assert list(BlockPlacement(slice(3, 0, -1))) == [3, 2, 1]
+ assert list(BlockPlacement(slice(3, None, -1))) == [3, 2, 1, 0]
+
+ def test_slice_to_array_conversion(self):
+ def assert_as_array_equals(slc, asarray):
+ tm.assert_numpy_array_equal(
+ BlockPlacement(slc).as_array,
+ np.asarray(asarray, dtype=np.int64))
+
+ assert_as_array_equals(slice(0, 3), [0, 1, 2])
+ assert_as_array_equals(slice(0, 0), [])
+ assert_as_array_equals(slice(3, 0), [])
+
+ assert_as_array_equals(slice(3, 0, -1), [3, 2, 1])
+
+ if not PY361:
+ assert_as_array_equals(slice(3, None, -1), [3, 2, 1, 0])
+ assert_as_array_equals(slice(31, None, -10), [31, 21, 11, 1])
+
+ def test_blockplacement_add(self):
+ bpl = BlockPlacement(slice(0, 5))
+ assert bpl.add(1).as_slice == slice(1, 6, 1)
+ assert bpl.add(np.arange(5)).as_slice == slice(0, 10, 2)
+ assert list(bpl.add(np.arange(5, 0, -1))) == [5, 5, 5, 5, 5]
+
+ def test_blockplacement_add_int(self):
+ def assert_add_equals(val, inc, result):
+ assert list(BlockPlacement(val).add(inc)) == result
+
+ assert_add_equals(slice(0, 0), 0, [])
+ assert_add_equals(slice(1, 4), 0, [1, 2, 3])
+ assert_add_equals(slice(3, 0, -1), 0, [3, 2, 1])
+ assert_add_equals([1, 2, 4], 0, [1, 2, 4])
+
+ assert_add_equals(slice(0, 0), 10, [])
+ assert_add_equals(slice(1, 4), 10, [11, 12, 13])
+ assert_add_equals(slice(3, 0, -1), 10, [13, 12, 11])
+ assert_add_equals([1, 2, 4], 10, [11, 12, 14])
+
+ assert_add_equals(slice(0, 0), -1, [])
+ assert_add_equals(slice(1, 4), -1, [0, 1, 2])
+ assert_add_equals([1, 2, 4], -1, [0, 1, 3])
+
+ with pytest.raises(ValueError):
+ BlockPlacement(slice(1, 4)).add(-10)
+ with pytest.raises(ValueError):
+ BlockPlacement([1, 2, 4]).add(-10)
+
+ if not PY361:
+ assert_add_equals(slice(3, 0, -1), -1, [2, 1, 0])
+ assert_add_equals(slice(2, None, -1), 0, [2, 1, 0])
+ assert_add_equals(slice(2, None, -1), 10, [12, 11, 10])
+
+ with pytest.raises(ValueError):
+ BlockPlacement(slice(2, None, -1)).add(-1)
+
+
+class DummyElement(object):
+ def __init__(self, value, dtype):
+ self.value = value
+ self.dtype = np.dtype(dtype)
+
+ def __array__(self):
+ return np.array(self.value, dtype=self.dtype)
+
+ def __str__(self):
+ return "DummyElement({}, {})".format(self.value, self.dtype)
+
+ def __repr__(self):
+ return str(self)
+
+ def astype(self, dtype, copy=False):
+ self.dtype = dtype
+ return self
+
+ def view(self, dtype):
+ return type(self)(self.value.view(dtype), dtype)
+
+ def any(self, axis=None):
+ return bool(self.value)
+
+
+class TestCanHoldElement(object):
+ @pytest.mark.parametrize('value, dtype', [
+ (1, 'i8'),
+ (1.0, 'f8'),
+ (2**63, 'f8'),
+ (1j, 'complex128'),
+ (2**63, 'complex128'),
+ (True, 'bool'),
+ (np.timedelta64(20, 'ns'), '<m8[ns]'),
+ (np.datetime64(20, 'ns'), '<M8[ns]'),
+ ])
+ @pytest.mark.parametrize('op', [
+ operator.add,
+ operator.sub,
+ operator.mul,
+ operator.truediv,
+ operator.mod,
+ operator.pow,
+ ], ids=lambda x: x.__name__)
+ def test_binop_other(self, op, value, dtype):
+ skip = {(operator.add, 'bool'),
+ (operator.sub, 'bool'),
+ (operator.mul, 'bool'),
+ (operator.truediv, 'bool'),
+ (operator.mod, 'i8'),
+ (operator.mod, 'complex128'),
+ (operator.pow, 'bool')}
+ if (op, dtype) in skip:
+ pytest.skip("Invalid combination {},{}".format(op, dtype))
+
+ e = DummyElement(value, dtype)
+ s = pd.DataFrame({"A": [e.value, e.value]}, dtype=e.dtype)
+
+ invalid = {(operator.pow, '<M8[ns]'),
+ (operator.mod, '<M8[ns]'),
+ (operator.truediv, '<M8[ns]'),
+ (operator.mul, '<M8[ns]'),
+ (operator.add, '<M8[ns]'),
+ (operator.pow, '<m8[ns]'),
+ (operator.mul, '<m8[ns]')}
+
+ if (op, dtype) in invalid:
+ with pytest.raises(TypeError):
+ op(s, e.value)
+ else:
+ # FIXME: Since dispatching to Series, this test no longer
+ # asserts anything meaningful
+ result = op(s, e.value).dtypes
+ expected = op(s, value).dtypes
+ assert_series_equal(result, expected)
+
+
[email protected]('typestr, holder', [
+ ('category', Categorical),
+ ('M8[ns]', DatetimeArray),
+ ('M8[ns, US/Central]', DatetimeArray),
+ ('m8[ns]', TimedeltaArray),
+ ('sparse', SparseArray),
+])
+def test_holder(typestr, holder):
+ blk = create_block(typestr, [1])
+ assert blk._holder is holder
+
+
+def test_deprecated_fastpath():
+ # GH#19265
+ values = np.random.rand(3, 3)
+ with tm.assert_produces_warning(DeprecationWarning,
+ check_stacklevel=False):
+ make_block(values, placement=np.arange(3), fastpath=True)
+
+
+def test_validate_ndim():
+ values = np.array([1.0, 2.0])
+ placement = slice(2)
+ msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]"
+
+ with pytest.raises(ValueError, match=msg):
+ make_block(values, placement, ndim=2)
+
+
+def test_block_shape():
+ idx = pd.Index([0, 1, 2, 3, 4])
+ a = pd.Series([1, 2, 3]).reindex(idx)
+ b = pd.Series(pd.Categorical([1, 2, 3])).reindex(idx)
+
+ assert (a._data.blocks[0].mgr_locs.indexer ==
+ b._data.blocks[0].mgr_locs.indexer)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/__init__.py b/contrib/python/pandas/py2/pandas/tests/io/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/io/conftest.py b/contrib/python/pandas/py2/pandas/tests/io/conftest.py
new file mode 100644
index 00000000000..af6f7ac4ef5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/conftest.py
@@ -0,0 +1,90 @@
+from distutils.version import LooseVersion
+import os
+
+import pytest
+
+import pandas.util.testing as tm
+
+from pandas.io.parsers import read_csv
+
+
[email protected]
+def tips_file(datapath):
+ """Path to the tips dataset"""
+ return datapath('io', 'parser', 'data', 'tips.csv')
+
+
[email protected]
+def jsonl_file(datapath):
+ """Path to a JSONL dataset"""
+ return datapath('io', 'parser', 'data', 'items.jsonl')
+
+
[email protected]
+def salaries_table(datapath):
+ """DataFrame with the salaries dataset"""
+ return read_csv(datapath('io', 'parser', 'data', 'salaries.csv'), sep='\t')
+
+
[email protected]
+def s3_resource(tips_file, jsonl_file):
+ """Fixture for mocking S3 interaction.
+
+ The primary bucket name is "pandas-test". The following datasets
+ are loaded.
+
+ - tips.csv
+ - tips.csv.gz
+ - tips.csv.bz2
+ - items.jsonl
+
+ A private bucket "cant_get_it" is also created. The boto3 s3 resource
+ is yielded by the fixture.
+ """
+ pytest.importorskip('s3fs')
+ boto3 = pytest.importorskip('boto3')
+ botocore = pytest.importorskip('botocore')
+
+ if LooseVersion(botocore.__version__) < LooseVersion("1.11.0"):
+ # botocore leaks an uncatchable ResourceWarning before 1.11.0;
+ # see GH 23731 and https://github.com/boto/botocore/issues/1464
+ pytest.skip("botocore is leaking resources before 1.11.0")
+
+ with tm.ensure_safe_environment_variables():
+ # temporary workaround as moto fails for botocore >= 1.11 otherwise,
+ # see https://github.com/spulec/moto/issues/1924 & 1952
+ os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
+ os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")
+
+ moto = pytest.importorskip('moto')
+
+ test_s3_files = [
+ ('tips.csv', tips_file),
+ ('tips.csv.gz', tips_file + '.gz'),
+ ('tips.csv.bz2', tips_file + '.bz2'),
+ ('items.jsonl', jsonl_file),
+ ]
+
+ def add_tips_files(bucket_name):
+ for s3_key, file_name in test_s3_files:
+ with open(file_name, 'rb') as f:
+ conn.Bucket(bucket_name).put_object(
+ Key=s3_key,
+ Body=f)
+
+ try:
+ s3 = moto.mock_s3()
+ s3.start()
+
+ # see gh-16135
+ bucket = 'pandas-test'
+ conn = boto3.resource("s3", region_name="us-east-1")
+
+ conn.create_bucket(Bucket=bucket)
+ add_tips_files(bucket)
+
+ conn.create_bucket(Bucket='cant_get_it', ACL='private')
+ add_tips_files('cant_get_it')
+ yield conn
+ finally:
+ s3.stop()
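+
+# A minimal sketch of a consumer (hypothetical, not a test in this module):
+# with the fixture active, seeded keys are readable through s3fs, e.g.
+#
+#     def test_tips_from_s3(s3_resource):
+#         df = read_csv('s3://pandas-test/tips.csv')
+#         assert not df.empty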
diff --git a/contrib/python/pandas/py2/pandas/tests/io/data/feather-0_3_1.feather b/contrib/python/pandas/py2/pandas/tests/io/data/feather-0_3_1.feather
new file mode 100644
index 00000000000..5a2c7b3dcc6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/data/feather-0_3_1.feather
Binary files differ
diff --git a/contrib/python/pandas/py2/pandas/tests/io/formats/__init__.py b/contrib/python/pandas/py2/pandas/tests/io/formats/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/formats/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/io/formats/test_console.py b/contrib/python/pandas/py2/pandas/tests/io/formats/test_console.py
new file mode 100644
index 00000000000..a3e0e195f48
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/formats/test_console.py
@@ -0,0 +1,92 @@
+import subprocess # noqa: F401
+
+import pytest
+
+from pandas.io.formats.console import detect_console_encoding
+from pandas.io.formats.terminal import _get_terminal_size_tput
+
+
+class MockEncoding(object): # TODO(py27): replace with mock
+ """
+ Used to add a side effect when accessing the 'encoding' property. If the
+ side effect is a str, that value is returned. Otherwise, the side effect
+ should be an exception that will be raised.
+ """
+ def __init__(self, encoding):
+ super(MockEncoding, self).__init__()
+ self.val = encoding
+
+ @property
+ def encoding(self):
+ return self.raise_or_return(self.val)
+
+ @staticmethod
+ def raise_or_return(val):
+ if isinstance(val, str):
+ return val
+ else:
+ raise val
+
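+# A minimal sketch of the behaviour above (function name is ours): a str
+# value is returned as the encoding, an exception value is raised on access.
+def _example_mock_encoding():
+    assert MockEncoding('utf-8').encoding == 'utf-8'
+    try:
+        MockEncoding(AttributeError).encoding
+    except AttributeError:
+        pass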
+
[email protected]('empty,filled', [
+ ['stdin', 'stdout'],
+ ['stdout', 'stdin']
+])
+def test_detect_console_encoding_from_stdout_stdin(monkeypatch, empty, filled):
+ # Ensures that sys.stdout.encoding or sys.stdin.encoding is used when
+ # they have values filled.
+ # GH 21552
+ with monkeypatch.context() as context:
+ context.setattr('sys.{}'.format(empty), MockEncoding(''))
+ context.setattr('sys.{}'.format(filled), MockEncoding(filled))
+ assert detect_console_encoding() == filled
+
+
[email protected]('encoding', [
+ AttributeError,
+ IOError,
+ 'ascii'
+])
+def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding):
+ # GH 21552
+ with monkeypatch.context() as context:
+ context.setattr('locale.getpreferredencoding', lambda: 'foo')
+ context.setattr('sys.stdout', MockEncoding(encoding))
+ assert detect_console_encoding() == 'foo'
+
+
[email protected]('std,locale', [
+ ['ascii', 'ascii'],
+ ['ascii', Exception],
+ [AttributeError, 'ascii'],
+ [AttributeError, Exception],
+ [IOError, 'ascii'],
+ [IOError, Exception]
+])
+def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale):
+ # When both the stdout/stdin encoding and locale preferred encoding checks
+ # fail (or return 'ascii'), we should default to the sys default encoding.
+ # GH 21552
+ with monkeypatch.context() as context:
+ context.setattr(
+ 'locale.getpreferredencoding',
+ lambda: MockEncoding.raise_or_return(locale)
+ )
+ context.setattr('sys.stdout', MockEncoding(std))
+ context.setattr('sys.getdefaultencoding', lambda: 'sysDefaultEncoding')
+ assert detect_console_encoding() == 'sysDefaultEncoding'
+
+
[email protected]("size", ['', ['']])
+def test_terminal_unknown_dimensions(monkeypatch, size, mocker):
+
+ def communicate(*args, **kwargs):
+ return size
+
+ monkeypatch.setattr('subprocess.Popen', mocker.Mock())
+ monkeypatch.setattr('subprocess.Popen.return_value.returncode', None)
+ monkeypatch.setattr(
+ 'subprocess.Popen.return_value.communicate', communicate)
+ result = _get_terminal_size_tput()
+
+ assert result is None
diff --git a/contrib/python/pandas/py2/pandas/tests/io/formats/test_css.py b/contrib/python/pandas/py2/pandas/tests/io/formats/test_css.py
new file mode 100644
index 00000000000..f251bd98350
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/formats/test_css.py
@@ -0,0 +1,187 @@
+import pytest
+
+from pandas.util import testing as tm
+
+from pandas.io.formats.css import CSSResolver, CSSWarning
+
+
+def assert_resolves(css, props, inherited=None):
+ resolve = CSSResolver()
+ actual = resolve(css, inherited=inherited)
+ assert props == actual
+
+
+def assert_same_resolution(css1, css2, inherited=None):
+ resolve = CSSResolver()
+ resolved1 = resolve(css1, inherited=inherited)
+ resolved2 = resolve(css2, inherited=inherited)
+ assert resolved1 == resolved2
+
+
[email protected]('name,norm,abnorm', [
+ ('whitespace', 'hello: world; foo: bar',
+ ' \t hello \t :\n world \n ; \n foo: \tbar\n\n'),
+ ('case', 'hello: world; foo: bar', 'Hello: WORLD; foO: bar'),
+ ('empty-decl', 'hello: world; foo: bar',
+ '; hello: world;; foo: bar;\n; ;'),
+ ('empty-list', '', ';'),
+])
+def test_css_parse_normalisation(name, norm, abnorm):
+ assert_same_resolution(norm, abnorm)
+
+
[email protected](
+ 'invalid_css,remainder', [
+ # No colon
+ ('hello-world', ''),
+ ('border-style: solid; hello-world', 'border-style: solid'),
+ ('border-style: solid; hello-world; font-weight: bold',
+ 'border-style: solid; font-weight: bold'),
+ # Unclosed string fail
+ # Invalid size
+ ('font-size: blah', 'font-size: 1em'),
+ ('font-size: 1a2b', 'font-size: 1em'),
+ ('font-size: 1e5pt', 'font-size: 1em'),
+ ('font-size: 1+6pt', 'font-size: 1em'),
+ ('font-size: 1unknownunit', 'font-size: 1em'),
+ ('font-size: 10', 'font-size: 1em'),
+ ('font-size: 10 pt', 'font-size: 1em'),
+ ])
+def test_css_parse_invalid(invalid_css, remainder):
+ with tm.assert_produces_warning(CSSWarning):
+ assert_same_resolution(invalid_css, remainder)
+
+ # TODO: we should be checking that in other cases no warnings are raised
+
+
[email protected](
+ 'shorthand,expansions',
+ [('margin', ['margin-top', 'margin-right',
+ 'margin-bottom', 'margin-left']),
+ ('padding', ['padding-top', 'padding-right',
+ 'padding-bottom', 'padding-left']),
+ ('border-width', ['border-top-width', 'border-right-width',
+ 'border-bottom-width', 'border-left-width']),
+ ('border-color', ['border-top-color', 'border-right-color',
+ 'border-bottom-color', 'border-left-color']),
+ ('border-style', ['border-top-style', 'border-right-style',
+ 'border-bottom-style', 'border-left-style']),
+ ])
+def test_css_side_shorthands(shorthand, expansions):
+ top, right, bottom, left = expansions
+
+ assert_resolves('{shorthand}: 1pt'.format(shorthand=shorthand),
+ {top: '1pt', right: '1pt',
+ bottom: '1pt', left: '1pt'})
+
+ assert_resolves('{shorthand}: 1pt 4pt'.format(shorthand=shorthand),
+ {top: '1pt', right: '4pt',
+ bottom: '1pt', left: '4pt'})
+
+ assert_resolves('{shorthand}: 1pt 4pt 2pt'.format(shorthand=shorthand),
+ {top: '1pt', right: '4pt',
+ bottom: '2pt', left: '4pt'})
+
+ assert_resolves('{shorthand}: 1pt 4pt 2pt 0pt'.format(shorthand=shorthand),
+ {top: '1pt', right: '4pt',
+ bottom: '2pt', left: '0pt'})
+
+ with tm.assert_produces_warning(CSSWarning):
+ assert_resolves(
+ '{shorthand}: 1pt 1pt 1pt 1pt 1pt'.format(shorthand=shorthand), {})
+
+
[email protected]('style,inherited,equiv', [
+ ('margin: 1px; margin: 2px', '',
+ 'margin: 2px'),
+ ('margin: 1px', 'margin: 2px',
+ 'margin: 1px'),
+ ('margin: 1px; margin: inherit', 'margin: 2px',
+ 'margin: 2px'),
+ ('margin: 1px; margin-top: 2px', '',
+ 'margin-left: 1px; margin-right: 1px; ' +
+ 'margin-bottom: 1px; margin-top: 2px'),
+ ('margin-top: 2px', 'margin: 1px',
+ 'margin: 1px; margin-top: 2px'),
+ ('margin: 1px', 'margin-top: 2px',
+ 'margin: 1px'),
+ ('margin: 1px; margin-top: inherit', 'margin: 2px',
+ 'margin: 1px; margin-top: 2px'),
+])
+def test_css_precedence(style, inherited, equiv):
+ resolve = CSSResolver()
+ inherited_props = resolve(inherited)
+ style_props = resolve(style, inherited=inherited_props)
+ equiv_props = resolve(equiv)
+ assert style_props == equiv_props
+
+
[email protected]('style,equiv', [
+ ('margin: 1px; margin-top: inherit',
+ 'margin-bottom: 1px; margin-right: 1px; margin-left: 1px'),
+ ('margin-top: inherit', ''),
+ ('margin-top: initial', ''),
+])
+def test_css_none_absent(style, equiv):
+ assert_same_resolution(style, equiv)
+
+
[email protected]('size,resolved', [
+ ('xx-small', '6pt'),
+ ('x-small', '{pt:f}pt'.format(pt=7.5)),
+ ('small', '{pt:f}pt'.format(pt=9.6)),
+ ('medium', '12pt'),
+ ('large', '{pt:f}pt'.format(pt=13.5)),
+ ('x-large', '18pt'),
+ ('xx-large', '24pt'),
+
+ ('8px', '6pt'),
+ ('1.25pc', '15pt'),
+ ('.25in', '18pt'),
+ ('02.54cm', '72pt'),
+ ('25.4mm', '72pt'),
+ ('101.6q', '72pt'),
+ ('101.6q', '72pt'),
+])
[email protected]('relative_to', # invariant to inherited size
+ [None, '16pt'])
+def test_css_absolute_font_size(size, relative_to, resolved):
+ if relative_to is None:
+ inherited = None
+ else:
+ inherited = {'font-size': relative_to}
+ assert_resolves('font-size: {size}'.format(size=size),
+ {'font-size': resolved}, inherited=inherited)
+
+
[email protected]('size,relative_to,resolved', [
+ ('1em', None, '12pt'),
+ ('1.0em', None, '12pt'),
+ ('1.25em', None, '15pt'),
+ ('1em', '16pt', '16pt'),
+ ('1.0em', '16pt', '16pt'),
+ ('1.25em', '16pt', '20pt'),
+ ('1rem', '16pt', '12pt'),
+ ('1.0rem', '16pt', '12pt'),
+ ('1.25rem', '16pt', '15pt'),
+ ('100%', None, '12pt'),
+ ('125%', None, '15pt'),
+ ('100%', '16pt', '16pt'),
+ ('125%', '16pt', '20pt'),
+ ('2ex', None, '12pt'),
+ ('2.0ex', None, '12pt'),
+ ('2.50ex', None, '15pt'),
+ ('inherit', '16pt', '16pt'),
+
+ ('smaller', None, '10pt'),
+ ('smaller', '18pt', '15pt'),
+ ('larger', None, '{pt:f}pt'.format(pt=14.4)),
+ ('larger', '15pt', '18pt'),
+])
+def test_css_relative_font_size(size, relative_to, resolved):
+ if relative_to is None:
+ inherited = None
+ else:
+ inherited = {'font-size': relative_to}
+ assert_resolves('font-size: {size}'.format(size=size),
+ {'font-size': resolved}, inherited=inherited)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/formats/test_eng_formatting.py b/contrib/python/pandas/py2/pandas/tests/io/formats/test_eng_formatting.py
new file mode 100644
index 00000000000..455b6454d73
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/formats/test_eng_formatting.py
@@ -0,0 +1,196 @@
+import numpy as np
+
+from pandas.compat import u
+
+import pandas as pd
+from pandas import DataFrame
+from pandas.util import testing as tm
+
+import pandas.io.formats.format as fmt
+
+
+class TestEngFormatter(object):
+
+ def test_eng_float_formatter(self):
+ df = DataFrame({'A': [1.41, 141., 14100, 1410000.]})
+
+ fmt.set_eng_float_format()
+ result = df.to_string()
+ expected = (' A\n'
+ '0 1.410E+00\n'
+ '1 141.000E+00\n'
+ '2 14.100E+03\n'
+ '3 1.410E+06')
+ assert result == expected
+
+ fmt.set_eng_float_format(use_eng_prefix=True)
+ result = df.to_string()
+ expected = (' A\n'
+ '0 1.410\n'
+ '1 141.000\n'
+ '2 14.100k\n'
+ '3 1.410M')
+ assert result == expected
+
+ fmt.set_eng_float_format(accuracy=0)
+ result = df.to_string()
+ expected = (' A\n'
+ '0 1E+00\n'
+ '1 141E+00\n'
+ '2 14E+03\n'
+ '3 1E+06')
+ assert result == expected
+
+ tm.reset_display_options()
+
+ def compare(self, formatter, input, output):
+ formatted_input = formatter(input)
+ assert formatted_input == output
+
+ def compare_all(self, formatter, in_out):
+ """
+ Parameters
+ ----------
+ formatter: EngFormatter under test
+ in_out: list of tuples. Each tuple = (number, expected_formatting)
+
+ It is tested if 'formatter(number) == expected_formatting'.
+ *number* should be >= 0 because formatter(-number) == fmt is also
+ tested. *fmt* is derived from *expected_formatting*
+ """
+ for input, output in in_out:
+ self.compare(formatter, input, output)
+ self.compare(formatter, -input, "-" + output[1:])
+
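+    # A minimal sketch of the contract above (method name is ours): a single
+    # pair also exercises the negated input, formatter(-1000.0) == '-1.000k'.
+    def _example_compare_all(self):
+        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
+        self.compare_all(formatter, [(1000.0, ' 1.000k')])
+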
+ def test_exponents_with_eng_prefix(self):
+ formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
+ f = np.sqrt(2)
+ in_out = [
+ (f * 10 ** -24, " 1.414y"), (f * 10 ** -23, " 14.142y"),
+ (f * 10 ** -22, " 141.421y"), (f * 10 ** -21, " 1.414z"),
+ (f * 10 ** -20, " 14.142z"), (f * 10 ** -19, " 141.421z"),
+ (f * 10 ** -18, " 1.414a"), (f * 10 ** -17, " 14.142a"),
+ (f * 10 ** -16, " 141.421a"), (f * 10 ** -15, " 1.414f"),
+ (f * 10 ** -14, " 14.142f"), (f * 10 ** -13, " 141.421f"),
+ (f * 10 ** -12, " 1.414p"), (f * 10 ** -11, " 14.142p"),
+ (f * 10 ** -10, " 141.421p"), (f * 10 ** -9, " 1.414n"),
+ (f * 10 ** -8, " 14.142n"), (f * 10 ** -7, " 141.421n"),
+ (f * 10 ** -6, " 1.414u"), (f * 10 ** -5, " 14.142u"),
+ (f * 10 ** -4, " 141.421u"), (f * 10 ** -3, " 1.414m"),
+ (f * 10 ** -2, " 14.142m"), (f * 10 ** -1, " 141.421m"),
+ (f * 10 ** 0, " 1.414"), (f * 10 ** 1, " 14.142"),
+ (f * 10 ** 2, " 141.421"), (f * 10 ** 3, " 1.414k"),
+ (f * 10 ** 4, " 14.142k"), (f * 10 ** 5, " 141.421k"),
+ (f * 10 ** 6, " 1.414M"), (f * 10 ** 7, " 14.142M"),
+ (f * 10 ** 8, " 141.421M"), (f * 10 ** 9, " 1.414G"),
+ (f * 10 ** 10, " 14.142G"), (f * 10 ** 11, " 141.421G"),
+ (f * 10 ** 12, " 1.414T"), (f * 10 ** 13, " 14.142T"),
+ (f * 10 ** 14, " 141.421T"), (f * 10 ** 15, " 1.414P"),
+ (f * 10 ** 16, " 14.142P"), (f * 10 ** 17, " 141.421P"),
+ (f * 10 ** 18, " 1.414E"), (f * 10 ** 19, " 14.142E"),
+ (f * 10 ** 20, " 141.421E"), (f * 10 ** 21, " 1.414Z"),
+ (f * 10 ** 22, " 14.142Z"), (f * 10 ** 23, " 141.421Z"),
+ (f * 10 ** 24, " 1.414Y"), (f * 10 ** 25, " 14.142Y"),
+ (f * 10 ** 26, " 141.421Y")]
+ self.compare_all(formatter, in_out)
+
+ def test_exponents_without_eng_prefix(self):
+ formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False)
+ f = np.pi
+ in_out = [
+ (f * 10 ** -24, " 3.1416E-24"),
+ (f * 10 ** -23, " 31.4159E-24"),
+ (f * 10 ** -22, " 314.1593E-24"),
+ (f * 10 ** -21, " 3.1416E-21"),
+ (f * 10 ** -20, " 31.4159E-21"),
+ (f * 10 ** -19, " 314.1593E-21"),
+ (f * 10 ** -18, " 3.1416E-18"),
+ (f * 10 ** -17, " 31.4159E-18"),
+ (f * 10 ** -16, " 314.1593E-18"),
+ (f * 10 ** -15, " 3.1416E-15"),
+ (f * 10 ** -14, " 31.4159E-15"),
+ (f * 10 ** -13, " 314.1593E-15"),
+ (f * 10 ** -12, " 3.1416E-12"),
+ (f * 10 ** -11, " 31.4159E-12"),
+ (f * 10 ** -10, " 314.1593E-12"),
+ (f * 10 ** -9, " 3.1416E-09"),
+ (f * 10 ** -8, " 31.4159E-09"),
+ (f * 10 ** -7, " 314.1593E-09"),
+ (f * 10 ** -6, " 3.1416E-06"),
+ (f * 10 ** -5, " 31.4159E-06"),
+ (f * 10 ** -4, " 314.1593E-06"),
+ (f * 10 ** -3, " 3.1416E-03"),
+ (f * 10 ** -2, " 31.4159E-03"),
+ (f * 10 ** -1, " 314.1593E-03"),
+ (f * 10 ** 0, " 3.1416E+00"),
+ (f * 10 ** 1, " 31.4159E+00"),
+ (f * 10 ** 2, " 314.1593E+00"),
+ (f * 10 ** 3, " 3.1416E+03"),
+ (f * 10 ** 4, " 31.4159E+03"),
+ (f * 10 ** 5, " 314.1593E+03"),
+ (f * 10 ** 6, " 3.1416E+06"),
+ (f * 10 ** 7, " 31.4159E+06"),
+ (f * 10 ** 8, " 314.1593E+06"),
+ (f * 10 ** 9, " 3.1416E+09"),
+ (f * 10 ** 10, " 31.4159E+09"),
+ (f * 10 ** 11, " 314.1593E+09"),
+ (f * 10 ** 12, " 3.1416E+12"),
+ (f * 10 ** 13, " 31.4159E+12"),
+ (f * 10 ** 14, " 314.1593E+12"),
+ (f * 10 ** 15, " 3.1416E+15"),
+ (f * 10 ** 16, " 31.4159E+15"),
+ (f * 10 ** 17, " 314.1593E+15"),
+ (f * 10 ** 18, " 3.1416E+18"),
+ (f * 10 ** 19, " 31.4159E+18"),
+ (f * 10 ** 20, " 314.1593E+18"),
+ (f * 10 ** 21, " 3.1416E+21"),
+ (f * 10 ** 22, " 31.4159E+21"),
+ (f * 10 ** 23, " 314.1593E+21"),
+ (f * 10 ** 24, " 3.1416E+24"),
+ (f * 10 ** 25, " 31.4159E+24"),
+ (f * 10 ** 26, " 314.1593E+24")]
+ self.compare_all(formatter, in_out)
+
+ def test_rounding(self):
+ formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
+ in_out = [(5.55555, ' 5.556'), (55.5555, ' 55.556'),
+ (555.555, ' 555.555'), (5555.55, ' 5.556k'),
+ (55555.5, ' 55.556k'), (555555, ' 555.555k')]
+ self.compare_all(formatter, in_out)
+
+ formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
+ in_out = [(5.55555, ' 5.6'), (55.5555, ' 55.6'), (555.555, ' 555.6'),
+ (5555.55, ' 5.6k'), (55555.5, ' 55.6k'), (555555, ' 555.6k')]
+ self.compare_all(formatter, in_out)
+
+ formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True)
+ in_out = [(5.55555, ' 6'), (55.5555, ' 56'), (555.555, ' 556'),
+ (5555.55, ' 6k'), (55555.5, ' 56k'), (555555, ' 556k')]
+ self.compare_all(formatter, in_out)
+
+ formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
+ result = formatter(0)
+ assert result == u(' 0.000')
+
+ def test_nan(self):
+ # Issue #11981
+
+ formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
+ result = formatter(np.nan)
+ assert result == u('NaN')
+
+ df = pd.DataFrame({'a': [1.5, 10.3, 20.5],
+ 'b': [50.3, 60.67, 70.12],
+ 'c': [100.2, 101.33, 120.33]})
+ pt = df.pivot_table(values='a', index='b', columns='c')
+ fmt.set_eng_float_format(accuracy=1)
+ result = pt.to_string()
+ assert 'NaN' in result
+ tm.reset_display_options()
+
+ def test_inf(self):
+ # Issue #11981
+
+ formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
+ result = formatter(np.inf)
+ assert result == u('inf')
diff --git a/contrib/python/pandas/py2/pandas/tests/io/formats/test_format.py b/contrib/python/pandas/py2/pandas/tests/io/formats/test_format.py
new file mode 100644
index 00000000000..b0cf5a2f176
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/formats/test_format.py
@@ -0,0 +1,2794 @@
+# -*- coding: utf-8 -*-
+
+"""
+Test output formatting for Series/DataFrame, including to_string & reprs
+"""
+
+from __future__ import print_function
+
+from datetime import datetime
+import itertools
+from operator import methodcaller
+import os
+import re
+import sys
+import textwrap
+import warnings
+
+import dateutil
+import numpy as np
+import pytest
+import pytz
+
+import pandas.compat as compat
+from pandas.compat import (
+ PY3, StringIO, is_platform_32bit, is_platform_windows, lrange, lzip, range,
+ u, zip)
+
+import pandas as pd
+from pandas import (
+ DataFrame, Index, MultiIndex, NaT, Series, Timestamp, date_range, read_csv)
+from pandas.core.config import (
+ get_option, option_context, reset_option, set_option)
+import pandas.util.testing as tm
+
+import pandas.io.formats.format as fmt
+import pandas.io.formats.printing as printing
+from pandas.io.formats.terminal import get_terminal_size
+
+use_32bit_repr = is_platform_windows() or is_platform_32bit()
+
+_frame = DataFrame(tm.getSeriesData())
+
+
+def curpath():
+ pth, _ = os.path.split(os.path.abspath(__file__))
+ return pth
+
+
+def has_info_repr(df):
+ r = repr(df)
+ c1 = r.split('\n')[0].startswith("<class")
+ c2 = r.split('\n')[0].startswith(r"&lt;class") # _repr_html_
+ return c1 or c2
+
+
+def has_non_verbose_info_repr(df):
+ has_info = has_info_repr(df)
+ r = repr(df)
+
+ # 1. <class>
+ # 2. Index
+ # 3. Columns
+ # 4. dtype
+ # 5. memory usage
+ # 6. trailing newline
+ nv = len(r.split('\n')) == 6
+ return has_info and nv
+
+
+def has_horizontally_truncated_repr(df):
+ try: # Check header row
+ fst_line = np.array(repr(df).splitlines()[0].split())
+ cand_col = np.where(fst_line == '...')[0][0]
+ except IndexError:
+ return False
+ # Make sure each row has this ... in the same place
+ r = repr(df)
+ for ix, l in enumerate(r.splitlines()):
+ if not r.split()[cand_col] == '...':
+ return False
+ return True
+
+
+def has_vertically_truncated_repr(df):
+ r = repr(df)
+ only_dot_row = False
+ for row in r.splitlines():
+ if re.match(r'^[\.\ ]+$', row):
+ only_dot_row = True
+ return only_dot_row
+
+
+def has_truncated_repr(df):
+ return has_horizontally_truncated_repr(
+ df) or has_vertically_truncated_repr(df)
+
+
+def has_doubly_truncated_repr(df):
+ return has_horizontally_truncated_repr(
+ df) and has_vertically_truncated_repr(df)
+
+
+def has_expanded_repr(df):
+ r = repr(df)
+ for line in r.split('\n'):
+ if line.endswith('\\'):
+ return True
+ return False
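+
+
+# Illustrative note (an editorial assumption, mirroring test_expand_frame_repr
+# below): these helpers only inspect the repr string. A frame wider than
+# display.width is wrapped with a trailing backslash, which
+# has_expanded_repr() detects:
+#
+#     with option_context('mode.sim_interactive', True, 'display.width', 20,
+#                         'display.expand_frame_repr', True):
+#         assert has_expanded_repr(DataFrame('hello', [0], lrange(10)))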
+
+
+class TestDataFrameFormatting(object):
+
+ def setup_method(self, method):
+ self.warn_filters = warnings.filters
+ warnings.filterwarnings('ignore', category=FutureWarning,
+ module=".*format")
+
+ self.frame = _frame.copy()
+
+ def teardown_method(self, method):
+ warnings.filters = self.warn_filters
+
+ def test_repr_embedded_ndarray(self):
+ arr = np.empty(10, dtype=[('err', object)])
+ for i in range(len(arr)):
+ arr['err'][i] = np.random.randn(i)
+
+ df = DataFrame(arr)
+ repr(df['err'])
+ repr(df)
+ df.to_string()
+
+ def test_eng_float_formatter(self):
+ self.frame.loc[5] = 0
+
+ fmt.set_eng_float_format()
+ repr(self.frame)
+
+ fmt.set_eng_float_format(use_eng_prefix=True)
+ repr(self.frame)
+
+ fmt.set_eng_float_format(accuracy=0)
+ repr(self.frame)
+ tm.reset_display_options()
+
+ def test_show_null_counts(self):
+
+ df = DataFrame(1, columns=range(10), index=range(10))
+ df.iloc[1, 1] = np.nan
+
+ def check(null_counts, result):
+ buf = StringIO()
+ df.info(buf=buf, null_counts=null_counts)
+ assert ('non-null' in buf.getvalue()) is result
+
+ with option_context('display.max_info_rows', 20,
+ 'display.max_info_columns', 20):
+ check(None, True)
+ check(True, True)
+ check(False, False)
+
+ with option_context('display.max_info_rows', 5,
+ 'display.max_info_columns', 5):
+ check(None, False)
+ check(True, False)
+ check(False, False)
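+        # (when null_counts is None, DataFrame.info shows per-column non-null
+        # counts only if the frame fits within display.max_info_rows and
+        # display.max_info_columns, as the two option contexts above show)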
+
+ def test_repr_tuples(self):
+ buf = StringIO()
+
+ df = DataFrame({'tups': lzip(range(10), range(10))})
+ repr(df)
+ df.to_string(col_space=10, buf=buf)
+
+ def test_repr_truncation(self):
+ max_len = 20
+ with option_context("display.max_colwidth", max_len):
+ df = DataFrame({'A': np.random.randn(10),
+ 'B': [tm.rands(np.random.randint(
+ max_len - 1, max_len + 1)) for i in range(10)
+ ]})
+ r = repr(df)
+ r = r[r.find('\n') + 1:]
+
+ adj = fmt._get_adjustment()
+
+ for line, value in lzip(r.split('\n'), df['B']):
+ if adj.len(value) + 1 > max_len:
+ assert '...' in line
+ else:
+ assert '...' not in line
+
+ with option_context("display.max_colwidth", 999999):
+ assert '...' not in repr(df)
+
+ with option_context("display.max_colwidth", max_len + 2):
+ assert '...' not in repr(df)
+
+ def test_repr_chop_threshold(self):
+ df = DataFrame([[0.1, 0.5], [0.5, -0.1]])
+ pd.reset_option("display.chop_threshold") # default None
+ assert repr(df) == ' 0 1\n0 0.1 0.5\n1 0.5 -0.1'
+
+ with option_context("display.chop_threshold", 0.2):
+ assert repr(df) == ' 0 1\n0 0.0 0.5\n1 0.5 0.0'
+
+ with option_context("display.chop_threshold", 0.6):
+ assert repr(df) == ' 0 1\n0 0.0 0.0\n1 0.0 0.0'
+
+ with option_context("display.chop_threshold", None):
+ assert repr(df) == ' 0 1\n0 0.1 0.5\n1 0.5 -0.1'
+
+ def test_repr_chop_threshold_column_below(self):
+ # GH 6839: validation case
+
+ df = pd.DataFrame([[10, 20, 30, 40],
+ [8e-10, -1e-11, 2e-9, -2e-11]]).T
+
+ with option_context("display.chop_threshold", 0):
+ assert repr(df) == (' 0 1\n'
+ '0 10.0 8.000000e-10\n'
+ '1 20.0 -1.000000e-11\n'
+ '2 30.0 2.000000e-09\n'
+ '3 40.0 -2.000000e-11')
+
+ with option_context("display.chop_threshold", 1e-8):
+ assert repr(df) == (' 0 1\n'
+ '0 10.0 0.000000e+00\n'
+ '1 20.0 0.000000e+00\n'
+ '2 30.0 0.000000e+00\n'
+ '3 40.0 0.000000e+00')
+
+ with option_context("display.chop_threshold", 5e-11):
+ assert repr(df) == (' 0 1\n'
+ '0 10.0 8.000000e-10\n'
+ '1 20.0 0.000000e+00\n'
+ '2 30.0 2.000000e-09\n'
+ '3 40.0 0.000000e+00')
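+        # (chop_threshold is applied element-wise: only entries whose
+        # absolute value falls below the threshold display as zero)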
+
+ def test_repr_obeys_max_seq_limit(self):
+ with option_context("display.max_seq_items", 2000):
+ assert len(printing.pprint_thing(lrange(1000))) > 1000
+
+ with option_context("display.max_seq_items", 5):
+ assert len(printing.pprint_thing(lrange(1000))) < 100
+
+ def test_repr_set(self):
+ assert printing.pprint_thing({1}) == '{1}'
+
+ def test_repr_is_valid_construction_code(self):
+        # for the case of Index, where the repr is traditional rather than
+        # stylized
+ idx = Index(['a', 'b'])
+ res = eval("pd." + repr(idx))
+ tm.assert_series_equal(Series(res), Series(idx))
+
+ def test_repr_should_return_str(self):
+ # https://docs.python.org/3/reference/datamodel.html#object.__repr__
+ # "...The return value must be a string object."
+
+ # (str on py2.x, str (unicode) on py3)
+
+ data = [8, 5, 3, 5]
+ index1 = [u("\u03c3"), u("\u03c4"), u("\u03c5"), u("\u03c6")]
+ cols = [u("\u03c8")]
+ df = DataFrame(data, columns=cols, index=index1)
+ assert type(df.__repr__()) == str # both py2 / 3
+
+ def test_repr_no_backslash(self):
+ with option_context('mode.sim_interactive', True):
+ df = DataFrame(np.random.randn(10, 4))
+ assert '\\' not in repr(df)
+
+ def test_expand_frame_repr(self):
+ df_small = DataFrame('hello', [0], [0])
+ df_wide = DataFrame('hello', [0], lrange(10))
+ df_tall = DataFrame('hello', lrange(30), lrange(5))
+
+ with option_context('mode.sim_interactive', True):
+ with option_context('display.max_columns', 10, 'display.width', 20,
+ 'display.max_rows', 20,
+ 'display.show_dimensions', True):
+ with option_context('display.expand_frame_repr', True):
+ assert not has_truncated_repr(df_small)
+ assert not has_expanded_repr(df_small)
+ assert not has_truncated_repr(df_wide)
+ assert has_expanded_repr(df_wide)
+ assert has_vertically_truncated_repr(df_tall)
+ assert has_expanded_repr(df_tall)
+
+ with option_context('display.expand_frame_repr', False):
+ assert not has_truncated_repr(df_small)
+ assert not has_expanded_repr(df_small)
+ assert not has_horizontally_truncated_repr(df_wide)
+ assert not has_expanded_repr(df_wide)
+ assert has_vertically_truncated_repr(df_tall)
+ assert not has_expanded_repr(df_tall)
+
+ def test_repr_non_interactive(self):
+        # in non-interactive mode, there can be no dependency on the
+        # result of terminal auto-size detection
+ df = DataFrame('hello', lrange(1000), lrange(5))
+
+ with option_context('mode.sim_interactive', False, 'display.width', 0,
+ 'display.max_rows', 5000):
+ assert not has_truncated_repr(df)
+ assert not has_expanded_repr(df)
+
+ def test_repr_truncates_terminal_size(self, monkeypatch):
+ # see gh-21180
+
+ terminal_size = (118, 96)
+ monkeypatch.setattr('pandas.io.formats.console.get_terminal_size',
+ lambda: terminal_size)
+ monkeypatch.setattr('pandas.io.formats.format.get_terminal_size',
+ lambda: terminal_size)
+
+ index = range(5)
+ columns = pd.MultiIndex.from_tuples([
+ ('This is a long title with > 37 chars.', 'cat'),
+ ('This is a loooooonger title with > 43 chars.', 'dog'),
+ ])
+ df = pd.DataFrame(1, index=index, columns=columns)
+
+ result = repr(df)
+
+ h1, h2 = result.split('\n')[:2]
+ assert 'long' in h1
+ assert 'loooooonger' in h1
+ assert 'cat' in h2
+ assert 'dog' in h2
+
+ # regular columns
+ df2 = pd.DataFrame({"A" * 41: [1, 2], 'B' * 41: [1, 2]})
+ result = repr(df2)
+
+ assert df2.columns[0] in result.split('\n')[0]
+
+ def test_repr_truncates_terminal_size_full(self, monkeypatch):
+ # GH 22984 ensure entire window is filled
+ terminal_size = (80, 24)
+ df = pd.DataFrame(np.random.rand(1, 7))
+ monkeypatch.setattr('pandas.io.formats.console.get_terminal_size',
+ lambda: terminal_size)
+ monkeypatch.setattr('pandas.io.formats.format.get_terminal_size',
+ lambda: terminal_size)
+ assert "..." not in str(df)
+
+ def test_repr_truncation_column_size(self):
+ # dataframe with last column very wide -> check it is not used to
+ # determine size of truncation (...) column
+ df = pd.DataFrame({'a': [108480, 30830], 'b': [12345, 12345],
+ 'c': [12345, 12345], 'd': [12345, 12345],
+ 'e': ['a' * 50] * 2})
+ assert "..." in str(df)
+ assert " ... " not in str(df)
+
+ def test_repr_max_columns_max_rows(self):
+ term_width, term_height = get_terminal_size()
+ if term_width < 10 or term_height < 10:
+ pytest.skip("terminal size too small, "
+ "{0} x {1}".format(term_width, term_height))
+
+ def mkframe(n):
+ index = ['{i:05d}'.format(i=i) for i in range(n)]
+ return DataFrame(0, index, index)
+
+ df6 = mkframe(6)
+ df10 = mkframe(10)
+ with option_context('mode.sim_interactive', True):
+ with option_context('display.width', term_width * 2):
+ with option_context('display.max_rows', 5,
+ 'display.max_columns', 5):
+ assert not has_expanded_repr(mkframe(4))
+ assert not has_expanded_repr(mkframe(5))
+ assert not has_expanded_repr(df6)
+ assert has_doubly_truncated_repr(df6)
+
+ with option_context('display.max_rows', 20,
+ 'display.max_columns', 10):
+                    # Out of the max_columns boundary, but no expanding
+                    # since the width is not exceeded
+ assert not has_expanded_repr(df6)
+ assert not has_truncated_repr(df6)
+
+ with option_context('display.max_rows', 9,
+ 'display.max_columns', 10):
+                    # exceeding vertical bounds cannot result in expanded repr
+ assert not has_expanded_repr(df10)
+ assert has_vertically_truncated_repr(df10)
+
+ # width=None in terminal, auto detection
+ with option_context('display.max_columns', 100, 'display.max_rows',
+ term_width * 20, 'display.width', None):
+ df = mkframe((term_width // 7) - 2)
+ assert not has_expanded_repr(df)
+ df = mkframe((term_width // 7) + 2)
+ printing.pprint_thing(df._repr_fits_horizontal_())
+ assert has_expanded_repr(df)
+
+ def test_str_max_colwidth(self):
+ # GH 7856
+ df = pd.DataFrame([{'a': 'foo',
+ 'b': 'bar',
+ 'c': 'uncomfortably long line with lots of stuff',
+ 'd': 1}, {'a': 'foo',
+ 'b': 'bar',
+ 'c': 'stuff',
+ 'd': 1}])
+ df.set_index(['a', 'b', 'c'])
+ assert str(df) == (
+ ' a b c d\n'
+ '0 foo bar uncomfortably long line with lots of stuff 1\n'
+ '1 foo bar stuff 1')
+ with option_context('max_colwidth', 20):
+ assert str(df) == (' a b c d\n'
+ '0 foo bar uncomfortably lo... 1\n'
+ '1 foo bar stuff 1')
+
+ def test_auto_detect(self):
+ term_width, term_height = get_terminal_size()
+        fac = 1.05  # arbitrarily large factor to exceed term width
+ cols = range(int(term_width * fac))
+ index = range(10)
+ df = DataFrame(index=index, columns=cols)
+ with option_context('mode.sim_interactive', True):
+ with option_context('max_rows', None):
+ with option_context('max_columns', None):
+ # Wrap around with None
+ assert has_expanded_repr(df)
+ with option_context('max_rows', 0):
+ with option_context('max_columns', 0):
+ # Truncate with auto detection.
+ assert has_horizontally_truncated_repr(df)
+
+ index = range(int(term_height * fac))
+ df = DataFrame(index=index, columns=cols)
+ with option_context('max_rows', 0):
+ with option_context('max_columns', None):
+ # Wrap around with None
+ assert has_expanded_repr(df)
+ # Truncate vertically
+ assert has_vertically_truncated_repr(df)
+
+ with option_context('max_rows', None):
+ with option_context('max_columns', 0):
+ assert has_horizontally_truncated_repr(df)
+
+ def test_to_string_repr_unicode(self):
+ buf = StringIO()
+
+ unicode_values = [u('\u03c3')] * 10
+ unicode_values = np.array(unicode_values, dtype=object)
+ df = DataFrame({'unicode': unicode_values})
+ df.to_string(col_space=10, buf=buf)
+
+ # it works!
+ repr(df)
+
+ idx = Index(['abc', u('\u03c3a'), 'aegdvg'])
+ ser = Series(np.random.randn(len(idx)), idx)
+ rs = repr(ser).split('\n')
+ line_len = len(rs[0])
+ for line in rs[1:]:
+ try:
+ line = line.decode(get_option("display.encoding"))
+ except AttributeError:
+ pass
+ if not line.startswith('dtype:'):
+ assert len(line) == line_len
+
+        # it works even if sys.stdin is None
+ _stdin = sys.stdin
+ try:
+ sys.stdin = None
+ repr(df)
+ finally:
+ sys.stdin = _stdin
+
+ def test_to_string_unicode_columns(self):
+ df = DataFrame({u('\u03c3'): np.arange(10.)})
+
+ buf = StringIO()
+ df.to_string(buf=buf)
+ buf.getvalue()
+
+ buf = StringIO()
+ df.info(buf=buf)
+ buf.getvalue()
+
+ result = self.frame.to_string()
+ assert isinstance(result, compat.text_type)
+
+ def test_to_string_utf8_columns(self):
+ n = u("\u05d0").encode('utf-8')
+
+ with option_context('display.max_rows', 1):
+ df = DataFrame([1, 2], columns=[n])
+ repr(df)
+
+ def test_to_string_unicode_two(self):
+ dm = DataFrame({u('c/\u03c3'): []})
+ buf = StringIO()
+ dm.to_string(buf)
+
+ def test_to_string_unicode_three(self):
+ dm = DataFrame(['\xc2'])
+ buf = StringIO()
+ dm.to_string(buf)
+
+ def test_to_string_with_formatters(self):
+ df = DataFrame({'int': [1, 2, 3],
+ 'float': [1.0, 2.0, 3.0],
+ 'object': [(1, 2), True, False]},
+ columns=['int', 'float', 'object'])
+
+ formatters = [('int', lambda x: '0x{x:x}'.format(x=x)),
+ ('float', lambda x: '[{x: 4.1f}]'.format(x=x)),
+ ('object', lambda x: '-{x!s}-'.format(x=x))]
+ result = df.to_string(formatters=dict(formatters))
+ result2 = df.to_string(formatters=lzip(*formatters)[1])
+ assert result == (' int float object\n'
+ '0 0x1 [ 1.0] -(1, 2)-\n'
+ '1 0x2 [ 2.0] -True-\n'
+ '2 0x3 [ 3.0] -False-')
+ assert result == result2
+
+ def test_to_string_with_datetime64_monthformatter(self):
+ months = [datetime(2016, 1, 1), datetime(2016, 2, 2)]
+ x = DataFrame({'months': months})
+
+ def format_func(x):
+ return x.strftime('%Y-%m')
+ result = x.to_string(formatters={'months': format_func})
+ expected = 'months\n0 2016-01\n1 2016-02'
+ assert result.strip() == expected
+
+ def test_to_string_with_datetime64_hourformatter(self):
+
+ x = DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'],
+ format='%H:%M:%S.%f')})
+
+ def format_func(x):
+ return x.strftime('%H:%M')
+
+ result = x.to_string(formatters={'hod': format_func})
+ expected = 'hod\n0 10:10\n1 12:12'
+ assert result.strip() == expected
+
+ def test_to_string_with_formatters_unicode(self):
+ df = DataFrame({u('c/\u03c3'): [1, 2, 3]})
+ result = df.to_string(
+ formatters={u('c/\u03c3'): lambda x: '{x}'.format(x=x)})
+ assert result == u(' c/\u03c3\n') + '0 1\n1 2\n2 3'
+
+ def test_east_asian_unicode_false(self):
+ if PY3:
+ _rep = repr
+ else:
+ _rep = unicode # noqa
+
+        # not aligned properly because of East Asian width
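+        # (fullwidth characters occupy two terminal cells but count as one
+        # in len(), so the default padding under-counts their display width;
+        # the companion test below enables display.unicode.east_asian_width
+        # to compensate)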
+
+ # mid col
+ df = DataFrame({'a': [u'あ', u'いいい', u'う', u'ええええええ'],
+ 'b': [1, 222, 33333, 4]},
+ index=['a', 'bb', 'c', 'ddd'])
+ expected = (u" a b\na あ 1\n"
+ u"bb いいい 222\nc う 33333\n"
+ u"ddd ええええええ 4")
+ assert _rep(df) == expected
+
+ # last col
+ df = DataFrame({'a': [1, 222, 33333, 4],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ']},
+ index=['a', 'bb', 'c', 'ddd'])
+ expected = (u" a b\na 1 あ\n"
+ u"bb 222 いいい\nc 33333 う\n"
+ u"ddd 4 ええええええ")
+ assert _rep(df) == expected
+
+ # all col
+ df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ']},
+ index=['a', 'bb', 'c', 'ddd'])
+ expected = (u" a b\na あああああ あ\n"
+ u"bb い いいい\nc う う\n"
+ u"ddd えええ ええええええ")
+ assert _rep(df) == expected
+
+ # column name
+ df = DataFrame({'b': [u'あ', u'いいい', u'う', u'ええええええ'],
+ u'あああああ': [1, 222, 33333, 4]},
+ index=['a', 'bb', 'c', 'ddd'])
+ expected = (u" b あああああ\na あ 1\n"
+ u"bb いいい 222\nc う 33333\n"
+ u"ddd ええええええ 4")
+ assert _rep(df) == expected
+
+ # index
+ df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ']},
+ index=[u'あああ', u'いいいいいい', u'うう', u'え'])
+ expected = (u" a b\nあああ あああああ あ\n"
+ u"いいいいいい い いいい\nうう う う\n"
+ u"え えええ ええええええ")
+ assert _rep(df) == expected
+
+ # index name
+ df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ']},
+ index=pd.Index([u'あ', u'い', u'うう', u'え'],
+ name=u'おおおお'))
+ expected = (u" a b\n"
+ u"おおおお \n"
+ u"あ あああああ あ\n"
+ u"い い いいい\n"
+ u"うう う う\n"
+ u"え えええ ええええええ")
+ assert _rep(df) == expected
+
+ # all
+ df = DataFrame({u'あああ': [u'あああ', u'い', u'う', u'えええええ'],
+ u'いいいいい': [u'あ', u'いいい', u'う', u'ええ']},
+ index=pd.Index([u'あ', u'いいい', u'うう', u'え'],
+ name=u'お'))
+ expected = (u" あああ いいいいい\n"
+ u"お \n"
+ u"あ あああ あ\n"
+ u"いいい い いいい\n"
+ u"うう う う\n"
+ u"え えええええ ええ")
+ assert _rep(df) == expected
+
+ # MultiIndex
+ idx = pd.MultiIndex.from_tuples([(u'あ', u'いい'), (u'う', u'え'), (
+ u'おおお', u'かかかか'), (u'き', u'くく')])
+ df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ']},
+ index=idx)
+ expected = (u" a b\n"
+ u"あ いい あああああ あ\n"
+ u"う え い いいい\n"
+ u"おおお かかかか う う\n"
+ u"き くく えええ ええええええ")
+ assert _rep(df) == expected
+
+ # truncate
+ with option_context('display.max_rows', 3, 'display.max_columns', 3):
+ df = pd.DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ'],
+ 'c': [u'お', u'か', u'ききき', u'くくくくくく'],
+ u'ああああ': [u'さ', u'し', u'す', u'せ']},
+ columns=['a', 'b', 'c', u'ああああ'])
+
+ expected = (u" a ... ああああ\n0 あああああ ... さ\n"
+ u".. ... ... ...\n3 えええ ... せ\n"
+ u"\n[4 rows x 4 columns]")
+ assert _rep(df) == expected
+
+ df.index = [u'あああ', u'いいいい', u'う', 'aaa']
+ expected = (u" a ... ああああ\nあああ あああああ ... さ\n"
+ u".. ... ... ...\naaa えええ ... せ\n"
+ u"\n[4 rows x 4 columns]")
+ assert _rep(df) == expected
+
+ def test_east_asian_unicode_true(self):
+ if PY3:
+ _rep = repr
+ else:
+ _rep = unicode # noqa
+
+        # Enable Unicode option -----------------------------------------
+ with option_context('display.unicode.east_asian_width', True):
+
+ # mid col
+ df = DataFrame({'a': [u'あ', u'いいい', u'う', u'ええええええ'],
+ 'b': [1, 222, 33333, 4]},
+ index=['a', 'bb', 'c', 'ddd'])
+ expected = (u" a b\na あ 1\n"
+ u"bb いいい 222\nc う 33333\n"
+ u"ddd ええええええ 4")
+ assert _rep(df) == expected
+
+ # last col
+ df = DataFrame({'a': [1, 222, 33333, 4],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ']},
+ index=['a', 'bb', 'c', 'ddd'])
+ expected = (u" a b\na 1 あ\n"
+ u"bb 222 いいい\nc 33333 う\n"
+ u"ddd 4 ええええええ")
+ assert _rep(df) == expected
+
+ # all col
+ df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ']},
+ index=['a', 'bb', 'c', 'ddd'])
+ expected = (u" a b\n"
+ u"a あああああ あ\n"
+ u"bb い いいい\n"
+ u"c う う\n"
+ u"ddd えええ ええええええ")
+ assert _rep(df) == expected
+
+ # column name
+ df = DataFrame({'b': [u'あ', u'いいい', u'う', u'ええええええ'],
+ u'あああああ': [1, 222, 33333, 4]},
+ index=['a', 'bb', 'c', 'ddd'])
+ expected = (u" b あああああ\n"
+ u"a あ 1\n"
+ u"bb いいい 222\n"
+ u"c う 33333\n"
+ u"ddd ええええええ 4")
+ assert _rep(df) == expected
+
+ # index
+ df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ']},
+ index=[u'あああ', u'いいいいいい', u'うう', u'え'])
+ expected = (u" a b\n"
+ u"あああ あああああ あ\n"
+ u"いいいいいい い いいい\n"
+ u"うう う う\n"
+ u"え えええ ええええええ")
+ assert _rep(df) == expected
+
+ # index name
+ df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ']},
+ index=pd.Index([u'あ', u'い', u'うう', u'え'],
+ name=u'おおおお'))
+ expected = (u" a b\n"
+ u"おおおお \n"
+ u"あ あああああ あ\n"
+ u"い い いいい\n"
+ u"うう う う\n"
+ u"え えええ ええええええ")
+ assert _rep(df) == expected
+
+ # all
+ df = DataFrame({u'あああ': [u'あああ', u'い', u'う', u'えええええ'],
+ u'いいいいい': [u'あ', u'いいい', u'う', u'ええ']},
+ index=pd.Index([u'あ', u'いいい', u'うう', u'え'],
+ name=u'お'))
+ expected = (u" あああ いいいいい\n"
+ u"お \n"
+ u"あ あああ あ\n"
+ u"いいい い いいい\n"
+ u"うう う う\n"
+ u"え えええええ ええ")
+ assert _rep(df) == expected
+
+ # MultiIndex
+ idx = pd.MultiIndex.from_tuples([(u'あ', u'いい'), (u'う', u'え'), (
+ u'おおお', u'かかかか'), (u'き', u'くく')])
+ df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ']},
+ index=idx)
+ expected = (u" a b\n"
+ u"あ いい あああああ あ\n"
+ u"う え い いいい\n"
+ u"おおお かかかか う う\n"
+ u"き くく えええ ええええええ")
+ assert _rep(df) == expected
+
+ # truncate
+ with option_context('display.max_rows', 3, 'display.max_columns',
+ 3):
+
+ df = pd.DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'],
+ 'b': [u'あ', u'いいい', u'う', u'ええええええ'],
+ 'c': [u'お', u'か', u'ききき', u'くくくくくく'],
+ u'ああああ': [u'さ', u'し', u'す', u'せ']},
+ columns=['a', 'b', 'c', u'ああああ'])
+
+ expected = (u" a ... ああああ\n"
+ u"0 あああああ ... さ\n"
+ u".. ... ... ...\n"
+ u"3 えええ ... せ\n"
+ u"\n[4 rows x 4 columns]")
+ assert _rep(df) == expected
+
+ df.index = [u'あああ', u'いいいい', u'う', 'aaa']
+ expected = (u" a ... ああああ\n"
+ u"あああ あああああ ... さ\n"
+ u"... ... ... ...\n"
+ u"aaa えええ ... せ\n"
+ u"\n[4 rows x 4 columns]")
+ assert _rep(df) == expected
+
+ # ambiguous unicode
+ df = DataFrame({'b': [u'あ', u'いいい', u'¡¡', u'ええええええ'],
+ u'あああああ': [1, 222, 33333, 4]},
+ index=['a', 'bb', 'c', '¡¡¡'])
+ expected = (u" b あああああ\n"
+ u"a あ 1\n"
+ u"bb いいい 222\n"
+ u"c ¡¡ 33333\n"
+ u"¡¡¡ ええええええ 4")
+ assert _rep(df) == expected
+
+ def test_to_string_buffer_all_unicode(self):
+ buf = StringIO()
+
+ empty = DataFrame({u('c/\u03c3'): Series()})
+ nonempty = DataFrame({u('c/\u03c3'): Series([1, 2, 3])})
+
+ print(empty, file=buf)
+ print(nonempty, file=buf)
+
+ # this should work
+ buf.getvalue()
+
+ def test_to_string_with_col_space(self):
+ df = DataFrame(np.random.random(size=(1, 3)))
+ c10 = len(df.to_string(col_space=10).split("\n")[1])
+ c20 = len(df.to_string(col_space=20).split("\n")[1])
+ c30 = len(df.to_string(col_space=30).split("\n")[1])
+ assert c10 < c20 < c30
+
+ # GH 8230
+ # col_space wasn't being applied with header=False
+ with_header = df.to_string(col_space=20)
+ with_header_row1 = with_header.splitlines()[1]
+ no_header = df.to_string(col_space=20, header=False)
+ assert len(with_header_row1) == len(no_header)
+
+ def test_to_string_truncate_indices(self):
+ for index in [tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex,
+ tm.makeDateIndex, tm.makePeriodIndex]:
+ for column in [tm.makeStringIndex]:
+ for h in [10, 20]:
+ for w in [10, 20]:
+ with option_context("display.expand_frame_repr",
+ False):
+ df = DataFrame(index=index(h), columns=column(w))
+ with option_context("display.max_rows", 15):
+ if h == 20:
+ assert has_vertically_truncated_repr(df)
+ else:
+ assert not has_vertically_truncated_repr(
+ df)
+ with option_context("display.max_columns", 15):
+ if w == 20:
+ assert has_horizontally_truncated_repr(df)
+ else:
+ assert not (
+ has_horizontally_truncated_repr(df))
+ with option_context("display.max_rows", 15,
+ "display.max_columns", 15):
+ if h == 20 and w == 20:
+ assert has_doubly_truncated_repr(df)
+ else:
+ assert not has_doubly_truncated_repr(
+ df)
+
+ def test_to_string_truncate_multilevel(self):
+ arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+ ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+ df = DataFrame(index=arrays, columns=arrays)
+ with option_context("display.max_rows", 7, "display.max_columns", 7):
+ assert has_doubly_truncated_repr(df)
+
+ def test_truncate_with_different_dtypes(self):
+
+ # 11594, 12045
+ # when truncated the dtypes of the splits can differ
+
+ # 11594
+ import datetime
+ s = Series([datetime.datetime(2012, 1, 1)] * 10 +
+ [datetime.datetime(1012, 1, 2)] + [
+ datetime.datetime(2012, 1, 3)] * 10)
+
+ with pd.option_context('display.max_rows', 8):
+ result = str(s)
+ assert 'object' in result
+
+ # 12045
+ df = DataFrame({'text': ['some words'] + [None] * 9})
+
+ with pd.option_context('display.max_rows', 8,
+ 'display.max_columns', 3):
+ result = str(df)
+ assert 'None' in result
+ assert 'NaN' not in result
+
+ def test_datetimelike_frame(self):
+
+ # GH 12211
+ df = DataFrame(
+ {'date': [pd.Timestamp('20130101').tz_localize('UTC')] +
+ [pd.NaT] * 5})
+
+ with option_context("display.max_rows", 5):
+ result = str(df)
+ assert '2013-01-01 00:00:00+00:00' in result
+ assert 'NaT' in result
+ assert '...' in result
+ assert '[6 rows x 1 columns]' in result
+
+ dts = [pd.Timestamp('2011-01-01', tz='US/Eastern')] * 5 + [pd.NaT] * 5
+ df = pd.DataFrame({"dt": dts,
+ "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
+ with option_context('display.max_rows', 5):
+ expected = (' dt x\n'
+ '0 2011-01-01 00:00:00-05:00 1\n'
+ '1 2011-01-01 00:00:00-05:00 2\n'
+ '.. ... ..\n'
+ '8 NaT 9\n'
+ '9 NaT 10\n\n'
+ '[10 rows x 2 columns]')
+ assert repr(df) == expected
+
+ dts = [pd.NaT] * 5 + [pd.Timestamp('2011-01-01', tz='US/Eastern')] * 5
+ df = pd.DataFrame({"dt": dts,
+ "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
+ with option_context('display.max_rows', 5):
+ expected = (' dt x\n'
+ '0 NaT 1\n'
+ '1 NaT 2\n'
+ '.. ... ..\n'
+ '8 2011-01-01 00:00:00-05:00 9\n'
+ '9 2011-01-01 00:00:00-05:00 10\n\n'
+ '[10 rows x 2 columns]')
+ assert repr(df) == expected
+
+ dts = ([pd.Timestamp('2011-01-01', tz='Asia/Tokyo')] * 5 +
+ [pd.Timestamp('2011-01-01', tz='US/Eastern')] * 5)
+ df = pd.DataFrame({"dt": dts,
+ "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
+ with option_context('display.max_rows', 5):
+ expected = (' dt x\n'
+ '0 2011-01-01 00:00:00+09:00 1\n'
+ '1 2011-01-01 00:00:00+09:00 2\n'
+ '.. ... ..\n'
+ '8 2011-01-01 00:00:00-05:00 9\n'
+ '9 2011-01-01 00:00:00-05:00 10\n\n'
+ '[10 rows x 2 columns]')
+ assert repr(df) == expected
+
+ @pytest.mark.parametrize('start_date', [
+ '2017-01-01 23:59:59.999999999',
+ '2017-01-01 23:59:59.99999999',
+ '2017-01-01 23:59:59.9999999',
+ '2017-01-01 23:59:59.999999',
+ '2017-01-01 23:59:59.99999',
+ '2017-01-01 23:59:59.9999',
+ ])
+ def test_datetimeindex_highprecision(self, start_date):
+ # GH19030
+ # Check that high-precision time values for the end of day are
+ # included in repr for DatetimeIndex
+ df = DataFrame({'A': date_range(start=start_date,
+ freq='D', periods=5)})
+ result = str(df)
+ assert start_date in result
+
+ dti = date_range(start=start_date,
+ freq='D', periods=5)
+ df = DataFrame({'A': range(5)}, index=dti)
+ result = str(df.index)
+ assert start_date in result
+
+ def test_nonunicode_nonascii_alignment(self):
+ df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]])
+ rep_str = df.to_string()
+ lines = rep_str.split('\n')
+ assert len(lines[1]) == len(lines[2])
+
+ def test_unicode_problem_decoding_as_ascii(self):
+ dm = DataFrame({u('c/\u03c3'): Series({'test': np.nan})})
+ compat.text_type(dm.to_string())
+
+ def test_string_repr_encoding(self, datapath):
+ filepath = datapath('io', 'parser', 'data', 'unicode_series.csv')
+ df = pd.read_csv(filepath, header=None, encoding='latin1')
+ repr(df)
+ repr(df[1])
+
+ def test_repr_corner(self):
+ # representing infs poses no problems
+ df = DataFrame({'foo': [-np.inf, np.inf]})
+ repr(df)
+
+ def test_frame_info_encoding(self):
+ index = ['\'Til There Was You (1997)',
+ 'ldum klaka (Cold Fever) (1994)']
+ fmt.set_option('display.max_rows', 1)
+ df = DataFrame(columns=['a', 'b', 'c'], index=index)
+ repr(df)
+ repr(df.T)
+ fmt.set_option('display.max_rows', 200)
+
+ def test_pprint_thing(self):
+ from pandas.io.formats.printing import pprint_thing as pp_t
+
+ if PY3:
+ pytest.skip("doesn't work on Python 3")
+
+ assert pp_t('a') == u('a')
+ assert pp_t(u('a')) == u('a')
+ assert pp_t(None) == 'None'
+ assert pp_t(u('\u05d0'), quote_strings=True) == u("u'\u05d0'")
+ assert pp_t(u('\u05d0'), quote_strings=False) == u('\u05d0')
+ assert (pp_t((u('\u05d0'), u('\u05d1')), quote_strings=True) ==
+ u("(u'\u05d0', u'\u05d1')"))
+ assert (pp_t((u('\u05d0'), (u('\u05d1'), u('\u05d2'))),
+ quote_strings=True) == u("(u'\u05d0', "
+ "(u'\u05d1', u'\u05d2'))"))
+ assert (pp_t(('foo', u('\u05d0'), (u('\u05d0'), u('\u05d0'))),
+ quote_strings=True) == u("(u'foo', u'\u05d0', "
+ "(u'\u05d0', u'\u05d0'))"))
+
+ # gh-2038: escape embedded tabs in string
+ assert "\t" not in pp_t("a\tb", escape_chars=("\t", ))
+
+ def test_wide_repr(self):
+ with option_context('mode.sim_interactive', True,
+ 'display.show_dimensions', True,
+ 'display.max_columns', 20):
+ max_cols = get_option('display.max_columns')
+ df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
+ set_option('display.expand_frame_repr', False)
+ rep_str = repr(df)
+
+ assert "10 rows x {c} columns".format(c=max_cols - 1) in rep_str
+ set_option('display.expand_frame_repr', True)
+ wide_repr = repr(df)
+ assert rep_str != wide_repr
+
+ with option_context('display.width', 120):
+ wider_repr = repr(df)
+ assert len(wider_repr) < len(wide_repr)
+
+ reset_option('display.expand_frame_repr')
+
+ def test_wide_repr_wide_columns(self):
+ with option_context('mode.sim_interactive', True,
+ 'display.max_columns', 20):
+ df = DataFrame(np.random.randn(5, 3),
+ columns=['a' * 90, 'b' * 90, 'c' * 90])
+ rep_str = repr(df)
+
+ assert len(rep_str.splitlines()) == 20
+
+ def test_wide_repr_named(self):
+ with option_context('mode.sim_interactive', True,
+ 'display.max_columns', 20):
+ max_cols = get_option('display.max_columns')
+ df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
+ df.index.name = 'DataFrame Index'
+ set_option('display.expand_frame_repr', False)
+
+ rep_str = repr(df)
+ set_option('display.expand_frame_repr', True)
+ wide_repr = repr(df)
+ assert rep_str != wide_repr
+
+ with option_context('display.width', 150):
+ wider_repr = repr(df)
+ assert len(wider_repr) < len(wide_repr)
+
+ for line in wide_repr.splitlines()[1::13]:
+ assert 'DataFrame Index' in line
+
+ reset_option('display.expand_frame_repr')
+
+ def test_wide_repr_multiindex(self):
+ with option_context('mode.sim_interactive', True,
+ 'display.max_columns', 20):
+ midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10)))
+ max_cols = get_option('display.max_columns')
+ df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)),
+ index=midx)
+ df.index.names = ['Level 0', 'Level 1']
+ set_option('display.expand_frame_repr', False)
+ rep_str = repr(df)
+ set_option('display.expand_frame_repr', True)
+ wide_repr = repr(df)
+ assert rep_str != wide_repr
+
+ with option_context('display.width', 150):
+ wider_repr = repr(df)
+ assert len(wider_repr) < len(wide_repr)
+
+ for line in wide_repr.splitlines()[1::13]:
+ assert 'Level 0 Level 1' in line
+
+ reset_option('display.expand_frame_repr')
+
+ def test_wide_repr_multiindex_cols(self):
+ with option_context('mode.sim_interactive', True,
+ 'display.max_columns', 20):
+ max_cols = get_option('display.max_columns')
+ midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10)))
+ mcols = MultiIndex.from_arrays(
+ tm.rands_array(3, size=(2, max_cols - 1)))
+ df = DataFrame(tm.rands_array(25, (10, max_cols - 1)),
+ index=midx, columns=mcols)
+ df.index.names = ['Level 0', 'Level 1']
+ set_option('display.expand_frame_repr', False)
+ rep_str = repr(df)
+ set_option('display.expand_frame_repr', True)
+ wide_repr = repr(df)
+ assert rep_str != wide_repr
+
+ with option_context('display.width', 150, 'display.max_columns', 20):
+ wider_repr = repr(df)
+ assert len(wider_repr) < len(wide_repr)
+
+ reset_option('display.expand_frame_repr')
+
+ def test_wide_repr_unicode(self):
+ with option_context('mode.sim_interactive', True,
+ 'display.max_columns', 20):
+ max_cols = 20
+ df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
+ set_option('display.expand_frame_repr', False)
+ rep_str = repr(df)
+ set_option('display.expand_frame_repr', True)
+ wide_repr = repr(df)
+ assert rep_str != wide_repr
+
+ with option_context('display.width', 150):
+ wider_repr = repr(df)
+ assert len(wider_repr) < len(wide_repr)
+
+ reset_option('display.expand_frame_repr')
+
+ def test_wide_repr_wide_long_columns(self):
+ with option_context('mode.sim_interactive', True):
+ df = DataFrame({'a': ['a' * 30, 'b' * 30],
+ 'b': ['c' * 70, 'd' * 80]})
+
+ result = repr(df)
+ assert 'ccccc' in result
+ assert 'ddddd' in result
+
+ def test_long_series(self):
+ n = 1000
+ s = Series(
+ np.random.randint(-50, 50, n),
+ index=['s{x:04d}'.format(x=x) for x in range(n)], dtype='int64')
+
+ import re
+ str_rep = str(s)
+ nmatches = len(re.findall('dtype', str_rep))
+ assert nmatches == 1
+
+ def test_index_with_nan(self):
+ # GH 2850
+ df = DataFrame({'id1': {0: '1a3',
+ 1: '9h4'},
+ 'id2': {0: np.nan,
+ 1: 'd67'},
+ 'id3': {0: '78d',
+ 1: '79d'},
+ 'value': {0: 123,
+ 1: 64}})
+
+ # multi-index
+ y = df.set_index(['id1', 'id2', 'id3'])
+ result = y.to_string()
+ expected = u(
+ ' value\nid1 id2 id3 \n'
+ '1a3 NaN 78d 123\n9h4 d67 79d 64')
+ assert result == expected
+
+ # index
+ y = df.set_index('id2')
+ result = y.to_string()
+ expected = u(
+ ' id1 id3 value\nid2 \n'
+ 'NaN 1a3 78d 123\nd67 9h4 79d 64')
+ assert result == expected
+
+ # with append (this failed in 0.12)
+ y = df.set_index(['id1', 'id2']).set_index('id3', append=True)
+ result = y.to_string()
+ expected = u(
+ ' value\nid1 id2 id3 \n'
+ '1a3 NaN 78d 123\n9h4 d67 79d 64')
+ assert result == expected
+
+ # all-nan in mi
+ df2 = df.copy()
+ df2.loc[:, 'id2'] = np.nan
+ y = df2.set_index('id2')
+ result = y.to_string()
+ expected = u(
+ ' id1 id3 value\nid2 \n'
+ 'NaN 1a3 78d 123\nNaN 9h4 79d 64')
+ assert result == expected
+
+ # partial nan in mi
+ df2 = df.copy()
+ df2.loc[:, 'id2'] = np.nan
+ y = df2.set_index(['id2', 'id3'])
+ result = y.to_string()
+ expected = u(
+ ' id1 value\nid2 id3 \n'
+ 'NaN 78d 1a3 123\n 79d 9h4 64')
+ assert result == expected
+
+ df = DataFrame({'id1': {0: np.nan,
+ 1: '9h4'},
+ 'id2': {0: np.nan,
+ 1: 'd67'},
+ 'id3': {0: np.nan,
+ 1: '79d'},
+ 'value': {0: 123,
+ 1: 64}})
+
+ y = df.set_index(['id1', 'id2', 'id3'])
+ result = y.to_string()
+ expected = u(
+ ' value\nid1 id2 id3 \n'
+ 'NaN NaN NaN 123\n9h4 d67 79d 64')
+ assert result == expected
+
+ def test_to_string(self):
+
+ # big mixed
+ biggie = DataFrame({'A': np.random.randn(200),
+ 'B': tm.makeStringIndex(200)},
+ index=lrange(200))
+
+ biggie.loc[:20, 'A'] = np.nan
+ biggie.loc[:20, 'B'] = np.nan
+ s = biggie.to_string()
+
+ buf = StringIO()
+ retval = biggie.to_string(buf=buf)
+ assert retval is None
+ assert buf.getvalue() == s
+
+ assert isinstance(s, compat.string_types)
+
+ # print in right order
+ result = biggie.to_string(columns=['B', 'A'], col_space=17,
+ float_format='%.5f'.__mod__)
+ lines = result.split('\n')
+ header = lines[0].strip().split()
+ joined = '\n'.join(re.sub(r'\s+', ' ', x).strip() for x in lines[1:])
+ recons = read_csv(StringIO(joined), names=header,
+ header=None, sep=' ')
+ tm.assert_series_equal(recons['B'], biggie['B'])
+ assert recons['A'].count() == biggie['A'].count()
+ assert (np.abs(recons['A'].dropna() -
+ biggie['A'].dropna()) < 0.1).all()
+
+ # expected = ['B', 'A']
+ # assert header == expected
+
+ result = biggie.to_string(columns=['A'], col_space=17)
+ header = result.split('\n')[0].strip().split()
+ expected = ['A']
+ assert header == expected
+
+ biggie.to_string(columns=['B', 'A'],
+ formatters={'A': lambda x: '{x:.1f}'.format(x=x)})
+
+ biggie.to_string(columns=['B', 'A'], float_format=str)
+ biggie.to_string(columns=['B', 'A'], col_space=12, float_format=str)
+
+ frame = DataFrame(index=np.arange(200))
+ frame.to_string()
+
+ def test_to_string_no_header(self):
+ df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
+
+ df_s = df.to_string(header=False)
+ expected = "0 1 4\n1 2 5\n2 3 6"
+
+ assert df_s == expected
+
+ def test_to_string_specified_header(self):
+ df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
+
+ df_s = df.to_string(header=['X', 'Y'])
+ expected = ' X Y\n0 1 4\n1 2 5\n2 3 6'
+
+ assert df_s == expected
+
+ with pytest.raises(ValueError):
+ df.to_string(header=['X'])
+
+ def test_to_string_no_index(self):
+ # GH 16839, GH 13032
+ df = DataFrame({'x': [11, 22], 'y': [33, -44], 'z': ['AAA', ' ']})
+
+ df_s = df.to_string(index=False)
+ # Leading space is expected for positive numbers.
+ expected = (" x y z\n"
+ " 11 33 AAA\n"
+ " 22 -44 ")
+ assert df_s == expected
+
+ df_s = df[['y', 'x', 'z']].to_string(index=False)
+ expected = (" y x z\n"
+ " 33 11 AAA\n"
+ "-44 22 ")
+ assert df_s == expected
+
+ def test_to_string_line_width_no_index(self):
+ # GH 13998, GH 22505
+ df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
+
+ df_s = df.to_string(line_width=1, index=False)
+ expected = " x \\\n 1 \n 2 \n 3 \n\n y \n 4 \n 5 \n 6 "
+
+ assert df_s == expected
+
+ df = DataFrame({'x': [11, 22, 33], 'y': [4, 5, 6]})
+
+ df_s = df.to_string(line_width=1, index=False)
+ expected = " x \\\n 11 \n 22 \n 33 \n\n y \n 4 \n 5 \n 6 "
+
+ assert df_s == expected
+
+ df = DataFrame({'x': [11, 22, -33], 'y': [4, 5, -6]})
+
+ df_s = df.to_string(line_width=1, index=False)
+ expected = " x \\\n 11 \n 22 \n-33 \n\n y \n 4 \n 5 \n-6 "
+
+ assert df_s == expected
+
+ def test_to_string_float_formatting(self):
+ tm.reset_display_options()
+ fmt.set_option('display.precision', 5, 'display.column_space', 12,
+ 'display.notebook_repr_html', False)
+
+ df = DataFrame({'x': [0, 0.25, 3456.000, 12e+45, 1.64e+6, 1.7e+8,
+ 1.253456, np.pi, -1e6]})
+
+ df_s = df.to_string()
+
+ if _three_digit_exp():
+ expected = (' x\n0 0.00000e+000\n1 2.50000e-001\n'
+ '2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n'
+ '5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n'
+ '8 -1.00000e+006')
+ else:
+ expected = (' x\n0 0.00000e+00\n1 2.50000e-01\n'
+ '2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n'
+ '5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n'
+ '8 -1.00000e+06')
+ assert df_s == expected
+
+ df = DataFrame({'x': [3234, 0.253]})
+ df_s = df.to_string()
+
+ expected = (' x\n' '0 3234.000\n' '1 0.253')
+ assert df_s == expected
+
+ tm.reset_display_options()
+ assert get_option("display.precision") == 6
+
+ df = DataFrame({'x': [1e9, 0.2512]})
+ df_s = df.to_string()
+
+ if _three_digit_exp():
+ expected = (' x\n'
+ '0 1.000000e+009\n'
+ '1 2.512000e-001')
+ else:
+ expected = (' x\n'
+ '0 1.000000e+09\n'
+ '1 2.512000e-01')
+ assert df_s == expected
+
+ def test_to_string_float_format_no_fixed_width(self):
+
+ # GH 21625
+ df = DataFrame({'x': [0.19999]})
+ expected = ' x\n0 0.200'
+ assert df.to_string(float_format='%.3f') == expected
+
+ # GH 22270
+ df = DataFrame({'x': [100.0]})
+ expected = ' x\n0 100'
+ assert df.to_string(float_format='%.0f') == expected
+
+ def test_to_string_small_float_values(self):
+ df = DataFrame({'a': [1.5, 1e-17, -5.5e-7]})
+
+ result = df.to_string()
+        # exponent width is platform-dependent, as in the tests above
+ if '{x:.4g}'.format(x=1.7e8) == '1.7e+008':
+ expected = (' a\n'
+ '0 1.500000e+000\n'
+ '1 1.000000e-017\n'
+ '2 -5.500000e-007')
+ else:
+ expected = (' a\n'
+ '0 1.500000e+00\n'
+ '1 1.000000e-17\n'
+ '2 -5.500000e-07')
+ assert result == expected
+
+ # but not all exactly zero
+ df = df * 0
+ result = df.to_string()
+ expected = (' 0\n' '0 0\n' '1 0\n' '2 -0')
+
+ def test_to_string_float_index(self):
+ index = Index([1.5, 2, 3, 4, 5])
+ df = DataFrame(lrange(5), index=index)
+
+ result = df.to_string()
+ expected = (' 0\n'
+ '1.5 0\n'
+ '2.0 1\n'
+ '3.0 2\n'
+ '4.0 3\n'
+ '5.0 4')
+ assert result == expected
+
+ def test_to_string_ascii_error(self):
+ data = [('0 ', u(' .gitignore '), u(' 5 '),
+ ' \xe2\x80\xa2\xe2\x80\xa2\xe2\x80'
+ '\xa2\xe2\x80\xa2\xe2\x80\xa2')]
+ df = DataFrame(data)
+
+ # it works!
+ repr(df)
+
+ def test_to_string_int_formatting(self):
+ df = DataFrame({'x': [-15, 20, 25, -35]})
+ assert issubclass(df['x'].dtype.type, np.integer)
+
+ output = df.to_string()
+ expected = (' x\n' '0 -15\n' '1 20\n' '2 25\n' '3 -35')
+ assert output == expected
+
+ def test_to_string_index_formatter(self):
+ df = DataFrame([lrange(5), lrange(5, 10), lrange(10, 15)])
+
+        rs = df.to_string(formatters={'__index__': lambda x: 'abc'[x]})
+
+ xp = """\
+ 0 1 2 3 4
+a 0 1 2 3 4
+b 5 6 7 8 9
+c 10 11 12 13 14\
+"""
+
+ assert rs == xp
+
+ def test_to_string_left_justify_cols(self):
+ tm.reset_display_options()
+ df = DataFrame({'x': [3234, 0.253]})
+ df_s = df.to_string(justify='left')
+ expected = (' x \n' '0 3234.000\n' '1 0.253')
+ assert df_s == expected
+
+ def test_to_string_format_na(self):
+ tm.reset_display_options()
+ df = DataFrame({'A': [np.nan, -1, -2.1234, 3, 4],
+ 'B': [np.nan, 'foo', 'foooo', 'fooooo', 'bar']})
+ result = df.to_string()
+
+ expected = (' A B\n'
+ '0 NaN NaN\n'
+ '1 -1.0000 foo\n'
+ '2 -2.1234 foooo\n'
+ '3 3.0000 fooooo\n'
+ '4 4.0000 bar')
+ assert result == expected
+
+ df = DataFrame({'A': [np.nan, -1., -2., 3., 4.],
+ 'B': [np.nan, 'foo', 'foooo', 'fooooo', 'bar']})
+ result = df.to_string()
+
+ expected = (' A B\n'
+ '0 NaN NaN\n'
+ '1 -1.0 foo\n'
+ '2 -2.0 foooo\n'
+ '3 3.0 fooooo\n'
+ '4 4.0 bar')
+ assert result == expected
+
+ def test_to_string_format_inf(self):
+ # Issue #24861
+ tm.reset_display_options()
+ df = DataFrame({
+ 'A': [-np.inf, np.inf, -1, -2.1234, 3, 4],
+ 'B': [-np.inf, np.inf, 'foo', 'foooo', 'fooooo', 'bar']
+ })
+ result = df.to_string()
+
+ expected = (' A B\n'
+ '0 -inf -inf\n'
+ '1 inf inf\n'
+ '2 -1.0000 foo\n'
+ '3 -2.1234 foooo\n'
+ '4 3.0000 fooooo\n'
+ '5 4.0000 bar')
+ assert result == expected
+
+ df = DataFrame({
+ 'A': [-np.inf, np.inf, -1., -2., 3., 4.],
+ 'B': [-np.inf, np.inf, 'foo', 'foooo', 'fooooo', 'bar']
+ })
+ result = df.to_string()
+
+ expected = (' A B\n'
+ '0 -inf -inf\n'
+ '1 inf inf\n'
+ '2 -1.0 foo\n'
+ '3 -2.0 foooo\n'
+ '4 3.0 fooooo\n'
+ '5 4.0 bar')
+ assert result == expected
+
+ def test_to_string_decimal(self):
+ # Issue #23614
+ df = DataFrame({'A': [6.0, 3.1, 2.2]})
+ expected = ' A\n0 6,0\n1 3,1\n2 2,2'
+ assert df.to_string(decimal=',') == expected
+
+ def test_to_string_line_width(self):
+ df = DataFrame(123, lrange(10, 15), lrange(30))
+ s = df.to_string(line_width=80)
+ assert max(len(l) for l in s.split('\n')) == 80
+
+ def test_show_dimensions(self):
+ df = DataFrame(123, lrange(10, 15), lrange(30))
+
+ with option_context('display.max_rows', 10, 'display.max_columns', 40,
+ 'display.width', 500, 'display.expand_frame_repr',
+ 'info', 'display.show_dimensions', True):
+ assert '5 rows' in str(df)
+ assert '5 rows' in df._repr_html_()
+ with option_context('display.max_rows', 10, 'display.max_columns', 40,
+ 'display.width', 500, 'display.expand_frame_repr',
+ 'info', 'display.show_dimensions', False):
+ assert '5 rows' not in str(df)
+ assert '5 rows' not in df._repr_html_()
+ with option_context('display.max_rows', 2, 'display.max_columns', 2,
+ 'display.width', 500, 'display.expand_frame_repr',
+ 'info', 'display.show_dimensions', 'truncate'):
+ assert '5 rows' in str(df)
+ assert '5 rows' in df._repr_html_()
+ with option_context('display.max_rows', 10, 'display.max_columns', 40,
+ 'display.width', 500, 'display.expand_frame_repr',
+ 'info', 'display.show_dimensions', 'truncate'):
+ assert '5 rows' not in str(df)
+ assert '5 rows' not in df._repr_html_()
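+        # ('truncate' shows the dimensions line only when the repr is
+        # actually truncated, as the last two option contexts demonstrate)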
+
+ def test_repr_html(self):
+ self.frame._repr_html_()
+
+ fmt.set_option('display.max_rows', 1, 'display.max_columns', 1)
+ self.frame._repr_html_()
+
+ fmt.set_option('display.notebook_repr_html', False)
+ self.frame._repr_html_()
+
+ tm.reset_display_options()
+
+ df = DataFrame([[1, 2], [3, 4]])
+ fmt.set_option('display.show_dimensions', True)
+ assert '2 rows' in df._repr_html_()
+ fmt.set_option('display.show_dimensions', False)
+ assert '2 rows' not in df._repr_html_()
+
+ tm.reset_display_options()
+
+ def test_repr_html_mathjax(self):
+ df = DataFrame([[1, 2], [3, 4]])
+ assert 'tex2jax_ignore' not in df._repr_html_()
+
+ with pd.option_context('display.html.use_mathjax', False):
+ assert 'tex2jax_ignore' in df._repr_html_()
+
+ def test_repr_html_wide(self):
+ max_cols = 20
+ df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
+ with option_context('display.max_rows', 60, 'display.max_columns', 20):
+ assert "..." not in df._repr_html_()
+
+ wide_df = DataFrame(tm.rands_array(25, size=(10, max_cols + 1)))
+ with option_context('display.max_rows', 60, 'display.max_columns', 20):
+ assert "..." in wide_df._repr_html_()
+
+ def test_repr_html_wide_multiindex_cols(self):
+ max_cols = 20
+
+ mcols = MultiIndex.from_product([np.arange(max_cols // 2),
+ ['foo', 'bar']],
+ names=['first', 'second'])
+ df = DataFrame(tm.rands_array(25, size=(10, len(mcols))),
+ columns=mcols)
+ reg_repr = df._repr_html_()
+ assert '...' not in reg_repr
+
+ mcols = MultiIndex.from_product((np.arange(1 + (max_cols // 2)),
+ ['foo', 'bar']),
+ names=['first', 'second'])
+ df = DataFrame(tm.rands_array(25, size=(10, len(mcols))),
+ columns=mcols)
+ with option_context('display.max_rows', 60, 'display.max_columns', 20):
+ assert '...' in df._repr_html_()
+
+ def test_repr_html_long(self):
+ with option_context('display.max_rows', 60):
+ max_rows = get_option('display.max_rows')
+ h = max_rows - 1
+ df = DataFrame({'A': np.arange(1, 1 + h),
+ 'B': np.arange(41, 41 + h)})
+ reg_repr = df._repr_html_()
+ assert '..' not in reg_repr
+ assert str(41 + max_rows // 2) in reg_repr
+
+ h = max_rows + 1
+ df = DataFrame({'A': np.arange(1, 1 + h),
+ 'B': np.arange(41, 41 + h)})
+ long_repr = df._repr_html_()
+ assert '..' in long_repr
+ assert str(41 + max_rows // 2) not in long_repr
+ assert u('{h} rows ').format(h=h) in long_repr
+ assert u('2 columns') in long_repr
+
+ def test_repr_html_float(self):
+ with option_context('display.max_rows', 60):
+
+ max_rows = get_option('display.max_rows')
+ h = max_rows - 1
+ df = DataFrame({'idx': np.linspace(-10, 10, h),
+ 'A': np.arange(1, 1 + h),
+ 'B': np.arange(41, 41 + h)}).set_index('idx')
+ reg_repr = df._repr_html_()
+ assert '..' not in reg_repr
+ assert '<td>{val}</td>'.format(val=str(40 + h)) in reg_repr
+
+ h = max_rows + 1
+ df = DataFrame({'idx': np.linspace(-10, 10, h),
+ 'A': np.arange(1, 1 + h),
+ 'B': np.arange(41, 41 + h)}).set_index('idx')
+ long_repr = df._repr_html_()
+ assert '..' in long_repr
+ assert '<td>{val}</td>'.format(val='31') not in long_repr
+ assert u('{h} rows ').format(h=h) in long_repr
+ assert u('2 columns') in long_repr
+
+ def test_repr_html_long_multiindex(self):
+ max_rows = 60
+ max_L1 = max_rows // 2
+
+ tuples = list(itertools.product(np.arange(max_L1), ['foo', 'bar']))
+ idx = MultiIndex.from_tuples(tuples, names=['first', 'second'])
+ df = DataFrame(np.random.randn(max_L1 * 2, 2), index=idx,
+ columns=['A', 'B'])
+ with option_context('display.max_rows', 60, 'display.max_columns', 20):
+ reg_repr = df._repr_html_()
+ assert '...' not in reg_repr
+
+ tuples = list(itertools.product(np.arange(max_L1 + 1), ['foo', 'bar']))
+ idx = MultiIndex.from_tuples(tuples, names=['first', 'second'])
+ df = DataFrame(np.random.randn((max_L1 + 1) * 2, 2), index=idx,
+ columns=['A', 'B'])
+ long_repr = df._repr_html_()
+ assert '...' in long_repr
+
+ def test_repr_html_long_and_wide(self):
+ max_cols = 20
+ max_rows = 60
+
+ h, w = max_rows - 1, max_cols - 1
+ df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)})
+ with option_context('display.max_rows', 60, 'display.max_columns', 20):
+ assert '...' not in df._repr_html_()
+
+ h, w = max_rows + 1, max_cols + 1
+ df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)})
+ with option_context('display.max_rows', 60, 'display.max_columns', 20):
+ assert '...' in df._repr_html_()
+
+ def test_info_repr(self):
+ # GH#21746 For tests inside a terminal (i.e. not CI) we need to detect
+ # the terminal size to ensure that we try to print something "too big"
+ term_width, term_height = get_terminal_size()
+
+ max_rows = 60
+ max_cols = 20 + (max(term_width, 80) - 80) // 4
+ # Long
+ h, w = max_rows + 1, max_cols - 1
+ df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)})
+ assert has_vertically_truncated_repr(df)
+ with option_context('display.large_repr', 'info'):
+ assert has_info_repr(df)
+
+ # Wide
+ h, w = max_rows - 1, max_cols + 1
+ df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)})
+ assert has_horizontally_truncated_repr(df)
+ with option_context('display.large_repr', 'info',
+ 'display.max_columns', max_cols):
+ assert has_info_repr(df)
+
+ def test_info_repr_max_cols(self):
+ # GH #6939
+ df = DataFrame(np.random.randn(10, 5))
+ with option_context('display.large_repr', 'info',
+ 'display.max_columns', 1,
+ 'display.max_info_columns', 4):
+ assert has_non_verbose_info_repr(df)
+
+ with option_context('display.large_repr', 'info',
+ 'display.max_columns', 1,
+ 'display.max_info_columns', 5):
+ assert not has_non_verbose_info_repr(df)
+
+ # test verbose overrides
+ # fmt.set_option('display.max_info_columns', 4) # exceeded
+
+ def test_info_repr_html(self):
+ max_rows = 60
+ max_cols = 20
+ # Long
+ h, w = max_rows + 1, max_cols - 1
+ df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)})
+ assert r'&lt;class' not in df._repr_html_()
+ with option_context('display.large_repr', 'info'):
+ assert r'&lt;class' in df._repr_html_()
+
+ # Wide
+ h, w = max_rows - 1, max_cols + 1
+ df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)})
+ assert '<class' not in df._repr_html_()
+ with option_context('display.large_repr', 'info',
+ 'display.max_columns', max_cols):
+ assert '&lt;class' in df._repr_html_()
+
+ def test_fake_qtconsole_repr_html(self):
+ def get_ipython():
+ return {'config': {'KernelApp':
+ {'parent_appname': 'ipython-qtconsole'}}}
+
+ repstr = self.frame._repr_html_()
+ assert repstr is not None
+
+ fmt.set_option('display.max_rows', 5, 'display.max_columns', 2)
+ repstr = self.frame._repr_html_()
+
+ assert 'class' in repstr # info fallback
+ tm.reset_display_options()
+
+ def test_pprint_pathological_object(self):
+ """
+ If the test fails, it at least won't hang.
+ """
+
+ class A(object):
+ def __getitem__(self, key):
+ return 3 # obviously simplified
+
+ df = DataFrame([A()])
+ repr(df) # just don't die
+
+ def test_float_trim_zeros(self):
+ vals = [2.08430917305e+10, 3.52205017305e+10, 2.30674817305e+10,
+ 2.03954217305e+10, 5.59897817305e+10]
+ skip = True
+ for line in repr(DataFrame({'A': vals})).split('\n')[:-2]:
+ if line.startswith('dtype:'):
+ continue
+ if _three_digit_exp():
+ assert ('+010' in line) or skip
+ else:
+ assert ('+10' in line) or skip
+ skip = False
+
+ def test_dict_entries(self):
+ df = DataFrame({'A': [{'a': 1, 'b': 2}]})
+
+ val = df.to_string()
+ assert "'a': 1" in val
+ assert "'b': 2" in val
+
+ def test_period(self):
+ # GH 12615
+ df = pd.DataFrame({'A': pd.period_range('2013-01',
+ periods=4, freq='M'),
+ 'B': [pd.Period('2011-01', freq='M'),
+ pd.Period('2011-02-01', freq='D'),
+ pd.Period('2011-03-01 09:00', freq='H'),
+ pd.Period('2011-04', freq='M')],
+ 'C': list('abcd')})
+ exp = (" A B C\n"
+ "0 2013-01 2011-01 a\n"
+ "1 2013-02 2011-02-01 b\n"
+ "2 2013-03 2011-03-01 09:00 c\n"
+ "3 2013-04 2011-04 d")
+ assert str(df) == exp
+
+
+def gen_series_formatting():
+ s1 = pd.Series(['a'] * 100)
+ s2 = pd.Series(['ab'] * 100)
+ s3 = pd.Series(['a', 'ab', 'abc', 'abcd', 'abcde', 'abcdef'])
+ s4 = s3[::-1]
+ test_sers = {'onel': s1, 'twol': s2, 'asc': s3, 'desc': s4}
+ return test_sers
+
+
+class TestSeriesFormatting(object):
+
+ def setup_method(self, method):
+ self.ts = tm.makeTimeSeries()
+
+ def test_repr_unicode(self):
+ s = Series([u('\u03c3')] * 10)
+ repr(s)
+
+ a = Series([u("\u05d0")] * 1000)
+ a.name = 'title1'
+ repr(a)
+
+ def test_to_string(self):
+ buf = StringIO()
+
+ s = self.ts.to_string()
+
+ retval = self.ts.to_string(buf=buf)
+ assert retval is None
+ assert buf.getvalue().strip() == s
+
+ # pass float_format
+        fmt_func = '%.4f'.__mod__
+        result = self.ts.to_string(float_format=fmt_func)
+        result = [x.split()[1] for x in result.split('\n')[:-1]]
+        expected = [fmt_func(x) for x in self.ts]
+ assert result == expected
+
+ # empty string
+ result = self.ts[:0].to_string()
+ assert result == 'Series([], Freq: B)'
+
+ result = self.ts[:0].to_string(length=0)
+ assert result == 'Series([], Freq: B)'
+
+ # name and length
+ cp = self.ts.copy()
+ cp.name = 'foo'
+ result = cp.to_string(length=True, name=True, dtype=True)
+ last_line = result.split('\n')[-1].strip()
+ assert last_line == ("Freq: B, Name: foo, "
+ "Length: {cp}, dtype: float64".format(cp=len(cp)))
+
+ def test_freq_name_separation(self):
+ s = Series(np.random.randn(10),
+ index=date_range('1/1/2000', periods=10), name=0)
+
+ result = repr(s)
+ assert 'Freq: D, Name: 0' in result
+
+ def test_to_string_mixed(self):
+ s = Series(['foo', np.nan, -1.23, 4.56])
+ result = s.to_string()
+ expected = (u('0 foo\n') + u('1 NaN\n') + u('2 -1.23\n') +
+ u('3 4.56'))
+ assert result == expected
+
+ # but don't count NAs as floats
+ s = Series(['foo', np.nan, 'bar', 'baz'])
+ result = s.to_string()
+ expected = (u('0 foo\n') + '1 NaN\n' + '2 bar\n' + '3 baz')
+ assert result == expected
+
+ s = Series(['foo', 5, 'bar', 'baz'])
+ result = s.to_string()
+ expected = (u('0 foo\n') + '1 5\n' + '2 bar\n' + '3 baz')
+ assert result == expected
+
+ def test_to_string_float_na_spacing(self):
+ s = Series([0., 1.5678, 2., -3., 4.])
+ s[::2] = np.nan
+
+ result = s.to_string()
+ expected = (u('0 NaN\n') + '1 1.5678\n' + '2 NaN\n' +
+ '3 -3.0000\n' + '4 NaN')
+ assert result == expected
+
+ def test_to_string_without_index(self):
+ # GH 11729 Test index=False option
+ s = Series([1, 2, 3, 4])
+ result = s.to_string(index=False)
+ expected = (u(' 1\n') + ' 2\n' + ' 3\n' + ' 4')
+ assert result == expected
+
+ def test_unicode_name_in_footer(self):
+ s = Series([1, 2], name=u('\u05e2\u05d1\u05e8\u05d9\u05ea'))
+ sf = fmt.SeriesFormatter(s, name=u('\u05e2\u05d1\u05e8\u05d9\u05ea'))
+ sf._get_footer() # should not raise exception
+
+ def test_east_asian_unicode_series(self):
+ if PY3:
+ _rep = repr
+ else:
+ _rep = unicode # noqa
+ # not aligned properly because of east asian width
+
+ # unicode index
+ s = Series(['a', 'bb', 'CCC', 'D'],
+ index=[u'あ', u'いい', u'ううう', u'ええええ'])
+ expected = (u"あ a\nいい bb\nううう CCC\n"
+ u"ええええ D\ndtype: object")
+ assert _rep(s) == expected
+
+ # unicode values
+ s = Series([u'あ', u'いい', u'ううう', u'ええええ'],
+ index=['a', 'bb', 'c', 'ddd'])
+ expected = (u"a あ\nbb いい\nc ううう\n"
+ u"ddd ええええ\ndtype: object")
+ assert _rep(s) == expected
+
+ # both
+ s = Series([u'あ', u'いい', u'ううう', u'ええええ'],
+ index=[u'ああ', u'いいいい', u'う', u'えええ'])
+ expected = (u"ああ あ\nいいいい いい\nう ううう\n"
+ u"えええ ええええ\ndtype: object")
+ assert _rep(s) == expected
+
+ # unicode footer
+ s = Series([u'あ', u'いい', u'ううう', u'ええええ'],
+ index=[u'ああ', u'いいいい', u'う', u'えええ'],
+ name=u'おおおおおおお')
+ expected = (u"ああ あ\nいいいい いい\nう ううう\n"
+ u"えええ ええええ\nName: おおおおおおお, dtype: object")
+ assert _rep(s) == expected
+
+ # MultiIndex
+        idx = pd.MultiIndex.from_tuples([(u'あ', u'いい'), (u'う', u'え'),
+                                         (u'おおお', u'かかかか'),
+                                         (u'き', u'くく')])
+ s = Series([1, 22, 3333, 44444], index=idx)
+ expected = (u"あ いい 1\n"
+ u"う え 22\n"
+ u"おおお かかかか 3333\n"
+ u"き くく 44444\ndtype: int64")
+ assert _rep(s) == expected
+
+ # object dtype, shorter than unicode repr
+ s = Series([1, 22, 3333, 44444], index=[1, 'AB', np.nan, u'あああ'])
+ expected = (u"1 1\nAB 22\nNaN 3333\n"
+ u"あああ 44444\ndtype: int64")
+ assert _rep(s) == expected
+
+ # object dtype, longer than unicode repr
+ s = Series([1, 22, 3333, 44444],
+ index=[1, 'AB', pd.Timestamp('2011-01-01'), u'あああ'])
+ expected = (u"1 1\n"
+ u"AB 22\n"
+ u"2011-01-01 00:00:00 3333\n"
+ u"あああ 44444\ndtype: int64")
+ assert _rep(s) == expected
+
+ # truncate
+ with option_context('display.max_rows', 3):
+ s = Series([u'あ', u'いい', u'ううう', u'ええええ'],
+ name=u'おおおおおおお')
+
+ expected = (u"0 あ\n ... \n"
+ u"3 ええええ\n"
+ u"Name: おおおおおおお, Length: 4, dtype: object")
+ assert _rep(s) == expected
+
+ s.index = [u'ああ', u'いいいい', u'う', u'えええ']
+ expected = (u"ああ あ\n ... \n"
+ u"えええ ええええ\n"
+ u"Name: おおおおおおお, Length: 4, dtype: object")
+ assert _rep(s) == expected
+
+        # Enable Unicode option -----------------------------------------
+ with option_context('display.unicode.east_asian_width', True):
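+            # wide characters now occupy two display columns when aligning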
+
+ # unicode index
+ s = Series(['a', 'bb', 'CCC', 'D'],
+ index=[u'あ', u'いい', u'ううう', u'ええええ'])
+ expected = (u"あ a\nいい bb\nううう CCC\n"
+ u"ええええ D\ndtype: object")
+ assert _rep(s) == expected
+
+ # unicode values
+ s = Series([u'あ', u'いい', u'ううう', u'ええええ'],
+ index=['a', 'bb', 'c', 'ddd'])
+ expected = (u"a あ\nbb いい\nc ううう\n"
+ u"ddd ええええ\ndtype: object")
+ assert _rep(s) == expected
+
+ # both
+ s = Series([u'あ', u'いい', u'ううう', u'ええええ'],
+ index=[u'ああ', u'いいいい', u'う', u'えええ'])
+ expected = (u"ああ あ\n"
+ u"いいいい いい\n"
+ u"う ううう\n"
+ u"えええ ええええ\ndtype: object")
+ assert _rep(s) == expected
+
+ # unicode footer
+ s = Series([u'あ', u'いい', u'ううう', u'ええええ'],
+ index=[u'ああ', u'いいいい', u'う', u'えええ'],
+ name=u'おおおおおおお')
+ expected = (u"ああ あ\n"
+ u"いいいい いい\n"
+ u"う ううう\n"
+ u"えええ ええええ\n"
+ u"Name: おおおおおおお, dtype: object")
+ assert _rep(s) == expected
+
+ # MultiIndex
+            idx = pd.MultiIndex.from_tuples([(u'あ', u'いい'), (u'う', u'え'),
+                                             (u'おおお', u'かかかか'),
+                                             (u'き', u'くく')])
+ s = Series([1, 22, 3333, 44444], index=idx)
+ expected = (u"あ いい 1\n"
+ u"う え 22\n"
+ u"おおお かかかか 3333\n"
+ u"き くく 44444\n"
+ u"dtype: int64")
+ assert _rep(s) == expected
+
+ # object dtype, shorter than unicode repr
+ s = Series([1, 22, 3333, 44444], index=[1, 'AB', np.nan, u'あああ'])
+ expected = (u"1 1\nAB 22\nNaN 3333\n"
+ u"あああ 44444\ndtype: int64")
+ assert _rep(s) == expected
+
+ # object dtype, longer than unicode repr
+ s = Series([1, 22, 3333, 44444],
+ index=[1, 'AB', pd.Timestamp('2011-01-01'), u'あああ'])
+ expected = (u"1 1\n"
+ u"AB 22\n"
+ u"2011-01-01 00:00:00 3333\n"
+ u"あああ 44444\ndtype: int64")
+ assert _rep(s) == expected
+
+ # truncate
+ with option_context('display.max_rows', 3):
+ s = Series([u'あ', u'いい', u'ううう', u'ええええ'],
+ name=u'おおおおおおお')
+ expected = (u"0 あ\n ... \n"
+ u"3 ええええ\n"
+ u"Name: おおおおおおお, Length: 4, dtype: object")
+ assert _rep(s) == expected
+
+ s.index = [u'ああ', u'いいいい', u'う', u'えええ']
+ expected = (u"ああ あ\n"
+ u" ... \n"
+ u"えええ ええええ\n"
+ u"Name: おおおおおおお, Length: 4, dtype: object")
+ assert _rep(s) == expected
+
+ # ambiguous unicode
+ s = Series([u'¡¡', u'い¡¡', u'ううう', u'ええええ'],
+ index=[u'ああ', u'¡¡¡¡いい', u'¡¡', u'えええ'])
+ expected = (u"ああ ¡¡\n"
+ u"¡¡¡¡いい い¡¡\n"
+ u"¡¡ ううう\n"
+ u"えええ ええええ\ndtype: object")
+ assert _rep(s) == expected
+
+ def test_float_trim_zeros(self):
+ vals = [2.08430917305e+10, 3.52205017305e+10, 2.30674817305e+10,
+ 2.03954217305e+10, 5.59897817305e+10]
+ for line in repr(Series(vals)).split('\n'):
+ if line.startswith('dtype:'):
+ continue
+ if _three_digit_exp():
+ assert '+010' in line
+ else:
+ assert '+10' in line
+
+ def test_datetimeindex(self):
+
+ index = date_range('20130102', periods=6)
+ s = Series(1, index=index)
+ result = s.to_string()
+ assert '2013-01-02' in result
+
+ # nat in index
+ s2 = Series(2, index=[Timestamp('20130111'), NaT])
+ s = s2.append(s)
+ result = s.to_string()
+ assert 'NaT' in result
+
+ # nat in summary
+ result = str(s2.index)
+ assert 'NaT' in result
+
+ @pytest.mark.parametrize('start_date', [
+ '2017-01-01 23:59:59.999999999',
+ '2017-01-01 23:59:59.99999999',
+ '2017-01-01 23:59:59.9999999',
+ '2017-01-01 23:59:59.999999',
+ '2017-01-01 23:59:59.99999',
+ '2017-01-01 23:59:59.9999'
+ ])
+ def test_datetimeindex_highprecision(self, start_date):
+ # GH19030
+ # Check that high-precision time values for the end of day are
+ # included in repr for DatetimeIndex
+ s1 = Series(date_range(start=start_date, freq='D', periods=5))
+ result = str(s1)
+ assert start_date in result
+
+ dti = date_range(start=start_date, freq='D', periods=5)
+ s2 = Series(3, index=dti)
+ result = str(s2.index)
+ assert start_date in result
+
+ def test_timedelta64(self):
+
+ from datetime import datetime, timedelta
+
+ Series(np.array([1100, 20], dtype='timedelta64[ns]')).to_string()
+
+ s = Series(date_range('2012-1-1', periods=3, freq='D'))
+
+ # GH2146
+
+ # adding NaTs
+ y = s - s.shift(1)
+ result = y.to_string()
+ assert '1 days' in result
+ assert '00:00:00' not in result
+ assert 'NaT' in result
+
+ # with frac seconds
+ o = Series([datetime(2012, 1, 1, microsecond=150)] * 3)
+ y = s - o
+ result = y.to_string()
+ assert '-1 days +23:59:59.999850' in result
+
+ # rounding?
+ o = Series([datetime(2012, 1, 1, 1)] * 3)
+ y = s - o
+ result = y.to_string()
+ assert '-1 days +23:00:00' in result
+ assert '1 days 23:00:00' in result
+
+ o = Series([datetime(2012, 1, 1, 1, 1)] * 3)
+ y = s - o
+ result = y.to_string()
+ assert '-1 days +22:59:00' in result
+ assert '1 days 22:59:00' in result
+
+ o = Series([datetime(2012, 1, 1, 1, 1, microsecond=150)] * 3)
+ y = s - o
+ result = y.to_string()
+ assert '-1 days +22:58:59.999850' in result
+ assert '0 days 22:58:59.999850' in result
+
+ # neg time
+ td = timedelta(minutes=5, seconds=3)
+ s2 = Series(date_range('2012-1-1', periods=3, freq='D')) + td
+ y = s - s2
+ result = y.to_string()
+ assert '-1 days +23:54:57' in result
+
+ td = timedelta(microseconds=550)
+ s2 = Series(date_range('2012-1-1', periods=3, freq='D')) + td
+ y = s - td
+ result = y.to_string()
+ assert '2012-01-01 23:59:59.999450' in result
+
+ # no boxing of the actual elements
+ td = Series(pd.timedelta_range('1 days', periods=3))
+ result = td.to_string()
+ assert result == u("0 1 days\n1 2 days\n2 3 days")
+
+ def test_mixed_datetime64(self):
+ df = DataFrame({'A': [1, 2], 'B': ['2012-01-01', '2012-01-02']})
+ df['B'] = pd.to_datetime(df.B)
+
+ result = repr(df.loc[0])
+ assert '2012-01-01' in result
+
+ def test_period(self):
+ # GH 12615
+ index = pd.period_range('2013-01', periods=6, freq='M')
+ s = Series(np.arange(6, dtype='int64'), index=index)
+ exp = ("2013-01 0\n"
+ "2013-02 1\n"
+ "2013-03 2\n"
+ "2013-04 3\n"
+ "2013-05 4\n"
+ "2013-06 5\n"
+ "Freq: M, dtype: int64")
+ assert str(s) == exp
+
+ s = Series(index)
+ exp = ("0 2013-01\n"
+ "1 2013-02\n"
+ "2 2013-03\n"
+ "3 2013-04\n"
+ "4 2013-05\n"
+ "5 2013-06\n"
+ "dtype: period[M]")
+ assert str(s) == exp
+
+ # periods with mixed freq
+ s = Series([pd.Period('2011-01', freq='M'),
+ pd.Period('2011-02-01', freq='D'),
+ pd.Period('2011-03-01 09:00', freq='H')])
+ exp = ("0 2011-01\n1 2011-02-01\n"
+ "2 2011-03-01 09:00\ndtype: object")
+ assert str(s) == exp
+
+ def test_max_multi_index_display(self):
+ # GH 7101
+
+ # doc example (indexing.rst)
+
+ # multi-index
+ arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+ ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+ tuples = list(zip(*arrays))
+ index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
+ s = Series(np.random.randn(8), index=index)
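+        # expected line counts include the index-names header row and the
+        # dtype footer, not just the data rows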
+
+ with option_context("display.max_rows", 10):
+ assert len(str(s).split('\n')) == 10
+ with option_context("display.max_rows", 3):
+ assert len(str(s).split('\n')) == 5
+ with option_context("display.max_rows", 2):
+ assert len(str(s).split('\n')) == 5
+ with option_context("display.max_rows", 1):
+ assert len(str(s).split('\n')) == 4
+ with option_context("display.max_rows", 0):
+ assert len(str(s).split('\n')) == 10
+
+ # index
+ s = Series(np.random.randn(8), None)
+
+ with option_context("display.max_rows", 10):
+ assert len(str(s).split('\n')) == 9
+ with option_context("display.max_rows", 3):
+ assert len(str(s).split('\n')) == 4
+ with option_context("display.max_rows", 2):
+ assert len(str(s).split('\n')) == 4
+ with option_context("display.max_rows", 1):
+ assert len(str(s).split('\n')) == 3
+ with option_context("display.max_rows", 0):
+ assert len(str(s).split('\n')) == 9
+
+ # Make sure #8532 is fixed
+ def test_consistent_format(self):
+ s = pd.Series([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.9999, 1, 1] * 10)
+ with option_context("display.max_rows", 10,
+ "display.show_dimensions", False):
+ res = repr(s)
+ exp = ('0 1.0000\n1 1.0000\n2 1.0000\n3 '
+ '1.0000\n4 1.0000\n ... \n125 '
+ '1.0000\n126 1.0000\n127 0.9999\n128 '
+ '1.0000\n129 1.0000\ndtype: float64')
+ assert res == exp
+
+    def check_ncols(self, s):
+ with option_context("display.max_rows", 10):
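+            # drop the '...' truncation row and the dtype footer, then
+            # check that all remaining lines render at the same width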
+            res = repr(s)
+            lines = [line for line in res.split('\n')
+                     if not re.match(r'[^\.]*\.+', line)][:-1]
+ ncolsizes = len({len(line.strip()) for line in lines})
+ assert ncolsizes == 1
+
+ def test_format_explicit(self):
+ test_sers = gen_series_formatting()
+ with option_context("display.max_rows", 4,
+ "display.show_dimensions", False):
+ res = repr(test_sers['onel'])
+ exp = '0 a\n1 a\n ..\n98 a\n99 a\ndtype: object'
+ assert exp == res
+ res = repr(test_sers['twol'])
+ exp = ('0 ab\n1 ab\n ..\n98 ab\n99 ab\ndtype:'
+ ' object')
+ assert exp == res
+ res = repr(test_sers['asc'])
+ exp = ('0 a\n1 ab\n ... \n4 abcde\n5'
+ ' abcdef\ndtype: object')
+ assert exp == res
+ res = repr(test_sers['desc'])
+ exp = ('5 abcdef\n4 abcde\n ... \n1 ab\n0'
+ ' a\ndtype: object')
+ assert exp == res
+
+ def test_ncols(self):
+ test_sers = gen_series_formatting()
+ for s in test_sers.values():
+            self.check_ncols(s)
+
+ def test_max_rows_eq_one(self):
+ s = Series(range(10), dtype='int64')
+ with option_context("display.max_rows", 1):
+ strrepr = repr(s).split('\n')
+ exp1 = ['0', '0']
+ res1 = strrepr[0].split()
+ assert exp1 == res1
+ exp2 = ['..']
+ res2 = strrepr[1].split()
+ assert exp2 == res2
+
+ def test_truncate_ndots(self):
+ def getndots(s):
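+            # count the dots of the leading truncation ellipsis in the
+            # flattened repr ('..' vs '...')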
+ return len(re.match(r'[^\.]*(\.*)', s).groups()[0])
+
+ s = Series([0, 2, 3, 6])
+ with option_context("display.max_rows", 2):
+ strrepr = repr(s).replace('\n', '')
+ assert getndots(strrepr) == 2
+
+ s = Series([0, 100, 200, 400])
+ with option_context("display.max_rows", 2):
+ strrepr = repr(s).replace('\n', '')
+ assert getndots(strrepr) == 3
+
+ def test_show_dimensions(self):
+ # gh-7117
+ s = Series(range(5))
+
+ assert 'Length' not in repr(s)
+
+ with option_context("display.max_rows", 4):
+ assert 'Length' in repr(s)
+
+ with option_context("display.show_dimensions", True):
+ assert 'Length' in repr(s)
+
+ with option_context("display.max_rows", 4,
+ "display.show_dimensions", False):
+ assert 'Length' not in repr(s)
+
+ def test_to_string_name(self):
+ s = Series(range(100), dtype='int64')
+ s.name = 'myser'
+ res = s.to_string(max_rows=2, name=True)
+ exp = '0 0\n ..\n99 99\nName: myser'
+ assert res == exp
+ res = s.to_string(max_rows=2, name=False)
+ exp = '0 0\n ..\n99 99'
+ assert res == exp
+
+ def test_to_string_dtype(self):
+ s = Series(range(100), dtype='int64')
+ res = s.to_string(max_rows=2, dtype=True)
+ exp = '0 0\n ..\n99 99\ndtype: int64'
+ assert res == exp
+ res = s.to_string(max_rows=2, dtype=False)
+ exp = '0 0\n ..\n99 99'
+ assert res == exp
+
+ def test_to_string_length(self):
+ s = Series(range(100), dtype='int64')
+ res = s.to_string(max_rows=2, length=True)
+ exp = '0 0\n ..\n99 99\nLength: 100'
+ assert res == exp
+
+ def test_to_string_na_rep(self):
+ s = pd.Series(index=range(100))
+ res = s.to_string(na_rep='foo', max_rows=2)
+ exp = '0 foo\n ..\n99 foo'
+ assert res == exp
+
+ def test_to_string_float_format(self):
+ s = pd.Series(range(10), dtype='float64')
+ res = s.to_string(float_format=lambda x: '{0:2.1f}'.format(x),
+ max_rows=2)
+ exp = '0 0.0\n ..\n9 9.0'
+ assert res == exp
+
+ def test_to_string_header(self):
+ s = pd.Series(range(10), dtype='int64')
+ s.index.name = 'foo'
+ res = s.to_string(header=True, max_rows=2)
+ exp = 'foo\n0 0\n ..\n9 9'
+ assert res == exp
+ res = s.to_string(header=False, max_rows=2)
+ exp = '0 0\n ..\n9 9'
+ assert res == exp
+
+
+def _three_digit_exp():
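+    # some C runtimes (older MSVC, hence some Windows Python builds) print
+    # exponents with three digits, e.g. '1.7e+008'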
+ return '{x:.4g}'.format(x=1.7e8) == '1.7e+008'
+
+
+class TestFloatArrayFormatter(object):
+
+ def test_misc(self):
+ obj = fmt.FloatArrayFormatter(np.array([], dtype=np.float64))
+ result = obj.get_result()
+ assert len(result) == 0
+
+ def test_format(self):
+ obj = fmt.FloatArrayFormatter(np.array([12, 0], dtype=np.float64))
+ result = obj.get_result()
+ assert result[0] == " 12.0"
+ assert result[1] == " 0.0"
+
+ def test_output_significant_digits(self):
+ # Issue #9764
+
+ # In case default display precision changes:
+ with pd.option_context('display.precision', 6):
+ # DataFrame example from issue #9764
+ d = pd.DataFrame(
+ {'col1': [9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7,
+ 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6,
+ 4.999e-6, 5e-6, 5.0001e-6, 6e-6]})
+
+ expected_output = {
+ (0, 6):
+ ' col1\n'
+ '0 9.999000e-08\n'
+ '1 1.000000e-07\n'
+ '2 1.000100e-07\n'
+ '3 2.000000e-07\n'
+ '4 4.999000e-07\n'
+ '5 5.000000e-07',
+ (1, 6):
+ ' col1\n'
+ '1 1.000000e-07\n'
+ '2 1.000100e-07\n'
+ '3 2.000000e-07\n'
+ '4 4.999000e-07\n'
+ '5 5.000000e-07',
+ (1, 8):
+ ' col1\n'
+ '1 1.000000e-07\n'
+ '2 1.000100e-07\n'
+ '3 2.000000e-07\n'
+ '4 4.999000e-07\n'
+ '5 5.000000e-07\n'
+ '6 5.000100e-07\n'
+ '7 6.000000e-07',
+ (8, 16):
+ ' col1\n'
+ '8 9.999000e-07\n'
+ '9 1.000000e-06\n'
+ '10 1.000100e-06\n'
+ '11 2.000000e-06\n'
+ '12 4.999000e-06\n'
+ '13 5.000000e-06\n'
+ '14 5.000100e-06\n'
+ '15 6.000000e-06',
+ (9, 16):
+ ' col1\n'
+ '9 0.000001\n'
+ '10 0.000001\n'
+ '11 0.000002\n'
+ '12 0.000005\n'
+ '13 0.000005\n'
+ '14 0.000005\n'
+ '15 0.000006'
+ }
+
+ for (start, stop), v in expected_output.items():
+ assert str(d[start:stop]) == v
+
+ def test_too_long(self):
+ # GH 10451
+ with pd.option_context('display.precision', 4):
+ # need both a number > 1e6 and something that normally formats to
+ # having length > display.precision + 6
+ df = pd.DataFrame(dict(x=[12345.6789]))
+ assert str(df) == ' x\n0 12345.6789'
+ df = pd.DataFrame(dict(x=[2e6]))
+ assert str(df) == ' x\n0 2000000.0'
+ df = pd.DataFrame(dict(x=[12345.6789, 2e6]))
+ assert str(df) == ' x\n0 1.2346e+04\n1 2.0000e+06'
+
+
+class TestRepr_timedelta64(object):
+
+ def test_none(self):
+ delta_1d = pd.to_timedelta(1, unit='D')
+ delta_0d = pd.to_timedelta(0, unit='D')
+ delta_1s = pd.to_timedelta(1, unit='s')
+ delta_500ms = pd.to_timedelta(500, unit='ms')
+
+ drepr = lambda x: x._repr_base()
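+        # default format: the days part is always shown, the clock part
+        # only once it is non-zero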
+ assert drepr(delta_1d) == "1 days"
+ assert drepr(-delta_1d) == "-1 days"
+ assert drepr(delta_0d) == "0 days"
+ assert drepr(delta_1s) == "0 days 00:00:01"
+ assert drepr(delta_500ms) == "0 days 00:00:00.500000"
+ assert drepr(delta_1d + delta_1s) == "1 days 00:00:01"
+ assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01"
+ assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000"
+ assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000"
+
+ def test_sub_day(self):
+ delta_1d = pd.to_timedelta(1, unit='D')
+ delta_0d = pd.to_timedelta(0, unit='D')
+ delta_1s = pd.to_timedelta(1, unit='s')
+ delta_500ms = pd.to_timedelta(500, unit='ms')
+
+ drepr = lambda x: x._repr_base(format='sub_day')
+ assert drepr(delta_1d) == "1 days"
+ assert drepr(-delta_1d) == "-1 days"
+ assert drepr(delta_0d) == "00:00:00"
+ assert drepr(delta_1s) == "00:00:01"
+ assert drepr(delta_500ms) == "00:00:00.500000"
+ assert drepr(delta_1d + delta_1s) == "1 days 00:00:01"
+ assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01"
+ assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000"
+ assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000"
+
+ def test_long(self):
+ delta_1d = pd.to_timedelta(1, unit='D')
+ delta_0d = pd.to_timedelta(0, unit='D')
+ delta_1s = pd.to_timedelta(1, unit='s')
+ delta_500ms = pd.to_timedelta(500, unit='ms')
+
+ drepr = lambda x: x._repr_base(format='long')
+ assert drepr(delta_1d) == "1 days 00:00:00"
+ assert drepr(-delta_1d) == "-1 days +00:00:00"
+ assert drepr(delta_0d) == "0 days 00:00:00"
+ assert drepr(delta_1s) == "0 days 00:00:01"
+ assert drepr(delta_500ms) == "0 days 00:00:00.500000"
+ assert drepr(delta_1d + delta_1s) == "1 days 00:00:01"
+ assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01"
+ assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000"
+ assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000"
+
+ def test_all(self):
+ delta_1d = pd.to_timedelta(1, unit='D')
+ delta_0d = pd.to_timedelta(0, unit='D')
+ delta_1ns = pd.to_timedelta(1, unit='ns')
+
+ drepr = lambda x: x._repr_base(format='all')
+ assert drepr(delta_1d) == "1 days 00:00:00.000000000"
+ assert drepr(-delta_1d) == "-1 days +00:00:00.000000000"
+ assert drepr(delta_0d) == "0 days 00:00:00.000000000"
+ assert drepr(delta_1ns) == "0 days 00:00:00.000000001"
+ assert drepr(-delta_1d + delta_1ns) == "-1 days +00:00:00.000000001"
+
+
+class TestTimedelta64Formatter(object):
+
+ def test_days(self):
+ x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D')
+ result = fmt.Timedelta64Formatter(x, box=True).get_result()
+ assert result[0].strip() == "'0 days'"
+ assert result[1].strip() == "'1 days'"
+
+ result = fmt.Timedelta64Formatter(x[1:2], box=True).get_result()
+ assert result[0].strip() == "'1 days'"
+
+ result = fmt.Timedelta64Formatter(x, box=False).get_result()
+ assert result[0].strip() == "0 days"
+ assert result[1].strip() == "1 days"
+
+ result = fmt.Timedelta64Formatter(x[1:2], box=False).get_result()
+ assert result[0].strip() == "1 days"
+
+ def test_days_neg(self):
+ x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D')
+ result = fmt.Timedelta64Formatter(-x, box=True).get_result()
+ assert result[0].strip() == "'0 days'"
+ assert result[1].strip() == "'-1 days'"
+
+ def test_subdays(self):
+ y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s')
+ result = fmt.Timedelta64Formatter(y, box=True).get_result()
+ assert result[0].strip() == "'00:00:00'"
+ assert result[1].strip() == "'00:00:01'"
+
+ def test_subdays_neg(self):
+ y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s')
+ result = fmt.Timedelta64Formatter(-y, box=True).get_result()
+ assert result[0].strip() == "'00:00:00'"
+ assert result[1].strip() == "'-1 days +23:59:59'"
+
+ def test_zero(self):
+ x = pd.to_timedelta(list(range(1)) + [pd.NaT], unit='D')
+ result = fmt.Timedelta64Formatter(x, box=True).get_result()
+ assert result[0].strip() == "'0 days'"
+
+ x = pd.to_timedelta(list(range(1)), unit='D')
+ result = fmt.Timedelta64Formatter(x, box=True).get_result()
+ assert result[0].strip() == "'0 days'"
+
+
+class TestDatetime64Formatter(object):
+
+ def test_mixed(self):
+ x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), pd.NaT])
+ result = fmt.Datetime64Formatter(x).get_result()
+ assert result[0].strip() == "2013-01-01 00:00:00"
+ assert result[1].strip() == "2013-01-01 12:00:00"
+
+ def test_dates(self):
+ x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT])
+ result = fmt.Datetime64Formatter(x).get_result()
+ assert result[0].strip() == "2013-01-01"
+ assert result[1].strip() == "2013-01-02"
+
+ def test_date_nanos(self):
+ x = Series([Timestamp(200)])
+ result = fmt.Datetime64Formatter(x).get_result()
+ assert result[0].strip() == "1970-01-01 00:00:00.000000200"
+
+ def test_dates_display(self):
+
+        # GH 10170
+        # make sure that we display date formatting consistently
+ x = Series(date_range('20130101 09:00:00', periods=5, freq='D'))
+ x.iloc[1] = np.nan
+ result = fmt.Datetime64Formatter(x).get_result()
+ assert result[0].strip() == "2013-01-01 09:00:00"
+ assert result[1].strip() == "NaT"
+ assert result[4].strip() == "2013-01-05 09:00:00"
+
+ x = Series(date_range('20130101 09:00:00', periods=5, freq='s'))
+ x.iloc[1] = np.nan
+ result = fmt.Datetime64Formatter(x).get_result()
+ assert result[0].strip() == "2013-01-01 09:00:00"
+ assert result[1].strip() == "NaT"
+ assert result[4].strip() == "2013-01-01 09:00:04"
+
+ x = Series(date_range('20130101 09:00:00', periods=5, freq='ms'))
+ x.iloc[1] = np.nan
+ result = fmt.Datetime64Formatter(x).get_result()
+ assert result[0].strip() == "2013-01-01 09:00:00.000"
+ assert result[1].strip() == "NaT"
+ assert result[4].strip() == "2013-01-01 09:00:00.004"
+
+ x = Series(date_range('20130101 09:00:00', periods=5, freq='us'))
+ x.iloc[1] = np.nan
+ result = fmt.Datetime64Formatter(x).get_result()
+ assert result[0].strip() == "2013-01-01 09:00:00.000000"
+ assert result[1].strip() == "NaT"
+ assert result[4].strip() == "2013-01-01 09:00:00.000004"
+
+ x = Series(date_range('20130101 09:00:00', periods=5, freq='N'))
+ x.iloc[1] = np.nan
+ result = fmt.Datetime64Formatter(x).get_result()
+ assert result[0].strip() == "2013-01-01 09:00:00.000000000"
+ assert result[1].strip() == "NaT"
+ assert result[4].strip() == "2013-01-01 09:00:00.000000004"
+
+ def test_datetime64formatter_yearmonth(self):
+ x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)])
+
+ def format_func(x):
+ return x.strftime('%Y-%m')
+
+ formatter = fmt.Datetime64Formatter(x, formatter=format_func)
+ result = formatter.get_result()
+ assert result == ['2016-01', '2016-02']
+
+ def test_datetime64formatter_hoursecond(self):
+
+ x = Series(pd.to_datetime(['10:10:10.100', '12:12:12.120'],
+ format='%H:%M:%S.%f'))
+
+ def format_func(x):
+ return x.strftime('%H:%M')
+
+ formatter = fmt.Datetime64Formatter(x, formatter=format_func)
+ result = formatter.get_result()
+ assert result == ['10:10', '12:12']
+
+
+class TestNaTFormatting(object):
+
+ def test_repr(self):
+ assert repr(pd.NaT) == "NaT"
+
+ def test_str(self):
+ assert str(pd.NaT) == "NaT"
+
+
+class TestDatetimeIndexFormat(object):
+
+ def test_datetime(self):
+ formatted = pd.to_datetime([datetime(2003, 1, 1, 12), pd.NaT]).format()
+ assert formatted[0] == "2003-01-01 12:00:00"
+ assert formatted[1] == "NaT"
+
+ def test_date(self):
+ formatted = pd.to_datetime([datetime(2003, 1, 1), pd.NaT]).format()
+ assert formatted[0] == "2003-01-01"
+ assert formatted[1] == "NaT"
+
+ def test_date_tz(self):
+ formatted = pd.to_datetime([datetime(2013, 1, 1)], utc=True).format()
+ assert formatted[0] == "2013-01-01 00:00:00+00:00"
+
+ formatted = pd.to_datetime(
+ [datetime(2013, 1, 1), pd.NaT], utc=True).format()
+ assert formatted[0] == "2013-01-01 00:00:00+00:00"
+
+ def test_date_explicit_date_format(self):
+ formatted = pd.to_datetime([datetime(2003, 2, 1), pd.NaT]).format(
+ date_format="%m-%d-%Y", na_rep="UT")
+ assert formatted[0] == "02-01-2003"
+ assert formatted[1] == "UT"
+
+
+class TestDatetimeIndexUnicode(object):
+
+ def test_dates(self):
+        text = str(pd.to_datetime([datetime(2013, 1, 1),
+                                   datetime(2014, 1, 1)]))
+ assert "['2013-01-01'," in text
+ assert ", '2014-01-01']" in text
+
+ def test_mixed(self):
+        text = str(pd.to_datetime([datetime(2013, 1, 1),
+                                   datetime(2014, 1, 1, 12),
+                                   datetime(2014, 1, 1)]))
+ assert "'2013-01-01 00:00:00'," in text
+ assert "'2014-01-01 00:00:00']" in text
+
+
+class TestStringRepTimestamp(object):
+
+ def test_no_tz(self):
+ dt_date = datetime(2013, 1, 2)
+ assert str(dt_date) == str(Timestamp(dt_date))
+
+ dt_datetime = datetime(2013, 1, 2, 12, 1, 3)
+ assert str(dt_datetime) == str(Timestamp(dt_datetime))
+
+ dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45)
+ assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us))
+
+ ts_nanos_only = Timestamp(200)
+ assert str(ts_nanos_only) == "1970-01-01 00:00:00.000000200"
+
+ ts_nanos_micros = Timestamp(1200)
+ assert str(ts_nanos_micros) == "1970-01-01 00:00:00.000001200"
+
+ def test_tz_pytz(self):
+ dt_date = datetime(2013, 1, 2, tzinfo=pytz.utc)
+ assert str(dt_date) == str(Timestamp(dt_date))
+
+ dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=pytz.utc)
+ assert str(dt_datetime) == str(Timestamp(dt_datetime))
+
+ dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=pytz.utc)
+ assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us))
+
+ def test_tz_dateutil(self):
+ utc = dateutil.tz.tzutc()
+
+ dt_date = datetime(2013, 1, 2, tzinfo=utc)
+ assert str(dt_date) == str(Timestamp(dt_date))
+
+ dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=utc)
+ assert str(dt_datetime) == str(Timestamp(dt_datetime))
+
+ dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=utc)
+ assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us))
+
+ def test_nat_representations(self):
+ for f in (str, repr, methodcaller('isoformat')):
+ assert f(pd.NaT) == 'NaT'
+
+
+def test_format_percentiles():
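+    # format_percentiles keeps just enough decimal digits for nearby
+    # percentiles to stay distinct, trimming trailing zeros per value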
+ result = fmt.format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
+ expected = ['1.999%', '2.001%', '50%', '66.667%', '99.99%']
+ assert result == expected
+
+ result = fmt.format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
+ expected = ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
+ assert result == expected
+
+ msg = r"percentiles should all be in the interval \[0,1\]"
+ with pytest.raises(ValueError, match=msg):
+ fmt.format_percentiles([0.1, np.nan, 0.5])
+ with pytest.raises(ValueError, match=msg):
+ fmt.format_percentiles([-0.001, 0.1, 0.5])
+ with pytest.raises(ValueError, match=msg):
+ fmt.format_percentiles([2, 0.1, 0.5])
+ with pytest.raises(ValueError, match=msg):
+ fmt.format_percentiles([0.1, 0.5, 'a'])
+
+
+def test_repr_html_ipython_config(ip):
+ code = textwrap.dedent("""\
+ import pandas as pd
+ df = pd.DataFrame({"A": [1, 2]})
+ df._repr_html_()
+
+ cfg = get_ipython().config
+ cfg['IPKernelApp']['parent_appname']
+ df._repr_html_()
+ """)
+ result = ip.run_cell(code)
+ assert not result.error_in_exec
diff --git a/contrib/python/pandas/py2/pandas/tests/io/formats/test_printing.py b/contrib/python/pandas/py2/pandas/tests/io/formats/test_printing.py
new file mode 100644
index 00000000000..67ff68ac4db
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/formats/test_printing.py
@@ -0,0 +1,204 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import compat
+import pandas.core.config as cf
+
+import pandas.io.formats.format as fmt
+import pandas.io.formats.printing as printing
+
+
+def test_adjoin():
+ data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']]
+ expected = 'a dd ggg\nb ee hhh\nc ff iii'
+
+ adjoined = printing.adjoin(2, *data)
+
+ assert (adjoined == expected)
+
+
+def test_repr_binary_type():
+ import string
+ letters = string.ascii_letters
+ btype = compat.binary_type
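+    # py3 bytes() requires an encoding argument; py2 str() rejects one,
+    # hence the TypeError fallback below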
+ try:
+ raw = btype(letters, encoding=cf.get_option('display.encoding'))
+ except TypeError:
+ raw = btype(letters)
+ b = compat.text_type(compat.bytes_to_str(raw))
+ res = printing.pprint_thing(b, quote_strings=True)
+ assert res == repr(b)
+ res = printing.pprint_thing(b, quote_strings=False)
+ assert res == b
+
+
+class TestFormatBase(object):
+
+ def test_adjoin(self):
+ data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']]
+ expected = 'a dd ggg\nb ee hhh\nc ff iii'
+
+ adjoined = printing.adjoin(2, *data)
+
+ assert adjoined == expected
+
+ def test_adjoin_unicode(self):
+ data = [[u'あ', 'b', 'c'], ['dd', u'ええ', 'ff'], ['ggg', 'hhh', u'いいい']]
+ expected = u'あ dd ggg\nb ええ hhh\nc ff いいい'
+ adjoined = printing.adjoin(2, *data)
+ assert adjoined == expected
+
+ adj = fmt.EastAsianTextAdjustment()
+
+ expected = u"""あ dd ggg
+b ええ hhh
+c ff いいい"""
+
+ adjoined = adj.adjoin(2, *data)
+ assert adjoined == expected
+ cols = adjoined.split('\n')
+ assert adj.len(cols[0]) == 13
+ assert adj.len(cols[1]) == 13
+ assert adj.len(cols[2]) == 16
+
+ expected = u"""あ dd ggg
+b ええ hhh
+c ff いいい"""
+
+ adjoined = adj.adjoin(7, *data)
+ assert adjoined == expected
+ cols = adjoined.split('\n')
+ assert adj.len(cols[0]) == 23
+ assert adj.len(cols[1]) == 23
+ assert adj.len(cols[2]) == 26
+
+ def test_justify(self):
+ adj = fmt.EastAsianTextAdjustment()
+
+ def just(x, *args, **kwargs):
+ # wrapper to test single str
+ return adj.justify([x], *args, **kwargs)[0]
+
+ assert just('abc', 5, mode='left') == 'abc '
+ assert just('abc', 5, mode='center') == ' abc '
+ assert just('abc', 5, mode='right') == ' abc'
+ assert just(u'abc', 5, mode='left') == 'abc '
+ assert just(u'abc', 5, mode='center') == ' abc '
+ assert just(u'abc', 5, mode='right') == ' abc'
+
+ assert just(u'パンダ', 5, mode='left') == u'パンダ'
+ assert just(u'パンダ', 5, mode='center') == u'パンダ'
+ assert just(u'パンダ', 5, mode='right') == u'パンダ'
+
+ assert just(u'パンダ', 10, mode='left') == u'パンダ '
+ assert just(u'パンダ', 10, mode='center') == u' パンダ '
+ assert just(u'パンダ', 10, mode='right') == u' パンダ'
+
+ def test_east_asian_len(self):
+ adj = fmt.EastAsianTextAdjustment()
+
+ assert adj.len('abc') == 3
+ assert adj.len(u'abc') == 3
+
+ assert adj.len(u'パンダ') == 6
+        assert adj.len(u'ﾊﾟﾝﾀﾞ') == 5
+ assert adj.len(u'パンダpanda') == 11
+        assert adj.len(u'ﾊﾟﾝﾀﾞpanda') == 10
+
+ def test_ambiguous_width(self):
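+        # '¡' has East Asian Ambiguous width: 1 column by default,
+        # 2 columns when ambiguous_as_wide is set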
+ adj = fmt.EastAsianTextAdjustment()
+ assert adj.len(u'¡¡ab') == 4
+
+ with cf.option_context('display.unicode.ambiguous_as_wide', True):
+ adj = fmt.EastAsianTextAdjustment()
+ assert adj.len(u'¡¡ab') == 6
+
+ data = [[u'あ', 'b', 'c'], ['dd', u'ええ', 'ff'],
+ ['ggg', u'¡¡ab', u'いいい']]
+ expected = u'あ dd ggg \nb ええ ¡¡ab\nc ff いいい'
+ adjoined = adj.adjoin(2, *data)
+ assert adjoined == expected
+
+
+class TestTableSchemaRepr(object):
+
+ @classmethod
+ def setup_class(cls):
+ pytest.importorskip('IPython')
+
+ from IPython.core.interactiveshell import InteractiveShell
+ cls.display_formatter = InteractiveShell.instance().display_formatter
+
+ def test_publishes(self):
+
+ df = pd.DataFrame({"A": [1, 2]})
+ objects = [df['A'], df, df] # dataframe / series
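+        # zip stops at the shorter expected_keys list; obj stays bound to a
+        # DataFrame for the latex check below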
+ expected_keys = [
+ {'text/plain', 'application/vnd.dataresource+json'},
+ {'text/plain', 'text/html', 'application/vnd.dataresource+json'},
+ ]
+
+ opt = pd.option_context('display.html.table_schema', True)
+ for obj, expected in zip(objects, expected_keys):
+ with opt:
+ formatted = self.display_formatter.format(obj)
+ assert set(formatted[0].keys()) == expected
+
+ with_latex = pd.option_context('display.latex.repr', True)
+
+ with opt, with_latex:
+ formatted = self.display_formatter.format(obj)
+
+ expected = {'text/plain', 'text/html', 'text/latex',
+ 'application/vnd.dataresource+json'}
+ assert set(formatted[0].keys()) == expected
+
+ def test_publishes_not_implemented(self):
+ # column MultiIndex
+ # GH 15996
+ midx = pd.MultiIndex.from_product([['A', 'B'], ['a', 'b', 'c']])
+ df = pd.DataFrame(np.random.randn(5, len(midx)), columns=midx)
+
+ opt = pd.option_context('display.html.table_schema', True)
+
+ with opt:
+ formatted = self.display_formatter.format(df)
+
+ expected = {'text/plain', 'text/html'}
+ assert set(formatted[0].keys()) == expected
+
+ def test_config_on(self):
+ df = pd.DataFrame({"A": [1, 2]})
+ with pd.option_context("display.html.table_schema", True):
+ result = df._repr_data_resource_()
+
+ assert result is not None
+
+ def test_config_default_off(self):
+ df = pd.DataFrame({"A": [1, 2]})
+ with pd.option_context("display.html.table_schema", False):
+ result = df._repr_data_resource_()
+
+ assert result is None
+
+ def test_enable_data_resource_formatter(self):
+ # GH 10491
+ formatters = self.display_formatter.formatters
+ mimetype = 'application/vnd.dataresource+json'
+
+ with pd.option_context('display.html.table_schema', True):
+ assert 'application/vnd.dataresource+json' in formatters
+ assert formatters[mimetype].enabled
+
+ # still there, just disabled
+ assert 'application/vnd.dataresource+json' in formatters
+ assert not formatters[mimetype].enabled
+
+ # able to re-set
+ with pd.option_context('display.html.table_schema', True):
+ assert 'application/vnd.dataresource+json' in formatters
+ assert formatters[mimetype].enabled
+ # smoke test that it works
+ self.display_formatter.format(cf)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/formats/test_style.py b/contrib/python/pandas/py2/pandas/tests/io/formats/test_style.py
new file mode 100644
index 00000000000..407c786725f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/formats/test_style.py
@@ -0,0 +1,1315 @@
+import copy
+import re
+import textwrap
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import DataFrame
+import pandas.util.testing as tm
+
+jinja2 = pytest.importorskip('jinja2')
+from pandas.io.formats.style import Styler, _get_level_lengths # noqa # isort:skip
+
+
+class TestStyler(object):
+
+ def setup_method(self, method):
+ np.random.seed(24)
+ self.s = DataFrame({'A': np.random.permutation(range(6))})
+ self.df = DataFrame({'A': [0, 1], 'B': np.random.randn(2)})
+ self.f = lambda x: x
+ self.g = lambda x: x
+
+ def h(x, foo='bar'):
+ return pd.Series(
+ 'color: {foo}'.format(foo=foo), index=x.index, name=x.name)
+
+ self.h = h
+ self.styler = Styler(self.df)
+ self.attrs = pd.DataFrame({'A': ['color: red', 'color: blue']})
+ self.dataframes = [
+ self.df,
+ pd.DataFrame({'f': [1., 2.], 'o': ['a', 'b'],
+ 'c': pd.Categorical(['a', 'b'])})
+ ]
+
+ def test_init_non_pandas(self):
+ with pytest.raises(TypeError):
+ Styler([1, 2, 3])
+
+ def test_init_series(self):
+ result = Styler(pd.Series([1, 2]))
+ assert result.data.ndim == 2
+
+ def test_repr_html_ok(self):
+ self.styler._repr_html_()
+
+ def test_repr_html_mathjax(self):
+ # gh-19824
+ assert 'tex2jax_ignore' not in self.styler._repr_html_()
+
+ with pd.option_context('display.html.use_mathjax', False):
+ assert 'tex2jax_ignore' in self.styler._repr_html_()
+
+ def test_update_ctx(self):
+ self.styler._update_ctx(self.attrs)
+ expected = {(0, 0): ['color: red'],
+ (1, 0): ['color: blue']}
+ assert self.styler.ctx == expected
+
+ def test_update_ctx_flatten_multi(self):
+ attrs = DataFrame({"A": ['color: red; foo: bar',
+ 'color: blue; foo: baz']})
+ self.styler._update_ctx(attrs)
+ expected = {(0, 0): ['color: red', ' foo: bar'],
+ (1, 0): ['color: blue', ' foo: baz']}
+ assert self.styler.ctx == expected
+
+    def test_update_ctx_flatten_multi_trailing_semi(self):
+ attrs = DataFrame({"A": ['color: red; foo: bar;',
+ 'color: blue; foo: baz;']})
+ self.styler._update_ctx(attrs)
+ expected = {(0, 0): ['color: red', ' foo: bar'],
+ (1, 0): ['color: blue', ' foo: baz']}
+ assert self.styler.ctx == expected
+
+ def test_copy(self):
+ s2 = copy.copy(self.styler)
+ assert self.styler is not s2
+ assert self.styler.ctx is s2.ctx # shallow
+ assert self.styler._todo is s2._todo
+
+ self.styler._update_ctx(self.attrs)
+ self.styler.highlight_max()
+ assert self.styler.ctx == s2.ctx
+ assert self.styler._todo == s2._todo
+
+ def test_deepcopy(self):
+ s2 = copy.deepcopy(self.styler)
+ assert self.styler is not s2
+ assert self.styler.ctx is not s2.ctx
+ assert self.styler._todo is not s2._todo
+
+ self.styler._update_ctx(self.attrs)
+ self.styler.highlight_max()
+ assert self.styler.ctx != s2.ctx
+ assert s2._todo == []
+ assert self.styler._todo != s2._todo
+
+ def test_clear(self):
+ s = self.df.style.highlight_max()._compute()
+ assert len(s.ctx) > 0
+ assert len(s._todo) > 0
+ s.clear()
+ assert len(s.ctx) == 0
+ assert len(s._todo) == 0
+
+ def test_render(self):
+ df = pd.DataFrame({"A": [0, 1]})
+ style = lambda x: pd.Series(["color: red", "color: blue"], name=x.name)
+ s = Styler(df, uuid='AB').apply(style)
+ s.render()
+ # it worked?
+
+ def test_render_empty_dfs(self):
+ empty_df = DataFrame()
+ es = Styler(empty_df)
+ es.render()
+ # An index but no columns
+ DataFrame(columns=['a']).style.render()
+ # A column but no index
+ DataFrame(index=['a']).style.render()
+ # No IndexError raised?
+
+ def test_render_double(self):
+ df = pd.DataFrame({"A": [0, 1]})
+ style = lambda x: pd.Series(["color: red; border: 1px",
+ "color: blue; border: 2px"], name=x.name)
+ s = Styler(df, uuid='AB').apply(style)
+ s.render()
+ # it worked?
+
+ def test_set_properties(self):
+ df = pd.DataFrame({"A": [0, 1]})
+ result = df.style.set_properties(color='white',
+ size='10px')._compute().ctx
+ # order is deterministic
+ v = ["color: white", "size: 10px"]
+ expected = {(0, 0): v, (1, 0): v}
+ assert result.keys() == expected.keys()
+ for v1, v2 in zip(result.values(), expected.values()):
+ assert sorted(v1) == sorted(v2)
+
+ def test_set_properties_subset(self):
+ df = pd.DataFrame({'A': [0, 1]})
+ result = df.style.set_properties(subset=pd.IndexSlice[0, 'A'],
+ color='white')._compute().ctx
+ expected = {(0, 0): ['color: white']}
+ assert result == expected
+
+ def test_empty_index_name_doesnt_display(self):
+ # https://github.com/pandas-dev/pandas/pull/12090#issuecomment-180695902
+ df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})
+ result = df.style._translate()
+
+ expected = [[{'class': 'blank level0', 'type': 'th', 'value': '',
+ 'is_visible': True, 'display_value': ''},
+ {'class': 'col_heading level0 col0',
+ 'display_value': 'A',
+ 'type': 'th',
+ 'value': 'A',
+ 'is_visible': True,
+ },
+ {'class': 'col_heading level0 col1',
+ 'display_value': 'B',
+ 'type': 'th',
+ 'value': 'B',
+ 'is_visible': True,
+ },
+ {'class': 'col_heading level0 col2',
+ 'display_value': 'C',
+ 'type': 'th',
+ 'value': 'C',
+ 'is_visible': True,
+ }]]
+
+ assert result['head'] == expected
+
+ def test_index_name(self):
+ # https://github.com/pandas-dev/pandas/issues/11655
+ df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})
+ result = df.set_index('A').style._translate()
+
+ expected = [[{'class': 'blank level0', 'type': 'th', 'value': '',
+ 'display_value': '', 'is_visible': True},
+ {'class': 'col_heading level0 col0', 'type': 'th',
+ 'value': 'B', 'display_value': 'B', 'is_visible': True},
+ {'class': 'col_heading level0 col1', 'type': 'th',
+ 'value': 'C', 'display_value': 'C', 'is_visible': True}],
+ [{'class': 'index_name level0', 'type': 'th',
+ 'value': 'A'},
+ {'class': 'blank', 'type': 'th', 'value': ''},
+ {'class': 'blank', 'type': 'th', 'value': ''}]]
+
+ assert result['head'] == expected
+
+ def test_multiindex_name(self):
+ # https://github.com/pandas-dev/pandas/issues/11655
+ df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})
+ result = df.set_index(['A', 'B']).style._translate()
+
+ expected = [[
+ {'class': 'blank', 'type': 'th', 'value': '',
+ 'display_value': '', 'is_visible': True},
+ {'class': 'blank level0', 'type': 'th', 'value': '',
+ 'display_value': '', 'is_visible': True},
+ {'class': 'col_heading level0 col0', 'type': 'th',
+ 'value': 'C', 'display_value': 'C', 'is_visible': True}],
+ [{'class': 'index_name level0', 'type': 'th',
+ 'value': 'A'},
+ {'class': 'index_name level1', 'type': 'th',
+ 'value': 'B'},
+ {'class': 'blank', 'type': 'th', 'value': ''}]]
+
+ assert result['head'] == expected
+
+ def test_numeric_columns(self):
+ # https://github.com/pandas-dev/pandas/issues/12125
+ # smoke test for _translate
+ df = pd.DataFrame({0: [1, 2, 3]})
+ df.style._translate()
+
+ def test_apply_axis(self):
+ df = pd.DataFrame({'A': [0, 0], 'B': [1, 1]})
+ f = lambda x: ['val: {max}'.format(max=x.max()) for v in x]
+ result = df.style.apply(f, axis=1)
+ assert len(result._todo) == 1
+ assert len(result.ctx) == 0
+ result._compute()
+ expected = {(0, 0): ['val: 1'], (0, 1): ['val: 1'],
+ (1, 0): ['val: 1'], (1, 1): ['val: 1']}
+ assert result.ctx == expected
+
+ result = df.style.apply(f, axis=0)
+ expected = {(0, 0): ['val: 0'], (0, 1): ['val: 1'],
+ (1, 0): ['val: 0'], (1, 1): ['val: 1']}
+ result._compute()
+ assert result.ctx == expected
+ result = df.style.apply(f) # default
+ result._compute()
+ assert result.ctx == expected
+
+ def test_apply_subset(self):
+ axes = [0, 1]
+ slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']],
+ pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']],
+ pd.IndexSlice[:2, ['A', 'B']]]
+ for ax in axes:
+ for slice_ in slices:
+ result = self.df.style.apply(self.h, axis=ax, subset=slice_,
+ foo='baz')._compute().ctx
+ expected = {(r, c): ['color: baz']
+ for r, row in enumerate(self.df.index)
+ for c, col in enumerate(self.df.columns)
+ if row in self.df.loc[slice_].index and
+ col in self.df.loc[slice_].columns}
+ assert result == expected
+
+ def test_applymap_subset(self):
+ def f(x):
+ return 'foo: bar'
+
+ slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']],
+ pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']],
+ pd.IndexSlice[:2, ['A', 'B']]]
+
+ for slice_ in slices:
+ result = self.df.style.applymap(f, subset=slice_)._compute().ctx
+ expected = {(r, c): ['foo: bar']
+ for r, row in enumerate(self.df.index)
+ for c, col in enumerate(self.df.columns)
+ if row in self.df.loc[slice_].index and
+ col in self.df.loc[slice_].columns}
+ assert result == expected
+
+ def test_applymap_subset_multiindex(self):
+ # GH 19861
+ # Smoke test for applymap
+ def color_negative_red(val):
+ """
+ Takes a scalar and returns a string with
+ the css property `'color: red'` for negative
+ strings, black otherwise.
+ """
+ color = 'red' if val < 0 else 'black'
+ return 'color: %s' % color
+
+ dic = {
+ ('a', 'd'): [-1.12, 2.11],
+ ('a', 'c'): [2.78, -2.88],
+ ('b', 'c'): [-3.99, 3.77],
+ ('b', 'd'): [4.21, -1.22],
+ }
+
+ idx = pd.IndexSlice
+ df = pd.DataFrame(dic, index=[0, 1])
+
+ (df.style
+ .applymap(color_negative_red, subset=idx[:, idx['b', 'd']])
+ .render())
+
+ def test_where_with_one_style(self):
+ # GH 17474
+ def f(x):
+ return x > 0.5
+
+ style1 = 'foo: bar'
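+        # with no 'other' style given, cells failing the predicate get the
+        # empty style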
+
+ result = self.df.style.where(f, style1)._compute().ctx
+ expected = {(r, c): [style1 if f(self.df.loc[row, col]) else '']
+ for r, row in enumerate(self.df.index)
+ for c, col in enumerate(self.df.columns)}
+ assert result == expected
+
+ def test_where_subset(self):
+ # GH 17474
+ def f(x):
+ return x > 0.5
+
+ style1 = 'foo: bar'
+ style2 = 'baz: foo'
+
+ slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']],
+ pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']],
+ pd.IndexSlice[:2, ['A', 'B']]]
+
+ for slice_ in slices:
+ result = self.df.style.where(f, style1, style2,
+ subset=slice_)._compute().ctx
+ expected = {(r, c):
+ [style1 if f(self.df.loc[row, col]) else style2]
+ for r, row in enumerate(self.df.index)
+ for c, col in enumerate(self.df.columns)
+ if row in self.df.loc[slice_].index and
+ col in self.df.loc[slice_].columns}
+ assert result == expected
+
+ def test_where_subset_compare_with_applymap(self):
+ # GH 17474
+ def f(x):
+ return x > 0.5
+
+ style1 = 'foo: bar'
+ style2 = 'baz: foo'
+
+ def g(x):
+ return style1 if f(x) else style2
+
+ slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']],
+ pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']],
+ pd.IndexSlice[:2, ['A', 'B']]]
+
+ for slice_ in slices:
+ result = self.df.style.where(f, style1, style2,
+ subset=slice_)._compute().ctx
+ expected = self.df.style.applymap(g, subset=slice_)._compute().ctx
+ assert result == expected
+
+ def test_empty(self):
+ df = pd.DataFrame({'A': [1, 0]})
+ s = df.style
+ s.ctx = {(0, 0): ['color: red'],
+ (1, 0): ['']}
+
+ result = s._translate()['cellstyle']
+ expected = [{'props': [['color', ' red']], 'selector': 'row0_col0'},
+ {'props': [['', '']], 'selector': 'row1_col0'}]
+ assert result == expected
+
+ def test_bar_align_left(self):
+ df = pd.DataFrame({'A': [0, 1, 2]})
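+        # bars are linear-gradient backgrounds; the colored stop is the
+        # value's share of the column maximum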
+ result = df.style.bar()._compute().ctx
+ expected = {
+ (0, 0): ['width: 10em', ' height: 80%'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient('
+ '90deg,#d65f5f 50.0%, transparent 50.0%)'],
+ (2, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient('
+ '90deg,#d65f5f 100.0%, transparent 100.0%)']
+ }
+ assert result == expected
+
+ result = df.style.bar(color='red', width=50)._compute().ctx
+ expected = {
+ (0, 0): ['width: 10em', ' height: 80%'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient('
+ '90deg,red 25.0%, transparent 25.0%)'],
+ (2, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient('
+ '90deg,red 50.0%, transparent 50.0%)']
+ }
+ assert result == expected
+
+ df['C'] = ['a'] * len(df)
+ result = df.style.bar(color='red', width=50)._compute().ctx
+ assert result == expected
+ df['C'] = df['C'].astype('category')
+ result = df.style.bar(color='red', width=50)._compute().ctx
+ assert result == expected
+
+ def test_bar_align_left_0points(self):
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+ result = df.style.bar()._compute().ctx
+ expected = {(0, 0): ['width: 10em', ' height: 80%'],
+ (0, 1): ['width: 10em', ' height: 80%'],
+ (0, 2): ['width: 10em', ' height: 80%'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 50.0%,'
+ ' transparent 50.0%)'],
+ (1, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 50.0%,'
+ ' transparent 50.0%)'],
+ (1, 2): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 50.0%,'
+ ' transparent 50.0%)'],
+ (2, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 100.0%'
+ ', transparent 100.0%)'],
+ (2, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 100.0%'
+ ', transparent 100.0%)'],
+ (2, 2): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 100.0%'
+ ', transparent 100.0%)']}
+ assert result == expected
+
+ result = df.style.bar(axis=1)._compute().ctx
+ expected = {(0, 0): ['width: 10em', ' height: 80%'],
+ (0, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 50.0%,'
+ ' transparent 50.0%)'],
+ (0, 2): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 100.0%'
+ ', transparent 100.0%)'],
+ (1, 0): ['width: 10em', ' height: 80%'],
+ (1, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 50.0%'
+ ', transparent 50.0%)'],
+ (1, 2): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 100.0%'
+ ', transparent 100.0%)'],
+ (2, 0): ['width: 10em', ' height: 80%'],
+ (2, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 50.0%'
+ ', transparent 50.0%)'],
+ (2, 2): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,#d65f5f 100.0%'
+ ', transparent 100.0%)']}
+ assert result == expected
+
+ def test_bar_align_mid_pos_and_neg(self):
+ df = pd.DataFrame({'A': [-10, 0, 20, 90]})
+
+ result = df.style.bar(align='mid', color=[
+ '#d65f5f', '#5fba7d'])._compute().ctx
+
+ expected = {(0, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#d65f5f 10.0%, transparent 10.0%)'],
+ (1, 0): ['width: 10em', ' height: 80%', ],
+ (2, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 10.0%, #5fba7d 10.0%'
+ ', #5fba7d 30.0%, transparent 30.0%)'],
+ (3, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 10.0%, '
+ '#5fba7d 10.0%, #5fba7d 100.0%, '
+ 'transparent 100.0%)']}
+
+ assert result == expected
+
+ def test_bar_align_mid_all_pos(self):
+ df = pd.DataFrame({'A': [10, 20, 50, 100]})
+
+ result = df.style.bar(align='mid', color=[
+ '#d65f5f', '#5fba7d'])._compute().ctx
+
+ expected = {(0, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#5fba7d 10.0%, transparent 10.0%)'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#5fba7d 20.0%, transparent 20.0%)'],
+ (2, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#5fba7d 50.0%, transparent 50.0%)'],
+ (3, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#5fba7d 100.0%, transparent 100.0%)']}
+
+ assert result == expected
+
+ def test_bar_align_mid_all_neg(self):
+ df = pd.DataFrame({'A': [-100, -60, -30, -20]})
+
+ result = df.style.bar(align='mid', color=[
+ '#d65f5f', '#5fba7d'])._compute().ctx
+
+ expected = {(0, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#d65f5f 100.0%, transparent 100.0%)'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 40.0%, '
+ '#d65f5f 40.0%, #d65f5f 100.0%, '
+ 'transparent 100.0%)'],
+ (2, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 70.0%, '
+ '#d65f5f 70.0%, #d65f5f 100.0%, '
+ 'transparent 100.0%)'],
+ (3, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 80.0%, '
+ '#d65f5f 80.0%, #d65f5f 100.0%, '
+ 'transparent 100.0%)']}
+ assert result == expected
+
+ def test_bar_align_zero_pos_and_neg(self):
+ # See https://github.com/pandas-dev/pandas/pull/14757
+ df = pd.DataFrame({'A': [-10, 0, 20, 90]})
+
+ result = df.style.bar(align='zero', color=[
+ '#d65f5f', '#5fba7d'], width=90)._compute().ctx
+ expected = {(0, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 40.0%, #d65f5f 40.0%, '
+ '#d65f5f 45.0%, transparent 45.0%)'],
+ (1, 0): ['width: 10em', ' height: 80%'],
+ (2, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 45.0%, #5fba7d 45.0%, '
+ '#5fba7d 55.0%, transparent 55.0%)'],
+ (3, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 45.0%, #5fba7d 45.0%, '
+ '#5fba7d 90.0%, transparent 90.0%)']}
+ assert result == expected
+
+ def test_bar_align_left_axis_none(self):
+ df = pd.DataFrame({'A': [0, 1], 'B': [2, 4]})
+ result = df.style.bar(axis=None)._compute().ctx
+ expected = {
+ (0, 0): ['width: 10em', ' height: 80%'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#d65f5f 25.0%, transparent 25.0%)'],
+ (0, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#d65f5f 50.0%, transparent 50.0%)'],
+ (1, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#d65f5f 100.0%, transparent 100.0%)']
+ }
+ assert result == expected
+
+ def test_bar_align_zero_axis_none(self):
+ df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]})
+ result = df.style.bar(align='zero', axis=None)._compute().ctx
+ expected = {
+ (0, 0): ['width: 10em', ' height: 80%'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 50.0%, #d65f5f 50.0%, '
+ '#d65f5f 62.5%, transparent 62.5%)'],
+ (0, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 25.0%, #d65f5f 25.0%, '
+ '#d65f5f 50.0%, transparent 50.0%)'],
+ (1, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 50.0%, #d65f5f 50.0%, '
+ '#d65f5f 100.0%, transparent 100.0%)']
+ }
+ assert result == expected
+
+ def test_bar_align_mid_axis_none(self):
+ df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]})
+ result = df.style.bar(align='mid', axis=None)._compute().ctx
+ expected = {
+ (0, 0): ['width: 10em', ' height: 80%'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 33.3%, #d65f5f 33.3%, '
+ '#d65f5f 50.0%, transparent 50.0%)'],
+ (0, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#d65f5f 33.3%, transparent 33.3%)'],
+ (1, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 33.3%, #d65f5f 33.3%, '
+ '#d65f5f 100.0%, transparent 100.0%)']
+ }
+ assert result == expected
+
+ def test_bar_align_mid_vmin(self):
+ df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]})
+ result = df.style.bar(align='mid', axis=None, vmin=-6)._compute().ctx
+ expected = {
+ (0, 0): ['width: 10em', ' height: 80%'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 60.0%, #d65f5f 60.0%, '
+ '#d65f5f 70.0%, transparent 70.0%)'],
+ (0, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 40.0%, #d65f5f 40.0%, '
+ '#d65f5f 60.0%, transparent 60.0%)'],
+ (1, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 60.0%, #d65f5f 60.0%, '
+ '#d65f5f 100.0%, transparent 100.0%)']
+ }
+ assert result == expected
+
+ def test_bar_align_mid_vmax(self):
+ df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]})
+ result = df.style.bar(align='mid', axis=None, vmax=8)._compute().ctx
+ expected = {
+ (0, 0): ['width: 10em', ' height: 80%'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 20.0%, #d65f5f 20.0%, '
+ '#d65f5f 30.0%, transparent 30.0%)'],
+ (0, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#d65f5f 20.0%, transparent 20.0%)'],
+ (1, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 20.0%, #d65f5f 20.0%, '
+ '#d65f5f 60.0%, transparent 60.0%)']
+ }
+ assert result == expected
+
+ def test_bar_align_mid_vmin_vmax_wide(self):
+ df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]})
+ result = df.style.bar(align='mid', axis=None,
+ vmin=-3, vmax=7)._compute().ctx
+ expected = {
+ (0, 0): ['width: 10em', ' height: 80%'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 30.0%, #d65f5f 30.0%, '
+ '#d65f5f 40.0%, transparent 40.0%)'],
+ (0, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 10.0%, #d65f5f 10.0%, '
+ '#d65f5f 30.0%, transparent 30.0%)'],
+ (1, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 30.0%, #d65f5f 30.0%, '
+ '#d65f5f 70.0%, transparent 70.0%)']
+ }
+ assert result == expected
+
+ def test_bar_align_mid_vmin_vmax_clipping(self):
+ df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]})
+ result = df.style.bar(align='mid', axis=None,
+ vmin=-1, vmax=3)._compute().ctx
+ expected = {
+ (0, 0): ['width: 10em', ' height: 80%'],
+ (1, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 25.0%, #d65f5f 25.0%, '
+ '#d65f5f 50.0%, transparent 50.0%)'],
+ (0, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#d65f5f 25.0%, transparent 25.0%)'],
+ (1, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 25.0%, #d65f5f 25.0%, '
+ '#d65f5f 100.0%, transparent 100.0%)']
+ }
+ assert result == expected
+
+ def test_bar_align_mid_nans(self):
+ df = pd.DataFrame({'A': [1, None], 'B': [-1, 3]})
+ result = df.style.bar(align='mid', axis=None)._compute().ctx
+ expected = {
+ (0, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 25.0%, #d65f5f 25.0%, '
+ '#d65f5f 50.0%, transparent 50.0%)'],
+ (1, 0): [''],
+ (0, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg,'
+ '#d65f5f 25.0%, transparent 25.0%)'],
+ (1, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 25.0%, #d65f5f 25.0%, '
+ '#d65f5f 100.0%, transparent 100.0%)']
+ }
+ assert result == expected
+
+ def test_bar_align_zero_nans(self):
+ df = pd.DataFrame({'A': [1, None], 'B': [-1, 2]})
+ result = df.style.bar(align='zero', axis=None)._compute().ctx
+ expected = {
+ (0, 0): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 50.0%, #d65f5f 50.0%, '
+ '#d65f5f 75.0%, transparent 75.0%)'],
+ (1, 0): [''],
+ (0, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 25.0%, #d65f5f 25.0%, '
+ '#d65f5f 50.0%, transparent 50.0%)'],
+ (1, 1): ['width: 10em', ' height: 80%',
+ 'background: linear-gradient(90deg, '
+ 'transparent 50.0%, #d65f5f 50.0%, '
+ '#d65f5f 100.0%, transparent 100.0%)']
+ }
+ assert result == expected
+
+ def test_bar_bad_align_raises(self):
+ df = pd.DataFrame({'A': [-100, -60, -30, -20]})
+ with pytest.raises(ValueError):
+ df.style.bar(align='poorly', color=['#d65f5f', '#5fba7d'])
+
+    def test_highlight_null(self):
+ df = pd.DataFrame({'A': [0, np.nan]})
+ result = df.style.highlight_null()._compute().ctx
+ expected = {(0, 0): [''],
+ (1, 0): ['background-color: red']}
+ assert result == expected
+
+ def test_nonunique_raises(self):
+ df = pd.DataFrame([[1, 2]], columns=['A', 'A'])
+ with pytest.raises(ValueError):
+ df.style
+
+ with pytest.raises(ValueError):
+ Styler(df)
+
+ def test_caption(self):
+ styler = Styler(self.df, caption='foo')
+ result = styler.render()
+ assert all(['caption' in result, 'foo' in result])
+
+ styler = self.df.style
+ result = styler.set_caption('baz')
+ assert styler is result
+ assert styler.caption == 'baz'
+
+ def test_uuid(self):
+ styler = Styler(self.df, uuid='abc123')
+ result = styler.render()
+ assert 'abc123' in result
+
+ styler = self.df.style
+ result = styler.set_uuid('aaa')
+ assert result is styler
+ assert result.uuid == 'aaa'
+
+ def test_unique_id(self):
+ # See https://github.com/pandas-dev/pandas/issues/16780
+ df = pd.DataFrame({'a': [1, 3, 5, 6], 'b': [2, 4, 12, 21]})
+ result = df.style.render(uuid='test')
+ assert 'test' in result
+ ids = re.findall('id="(.*?)"', result)
+ assert np.unique(ids).size == len(ids)
+
+ def test_table_styles(self):
+ style = [{'selector': 'th', 'props': [('foo', 'bar')]}]
+ styler = Styler(self.df, table_styles=style)
+ result = ' '.join(styler.render().split())
+ assert 'th { foo: bar; }' in result
+
+ styler = self.df.style
+ result = styler.set_table_styles(style)
+ assert styler is result
+ assert styler.table_styles == style
+
+ def test_table_attributes(self):
+ attributes = 'class="foo" data-bar'
+ styler = Styler(self.df, table_attributes=attributes)
+ result = styler.render()
+ assert 'class="foo" data-bar' in result
+
+ result = self.df.style.set_table_attributes(attributes).render()
+ assert 'class="foo" data-bar' in result
+
+ def test_precision(self):
+ with pd.option_context('display.precision', 10):
+ s = Styler(self.df)
+ assert s.precision == 10
+ s = Styler(self.df, precision=2)
+ assert s.precision == 2
+
+ s2 = s.set_precision(4)
+ assert s is s2
+ assert s.precision == 4
+
+ def test_apply_none(self):
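+        # with axis=None the function receives the entire DataFrame in
+        # one call and must return a like-indexed frame of CSS strings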
+ def f(x):
+ return pd.DataFrame(np.where(x == x.max(), 'color: red', ''),
+ index=x.index, columns=x.columns)
+ result = (pd.DataFrame([[1, 2], [3, 4]])
+ .style.apply(f, axis=None)._compute().ctx)
+ assert result[(1, 1)] == ['color: red']
+
+ def test_trim(self):
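+        # each styled cell renders a '#T_<uuid>...' CSS selector, so
+        # counting '#' counts styled cells; unstyled cells are trimmed
+        # from the rendered style block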
+ result = self.df.style.render() # trim=True
+ assert result.count('#') == 0
+
+ result = self.df.style.highlight_max().render()
+ assert result.count('#') == len(self.df.columns)
+
+ def test_highlight_max(self):
+ df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
+ # max(df) = min(-df)
+ for max_ in [True, False]:
+ if max_:
+ attr = 'highlight_max'
+ else:
+ df = -df
+ attr = 'highlight_min'
+ result = getattr(df.style, attr)()._compute().ctx
+ assert result[(1, 1)] == ['background-color: yellow']
+
+ result = getattr(df.style, attr)(color='green')._compute().ctx
+ assert result[(1, 1)] == ['background-color: green']
+
+ result = getattr(df.style, attr)(subset='A')._compute().ctx
+ assert result[(1, 0)] == ['background-color: yellow']
+
+ result = getattr(df.style, attr)(axis=0)._compute().ctx
+ expected = {(1, 0): ['background-color: yellow'],
+ (1, 1): ['background-color: yellow'],
+ (0, 1): [''], (0, 0): ['']}
+ assert result == expected
+
+ result = getattr(df.style, attr)(axis=1)._compute().ctx
+ expected = {(0, 1): ['background-color: yellow'],
+ (1, 1): ['background-color: yellow'],
+ (0, 0): [''], (1, 0): ['']}
+ assert result == expected
+
+ # separate since we can't negate the strs
+ df['C'] = ['a', 'b']
+        result = df.style.highlight_max()._compute().ctx
+        assert result[(1, 2)] == ['background-color: yellow']  # max 'b'
+
+        result = df.style.highlight_min()._compute().ctx
+        assert result[(0, 2)] == ['background-color: yellow']  # min 'a'
+
+ def test_export(self):
+ f = lambda x: 'color: red' if x > 0 else 'color: blue'
+ g = lambda x, y, z: 'color: {z}'.format(z=z) \
+ if x > 0 else 'color: {z}'.format(z=z)
+ style1 = self.styler
+ style1.applymap(f)\
+ .applymap(g, y='a', z='b')\
+ .highlight_max()
+ result = style1.export()
+ style2 = self.df.style
+ style2.use(result)
+ assert style1._todo == style2._todo
+ style2.render()
+
+ def test_display_format(self):
+ df = pd.DataFrame(np.random.random(size=(2, 2)))
+ ctx = df.style.format("{:0.1f}")._translate()
+
+        assert all('display_value' in c
+                   for row in ctx['body'] for c in row)
+        assert all(len(c['display_value']) <= 3
+                   for row in ctx['body'] for c in row[1:])
+ assert len(ctx['body'][0][1]['display_value'].lstrip('-')) <= 3
+
+ def test_display_format_raises(self):
+ df = pd.DataFrame(np.random.randn(2, 2))
+ with pytest.raises(TypeError):
+ df.style.format(5)
+ with pytest.raises(TypeError):
+ df.style.format(True)
+
+ def test_display_subset(self):
+ df = pd.DataFrame([[.1234, .1234], [1.1234, 1.1234]],
+ columns=['a', 'b'])
+ ctx = df.style.format({"a": "{:0.1f}", "b": "{0:.2%}"},
+ subset=pd.IndexSlice[0, :])._translate()
+ expected = '0.1'
+ assert ctx['body'][0][1]['display_value'] == expected
+ assert ctx['body'][1][1]['display_value'] == '1.1234'
+ assert ctx['body'][0][2]['display_value'] == '12.34%'
+
+ raw_11 = '1.1234'
+ ctx = df.style.format("{:0.1f}",
+ subset=pd.IndexSlice[0, :])._translate()
+ assert ctx['body'][0][1]['display_value'] == expected
+ assert ctx['body'][1][1]['display_value'] == raw_11
+
+ ctx = df.style.format("{:0.1f}",
+ subset=pd.IndexSlice['a'])._translate()
+ assert ctx['body'][0][1]['display_value'] == expected
+ assert ctx['body'][0][2]['display_value'] == '0.1234'
+
+ ctx = df.style.format("{:0.1f}",
+ subset=pd.IndexSlice[0, 'a'])._translate()
+ assert ctx['body'][0][1]['display_value'] == expected
+ assert ctx['body'][1][1]['display_value'] == raw_11
+
+ ctx = df.style.format("{:0.1f}",
+ subset=pd.IndexSlice[[0, 1], ['a']])._translate()
+ assert ctx['body'][0][1]['display_value'] == expected
+ assert ctx['body'][1][1]['display_value'] == '1.1'
+ assert ctx['body'][0][2]['display_value'] == '0.1234'
+ assert ctx['body'][1][2]['display_value'] == '1.1234'
+
+ def test_display_dict(self):
+ df = pd.DataFrame([[.1234, .1234], [1.1234, 1.1234]],
+ columns=['a', 'b'])
+ ctx = df.style.format({"a": "{:0.1f}", "b": "{0:.2%}"})._translate()
+ assert ctx['body'][0][1]['display_value'] == '0.1'
+ assert ctx['body'][0][2]['display_value'] == '12.34%'
+ df['c'] = ['aaa', 'bbb']
+ ctx = df.style.format({"a": "{:0.1f}", "c": str.upper})._translate()
+ assert ctx['body'][0][1]['display_value'] == '0.1'
+ assert ctx['body'][0][3]['display_value'] == 'AAA'
+
+ def test_bad_apply_shape(self):
+ df = pd.DataFrame([[1, 2], [3, 4]])
+ with pytest.raises(ValueError):
+ df.style._apply(lambda x: 'x', subset=pd.IndexSlice[[0, 1], :])
+
+ with pytest.raises(ValueError):
+ df.style._apply(lambda x: [''], subset=pd.IndexSlice[[0, 1], :])
+
+ with pytest.raises(ValueError):
+ df.style._apply(lambda x: ['', '', '', ''])
+
+ with pytest.raises(ValueError):
+ df.style._apply(lambda x: ['', '', ''], subset=1)
+
+ with pytest.raises(ValueError):
+ df.style._apply(lambda x: ['', '', ''], axis=1)
+
+ def test_apply_bad_return(self):
+ def f(x):
+ return ''
+ df = pd.DataFrame([[1, 2], [3, 4]])
+ with pytest.raises(TypeError):
+ df.style._apply(f, axis=None)
+
+ def test_apply_bad_labels(self):
+ def f(x):
+ return pd.DataFrame(index=[1, 2], columns=['a', 'b'])
+ df = pd.DataFrame([[1, 2], [3, 4]])
+ with pytest.raises(ValueError):
+ df.style._apply(f, axis=None)
+
+ def test_get_level_lengths(self):
+ index = pd.MultiIndex.from_product([['a', 'b'], [0, 1, 2]])
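+        # expected maps (level, first_row) -> number of consecutive
+        # rows the sparsified label spans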
+ expected = {(0, 0): 3, (0, 3): 3, (1, 0): 1, (1, 1): 1, (1, 2): 1,
+ (1, 3): 1, (1, 4): 1, (1, 5): 1}
+ result = _get_level_lengths(index)
+ tm.assert_dict_equal(result, expected)
+
+ def test_get_level_lengths_un_sorted(self):
+ index = pd.MultiIndex.from_arrays([
+ [1, 1, 2, 1],
+ ['a', 'b', 'b', 'd']
+ ])
+ expected = {(0, 0): 2, (0, 2): 1, (0, 3): 1,
+ (1, 0): 1, (1, 1): 1, (1, 2): 1, (1, 3): 1}
+ result = _get_level_lengths(index)
+ tm.assert_dict_equal(result, expected)
+
+ def test_mi_sparse(self):
+ df = pd.DataFrame({'A': [1, 2]},
+ index=pd.MultiIndex.from_arrays([['a', 'a'],
+ [0, 1]]))
+
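+        # sparsified: the repeated level-0 label 'a' is rendered once
+        # with rowspan=2 and hidden on the second row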
+ result = df.style._translate()
+ body_0 = result['body'][0][0]
+ expected_0 = {
+ "value": "a", "display_value": "a", "is_visible": True,
+ "type": "th", "attributes": ["rowspan=2"],
+ "class": "row_heading level0 row0", "id": "level0_row0"
+ }
+ tm.assert_dict_equal(body_0, expected_0)
+
+ body_1 = result['body'][0][1]
+ expected_1 = {
+ "value": 0, "display_value": 0, "is_visible": True,
+ "type": "th", "class": "row_heading level1 row0",
+ "id": "level1_row0"
+ }
+ tm.assert_dict_equal(body_1, expected_1)
+
+ body_10 = result['body'][1][0]
+ expected_10 = {
+ "value": 'a', "display_value": 'a', "is_visible": False,
+ "type": "th", "class": "row_heading level0 row1",
+ "id": "level0_row1"
+ }
+ tm.assert_dict_equal(body_10, expected_10)
+
+ head = result['head'][0]
+ expected = [
+ {'type': 'th', 'class': 'blank', 'value': '',
+ 'is_visible': True, "display_value": ''},
+ {'type': 'th', 'class': 'blank level0', 'value': '',
+ 'is_visible': True, 'display_value': ''},
+ {'type': 'th', 'class': 'col_heading level0 col0', 'value': 'A',
+ 'is_visible': True, 'display_value': 'A'}]
+ assert head == expected
+
+ def test_mi_sparse_disabled(self):
+ with pd.option_context('display.multi_sparse', False):
+ df = pd.DataFrame({'A': [1, 2]},
+ index=pd.MultiIndex.from_arrays([['a', 'a'],
+ [0, 1]]))
+ result = df.style._translate()
+ body = result['body']
+ for row in body:
+ assert 'attributes' not in row[0]
+
+ def test_mi_sparse_index_names(self):
+ df = pd.DataFrame({'A': [1, 2]}, index=pd.MultiIndex.from_arrays(
+ [['a', 'a'], [0, 1]],
+ names=['idx_level_0', 'idx_level_1'])
+ )
+ result = df.style._translate()
+ head = result['head'][1]
+ expected = [{
+ 'class': 'index_name level0', 'value': 'idx_level_0',
+ 'type': 'th'},
+ {'class': 'index_name level1', 'value': 'idx_level_1',
+ 'type': 'th'},
+ {'class': 'blank', 'value': '', 'type': 'th'}]
+
+ assert head == expected
+
+ def test_mi_sparse_column_names(self):
+ df = pd.DataFrame(
+ np.arange(16).reshape(4, 4),
+ index=pd.MultiIndex.from_arrays(
+ [['a', 'a', 'b', 'a'], [0, 1, 1, 2]],
+ names=['idx_level_0', 'idx_level_1']),
+ columns=pd.MultiIndex.from_arrays(
+ [['C1', 'C1', 'C2', 'C2'], [1, 0, 1, 0]],
+ names=['col_0', 'col_1']
+ )
+ )
+ result = df.style._translate()
+ head = result['head'][1]
+ expected = [
+ {'class': 'blank', 'value': '', 'display_value': '',
+ 'type': 'th', 'is_visible': True},
+ {'class': 'index_name level1', 'value': 'col_1',
+ 'display_value': 'col_1', 'is_visible': True, 'type': 'th'},
+ {'class': 'col_heading level1 col0',
+ 'display_value': 1,
+ 'is_visible': True,
+ 'type': 'th',
+ 'value': 1},
+            {'class': 'col_heading level1 col1',
+             'display_value': 0,
+             'is_visible': True,
+             'type': 'th',
+             'value': 0},
+            {'class': 'col_heading level1 col2',
+             'display_value': 1,
+             'is_visible': True,
+             'type': 'th',
+             'value': 1},
+            {'class': 'col_heading level1 col3',
+             'display_value': 0,
+             'is_visible': True,
+             'type': 'th',
+             'value': 0},
+        ]
+ assert head == expected
+
+ def test_hide_single_index(self):
+ # GH 14194
+ # single unnamed index
+ ctx = self.df.style._translate()
+ assert ctx['body'][0][0]['is_visible']
+ assert ctx['head'][0][0]['is_visible']
+ ctx2 = self.df.style.hide_index()._translate()
+ assert not ctx2['body'][0][0]['is_visible']
+ assert not ctx2['head'][0][0]['is_visible']
+
+ # single named index
+ ctx3 = self.df.set_index('A').style._translate()
+ assert ctx3['body'][0][0]['is_visible']
+ assert len(ctx3['head']) == 2 # 2 header levels
+ assert ctx3['head'][0][0]['is_visible']
+
+ ctx4 = self.df.set_index('A').style.hide_index()._translate()
+ assert not ctx4['body'][0][0]['is_visible']
+        assert len(ctx4['head']) == 1 # only 1 header level
+ assert not ctx4['head'][0][0]['is_visible']
+
+ def test_hide_multiindex(self):
+ # GH 14194
+ df = pd.DataFrame({'A': [1, 2]}, index=pd.MultiIndex.from_arrays(
+ [['a', 'a'], [0, 1]],
+ names=['idx_level_0', 'idx_level_1'])
+ )
+ ctx1 = df.style._translate()
+ # tests for 'a' and '0'
+ assert ctx1['body'][0][0]['is_visible']
+ assert ctx1['body'][0][1]['is_visible']
+ # check for blank header rows
+ assert ctx1['head'][0][0]['is_visible']
+ assert ctx1['head'][0][1]['is_visible']
+
+ ctx2 = df.style.hide_index()._translate()
+ # tests for 'a' and '0'
+ assert not ctx2['body'][0][0]['is_visible']
+ assert not ctx2['body'][0][1]['is_visible']
+ # check for blank header rows
+ assert not ctx2['head'][0][0]['is_visible']
+ assert not ctx2['head'][0][1]['is_visible']
+
+ def test_hide_columns_single_level(self):
+ # GH 14194
+ # test hiding single column
+ ctx = self.df.style._translate()
+ assert ctx['head'][0][1]['is_visible']
+ assert ctx['head'][0][1]['display_value'] == 'A'
+ assert ctx['head'][0][2]['is_visible']
+ assert ctx['head'][0][2]['display_value'] == 'B'
+        assert ctx['body'][0][1]['is_visible'] # col A, row 0
+ assert ctx['body'][1][2]['is_visible'] # col B, row 1
+
+ ctx = self.df.style.hide_columns('A')._translate()
+ assert not ctx['head'][0][1]['is_visible']
+        assert not ctx['body'][0][1]['is_visible'] # col A, row 0
+ assert ctx['body'][1][2]['is_visible'] # col B, row 1
+
+        # test hiding multiple columns
+ ctx = self.df.style.hide_columns(['A', 'B'])._translate()
+ assert not ctx['head'][0][1]['is_visible']
+ assert not ctx['head'][0][2]['is_visible']
+        assert not ctx['body'][0][1]['is_visible'] # col A, row 0
+ assert not ctx['body'][1][2]['is_visible'] # col B, row 1
+
+ def test_hide_columns_mult_levels(self):
+ # GH 14194
+ # setup dataframe with multiple column levels and indices
+ i1 = pd.MultiIndex.from_arrays([['a', 'a'], [0, 1]],
+ names=['idx_level_0',
+ 'idx_level_1'])
+ i2 = pd.MultiIndex.from_arrays([['b', 'b'], [0, 1]],
+ names=['col_level_0',
+ 'col_level_1'])
+ df = pd.DataFrame([[1, 2], [3, 4]], index=i1, columns=i2)
+ ctx = df.style._translate()
+ # column headers
+ assert ctx['head'][0][2]['is_visible']
+ assert ctx['head'][1][2]['is_visible']
+ assert ctx['head'][1][3]['display_value'] == 1
+ # indices
+ assert ctx['body'][0][0]['is_visible']
+ # data
+ assert ctx['body'][1][2]['is_visible']
+ assert ctx['body'][1][2]['display_value'] == 3
+ assert ctx['body'][1][3]['is_visible']
+ assert ctx['body'][1][3]['display_value'] == 4
+
+ # hide top column level, which hides both columns
+ ctx = df.style.hide_columns('b')._translate()
+ assert not ctx['head'][0][2]['is_visible'] # b
+ assert not ctx['head'][1][2]['is_visible'] # 0
+ assert not ctx['body'][1][2]['is_visible'] # 3
+ assert ctx['body'][0][0]['is_visible'] # index
+
+ # hide first column only
+ ctx = df.style.hide_columns([('b', 0)])._translate()
+ assert ctx['head'][0][2]['is_visible'] # b
+ assert not ctx['head'][1][2]['is_visible'] # 0
+ assert not ctx['body'][1][2]['is_visible'] # 3
+ assert ctx['body'][1][3]['is_visible']
+ assert ctx['body'][1][3]['display_value'] == 4
+
+ # hide second column and index
+ ctx = df.style.hide_columns([('b', 1)]).hide_index()._translate()
+ assert not ctx['body'][0][0]['is_visible'] # index
+ assert ctx['head'][0][2]['is_visible'] # b
+ assert ctx['head'][1][2]['is_visible'] # 0
+ assert not ctx['head'][1][3]['is_visible'] # 1
+ assert not ctx['body'][1][3]['is_visible'] # 4
+ assert ctx['body'][1][2]['is_visible']
+ assert ctx['body'][1][2]['display_value'] == 3
+
+ def test_pipe(self):
+ def set_caption_from_template(styler, a, b):
+ return styler.set_caption(
+ 'Dataframe with a = {a} and b = {b}'.format(a=a, b=b))
+
+ styler = self.df.style.pipe(set_caption_from_template, 'A', b='B')
+ assert 'Dataframe with a = A and b = B' in styler.render()
+
+ # Test with an argument that is a (callable, keyword_name) pair.
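+        # pipe substitutes the Styler for the keyword named by the
+        # string instead of passing it as the first positional argument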
+ def f(a, b, styler):
+ return (a, b, styler)
+
+ styler = self.df.style
+ result = styler.pipe((f, 'styler'), a=1, b=2)
+ assert result == (1, 2, styler)
+
+
+class TestStylerMatplotlibDep(object):
+
+ def test_background_gradient(self):
+ df = pd.DataFrame([[1, 2], [2, 4]], columns=['A', 'B'])
+
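+        # the default axis=0 scales each column independently, so cells
+        # with the same within-column rank share a color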
+ for c_map in [None, 'YlOrRd']:
+ result = df.style.background_gradient(cmap=c_map)._compute().ctx
+ assert all("#" in x[0] for x in result.values())
+ assert result[(0, 0)] == result[(0, 1)]
+ assert result[(1, 0)] == result[(1, 1)]
+
+ result = df.style.background_gradient(
+ subset=pd.IndexSlice[1, 'A'])._compute().ctx
+
+ assert result[(1, 0)] == ['background-color: #fff7fb',
+ 'color: #000000']
+
+ @pytest.mark.parametrize(
+ 'c_map,expected', [
+ (None, {
+ (0, 0): ['background-color: #440154', 'color: #f1f1f1'],
+ (1, 0): ['background-color: #fde725', 'color: #000000']}),
+ ('YlOrRd', {
+ (0, 0): ['background-color: #ffffcc', 'color: #000000'],
+ (1, 0): ['background-color: #800026', 'color: #f1f1f1']})])
+ def test_text_color_threshold(self, c_map, expected):
+ df = pd.DataFrame([1, 2], columns=['A'])
+ result = df.style.background_gradient(cmap=c_map)._compute().ctx
+ assert result == expected
+
+ @pytest.mark.parametrize("text_color_threshold", [1.1, '1', -1, [2, 2]])
+ def test_text_color_threshold_raises(self, text_color_threshold):
+ df = pd.DataFrame([[1, 2], [2, 4]], columns=['A', 'B'])
+ msg = "`text_color_threshold` must be a value from 0 to 1."
+ with pytest.raises(ValueError, match=msg):
+ df.style.background_gradient(
+ text_color_threshold=text_color_threshold)._compute()
+
+ @td.skip_if_no_mpl
+ def test_background_gradient_axis(self):
+ df = pd.DataFrame([[1, 2], [2, 4]], columns=['A', 'B'])
+
+ low = ['background-color: #f7fbff', 'color: #000000']
+ high = ['background-color: #08306b', 'color: #f1f1f1']
+ mid = ['background-color: #abd0e6', 'color: #000000']
+ result = df.style.background_gradient(cmap='Blues',
+ axis=0)._compute().ctx
+ assert result[(0, 0)] == low
+ assert result[(0, 1)] == low
+ assert result[(1, 0)] == high
+ assert result[(1, 1)] == high
+
+ result = df.style.background_gradient(cmap='Blues',
+ axis=1)._compute().ctx
+ assert result[(0, 0)] == low
+ assert result[(0, 1)] == high
+ assert result[(1, 0)] == low
+ assert result[(1, 1)] == high
+
+ result = df.style.background_gradient(cmap='Blues',
+ axis=None)._compute().ctx
+ assert result[(0, 0)] == low
+ assert result[(0, 1)] == mid
+ assert result[(1, 0)] == mid
+ assert result[(1, 1)] == high
+
+
+def test_block_names():
+ # catch accidental removal of a block
+ expected = {
+ 'before_style', 'style', 'table_styles', 'before_cellstyle',
+ 'cellstyle', 'before_table', 'table', 'caption', 'thead', 'tbody',
+ 'after_table', 'before_head_rows', 'head_tr', 'after_head_rows',
+ 'before_rows', 'tr', 'after_rows',
+ }
+ result = set(Styler.template.blocks)
+ assert result == expected
+
+
+def test_from_custom_template(tmpdir):
+ p = tmpdir.mkdir("templates").join("myhtml.tpl")
+ p.write(textwrap.dedent("""\
+ {% extends "html.tpl" %}
+ {% block table %}
+ <h1>{{ table_title|default("My Table") }}</h1>
+ {{ super() }}
+ {% endblock table %}"""))
+ result = Styler.from_custom_template(str(tmpdir.join('templates')),
+ 'myhtml.tpl')
+ assert issubclass(result, Styler)
+ assert result.env is not Styler.env
+ assert result.template is not Styler.template
+ styler = result(pd.DataFrame({"A": [1, 2]}))
+ assert styler.render()
diff --git a/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_csv.py b/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_csv.py
new file mode 100644
index 00000000000..1929817a49b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_csv.py
@@ -0,0 +1,563 @@
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, compat
+from pandas.util import testing as tm
+
+
+class TestToCSV(object):
+
+ @pytest.mark.xfail((3, 6, 5) > sys.version_info >= (3, 5),
+ reason=("Python csv library bug "
+ "(see https://bugs.python.org/issue32255)"))
+ def test_to_csv_with_single_column(self):
+ # see gh-18676, https://bugs.python.org/issue32255
+ #
+ # Python's CSV library adds an extraneous '""'
+        # before the newline when the NaN value is in
+ # the first row. Otherwise, only the newline
+ # character is added. This behavior is inconsistent
+ # and was patched in https://bugs.python.org/pull_request4672.
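+        #
+        # For reference, the quirk concerns rows holding a single empty
+        # field, which the csv module must quote so the line is not
+        # empty (illustrative, Python 3):
+        #
+        #     import csv, io
+        #     buf = io.StringIO()
+        #     csv.writer(buf).writerow([''])
+        #     buf.getvalue()  # '""\r\n'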
+ df1 = DataFrame([None, 1])
+ expected1 = """\
+""
+1.0
+"""
+ with tm.ensure_clean('test.csv') as path:
+ df1.to_csv(path, header=None, index=None)
+ with open(path, 'r') as f:
+ assert f.read() == expected1
+
+ df2 = DataFrame([1, None])
+ expected2 = """\
+1.0
+""
+"""
+ with tm.ensure_clean('test.csv') as path:
+ df2.to_csv(path, header=None, index=None)
+ with open(path, 'r') as f:
+ assert f.read() == expected2
+
+    def test_to_csv_default_encoding(self):
+ # GH17097
+ df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]})
+
+ with tm.ensure_clean('test.csv') as path:
+            # the default to_csv encoding is ascii in Python 2 and
+            # utf-8 in Python 3.
+            if pd.compat.PY2:
+                # writing non-ascii data requires an explicit encoding
+                # argument such as encoding='utf-8'
+ with pytest.raises(UnicodeEncodeError, match='ascii'):
+ df.to_csv(path)
+ else:
+ df.to_csv(path)
+ tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)
+
+ def test_to_csv_quotechar(self):
+ df = DataFrame({'col': [1, 2]})
+ expected = """\
+"","col"
+"0","1"
+"1","2"
+"""
+
+ with tm.ensure_clean('test.csv') as path:
+ df.to_csv(path, quoting=1) # 1=QUOTE_ALL
+ with open(path, 'r') as f:
+ assert f.read() == expected
+
+ expected = """\
+$$,$col$
+$0$,$1$
+$1$,$2$
+"""
+
+ with tm.ensure_clean('test.csv') as path:
+ df.to_csv(path, quoting=1, quotechar="$")
+ with open(path, 'r') as f:
+ assert f.read() == expected
+
+ with tm.ensure_clean('test.csv') as path:
+ with pytest.raises(TypeError, match='quotechar'):
+ df.to_csv(path, quoting=1, quotechar=None)
+
+ def test_to_csv_doublequote(self):
+ df = DataFrame({'col': ['a"a', '"bb"']})
+ expected = '''\
+"","col"
+"0","a""a"
+"1","""bb"""
+'''
+
+ with tm.ensure_clean('test.csv') as path:
+ df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL
+ with open(path, 'r') as f:
+ assert f.read() == expected
+
+ from _csv import Error
+ with tm.ensure_clean('test.csv') as path:
+ with pytest.raises(Error, match='escapechar'):
+ df.to_csv(path, doublequote=False) # no escapechar set
+
+ def test_to_csv_escapechar(self):
+ df = DataFrame({'col': ['a"a', '"bb"']})
+ expected = '''\
+"","col"
+"0","a\\"a"
+"1","\\"bb\\""
+'''
+
+ with tm.ensure_clean('test.csv') as path: # QUOTE_ALL
+ df.to_csv(path, quoting=1, doublequote=False, escapechar='\\')
+ with open(path, 'r') as f:
+ assert f.read() == expected
+
+ df = DataFrame({'col': ['a,a', ',bb,']})
+ expected = """\
+,col
+0,a\\,a
+1,\\,bb\\,
+"""
+
+ with tm.ensure_clean('test.csv') as path:
+ df.to_csv(path, quoting=3, escapechar='\\') # QUOTE_NONE
+ with open(path, 'r') as f:
+ assert f.read() == expected
+
+ def test_csv_to_string(self):
+ df = DataFrame({'col': [1, 2]})
+ expected_rows = [',col',
+ '0,1',
+ '1,2']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert df.to_csv() == expected
+
+ def test_to_csv_decimal(self):
+ # see gh-781
+ df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})
+
+ expected_rows = [',col1,col2,col3',
+ '0,1,a,10.1']
+ expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert df.to_csv() == expected_default
+
+ expected_rows = [';col1;col2;col3',
+ '0;1;a;10,1']
+ expected_european_excel = tm.convert_rows_list_to_csv_str(
+ expected_rows)
+ assert df.to_csv(decimal=',', sep=';') == expected_european_excel
+
+ expected_rows = [',col1,col2,col3',
+ '0,1,a,10.10']
+ expected_float_format_default = tm.convert_rows_list_to_csv_str(
+ expected_rows)
+ assert df.to_csv(float_format='%.2f') == expected_float_format_default
+
+ expected_rows = [';col1;col2;col3',
+ '0;1;a;10,10']
+ expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert df.to_csv(decimal=',', sep=';',
+ float_format='%.2f') == expected_float_format
+
+ # see gh-11553: testing if decimal is taken into account for '0.0'
+ df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
+
+ expected_rows = ['a,b,c',
+ '0^0,2^2,1',
+ '1^1,3^3,1']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert df.to_csv(index=False, decimal='^') == expected
+
+ # same but for an index
+ assert df.set_index('a').to_csv(decimal='^') == expected
+
+ # same for a multi-index
+ assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected
+
+ def test_to_csv_float_format(self):
+ # testing if float_format is taken into account for the index
+ # GH 11553
+ df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1})
+
+ expected_rows = ['a,b,c',
+ '0,2.20,1',
+ '1,3.30,1']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert df.set_index('a').to_csv(float_format='%.2f') == expected
+
+ # same for a multi-index
+ assert df.set_index(['a', 'b']).to_csv(
+ float_format='%.2f') == expected
+
+ def test_to_csv_na_rep(self):
+ # see gh-11553
+ #
+ # Testing if NaN values are correctly represented in the index.
+ df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]})
+ expected_rows = ['a,b,c',
+ '0.0,0,2',
+ '_,1,3']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
+ assert df.set_index('a').to_csv(na_rep='_') == expected
+ assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected
+
+ # now with an index containing only NaNs
+ df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]})
+ expected_rows = ['a,b,c',
+ '_,0,2',
+ '_,1,3']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
+ assert df.set_index('a').to_csv(na_rep='_') == expected
+ assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected
+
+ # check if na_rep parameter does not break anything when no NaN
+ df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]})
+ expected_rows = ['a,b,c',
+ '0,0,2',
+ '0,1,3']
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
+ assert df.set_index('a').to_csv(na_rep='_') == expected
+ assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected
+
+ def test_to_csv_date_format(self):
+ # GH 10209
+        df_sec = DataFrame({'A': pd.date_range('20130101', periods=5,
+                                               freq='s')})
+        df_day = DataFrame({'A': pd.date_range('20130101', periods=5,
+                                               freq='d')})
+
+ expected_rows = [',A',
+ '0,2013-01-01 00:00:00',
+ '1,2013-01-01 00:00:01',
+ '2,2013-01-01 00:00:02',
+ '3,2013-01-01 00:00:03',
+ '4,2013-01-01 00:00:04']
+ expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert df_sec.to_csv() == expected_default_sec
+
+ expected_rows = [',A',
+ '0,2013-01-01 00:00:00',
+ '1,2013-01-02 00:00:00',
+ '2,2013-01-03 00:00:00',
+ '3,2013-01-04 00:00:00',
+ '4,2013-01-05 00:00:00']
+ expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert (df_day.to_csv(date_format='%Y-%m-%d %H:%M:%S') ==
+ expected_ymdhms_day)
+
+ expected_rows = [',A',
+ '0,2013-01-01',
+ '1,2013-01-01',
+ '2,2013-01-01',
+ '3,2013-01-01',
+ '4,2013-01-01']
+ expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert df_sec.to_csv(date_format='%Y-%m-%d') == expected_ymd_sec
+
+ expected_rows = [',A',
+ '0,2013-01-01',
+ '1,2013-01-02',
+ '2,2013-01-03',
+ '3,2013-01-04',
+ '4,2013-01-05']
+ expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert df_day.to_csv() == expected_default_day
+ assert df_day.to_csv(date_format='%Y-%m-%d') == expected_default_day
+
+ # see gh-7791
+ #
+ # Testing if date_format parameter is taken into account
+ # for multi-indexed DataFrames.
+ df_sec['B'] = 0
+ df_sec['C'] = 1
+
+ expected_rows = ['A,B,C',
+ '2013-01-01,0,1']
+ expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
+
+ df_sec_grouped = df_sec.groupby([pd.Grouper(key='A', freq='1h'), 'B'])
+ assert (df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d') ==
+ expected_ymd_sec)
+
+ def test_to_csv_multi_index(self):
+ # see gh-6618
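+        # a MultiIndex on the columns writes one header row per level,
+        # with the index slot left blank in those rows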
+ df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))
+
+ exp_rows = [',1',
+ ',2',
+ '0,1']
+ exp = tm.convert_rows_list_to_csv_str(exp_rows)
+ assert df.to_csv() == exp
+
+ exp_rows = ['1', '2', '1']
+ exp = tm.convert_rows_list_to_csv_str(exp_rows)
+ assert df.to_csv(index=False) == exp
+
+ df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]),
+ index=pd.MultiIndex.from_arrays([[1], [2]]))
+
+ exp_rows = [',,1', ',,2', '1,2,1']
+ exp = tm.convert_rows_list_to_csv_str(exp_rows)
+ assert df.to_csv() == exp
+
+ exp_rows = ['1', '2', '1']
+ exp = tm.convert_rows_list_to_csv_str(exp_rows)
+ assert df.to_csv(index=False) == exp
+
+ df = DataFrame(
+ [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']]))
+
+ exp_rows = [',foo', ',bar', '0,1']
+ exp = tm.convert_rows_list_to_csv_str(exp_rows)
+ assert df.to_csv() == exp
+
+ exp_rows = ['foo', 'bar', '1']
+ exp = tm.convert_rows_list_to_csv_str(exp_rows)
+ assert df.to_csv(index=False) == exp
+
+ @pytest.mark.parametrize("ind,expected", [
+ (pd.MultiIndex(levels=[[1.0]],
+ codes=[[0]],
+ names=["x"]),
+ "x,data\n1.0,1\n"),
+ (pd.MultiIndex(levels=[[1.], [2.]],
+ codes=[[0], [0]],
+ names=["x", "y"]),
+ "x,y,data\n1.0,2.0,1\n")
+ ])
+ @pytest.mark.parametrize("klass", [
+ pd.DataFrame, pd.Series
+ ])
+ def test_to_csv_single_level_multi_index(self, ind, expected, klass):
+ # see gh-19589
+ result = klass(pd.Series([1], ind, name="data")).to_csv(
+ line_terminator="\n", header=True)
+ assert result == expected
+
+ def test_to_csv_string_array_ascii(self):
+ # GH 10813
+ str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
+ df = pd.DataFrame(str_array)
+ expected_ascii = '''\
+,names
+0,"['foo', 'bar']"
+1,"['baz', 'qux']"
+'''
+ with tm.ensure_clean('str_test.csv') as path:
+ df.to_csv(path, encoding='ascii')
+ with open(path, 'r') as f:
+ assert f.read() == expected_ascii
+
+ @pytest.mark.xfail
+ def test_to_csv_string_array_utf8(self):
+ # GH 10813
+ str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
+ df = pd.DataFrame(str_array)
+ expected_utf8 = '''\
+,names
+0,"[u'foo', u'bar']"
+1,"[u'baz', u'qux']"
+'''
+ with tm.ensure_clean('unicode_test.csv') as path:
+ df.to_csv(path, encoding='utf-8')
+ with open(path, 'r') as f:
+ assert f.read() == expected_utf8
+
+ def test_to_csv_string_with_lf(self):
+ # GH 20353
+ data = {
+ 'int': [1, 2, 3],
+ 'str_lf': ['abc', 'd\nef', 'g\nh\n\ni']
+ }
+ df = pd.DataFrame(data)
+ with tm.ensure_clean('lf_test.csv') as path:
+            # case 1: the default line terminator, os.linesep (PR 21406)
+ os_linesep = os.linesep.encode('utf-8')
+ expected_noarg = (
+ b'int,str_lf' + os_linesep +
+ b'1,abc' + os_linesep +
+ b'2,"d\nef"' + os_linesep +
+ b'3,"g\nh\n\ni"' + os_linesep
+ )
+ df.to_csv(path, index=False)
+ with open(path, 'rb') as f:
+ assert f.read() == expected_noarg
+ with tm.ensure_clean('lf_test.csv') as path:
+ # case 2: LF as line terminator
+ expected_lf = (
+ b'int,str_lf\n'
+ b'1,abc\n'
+ b'2,"d\nef"\n'
+ b'3,"g\nh\n\ni"\n'
+ )
+ df.to_csv(path, line_terminator='\n', index=False)
+ with open(path, 'rb') as f:
+ assert f.read() == expected_lf
+ with tm.ensure_clean('lf_test.csv') as path:
+ # case 3: CRLF as line terminator
+ # 'line_terminator' should not change inner element
+ expected_crlf = (
+ b'int,str_lf\r\n'
+ b'1,abc\r\n'
+ b'2,"d\nef"\r\n'
+ b'3,"g\nh\n\ni"\r\n'
+ )
+ df.to_csv(path, line_terminator='\r\n', index=False)
+ with open(path, 'rb') as f:
+ assert f.read() == expected_crlf
+
+ def test_to_csv_string_with_crlf(self):
+ # GH 20353
+ data = {
+ 'int': [1, 2, 3],
+ 'str_crlf': ['abc', 'd\r\nef', 'g\r\nh\r\n\r\ni']
+ }
+ df = pd.DataFrame(data)
+ with tm.ensure_clean('crlf_test.csv') as path:
+            # case 1: the default line terminator, os.linesep (PR 21406)
+ os_linesep = os.linesep.encode('utf-8')
+ expected_noarg = (
+ b'int,str_crlf' + os_linesep +
+ b'1,abc' + os_linesep +
+ b'2,"d\r\nef"' + os_linesep +
+ b'3,"g\r\nh\r\n\r\ni"' + os_linesep
+ )
+ df.to_csv(path, index=False)
+ with open(path, 'rb') as f:
+ assert f.read() == expected_noarg
+ with tm.ensure_clean('crlf_test.csv') as path:
+ # case 2: LF as line terminator
+ expected_lf = (
+ b'int,str_crlf\n'
+ b'1,abc\n'
+ b'2,"d\r\nef"\n'
+ b'3,"g\r\nh\r\n\r\ni"\n'
+ )
+ df.to_csv(path, line_terminator='\n', index=False)
+ with open(path, 'rb') as f:
+ assert f.read() == expected_lf
+ with tm.ensure_clean('crlf_test.csv') as path:
+ # case 3: CRLF as line terminator
+ # 'line_terminator' should not change inner element
+ expected_crlf = (
+ b'int,str_crlf\r\n'
+ b'1,abc\r\n'
+ b'2,"d\r\nef"\r\n'
+ b'3,"g\r\nh\r\n\r\ni"\r\n'
+ )
+ df.to_csv(path, line_terminator='\r\n', index=False)
+ with open(path, 'rb') as f:
+ assert f.read() == expected_crlf
+
+ def test_to_csv_stdout_file(self, capsys):
+ # GH 21561
+ df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']],
+ columns=['name_1', 'name_2'])
+ expected_rows = [',name_1,name_2',
+ '0,foo,bar',
+ '1,baz,qux']
+ expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows)
+
+ df.to_csv(sys.stdout, encoding='ascii')
+ captured = capsys.readouterr()
+
+ assert captured.out == expected_ascii
+ assert not sys.stdout.closed
+
+ @pytest.mark.xfail(
+ compat.is_platform_windows(),
+ reason=("Especially in Windows, file stream should not be passed"
+ "to csv writer without newline='' option."
+ "(https://docs.python.org/3.6/library/csv.html#csv.writer)"))
+ def test_to_csv_write_to_open_file(self):
+ # GH 21696
+ df = pd.DataFrame({'a': ['x', 'y', 'z']})
+ expected = '''\
+manual header
+x
+y
+z
+'''
+ with tm.ensure_clean('test.txt') as path:
+ with open(path, 'w') as f:
+ f.write('manual header\n')
+ df.to_csv(f, header=None, index=None)
+ with open(path, 'r') as f:
+ assert f.read() == expected
+
+ @pytest.mark.skipif(compat.PY2, reason="Test case for python3")
+ def test_to_csv_write_to_open_file_with_newline_py3(self):
+ # see gh-21696
+ # see gh-20353
+ df = pd.DataFrame({'a': ['x', 'y', 'z']})
+ expected_rows = ["x",
+ "y",
+ "z"]
+ expected = ("manual header\n" +
+ tm.convert_rows_list_to_csv_str(expected_rows))
+ with tm.ensure_clean('test.txt') as path:
+ with open(path, 'w', newline='') as f:
+ f.write('manual header\n')
+ df.to_csv(f, header=None, index=None)
+
+ with open(path, 'rb') as f:
+ assert f.read() == bytes(expected, 'utf-8')
+
+ @pytest.mark.skipif(compat.PY3, reason="Test case for python2")
+ def test_to_csv_write_to_open_file_with_newline_py2(self):
+ # see gh-21696
+ # see gh-20353
+ df = pd.DataFrame({'a': ['x', 'y', 'z']})
+ expected_rows = ["x",
+ "y",
+ "z"]
+ expected = ("manual header\n" +
+ tm.convert_rows_list_to_csv_str(expected_rows))
+ with tm.ensure_clean('test.txt') as path:
+ with open(path, 'wb') as f:
+ f.write('manual header\n')
+ df.to_csv(f, header=None, index=None)
+
+ with open(path, 'rb') as f:
+ assert f.read() == expected
+
+ @pytest.mark.parametrize("to_infer", [True, False])
+ @pytest.mark.parametrize("read_infer", [True, False])
+ def test_to_csv_compression(self, compression_only,
+ read_infer, to_infer):
+ # see gh-15008
+ compression = compression_only
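+        # compression_only is a fixture parametrized over the supported
+        # compression codecs, so each run exercises exactly one of them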
+
+ if compression == "zip":
+ pytest.skip("{compression} is not supported "
+ "for to_csv".format(compression=compression))
+
+        # The file extension is appended below, based on the compression.
+ filename = "test."
+
+ if compression == "gzip":
+ filename += "gz"
+ else:
+ # xz --> .xz
+ # bz2 --> .bz2
+ filename += compression
+
+ df = DataFrame({"A": [1]})
+
+ to_compression = "infer" if to_infer else compression
+ read_compression = "infer" if read_infer else compression
+
+ with tm.ensure_clean(filename) as path:
+ df.to_csv(path, compression=to_compression)
+ result = pd.read_csv(path, index_col=0,
+ compression=read_compression)
+ tm.assert_frame_equal(result, df)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_excel.py b/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_excel.py
new file mode 100644
index 00000000000..13eb517fcab
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_excel.py
@@ -0,0 +1,278 @@
+"""Tests formatting as writer-agnostic ExcelCells
+
+ExcelFormatter is tested implicitly in pandas/tests/io/test_excel.py
+"""
+
+import pytest
+
+import pandas.util.testing as tm
+
+from pandas.io.formats.css import CSSWarning
+from pandas.io.formats.excel import CSSToExcelConverter
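+
+# A CSSToExcelConverter instance is called with a CSS declaration string
+# and returns the nested style dict consumed by the Excel writers, e.g.
+# CSSToExcelConverter()('font-weight: bold') -> {'font': {'bold': True}}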
+
+
[email protected]('css,expected', [
+ # FONT
+ # - name
+ ('font-family: foo,bar', {'font': {'name': 'foo'}}),
+ ('font-family: "foo bar",baz', {'font': {'name': 'foo bar'}}),
+ ('font-family: foo,\nbar', {'font': {'name': 'foo'}}),
+ ('font-family: foo, bar, baz', {'font': {'name': 'foo'}}),
+ ('font-family: bar, foo', {'font': {'name': 'bar'}}),
+ ('font-family: \'foo bar\', baz', {'font': {'name': 'foo bar'}}),
+ ('font-family: \'foo \\\'bar\', baz', {'font': {'name': 'foo \'bar'}}),
+ ('font-family: "foo \\"bar", baz', {'font': {'name': 'foo "bar'}}),
+ ('font-family: "foo ,bar", baz', {'font': {'name': 'foo ,bar'}}),
+ # - family
+ ('font-family: serif', {'font': {'name': 'serif', 'family': 1}}),
+ ('font-family: Serif', {'font': {'name': 'serif', 'family': 1}}),
+ ('font-family: roman, serif', {'font': {'name': 'roman', 'family': 1}}),
+ ('font-family: roman, sans-serif', {'font': {'name': 'roman',
+ 'family': 2}}),
+ ('font-family: roman, sans serif', {'font': {'name': 'roman'}}),
+ ('font-family: roman, sansserif', {'font': {'name': 'roman'}}),
+ ('font-family: roman, cursive', {'font': {'name': 'roman', 'family': 4}}),
+ ('font-family: roman, fantasy', {'font': {'name': 'roman', 'family': 5}}),
+ # - size
+ ('font-size: 1em', {'font': {'size': 12}}),
+ ('font-size: xx-small', {'font': {'size': 6}}),
+ ('font-size: x-small', {'font': {'size': 7.5}}),
+ ('font-size: small', {'font': {'size': 9.6}}),
+ ('font-size: medium', {'font': {'size': 12}}),
+ ('font-size: large', {'font': {'size': 13.5}}),
+ ('font-size: x-large', {'font': {'size': 18}}),
+ ('font-size: xx-large', {'font': {'size': 24}}),
+ ('font-size: 50%', {'font': {'size': 6}}),
+ # - bold
+ ('font-weight: 100', {'font': {'bold': False}}),
+ ('font-weight: 200', {'font': {'bold': False}}),
+ ('font-weight: 300', {'font': {'bold': False}}),
+ ('font-weight: 400', {'font': {'bold': False}}),
+ ('font-weight: normal', {'font': {'bold': False}}),
+ ('font-weight: lighter', {'font': {'bold': False}}),
+ ('font-weight: bold', {'font': {'bold': True}}),
+ ('font-weight: bolder', {'font': {'bold': True}}),
+ ('font-weight: 700', {'font': {'bold': True}}),
+ ('font-weight: 800', {'font': {'bold': True}}),
+ ('font-weight: 900', {'font': {'bold': True}}),
+ # - italic
+ ('font-style: italic', {'font': {'italic': True}}),
+ ('font-style: oblique', {'font': {'italic': True}}),
+ # - underline
+ ('text-decoration: underline',
+ {'font': {'underline': 'single'}}),
+ ('text-decoration: overline',
+ {}),
+ ('text-decoration: none',
+ {}),
+ # - strike
+ ('text-decoration: line-through',
+ {'font': {'strike': True}}),
+ ('text-decoration: underline line-through',
+ {'font': {'strike': True, 'underline': 'single'}}),
+ ('text-decoration: underline; text-decoration: line-through',
+ {'font': {'strike': True}}),
+ # - color
+ ('color: red', {'font': {'color': 'FF0000'}}),
+ ('color: #ff0000', {'font': {'color': 'FF0000'}}),
+ ('color: #f0a', {'font': {'color': 'FF00AA'}}),
+ # - shadow
+ ('text-shadow: none', {'font': {'shadow': False}}),
+ ('text-shadow: 0px -0em 0px #CCC', {'font': {'shadow': False}}),
+ ('text-shadow: 0px -0em 0px #999', {'font': {'shadow': False}}),
+ ('text-shadow: 0px -0em 0px', {'font': {'shadow': False}}),
+ ('text-shadow: 2px -0em 0px #CCC', {'font': {'shadow': True}}),
+ ('text-shadow: 0px -2em 0px #CCC', {'font': {'shadow': True}}),
+ ('text-shadow: 0px -0em 2px #CCC', {'font': {'shadow': True}}),
+ ('text-shadow: 0px -0em 2px', {'font': {'shadow': True}}),
+ ('text-shadow: 0px -2em', {'font': {'shadow': True}}),
+
+ # FILL
+ # - color, fillType
+ ('background-color: red', {'fill': {'fgColor': 'FF0000',
+ 'patternType': 'solid'}}),
+ ('background-color: #ff0000', {'fill': {'fgColor': 'FF0000',
+ 'patternType': 'solid'}}),
+ ('background-color: #f0a', {'fill': {'fgColor': 'FF00AA',
+ 'patternType': 'solid'}}),
+ # BORDER
+ # - style
+ ('border-style: solid',
+ {'border': {'top': {'style': 'medium'},
+ 'bottom': {'style': 'medium'},
+ 'left': {'style': 'medium'},
+ 'right': {'style': 'medium'}}}),
+ ('border-style: solid; border-width: thin',
+ {'border': {'top': {'style': 'thin'},
+ 'bottom': {'style': 'thin'},
+ 'left': {'style': 'thin'},
+ 'right': {'style': 'thin'}}}),
+
+ ('border-top-style: solid; border-top-width: thin',
+ {'border': {'top': {'style': 'thin'}}}),
+ ('border-top-style: solid; border-top-width: 1pt',
+ {'border': {'top': {'style': 'thin'}}}),
+ ('border-top-style: solid',
+ {'border': {'top': {'style': 'medium'}}}),
+ ('border-top-style: solid; border-top-width: medium',
+ {'border': {'top': {'style': 'medium'}}}),
+ ('border-top-style: solid; border-top-width: 2pt',
+ {'border': {'top': {'style': 'medium'}}}),
+ ('border-top-style: solid; border-top-width: thick',
+ {'border': {'top': {'style': 'thick'}}}),
+ ('border-top-style: solid; border-top-width: 4pt',
+ {'border': {'top': {'style': 'thick'}}}),
+
+ ('border-top-style: dotted',
+ {'border': {'top': {'style': 'mediumDashDotDot'}}}),
+ ('border-top-style: dotted; border-top-width: thin',
+ {'border': {'top': {'style': 'dotted'}}}),
+ ('border-top-style: dashed',
+ {'border': {'top': {'style': 'mediumDashed'}}}),
+ ('border-top-style: dashed; border-top-width: thin',
+ {'border': {'top': {'style': 'dashed'}}}),
+ ('border-top-style: double',
+ {'border': {'top': {'style': 'double'}}}),
+ # - color
+ ('border-style: solid; border-color: #0000ff',
+ {'border': {'top': {'style': 'medium', 'color': '0000FF'},
+ 'right': {'style': 'medium', 'color': '0000FF'},
+ 'bottom': {'style': 'medium', 'color': '0000FF'},
+ 'left': {'style': 'medium', 'color': '0000FF'}}}),
+ ('border-top-style: double; border-top-color: blue',
+ {'border': {'top': {'style': 'double', 'color': '0000FF'}}}),
+ ('border-top-style: solid; border-top-color: #06c',
+ {'border': {'top': {'style': 'medium', 'color': '0066CC'}}}),
+ # ALIGNMENT
+ # - horizontal
+ ('text-align: center',
+ {'alignment': {'horizontal': 'center'}}),
+ ('text-align: left',
+ {'alignment': {'horizontal': 'left'}}),
+ ('text-align: right',
+ {'alignment': {'horizontal': 'right'}}),
+ ('text-align: justify',
+ {'alignment': {'horizontal': 'justify'}}),
+ # - vertical
+ ('vertical-align: top',
+ {'alignment': {'vertical': 'top'}}),
+ ('vertical-align: text-top',
+ {'alignment': {'vertical': 'top'}}),
+ ('vertical-align: middle',
+ {'alignment': {'vertical': 'center'}}),
+ ('vertical-align: bottom',
+ {'alignment': {'vertical': 'bottom'}}),
+ ('vertical-align: text-bottom',
+ {'alignment': {'vertical': 'bottom'}}),
+ # - wrap_text
+ ('white-space: nowrap',
+ {'alignment': {'wrap_text': False}}),
+ ('white-space: pre',
+ {'alignment': {'wrap_text': False}}),
+ ('white-space: pre-line',
+ {'alignment': {'wrap_text': False}}),
+ ('white-space: normal',
+ {'alignment': {'wrap_text': True}}),
+ # NUMBER FORMAT
+ ('number-format: 0%',
+ {'number_format': {'format_code': '0%'}}),
+])
+def test_css_to_excel(css, expected):
+ convert = CSSToExcelConverter()
+ assert expected == convert(css)
+
+
+def test_css_to_excel_multiple():
+ convert = CSSToExcelConverter()
+ actual = convert('''
+ font-weight: bold;
+ text-decoration: underline;
+ color: red;
+ border-width: thin;
+ text-align: center;
+ vertical-align: top;
+ unused: something;
+ ''')
+ assert {"font": {"bold": True, "underline": "single", "color": "FF0000"},
+ "border": {"top": {"style": "thin"},
+ "right": {"style": "thin"},
+ "bottom": {"style": "thin"},
+ "left": {"style": "thin"}},
+ "alignment": {"horizontal": "center",
+ "vertical": "top"}} == actual
+
+
[email protected]('css,inherited,expected', [
+ ('font-weight: bold', '',
+ {'font': {'bold': True}}),
+ ('', 'font-weight: bold',
+ {'font': {'bold': True}}),
+ ('font-weight: bold', 'font-style: italic',
+ {'font': {'bold': True, 'italic': True}}),
+ ('font-style: normal', 'font-style: italic',
+ {'font': {'italic': False}}),
+ ('font-style: inherit', '', {}),
+ ('font-style: normal; font-style: inherit', 'font-style: italic',
+ {'font': {'italic': True}}),
+])
+def test_css_to_excel_inherited(css, inherited, expected):
+ convert = CSSToExcelConverter(inherited)
+ assert expected == convert(css)
+
+
[email protected]("input_color,output_color", (
+ [(name, rgb) for name, rgb in CSSToExcelConverter.NAMED_COLORS.items()] +
+ [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] +
+ [("#F0F", "FF00FF"), ("#ABC", "AABBCC")])
+)
+def test_css_to_excel_good_colors(input_color, output_color):
+ # see gh-18392
+ css = ("border-top-color: {color}; "
+ "border-right-color: {color}; "
+ "border-bottom-color: {color}; "
+ "border-left-color: {color}; "
+ "background-color: {color}; "
+ "color: {color}").format(color=input_color)
+
+ expected = dict()
+
+ expected["fill"] = {
+ "patternType": "solid",
+ "fgColor": output_color
+ }
+
+ expected["font"] = {
+ "color": output_color
+ }
+
+ expected["border"] = {
+ k: {
+ "color": output_color,
+ } for k in ("top", "right", "bottom", "left")
+ }
+
+ with tm.assert_produces_warning(None):
+ convert = CSSToExcelConverter()
+ assert expected == convert(css)
+
+
[email protected]("input_color", [None, "not-a-color"])
+def test_css_to_excel_bad_colors(input_color):
+ # see gh-18392
+ css = ("border-top-color: {color}; "
+ "border-right-color: {color}; "
+ "border-bottom-color: {color}; "
+ "border-left-color: {color}; "
+ "background-color: {color}; "
+ "color: {color}").format(color=input_color)
+
+ expected = dict()
+
+ if input_color is not None:
+ expected["fill"] = {
+ "patternType": "solid"
+ }
+
+ with tm.assert_produces_warning(CSSWarning):
+ convert = CSSToExcelConverter()
+ assert expected == convert(css)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_html.py b/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_html.py
new file mode 100644
index 00000000000..554cfd306e2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_html.py
@@ -0,0 +1,602 @@
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+from io import open
+import re
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO, lrange, u
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, compat, option_context
+from pandas.util import testing as tm
+
+import pandas.io.formats.format as fmt
+
+
+def expected_html(datapath, name):
+ """
+ Read HTML file from formats data directory.
+
+ Parameters
+ ----------
+ datapath : pytest fixture
+ The datapath fixture injected into a test by pytest.
+ name : str
+ The name of the HTML file without the suffix.
+
+ Returns
+ -------
+ str : contents of HTML file.
+ """
+ filename = '.'.join([name, 'html'])
+ filepath = datapath('io', 'formats', 'data', 'html', filename)
+ with open(filepath, encoding='utf-8') as f:
+ html = f.read()
+ return html.rstrip()
+
+
[email protected](params=['mixed', 'empty'])
+def biggie_df_fixture(request):
+ """Fixture for a big mixed Dataframe and an empty Dataframe"""
+ if request.param == 'mixed':
+ df = DataFrame({'A': np.random.randn(200),
+ 'B': tm.makeStringIndex(200)},
+ index=lrange(200))
+ df.loc[:20, 'A'] = np.nan
+ df.loc[:20, 'B'] = np.nan
+ return df
+ elif request.param == 'empty':
+ df = DataFrame(index=np.arange(200))
+ return df
+
+
[email protected](params=fmt._VALID_JUSTIFY_PARAMETERS)
+def justify(request):
+ return request.param
+
+
[email protected]('col_space', [30, 50])
+def test_to_html_with_col_space(col_space):
+ df = DataFrame(np.random.random(size=(1, 3)))
+ # check that col_space affects HTML generation
+    # and is deliberately brittle about it.
+    result = df.to_html(col_space=col_space)
+    # restrict the check to the column headers: body index cells are
+    # plain <th> elements without the min-width style
+    thead = result.split("</thead>")[0]
+    hdrs = [x for x in thead.split("\n") if re.search(r"<th[>\s]", x)]
+ assert len(hdrs) > 0
+ for h in hdrs:
+ assert "min-width" in h
+ assert str(col_space) in h
+
+
+def test_to_html_with_empty_string_label():
+ # GH 3547, to_html regards empty string labels as repeated labels
+ data = {'c1': ['a', 'b'], 'c2': ['a', ''], 'data': [1, 2]}
+ df = DataFrame(data).set_index(['c1', 'c2'])
+ result = df.to_html()
+ assert "rowspan" not in result
+
+
[email protected]('df,expected', [
+ (DataFrame({u('\u03c3'): np.arange(10.)}), 'unicode_1'),
+ (DataFrame({'A': [u('\u03c3')]}), 'unicode_2')
+])
+def test_to_html_unicode(df, expected, datapath):
+ expected = expected_html(datapath, expected)
+ result = df.to_html()
+ assert result == expected
+
+
+def test_to_html_decimal(datapath):
+ # GH 12031
+ df = DataFrame({'A': [6.0, 3.1, 2.2]})
+ result = df.to_html(decimal=',')
+ expected = expected_html(datapath, 'gh12031_expected_output')
+ assert result == expected
+
+
[email protected]('kwargs,string,expected', [
+ (dict(), "<type 'str'>", 'escaped'),
+ (dict(escape=False), "<b>bold</b>", 'escape_disabled')
+])
+def test_to_html_escaped(kwargs, string, expected, datapath):
+ a = 'str<ing1 &amp;'
+ b = 'stri>ng2 &amp;'
+
+ test_dict = {'co<l1': {a: string,
+ b: string},
+ 'co>l2': {a: string,
+ b: string}}
+ result = DataFrame(test_dict).to_html(**kwargs)
+ expected = expected_html(datapath, expected)
+ assert result == expected
+
+
[email protected]('index_is_named', [True, False])
+def test_to_html_multiindex_index_false(index_is_named, datapath):
+ # GH 8452
+ df = DataFrame({
+ 'a': range(2),
+ 'b': range(3, 5),
+ 'c': range(5, 7),
+ 'd': range(3, 5)
+ })
+ df.columns = MultiIndex.from_product([['a', 'b'], ['c', 'd']])
+ if index_is_named:
+ df.index = Index(df.index.values, name='idx')
+ result = df.to_html(index=False)
+ expected = expected_html(datapath, 'gh8452_expected_output')
+ assert result == expected
+
+
[email protected]('multi_sparse,expected', [
+ (False, 'multiindex_sparsify_false_multi_sparse_1'),
+ (False, 'multiindex_sparsify_false_multi_sparse_2'),
+ (True, 'multiindex_sparsify_1'),
+ (True, 'multiindex_sparsify_2')
+])
+def test_to_html_multiindex_sparsify(multi_sparse, expected, datapath):
+ index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]],
+ names=['foo', None])
+ df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index)
+ if expected.endswith('2'):
+ df.columns = index[::2]
+ with option_context('display.multi_sparse', multi_sparse):
+ result = df.to_html()
+ expected = expected_html(datapath, expected)
+ assert result == expected
+
+
[email protected]('max_rows,expected', [
+ (60, 'gh14882_expected_output_1'),
+
+ # Test that ... appears in a middle level
+ (56, 'gh14882_expected_output_2')
+])
+def test_to_html_multiindex_odd_even_truncate(max_rows, expected, datapath):
+ # GH 14882 - Issue on truncation with odd length DataFrame
+ index = MultiIndex.from_product([[100, 200, 300],
+ [10, 20, 30],
+ [1, 2, 3, 4, 5, 6, 7]],
+ names=['a', 'b', 'c'])
+ df = DataFrame({'n': range(len(index))}, index=index)
+ result = df.to_html(max_rows=max_rows)
+ expected = expected_html(datapath, expected)
+ assert result == expected
+
+
[email protected]('df,formatters,expected', [
+ (DataFrame(
+ [[0, 1], [2, 3], [4, 5], [6, 7]],
+ columns=['foo', None], index=lrange(4)),
+     {'__index__': lambda x: 'abcd'[x]},
+ 'index_formatter'),
+
+ (DataFrame(
+ {'months': [datetime(2016, 1, 1), datetime(2016, 2, 2)]}),
+ {'months': lambda x: x.strftime('%Y-%m')},
+ 'datetime64_monthformatter'),
+
+ (DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'],
+ format='%H:%M:%S.%f')}),
+ {'hod': lambda x: x.strftime('%H:%M')},
+ 'datetime64_hourformatter')
+])
+def test_to_html_formatters(df, formatters, expected, datapath):
+ expected = expected_html(datapath, expected)
+ result = df.to_html(formatters=formatters)
+ assert result == expected
+
+
+def test_to_html_regression_GH6098():
+ df = DataFrame({
+ u('clé1'): [u('a'), u('a'), u('b'), u('b'), u('a')],
+ u('clé2'): [u('1er'), u('2ème'), u('1er'), u('2ème'), u('1er')],
+ 'données1': np.random.randn(5),
+ 'données2': np.random.randn(5)})
+
+ # it works
+ df.pivot_table(index=[u('clé1')], columns=[u('clé2')])._repr_html_()
+
+
+def test_to_html_truncate(datapath):
+ index = pd.date_range(start='20010101', freq='D', periods=20)
+ df = DataFrame(index=index, columns=range(20))
+ result = df.to_html(max_rows=8, max_cols=4)
+ expected = expected_html(datapath, 'truncate')
+ assert result == expected
+
+
[email protected]('sparsify,expected', [
+ (True, 'truncate_multi_index'),
+ (False, 'truncate_multi_index_sparse_off')
+])
+def test_to_html_truncate_multi_index(sparsify, expected, datapath):
+ arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+ ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+ df = DataFrame(index=arrays, columns=arrays)
+ result = df.to_html(max_rows=7, max_cols=7, sparsify=sparsify)
+ expected = expected_html(datapath, expected)
+ assert result == expected
+
+
[email protected]('option,result,expected', [
+ (None, lambda df: df.to_html(), '1'),
+ (None, lambda df: df.to_html(border=0), '0'),
+ (0, lambda df: df.to_html(), '0'),
+ (0, lambda df: df._repr_html_(), '0'),
+])
+def test_to_html_border(option, result, expected):
+ df = DataFrame({'A': [1, 2]})
+ if option is None:
+ result = result(df)
+ else:
+ with option_context('display.html.border', option):
+ result = result(df)
+ expected = 'border="{}"'.format(expected)
+ assert expected in result
+
+
+def test_display_option_warning():
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ pd.options.html.border
+
+
[email protected]('biggie_df_fixture', ['mixed'], indirect=True)
+def test_to_html(biggie_df_fixture):
+ # TODO: split this test
+ df = biggie_df_fixture
+ s = df.to_html()
+
+ buf = StringIO()
+ retval = df.to_html(buf=buf)
+ assert retval is None
+ assert buf.getvalue() == s
+
+ assert isinstance(s, compat.string_types)
+
+ df.to_html(columns=['B', 'A'], col_space=17)
+ df.to_html(columns=['B', 'A'],
+ formatters={'A': lambda x: '{x:.1f}'.format(x=x)})
+
+ df.to_html(columns=['B', 'A'], float_format=str)
+ df.to_html(columns=['B', 'A'], col_space=12, float_format=str)
+
+
[email protected]('biggie_df_fixture', ['empty'], indirect=True)
+def test_to_html_empty_dataframe(biggie_df_fixture):
+ df = biggie_df_fixture
+ df.to_html()
+
+
+def test_to_html_filename(biggie_df_fixture, tmpdir):
+ df = biggie_df_fixture
+ expected = df.to_html()
+ path = tmpdir.join('test.html')
+ df.to_html(path)
+ result = path.read()
+ assert result == expected
+
+
+def test_to_html_with_no_bold():
+    df = DataFrame({'x': np.random.randn(5)})
+    html = df.to_html(bold_rows=False)
+    # slice from the end of the header so the whole body is inspected,
+    # not just the single character at that offset
+    result = html[html.find("</thead>"):]
+    assert '<strong' not in result
+
+
+def test_to_html_columns_arg():
+ df = DataFrame(tm.getSeriesData())
+ result = df.to_html(columns=['A'])
+ assert '<th>B</th>' not in result
+
+
[email protected]('columns,justify,expected', [
+ (MultiIndex.from_tuples(
+ list(zip(np.arange(2).repeat(2), np.mod(lrange(4), 2))),
+ names=['CL0', 'CL1']),
+ 'left',
+ 'multiindex_1'),
+
+ (MultiIndex.from_tuples(
+ list(zip(range(4), np.mod(lrange(4), 2)))),
+ 'right',
+ 'multiindex_2')
+])
+def test_to_html_multiindex(columns, justify, expected, datapath):
+ df = DataFrame([list('abcd'), list('efgh')], columns=columns)
+ result = df.to_html(justify=justify)
+ expected = expected_html(datapath, expected)
+ assert result == expected
+
+
+def test_to_html_justify(justify, datapath):
+ df = DataFrame({'A': [6, 30000, 2],
+ 'B': [1, 2, 70000],
+ 'C': [223442, 0, 1]},
+ columns=['A', 'B', 'C'])
+ result = df.to_html(justify=justify)
+ expected = expected_html(datapath, 'justify').format(justify=justify)
+ assert result == expected
+
+
[email protected]("justify", ["super-right", "small-left",
+ "noinherit", "tiny", "pandas"])
+def test_to_html_invalid_justify(justify):
+ # GH 17527
+ df = DataFrame()
+ msg = "Invalid value for justify parameter"
+
+ with pytest.raises(ValueError, match=msg):
+ df.to_html(justify=justify)
+
+
+def test_to_html_index(datapath):
+ # TODO: split this test
+ index = ['foo', 'bar', 'baz']
+ df = DataFrame({'A': [1, 2, 3],
+ 'B': [1.2, 3.4, 5.6],
+ 'C': ['one', 'two', np.nan]},
+ columns=['A', 'B', 'C'],
+ index=index)
+ expected_with_index = expected_html(datapath, 'index_1')
+ assert df.to_html() == expected_with_index
+
+ expected_without_index = expected_html(datapath, 'index_2')
+ result = df.to_html(index=False)
+ for i in index:
+ assert i not in result
+ assert result == expected_without_index
+ df.index = Index(['foo', 'bar', 'baz'], name='idx')
+ expected_with_index = expected_html(datapath, 'index_3')
+ assert df.to_html() == expected_with_index
+ assert df.to_html(index=False) == expected_without_index
+
+ tuples = [('foo', 'car'), ('foo', 'bike'), ('bar', 'car')]
+ df.index = MultiIndex.from_tuples(tuples)
+
+ expected_with_index = expected_html(datapath, 'index_4')
+ assert df.to_html() == expected_with_index
+
+ result = df.to_html(index=False)
+ for i in ['foo', 'bar', 'car', 'bike']:
+ assert i not in result
+ # must be the same result as normal index
+ assert result == expected_without_index
+
+ df.index = MultiIndex.from_tuples(tuples, names=['idx1', 'idx2'])
+ expected_with_index = expected_html(datapath, 'index_5')
+ assert df.to_html() == expected_with_index
+ assert df.to_html(index=False) == expected_without_index
+
+
[email protected]('classes', [
+ "sortable draggable",
+ ["sortable", "draggable"]
+])
+def test_to_html_with_classes(classes, datapath):
+ df = DataFrame()
+ expected = expected_html(datapath, 'with_classes')
+ result = df.to_html(classes=classes)
+ assert result == expected
+
+
+def test_to_html_no_index_max_rows(datapath):
+ # GH 14998
+ df = DataFrame({"A": [1, 2, 3, 4]})
+ result = df.to_html(index=False, max_rows=1)
+ expected = expected_html(datapath, 'gh14998_expected_output')
+ assert result == expected
+
+
+def test_to_html_multiindex_max_cols(datapath):
+ # GH 6131
+ index = MultiIndex(levels=[['ba', 'bb', 'bc'], ['ca', 'cb', 'cc']],
+ codes=[[0, 1, 2], [0, 1, 2]],
+ names=['b', 'c'])
+ columns = MultiIndex(levels=[['d'], ['aa', 'ab', 'ac']],
+ codes=[[0, 0, 0], [0, 1, 2]],
+ names=[None, 'a'])
+ data = np.array(
+ [[1., np.nan, np.nan], [np.nan, 2., np.nan], [np.nan, np.nan, 3.]])
+ df = DataFrame(data, index, columns)
+ result = df.to_html(max_cols=2)
+ expected = expected_html(datapath, 'gh6131_expected_output')
+ assert result == expected
+
+
+def test_to_html_multi_indexes_index_false(datapath):
+ # GH 22579
+ df = DataFrame({'a': range(10), 'b': range(10, 20), 'c': range(10, 20),
+ 'd': range(10, 20)})
+ df.columns = MultiIndex.from_product([['a', 'b'], ['c', 'd']])
+ df.index = MultiIndex.from_product([['a', 'b'],
+ ['c', 'd', 'e', 'f', 'g']])
+ result = df.to_html(index=False)
+ expected = expected_html(datapath, 'gh22579_expected_output')
+ assert result == expected
+
+
[email protected]('index_names', [True, False])
[email protected]('header', [True, False])
[email protected]('index', [True, False])
[email protected]('column_index, column_type', [
+ (Index([0, 1]), 'unnamed_standard'),
+ (Index([0, 1], name='columns.name'), 'named_standard'),
+ (MultiIndex.from_product([['a'], ['b', 'c']]), 'unnamed_multi'),
+ (MultiIndex.from_product(
+ [['a'], ['b', 'c']], names=['columns.name.0',
+ 'columns.name.1']), 'named_multi')
+])
[email protected]('row_index, row_type', [
+ (Index([0, 1]), 'unnamed_standard'),
+ (Index([0, 1], name='index.name'), 'named_standard'),
+ (MultiIndex.from_product([['a'], ['b', 'c']]), 'unnamed_multi'),
+ (MultiIndex.from_product(
+ [['a'], ['b', 'c']], names=['index.name.0',
+ 'index.name.1']), 'named_multi')
+])
+def test_to_html_basic_alignment(
+ datapath, row_index, row_type, column_index, column_type,
+ index, header, index_names):
+ # GH 22747, GH 22579
+ df = DataFrame(np.zeros((2, 2), dtype=int),
+ index=row_index, columns=column_index)
+ result = df.to_html(
+ index=index, header=header, index_names=index_names)
+
+ if not index:
+ row_type = 'none'
+ elif not index_names and row_type.startswith('named'):
+ row_type = 'un' + row_type
+
+ if not header:
+ column_type = 'none'
+ elif not index_names and column_type.startswith('named'):
+ column_type = 'un' + column_type
+
+ filename = 'index_' + row_type + '_columns_' + column_type
+ expected = expected_html(datapath, filename)
+ assert result == expected
+
+
[email protected]('index_names', [True, False])
[email protected]('header', [True, False])
[email protected]('index', [True, False])
[email protected]('column_index, column_type', [
+ (Index(np.arange(8)), 'unnamed_standard'),
+ (Index(np.arange(8), name='columns.name'), 'named_standard'),
+ (MultiIndex.from_product(
+ [['a', 'b'], ['c', 'd'], ['e', 'f']]), 'unnamed_multi'),
+ (MultiIndex.from_product(
+ [['a', 'b'], ['c', 'd'], ['e', 'f']], names=['foo', None, 'baz']),
+ 'named_multi')
+])
[email protected]('row_index, row_type', [
+ (Index(np.arange(8)), 'unnamed_standard'),
+ (Index(np.arange(8), name='index.name'), 'named_standard'),
+ (MultiIndex.from_product(
+ [['a', 'b'], ['c', 'd'], ['e', 'f']]), 'unnamed_multi'),
+ (MultiIndex.from_product(
+ [['a', 'b'], ['c', 'd'], ['e', 'f']], names=['foo', None, 'baz']),
+ 'named_multi')
+])
+def test_to_html_alignment_with_truncation(
+ datapath, row_index, row_type, column_index, column_type,
+ index, header, index_names):
+ # GH 22747, GH 22579
+ df = DataFrame(np.arange(64).reshape(8, 8),
+ index=row_index, columns=column_index)
+ result = df.to_html(
+ max_rows=4, max_cols=4,
+ index=index, header=header, index_names=index_names)
+
+ if not index:
+ row_type = 'none'
+ elif not index_names and row_type.startswith('named'):
+ row_type = 'un' + row_type
+
+ if not header:
+ column_type = 'none'
+ elif not index_names and column_type.startswith('named'):
+ column_type = 'un' + column_type
+
+ filename = 'trunc_df_index_' + row_type + '_columns_' + column_type
+ expected = expected_html(datapath, filename)
+ assert result == expected
+
+
[email protected]('index', [False, 0])
+def test_to_html_truncation_index_false_max_rows(datapath, index):
+ # GH 15019
+ data = [[1.764052, 0.400157],
+ [0.978738, 2.240893],
+ [1.867558, -0.977278],
+ [0.950088, -0.151357],
+ [-0.103219, 0.410599]]
+ df = DataFrame(data)
+ result = df.to_html(max_rows=4, index=index)
+ expected = expected_html(datapath, 'gh15019_expected_output')
+ assert result == expected
+
+
[email protected]('index', [False, 0])
[email protected]('col_index_named, expected_output', [
+ (False, 'gh22783_expected_output'),
+ (True, 'gh22783_named_columns_index')
+])
+def test_to_html_truncation_index_false_max_cols(
+ datapath, index, col_index_named, expected_output):
+ # GH 22783
+ data = [[1.764052, 0.400157, 0.978738, 2.240893, 1.867558],
+ [-0.977278, 0.950088, -0.151357, -0.103219, 0.410599]]
+ df = DataFrame(data)
+ if col_index_named:
+ df.columns.rename('columns.name', inplace=True)
+ result = df.to_html(max_cols=4, index=index)
+ expected = expected_html(datapath, expected_output)
+ assert result == expected
+
+
[email protected]('notebook', [True, False])
+def test_to_html_notebook_has_style(notebook):
+ df = DataFrame({"A": [1, 2, 3]})
+ result = df.to_html(notebook=notebook)
+
+ if notebook:
+ assert "tbody tr th:only-of-type" in result
+ assert "vertical-align: middle;" in result
+ assert "thead th" in result
+ else:
+ assert "tbody tr th:only-of-type" not in result
+ assert "vertical-align: middle;" not in result
+ assert "thead th" not in result
+
+
+def test_to_html_with_index_names_false():
+ # GH 16493
+ df = DataFrame({"A": [1, 2]}, index=Index(['a', 'b'],
+ name='myindexname'))
+ result = df.to_html(index_names=False)
+ assert 'myindexname' not in result
+
+
+def test_to_html_with_id():
+ # GH 8496
+ df = DataFrame({"A": [1, 2]}, index=Index(['a', 'b'],
+ name='myindexname'))
+ result = df.to_html(index_names=False, table_id="TEST_ID")
+ assert ' id="TEST_ID"' in result
+
+
[email protected]('value,float_format,expected', [
+ (0.19999, '%.3f', 'gh21625_expected_output'),
+ (100.0, '%.0f', 'gh22270_expected_output'),
+])
+def test_to_html_float_format_no_fixed_width(
+ value, float_format, expected, datapath):
+ # GH 21625, GH 22270
+ df = DataFrame({'x': [value]})
+ expected = expected_html(datapath, expected)
+ result = df.to_html(float_format=float_format)
+ assert result == expected
+
+
[email protected]("render_links,expected", [
+ (True, 'render_links_true'),
+ (False, 'render_links_false'),
+])
+def test_to_html_render_links(render_links, expected, datapath):
+ # GH 2679
+ data = [
+ [0, 'http://pandas.pydata.org/?q1=a&q2=b', 'pydata.org'],
+ [0, 'www.pydata.org', 'pydata.org']
+ ]
+ df = DataFrame(data, columns=['foo', 'bar', None])
+
+ result = df.to_html(render_links=render_links)
+ expected = expected_html(datapath, expected)
+ assert result == expected
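+
+
+# A minimal sketch of the render_links behavior checked above: with
+# render_links=True, cell values that look like URLs are emitted as
+# <a href="..."> anchors. The URL below is illustrative.
+def _render_links_sketch():
+    df = DataFrame({'url': ['http://pandas.pydata.org']})
+    return '<a href=' in df.to_html(render_links=True)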
diff --git a/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_latex.py b/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_latex.py
new file mode 100644
index 00000000000..1653e474aa7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/formats/test_to_latex.py
@@ -0,0 +1,737 @@
+import codecs
+from datetime import datetime
+
+import pytest
+
+from pandas.compat import u
+
+import pandas as pd
+from pandas import DataFrame, Series, compat
+from pandas.util import testing as tm
+
+
+# ``frame`` is requested as a fixture by the tests below, so it needs the
+# fixture decorator
[email protected]
+def frame():
+    return DataFrame(tm.getSeriesData())
+
+
+class TestToLatex(object):
+
+ def test_to_latex_filename(self, frame):
+ with tm.ensure_clean('test.tex') as path:
+ frame.to_latex(path)
+
+ with open(path, 'r') as f:
+ assert frame.to_latex() == f.read()
+
+ # test with utf-8 and encoding option (GH 7061)
+ df = DataFrame([[u'au\xdfgangen']])
+ with tm.ensure_clean('test.tex') as path:
+ df.to_latex(path, encoding='utf-8')
+ with codecs.open(path, 'r', encoding='utf-8') as f:
+ assert df.to_latex() == f.read()
+
+ # test with utf-8 without encoding option
+ if compat.PY3: # python3: pandas default encoding is utf-8
+ with tm.ensure_clean('test.tex') as path:
+ df.to_latex(path)
+ with codecs.open(path, 'r', encoding='utf-8') as f:
+ assert df.to_latex() == f.read()
+ else:
+ # python2 default encoding is ascii, so an error should be raised
+ with tm.ensure_clean('test.tex') as path:
+ with pytest.raises(UnicodeEncodeError):
+ df.to_latex(path)
+
+ def test_to_latex(self, frame):
+ # it works!
+ frame.to_latex()
+
+ df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+ withindex_result = df.to_latex()
+ withindex_expected = r"""\begin{tabular}{lrl}
+\toprule
+{} & a & b \\
+\midrule
+0 & 1 & b1 \\
+1 & 2 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert withindex_result == withindex_expected
+
+ withoutindex_result = df.to_latex(index=False)
+ withoutindex_expected = r"""\begin{tabular}{rl}
+\toprule
+ a & b \\
+\midrule
+ 1 & b1 \\
+ 2 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert withoutindex_result == withoutindex_expected
+
+ def test_to_latex_format(self, frame):
+ # GH Bug #9402
+ frame.to_latex(column_format='ccc')
+
+ df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+ withindex_result = df.to_latex(column_format='ccc')
+ withindex_expected = r"""\begin{tabular}{ccc}
+\toprule
+{} & a & b \\
+\midrule
+0 & 1 & b1 \\
+1 & 2 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert withindex_result == withindex_expected
+
+ def test_to_latex_empty(self):
+ df = DataFrame()
+ result = df.to_latex()
+ expected = r"""\begin{tabular}{l}
+\toprule
+Empty DataFrame
+Columns: Index([], dtype='object')
+Index: Index([], dtype='object') \\
+\bottomrule
+\end{tabular}
+"""
+ assert result == expected
+
+ result = df.to_latex(longtable=True)
+ expected = r"""\begin{longtable}{l}
+\toprule
+Empty DataFrame
+Columns: Index([], dtype='object')
+Index: Index([], dtype='object') \\
+\end{longtable}
+"""
+ assert result == expected
+
+ def test_to_latex_with_formatters(self):
+ df = DataFrame({'datetime64': [datetime(2016, 1, 1),
+ datetime(2016, 2, 5),
+ datetime(2016, 3, 3)],
+ 'float': [1.0, 2.0, 3.0],
+ 'int': [1, 2, 3],
+ 'object': [(1, 2), True, False],
+ })
+
+ formatters = {'datetime64': lambda x: x.strftime('%Y-%m'),
+ 'float': lambda x: '[{x: 4.1f}]'.format(x=x),
+ 'int': lambda x: '0x{x:x}'.format(x=x),
+ 'object': lambda x: '-{x!s}-'.format(x=x),
+ '__index__': lambda x: 'index: {x}'.format(x=x)}
+ result = df.to_latex(formatters=dict(formatters))
+
+ expected = r"""\begin{tabular}{llrrl}
+\toprule
+{} & datetime64 & float & int & object \\
+\midrule
+index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\
+index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\
+index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\
+\bottomrule
+\end{tabular}
+"""
+ assert result == expected
+
+ def test_to_latex_multiindex(self):
+ df = DataFrame({('x', 'y'): ['a']})
+ result = df.to_latex()
+ expected = r"""\begin{tabular}{ll}
+\toprule
+{} & x \\
+{} & y \\
+\midrule
+0 & a \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert result == expected
+
+ result = df.T.to_latex()
+ expected = r"""\begin{tabular}{lll}
+\toprule
+ & & 0 \\
+\midrule
+x & y & a \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert result == expected
+
+ df = DataFrame.from_dict({
+ ('c1', 0): pd.Series({x: x for x in range(4)}),
+ ('c1', 1): pd.Series({x: x + 4 for x in range(4)}),
+ ('c2', 0): pd.Series({x: x for x in range(4)}),
+ ('c2', 1): pd.Series({x: x + 4 for x in range(4)}),
+ ('c3', 0): pd.Series({x: x for x in range(4)}),
+ }).T
+ result = df.to_latex()
+ expected = r"""\begin{tabular}{llrrrr}
+\toprule
+ & & 0 & 1 & 2 & 3 \\
+\midrule
+c1 & 0 & 0 & 1 & 2 & 3 \\
+ & 1 & 4 & 5 & 6 & 7 \\
+c2 & 0 & 0 & 1 & 2 & 3 \\
+ & 1 & 4 & 5 & 6 & 7 \\
+c3 & 0 & 0 & 1 & 2 & 3 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert result == expected
+
+ # GH 14184
+ df = df.T
+ df.columns.names = ['a', 'b']
+ result = df.to_latex()
+ expected = r"""\begin{tabular}{lrrrrr}
+\toprule
+a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
+b & 0 & 1 & 0 & 1 & 0 \\
+\midrule
+0 & 0 & 4 & 0 & 4 & 0 \\
+1 & 1 & 5 & 1 & 5 & 1 \\
+2 & 2 & 6 & 2 & 6 & 2 \\
+3 & 3 & 7 & 3 & 7 & 3 \\
+\bottomrule
+\end{tabular}
+"""
+ assert result == expected
+
+ # GH 10660
+ df = pd.DataFrame({'a': [0, 0, 1, 1],
+ 'b': list('abab'),
+ 'c': [1, 2, 3, 4]})
+ result = df.set_index(['a', 'b']).to_latex()
+ expected = r"""\begin{tabular}{llr}
+\toprule
+ & & c \\
+a & b & \\
+\midrule
+0 & a & 1 \\
+ & b & 2 \\
+1 & a & 3 \\
+ & b & 4 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert result == expected
+
+ result = df.groupby('a').describe().to_latex()
+ expected = r"""\begin{tabular}{lrrrrrrrr}
+\toprule
+{} & \multicolumn{8}{l}{c} \\
+{} & count & mean & std & min & 25\% & 50\% & 75\% & max \\
+a & & & & & & & & \\
+\midrule
+0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\
+1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert result == expected
+
+ def test_to_latex_multiindex_dupe_level(self):
+ # see gh-14484
+ #
+ # If an index is repeated in subsequent rows, it should be
+ # replaced with a blank in the created table. This should
+ # ONLY happen if all higher order indices (to the left) are
+ # equal too. In this test, 'c' has to be printed both times
+ # because the higher order index 'A' != 'B'.
+ df = pd.DataFrame(index=pd.MultiIndex.from_tuples(
+ [('A', 'c'), ('B', 'c')]), columns=['col'])
+ result = df.to_latex()
+ expected = r"""\begin{tabular}{lll}
+\toprule
+ & & col \\
+\midrule
+A & c & NaN \\
+B & c & NaN \\
+\bottomrule
+\end{tabular}
+"""
+ assert result == expected
+
+ def test_to_latex_multicolumnrow(self):
+ df = pd.DataFrame({
+ ('c1', 0): {x: x for x in range(5)},
+ ('c1', 1): {x: x + 5 for x in range(5)},
+ ('c2', 0): {x: x for x in range(5)},
+ ('c2', 1): {x: x + 5 for x in range(5)},
+ ('c3', 0): {x: x for x in range(5)}
+ })
+ result = df.to_latex()
+ expected = r"""\begin{tabular}{lrrrrr}
+\toprule
+{} & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
+{} & 0 & 1 & 0 & 1 & 0 \\
+\midrule
+0 & 0 & 5 & 0 & 5 & 0 \\
+1 & 1 & 6 & 1 & 6 & 1 \\
+2 & 2 & 7 & 2 & 7 & 2 \\
+3 & 3 & 8 & 3 & 8 & 3 \\
+4 & 4 & 9 & 4 & 9 & 4 \\
+\bottomrule
+\end{tabular}
+"""
+ assert result == expected
+
+ result = df.to_latex(multicolumn=False)
+ expected = r"""\begin{tabular}{lrrrrr}
+\toprule
+{} & c1 & & c2 & & c3 \\
+{} & 0 & 1 & 0 & 1 & 0 \\
+\midrule
+0 & 0 & 5 & 0 & 5 & 0 \\
+1 & 1 & 6 & 1 & 6 & 1 \\
+2 & 2 & 7 & 2 & 7 & 2 \\
+3 & 3 & 8 & 3 & 8 & 3 \\
+4 & 4 & 9 & 4 & 9 & 4 \\
+\bottomrule
+\end{tabular}
+"""
+ assert result == expected
+
+ result = df.T.to_latex(multirow=True)
+ expected = r"""\begin{tabular}{llrrrrr}
+\toprule
+ & & 0 & 1 & 2 & 3 & 4 \\
+\midrule
+\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\
+ & 1 & 5 & 6 & 7 & 8 & 9 \\
+\cline{1-7}
+\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\
+ & 1 & 5 & 6 & 7 & 8 & 9 \\
+\cline{1-7}
+c3 & 0 & 0 & 1 & 2 & 3 & 4 \\
+\bottomrule
+\end{tabular}
+"""
+ assert result == expected
+
+ df.index = df.T.index
+ result = df.T.to_latex(multirow=True, multicolumn=True,
+ multicolumn_format='c')
+ expected = r"""\begin{tabular}{llrrrrr}
+\toprule
+ & & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\
+ & & 0 & 1 & 0 & 1 & 0 \\
+\midrule
+\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\
+ & 1 & 5 & 6 & 7 & 8 & 9 \\
+\cline{1-7}
+\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\
+ & 1 & 5 & 6 & 7 & 8 & 9 \\
+\cline{1-7}
+c3 & 0 & 0 & 1 & 2 & 3 & 4 \\
+\bottomrule
+\end{tabular}
+"""
+ assert result == expected
+
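+    # A minimal sketch of the grouping flags exercised above: multicolumn
+    # (on by default) merges repeated top-level column labels into
+    # \multicolumn cells, and multirow=True does the same for repeated
+    # index labels. The frame below is illustrative.
+    def _multi_flags_sketch(self):
+        df = pd.DataFrame({('g', 'a'): [1], ('g', 'b'): [2]})
+        return df.to_latex(multirow=True, multicolumn_format='c')
+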
+ def test_to_latex_escape(self):
+ a = 'a'
+ b = 'b'
+
+ test_dict = {u('co$e^x$'): {a: "a",
+ b: "b"},
+ u('co^l1'): {a: "a",
+ b: "b"}}
+
+ unescaped_result = DataFrame(test_dict).to_latex(escape=False)
+        # default: escape=True
+        escaped_result = DataFrame(test_dict).to_latex()
+
+ unescaped_expected = r'''\begin{tabular}{lll}
+\toprule
+{} & co$e^x$ & co^l1 \\
+\midrule
+a & a & a \\
+b & b & b \\
+\bottomrule
+\end{tabular}
+'''
+
+ escaped_expected = r'''\begin{tabular}{lll}
+\toprule
+{} & co\$e\textasciicircum x\$ & co\textasciicircum l1 \\
+\midrule
+a & a & a \\
+b & b & b \\
+\bottomrule
+\end{tabular}
+'''
+
+ assert unescaped_result == unescaped_expected
+ assert escaped_result == escaped_expected
+
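+    # A minimal sketch of the escape toggle verified above: by default
+    # LaTeX special characters in cell values are escaped, while
+    # escape=False passes them through verbatim. The data is illustrative.
+    def _escape_sketch(self):
+        df = DataFrame({'col': ['50%']})
+        return df.to_latex(), df.to_latex(escape=False)
+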
+ def test_to_latex_special_escape(self):
+ df = DataFrame([r"a\b\c", r"^a^b^c", r"~a~b~c"])
+
+ escaped_result = df.to_latex()
+ escaped_expected = r"""\begin{tabular}{ll}
+\toprule
+{} & 0 \\
+\midrule
+0 & a\textbackslash b\textbackslash c \\
+1 & \textasciicircum a\textasciicircum b\textasciicircum c \\
+2 & \textasciitilde a\textasciitilde b\textasciitilde c \\
+\bottomrule
+\end{tabular}
+"""
+ assert escaped_result == escaped_expected
+
+ def test_to_latex_longtable(self, frame):
+ frame.to_latex(longtable=True)
+
+ df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+ withindex_result = df.to_latex(longtable=True)
+ withindex_expected = r"""\begin{longtable}{lrl}
+\toprule
+{} & a & b \\
+\midrule
+\endhead
+\midrule
+\multicolumn{3}{r}{{Continued on next page}} \\
+\midrule
+\endfoot
+
+\bottomrule
+\endlastfoot
+0 & 1 & b1 \\
+1 & 2 & b2 \\
+\end{longtable}
+"""
+ assert withindex_result == withindex_expected
+
+ withoutindex_result = df.to_latex(index=False, longtable=True)
+ withoutindex_expected = r"""\begin{longtable}{rl}
+\toprule
+ a & b \\
+\midrule
+\endhead
+\midrule
+\multicolumn{2}{r}{{Continued on next page}} \\
+\midrule
+\endfoot
+
+\bottomrule
+\endlastfoot
+ 1 & b1 \\
+ 2 & b2 \\
+\end{longtable}
+"""
+
+ assert withoutindex_result == withoutindex_expected
+
+ df = DataFrame({'a': [1, 2]})
+ with1column_result = df.to_latex(index=False, longtable=True)
+ assert r"\multicolumn{1}" in with1column_result
+
+ df = DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
+ with3columns_result = df.to_latex(index=False, longtable=True)
+ assert r"\multicolumn{3}" in with3columns_result
+
+ def test_to_latex_escape_special_chars(self):
+ special_characters = ['&', '%', '$', '#', '_', '{', '}', '~', '^',
+ '\\']
+ df = DataFrame(data=special_characters)
+ observed = df.to_latex()
+ expected = r"""\begin{tabular}{ll}
+\toprule
+{} & 0 \\
+\midrule
+0 & \& \\
+1 & \% \\
+2 & \$ \\
+3 & \# \\
+4 & \_ \\
+5 & \{ \\
+6 & \} \\
+7 & \textasciitilde \\
+8 & \textasciicircum \\
+9 & \textbackslash \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert observed == expected
+
+ def test_to_latex_no_header(self):
+ # GH 7124
+ df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+ withindex_result = df.to_latex(header=False)
+ withindex_expected = r"""\begin{tabular}{lrl}
+\toprule
+0 & 1 & b1 \\
+1 & 2 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert withindex_result == withindex_expected
+
+ withoutindex_result = df.to_latex(index=False, header=False)
+ withoutindex_expected = r"""\begin{tabular}{rl}
+\toprule
+ 1 & b1 \\
+ 2 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert withoutindex_result == withoutindex_expected
+
+ def test_to_latex_specified_header(self):
+ # GH 7124
+ df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+ withindex_result = df.to_latex(header=['AA', 'BB'])
+ withindex_expected = r"""\begin{tabular}{lrl}
+\toprule
+{} & AA & BB \\
+\midrule
+0 & 1 & b1 \\
+1 & 2 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert withindex_result == withindex_expected
+
+ withoutindex_result = df.to_latex(header=['AA', 'BB'], index=False)
+ withoutindex_expected = r"""\begin{tabular}{rl}
+\toprule
+AA & BB \\
+\midrule
+ 1 & b1 \\
+ 2 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert withoutindex_result == withoutindex_expected
+
+ withoutescape_result = df.to_latex(header=['$A$', '$B$'], escape=False)
+ withoutescape_expected = r"""\begin{tabular}{lrl}
+\toprule
+{} & $A$ & $B$ \\
+\midrule
+0 & 1 & b1 \\
+1 & 2 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert withoutescape_result == withoutescape_expected
+
+ with pytest.raises(ValueError):
+ df.to_latex(header=['A'])
+
+ def test_to_latex_decimal(self, frame):
+ # GH 12031
+ frame.to_latex()
+
+ df = DataFrame({'a': [1.0, 2.1], 'b': ['b1', 'b2']})
+ withindex_result = df.to_latex(decimal=',')
+
+ withindex_expected = r"""\begin{tabular}{lrl}
+\toprule
+{} & a & b \\
+\midrule
+0 & 1,0 & b1 \\
+1 & 2,1 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert withindex_result == withindex_expected
+
+ def test_to_latex_series(self):
+ s = Series(['a', 'b', 'c'])
+ withindex_result = s.to_latex()
+ withindex_expected = r"""\begin{tabular}{ll}
+\toprule
+{} & 0 \\
+\midrule
+0 & a \\
+1 & b \\
+2 & c \\
+\bottomrule
+\end{tabular}
+"""
+ assert withindex_result == withindex_expected
+
+ def test_to_latex_bold_rows(self):
+ # GH 16707
+ df = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+ observed = df.to_latex(bold_rows=True)
+ expected = r"""\begin{tabular}{lrl}
+\toprule
+{} & a & b \\
+\midrule
+\textbf{0} & 1 & b1 \\
+\textbf{1} & 2 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+ assert observed == expected
+
+ def test_to_latex_no_bold_rows(self):
+ # GH 16707
+ df = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+ observed = df.to_latex(bold_rows=False)
+ expected = r"""\begin{tabular}{lrl}
+\toprule
+{} & a & b \\
+\midrule
+0 & 1 & b1 \\
+1 & 2 & b2 \\
+\bottomrule
+\end{tabular}
+"""
+ assert observed == expected
+
+ @pytest.mark.parametrize('name0', [None, 'named0'])
+ @pytest.mark.parametrize('name1', [None, 'named1'])
+ @pytest.mark.parametrize('axes', [[0], [1], [0, 1]])
+ def test_to_latex_multiindex_names(self, name0, name1, axes):
+ # GH 18667
+ names = [name0, name1]
+ mi = pd.MultiIndex.from_product([[1, 2], [3, 4]])
+ df = pd.DataFrame(-1, index=mi.copy(), columns=mi.copy())
+ for idx in axes:
+ df.axes[idx].names = names
+
+ idx_names = tuple(n or '{}' for n in names)
+ idx_names_row = ('%s & %s & & & & \\\\\n' % idx_names
+ if (0 in axes and any(names)) else '')
+ placeholder = '{}' if any(names) and 1 in axes else ' '
+ col_names = [n if (bool(n) and 1 in axes) else placeholder
+ for n in names]
+ observed = df.to_latex()
+ expected = r"""\begin{tabular}{llrrrr}
+\toprule
+ & %s & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} \\
+ & %s & 3 & 4 & 3 & 4 \\
+%s\midrule
+1 & 3 & -1 & -1 & -1 & -1 \\
+ & 4 & -1 & -1 & -1 & -1 \\
+2 & 3 & -1 & -1 & -1 & -1 \\
+ & 4 & -1 & -1 & -1 & -1 \\
+\bottomrule
+\end{tabular}
+""" % tuple(list(col_names) + [idx_names_row])
+ assert observed == expected
+
+ @pytest.mark.parametrize('one_row', [True, False])
+ def test_to_latex_multiindex_nans(self, one_row):
+ # GH 14249
+ df = pd.DataFrame({'a': [None, 1], 'b': [2, 3], 'c': [4, 5]})
+ if one_row:
+ df = df.iloc[[0]]
+ observed = df.set_index(['a', 'b']).to_latex()
+ expected = r"""\begin{tabular}{llr}
+\toprule
+ & & c \\
+a & b & \\
+\midrule
+NaN & 2 & 4 \\
+"""
+ if not one_row:
+ expected += r"""1.0 & 3 & 5 \\
+"""
+ expected += r"""\bottomrule
+\end{tabular}
+"""
+ assert observed == expected
+
+ def test_to_latex_non_string_index(self):
+ # GH 19981
+ observed = pd.DataFrame([[1, 2, 3]] * 2).set_index([0, 1]).to_latex()
+ expected = r"""\begin{tabular}{llr}
+\toprule
+ & & 2 \\
+0 & 1 & \\
+\midrule
+1 & 2 & 3 \\
+ & 2 & 3 \\
+\bottomrule
+\end{tabular}
+"""
+ assert observed == expected
+
+ def test_to_latex_midrule_location(self):
+ # GH 18326
+ df = pd.DataFrame({'a': [1, 2]})
+ df.index.name = 'foo'
+ observed = df.to_latex(index_names=False)
+ expected = r"""\begin{tabular}{lr}
+\toprule
+{} & a \\
+\midrule
+0 & 1 \\
+1 & 2 \\
+\bottomrule
+\end{tabular}
+"""
+
+ assert observed == expected
+
+ def test_to_latex_multiindex_empty_name(self):
+ # GH 18669
+ mi = pd.MultiIndex.from_product([[1, 2]], names=[''])
+ df = pd.DataFrame(-1, index=mi, columns=range(4))
+ observed = df.to_latex()
+ expected = r"""\begin{tabular}{lrrrr}
+\toprule
+ & 0 & 1 & 2 & 3 \\
+{} & & & & \\
+\midrule
+1 & -1 & -1 & -1 & -1 \\
+2 & -1 & -1 & -1 & -1 \\
+\bottomrule
+\end{tabular}
+"""
+ assert observed == expected
+
+ def test_to_latex_float_format_no_fixed_width(self):
+
+ # GH 21625
+ df = DataFrame({'x': [0.19999]})
+ expected = r"""\begin{tabular}{lr}
+\toprule
+{} & x \\
+\midrule
+0 & 0.200 \\
+\bottomrule
+\end{tabular}
+"""
+ assert df.to_latex(float_format='%.3f') == expected
+
+ # GH 22270
+ df = DataFrame({'x': [100.0]})
+ expected = r"""\begin{tabular}{lr}
+\toprule
+{} & x \\
+\midrule
+0 & 100 \\
+\bottomrule
+\end{tabular}
+"""
+ assert df.to_latex(float_format='%.0f') == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/io/generate_legacy_storage_files.py b/contrib/python/pandas/py2/pandas/tests/io/generate_legacy_storage_files.py
new file mode 100755
index 00000000000..6774eac6d6c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/generate_legacy_storage_files.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env python
+
+"""
+Self-contained script to write legacy storage (pickle/msgpack) files.
+
+To use this script, create an environment in which you want to
+generate pickles, say it's for 0.18.1, with your pandas clone
+in ~/pandas
+
+. activate pandas_0.18.1
+cd ~/
+
+$ python pandas/pandas/tests/io/generate_legacy_storage_files.py \
+ pandas/pandas/tests/io/data/legacy_pickle/0.18.1/ pickle
+
+This script generates a storage file for the current arch, system,
+and python version
+ pandas version: 0.18.1
+ output dir : pandas/pandas/tests/io/data/legacy_pickle/0.18.1/
+ storage format: pickle
+created pickle file: 0.18.1_x86_64_darwin_3.5.2.pickle
+
+The idea here is that you are using the *current* version of
+generate_legacy_storage_files with an *older* version of pandas to
+generate a pickle file. We then check this file into a current
+branch and test it using test_pickle.py, which loads the *older*
+pickles and compares them against the data generated with the
+current code (master).
+
+If we have cases where we changed the signature (e.g. we renamed
+offset -> freq in Timestamp), then we have to execute conditionally
+in generate_legacy_storage_files.py so that it runs under both the
+older AND the newer version.
+
+"""
+
+from __future__ import print_function
+
+from datetime import timedelta
+from distutils.version import LooseVersion
+import os
+import platform as pl
+import sys
+from warnings import catch_warnings, filterwarnings
+
+import numpy as np
+
+from pandas.compat import u
+
+import pandas
+from pandas import (
+ Categorical, DataFrame, Index, MultiIndex, NaT, Panel, Period, Series,
+ SparseDataFrame, SparseSeries, Timestamp, bdate_range, date_range,
+ period_range, timedelta_range, to_msgpack)
+
+from pandas.tseries.offsets import (
+ FY5253, BusinessDay, BusinessHour, CustomBusinessDay, DateOffset, Day,
+ Easter, Hour, LastWeekOfMonth, Minute, MonthBegin, MonthEnd, QuarterBegin,
+ QuarterEnd, SemiMonthBegin, SemiMonthEnd, Week, WeekOfMonth, YearBegin,
+ YearEnd)
+
+_loose_version = LooseVersion(pandas.__version__)
+
+
+def _create_sp_series():
+ nan = np.nan
+
+ # nan-based
+ arr = np.arange(15, dtype=np.float64)
+ arr[7:12] = nan
+ arr[-1:] = nan
+
+ bseries = SparseSeries(arr, kind='block')
+ bseries.name = u'bseries'
+ return bseries
+
+
+def _create_sp_tsseries():
+ nan = np.nan
+
+ # nan-based
+ arr = np.arange(15, dtype=np.float64)
+ arr[7:12] = nan
+ arr[-1:] = nan
+
+ date_index = bdate_range('1/1/2011', periods=len(arr))
+ bseries = SparseSeries(arr, index=date_index, kind='block')
+ bseries.name = u'btsseries'
+ return bseries
+
+
+def _create_sp_frame():
+ nan = np.nan
+
+ data = {u'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
+ u'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
+ u'C': np.arange(10).astype(np.int64),
+ u'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}
+
+ dates = bdate_range('1/1/2011', periods=10)
+ return SparseDataFrame(data, index=dates)
+
+
+def create_data():
+ """ create the pickle/msgpack data """
+
+ data = {
+ u'A': [0., 1., 2., 3., np.nan],
+ u'B': [0, 1, 0, 1, 0],
+ u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'],
+ u'D': date_range('1/1/2009', periods=5),
+ u'E': [0., 1, Timestamp('20100101'), u'foo', 2.]
+ }
+
+ scalars = dict(timestamp=Timestamp('20130101'),
+ period=Period('2012', 'M'))
+
+ index = dict(int=Index(np.arange(10)),
+ date=date_range('20130101', periods=10),
+ period=period_range('2013-01-01', freq='M', periods=10),
+ float=Index(np.arange(10, dtype=np.float64)),
+ uint=Index(np.arange(10, dtype=np.uint64)),
+ timedelta=timedelta_range('00:00:00', freq='30T', periods=10))
+
+ if _loose_version >= LooseVersion('0.18'):
+ from pandas import RangeIndex
+ index['range'] = RangeIndex(10)
+
+ if _loose_version >= LooseVersion('0.21'):
+ from pandas import interval_range
+ index['interval'] = interval_range(0, periods=10)
+
+ mi = dict(reg2=MultiIndex.from_tuples(
+ tuple(zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo',
+ u'foo', u'qux', u'qux'],
+ [u'one', u'two', u'one', u'two', u'one',
+ u'two', u'one', u'two']])),
+ names=[u'first', u'second']))
+
+ series = dict(float=Series(data[u'A']),
+ int=Series(data[u'B']),
+ mixed=Series(data[u'E']),
+ ts=Series(np.arange(10).astype(np.int64),
+ index=date_range('20130101', periods=10)),
+ mi=Series(np.arange(5).astype(np.float64),
+ index=MultiIndex.from_tuples(
+ tuple(zip(*[[1, 1, 2, 2, 2],
+ [3, 4, 3, 4, 5]])),
+ names=[u'one', u'two'])),
+ dup=Series(np.arange(5).astype(np.float64),
+ index=[u'A', u'B', u'C', u'D', u'A']),
+ cat=Series(Categorical([u'foo', u'bar', u'baz'])),
+ dt=Series(date_range('20130101', periods=5)),
+ dt_tz=Series(date_range('20130101', periods=5,
+ tz='US/Eastern')),
+ period=Series([Period('2000Q1')] * 5))
+
+ mixed_dup_df = DataFrame(data)
+ mixed_dup_df.columns = list(u"ABCDA")
+ frame = dict(float=DataFrame({u'A': series[u'float'],
+ u'B': series[u'float'] + 1}),
+ int=DataFrame({u'A': series[u'int'],
+ u'B': series[u'int'] + 1}),
+ mixed=DataFrame({k: data[k]
+ for k in [u'A', u'B', u'C', u'D']}),
+ mi=DataFrame({u'A': np.arange(5).astype(np.float64),
+ u'B': np.arange(5).astype(np.int64)},
+ index=MultiIndex.from_tuples(
+ tuple(zip(*[[u'bar', u'bar', u'baz',
+ u'baz', u'baz'],
+ [u'one', u'two', u'one',
+ u'two', u'three']])),
+ names=[u'first', u'second'])),
+ dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
+ columns=[u'A', u'B', u'A']),
+ cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}),
+ cat_and_float=DataFrame({
+ u'A': Categorical([u'foo', u'bar', u'baz']),
+ u'B': np.arange(3).astype(np.int64)}),
+ mixed_dup=mixed_dup_df,
+ dt_mixed_tzs=DataFrame({
+ u'A': Timestamp('20130102', tz='US/Eastern'),
+ u'B': Timestamp('20130603', tz='CET')}, index=range(5)),
+ dt_mixed2_tzs=DataFrame({
+ u'A': Timestamp('20130102', tz='US/Eastern'),
+ u'B': Timestamp('20130603', tz='CET'),
+ u'C': Timestamp('20130603', tz='UTC')}, index=range(5))
+ )
+
+ with catch_warnings(record=True):
+ filterwarnings("ignore", "\\nPanel", FutureWarning)
+ mixed_dup_panel = Panel({u'ItemA': frame[u'float'],
+ u'ItemB': frame[u'int']})
+ mixed_dup_panel.items = [u'ItemA', u'ItemA']
+ panel = dict(float=Panel({u'ItemA': frame[u'float'],
+ u'ItemB': frame[u'float'] + 1}),
+ dup=Panel(
+ np.arange(30).reshape(3, 5, 2).astype(np.float64),
+ items=[u'A', u'B', u'A']),
+ mixed_dup=mixed_dup_panel)
+
+ cat = dict(int8=Categorical(list('abcdefg')),
+ int16=Categorical(np.arange(1000)),
+ int32=Categorical(np.arange(10000)))
+
+ timestamp = dict(normal=Timestamp('2011-01-01'),
+ nat=NaT,
+ tz=Timestamp('2011-01-01', tz='US/Eastern'))
+
+ if _loose_version < LooseVersion('0.19.2'):
+ timestamp['freq'] = Timestamp('2011-01-01', offset='D')
+ timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
+ offset='M')
+ else:
+ timestamp['freq'] = Timestamp('2011-01-01', freq='D')
+ timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
+ freq='M')
+
+ off = {'DateOffset': DateOffset(years=1),
+ 'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
+ 'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
+ 'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
+ 'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
+ 'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
+ 'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
+ 'MonthBegin': MonthBegin(1),
+ 'MonthEnd': MonthEnd(1),
+ 'QuarterBegin': QuarterBegin(1),
+ 'QuarterEnd': QuarterEnd(1),
+ 'Day': Day(1),
+ 'YearBegin': YearBegin(1),
+ 'YearEnd': YearEnd(1),
+ 'Week': Week(1),
+ 'Week_Tues': Week(2, normalize=False, weekday=1),
+ 'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
+ 'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
+ 'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
+ 'Easter': Easter(),
+ 'Hour': Hour(1),
+ 'Minute': Minute(1)}
+
+ return dict(series=series,
+ frame=frame,
+ panel=panel,
+ index=index,
+ scalars=scalars,
+ mi=mi,
+ sp_series=dict(float=_create_sp_series(),
+ ts=_create_sp_tsseries()),
+ sp_frame=dict(float=_create_sp_frame()),
+ cat=cat,
+ timestamp=timestamp,
+ offsets=off)
+
+
+def create_pickle_data():
+ data = create_data()
+
+ # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and
+ # panels if their columns/items were non-unique.
+ if _loose_version < LooseVersion('0.14.1'):
+ del data['frame']['mixed_dup']
+ del data['panel']['mixed_dup']
+ if _loose_version < LooseVersion('0.17.0'):
+ del data['series']['period']
+ del data['scalars']['period']
+ return data
+
+
+def _u(x):
+ return {u(k): _u(x[k]) for k in x} if isinstance(x, dict) else x
+
+
+def create_msgpack_data():
+ data = create_data()
+ if _loose_version < LooseVersion('0.17.0'):
+ del data['frame']['mixed_dup']
+ del data['panel']['mixed_dup']
+ del data['frame']['dup']
+ del data['panel']['dup']
+ if _loose_version < LooseVersion('0.18.0'):
+ del data['series']['dt_tz']
+ del data['frame']['dt_mixed_tzs']
+ # Not supported
+ del data['sp_series']
+ del data['sp_frame']
+ del data['series']['cat']
+ del data['series']['period']
+ del data['frame']['cat_onecol']
+ del data['frame']['cat_and_float']
+ del data['scalars']['period']
+ if _loose_version < LooseVersion('0.23.0'):
+ del data['index']['interval']
+ del data['offsets']
+ return _u(data)
+
+
+def platform_name():
+ return '_'.join([str(pandas.__version__), str(pl.machine()),
+ str(pl.system().lower()), str(pl.python_version())])
+
+
+def write_legacy_pickles(output_dir):
+
+ # make sure we are < 0.13 compat (in py3)
+ try:
+ from pandas.compat import zip, cPickle as pickle # noqa
+ except ImportError:
+ import pickle
+
+ version = pandas.__version__
+
+ print("This script generates a storage file for the current arch, system, "
+ "and python version")
+ print(" pandas version: {0}".format(version))
+ print(" output dir : {0}".format(output_dir))
+ print(" storage format: pickle")
+
+ pth = '{0}.pickle'.format(platform_name())
+
+    with open(os.path.join(output_dir, pth), 'wb') as fh:
+        pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL)
+
+ print("created pickle file: %s" % pth)
+
+
+def write_legacy_msgpack(output_dir, compress):
+
+ version = pandas.__version__
+
+ print("This script generates a storage file for the current arch, "
+ "system, and python version")
+ print(" pandas version: {0}".format(version))
+ print(" output dir : {0}".format(output_dir))
+ print(" storage format: msgpack")
+ pth = '{0}.msgpack'.format(platform_name())
+ to_msgpack(os.path.join(output_dir, pth), create_msgpack_data(),
+ compress=compress)
+
+ print("created msgpack file: %s" % pth)
+
+
+def write_legacy_file():
+ # force our cwd to be the first searched
+ sys.path.insert(0, '.')
+
+ if not (3 <= len(sys.argv) <= 4):
+ exit("Specify output directory and storage type: generate_legacy_"
+ "storage_files.py <output_dir> <storage_type> "
+ "<msgpack_compress_type>")
+
+ output_dir = str(sys.argv[1])
+ storage_type = str(sys.argv[2])
+ try:
+ compress_type = str(sys.argv[3])
+ except IndexError:
+ compress_type = None
+
+ if storage_type == 'pickle':
+ write_legacy_pickles(output_dir=output_dir)
+ elif storage_type == 'msgpack':
+ write_legacy_msgpack(output_dir=output_dir, compress=compress_type)
+ else:
+ exit("storage_type must be one of {'pickle', 'msgpack'}")
+
+
+if __name__ == '__main__':
+ write_legacy_file()
diff --git a/contrib/python/pandas/py2/pandas/tests/io/json/__init__.py b/contrib/python/pandas/py2/pandas/tests/io/json/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/json/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/io/json/test_compression.py b/contrib/python/pandas/py2/pandas/tests/io/json/test_compression.py
new file mode 100644
index 00000000000..430acbdac80
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/json/test_compression.py
@@ -0,0 +1,120 @@
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal
+
+
+def test_compression_roundtrip(compression):
+ df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
+ [12.32112, 123123.2, 321321.2]],
+ index=['A', 'B'], columns=['X', 'Y', 'Z'])
+
+ with tm.ensure_clean() as path:
+ df.to_json(path, compression=compression)
+ assert_frame_equal(df, pd.read_json(path,
+ compression=compression))
+
+ # explicitly ensure file was compressed.
+ with tm.decompress_file(path, compression) as fh:
+ result = fh.read().decode('utf8')
+ assert_frame_equal(df, pd.read_json(result))
+
+
+def test_read_zipped_json(datapath):
+ uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
+ uncompressed_df = pd.read_json(uncompressed_path)
+
+ compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
+ compressed_df = pd.read_json(compressed_path, compression='zip')
+
+ assert_frame_equal(uncompressed_df, compressed_df)
+
+
[email protected]_if_not_us_locale
+def test_with_s3_url(compression, s3_resource):
+ # Bucket "pandas-test" created in tests/io/conftest.py
+
+ df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+
+ with tm.ensure_clean() as path:
+ df.to_json(path, compression=compression)
+ with open(path, 'rb') as f:
+ s3_resource.Bucket("pandas-test").put_object(Key='test-1', Body=f)
+
+ roundtripped_df = pd.read_json('s3://pandas-test/test-1',
+ compression=compression)
+ assert_frame_equal(df, roundtripped_df)
+
+
+def test_lines_with_compression(compression):
+
+ with tm.ensure_clean() as path:
+ df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+ df.to_json(path, orient='records', lines=True,
+ compression=compression)
+ roundtripped_df = pd.read_json(path, lines=True,
+ compression=compression)
+ assert_frame_equal(df, roundtripped_df)
+
+
+def test_chunksize_with_compression(compression):
+
+ with tm.ensure_clean() as path:
+ df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
+ df.to_json(path, orient='records', lines=True,
+ compression=compression)
+
+ res = pd.read_json(path, lines=True, chunksize=1,
+ compression=compression)
+ roundtripped_df = pd.concat(res)
+ assert_frame_equal(df, roundtripped_df)
+
+
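+# A minimal sketch of the chunked-read pattern used above: with lines=True
+# and a chunksize, read_json returns an iterator of DataFrames that can be
+# reassembled with pd.concat. The path argument and chunk size here are
+# illustrative.
+def _chunked_read_sketch(path):
+    reader = pd.read_json(path, lines=True, chunksize=2)
+    return pd.concat(reader)
+
+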
+def test_write_unsupported_compression_type():
+ df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+ with tm.ensure_clean() as path:
+ msg = "Unrecognized compression type: unsupported"
+ with pytest.raises(ValueError, match=msg):
+ df.to_json(path, compression="unsupported")
+
+
+def test_read_unsupported_compression_type():
+ with tm.ensure_clean() as path:
+ msg = "Unrecognized compression type: unsupported"
+ with pytest.raises(ValueError, match=msg):
+ pd.read_json(path, compression="unsupported")
+
+
[email protected]("to_infer", [True, False])
[email protected]("read_infer", [True, False])
+def test_to_json_compression(compression_only,
+ read_infer, to_infer):
+ # see gh-15008
+ compression = compression_only
+
+ if compression == "zip":
+ pytest.skip("{compression} is not supported "
+ "for to_csv".format(compression=compression))
+
+    # We'll complete the file extension below.
+ filename = "test."
+
+ if compression == "gzip":
+ filename += "gz"
+ else:
+ # xz --> .xz
+ # bz2 --> .bz2
+ filename += compression
+
+ df = pd.DataFrame({"A": [1]})
+
+ to_compression = "infer" if to_infer else compression
+ read_compression = "infer" if read_infer else compression
+
+ with tm.ensure_clean(filename) as path:
+ df.to_json(path, compression=to_compression)
+ result = pd.read_json(path, compression=read_compression)
+ tm.assert_frame_equal(result, df)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/json/test_json_table_schema.py b/contrib/python/pandas/py2/pandas/tests/io/json/test_json_table_schema.py
new file mode 100644
index 00000000000..6fa3b5b3b2e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/json/test_json_table_schema.py
@@ -0,0 +1,580 @@
+"""Tests for Table Schema integration."""
+from collections import OrderedDict
+import json
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import (
+ CategoricalDtype, DatetimeTZDtype, PeriodDtype)
+
+import pandas as pd
+from pandas import DataFrame
+import pandas.util.testing as tm
+
+from pandas.io.json.table_schema import (
+ as_json_table_type, build_table_schema, convert_json_field_to_pandas_type,
+ convert_pandas_type_to_json_field, set_default_names)
+
+
+class TestBuildSchema(object):
+
+ def setup_method(self, method):
+ self.df = DataFrame(
+ {'A': [1, 2, 3, 4],
+ 'B': ['a', 'b', 'c', 'c'],
+ 'C': pd.date_range('2016-01-01', freq='d', periods=4),
+ 'D': pd.timedelta_range('1H', periods=4, freq='T'),
+ },
+ index=pd.Index(range(4), name='idx'))
+
+ def test_build_table_schema(self):
+ result = build_table_schema(self.df, version=False)
+ expected = {
+ 'fields': [{'name': 'idx', 'type': 'integer'},
+ {'name': 'A', 'type': 'integer'},
+ {'name': 'B', 'type': 'string'},
+ {'name': 'C', 'type': 'datetime'},
+ {'name': 'D', 'type': 'duration'},
+ ],
+ 'primaryKey': ['idx']
+ }
+ assert result == expected
+ result = build_table_schema(self.df)
+ assert "pandas_version" in result
+
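+    # A minimal sketch of the schema shape asserted above: the result is a
+    # dict with 'fields' (one entry per index level and column) and
+    # 'primaryKey' naming the index columns. The frame is illustrative.
+    def _schema_sketch(self):
+        schema = build_table_schema(DataFrame({'x': [1]}), version=False)
+        return [f['name'] for f in schema['fields']], schema['primaryKey']
+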
+ def test_series(self):
+ s = pd.Series([1, 2, 3], name='foo')
+ result = build_table_schema(s, version=False)
+ expected = {'fields': [{'name': 'index', 'type': 'integer'},
+ {'name': 'foo', 'type': 'integer'}],
+ 'primaryKey': ['index']}
+ assert result == expected
+ result = build_table_schema(s)
+ assert 'pandas_version' in result
+
+ def test_series_unnamed(self):
+ result = build_table_schema(pd.Series([1, 2, 3]), version=False)
+ expected = {'fields': [{'name': 'index', 'type': 'integer'},
+ {'name': 'values', 'type': 'integer'}],
+ 'primaryKey': ['index']}
+ assert result == expected
+
+ def test_multiindex(self):
+ df = self.df.copy()
+ idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])
+ df.index = idx
+
+ result = build_table_schema(df, version=False)
+ expected = {
+ 'fields': [{'name': 'level_0', 'type': 'string'},
+ {'name': 'level_1', 'type': 'integer'},
+ {'name': 'A', 'type': 'integer'},
+ {'name': 'B', 'type': 'string'},
+ {'name': 'C', 'type': 'datetime'},
+ {'name': 'D', 'type': 'duration'},
+ ],
+ 'primaryKey': ['level_0', 'level_1']
+ }
+ assert result == expected
+
+ df.index.names = ['idx0', None]
+ expected['fields'][0]['name'] = 'idx0'
+ expected['primaryKey'] = ['idx0', 'level_1']
+ result = build_table_schema(df, version=False)
+ assert result == expected
+
+
+class TestTableSchemaType(object):
+
+ @pytest.mark.parametrize('int_type', [
+ np.int, np.int16, np.int32, np.int64])
+ def test_as_json_table_type_int_data(self, int_type):
+ int_data = [1, 2, 3]
+ assert as_json_table_type(np.array(
+ int_data, dtype=int_type)) == 'integer'
+
+ @pytest.mark.parametrize('float_type', [
+ np.float, np.float16, np.float32, np.float64])
+ def test_as_json_table_type_float_data(self, float_type):
+ float_data = [1., 2., 3.]
+ assert as_json_table_type(np.array(
+ float_data, dtype=float_type)) == 'number'
+
+ @pytest.mark.parametrize('bool_type', [bool, np.bool])
+ def test_as_json_table_type_bool_data(self, bool_type):
+ bool_data = [True, False]
+ assert as_json_table_type(np.array(
+ bool_data, dtype=bool_type)) == 'boolean'
+
+ @pytest.mark.parametrize('date_data', [
+ pd.to_datetime(['2016']),
+ pd.to_datetime(['2016'], utc=True),
+ pd.Series(pd.to_datetime(['2016'])),
+ pd.Series(pd.to_datetime(['2016'], utc=True)),
+ pd.period_range('2016', freq='A', periods=3)
+ ])
+ def test_as_json_table_type_date_data(self, date_data):
+ assert as_json_table_type(date_data) == 'datetime'
+
+ @pytest.mark.parametrize('str_data', [
+ pd.Series(['a', 'b']), pd.Index(['a', 'b'])])
+ def test_as_json_table_type_string_data(self, str_data):
+ assert as_json_table_type(str_data) == 'string'
+
+ @pytest.mark.parametrize('cat_data', [
+ pd.Categorical(['a']),
+ pd.Categorical([1]),
+ pd.Series(pd.Categorical([1])),
+ pd.CategoricalIndex([1]),
+ pd.Categorical([1])])
+ def test_as_json_table_type_categorical_data(self, cat_data):
+ assert as_json_table_type(cat_data) == 'any'
+
+ # ------
+ # dtypes
+ # ------
+ @pytest.mark.parametrize('int_dtype', [
+ np.int, np.int16, np.int32, np.int64])
+ def test_as_json_table_type_int_dtypes(self, int_dtype):
+ assert as_json_table_type(int_dtype) == 'integer'
+
+ @pytest.mark.parametrize('float_dtype', [
+ np.float, np.float16, np.float32, np.float64])
+ def test_as_json_table_type_float_dtypes(self, float_dtype):
+ assert as_json_table_type(float_dtype) == 'number'
+
+ @pytest.mark.parametrize('bool_dtype', [bool, np.bool])
+ def test_as_json_table_type_bool_dtypes(self, bool_dtype):
+ assert as_json_table_type(bool_dtype) == 'boolean'
+
+ @pytest.mark.parametrize('date_dtype', [
+ np.datetime64, np.dtype("<M8[ns]"), PeriodDtype('D'),
+ DatetimeTZDtype('ns', 'US/Central')])
+ def test_as_json_table_type_date_dtypes(self, date_dtype):
+ # TODO: datedate.date? datetime.time?
+ assert as_json_table_type(date_dtype) == 'datetime'
+
+ @pytest.mark.parametrize('td_dtype', [
+ np.timedelta64, np.dtype("<m8[ns]")])
+ def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
+ assert as_json_table_type(td_dtype) == 'duration'
+
+ @pytest.mark.parametrize('str_dtype', [object]) # TODO
+ def test_as_json_table_type_string_dtypes(self, str_dtype):
+ assert as_json_table_type(str_dtype) == 'string'
+
+ def test_as_json_table_type_categorical_dtypes(self):
+ # TODO: I think before is_categorical_dtype(Categorical)
+ # returned True, but now it's False. Figure out why or
+ # if it matters
+ assert as_json_table_type(pd.Categorical(['a'])) == 'any'
+ assert as_json_table_type(CategoricalDtype()) == 'any'
+
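+    # A minimal sketch of the dtype -> Table Schema type mapping asserted
+    # above; the dtype inputs below are illustrative.
+    def _dtype_mapping_sketch(self):
+        return (as_json_table_type(np.dtype('int64')),    # 'integer'
+                as_json_table_type(np.dtype('float64')),  # 'number'
+                as_json_table_type(np.dtype('<M8[ns]')))  # 'datetime'
+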
+
+class TestTableOrient(object):
+
+ def setup_method(self, method):
+ self.df = DataFrame(
+ {'A': [1, 2, 3, 4],
+ 'B': ['a', 'b', 'c', 'c'],
+ 'C': pd.date_range('2016-01-01', freq='d', periods=4),
+ 'D': pd.timedelta_range('1H', periods=4, freq='T'),
+ 'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
+ 'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+ ordered=True)),
+ 'G': [1., 2., 3, 4.],
+ 'H': pd.date_range('2016-01-01', freq='d', periods=4,
+ tz='US/Central'),
+ },
+ index=pd.Index(range(4), name='idx'))
+
+ def test_build_series(self):
+ s = pd.Series([1, 2], name='a')
+ s.index.name = 'id'
+ result = s.to_json(orient='table', date_format='iso')
+ result = json.loads(result, object_pairs_hook=OrderedDict)
+
+ assert "pandas_version" in result['schema']
+ result['schema'].pop('pandas_version')
+
+ fields = [{'name': 'id', 'type': 'integer'},
+ {'name': 'a', 'type': 'integer'}]
+
+ schema = {
+ 'fields': fields,
+ 'primaryKey': ['id'],
+ }
+
+ expected = OrderedDict([
+ ('schema', schema),
+ ('data', [OrderedDict([('id', 0), ('a', 1)]),
+ OrderedDict([('id', 1), ('a', 2)])])])
+ assert result == expected
+
+ def test_to_json(self):
+ df = self.df.copy()
+ df.index.name = 'idx'
+ result = df.to_json(orient='table', date_format='iso')
+ result = json.loads(result, object_pairs_hook=OrderedDict)
+
+ assert "pandas_version" in result['schema']
+ result['schema'].pop('pandas_version')
+
+ fields = [
+ {'name': 'idx', 'type': 'integer'},
+ {'name': 'A', 'type': 'integer'},
+ {'name': 'B', 'type': 'string'},
+ {'name': 'C', 'type': 'datetime'},
+ {'name': 'D', 'type': 'duration'},
+ {'constraints': {'enum': ['a', 'b', 'c']},
+ 'name': 'E',
+ 'ordered': False,
+ 'type': 'any'},
+ {'constraints': {'enum': ['a', 'b', 'c']},
+ 'name': 'F',
+ 'ordered': True,
+ 'type': 'any'},
+ {'name': 'G', 'type': 'number'},
+ {'name': 'H', 'type': 'datetime', 'tz': 'US/Central'}
+ ]
+
+ schema = {
+ 'fields': fields,
+ 'primaryKey': ['idx'],
+ }
+ data = [
+ OrderedDict([('idx', 0), ('A', 1), ('B', 'a'),
+ ('C', '2016-01-01T00:00:00.000Z'),
+ ('D', 'P0DT1H0M0S'),
+ ('E', 'a'), ('F', 'a'), ('G', 1.),
+ ('H', '2016-01-01T06:00:00.000Z')
+ ]),
+ OrderedDict([('idx', 1), ('A', 2), ('B', 'b'),
+ ('C', '2016-01-02T00:00:00.000Z'),
+ ('D', 'P0DT1H1M0S'),
+ ('E', 'b'), ('F', 'b'), ('G', 2.),
+ ('H', '2016-01-02T06:00:00.000Z')
+ ]),
+ OrderedDict([('idx', 2), ('A', 3), ('B', 'c'),
+ ('C', '2016-01-03T00:00:00.000Z'),
+ ('D', 'P0DT1H2M0S'),
+ ('E', 'c'), ('F', 'c'), ('G', 3.),
+ ('H', '2016-01-03T06:00:00.000Z')
+ ]),
+ OrderedDict([('idx', 3), ('A', 4), ('B', 'c'),
+ ('C', '2016-01-04T00:00:00.000Z'),
+ ('D', 'P0DT1H3M0S'),
+ ('E', 'c'), ('F', 'c'), ('G', 4.),
+ ('H', '2016-01-04T06:00:00.000Z')
+ ]),
+ ]
+ expected = OrderedDict([('schema', schema), ('data', data)])
+ assert result == expected
+
+ def test_to_json_float_index(self):
+ data = pd.Series(1, index=[1., 2.])
+ result = data.to_json(orient='table', date_format='iso')
+ result = json.loads(result, object_pairs_hook=OrderedDict)
+ result['schema'].pop('pandas_version')
+
+ expected = (
+ OrderedDict([('schema', {
+ 'fields': [{'name': 'index', 'type': 'number'},
+ {'name': 'values', 'type': 'integer'}],
+ 'primaryKey': ['index']
+ }),
+ ('data', [OrderedDict([('index', 1.0), ('values', 1)]),
+ OrderedDict([('index', 2.0), ('values', 1)])])])
+ )
+ assert result == expected
+
+ def test_to_json_period_index(self):
+ idx = pd.period_range('2016', freq='Q-JAN', periods=2)
+ data = pd.Series(1, idx)
+ result = data.to_json(orient='table', date_format='iso')
+ result = json.loads(result, object_pairs_hook=OrderedDict)
+ result['schema'].pop('pandas_version')
+
+ fields = [{'freq': 'Q-JAN', 'name': 'index', 'type': 'datetime'},
+ {'name': 'values', 'type': 'integer'}]
+
+ schema = {'fields': fields, 'primaryKey': ['index']}
+ data = [OrderedDict([('index', '2015-11-01T00:00:00.000Z'),
+ ('values', 1)]),
+ OrderedDict([('index', '2016-02-01T00:00:00.000Z'),
+ ('values', 1)])]
+ expected = OrderedDict([('schema', schema), ('data', data)])
+ assert result == expected
+
+ def test_to_json_categorical_index(self):
+ data = pd.Series(1, pd.CategoricalIndex(['a', 'b']))
+ result = data.to_json(orient='table', date_format='iso')
+ result = json.loads(result, object_pairs_hook=OrderedDict)
+ result['schema'].pop('pandas_version')
+
+ expected = (
+ OrderedDict([('schema',
+ {'fields': [{'name': 'index', 'type': 'any',
+ 'constraints': {'enum': ['a', 'b']},
+ 'ordered': False},
+ {'name': 'values', 'type': 'integer'}],
+ 'primaryKey': ['index']}),
+ ('data', [
+ OrderedDict([('index', 'a'),
+ ('values', 1)]),
+ OrderedDict([('index', 'b'), ('values', 1)])])])
+ )
+ assert result == expected
+
+ def test_date_format_raises(self):
+ with pytest.raises(ValueError):
+ self.df.to_json(orient='table', date_format='epoch')
+
+ # others work
+ self.df.to_json(orient='table', date_format='iso')
+ self.df.to_json(orient='table')
+
+ @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
+ def test_convert_pandas_type_to_json_field_int(self, kind):
+ data = [1, 2, 3]
+ result = convert_pandas_type_to_json_field(kind(data, name='name'))
+ expected = {"name": "name", "type": "integer"}
+ assert result == expected
+
+ @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
+ def test_convert_pandas_type_to_json_field_float(self, kind):
+ data = [1., 2., 3.]
+ result = convert_pandas_type_to_json_field(kind(data, name='name'))
+ expected = {"name": "name", "type": "number"}
+ assert result == expected
+
+ @pytest.mark.parametrize('dt_args,extra_exp', [
+ ({}, {}), ({'utc': True}, {'tz': 'UTC'})])
+ @pytest.mark.parametrize('wrapper', [None, pd.Series])
+ def test_convert_pandas_type_to_json_field_datetime(self, dt_args,
+ extra_exp, wrapper):
+ data = [1., 2., 3.]
+ data = pd.to_datetime(data, **dt_args)
+ if wrapper is pd.Series:
+ data = pd.Series(data, name='values')
+ result = convert_pandas_type_to_json_field(data)
+ expected = {"name": "values", "type": 'datetime'}
+ expected.update(extra_exp)
+ assert result == expected
+
+ def test_convert_pandas_type_to_json_period_range(self):
+ arr = pd.period_range('2016', freq='A-DEC', periods=4)
+ result = convert_pandas_type_to_json_field(arr)
+ expected = {"name": "values", "type": 'datetime', "freq": "A-DEC"}
+ assert result == expected
+
+ @pytest.mark.parametrize('kind', [pd.Categorical, pd.CategoricalIndex])
+ @pytest.mark.parametrize('ordered', [True, False])
+ def test_convert_pandas_type_to_json_field_categorical(self, kind,
+ ordered):
+ data = ['a', 'b', 'c']
+ if kind is pd.Categorical:
+ arr = pd.Series(kind(data, ordered=ordered), name='cats')
+ elif kind is pd.CategoricalIndex:
+ arr = kind(data, ordered=ordered, name='cats')
+
+ result = convert_pandas_type_to_json_field(arr)
+ expected = {"name": "cats", "type": "any",
+ "constraints": {"enum": data},
+ "ordered": ordered}
+ assert result == expected
+
+ @pytest.mark.parametrize("inp,exp", [
+ ({'type': 'integer'}, 'int64'),
+ ({'type': 'number'}, 'float64'),
+ ({'type': 'boolean'}, 'bool'),
+ ({'type': 'duration'}, 'timedelta64'),
+ ({'type': 'datetime'}, 'datetime64[ns]'),
+ ({'type': 'datetime', 'tz': 'US/Hawaii'}, 'datetime64[ns, US/Hawaii]'),
+ ({'type': 'any'}, 'object'),
+ ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
+ 'ordered': False}, CategoricalDtype(categories=['a', 'b', 'c'],
+ ordered=False)),
+ ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
+ 'ordered': True}, CategoricalDtype(categories=['a', 'b', 'c'],
+ ordered=True)),
+ ({'type': 'string'}, 'object')])
+ def test_convert_json_field_to_pandas_type(self, inp, exp):
+ field = {'name': 'foo'}
+ field.update(inp)
+ assert convert_json_field_to_pandas_type(field) == exp
+
+ @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
+ def test_convert_json_field_to_pandas_type_raises(self, inp):
+ field = {'type': inp}
+ with pytest.raises(ValueError, match=("Unsupported or invalid field "
+ "type: {}".format(inp))):
+ convert_json_field_to_pandas_type(field)
+
+ def test_categorical(self):
+ s = pd.Series(pd.Categorical(['a', 'b', 'a']))
+ s.index.name = 'idx'
+ result = s.to_json(orient='table', date_format='iso')
+ result = json.loads(result, object_pairs_hook=OrderedDict)
+ result['schema'].pop('pandas_version')
+
+ fields = [{'name': 'idx', 'type': 'integer'},
+ {'constraints': {'enum': ['a', 'b']},
+ 'name': 'values',
+ 'ordered': False,
+ 'type': 'any'}]
+
+ expected = OrderedDict([
+ ('schema', {'fields': fields,
+ 'primaryKey': ['idx']}),
+ ('data', [OrderedDict([('idx', 0), ('values', 'a')]),
+ OrderedDict([('idx', 1), ('values', 'b')]),
+ OrderedDict([('idx', 2), ('values', 'a')])])])
+ assert result == expected
+
+ @pytest.mark.parametrize('idx,nm,prop', [
+ (pd.Index([1]), 'index', 'name'),
+ (pd.Index([1], name='myname'), 'myname', 'name'),
+ (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')]),
+ ['level_0', 'level_1'], 'names'),
+ (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
+ names=['n1', 'n2']),
+ ['n1', 'n2'], 'names'),
+ (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
+ names=['n1', None]),
+ ['n1', 'level_1'], 'names')
+ ])
+ def test_set_names_unset(self, idx, nm, prop):
+ data = pd.Series(1, idx)
+ result = set_default_names(data)
+ assert getattr(result.index, prop) == nm
+
+ @pytest.mark.parametrize("idx", [
+ pd.Index([], name='index'),
+ pd.MultiIndex.from_arrays([['foo'], ['bar']],
+ names=('level_0', 'level_1')),
+ pd.MultiIndex.from_arrays([['foo'], ['bar']],
+ names=('foo', 'level_1'))
+ ])
+ def test_warns_non_roundtrippable_names(self, idx):
+ # GH 19130
+ df = pd.DataFrame([[]], index=idx)
+ df.index.name = 'index'
+ with tm.assert_produces_warning():
+ set_default_names(df)
+
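+    # Non-string column labels are serialised to their epoch values:
+    # 2016-01-01 becomes 1451606400000 (ms) and a 10s Timedelta becomes 10000.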
+ def test_timestamp_in_columns(self):
+ df = pd.DataFrame([[1, 2]], columns=[pd.Timestamp('2016'),
+ pd.Timedelta(10, unit='s')])
+ result = df.to_json(orient="table")
+ js = json.loads(result)
+ assert js['schema']['fields'][1]['name'] == 1451606400000
+ assert js['schema']['fields'][2]['name'] == 10000
+
+ @pytest.mark.parametrize('case', [
+ pd.Series([1], index=pd.Index([1], name='a'), name='a'),
+ pd.DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
+ pd.DataFrame({"A": [1]}, index=pd.MultiIndex.from_arrays([
+ ['a'], [1]], names=["A", "a"]))
+ ])
+ def test_overlapping_names(self, case):
+ with pytest.raises(ValueError, match='Overlapping'):
+ case.to_json(orient='table')
+
+ def test_mi_falsey_name(self):
+ # GH 16203
+ df = pd.DataFrame(np.random.randn(4, 4),
+ index=pd.MultiIndex.from_product([('A', 'B'),
+ ('a', 'b')]))
+ result = [x['name'] for x in build_table_schema(df)['fields']]
+ assert result == ['level_0', 'level_1', 0, 1, 2, 3]
+
+
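+# Round-trip tests: frames written with orient='table' and read back
+# via read_json(orient='table').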
+class TestTableOrientReader(object):
+
+ @pytest.mark.parametrize("index_nm", [
+ None,
+ "idx",
+ pytest.param("index",
+ marks=pytest.mark.xfail),
+ 'level_0'])
+ @pytest.mark.parametrize("vals", [
+ {'ints': [1, 2, 3, 4]},
+ {'objects': ['a', 'b', 'c', 'd']},
+ {'date_ranges': pd.date_range('2016-01-01', freq='d', periods=4)},
+ {'categoricals': pd.Series(pd.Categorical(['a', 'b', 'c', 'c']))},
+ {'ordered_cats': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+ ordered=True))},
+ pytest.param({'floats': [1., 2., 3., 4.]},
+ marks=pytest.mark.xfail),
+ {'floats': [1.1, 2.2, 3.3, 4.4]},
+ {'bools': [True, False, False, True]}])
+ def test_read_json_table_orient(self, index_nm, vals, recwarn):
+ df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
+ out = df.to_json(orient="table")
+ result = pd.read_json(out, orient="table")
+ tm.assert_frame_equal(df, result)
+
+ @pytest.mark.parametrize("index_nm", [
+ None, "idx", "index"])
+ @pytest.mark.parametrize("vals", [
+ {'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')},
+ {'timezones': pd.date_range('2016-01-01', freq='d', periods=4,
+ tz='US/Central')}])
+ def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
+ df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
+ out = df.to_json(orient="table")
+ with pytest.raises(NotImplementedError, match='can not yet read '):
+ pd.read_json(out, orient="table")
+
+ def test_comprehensive(self):
+ df = DataFrame(
+ {'A': [1, 2, 3, 4],
+ 'B': ['a', 'b', 'c', 'c'],
+ 'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             # 'D': pd.timedelta_range('1H', periods=4, freq='T'),
+             #      ('D' and 'H' are left out: timedeltas and tz-aware
+             #       datetimes can not yet be read back with orient='table')
+ 'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
+ 'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+ ordered=True)),
+ 'G': [1.1, 2.2, 3.3, 4.4],
+ # 'H': pd.date_range('2016-01-01', freq='d', periods=4,
+ # tz='US/Central'),
+ 'I': [True, False, False, True],
+ },
+ index=pd.Index(range(4), name='idx'))
+
+ out = df.to_json(orient="table")
+ result = pd.read_json(out, orient="table")
+ tm.assert_frame_equal(df, result)
+
+ @pytest.mark.parametrize("index_names", [
+ [None, None], ['foo', 'bar'], ['foo', None], [None, 'foo'],
+ ['index', 'foo']])
+ def test_multiindex(self, index_names):
+ # GH 18912
+ df = pd.DataFrame(
+ [["Arr", "alpha", [1, 2, 3, 4]],
+ ["Bee", "Beta", [10, 20, 30, 40]]],
+ index=[["A", "B"], ["Null", "Eins"]],
+ columns=["Aussprache", "Griechisch", "Args"]
+ )
+ df.index.names = index_names
+ out = df.to_json(orient="table")
+ result = pd.read_json(out, orient="table")
+ tm.assert_frame_equal(df, result)
+
+ @pytest.mark.parametrize("strict_check", [
+ pytest.param(True, marks=pytest.mark.xfail),
+ False
+ ])
+ def test_empty_frame_roundtrip(self, strict_check):
+ # GH 21287
+ df = pd.DataFrame([], columns=['a', 'b', 'c'])
+ expected = df.copy()
+ out = df.to_json(orient='table')
+ result = pd.read_json(out, orient='table')
+        # TODO: when the DataFrame coercion issue (#21345) is resolved,
+        # tighten the type checks
+ tm.assert_frame_equal(expected, result,
+ check_dtype=strict_check,
+ check_index_type=strict_check)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/json/test_normalize.py b/contrib/python/pandas/py2/pandas/tests/io/json/test_normalize.py
new file mode 100644
index 00000000000..3bf699cc8a1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/json/test_normalize.py
@@ -0,0 +1,462 @@
+import json
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Index, compat
+import pandas.util.testing as tm
+
+from pandas.io.json import json_normalize
+from pandas.io.json.normalize import nested_to_record
+
+
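+# The fixtures below provide nested JSON payloads shared across the tests.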
+@pytest.fixture
+def deep_nested():
+ # deeply nested data
+ return [{'country': 'USA',
+ 'states': [{'name': 'California',
+ 'cities': [{'name': 'San Francisco',
+ 'pop': 12345},
+ {'name': 'Los Angeles',
+ 'pop': 12346}]
+ },
+ {'name': 'Ohio',
+ 'cities': [{'name': 'Columbus',
+ 'pop': 1234},
+ {'name': 'Cleveland',
+ 'pop': 1236}]}
+ ]
+ },
+ {'country': 'Germany',
+ 'states': [{'name': 'Bayern',
+ 'cities': [{'name': 'Munich', 'pop': 12347}]
+ },
+ {'name': 'Nordrhein-Westfalen',
+ 'cities': [{'name': 'Duesseldorf', 'pop': 1238},
+ {'name': 'Koeln', 'pop': 1239}]}
+ ]
+ }
+ ]
+
+
+@pytest.fixture
+def state_data():
+ return [
+ {'counties': [{'name': 'Dade', 'population': 12345},
+ {'name': 'Broward', 'population': 40000},
+ {'name': 'Palm Beach', 'population': 60000}],
+ 'info': {'governor': 'Rick Scott'},
+ 'shortname': 'FL',
+ 'state': 'Florida'},
+ {'counties': [{'name': 'Summit', 'population': 1234},
+ {'name': 'Cuyahoga', 'population': 1337}],
+ 'info': {'governor': 'John Kasich'},
+ 'shortname': 'OH',
+ 'state': 'Ohio'}]
+
+
+@pytest.fixture
+def author_missing_data():
+ return [
+ {'info': None},
+ {'info':
+ {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
+ 'author_name':
+ {'first': 'Jane', 'last_name': 'Doe'}
+ }]
+
+
+class TestJSONNormalize(object):
+
+ def test_simple_records(self):
+ recs = [{'a': 1, 'b': 2, 'c': 3},
+ {'a': 4, 'b': 5, 'c': 6},
+ {'a': 7, 'b': 8, 'c': 9},
+ {'a': 10, 'b': 11, 'c': 12}]
+
+ result = json_normalize(recs)
+ expected = DataFrame(recs)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_simple_normalize(self, state_data):
+ result = json_normalize(state_data[0], 'counties')
+ expected = DataFrame(state_data[0]['counties'])
+ tm.assert_frame_equal(result, expected)
+
+ result = json_normalize(state_data, 'counties')
+
+ expected = []
+ for rec in state_data:
+ expected.extend(rec['counties'])
+ expected = DataFrame(expected)
+
+ tm.assert_frame_equal(result, expected)
+
+ result = json_normalize(state_data, 'counties', meta='state')
+ expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_empty_array(self):
+ result = json_normalize([])
+ expected = DataFrame()
+ tm.assert_frame_equal(result, expected)
+
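+    # sep controls the separator inserted between parent and child keys
+    # when nested records are flattened ('.' by default).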
+ def test_simple_normalize_with_separator(self, deep_nested):
+ # GH 14883
+ result = json_normalize({'A': {'A': 1, 'B': 2}})
+ expected = DataFrame([[1, 2]], columns=['A.A', 'A.B'])
+ tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+ result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_')
+ expected = DataFrame([[1, 2]], columns=['A_A', 'A_B'])
+ tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+ result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3')
+ expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B'])
+ tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+ result = json_normalize(deep_nested, ['states', 'cities'],
+ meta=['country', ['states', 'name']],
+ sep='_')
+ expected = Index(['name', 'pop',
+ 'country', 'states_name']).sort_values()
+ assert result.columns.sort_values().equals(expected)
+
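+    # record_prefix is prepended to every column generated from record_path;
+    # scalar list entries get their positional key, hence 'Prefix.0'.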
+ def test_value_array_record_prefix(self):
+ # GH 21536
+ result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
+ expected = DataFrame([[1], [2]], columns=['Prefix.0'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_nested_object_record_path(self):
+ # GH 22706
+ data = {'state': 'Florida',
+ 'info': {
+ 'governor': 'Rick Scott',
+ 'counties': [{'name': 'Dade', 'population': 12345},
+ {'name': 'Broward', 'population': 40000},
+ {'name': 'Palm Beach', 'population': 60000}]}}
+ result = json_normalize(data, record_path=["info", "counties"])
+ expected = DataFrame([['Dade', 12345],
+ ['Broward', 40000],
+ ['Palm Beach', 60000]],
+ columns=['name', 'population'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_more_deeply_nested(self, deep_nested):
+
+ result = json_normalize(deep_nested, ['states', 'cities'],
+ meta=['country', ['states', 'name']])
+ # meta_prefix={'states': 'state_'})
+
+ ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
+ 'states.name': ['California', 'California', 'Ohio', 'Ohio',
+ 'Bayern', 'Nordrhein-Westfalen',
+ 'Nordrhein-Westfalen'],
+ 'name': ['San Francisco', 'Los Angeles', 'Columbus',
+ 'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'],
+ 'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]}
+
+ expected = DataFrame(ex_data, columns=result.columns)
+ tm.assert_frame_equal(result, expected)
+
+ def test_shallow_nested(self):
+ data = [{'state': 'Florida',
+ 'shortname': 'FL',
+ 'info': {
+ 'governor': 'Rick Scott'
+ },
+ 'counties': [{'name': 'Dade', 'population': 12345},
+ {'name': 'Broward', 'population': 40000},
+ {'name': 'Palm Beach', 'population': 60000}]},
+ {'state': 'Ohio',
+ 'shortname': 'OH',
+ 'info': {
+ 'governor': 'John Kasich'
+ },
+ 'counties': [{'name': 'Summit', 'population': 1234},
+ {'name': 'Cuyahoga', 'population': 1337}]}]
+
+ result = json_normalize(data, 'counties',
+ ['state', 'shortname',
+ ['info', 'governor']])
+ ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit',
+ 'Cuyahoga'],
+ 'state': ['Florida'] * 3 + ['Ohio'] * 2,
+ 'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
+ 'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
+ 'population': [12345, 40000, 60000, 1234, 1337]}
+ expected = DataFrame(ex_data, columns=result.columns)
+ tm.assert_frame_equal(result, expected)
+
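+    # When a meta key collides with a record key, json_normalize raises
+    # unless meta_prefix is given to disambiguate the metadata columns.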
+ def test_meta_name_conflict(self):
+ data = [{'foo': 'hello',
+ 'bar': 'there',
+ 'data': [{'foo': 'something', 'bar': 'else'},
+ {'foo': 'something2', 'bar': 'else2'}]}]
+
+ msg = (r"Conflicting metadata name (foo|bar),"
+ " need distinguishing prefix")
+ with pytest.raises(ValueError, match=msg):
+ json_normalize(data, 'data', meta=['foo', 'bar'])
+
+ result = json_normalize(data, 'data', meta=['foo', 'bar'],
+ meta_prefix='meta')
+
+ for val in ['metafoo', 'metabar', 'foo', 'bar']:
+ assert val in result
+
+ def test_meta_parameter_not_modified(self):
+ # GH 18610
+ data = [{'foo': 'hello',
+ 'bar': 'there',
+ 'data': [{'foo': 'something', 'bar': 'else'},
+ {'foo': 'something2', 'bar': 'else2'}]}]
+
+ COLUMNS = ['foo', 'bar']
+ result = json_normalize(data, 'data', meta=COLUMNS,
+ meta_prefix='meta')
+
+ assert COLUMNS == ['foo', 'bar']
+ for val in ['metafoo', 'metabar', 'foo', 'bar']:
+ assert val in result
+
+ def test_record_prefix(self, state_data):
+ result = json_normalize(state_data[0], 'counties')
+ expected = DataFrame(state_data[0]['counties'])
+ tm.assert_frame_equal(result, expected)
+
+ result = json_normalize(state_data, 'counties',
+ meta='state',
+ record_prefix='county_')
+
+ expected = []
+ for rec in state_data:
+ expected.extend(rec['counties'])
+ expected = DataFrame(expected)
+ expected = expected.rename(columns=lambda x: 'county_' + x)
+ expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_non_ascii_key(self):
+ if compat.PY3:
+ testjson = (
+ b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
+ b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
+ ).decode('utf8')
+ else:
+ testjson = ('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
+ '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]')
+
+ testdata = {
+ u'sub.A': [1, 3],
+ u'sub.B': [2, 4],
+ b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1]
+ }
+ expected = DataFrame(testdata)
+
+ result = json_normalize(json.loads(testjson))
+ tm.assert_frame_equal(result, expected)
+
+ def test_missing_field(self, author_missing_data):
+ # GH20030:
+ result = json_normalize(author_missing_data)
+ ex_data = [
+ {'info': np.nan,
+ 'author_name.first': np.nan,
+ 'author_name.last_name': np.nan,
+ 'info.created_at': np.nan,
+ 'info.last_updated': np.nan},
+ {'info': None,
+ 'author_name.first': 'Jane',
+ 'author_name.last_name': 'Doe',
+ 'info.created_at': '11/08/1993',
+ 'info.last_updated': '26/05/2012'}
+ ]
+ expected = DataFrame(ex_data)
+ tm.assert_frame_equal(result, expected)
+
+
+class TestNestedToRecord(object):
+
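+    # nested_to_record flattens each record's nested dicts into a single
+    # level, joining the keys with '.'.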
+ def test_flat_stays_flat(self):
+ recs = [dict(flat1=1, flat2=2),
+ dict(flat1=3, flat2=4),
+ ]
+
+ result = nested_to_record(recs)
+ expected = recs
+ assert result == expected
+
+ def test_one_level_deep_flattens(self):
+ data = dict(flat1=1,
+ dict1=dict(c=1, d=2))
+
+ result = nested_to_record(data)
+ expected = {'dict1.c': 1,
+ 'dict1.d': 2,
+ 'flat1': 1}
+
+ assert result == expected
+
+ def test_nested_flattens(self):
+ data = dict(flat1=1,
+ dict1=dict(c=1, d=2),
+ nested=dict(e=dict(c=1, d=2),
+ d=2))
+
+ result = nested_to_record(data)
+ expected = {'dict1.c': 1,
+ 'dict1.d': 2,
+ 'flat1': 1,
+ 'nested.d': 2,
+ 'nested.e.c': 1,
+ 'nested.e.d': 2}
+
+ assert result == expected
+
+ def test_json_normalize_errors(self):
+ # GH14583: If meta keys are not always present
+ # a new option to set errors='ignore' has been implemented
+ i = {
+ "Trades": [{
+ "general": {
+ "tradeid": 100,
+ "trade_version": 1,
+ "stocks": [{
+
+ "symbol": "AAPL",
+ "name": "Apple",
+ "price": "0"
+ }, {
+ "symbol": "GOOG",
+ "name": "Google",
+ "price": "0"
+ }
+ ]
+ }
+ }, {
+ "general": {
+ "tradeid": 100,
+ "stocks": [{
+ "symbol": "AAPL",
+ "name": "Apple",
+ "price": "0"
+ }, {
+ "symbol": "GOOG",
+ "name": "Google",
+ "price": "0"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ j = json_normalize(data=i['Trades'],
+ record_path=[['general', 'stocks']],
+ meta=[['general', 'tradeid'],
+ ['general', 'trade_version']],
+ errors='ignore')
+ expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
+ 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
+ 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
+ 'price': {0: '0', 1: '0', 2: '0', 3: '0'},
+ 'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
+
+ assert j.fillna('').to_dict() == expected
+
+ msg = ("Try running with errors='ignore' as key 'trade_version'"
+ " is not always present")
+ with pytest.raises(KeyError, match=msg):
+ json_normalize(
+ data=i['Trades'],
+ record_path=[['general', 'stocks']],
+ meta=[['general', 'tradeid'],
+ ['general', 'trade_version']],
+ errors='raise')
+
+ def test_donot_drop_nonevalues(self):
+ # GH21356
+ data = [
+ {'info': None,
+ 'author_name':
+ {'first': 'Smith', 'last_name': 'Appleseed'}
+ },
+ {'info':
+ {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
+ 'author_name':
+ {'first': 'Jane', 'last_name': 'Doe'}
+ }
+ ]
+ result = nested_to_record(data)
+ expected = [
+ {'info': None,
+ 'author_name.first': 'Smith',
+ 'author_name.last_name': 'Appleseed'},
+ {'author_name.first': 'Jane',
+ 'author_name.last_name': 'Doe',
+ 'info.created_at': '11/08/1993',
+ 'info.last_updated': '26/05/2012'}]
+
+ assert result == expected
+
+ def test_nonetype_top_level_bottom_level(self):
+        # GH21158: if an inner level of the json has a key with a null value,
+        # make sure new_d.pop is not run twice (which would raise)
+ data = {
+ "id": None,
+ "location": {
+ "country": {
+ "state": {
+ "id": None,
+ "town.info": {
+ "id": None,
+ "region": None,
+ "x": 49.151580810546875,
+ "y": -33.148521423339844,
+ "z": 27.572303771972656}}}
+ }
+ }
+ result = nested_to_record(data)
+ expected = {
+ 'id': None,
+ 'location.country.state.id': None,
+ 'location.country.state.town.info.id': None,
+ 'location.country.state.town.info.region': None,
+ 'location.country.state.town.info.x': 49.151580810546875,
+ 'location.country.state.town.info.y': -33.148521423339844,
+ 'location.country.state.town.info.z': 27.572303771972656}
+ assert result == expected
+
+ def test_nonetype_multiple_levels(self):
+        # GH21158: if an inner level of the json has a key with a null value,
+        # make sure new_d.pop is not run twice (which would raise)
+ data = {
+ "id": None,
+ "location": {
+ "id": None,
+ "country": {
+ "id": None,
+ "state": {
+ "id": None,
+ "town.info": {
+ "region": None,
+ "x": 49.151580810546875,
+ "y": -33.148521423339844,
+ "z": 27.572303771972656}}}
+ }
+ }
+ result = nested_to_record(data)
+ expected = {
+ 'id': None,
+ 'location.id': None,
+ 'location.country.id': None,
+ 'location.country.state.id': None,
+ 'location.country.state.town.info.region': None,
+ 'location.country.state.town.info.x': 49.151580810546875,
+ 'location.country.state.town.info.y': -33.148521423339844,
+ 'location.country.state.town.info.z': 27.572303771972656}
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/io/json/test_pandas.py b/contrib/python/pandas/py2/pandas/tests/io/json/test_pandas.py
new file mode 100644
index 00000000000..51a1d5488b1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/json/test_pandas.py
@@ -0,0 +1,1274 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=W0612,E1101
+from datetime import timedelta
+import json
+import os
+
+import numpy as np
+import pytest
+
+from pandas.compat import (
+ OrderedDict, StringIO, is_platform_32bit, lrange, range)
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+ DataFrame, DatetimeIndex, Series, Timestamp, compat, read_json)
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_index_equal,
+ assert_series_equal, ensure_clean, network)
+
+_seriesd = tm.getSeriesData()
+_tsd = tm.getTimeSeriesData()
+
+_frame = DataFrame(_seriesd)
+_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])
+_intframe = DataFrame({k: v.astype(np.int64)
+ for k, v in compat.iteritems(_seriesd)})
+
+_tsframe = DataFrame(_tsd)
+_cat_frame = _frame.copy()
+cat = (['bah'] * 5 + ['bar'] * 5 + ['baz'] * 5 +
+       ['foo'] * (len(_cat_frame) - 15))
+_cat_frame.index = pd.CategoricalIndex(cat, name='E')
+_cat_frame['E'] = list(reversed(cat))
+_cat_frame['sort'] = np.arange(len(_cat_frame), dtype='int64')
+
+_mixed_frame = _frame.copy()
+
+
+class TestPandasContainer(object):
+
+ @pytest.fixture(scope="function", autouse=True)
+ def setup(self, datapath):
+ self.dirpath = datapath("io", "json", "data")
+
+ self.ts = tm.makeTimeSeries()
+ self.ts.name = 'ts'
+
+ self.series = tm.makeStringSeries()
+ self.series.name = 'series'
+
+ self.objSeries = tm.makeObjectSeries()
+ self.objSeries.name = 'objects'
+
+ self.empty_series = Series([], index=[])
+ self.empty_frame = DataFrame({})
+
+ self.frame = _frame.copy()
+ self.frame2 = _frame2.copy()
+ self.intframe = _intframe.copy()
+ self.tsframe = _tsframe.copy()
+ self.mixed_frame = _mixed_frame.copy()
+ self.categorical = _cat_frame.copy()
+
+ yield
+
+        del self.dirpath
+        del self.ts
+        del self.series
+        del self.objSeries
+        del self.empty_series
+        del self.empty_frame
+        del self.frame
+        del self.frame2
+        del self.intframe
+        del self.tsframe
+        del self.mixed_frame
+
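+    # orient='records' does not carry the index, so the round trip below
+    # can only compare the columns and the values.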
+ def test_frame_double_encoded_labels(self):
+ df = DataFrame([['a', 'b'], ['c', 'd']],
+ index=['index " 1', 'index / 2'],
+ columns=['a \\ b', 'y / z'])
+
+ assert_frame_equal(df, read_json(df.to_json(orient='split'),
+ orient='split'))
+ assert_frame_equal(df, read_json(df.to_json(orient='columns'),
+ orient='columns'))
+ assert_frame_equal(df, read_json(df.to_json(orient='index'),
+ orient='index'))
+ df_unser = read_json(df.to_json(orient='records'), orient='records')
+ assert_index_equal(df.columns, df_unser.columns)
+ tm.assert_numpy_array_equal(df.values, df_unser.values)
+
+ def test_frame_non_unique_index(self):
+ df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 1],
+ columns=['x', 'y'])
+
+ msg = "DataFrame index must be unique for orient='index'"
+ with pytest.raises(ValueError, match=msg):
+ df.to_json(orient='index')
+ msg = "DataFrame index must be unique for orient='columns'"
+ with pytest.raises(ValueError, match=msg):
+ df.to_json(orient='columns')
+
+ assert_frame_equal(df, read_json(df.to_json(orient='split'),
+ orient='split'))
+ unser = read_json(df.to_json(orient='records'), orient='records')
+ tm.assert_index_equal(df.columns, unser.columns)
+ tm.assert_almost_equal(df.values, unser.values)
+ unser = read_json(df.to_json(orient='values'), orient='values')
+ tm.assert_numpy_array_equal(df.values, unser.values)
+
+ def test_frame_non_unique_columns(self):
+ df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2],
+ columns=['x', 'x'])
+
+ msg = "DataFrame columns must be unique for orient='index'"
+ with pytest.raises(ValueError, match=msg):
+ df.to_json(orient='index')
+ msg = "DataFrame columns must be unique for orient='columns'"
+ with pytest.raises(ValueError, match=msg):
+ df.to_json(orient='columns')
+ msg = "DataFrame columns must be unique for orient='records'"
+ with pytest.raises(ValueError, match=msg):
+ df.to_json(orient='records')
+
+ assert_frame_equal(df, read_json(df.to_json(orient='split'),
+ orient='split', dtype=False))
+ unser = read_json(df.to_json(orient='values'), orient='values')
+ tm.assert_numpy_array_equal(df.values, unser.values)
+
+ # GH4377; duplicate columns not processing correctly
+ df = DataFrame([['a', 'b'], ['c', 'd']], index=[
+ 1, 2], columns=['x', 'y'])
+ result = read_json(df.to_json(orient='split'), orient='split')
+ assert_frame_equal(result, df)
+
+ def _check(df):
+ result = read_json(df.to_json(orient='split'), orient='split',
+ convert_dates=['x'])
+ assert_frame_equal(result, df)
+
+ for o in [[['a', 'b'], ['c', 'd']],
+ [[1.5, 2.5], [3.5, 4.5]],
+ [[1, 2.5], [3, 4.5]],
+ [[Timestamp('20130101'), 3.5],
+ [Timestamp('20130102'), 4.5]]]:
+ _check(DataFrame(o, index=[1, 2], columns=['x', 'x']))
+
+ def test_frame_from_json_to_json(self):
+ def _check_orient(df, orient, dtype=None, numpy=False,
+ convert_axes=True, check_dtype=True, raise_ok=None,
+ sort=None, check_index_type=True,
+ check_column_type=True, check_numpy_dtype=False):
+ if sort is not None:
+ df = df.sort_values(sort)
+ else:
+ df = df.sort_index()
+
+ # if we are not unique, then check that we are raising ValueError
+ # for the appropriate orients
+ if not df.index.is_unique and orient in ['index', 'columns']:
+ msg = ("DataFrame index must be unique for orient='{}'"
+ .format(orient))
+ with pytest.raises(ValueError, match=msg):
+ df.to_json(orient=orient)
+ return
+ if (not df.columns.is_unique and
+ orient in ['index', 'columns', 'records']):
+ # TODO: not executed. fix this.
+ with pytest.raises(ValueError, match='ksjkajksfjksjfkjs'):
+ df.to_json(orient=orient)
+ return
+
+ dfjson = df.to_json(orient=orient)
+
+ try:
+ unser = read_json(dfjson, orient=orient, dtype=dtype,
+ numpy=numpy, convert_axes=convert_axes)
+ except Exception as detail:
+ if raise_ok is not None:
+ if isinstance(detail, raise_ok):
+ return
+ raise
+
+ if sort is not None and sort in unser.columns:
+ unser = unser.sort_values(sort)
+ else:
+ unser = unser.sort_index()
+
+ if dtype is False:
+ check_dtype = False
+
+ if not convert_axes and df.index.dtype.type == np.datetime64:
+ unser.index = DatetimeIndex(
+ unser.index.values.astype('i8') * 1e6)
+ if orient == "records":
+ # index is not captured in this orientation
+ tm.assert_almost_equal(df.values, unser.values,
+ check_dtype=check_numpy_dtype)
+ tm.assert_index_equal(df.columns, unser.columns,
+ exact=check_column_type)
+ elif orient == "values":
+ # index and cols are not captured in this orientation
+ if numpy is True and df.shape == (0, 0):
+ assert unser.shape[0] == 0
+ else:
+ tm.assert_almost_equal(df.values, unser.values,
+ check_dtype=check_numpy_dtype)
+ elif orient == "split":
+ # index and col labels might not be strings
+ unser.index = [str(i) for i in unser.index]
+ unser.columns = [str(i) for i in unser.columns]
+
+ if sort is None:
+ unser = unser.sort_index()
+ tm.assert_almost_equal(df.values, unser.values,
+ check_dtype=check_numpy_dtype)
+ else:
+ if convert_axes:
+ tm.assert_frame_equal(df, unser, check_dtype=check_dtype,
+ check_index_type=check_index_type,
+ check_column_type=check_column_type)
+ else:
+ tm.assert_frame_equal(df, unser, check_less_precise=False,
+ check_dtype=check_dtype)
+
+ def _check_all_orients(df, dtype=None, convert_axes=True,
+ raise_ok=None, sort=None, check_index_type=True,
+ check_column_type=True):
+
+ # numpy=False
+ if convert_axes:
+ _check_orient(df, "columns", dtype=dtype, sort=sort,
+ check_index_type=False, check_column_type=False)
+ _check_orient(df, "records", dtype=dtype, sort=sort,
+ check_index_type=False, check_column_type=False)
+ _check_orient(df, "split", dtype=dtype, sort=sort,
+ check_index_type=False, check_column_type=False)
+ _check_orient(df, "index", dtype=dtype, sort=sort,
+ check_index_type=False, check_column_type=False)
+ _check_orient(df, "values", dtype=dtype, sort=sort,
+ check_index_type=False, check_column_type=False)
+
+ _check_orient(df, "columns", dtype=dtype,
+ convert_axes=False, sort=sort)
+ _check_orient(df, "records", dtype=dtype,
+ convert_axes=False, sort=sort)
+ _check_orient(df, "split", dtype=dtype,
+ convert_axes=False, sort=sort)
+ _check_orient(df, "index", dtype=dtype,
+ convert_axes=False, sort=sort)
+ _check_orient(df, "values", dtype=dtype,
+ convert_axes=False, sort=sort)
+
+ # numpy=True and raise_ok might be not None, so ignore the error
+ if convert_axes:
+ _check_orient(df, "columns", dtype=dtype, numpy=True,
+ raise_ok=raise_ok, sort=sort,
+ check_index_type=False, check_column_type=False)
+ _check_orient(df, "records", dtype=dtype, numpy=True,
+ raise_ok=raise_ok, sort=sort,
+ check_index_type=False, check_column_type=False)
+ _check_orient(df, "split", dtype=dtype, numpy=True,
+ raise_ok=raise_ok, sort=sort,
+ check_index_type=False, check_column_type=False)
+ _check_orient(df, "index", dtype=dtype, numpy=True,
+ raise_ok=raise_ok, sort=sort,
+ check_index_type=False, check_column_type=False)
+ _check_orient(df, "values", dtype=dtype, numpy=True,
+ raise_ok=raise_ok, sort=sort,
+ check_index_type=False, check_column_type=False)
+
+ _check_orient(df, "columns", dtype=dtype, numpy=True,
+ convert_axes=False, raise_ok=raise_ok, sort=sort)
+ _check_orient(df, "records", dtype=dtype, numpy=True,
+ convert_axes=False, raise_ok=raise_ok, sort=sort)
+ _check_orient(df, "split", dtype=dtype, numpy=True,
+ convert_axes=False, raise_ok=raise_ok, sort=sort)
+ _check_orient(df, "index", dtype=dtype, numpy=True,
+ convert_axes=False, raise_ok=raise_ok, sort=sort)
+ _check_orient(df, "values", dtype=dtype, numpy=True,
+ convert_axes=False, raise_ok=raise_ok, sort=sort)
+
+ # basic
+ _check_all_orients(self.frame)
+ assert self.frame.to_json() == self.frame.to_json(orient="columns")
+
+ _check_all_orients(self.intframe, dtype=self.intframe.values.dtype)
+ _check_all_orients(self.intframe, dtype=False)
+
+ # big one
+ # index and columns are strings as all unserialised JSON object keys
+ # are assumed to be strings
+ biggie = DataFrame(np.zeros((200, 4)),
+ columns=[str(i) for i in range(4)],
+ index=[str(i) for i in range(200)])
+ _check_all_orients(biggie, dtype=False, convert_axes=False)
+
+ # dtypes
+ _check_all_orients(DataFrame(biggie, dtype=np.float64),
+ dtype=np.float64, convert_axes=False)
+ _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int,
+ convert_axes=False)
+ _check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3',
+ convert_axes=False, raise_ok=ValueError)
+
+ # categorical
+ _check_all_orients(self.categorical, sort='sort', raise_ok=ValueError)
+
+ # empty
+ _check_all_orients(self.empty_frame, check_index_type=False,
+ check_column_type=False)
+
+ # time series data
+ _check_all_orients(self.tsframe)
+
+ # mixed data
+ index = pd.Index(['a', 'b', 'c', 'd', 'e'])
+ data = {'A': [0., 1., 2., 3., 4.],
+ 'B': [0., 1., 0., 1., 0.],
+ 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
+ 'D': [True, False, True, False, True]}
+ df = DataFrame(data=data, index=index)
+ _check_orient(df, "split", check_dtype=False)
+ _check_orient(df, "records", check_dtype=False)
+ _check_orient(df, "values", check_dtype=False)
+ _check_orient(df, "columns", check_dtype=False)
+        # index orient is problematic as it is read back in a transposed
+        # state, so the columns are interpreted as having mixed data and
+        # are given object dtypes.
+        # force everything to have object dtype beforehand
+ _check_orient(df.transpose().transpose(), "index", dtype=False)
+
+ def test_frame_from_json_bad_data(self):
+ with pytest.raises(ValueError, match='Expected object or value'):
+ read_json(StringIO('{"key":b:a:d}'))
+
+ # too few indices
+ json = StringIO('{"columns":["A","B"],'
+ '"index":["2","3"],'
+ '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
+ msg = r"Shape of passed values is \(3, 2\), indices imply \(2, 2\)"
+ with pytest.raises(ValueError, match=msg):
+ read_json(json, orient="split")
+
+ # too many columns
+ json = StringIO('{"columns":["A","B","C"],'
+ '"index":["1","2","3"],'
+ '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
+ msg = "3 columns passed, passed data had 2 columns"
+ with pytest.raises(AssertionError, match=msg):
+ read_json(json, orient="split")
+
+ # bad key
+ json = StringIO('{"badkey":["A","B"],'
+ '"index":["2","3"],'
+ '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
+ with pytest.raises(ValueError, match=r"unexpected key\(s\): badkey"):
+ read_json(json, orient="split")
+
+ def test_frame_from_json_nones(self):
+ df = DataFrame([[1, 2], [4, 5, 6]])
+ unser = read_json(df.to_json())
+ assert np.isnan(unser[2][0])
+
+ df = DataFrame([['1', '2'], ['4', '5', '6']])
+ unser = read_json(df.to_json())
+ assert np.isnan(unser[2][0])
+ unser = read_json(df.to_json(), dtype=False)
+ assert unser[2][0] is None
+ unser = read_json(df.to_json(), convert_axes=False, dtype=False)
+ assert unser['2']['0'] is None
+
+ unser = read_json(df.to_json(), numpy=False)
+ assert np.isnan(unser[2][0])
+ unser = read_json(df.to_json(), numpy=False, dtype=False)
+ assert unser[2][0] is None
+ unser = read_json(df.to_json(), numpy=False,
+ convert_axes=False, dtype=False)
+ assert unser['2']['0'] is None
+
+ # infinities get mapped to nulls which get mapped to NaNs during
+ # deserialisation
+ df = DataFrame([[1, 2], [4, 5, 6]])
+ df.loc[0, 2] = np.inf
+ unser = read_json(df.to_json())
+ assert np.isnan(unser[2][0])
+ unser = read_json(df.to_json(), dtype=False)
+ assert np.isnan(unser[2][0])
+
+ df.loc[0, 2] = np.NINF
+ unser = read_json(df.to_json())
+ assert np.isnan(unser[2][0])
+ unser = read_json(df.to_json(), dtype=False)
+ assert np.isnan(unser[2][0])
+
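+    # double_precision caps the number of decimal digits written out;
+    # values are rounded to that precision during encoding.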
+ @pytest.mark.skipif(is_platform_32bit(),
+ reason="not compliant on 32-bit, xref #15865")
+ def test_frame_to_json_float_precision(self):
+ df = pd.DataFrame([dict(a_float=0.95)])
+ encoded = df.to_json(double_precision=1)
+ assert encoded == '{"a_float":{"0":1.0}}'
+
+ df = pd.DataFrame([dict(a_float=1.95)])
+ encoded = df.to_json(double_precision=1)
+ assert encoded == '{"a_float":{"0":2.0}}'
+
+ df = pd.DataFrame([dict(a_float=-1.95)])
+ encoded = df.to_json(double_precision=1)
+ assert encoded == '{"a_float":{"0":-2.0}}'
+
+ df = pd.DataFrame([dict(a_float=0.995)])
+ encoded = df.to_json(double_precision=2)
+ assert encoded == '{"a_float":{"0":1.0}}'
+
+ df = pd.DataFrame([dict(a_float=0.9995)])
+ encoded = df.to_json(double_precision=3)
+ assert encoded == '{"a_float":{"0":1.0}}'
+
+ df = pd.DataFrame([dict(a_float=0.99999999999999944)])
+ encoded = df.to_json(double_precision=15)
+ assert encoded == '{"a_float":{"0":1.0}}'
+
+ def test_frame_to_json_except(self):
+ df = DataFrame([1, 2, 3])
+ msg = "Invalid value 'garbage' for option 'orient'"
+ with pytest.raises(ValueError, match=msg):
+ df.to_json(orient="garbage")
+
+ def test_frame_empty(self):
+ df = DataFrame(columns=['jim', 'joe'])
+ assert not df._is_mixed_type
+ assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df,
+ check_index_type=False)
+ # GH 7445
+ result = pd.DataFrame({'test': []}, index=[]).to_json(orient='columns')
+ expected = '{"test":{}}'
+ assert result == expected
+
+ def test_frame_empty_mixedtype(self):
+ # mixed type
+ df = DataFrame(columns=['jim', 'joe'])
+ df['joe'] = df['joe'].astype('i8')
+ assert df._is_mixed_type
+ assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df,
+ check_index_type=False)
+
+ def test_frame_mixedtype_orient(self): # GH10289
+ vals = [[10, 1, 'foo', .1, .01],
+ [20, 2, 'bar', .2, .02],
+ [30, 3, 'baz', .3, .03],
+ [40, 4, 'qux', .4, .04]]
+
+ df = DataFrame(vals, index=list('abcd'),
+ columns=['1st', '2nd', '3rd', '4th', '5th'])
+
+ assert df._is_mixed_type
+ right = df.copy()
+
+ for orient in ['split', 'index', 'columns']:
+ inp = df.to_json(orient=orient)
+ left = read_json(inp, orient=orient, convert_axes=False)
+ assert_frame_equal(left, right)
+
+ right.index = np.arange(len(df))
+ inp = df.to_json(orient='records')
+ left = read_json(inp, orient='records', convert_axes=False)
+ assert_frame_equal(left, right)
+
+ right.columns = np.arange(df.shape[1])
+ inp = df.to_json(orient='values')
+ left = read_json(inp, orient='values', convert_axes=False)
+ assert_frame_equal(left, right)
+
+ def test_v12_compat(self):
+ df = DataFrame(
+ [[1.56808523, 0.65727391, 1.81021139, -0.17251653],
+ [-0.2550111, -0.08072427, -0.03202878, -0.17581665],
+ [1.51493992, 0.11805825, 1.629455, -1.31506612],
+ [-0.02765498, 0.44679743, 0.33192641, -0.27885413],
+ [0.05951614, -2.69652057, 1.28163262, 0.34703478]],
+ columns=['A', 'B', 'C', 'D'],
+ index=pd.date_range('2000-01-03', '2000-01-07'))
+ df['date'] = pd.Timestamp('19920106 18:21:32.12')
+ df.iloc[3, df.columns.get_loc('date')] = pd.Timestamp('20130101')
+ df['modified'] = df['date']
+ df.iloc[1, df.columns.get_loc('modified')] = pd.NaT
+
+ v12_json = os.path.join(self.dirpath, 'tsframe_v012.json')
+ df_unser = pd.read_json(v12_json)
+ assert_frame_equal(df, df_unser)
+
+ df_iso = df.drop(['modified'], axis=1)
+ v12_iso_json = os.path.join(self.dirpath, 'tsframe_iso_v012.json')
+ df_unser_iso = pd.read_json(v12_iso_json)
+ assert_frame_equal(df_iso, df_unser_iso)
+
+ def test_blocks_compat_GH9037(self):
+ index = pd.date_range('20000101', periods=10, freq='H')
+ df_mixed = DataFrame(OrderedDict(
+ float_1=[-0.92077639, 0.77434435, 1.25234727, 0.61485564,
+ -0.60316077, 0.24653374, 0.28668979, -2.51969012,
+ 0.95748401, -1.02970536],
+ int_1=[19680418, 75337055, 99973684, 65103179, 79373900,
+ 40314334, 21290235, 4991321, 41903419, 16008365],
+ str_1=['78c608f1', '64a99743', '13d2ff52', 'ca7f4af2', '97236474',
+ 'bde7e214', '1a6bde47', 'b1190be5', '7a669144', '8d64d068'],
+ float_2=[-0.0428278, -1.80872357, 3.36042349, -0.7573685,
+ -0.48217572, 0.86229683, 1.08935819, 0.93898739,
+ -0.03030452, 1.43366348],
+ str_2=['14f04af9', 'd085da90', '4bcfac83', '81504caf', '2ffef4a9',
+ '08e2f5c4', '07e1af03', 'addbd4a7', '1f6a09ba', '4bfc4d87'],
+ int_2=[86967717, 98098830, 51927505, 20372254, 12601730, 20884027,
+ 34193846, 10561746, 24867120, 76131025]
+ ), index=index)
+
+ # JSON deserialisation always creates unicode strings
+ df_mixed.columns = df_mixed.columns.astype('unicode')
+
+ df_roundtrip = pd.read_json(df_mixed.to_json(orient='split'),
+ orient='split')
+ assert_frame_equal(df_mixed, df_roundtrip,
+ check_index_type=True,
+ check_column_type=True,
+ check_frame_type=True,
+ by_blocks=True,
+ check_exact=True)
+
+ def test_frame_nonprintable_bytes(self):
+ # GH14256: failing column caused segfaults, if it is not the last one
+
+ class BinaryThing(object):
+
+ def __init__(self, hexed):
+ self.hexed = hexed
+ if compat.PY2:
+ self.binary = hexed.decode('hex')
+ else:
+ self.binary = bytes.fromhex(hexed)
+
+ def __str__(self):
+ return self.hexed
+
+ hexed = '574b4454ba8c5eb4f98a8f45'
+ binthing = BinaryThing(hexed)
+
+ # verify the proper conversion of printable content
+ df_printable = DataFrame({'A': [binthing.hexed]})
+ assert df_printable.to_json() == \
+ '{{"A":{{"0":"{hex}"}}}}'.format(hex=hexed)
+
+ # check if non-printable content throws appropriate Exception
+ df_nonprintable = DataFrame({'A': [binthing]})
+ msg = "Unsupported UTF-8 sequence length when encoding string"
+ with pytest.raises(OverflowError, match=msg):
+ df_nonprintable.to_json()
+
+ # the same with multiple columns threw segfaults
+ df_mixed = DataFrame({'A': [binthing], 'B': [1]},
+ columns=['A', 'B'])
+ with pytest.raises(OverflowError):
+ df_mixed.to_json()
+
+ # default_handler should resolve exceptions for non-string types
+ assert df_nonprintable.to_json(default_handler=str) == \
+ '{{"A":{{"0":"{hex}"}}}}'.format(hex=hexed)
+ assert df_mixed.to_json(default_handler=str) == \
+ '{{"A":{{"0":"{hex}"}},"B":{{"0":1}}}}'.format(hex=hexed)
+
+ def test_label_overflow(self):
+ # GH14256: buffer length not checked when writing label
+ df = pd.DataFrame({'bar' * 100000: [1], 'foo': [1337]})
+ assert df.to_json() == \
+ '{{"{bar}":{{"0":1}},"foo":{{"0":1337}}}}'.format(
+ bar=('bar' * 100000))
+
+ def test_series_non_unique_index(self):
+ s = Series(['a', 'b'], index=[1, 1])
+
+ msg = "Series index must be unique for orient='index'"
+ with pytest.raises(ValueError, match=msg):
+ s.to_json(orient='index')
+
+ assert_series_equal(s, read_json(s.to_json(orient='split'),
+ orient='split', typ='series'))
+ unser = read_json(s.to_json(orient='records'),
+ orient='records', typ='series')
+ tm.assert_numpy_array_equal(s.values, unser.values)
+
+ def test_series_from_json_to_json(self):
+
+ def _check_orient(series, orient, dtype=None, numpy=False,
+ check_index_type=True):
+ series = series.sort_index()
+ unser = read_json(series.to_json(orient=orient),
+ typ='series', orient=orient, numpy=numpy,
+ dtype=dtype)
+ unser = unser.sort_index()
+ if orient == "records" or orient == "values":
+ assert_almost_equal(series.values, unser.values)
+ else:
+ if orient == "split":
+ assert_series_equal(series, unser,
+ check_index_type=check_index_type)
+ else:
+ assert_series_equal(series, unser, check_names=False,
+ check_index_type=check_index_type)
+
+ def _check_all_orients(series, dtype=None, check_index_type=True):
+ _check_orient(series, "columns", dtype=dtype,
+ check_index_type=check_index_type)
+ _check_orient(series, "records", dtype=dtype,
+ check_index_type=check_index_type)
+ _check_orient(series, "split", dtype=dtype,
+ check_index_type=check_index_type)
+ _check_orient(series, "index", dtype=dtype,
+ check_index_type=check_index_type)
+ _check_orient(series, "values", dtype=dtype)
+
+ _check_orient(series, "columns", dtype=dtype, numpy=True,
+ check_index_type=check_index_type)
+ _check_orient(series, "records", dtype=dtype, numpy=True,
+ check_index_type=check_index_type)
+ _check_orient(series, "split", dtype=dtype, numpy=True,
+ check_index_type=check_index_type)
+ _check_orient(series, "index", dtype=dtype, numpy=True,
+ check_index_type=check_index_type)
+ _check_orient(series, "values", dtype=dtype, numpy=True,
+ check_index_type=check_index_type)
+
+ # basic
+ _check_all_orients(self.series)
+ assert self.series.to_json() == self.series.to_json(orient="index")
+
+ objSeries = Series([str(d) for d in self.objSeries],
+ index=self.objSeries.index,
+ name=self.objSeries.name)
+ _check_all_orients(objSeries, dtype=False)
+
+ # empty_series has empty index with object dtype
+        # empty_series has an empty index with object dtype,
+        # which cannot be reverted on round trip
+ _check_all_orients(self.empty_series, check_index_type=False)
+
+ _check_all_orients(self.ts)
+
+ # dtype
+ s = Series(lrange(6), index=['a', 'b', 'c', 'd', 'e', 'f'])
+ _check_all_orients(Series(s, dtype=np.float64), dtype=np.float64)
+ _check_all_orients(Series(s, dtype=np.int), dtype=np.int)
+
+ def test_series_to_json_except(self):
+ s = Series([1, 2, 3])
+ msg = "Invalid value 'garbage' for option 'orient'"
+ with pytest.raises(ValueError, match=msg):
+ s.to_json(orient="garbage")
+
+ def test_series_from_json_precise_float(self):
+ s = Series([4.56, 4.56, 4.56])
+ result = read_json(s.to_json(), typ='series', precise_float=True)
+ assert_series_equal(result, s, check_index_type=False)
+
+ def test_series_with_dtype(self):
+ # GH 21986
+ s = Series([4.56, 4.56, 4.56])
+ result = read_json(s.to_json(), typ='series', dtype=np.int64)
+ expected = Series([4] * 3)
+ assert_series_equal(result, expected)
+
+ def test_frame_from_json_precise_float(self):
+ df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]])
+ result = read_json(df.to_json(), precise_float=True)
+ assert_frame_equal(result, df, check_index_type=False,
+ check_column_type=False)
+
+ def test_typ(self):
+
+ s = Series(lrange(6), index=['a', 'b', 'c',
+ 'd', 'e', 'f'], dtype='int64')
+ result = read_json(s.to_json(), typ=None)
+ assert_series_equal(result, s)
+
+ def test_reconstruction_index(self):
+
+ df = DataFrame([[1, 2, 3], [4, 5, 6]])
+ result = read_json(df.to_json())
+
+ assert_frame_equal(result, df)
+
+ df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=['A', 'B', 'C'])
+ result = read_json(df.to_json())
+ assert_frame_equal(result, df)
+
+ def test_path(self):
+ with ensure_clean('test.json') as path:
+ for df in [self.frame, self.frame2, self.intframe, self.tsframe,
+ self.mixed_frame]:
+ df.to_json(path)
+ read_json(path)
+
+ def test_axis_dates(self):
+
+ # frame
+ json = self.tsframe.to_json()
+ result = read_json(json)
+ assert_frame_equal(result, self.tsframe)
+
+ # series
+ json = self.ts.to_json()
+ result = read_json(json, typ='series')
+ assert_series_equal(result, self.ts, check_names=False)
+ assert result.name is None
+
+ def test_convert_dates(self):
+
+ # frame
+ df = self.tsframe.copy()
+ df['date'] = Timestamp('20130101')
+
+ json = df.to_json()
+ result = read_json(json)
+ assert_frame_equal(result, df)
+
+ df['foo'] = 1.
+ json = df.to_json(date_unit='ns')
+
+ result = read_json(json, convert_dates=False)
+ expected = df.copy()
+ expected['date'] = expected['date'].values.view('i8')
+ expected['foo'] = expected['foo'].astype('int64')
+ assert_frame_equal(result, expected)
+
+ # series
+ ts = Series(Timestamp('20130101'), index=self.ts.index)
+ json = ts.to_json()
+ result = read_json(json, typ='series')
+ assert_series_equal(result, ts)
+
+ def test_convert_dates_infer(self):
+ # GH10747
+ from pandas.io.json import dumps
+ infer_words = ['trade_time', 'date', 'datetime', 'sold_at',
+ 'modified', 'timestamp', 'timestamps']
+ for infer_word in infer_words:
+ data = [{'id': 1, infer_word: 1036713600000}, {'id': 2}]
+ expected = DataFrame([[1, Timestamp('2002-11-08')], [2, pd.NaT]],
+ columns=['id', infer_word])
+ result = read_json(dumps(data))[['id', infer_word]]
+ assert_frame_equal(result, expected)
+
+ def test_date_format_frame(self):
+ df = self.tsframe.copy()
+
+ def test_w_date(date, date_unit=None):
+ df['date'] = Timestamp(date)
+ df.iloc[1, df.columns.get_loc('date')] = pd.NaT
+ df.iloc[5, df.columns.get_loc('date')] = pd.NaT
+ if date_unit:
+ json = df.to_json(date_format='iso', date_unit=date_unit)
+ else:
+ json = df.to_json(date_format='iso')
+ result = read_json(json)
+ assert_frame_equal(result, df)
+
+ test_w_date('20130101 20:43:42.123')
+ test_w_date('20130101 20:43:42', date_unit='s')
+ test_w_date('20130101 20:43:42.123', date_unit='ms')
+ test_w_date('20130101 20:43:42.123456', date_unit='us')
+ test_w_date('20130101 20:43:42.123456789', date_unit='ns')
+
+ msg = "Invalid value 'foo' for option 'date_unit'"
+ with pytest.raises(ValueError, match=msg):
+ df.to_json(date_format='iso', date_unit='foo')
+
+ def test_date_format_series(self):
+ def test_w_date(date, date_unit=None):
+ ts = Series(Timestamp(date), index=self.ts.index)
+ ts.iloc[1] = pd.NaT
+ ts.iloc[5] = pd.NaT
+ if date_unit:
+ json = ts.to_json(date_format='iso', date_unit=date_unit)
+ else:
+ json = ts.to_json(date_format='iso')
+ result = read_json(json, typ='series')
+ assert_series_equal(result, ts)
+
+ test_w_date('20130101 20:43:42.123')
+ test_w_date('20130101 20:43:42', date_unit='s')
+ test_w_date('20130101 20:43:42.123', date_unit='ms')
+ test_w_date('20130101 20:43:42.123456', date_unit='us')
+ test_w_date('20130101 20:43:42.123456789', date_unit='ns')
+
+ ts = Series(Timestamp('20130101 20:43:42.123'), index=self.ts.index)
+ msg = "Invalid value 'foo' for option 'date_unit'"
+ with pytest.raises(ValueError, match=msg):
+ ts.to_json(date_format='iso', date_unit='foo')
+
+ def test_date_unit(self):
+ df = self.tsframe.copy()
+ df['date'] = Timestamp('20130101 20:43:42')
+ dl = df.columns.get_loc('date')
+ df.iloc[1, dl] = Timestamp('19710101 20:43:42')
+ df.iloc[2, dl] = Timestamp('21460101 20:43:42')
+ df.iloc[4, dl] = pd.NaT
+
+ for unit in ('s', 'ms', 'us', 'ns'):
+ json = df.to_json(date_format='epoch', date_unit=unit)
+
+ # force date unit
+ result = read_json(json, date_unit=unit)
+ assert_frame_equal(result, df)
+
+ # detect date unit
+ result = read_json(json, date_unit=None)
+ assert_frame_equal(result, df)
+
+ def test_weird_nested_json(self):
+ # this used to core dump the parser
+ s = r'''{
+ "status": "success",
+ "data": {
+ "posts": [
+ {
+ "id": 1,
+ "title": "A blog post",
+ "body": "Some useful content"
+ },
+ {
+ "id": 2,
+ "title": "Another blog post",
+ "body": "More content"
+ }
+ ]
+ }
+ }'''
+
+ read_json(s)
+
+ def test_doc_example(self):
+ dfj2 = DataFrame(np.random.randn(5, 2), columns=list('AB'))
+ dfj2['date'] = Timestamp('20130101')
+ dfj2['ints'] = lrange(5)
+ dfj2['bools'] = True
+ dfj2.index = pd.date_range('20130101', periods=5)
+
+ json = dfj2.to_json()
+ result = read_json(json, dtype={'ints': np.int64, 'bools': np.bool_})
+        # compare the round-tripped frame against the original, not itself
+        assert_frame_equal(result, dfj2)
+
+ def test_misc_example(self):
+
+ # parsing unordered input fails
+ result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]', numpy=True)
+ expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+
+ error_msg = """DataFrame\\.index are different
+
+DataFrame\\.index values are different \\(100\\.0 %\\)
+\\[left\\]: Index\\(\\[u?'a', u?'b'\\], dtype='object'\\)
+\\[right\\]: RangeIndex\\(start=0, stop=2, step=1\\)"""
+ with pytest.raises(AssertionError, match=error_msg):
+ assert_frame_equal(result, expected, check_index_type=False)
+
+ result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]')
+ expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+ assert_frame_equal(result, expected)
+
+ @network
+ @pytest.mark.single
+ def test_round_trip_exception_(self):
+ # GH 3867
+ csv = 'https://raw.github.com/hayd/lahman2012/master/csvs/Teams.csv'
+ df = pd.read_csv(csv)
+ s = df.to_json()
+ result = pd.read_json(s)
+ assert_frame_equal(result.reindex(
+ index=df.index, columns=df.columns), df)
+
+ @network
+ @pytest.mark.single
+ def test_url(self):
+ url = 'https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5' # noqa
+ result = read_json(url, convert_dates=True)
+ for c in ['created_at', 'closed_at', 'updated_at']:
+ assert result[c].dtype == 'datetime64[ns]'
+
+ def test_timedelta(self):
+ converter = lambda x: pd.to_timedelta(x, unit='ms')
+
+ s = Series([timedelta(23), timedelta(seconds=5)])
+ assert s.dtype == 'timedelta64[ns]'
+
+ result = pd.read_json(s.to_json(), typ='series').apply(converter)
+ assert_series_equal(result, s)
+
+ s = Series([timedelta(23), timedelta(seconds=5)],
+ index=pd.Index([0, 1]))
+ assert s.dtype == 'timedelta64[ns]'
+ result = pd.read_json(s.to_json(), typ='series').apply(converter)
+ assert_series_equal(result, s)
+
+ frame = DataFrame([timedelta(23), timedelta(seconds=5)])
+ assert frame[0].dtype == 'timedelta64[ns]'
+ assert_frame_equal(frame, pd.read_json(frame.to_json())
+ .apply(converter))
+
+ frame = DataFrame({'a': [timedelta(days=23), timedelta(seconds=5)],
+ 'b': [1, 2],
+ 'c': pd.date_range(start='20130101', periods=2)})
+
+ result = pd.read_json(frame.to_json(date_unit='ns'))
+ result['a'] = pd.to_timedelta(result.a, unit='ns')
+ result['c'] = pd.to_datetime(result.c)
+ assert_frame_equal(frame, result)
+
+ def test_mixed_timedelta_datetime(self):
+ frame = DataFrame({'a': [timedelta(23), pd.Timestamp('20130101')]},
+ dtype=object)
+
+ expected = DataFrame({'a': [pd.Timedelta(frame.a[0]).value,
+ pd.Timestamp(frame.a[1]).value]})
+ result = pd.read_json(frame.to_json(date_unit='ns'),
+ dtype={'a': 'int64'})
+ assert_frame_equal(result, expected, check_index_type=False)
+
+ def test_default_handler(self):
+ value = object()
+ frame = DataFrame({'a': [7, value]})
+ expected = DataFrame({'a': [7, str(value)]})
+ result = pd.read_json(frame.to_json(default_handler=str))
+ assert_frame_equal(expected, result, check_index_type=False)
+
+ def test_default_handler_indirect(self):
+ from pandas.io.json import dumps
+
+ def default(obj):
+ if isinstance(obj, complex):
+ return [('mathjs', 'Complex'),
+ ('re', obj.real),
+ ('im', obj.imag)]
+ return str(obj)
+ df_list = [9, DataFrame({'a': [1, 'STR', complex(4, -5)],
+ 'b': [float('nan'), None, 'N/A']},
+ columns=['a', 'b'])]
+ expected = ('[9,[[1,null],["STR",null],[[["mathjs","Complex"],'
+ '["re",4.0],["im",-5.0]],"N\\/A"]]]')
+ assert dumps(df_list, default_handler=default,
+ orient="values") == expected
+
+ def test_default_handler_numpy_unsupported_dtype(self):
+ # GH12554 to_json raises 'Unhandled numpy dtype 15'
+ df = DataFrame({'a': [1, 2.3, complex(4, -5)],
+ 'b': [float('nan'), None, complex(1.2, 0)]},
+ columns=['a', 'b'])
+ expected = ('[["(1+0j)","(nan+0j)"],'
+ '["(2.3+0j)","(nan+0j)"],'
+ '["(4-5j)","(1.2+0j)"]]')
+ assert df.to_json(default_handler=str, orient="values") == expected
+
+ def test_default_handler_raises(self):
+ msg = "raisin"
+
+ def my_handler_raises(obj):
+ raise TypeError(msg)
+ with pytest.raises(TypeError, match=msg):
+ DataFrame({'a': [1, 2, object()]}).to_json(
+ default_handler=my_handler_raises)
+ with pytest.raises(TypeError, match=msg):
+ DataFrame({'a': [1, 2, complex(4, -5)]}).to_json(
+ default_handler=my_handler_raises)
+
+ def test_categorical(self):
+ # GH4377 df.to_json segfaults with non-ndarray blocks
+ df = DataFrame({"A": ["a", "b", "c", "a", "b", "b", "a"]})
+ df["B"] = df["A"]
+ expected = df.to_json()
+
+ df["B"] = df["A"].astype('category')
+ assert expected == df.to_json()
+
+ s = df["A"]
+ sc = df["B"]
+ assert s.to_json() == sc.to_json()
+
+ def test_datetime_tz(self):
+ # GH4377 df.to_json segfaults with non-ndarray blocks
+ tz_range = pd.date_range('20130101', periods=3, tz='US/Eastern')
+ tz_naive = tz_range.tz_convert('utc').tz_localize(None)
+
+ df = DataFrame({
+ 'A': tz_range,
+ 'B': pd.date_range('20130101', periods=3)})
+
+ df_naive = df.copy()
+ df_naive['A'] = tz_naive
+ expected = df_naive.to_json()
+ assert expected == df.to_json()
+
+ stz = Series(tz_range)
+ s_naive = Series(tz_naive)
+ assert stz.to_json() == s_naive.to_json()
+
+ def test_sparse(self):
+ # GH4377 df.to_json segfaults with non-ndarray blocks
+ df = pd.DataFrame(np.random.randn(10, 4))
+ df.loc[:8] = np.nan
+
+ sdf = df.to_sparse()
+ expected = df.to_json()
+ assert expected == sdf.to_json()
+
+ s = pd.Series(np.random.randn(10))
+ s.loc[:8] = np.nan
+ ss = s.to_sparse()
+
+ expected = s.to_json()
+ assert expected == ss.to_json()
+
+ def test_tz_is_utc(self):
+ from pandas.io.json import dumps
+ exp = '"2013-01-10T05:00:00.000Z"'
+
+ ts = Timestamp('2013-01-10 05:00:00Z')
+ assert dumps(ts, iso_dates=True) == exp
+ dt = ts.to_pydatetime()
+ assert dumps(dt, iso_dates=True) == exp
+
+ ts = Timestamp('2013-01-10 00:00:00', tz='US/Eastern')
+ assert dumps(ts, iso_dates=True) == exp
+ dt = ts.to_pydatetime()
+ assert dumps(dt, iso_dates=True) == exp
+
+ ts = Timestamp('2013-01-10 00:00:00-0500')
+ assert dumps(ts, iso_dates=True) == exp
+ dt = ts.to_pydatetime()
+ assert dumps(dt, iso_dates=True) == exp
+
+ def test_tz_range_is_utc(self):
+ from pandas.io.json import dumps
+
+ exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]'
+ dfexp = ('{"DT":{'
+ '"0":"2013-01-01T05:00:00.000Z",'
+ '"1":"2013-01-02T05:00:00.000Z"}}')
+
+ tz_range = pd.date_range('2013-01-01 05:00:00Z', periods=2)
+ assert dumps(tz_range, iso_dates=True) == exp
+ dti = pd.DatetimeIndex(tz_range)
+ assert dumps(dti, iso_dates=True) == exp
+ df = DataFrame({'DT': dti})
+ result = dumps(df, iso_dates=True)
+ assert result == dfexp
+
+ tz_range = pd.date_range('2013-01-01 00:00:00', periods=2,
+ tz='US/Eastern')
+ assert dumps(tz_range, iso_dates=True) == exp
+ dti = pd.DatetimeIndex(tz_range)
+ assert dumps(dti, iso_dates=True) == exp
+ df = DataFrame({'DT': dti})
+ assert dumps(df, iso_dates=True) == dfexp
+
+ tz_range = pd.date_range('2013-01-01 00:00:00-0500', periods=2)
+ assert dumps(tz_range, iso_dates=True) == exp
+ dti = pd.DatetimeIndex(tz_range)
+ assert dumps(dti, iso_dates=True) == exp
+ df = DataFrame({'DT': dti})
+ assert dumps(df, iso_dates=True) == dfexp
+
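+    # lines=True reads newline-delimited JSON: one record per line.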
+ def test_read_inline_jsonl(self):
+ # GH9180
+ result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
+ expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+ assert_frame_equal(result, expected)
+
+ @td.skip_if_not_us_locale
+ def test_read_s3_jsonl(self, s3_resource):
+ # GH17200
+
+ result = read_json('s3n://pandas-test/items.jsonl', lines=True)
+ expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+ assert_frame_equal(result, expected)
+
+ def test_read_local_jsonl(self):
+ # GH17200
+ with ensure_clean('tmp_items.json') as path:
+ with open(path, 'w') as infile:
+ infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
+ result = read_json(path, lines=True)
+ expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+ assert_frame_equal(result, expected)
+
+ def test_read_jsonl_unicode_chars(self):
+ # GH15132: non-ascii unicode characters
+ # \u201d == RIGHT DOUBLE QUOTATION MARK
+
+ # simulate file handle
+ json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
+ json = StringIO(json)
+ result = read_json(json, lines=True)
+ expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
+ columns=['a', 'b'])
+ assert_frame_equal(result, expected)
+
+ # simulate string
+ json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
+ result = read_json(json, lines=True)
+ expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
+ columns=['a', 'b'])
+ assert_frame_equal(result, expected)
+
+ def test_read_json_large_numbers(self):
+ # GH18842
+ json = '{"articleId": "1404366058080022500245"}'
+ json = StringIO(json)
+ result = read_json(json, typ="series")
+ expected = Series(1.404366e+21, index=['articleId'])
+ assert_series_equal(result, expected)
+
+ json = '{"0": {"articleId": "1404366058080022500245"}}'
+ json = StringIO(json)
+ result = read_json(json)
+ expected = DataFrame(1.404366e+21, index=['articleId'], columns=[0])
+ assert_frame_equal(result, expected)
+
+ def test_to_jsonl(self):
+ # GH9180
+ df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+ result = df.to_json(orient="records", lines=True)
+ expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
+ assert result == expected
+
+ df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
+ result = df.to_json(orient="records", lines=True)
+ expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
+ assert result == expected
+ assert_frame_equal(pd.read_json(result, lines=True), df)
+
+ # GH15096: escaped characters in columns and data
+ df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
+ columns=["a\\", 'b'])
+ result = df.to_json(orient="records", lines=True)
+ expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
+ '{"a\\\\":"foo\\"","b":"bar"}')
+ assert result == expected
+ assert_frame_equal(pd.read_json(result, lines=True), df)
+
+ def test_latin_encoding(self):
+ if compat.PY2:
+ pytest.skip("[unicode] is not implemented as a table column")
+
+ # GH 13774
+ pytest.skip("encoding not implemented in .to_json(), "
+ "xref #13774")
+
+ values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
+ [b'E\xc9, 17', b'a', b'b', b'c'],
+ [b'EE, 17', b'', b'a', b'b', b'c'],
+ [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
+ [b'', b'a', b'b', b'c'],
+ [b'\xf8\xfc', b'a', b'b', b'c'],
+ [b'A\xf8\xfc', b'', b'a', b'b', b'c'],
+ [np.nan, b'', b'b', b'c'],
+ [b'A\xf8\xfc', np.nan, b'', b'b', b'c']]
+
+ def _try_decode(x, encoding='latin-1'):
+ try:
+ return x.decode(encoding)
+ except AttributeError:
+ return x
+
+        # decode the raw bytes so the values are text on both Python 2 and 3
+ values = [[_try_decode(x) for x in y] for y in values]
+
+ examples = []
+ for dtype in ['category', object]:
+ for val in values:
+ examples.append(Series(val, dtype=dtype))
+
+ def roundtrip(s, encoding='latin-1'):
+ with ensure_clean('test.json') as path:
+ s.to_json(path, encoding=encoding)
+ retr = read_json(path, encoding=encoding)
+ assert_series_equal(s, retr, check_categorical=False)
+
+ for s in examples:
+ roundtrip(s)
+
+ def test_data_frame_size_after_to_json(self):
+ # GH15344
+ df = DataFrame({'a': [str(1)]})
+
+ size_before = df.memory_usage(index=True, deep=True).sum()
+ df.to_json()
+ size_after = df.memory_usage(index=True, deep=True).sum()
+
+ assert size_before == size_after
+
+ @pytest.mark.parametrize('data, expected', [
+ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']),
+ {'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}),
+ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']).rename_axis('foo'),
+ {'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}),
+ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b'],
+ index=[['a', 'b'], ['c', 'd']]),
+ {'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}),
+ (Series([1, 2, 3], name='A'),
+ {'name': 'A', 'data': [1, 2, 3]}),
+ (Series([1, 2, 3], name='A').rename_axis('foo'),
+ {'name': 'A', 'data': [1, 2, 3]}),
+ (Series([1, 2], name='A', index=[['a', 'b'], ['c', 'd']]),
+ {'name': 'A', 'data': [1, 2]}),
+ ])
+ def test_index_false_to_json_split(self, data, expected):
+ # GH 17394
+ # Testing index=False in to_json with orient='split'
+
+ result = data.to_json(orient='split', index=False)
+ result = json.loads(result)
+
+ assert result == expected
+
+ @pytest.mark.parametrize('data', [
+ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b'])),
+ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']).rename_axis('foo')),
+ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b'],
+ index=[['a', 'b'], ['c', 'd']])),
+ (Series([1, 2, 3], name='A')),
+ (Series([1, 2, 3], name='A').rename_axis('foo')),
+ (Series([1, 2], name='A', index=[['a', 'b'], ['c', 'd']])),
+ ])
+ def test_index_false_to_json_table(self, data):
+ # GH 17394
+ # Testing index=False in to_json with orient='table'
+
+ result = data.to_json(orient='table', index=False)
+ result = json.loads(result)
+
+ expected = {
+ 'schema': pd.io.json.build_table_schema(data, index=False),
+ 'data': DataFrame(data).to_dict(orient='records')
+ }
+
+ assert result == expected
+
+ @pytest.mark.parametrize('orient', [
+ 'records', 'index', 'columns', 'values'
+ ])
+ def test_index_false_error_to_json(self, orient):
+ # GH 17394
+ # Testing error message from to_json with index=False
+
+ df = pd.DataFrame([[1, 2], [4, 5]], columns=['a', 'b'])
+
+ msg = ("'index=False' is only valid when "
+ "'orient' is 'split' or 'table'")
+ with pytest.raises(ValueError, match=msg):
+ df.to_json(orient=orient, index=False)
+
+ @pytest.mark.parametrize('orient', ['split', 'table'])
+ @pytest.mark.parametrize('index', [True, False])
+ def test_index_false_from_json_to_json(self, orient, index):
+ # GH25170
+ # Test index=False in from_json to_json
+ expected = DataFrame({'a': [1, 2], 'b': [3, 4]})
+ dfjson = expected.to_json(orient=orient, index=index)
+ result = read_json(dfjson, orient=orient)
+ assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/json/test_readlines.py b/contrib/python/pandas/py2/pandas/tests/io/json/test_readlines.py
new file mode 100644
index 00000000000..25e78526b2e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/json/test_readlines.py
@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+from pandas.compat import StringIO
+
+import pandas as pd
+from pandas import DataFrame, read_json
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_frame_equal, assert_series_equal, ensure_clean)
+
+from pandas.io.json.json import JsonReader
+
+
[email protected]
+def lines_json_df():
+ df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+ return df.to_json(lines=True, orient="records")
+
+
+def test_read_jsonl():
+ # GH9180
+ result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
+ expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+ assert_frame_equal(result, expected)
+
+
+def test_read_jsonl_unicode_chars():
+ # GH15132: non-ascii unicode characters
+ # \u201d == RIGHT DOUBLE QUOTATION MARK
+
+ # simulate file handle
+ json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
+ json = StringIO(json)
+ result = read_json(json, lines=True)
+ expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
+ columns=['a', 'b'])
+ assert_frame_equal(result, expected)
+
+ # simulate string
+ json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
+ result = read_json(json, lines=True)
+ expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
+ columns=['a', 'b'])
+ assert_frame_equal(result, expected)
+
+
+def test_to_jsonl():
+ # GH9180
+ df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+ result = df.to_json(orient="records", lines=True)
+ expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
+ assert result == expected
+
+ df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
+ result = df.to_json(orient="records", lines=True)
+ expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
+ assert result == expected
+ assert_frame_equal(read_json(result, lines=True), df)
+
+ # GH15096: escaped characters in columns and data
+ df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
+ columns=["a\\", 'b'])
+ result = df.to_json(orient="records", lines=True)
+ expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
+ '{"a\\\\":"foo\\"","b":"bar"}')
+ assert result == expected
+ assert_frame_equal(read_json(result, lines=True), df)
+
+
[email protected]("chunksize", [1, 1.0])
+def test_readjson_chunks(lines_json_df, chunksize):
+    # Basic test that read_json with a chunksize gives the same result as
+    # reading without one.
+ # GH17048: memory usage when lines=True
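+    # (chunksize=1.0 is accepted because read_json only requires an
+    # integral value, not an int instance.)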
+
+ unchunked = read_json(StringIO(lines_json_df), lines=True)
+ reader = read_json(StringIO(lines_json_df), lines=True,
+ chunksize=chunksize)
+ chunked = pd.concat(reader)
+
+ assert_frame_equal(chunked, unchunked)
+
+
+def test_readjson_chunksize_requires_lines(lines_json_df):
+ msg = "chunksize can only be passed if lines=True"
+ with pytest.raises(ValueError, match=msg):
+ pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
+
+
+def test_readjson_chunks_series():
+ # Test reading line-format JSON to Series with chunksize param
+ s = pd.Series({'A': 1, 'B': 2})
+
+ strio = StringIO(s.to_json(lines=True, orient="records"))
+ unchunked = pd.read_json(strio, lines=True, typ='Series')
+
+ strio = StringIO(s.to_json(lines=True, orient="records"))
+ chunked = pd.concat(pd.read_json(
+ strio, lines=True, typ='Series', chunksize=1
+ ))
+
+ assert_series_equal(chunked, unchunked)
+
+
+def test_readjson_each_chunk(lines_json_df):
+    # Other tests check that the final result of read_json(lines=True,
+    # chunksize=n) is correct. This checks the intermediate chunks.
+ chunks = list(
+ pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
+ )
+ assert chunks[0].shape == (2, 2)
+ assert chunks[1].shape == (1, 2)
+
+
+def test_readjson_chunks_from_file():
+ with ensure_clean('test.json') as path:
+ df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+ df.to_json(path, lines=True, orient="records")
+ chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
+ unchunked = pd.read_json(path, lines=True)
+ assert_frame_equal(unchunked, chunked)
+
+
[email protected]("chunksize", [None, 1])
+def test_readjson_chunks_closes(chunksize):
+ with ensure_clean('test.json') as path:
+ df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+ df.to_json(path, lines=True, orient="records")
+ reader = JsonReader(
+ path, orient=None, typ="frame", dtype=True, convert_axes=True,
+ convert_dates=True, keep_default_dates=True, numpy=False,
+ precise_float=False, date_unit=None, encoding=None,
+ lines=True, chunksize=chunksize, compression=None)
+ reader.read()
+        assert reader.open_stream.closed, (
+            "didn't close stream with chunksize = "
+            "{chunksize}".format(chunksize=chunksize))
+
+
[email protected]("chunksize", [0, -1, 2.2, "foo"])
+def test_readjson_invalid_chunksize(lines_json_df, chunksize):
+ msg = r"'chunksize' must be an integer >=1"
+
+ with pytest.raises(ValueError, match=msg):
+ pd.read_json(StringIO(lines_json_df), lines=True,
+ chunksize=chunksize)
+
+
[email protected]("chunksize", [None, 1, 2])
+def test_readjson_chunks_multiple_empty_lines(chunksize):
+ j = """
+
+ {"A":1,"B":4}
+
+
+
+ {"A":2,"B":5}
+
+
+
+
+
+
+
+ {"A":3,"B":6}
+ """
+ orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+ test = pd.read_json(j, lines=True, chunksize=chunksize)
+ if chunksize is not None:
+ test = pd.concat(test)
+ tm.assert_frame_equal(
+ orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize))
diff --git a/contrib/python/pandas/py2/pandas/tests/io/json/test_ujson.py b/contrib/python/pandas/py2/pandas/tests/io/json/test_ujson.py
new file mode 100644
index 00000000000..63ba9bc0f04
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/json/test_ujson.py
@@ -0,0 +1,1129 @@
+# -*- coding: utf-8 -*-
+
+try:
+ import json
+except ImportError:
+ import simplejson as json
+import calendar
+import datetime
+import decimal
+from functools import partial
+import locale
+import math
+import re
+import time
+
+import dateutil
+import numpy as np
+import pytest
+import pytz
+
+import pandas._libs.json as ujson
+from pandas._libs.tslib import Timestamp
+import pandas.compat as compat
+from pandas.compat import StringIO, range, u
+
+from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, date_range
+import pandas.util.testing as tm
+
+json_unicode = (json.dumps if compat.PY3
+ else partial(json.dumps, encoding="utf-8"))
+
+
+def _clean_dict(d):
+ """
+ Sanitize dictionary for JSON by converting all keys to strings.
+
+ Parameters
+ ----------
+ d : dict
+ The dictionary to convert.
+
+ Returns
+ -------
+ cleaned_dict : dict
+ """
+
+ return {str(k): v for k, v in compat.iteritems(d)}
+
+
[email protected](params=[
+    None,  # Column indexed by default.
+    "split",
+    "records",
+    "values",
+    "index"])
+def orient(request):
+ return request.param
+
+
[email protected](params=[None, True])
+def numpy(request):
+ return request.param
+
+
+class TestUltraJSONTests(object):
+
+ @pytest.mark.skipif(compat.is_platform_32bit(),
+ reason="not compliant on 32-bit, xref #15865")
+ def test_encode_decimal(self):
+ sut = decimal.Decimal("1337.1337")
+ encoded = ujson.encode(sut, double_precision=15)
+ decoded = ujson.decode(encoded)
+ assert decoded == 1337.1337
+
+ sut = decimal.Decimal("0.95")
+ encoded = ujson.encode(sut, double_precision=1)
+ assert encoded == "1.0"
+
+ decoded = ujson.decode(encoded)
+ assert decoded == 1.0
+
+ sut = decimal.Decimal("0.94")
+ encoded = ujson.encode(sut, double_precision=1)
+ assert encoded == "0.9"
+
+ decoded = ujson.decode(encoded)
+ assert decoded == 0.9
+
+ sut = decimal.Decimal("1.95")
+ encoded = ujson.encode(sut, double_precision=1)
+ assert encoded == "2.0"
+
+ decoded = ujson.decode(encoded)
+ assert decoded == 2.0
+
+ sut = decimal.Decimal("-1.95")
+ encoded = ujson.encode(sut, double_precision=1)
+ assert encoded == "-2.0"
+
+ decoded = ujson.decode(encoded)
+ assert decoded == -2.0
+
+ sut = decimal.Decimal("0.995")
+ encoded = ujson.encode(sut, double_precision=2)
+ assert encoded == "1.0"
+
+ decoded = ujson.decode(encoded)
+ assert decoded == 1.0
+
+ sut = decimal.Decimal("0.9995")
+ encoded = ujson.encode(sut, double_precision=3)
+ assert encoded == "1.0"
+
+ decoded = ujson.decode(encoded)
+ assert decoded == 1.0
+
+ sut = decimal.Decimal("0.99999999999999944")
+ encoded = ujson.encode(sut, double_precision=15)
+ assert encoded == "1.0"
+
+ decoded = ujson.decode(encoded)
+ assert decoded == 1.0
+
+ @pytest.mark.parametrize("ensure_ascii", [True, False])
+ def test_encode_string_conversion(self, ensure_ascii):
+ string_input = "A string \\ / \b \f \n \r \t </script> &"
+ not_html_encoded = ('"A string \\\\ \\/ \\b \\f \\n '
+ '\\r \\t <\\/script> &"')
+ html_encoded = ('"A string \\\\ \\/ \\b \\f \\n \\r \\t '
+ '\\u003c\\/script\\u003e \\u0026"')
+
+ def helper(expected_output, **encode_kwargs):
+ output = ujson.encode(string_input,
+ ensure_ascii=ensure_ascii,
+ **encode_kwargs)
+
+ assert output == expected_output
+ assert string_input == json.loads(output)
+ assert string_input == ujson.decode(output)
+
+ # Default behavior assumes encode_html_chars=False.
+ helper(not_html_encoded)
+
+ # Make sure explicit encode_html_chars=False works.
+ helper(not_html_encoded, encode_html_chars=False)
+
+ # Make sure explicit encode_html_chars=True does the encoding.
+ helper(html_encoded, encode_html_chars=True)
+
+ @pytest.mark.parametrize("long_number", [
+ -4342969734183514, -12345678901234.56789012, -528656961.4399388
+ ])
+ def test_double_long_numbers(self, long_number):
+ sut = {u("a"): long_number}
+ encoded = ujson.encode(sut, double_precision=15)
+
+ decoded = ujson.decode(encoded)
+ assert sut == decoded
+
+ def test_encode_non_c_locale(self):
+ lc_category = locale.LC_NUMERIC
+
+ # We just need one of these locales to work.
+ for new_locale in ("it_IT.UTF-8", "Italian_Italy"):
+ if tm.can_set_locale(new_locale, lc_category):
+ with tm.set_locale(new_locale, lc_category):
+ assert ujson.loads(ujson.dumps(4.78e60)) == 4.78e60
+ assert ujson.loads("4.78", precise_float=True) == 4.78
+ break
+
+ def test_decimal_decode_test_precise(self):
+ sut = {u("a"): 4.56}
+ encoded = ujson.encode(sut)
+ decoded = ujson.decode(encoded, precise_float=True)
+ assert sut == decoded
+
+ @pytest.mark.skipif(compat.is_platform_windows() and not compat.PY3,
+ reason="buggy on win-64 for py2")
+ def test_encode_double_tiny_exponential(self):
+ num = 1e-40
+ assert num == ujson.decode(ujson.encode(num))
+ num = 1e-100
+ assert num == ujson.decode(ujson.encode(num))
+ num = -1e-45
+ assert num == ujson.decode(ujson.encode(num))
+ num = -1e-145
+ assert np.allclose(num, ujson.decode(ujson.encode(num)))
+
+ @pytest.mark.parametrize("unicode_key", [
+ u("key1"), u("بن")
+ ])
+ def test_encode_dict_with_unicode_keys(self, unicode_key):
+ unicode_dict = {unicode_key: u("value1")}
+ assert unicode_dict == ujson.decode(ujson.encode(unicode_dict))
+
+ @pytest.mark.parametrize("double_input", [
+ math.pi,
+ -math.pi # Should work with negatives too.
+ ])
+ def test_encode_double_conversion(self, double_input):
+ output = ujson.encode(double_input)
+ assert round(double_input, 5) == round(json.loads(output), 5)
+ assert round(double_input, 5) == round(ujson.decode(output), 5)
+
+ def test_encode_with_decimal(self):
+ decimal_input = 1.0
+ output = ujson.encode(decimal_input)
+
+ assert output == "1.0"
+
+ def test_encode_array_of_nested_arrays(self):
+ nested_input = [[[[]]]] * 20
+ output = ujson.encode(nested_input)
+
+ assert nested_input == json.loads(output)
+ assert nested_input == ujson.decode(output)
+
+ nested_input = np.array(nested_input)
+ tm.assert_numpy_array_equal(nested_input, ujson.decode(
+ output, numpy=True, dtype=nested_input.dtype))
+
+ def test_encode_array_of_doubles(self):
+ doubles_input = [31337.31337, 31337.31337,
+ 31337.31337, 31337.31337] * 10
+ output = ujson.encode(doubles_input)
+
+ assert doubles_input == json.loads(output)
+ assert doubles_input == ujson.decode(output)
+
+ tm.assert_numpy_array_equal(np.array(doubles_input),
+ ujson.decode(output, numpy=True))
+
+ def test_double_precision(self):
+ double_input = 30.012345678901234
+ output = ujson.encode(double_input, double_precision=15)
+
+ assert double_input == json.loads(output)
+ assert double_input == ujson.decode(output)
+
+ for double_precision in (3, 9):
+ output = ujson.encode(double_input,
+ double_precision=double_precision)
+ rounded_input = round(double_input, double_precision)
+
+ assert rounded_input == json.loads(output)
+ assert rounded_input == ujson.decode(output)
+
+ @pytest.mark.parametrize("invalid_val", [
+ 20, -1, "9", None
+ ])
+ def test_invalid_double_precision(self, invalid_val):
+ double_input = 30.12345678901234567890
+ expected_exception = (ValueError if isinstance(invalid_val, int)
+ else TypeError)
+
+ with pytest.raises(expected_exception):
+ ujson.encode(double_input, double_precision=invalid_val)
+
+ def test_encode_string_conversion2(self):
+ string_input = "A string \\ / \b \f \n \r \t"
+ output = ujson.encode(string_input)
+
+ assert string_input == json.loads(output)
+ assert string_input == ujson.decode(output)
+ assert output == '"A string \\\\ \\/ \\b \\f \\n \\r \\t"'
+
+ @pytest.mark.parametrize("unicode_input", [
+ "Räksmörgås اسامة بن محمد بن عوض بن لادن",
+ "\xe6\x97\xa5\xd1\x88"
+ ])
+ def test_encode_unicode_conversion(self, unicode_input):
+ enc = ujson.encode(unicode_input)
+ dec = ujson.decode(enc)
+
+ assert enc == json_unicode(unicode_input)
+ assert dec == json.loads(enc)
+
+ def test_encode_control_escaping(self):
+ escaped_input = "\x19"
+ enc = ujson.encode(escaped_input)
+ dec = ujson.decode(enc)
+
+ assert escaped_input == dec
+ assert enc == json_unicode(escaped_input)
+
+ def test_encode_unicode_surrogate_pair(self):
+ surrogate_input = "\xf0\x90\x8d\x86"
+ enc = ujson.encode(surrogate_input)
+ dec = ujson.decode(enc)
+
+ assert enc == json_unicode(surrogate_input)
+ assert dec == json.loads(enc)
+
+ def test_encode_unicode_4bytes_utf8(self):
+ four_bytes_input = "\xf0\x91\x80\xb0TRAILINGNORMAL"
+ enc = ujson.encode(four_bytes_input)
+ dec = ujson.decode(enc)
+
+ assert enc == json_unicode(four_bytes_input)
+ assert dec == json.loads(enc)
+
+ def test_encode_unicode_4bytes_utf8highest(self):
+ four_bytes_input = "\xf3\xbf\xbf\xbfTRAILINGNORMAL"
+ enc = ujson.encode(four_bytes_input)
+
+ dec = ujson.decode(enc)
+
+ assert enc == json_unicode(four_bytes_input)
+ assert dec == json.loads(enc)
+
+ def test_encode_array_in_array(self):
+ arr_in_arr_input = [[[[]]]]
+ output = ujson.encode(arr_in_arr_input)
+
+ assert arr_in_arr_input == json.loads(output)
+ assert output == json.dumps(arr_in_arr_input)
+ assert arr_in_arr_input == ujson.decode(output)
+
+ tm.assert_numpy_array_equal(np.array(arr_in_arr_input),
+ ujson.decode(output, numpy=True))
+
+ @pytest.mark.parametrize("num_input", [
+ 31337,
+ -31337, # Negative number.
+ -9223372036854775808 # Large negative number.
+ ])
+ def test_encode_num_conversion(self, num_input):
+ output = ujson.encode(num_input)
+ assert num_input == json.loads(output)
+ assert output == json.dumps(num_input)
+ assert num_input == ujson.decode(output)
+
+ def test_encode_list_conversion(self):
+ list_input = [1, 2, 3, 4]
+ output = ujson.encode(list_input)
+
+ assert list_input == json.loads(output)
+ assert list_input == ujson.decode(output)
+
+ tm.assert_numpy_array_equal(np.array(list_input),
+ ujson.decode(output, numpy=True))
+
+ def test_encode_dict_conversion(self):
+ dict_input = {"k1": 1, "k2": 2, "k3": 3, "k4": 4}
+ output = ujson.encode(dict_input)
+
+ assert dict_input == json.loads(output)
+ assert dict_input == ujson.decode(output)
+
+ @pytest.mark.parametrize("builtin_value", [None, True, False])
+ def test_encode_builtin_values_conversion(self, builtin_value):
+ output = ujson.encode(builtin_value)
+ assert builtin_value == json.loads(output)
+ assert output == json.dumps(builtin_value)
+ assert builtin_value == ujson.decode(output)
+
+ def test_encode_datetime_conversion(self):
+ datetime_input = datetime.datetime.fromtimestamp(time.time())
+ output = ujson.encode(datetime_input, date_unit="s")
+ expected = calendar.timegm(datetime_input.utctimetuple())
+
+ assert int(expected) == json.loads(output)
+ assert int(expected) == ujson.decode(output)
+
+ def test_encode_date_conversion(self):
+ date_input = datetime.date.fromtimestamp(time.time())
+ output = ujson.encode(date_input, date_unit="s")
+
+ tup = (date_input.year, date_input.month, date_input.day, 0, 0, 0)
+ expected = calendar.timegm(tup)
+
+ assert int(expected) == json.loads(output)
+ assert int(expected) == ujson.decode(output)
+
+ @pytest.mark.parametrize("test", [
+ datetime.time(),
+ datetime.time(1, 2, 3),
+ datetime.time(10, 12, 15, 343243),
+ ])
+ def test_encode_time_conversion_basic(self, test):
+ output = ujson.encode(test)
+ expected = '"{iso}"'.format(iso=test.isoformat())
+ assert expected == output
+
+ def test_encode_time_conversion_pytz(self):
+ # see gh-11473: to_json segfaults with timezone-aware datetimes
+ test = datetime.time(10, 12, 15, 343243, pytz.utc)
+ output = ujson.encode(test)
+ expected = '"{iso}"'.format(iso=test.isoformat())
+ assert expected == output
+
+ def test_encode_time_conversion_dateutil(self):
+ # see gh-11473: to_json segfaults with timezone-aware datetimes
+ test = datetime.time(10, 12, 15, 343243, dateutil.tz.tzutc())
+ output = ujson.encode(test)
+ expected = '"{iso}"'.format(iso=test.isoformat())
+ assert expected == output
+
+ @pytest.mark.parametrize("decoded_input", [
+ NaT,
+ np.datetime64("NaT"),
+ np.nan,
+ np.inf,
+ -np.inf
+ ])
+ def test_encode_as_null(self, decoded_input):
+ assert ujson.encode(decoded_input) == "null", "Expected null"
+
+ def test_datetime_units(self):
+ val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504)
+ stamp = Timestamp(val)
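+        # Timestamp.value is nanoseconds since the epoch, so each coarser
+        # date_unit below should truncate by a further factor of 1000.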
+
+ roundtrip = ujson.decode(ujson.encode(val, date_unit='s'))
+ assert roundtrip == stamp.value // 10**9
+
+ roundtrip = ujson.decode(ujson.encode(val, date_unit='ms'))
+ assert roundtrip == stamp.value // 10**6
+
+ roundtrip = ujson.decode(ujson.encode(val, date_unit='us'))
+ assert roundtrip == stamp.value // 10**3
+
+ roundtrip = ujson.decode(ujson.encode(val, date_unit='ns'))
+ assert roundtrip == stamp.value
+
+ msg = "Invalid value 'foo' for option 'date_unit'"
+ with pytest.raises(ValueError, match=msg):
+ ujson.encode(val, date_unit='foo')
+
+ def test_encode_to_utf8(self):
+ unencoded = "\xe6\x97\xa5\xd1\x88"
+
+ enc = ujson.encode(unencoded, ensure_ascii=False)
+ dec = ujson.decode(enc)
+
+ assert enc == json_unicode(unencoded, ensure_ascii=False)
+ assert dec == json.loads(enc)
+
+ def test_decode_from_unicode(self):
+ unicode_input = u("{\"obj\": 31337}")
+
+ dec1 = ujson.decode(unicode_input)
+ dec2 = ujson.decode(str(unicode_input))
+
+ assert dec1 == dec2
+
+ def test_encode_recursion_max(self):
+ # 8 is the max recursion depth
+
+        class O2(object):
+            member = 0
+
+        class O1(object):
+            member = 0
+
+ decoded_input = O1()
+ decoded_input.member = O2()
+ decoded_input.member.member = decoded_input
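+        # The two instances now reference each other, so encoding recurses
+        # until ujson's depth limit trips and raises OverflowError.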
+
+ with pytest.raises(OverflowError):
+ ujson.encode(decoded_input)
+
+ def test_decode_jibberish(self):
+ jibberish = "fdsa sda v9sa fdsa"
+
+ with pytest.raises(ValueError):
+ ujson.decode(jibberish)
+
+ @pytest.mark.parametrize("broken_json", [
+ "[", # Broken array start.
+ "{", # Broken object start.
+ "]", # Broken array end.
+ "}", # Broken object end.
+ ])
+ def test_decode_broken_json(self, broken_json):
+ with pytest.raises(ValueError):
+ ujson.decode(broken_json)
+
+ @pytest.mark.parametrize("too_big_char", [
+ "[",
+ "{",
+ ])
+ def test_decode_depth_too_big(self, too_big_char):
+ with pytest.raises(ValueError):
+ ujson.decode(too_big_char * (1024 * 1024))
+
+ @pytest.mark.parametrize("bad_string", [
+ "\"TESTING", # Unterminated.
+ "\"TESTING\\\"", # Unterminated escape.
+ "tru", # Broken True.
+ "fa", # Broken False.
+ "n", # Broken None.
+ ])
+ def test_decode_bad_string(self, bad_string):
+ with pytest.raises(ValueError):
+ ujson.decode(bad_string)
+
+ @pytest.mark.parametrize("broken_json", [
+ '{{1337:""}}',
+ '{{"key":"}',
+ '[[[true',
+ ])
+ def test_decode_broken_json_leak(self, broken_json):
+ for _ in range(1000):
+ with pytest.raises(ValueError):
+ ujson.decode(broken_json)
+
+ @pytest.mark.parametrize("invalid_dict", [
+ "{{{{31337}}}}", # No key.
+ "{{{{\"key\":}}}}", # No value.
+ "{{{{\"key\"}}}}", # No colon or value.
+ ])
+ def test_decode_invalid_dict(self, invalid_dict):
+ with pytest.raises(ValueError):
+ ujson.decode(invalid_dict)
+
+ @pytest.mark.parametrize("numeric_int_as_str", [
+ "31337", "-31337" # Should work with negatives.
+ ])
+ def test_decode_numeric_int(self, numeric_int_as_str):
+ assert int(numeric_int_as_str) == ujson.decode(numeric_int_as_str)
+
+ @pytest.mark.skipif(compat.PY3, reason="only PY2")
+ def test_encode_unicode_4bytes_utf8_fail(self):
+ with pytest.raises(OverflowError):
+ ujson.encode("\xfd\xbf\xbf\xbf\xbf\xbf")
+
+ def test_encode_null_character(self):
+ wrapped_input = "31337 \x00 1337"
+ output = ujson.encode(wrapped_input)
+
+ assert wrapped_input == json.loads(output)
+ assert output == json.dumps(wrapped_input)
+ assert wrapped_input == ujson.decode(output)
+
+ alone_input = "\x00"
+ output = ujson.encode(alone_input)
+
+ assert alone_input == json.loads(output)
+ assert output == json.dumps(alone_input)
+ assert alone_input == ujson.decode(output)
+ assert '" \\u0000\\r\\n "' == ujson.dumps(u(" \u0000\r\n "))
+
+ def test_decode_null_character(self):
+ wrapped_input = "\"31337 \\u0000 31337\""
+ assert ujson.decode(wrapped_input) == json.loads(wrapped_input)
+
+ def test_encode_list_long_conversion(self):
+ long_input = [9223372036854775807, 9223372036854775807,
+ 9223372036854775807, 9223372036854775807,
+ 9223372036854775807, 9223372036854775807]
+ output = ujson.encode(long_input)
+
+ assert long_input == json.loads(output)
+ assert long_input == ujson.decode(output)
+
+ tm.assert_numpy_array_equal(np.array(long_input),
+ ujson.decode(output, numpy=True,
+ dtype=np.int64))
+
+ def test_encode_long_conversion(self):
+ long_input = 9223372036854775807
+ output = ujson.encode(long_input)
+
+ assert long_input == json.loads(output)
+ assert output == json.dumps(long_input)
+ assert long_input == ujson.decode(output)
+
+ @pytest.mark.parametrize("int_exp", [
+ "1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"
+ ])
+ def test_decode_numeric_int_exp(self, int_exp):
+ assert ujson.decode(int_exp) == json.loads(int_exp)
+
+ def test_dump_to_file(self):
+ f = StringIO()
+ ujson.dump([1, 2, 3], f)
+ assert "[1,2,3]" == f.getvalue()
+
+ def test_dump_to_file_like(self):
+ class FileLike(object):
+
+ def __init__(self):
+ self.bytes = ''
+
+ def write(self, data_bytes):
+ self.bytes += data_bytes
+
+ f = FileLike()
+ ujson.dump([1, 2, 3], f)
+ assert "[1,2,3]" == f.bytes
+
+ def test_dump_file_args_error(self):
+ with pytest.raises(TypeError):
+ ujson.dump([], "")
+
+ def test_load_file(self):
+ data = "[1,2,3,4]"
+ exp_data = [1, 2, 3, 4]
+
+ f = StringIO(data)
+ assert exp_data == ujson.load(f)
+
+ f = StringIO(data)
+ tm.assert_numpy_array_equal(np.array(exp_data),
+ ujson.load(f, numpy=True))
+
+ def test_load_file_like(self):
+ class FileLike(object):
+
+ def read(self):
+ try:
+ self.end
+ except AttributeError:
+ self.end = True
+ return "[1,2,3,4]"
+
+ exp_data = [1, 2, 3, 4]
+
+ f = FileLike()
+ assert exp_data == ujson.load(f)
+
+ f = FileLike()
+ tm.assert_numpy_array_equal(np.array(exp_data),
+ ujson.load(f, numpy=True))
+
+ def test_load_file_args_error(self):
+ with pytest.raises(TypeError):
+ ujson.load("[]")
+
+ def test_version(self):
+ assert re.match(r'^\d+\.\d+(\.\d+)?$', ujson.__version__), \
+ "ujson.__version__ must be a string like '1.4.0'"
+
+ def test_encode_numeric_overflow(self):
+ with pytest.raises(OverflowError):
+ ujson.encode(12839128391289382193812939)
+
+ def test_encode_numeric_overflow_nested(self):
+ class Nested(object):
+ x = 12839128391289382193812939
+
+ for _ in range(0, 100):
+ with pytest.raises(OverflowError):
+ ujson.encode(Nested())
+
+ @pytest.mark.parametrize("val", [
+ 3590016419, 2**31, 2**32, (2**32) - 1
+ ])
+ def test_decode_number_with_32bit_sign_bit(self, val):
+ # Test that numbers that fit within 32 bits but would have the
+ # sign bit set (2**31 <= x < 2**32) are decoded properly.
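+        # e.g. 2**31 == 2147483648 would come back negative if the decoder
+        # stored it in a signed 32-bit slot.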
+ doc = '{{"id": {val}}}'.format(val=val)
+ assert ujson.decode(doc)["id"] == val
+
+ def test_encode_big_escape(self):
+ # Make sure no Exception is raised.
+ for _ in range(10):
+ base = '\u00e5'.encode("utf-8") if compat.PY3 else "\xc3\xa5"
+ escape_input = base * 1024 * 1024 * 2
+ ujson.encode(escape_input)
+
+ def test_decode_big_escape(self):
+ # Make sure no Exception is raised.
+ for _ in range(10):
+ base = '\u00e5'.encode("utf-8") if compat.PY3 else "\xc3\xa5"
+ quote = compat.str_to_bytes("\"")
+
+ escape_input = quote + (base * 1024 * 1024 * 2) + quote
+ ujson.decode(escape_input)
+
+ def test_to_dict(self):
+ d = {u("key"): 31337}
+
+ class DictTest(object):
+ def toDict(self):
+ return d
+
+ o = DictTest()
+ output = ujson.encode(o)
+
+ dec = ujson.decode(output)
+ assert dec == d
+
+ def test_default_handler(self):
+
+ class _TestObject(object):
+
+ def __init__(self, val):
+ self.val = val
+
+ @property
+ def recursive_attr(self):
+ return _TestObject("recursive_attr")
+
+ def __str__(self):
+ return str(self.val)
+
+ msg = "Maximum recursion level reached"
+ with pytest.raises(OverflowError, match=msg):
+ ujson.encode(_TestObject("foo"))
+ assert '"foo"' == ujson.encode(_TestObject("foo"),
+ default_handler=str)
+
+ def my_handler(_):
+ return "foobar"
+
+ assert '"foobar"' == ujson.encode(_TestObject("foo"),
+ default_handler=my_handler)
+
+ def my_handler_raises(_):
+ raise TypeError("I raise for anything")
+
+ with pytest.raises(TypeError, match="I raise for anything"):
+ ujson.encode(_TestObject("foo"), default_handler=my_handler_raises)
+
+ def my_int_handler(_):
+ return 42
+
+ assert ujson.decode(ujson.encode(_TestObject("foo"),
+ default_handler=my_int_handler)) == 42
+
+ def my_obj_handler(_):
+ return datetime.datetime(2013, 2, 3)
+
+ assert (ujson.decode(ujson.encode(datetime.datetime(2013, 2, 3))) ==
+ ujson.decode(ujson.encode(_TestObject("foo"),
+ default_handler=my_obj_handler)))
+
+ obj_list = [_TestObject("foo"), _TestObject("bar")]
+ assert (json.loads(json.dumps(obj_list, default=str)) ==
+ ujson.decode(ujson.encode(obj_list, default_handler=str)))
+
+
+class TestNumpyJSONTests(object):
+
+ @pytest.mark.parametrize("bool_input", [True, False])
+ def test_bool(self, bool_input):
+ b = np.bool(bool_input)
+ assert ujson.decode(ujson.encode(b)) == b
+
+ def test_bool_array(self):
+ bool_array = np.array([
+ True, False, True, True,
+ False, True, False, False], dtype=np.bool)
+ output = np.array(ujson.decode(
+ ujson.encode(bool_array)), dtype=np.bool)
+ tm.assert_numpy_array_equal(bool_array, output)
+
+ def test_int(self, any_int_dtype):
+ klass = np.dtype(any_int_dtype).type
+ num = klass(1)
+
+ assert klass(ujson.decode(ujson.encode(num))) == num
+
+ def test_int_array(self, any_int_dtype):
+ arr = np.arange(100, dtype=np.int)
+ arr_input = arr.astype(any_int_dtype)
+
+ arr_output = np.array(ujson.decode(ujson.encode(arr_input)),
+ dtype=any_int_dtype)
+ tm.assert_numpy_array_equal(arr_input, arr_output)
+
+ def test_int_max(self, any_int_dtype):
+ if any_int_dtype in ("int64", "uint64") and compat.is_platform_32bit():
+ pytest.skip("Cannot test 64-bit integer on 32-bit platform")
+
+ klass = np.dtype(any_int_dtype).type
+
+ # uint64 max will always overflow,
+ # as it's encoded to signed.
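+        # np.iinfo("uint64").max == 2**64 - 1 cannot round-trip, so the
+        # test caps uint64 at the int64 maximum instead.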
+ if any_int_dtype == "uint64":
+ num = np.iinfo("int64").max
+ else:
+ num = np.iinfo(any_int_dtype).max
+
+ assert klass(ujson.decode(ujson.encode(num))) == num
+
+ def test_float(self, float_dtype):
+ klass = np.dtype(float_dtype).type
+ num = klass(256.2013)
+
+ assert klass(ujson.decode(ujson.encode(num))) == num
+
+ def test_float_array(self, float_dtype):
+ arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float)
+ float_input = arr.astype(float_dtype)
+
+ float_output = np.array(ujson.decode(
+ ujson.encode(float_input, double_precision=15)),
+ dtype=float_dtype)
+ tm.assert_almost_equal(float_input, float_output)
+
+ def test_float_max(self, float_dtype):
+ klass = np.dtype(float_dtype).type
+ num = klass(np.finfo(float_dtype).max / 10)
+
+ tm.assert_almost_equal(klass(ujson.decode(
+ ujson.encode(num, double_precision=15))), num)
+
+ def test_array_basic(self):
+ arr = np.arange(96)
+ arr = arr.reshape((2, 2, 2, 2, 3, 2))
+
+ tm.assert_numpy_array_equal(
+ np.array(ujson.decode(ujson.encode(arr))), arr)
+ tm.assert_numpy_array_equal(ujson.decode(
+ ujson.encode(arr), numpy=True), arr)
+
+ @pytest.mark.parametrize("shape", [
+ (10, 10),
+ (5, 5, 4),
+ (100, 1),
+ ])
+ def test_array_reshaped(self, shape):
+ arr = np.arange(100)
+ arr = arr.reshape(shape)
+
+ tm.assert_numpy_array_equal(
+ np.array(ujson.decode(ujson.encode(arr))), arr)
+ tm.assert_numpy_array_equal(ujson.decode(
+ ujson.encode(arr), numpy=True), arr)
+
+ def test_array_list(self):
+ arr_list = ["a", list(), dict(), dict(), list(),
+ 42, 97.8, ["a", "b"], {"key": "val"}]
+ arr = np.array(arr_list)
+ tm.assert_numpy_array_equal(
+ np.array(ujson.decode(ujson.encode(arr))), arr)
+
+ def test_array_float(self):
+ dtype = np.float32
+
+ arr = np.arange(100.202, 200.202, 1, dtype=dtype)
+ arr = arr.reshape((5, 5, 4))
+
+ arr_out = np.array(ujson.decode(ujson.encode(arr)), dtype=dtype)
+ tm.assert_almost_equal(arr, arr_out)
+
+ arr_out = ujson.decode(ujson.encode(arr), numpy=True, dtype=dtype)
+ tm.assert_almost_equal(arr, arr_out)
+
+ def test_0d_array(self):
+ with pytest.raises(TypeError):
+ ujson.encode(np.array(1))
+
+ @pytest.mark.parametrize("bad_input,exc_type,kwargs", [
+ ([{}, []], ValueError, {}),
+ ([42, None], TypeError, {}),
+ ([["a"], 42], ValueError, {}),
+ ([42, {}, "a"], TypeError, {}),
+ ([42, ["a"], 42], ValueError, {}),
+ (["a", "b", [], "c"], ValueError, {}),
+ ([{"a": "b"}], ValueError, dict(labelled=True)),
+ ({"a": {"b": {"c": 42}}}, ValueError, dict(labelled=True)),
+ ([{"a": 42, "b": 23}, {"c": 17}], ValueError, dict(labelled=True))
+ ])
+ def test_array_numpy_except(self, bad_input, exc_type, kwargs):
+ with pytest.raises(exc_type):
+ ujson.decode(ujson.dumps(bad_input), numpy=True, **kwargs)
+
+ def test_array_numpy_labelled(self):
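+        # With labelled=True, ujson returns a (values, row_labels,
+        # column_labels) tuple instead of a plain nested structure.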
+ labelled_input = {"a": []}
+ output = ujson.loads(ujson.dumps(labelled_input),
+ numpy=True, labelled=True)
+ assert (np.empty((1, 0)) == output[0]).all()
+ assert (np.array(["a"]) == output[1]).all()
+ assert output[2] is None
+
+ labelled_input = [{"a": 42}]
+ output = ujson.loads(ujson.dumps(labelled_input),
+ numpy=True, labelled=True)
+ assert (np.array([u("a")]) == output[2]).all()
+ assert (np.array([42]) == output[0]).all()
+ assert output[1] is None
+
+ # see gh-10837: write out the dump explicitly
+ # so there is no dependency on iteration order
+ input_dumps = ('[{"a": 42, "b":31}, {"a": 24, "c": 99}, '
+ '{"a": 2.4, "b": 78}]')
+ output = ujson.loads(input_dumps, numpy=True, labelled=True)
+ expected_vals = np.array(
+ [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2))
+ assert (expected_vals == output[0]).all()
+ assert output[1] is None
+ assert (np.array([u("a"), "b"]) == output[2]).all()
+
+ input_dumps = ('{"1": {"a": 42, "b":31}, "2": {"a": 24, "c": 99}, '
+ '"3": {"a": 2.4, "b": 78}}')
+ output = ujson.loads(input_dumps, numpy=True, labelled=True)
+ expected_vals = np.array(
+ [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2))
+ assert (expected_vals == output[0]).all()
+ assert (np.array(["1", "2", "3"]) == output[1]).all()
+ assert (np.array(["a", "b"]) == output[2]).all()
+
+
+class TestPandasJSONTests(object):
+
+ def test_dataframe(self, orient, numpy):
+ if orient == "records" and numpy:
+ pytest.skip("Not idiomatic pandas")
+
+ df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[
+ "a", "b"], columns=["x", "y", "z"])
+ encode_kwargs = {} if orient is None else dict(orient=orient)
+ decode_kwargs = {} if numpy is None else dict(numpy=numpy)
+
+ output = ujson.decode(ujson.encode(df, **encode_kwargs),
+ **decode_kwargs)
+
+ # Ensure proper DataFrame initialization.
+ if orient == "split":
+ dec = _clean_dict(output)
+ output = DataFrame(**dec)
+ else:
+ output = DataFrame(output)
+
+ # Corrections to enable DataFrame comparison.
+ if orient == "values":
+ df.columns = [0, 1, 2]
+ df.index = [0, 1]
+ elif orient == "records":
+ df.index = [0, 1]
+ elif orient == "index":
+ df = df.transpose()
+
+ tm.assert_frame_equal(output, df, check_dtype=False)
+
+ def test_dataframe_nested(self, orient):
+ df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[
+ "a", "b"], columns=["x", "y", "z"])
+
+ nested = {"df1": df, "df2": df.copy()}
+ kwargs = {} if orient is None else dict(orient=orient)
+
+ exp = {"df1": ujson.decode(ujson.encode(df, **kwargs)),
+ "df2": ujson.decode(ujson.encode(df, **kwargs))}
+ assert ujson.decode(ujson.encode(nested, **kwargs)) == exp
+
+ def test_dataframe_numpy_labelled(self, orient):
+ if orient in ("split", "values"):
+ pytest.skip("Incompatible with labelled=True")
+
+ df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[
+ "a", "b"], columns=["x", "y", "z"], dtype=np.int)
+ kwargs = {} if orient is None else dict(orient=orient)
+
+ output = DataFrame(*ujson.decode(ujson.encode(df, **kwargs),
+ numpy=True, labelled=True))
+
+ if orient is None:
+ df = df.T
+ elif orient == "records":
+ df.index = [0, 1]
+
+ tm.assert_frame_equal(output, df)
+
+ def test_series(self, orient, numpy):
+ s = Series([10, 20, 30, 40, 50, 60], name="series",
+ index=[6, 7, 8, 9, 10, 15]).sort_values()
+
+ encode_kwargs = {} if orient is None else dict(orient=orient)
+ decode_kwargs = {} if numpy is None else dict(numpy=numpy)
+
+ output = ujson.decode(ujson.encode(s, **encode_kwargs),
+ **decode_kwargs)
+
+ if orient == "split":
+ dec = _clean_dict(output)
+ output = Series(**dec)
+ else:
+ output = Series(output)
+
+ if orient in (None, "index"):
+ s.name = None
+ output = output.sort_values()
+ s.index = ["6", "7", "8", "9", "10", "15"]
+ elif orient in ("records", "values"):
+ s.name = None
+ s.index = [0, 1, 2, 3, 4, 5]
+
+ tm.assert_series_equal(output, s, check_dtype=False)
+
+ def test_series_nested(self, orient):
+ s = Series([10, 20, 30, 40, 50, 60], name="series",
+ index=[6, 7, 8, 9, 10, 15]).sort_values()
+ nested = {"s1": s, "s2": s.copy()}
+ kwargs = {} if orient is None else dict(orient=orient)
+
+ exp = {"s1": ujson.decode(ujson.encode(s, **kwargs)),
+ "s2": ujson.decode(ujson.encode(s, **kwargs))}
+ assert ujson.decode(ujson.encode(nested, **kwargs)) == exp
+
+ def test_index(self):
+ i = Index([23, 45, 18, 98, 43, 11], name="index")
+
+ # Column indexed.
+ output = Index(ujson.decode(ujson.encode(i)), name="index")
+ tm.assert_index_equal(i, output)
+
+ output = Index(ujson.decode(ujson.encode(i), numpy=True), name="index")
+ tm.assert_index_equal(i, output)
+
+ dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split")))
+ output = Index(**dec)
+
+ tm.assert_index_equal(i, output)
+ assert i.name == output.name
+
+ dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"),
+ numpy=True))
+ output = Index(**dec)
+
+ tm.assert_index_equal(i, output)
+ assert i.name == output.name
+
+ output = Index(ujson.decode(ujson.encode(i, orient="values")),
+ name="index")
+ tm.assert_index_equal(i, output)
+
+ output = Index(ujson.decode(ujson.encode(i, orient="values"),
+ numpy=True), name="index")
+ tm.assert_index_equal(i, output)
+
+ output = Index(ujson.decode(ujson.encode(i, orient="records")),
+ name="index")
+ tm.assert_index_equal(i, output)
+
+ output = Index(ujson.decode(ujson.encode(i, orient="records"),
+ numpy=True), name="index")
+ tm.assert_index_equal(i, output)
+
+ output = Index(ujson.decode(ujson.encode(i, orient="index")),
+ name="index")
+ tm.assert_index_equal(i, output)
+
+ output = Index(ujson.decode(ujson.encode(i, orient="index"),
+ numpy=True), name="index")
+ tm.assert_index_equal(i, output)
+
+ def test_datetime_index(self):
+ date_unit = "ns"
+
+ rng = date_range("1/1/2000", periods=20)
+ encoded = ujson.encode(rng, date_unit=date_unit)
+
+ decoded = DatetimeIndex(np.array(ujson.decode(encoded)))
+ tm.assert_index_equal(rng, decoded)
+
+ ts = Series(np.random.randn(len(rng)), index=rng)
+ decoded = Series(ujson.decode(ujson.encode(ts, date_unit=date_unit)))
+
+ idx_values = decoded.index.values.astype(np.int64)
+ decoded.index = DatetimeIndex(idx_values)
+ tm.assert_series_equal(ts, decoded)
+
+ @pytest.mark.parametrize("invalid_arr", [
+ "[31337,]", # Trailing comma.
+ "[,31337]", # Leading comma.
+ "[]]", # Unmatched bracket.
+ "[,]", # Only comma.
+ ])
+ def test_decode_invalid_array(self, invalid_arr):
+ with pytest.raises(ValueError):
+ ujson.decode(invalid_arr)
+
+ @pytest.mark.parametrize("arr", [
+ [], [31337]
+ ])
+ def test_decode_array(self, arr):
+ assert arr == ujson.decode(str(arr))
+
+ @pytest.mark.parametrize("extreme_num", [
+ 9223372036854775807, -9223372036854775808
+ ])
+ def test_decode_extreme_numbers(self, extreme_num):
+ assert extreme_num == ujson.decode(str(extreme_num))
+
+ @pytest.mark.parametrize("too_extreme_num", [
+ "9223372036854775808", "-90223372036854775809"
+ ])
+ def test_decode_too_extreme_numbers(self, too_extreme_num):
+ with pytest.raises(ValueError):
+ ujson.decode(too_extreme_num)
+
+ def test_decode_with_trailing_whitespaces(self):
+ assert {} == ujson.decode("{}\n\t ")
+
+ def test_decode_with_trailing_non_whitespaces(self):
+ with pytest.raises(ValueError):
+ ujson.decode("{}\n\t a")
+
+ def test_decode_array_with_big_int(self):
+ with pytest.raises(ValueError):
+ ujson.loads("[18446098363113800555]")
+
+ @pytest.mark.parametrize("float_number", [
+ 1.1234567893, 1.234567893, 1.34567893,
+ 1.4567893, 1.567893, 1.67893,
+ 1.7893, 1.893, 1.3,
+ ])
+ @pytest.mark.parametrize("sign", [-1, 1])
+ def test_decode_floating_point(self, sign, float_number):
+ float_number *= sign
+ tm.assert_almost_equal(float_number,
+ ujson.loads(str(float_number)),
+ check_less_precise=15)
+
+ def test_encode_big_set(self):
+ s = set()
+
+ for x in range(0, 100000):
+ s.add(x)
+
+ # Make sure no Exception is raised.
+ ujson.encode(s)
+
+ def test_encode_empty_set(self):
+ assert "[]" == ujson.encode(set())
+
+ def test_encode_set(self):
+ s = {1, 2, 3, 4, 5, 6, 7, 8, 9}
+ enc = ujson.encode(s)
+ dec = ujson.decode(enc)
+
+ for v in dec:
+ assert v in s
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/__init__.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/common.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/common.py
new file mode 100644
index 00000000000..434d347c574
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/common.py
@@ -0,0 +1,9 @@
+from pandas.compat import PY3
+
+# array.array compat: Python 3 renamed fromstring/tostring to
+# frombytes/tobytes.
+if PY3:
+ frombytes = lambda obj, data: obj.frombytes(data)
+ tobytes = lambda obj: obj.tobytes()
+else:
+ frombytes = lambda obj, data: obj.fromstring(data)
+ tobytes = lambda obj: obj.tostring()
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/data/frame.mp b/contrib/python/pandas/py2/pandas/tests/io/msgpack/data/frame.mp
new file mode 100644
index 00000000000..21e20d262b2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/data/frame.mp
Binary files differ
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_buffer.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_buffer.py
new file mode 100644
index 00000000000..e36dc5bbdb4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_buffer.py
@@ -0,0 +1,21 @@
+# coding: utf-8
+
+from pandas.io.msgpack import packb, unpackb
+
+from .common import frombytes
+
+
+def test_unpack_buffer():
+ from array import array
+ buf = array('b')
+ frombytes(buf, packb((b'foo', b'bar')))
+ obj = unpackb(buf, use_list=1)
+ assert [b'foo', b'bar'] == obj
+
+
+def test_unpack_bytearray():
+ buf = bytearray(packb(('foo', 'bar')))
+ obj = unpackb(buf, use_list=1)
+ assert [b'foo', b'bar'] == obj
+ expected_type = bytes
+ assert all(type(s) == expected_type for s in obj)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_case.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_case.py
new file mode 100644
index 00000000000..c0e76b37ee4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_case.py
@@ -0,0 +1,115 @@
+# coding: utf-8
+
+from pandas.io.msgpack import packb, unpackb
+
+
+def check(length, obj):
+ v = packb(obj)
+    assert len(v) == length, \
+        "%r length should be %r but got %r" % (obj, length, len(v))
+ assert unpackb(v, use_list=0) == obj
+
+
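+# msgpack packs nil, booleans and integers in [-32, 127] into a single
+# byte; wider integers take a one-byte type tag plus a 1/2/4/8-byte
+# payload, which is exactly what the length checks below pin down.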
+def test_1():
+ for o in [None, True, False, 0, 1, (1 << 6), (1 << 7) - 1, -1,
+ -((1 << 5) - 1), -(1 << 5)]:
+ check(1, o)
+
+
+def test_2():
+ for o in [1 << 7, (1 << 8) - 1, -((1 << 5) + 1), -(1 << 7)]:
+ check(2, o)
+
+
+def test_3():
+ for o in [1 << 8, (1 << 16) - 1, -((1 << 7) + 1), -(1 << 15)]:
+ check(3, o)
+
+
+def test_5():
+ for o in [1 << 16, (1 << 32) - 1, -((1 << 15) + 1), -(1 << 31)]:
+ check(5, o)
+
+
+def test_9():
+ for o in [1 << 32, (1 << 64) - 1, -((1 << 31) + 1), -(1 << 63), 1.0, 0.1,
+ -0.1, -1.0]:
+ check(9, o)
+
+
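+# Raw (str) and array headers follow the same pattern: the fix* forms fold
+# the length into the type byte, while the 16/32-bit forms spend 2 or 4
+# extra bytes on an explicit length field.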
+def check_raw(overhead, num):
+ check(num + overhead, b" " * num)
+
+
+def test_fixraw():
+ check_raw(1, 0)
+ check_raw(1, (1 << 5) - 1)
+
+
+def test_raw16():
+ check_raw(3, 1 << 5)
+ check_raw(3, (1 << 16) - 1)
+
+
+def test_raw32():
+ check_raw(5, 1 << 16)
+
+
+def check_array(overhead, num):
+ check(num + overhead, (None, ) * num)
+
+
+def test_fixarray():
+ check_array(1, 0)
+ check_array(1, (1 << 4) - 1)
+
+
+def test_array16():
+ check_array(3, 1 << 4)
+ check_array(3, (1 << 16) - 1)
+
+
+def test_array32():
+ check_array(5, (1 << 16))
+
+
+def match(obj, buf):
+ assert packb(obj) == buf
+ assert unpackb(buf, use_list=0) == obj
+
+
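+# Spot-check canonical encodings; note the 15-element collections still fit
+# fixarray/fixmap, while the 16-element ones need array16/map16 headers.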
+def test_match():
+ cases = [
+ (None, b'\xc0'),
+ (False, b'\xc2'),
+ (True, b'\xc3'),
+ (0, b'\x00'),
+ (127, b'\x7f'),
+ (128, b'\xcc\x80'),
+ (256, b'\xcd\x01\x00'),
+ (-1, b'\xff'),
+ (-33, b'\xd0\xdf'),
+ (-129, b'\xd1\xff\x7f'),
+ ({1: 1}, b'\x81\x01\x01'),
+ (1.0, b"\xcb\x3f\xf0\x00\x00\x00\x00\x00\x00"),
+ ((), b'\x90'),
+ (tuple(range(15)), (b"\x9f\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09"
+ b"\x0a\x0b\x0c\x0d\x0e")),
+ (tuple(range(16)), (b"\xdc\x00\x10\x00\x01\x02\x03\x04\x05\x06\x07"
+ b"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f")),
+ ({}, b'\x80'),
+ ({x: x for x in range(15)},
+ (b'\x8f\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06\x06\x07'
+ b'\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e')),
+ ({x: x for x in range(16)},
+ (b'\xde\x00\x10\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06'
+ b'\x06\x07\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e'
+ b'\x0f\x0f')),
+ ]
+
+ for v, p in cases:
+ match(v, p)
+
+
+def test_unicode():
+ assert unpackb(packb('foobar'), use_list=1) == b'foobar'
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_except.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_except.py
new file mode 100644
index 00000000000..cd894109e98
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_except.py
@@ -0,0 +1,39 @@
+# coding: utf-8
+
+from datetime import datetime
+
+import pytest
+
+from pandas.io.msgpack import packb, unpackb
+
+
+class DummyException(Exception):
+ pass
+
+
+class TestExceptions(object):
+
+ def test_raise_on_find_unsupported_value(self):
+        msg = "can't serialize datetime"
+ with pytest.raises(TypeError, match=msg):
+ packb(datetime.now())
+
+ def test_raise_from_object_hook(self):
+ def hook(_):
+ raise DummyException()
+
+ with pytest.raises(DummyException):
+ unpackb(packb({}), object_hook=hook)
+ with pytest.raises(DummyException):
+ unpackb(packb({'fizz': 'buzz'}), object_hook=hook)
+ with pytest.raises(DummyException):
+ unpackb(packb({'fizz': 'buzz'}), object_pairs_hook=hook)
+ with pytest.raises(DummyException):
+ unpackb(packb({'fizz': {'buzz': 'spam'}}), object_hook=hook)
+ with pytest.raises(DummyException):
+ unpackb(packb({'fizz': {'buzz': 'spam'}}), object_pairs_hook=hook)
+
+ def test_invalid_value(self):
+ msg = "Unpack failed: error"
+ with pytest.raises(ValueError, match=msg):
+ unpackb(b"\xd9\x97#DL_")
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_extension.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_extension.py
new file mode 100644
index 00000000000..06a0691bf4f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_extension.py
@@ -0,0 +1,63 @@
+from __future__ import print_function
+
+import array
+
+import pandas.io.msgpack as msgpack
+from pandas.io.msgpack import ExtType
+
+from .common import frombytes, tobytes
+
+
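+# ExtType pairs an application-defined typecode with raw bytes: payloads of
+# exactly 1/2/4/8/16 bytes use the fixext formats, anything else falls back
+# to ext 8/16/32 with an explicit length field.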
+def test_pack_ext_type():
+ def p(s):
+ packer = msgpack.Packer()
+ packer.pack_ext_type(0x42, s)
+ return packer.bytes()
+
+ assert p(b'A') == b'\xd4\x42A' # fixext 1
+ assert p(b'AB') == b'\xd5\x42AB' # fixext 2
+ assert p(b'ABCD') == b'\xd6\x42ABCD' # fixext 4
+ assert p(b'ABCDEFGH') == b'\xd7\x42ABCDEFGH' # fixext 8
+ assert p(b'A' * 16) == b'\xd8\x42' + b'A' * 16 # fixext 16
+ assert p(b'ABC') == b'\xc7\x03\x42ABC' # ext 8
+ assert p(b'A' * 0x0123) == b'\xc8\x01\x23\x42' + b'A' * 0x0123 # ext 16
+ assert (p(b'A' * 0x00012345) ==
+ b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345) # ext 32
+
+
+def test_unpack_ext_type():
+ def check(b, expected):
+ assert msgpack.unpackb(b) == expected
+
+ check(b'\xd4\x42A', ExtType(0x42, b'A')) # fixext 1
+ check(b'\xd5\x42AB', ExtType(0x42, b'AB')) # fixext 2
+ check(b'\xd6\x42ABCD', ExtType(0x42, b'ABCD')) # fixext 4
+ check(b'\xd7\x42ABCDEFGH', ExtType(0x42, b'ABCDEFGH')) # fixext 8
+ check(b'\xd8\x42' + b'A' * 16, ExtType(0x42, b'A' * 16)) # fixext 16
+ check(b'\xc7\x03\x42ABC', ExtType(0x42, b'ABC')) # ext 8
+ check(b'\xc8\x01\x23\x42' + b'A' * 0x0123,
+ ExtType(0x42, b'A' * 0x0123)) # ext 16
+ check(b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345,
+ ExtType(0x42, b'A' * 0x00012345)) # ext 32
+
+
+def test_extension_type():
+ def default(obj):
+ print('default called', obj)
+ if isinstance(obj, array.array):
+ typecode = 123 # application specific typecode
+ data = tobytes(obj)
+ return ExtType(typecode, data)
+ raise TypeError("Unknown type object %r" % (obj, ))
+
+ def ext_hook(code, data):
+ print('ext_hook called', code, data)
+ assert code == 123
+ obj = array.array('d')
+ frombytes(obj, data)
+ return obj
+
+ obj = [42, b'hello', array.array('d', [1.1, 2.2, 3.3])]
+ s = msgpack.packb(obj, default=default)
+ obj2 = msgpack.unpackb(s, ext_hook=ext_hook)
+ assert obj == obj2
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_format.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_format.py
new file mode 100644
index 00000000000..3659602e138
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_format.py
@@ -0,0 +1,91 @@
+# coding: utf-8
+
+from pandas.io.msgpack import unpackb
+
+
+def check(src, should, use_list=0):
+ assert unpackb(src, use_list=use_list) == should
+
+
+def testSimpleValue():
+ check(b"\x93\xc0\xc2\xc3", (None, False, True, ))
+
+
+def testFixnum():
+    check(b"\x92\x93\x00\x40\x7f\x93\xe0\xf0\xff",
+          ((0, 64, 127), (-32, -16, -1)))
+
+
+def testFixArray():
+    check(b"\x92\x90\x91\x91\xc0", ((), ((None,),)))
+
+
+def testFixRaw():
+ check(b"\x94\xa0\xa1a\xa2bc\xa3def", (b"", b"a", b"bc", b"def", ), )
+
+
+def testFixMap():
+ check(b"\x82\xc2\x81\xc0\xc0\xc3\x81\xc0\x80",
+ {False: {None: None},
+ True: {None: {}}}, )
+
+
+def testUnsignedInt():
+    check(b"\x99\xcc\x00\xcc\x80\xcc\xff\xcd\x00\x00\xcd\x80\x00"
+          b"\xcd\xff\xff\xce\x00\x00\x00\x00\xce\x80\x00\x00\x00"
+          b"\xce\xff\xff\xff\xff",
+          (0, 128, 255, 0, 32768, 65535, 0, 2147483648, 4294967295))
+
+
+def testSignedInt():
+    check(b"\x99\xd0\x00\xd0\x80\xd0\xff\xd1\x00\x00\xd1\x80\x00"
+          b"\xd1\xff\xff\xd2\x00\x00\x00\x00\xd2\x80\x00\x00\x00"
+          b"\xd2\xff\xff\xff\xff",
+          (0, -128, -1, 0, -32768, -1, 0, -2147483648, -1))
+
+
+def testRaw():
+ check(b"\x96\xda\x00\x00\xda\x00\x01a\xda\x00\x02ab\xdb\x00\x00"
+ b"\x00\x00\xdb\x00\x00\x00\x01a\xdb\x00\x00\x00\x02ab",
+ (b"", b"a", b"ab", b"", b"a", b"ab"))
+
+
+def testArray():
+    check(b"\x96\xdc\x00\x00\xdc\x00\x01\xc0\xdc\x00\x02\xc2\xc3\xdd\x00"
+          b"\x00\x00\x00\xdd\x00\x00\x00\x01\xc0\xdd\x00\x00\x00\x02"
+          b"\xc2\xc3",
+          ((), (None,), (False, True), (), (None,), (False, True)))
+
+
+def testMap():
+    check(b"\x96"
+          b"\xde\x00\x00"
+          b"\xde\x00\x01\xc0\xc2"
+          b"\xde\x00\x02\xc0\xc2\xc3\xc2"
+          b"\xdf\x00\x00\x00\x00"
+          b"\xdf\x00\x00\x00\x01\xc0\xc2"
+          b"\xdf\x00\x00\x00\x02\xc0\xc2\xc3\xc2",
+          ({}, {None: False}, {True: False, None: False},
+           {}, {None: False}, {True: False, None: False}))
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_limits.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_limits.py
new file mode 100644
index 00000000000..dd8dc8da607
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_limits.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import (
+ absolute_import, division, print_function, unicode_literals)
+
+import pytest
+
+from pandas.io.msgpack import ExtType, Packer, Unpacker, packb, unpackb
+
+
+class TestLimits(object):
+
+ def test_integer(self):
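+        # msgpack integers cover [-2**63, 2**64 - 1]; values just outside
+        # that window cannot be packed and must raise.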
+ x = -(2 ** 63)
+ assert unpackb(packb(x)) == x
+ msg = (r"((long |Python )?(int )?too (big|large) to convert"
+ r"( to C (unsigned )?long))?")
+ with pytest.raises((OverflowError, ValueError), match=msg):
+ packb(x - 1)
+ x = 2 ** 64 - 1
+ assert unpackb(packb(x)) == x
+ with pytest.raises((OverflowError, ValueError), match=msg):
+ packb(x + 1)
+
+ def test_array_header(self):
+ packer = Packer()
+ packer.pack_array_header(2 ** 32 - 1)
+ with pytest.raises((OverflowError, ValueError)):
+ packer.pack_array_header(2 ** 32)
+
+ def test_map_header(self):
+ packer = Packer()
+ packer.pack_map_header(2 ** 32 - 1)
+ with pytest.raises((OverflowError, ValueError)):
+            packer.pack_map_header(2 ** 32)
+
+ def test_max_str_len(self):
+ d = 'x' * 3
+ packed = packb(d)
+
+ unpacker = Unpacker(max_str_len=3, encoding='utf-8')
+ unpacker.feed(packed)
+ assert unpacker.unpack() == d
+
+ unpacker = Unpacker(max_str_len=2, encoding='utf-8')
+ unpacker.feed(packed)
+
+ msg = "3 exceeds max_str_len"
+ with pytest.raises(ValueError, match=msg):
+ unpacker.unpack()
+
+ def test_max_bin_len(self):
+ d = b'x' * 3
+ packed = packb(d, use_bin_type=True)
+
+ unpacker = Unpacker(max_bin_len=3)
+ unpacker.feed(packed)
+ assert unpacker.unpack() == d
+
+ unpacker = Unpacker(max_bin_len=2)
+ unpacker.feed(packed)
+
+ msg = "3 exceeds max_bin_len"
+ with pytest.raises(ValueError, match=msg):
+ unpacker.unpack()
+
+ def test_max_array_len(self):
+ d = [1, 2, 3]
+ packed = packb(d)
+
+ unpacker = Unpacker(max_array_len=3)
+ unpacker.feed(packed)
+ assert unpacker.unpack() == d
+
+ unpacker = Unpacker(max_array_len=2)
+ unpacker.feed(packed)
+
+ msg = "3 exceeds max_array_len"
+ with pytest.raises(ValueError, match=msg):
+ unpacker.unpack()
+
+ def test_max_map_len(self):
+ d = {1: 2, 3: 4, 5: 6}
+ packed = packb(d)
+
+ unpacker = Unpacker(max_map_len=3)
+ unpacker.feed(packed)
+ assert unpacker.unpack() == d
+
+ unpacker = Unpacker(max_map_len=2)
+ unpacker.feed(packed)
+
+ msg = "3 exceeds max_map_len"
+ with pytest.raises(ValueError, match=msg):
+ unpacker.unpack()
+
+ def test_max_ext_len(self):
+ d = ExtType(42, b"abc")
+ packed = packb(d)
+
+ unpacker = Unpacker(max_ext_len=3)
+ unpacker.feed(packed)
+ assert unpacker.unpack() == d
+
+ unpacker = Unpacker(max_ext_len=2)
+ unpacker.feed(packed)
+
+ msg = "4 exceeds max_ext_len"
+ with pytest.raises(ValueError, match=msg):
+ unpacker.unpack()
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_obj.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_obj.py
new file mode 100644
index 00000000000..471212f1bfe
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_obj.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+
+import pytest
+
+from pandas.io.msgpack import packb, unpackb
+
+
+class DecodeError(Exception):
+ pass
+
+
+class TestObj(object):
+
+ def _arr_to_str(self, arr):
+ return ''.join(str(c) for c in arr)
+
+ def bad_complex_decoder(self, o):
+ raise DecodeError("Ooops!")
+
+ def _decode_complex(self, obj):
+ if b'__complex__' in obj:
+ return complex(obj[b'real'], obj[b'imag'])
+ return obj
+
+ def _encode_complex(self, obj):
+ if isinstance(obj, complex):
+ return {b'__complex__': True, b'real': 1, b'imag': 2}
+ return obj
+
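+    # ``default`` is called by the packer for objects it cannot encode
+    # natively, while ``object_hook`` lets the unpacker replace each
+    # decoded map -- the two hooks exercised below.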
+ def test_encode_hook(self):
+ packed = packb([3, 1 + 2j], default=self._encode_complex)
+ unpacked = unpackb(packed, use_list=1)
+ assert unpacked[1] == {b'__complex__': True, b'real': 1, b'imag': 2}
+
+ def test_decode_hook(self):
+ packed = packb([3, {b'__complex__': True, b'real': 1, b'imag': 2}])
+ unpacked = unpackb(packed, object_hook=self._decode_complex,
+ use_list=1)
+ assert unpacked[1] == 1 + 2j
+
+ def test_decode_pairs_hook(self):
+ packed = packb([3, {1: 2, 3: 4}])
+ prod_sum = 1 * 2 + 3 * 4
+ unpacked = unpackb(
+ packed, object_pairs_hook=lambda l: sum(k * v for k, v in l),
+ use_list=1)
+ assert unpacked[1] == prod_sum
+
+ def test_only_one_obj_hook(self):
+ msg = "object_pairs_hook and object_hook are mutually exclusive"
+ with pytest.raises(TypeError, match=msg):
+ unpackb(b'', object_hook=lambda x: x,
+ object_pairs_hook=lambda x: x)
+
+ def test_bad_hook(self):
+ msg = r"can't serialize \(1\+2j\)"
+ with pytest.raises(TypeError, match=msg):
+ packed = packb([3, 1 + 2j], default=lambda o: o)
+ unpacked = unpackb(packed, use_list=1) # noqa
+
+ def test_array_hook(self):
+ packed = packb([1, 2, 3])
+ unpacked = unpackb(packed, list_hook=self._arr_to_str, use_list=1)
+ assert unpacked == '123'
+
+ def test_an_exception_in_objecthook1(self):
+ with pytest.raises(DecodeError, match='Ooops!'):
+ packed = packb({1: {'__complex__': True, 'real': 1, 'imag': 2}})
+ unpackb(packed, object_hook=self.bad_complex_decoder)
+
+ def test_an_exception_in_objecthook2(self):
+ with pytest.raises(DecodeError, match='Ooops!'):
+ packed = packb({1: [{'__complex__': True, 'real': 1, 'imag': 2}]})
+ unpackb(packed, list_hook=self.bad_complex_decoder, use_list=1)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_pack.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_pack.py
new file mode 100644
index 00000000000..8c82d0d2cf8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_pack.py
@@ -0,0 +1,162 @@
+# coding: utf-8
+
+import struct
+
+import pytest
+
+from pandas.compat import OrderedDict, u
+
+from pandas import compat
+
+from pandas.io.msgpack import Packer, Unpacker, packb, unpackb
+
+
+class TestPack(object):
+
+ def check(self, data, use_list=False):
+ re = unpackb(packb(data), use_list=use_list)
+ assert re == data
+
+ def testPack(self):
+ test_data = [
+ 0, 1, 127, 128, 255, 256, 65535, 65536,
+ -1, -32, -33, -128, -129, -32768, -32769,
+ 1.0,
+ b"", b"a", b"a" * 31, b"a" * 32,
+ None, True, False,
+ (), ((),), ((), None,),
+ {None: 0},
+ (1 << 23),
+ ]
+ for td in test_data:
+ self.check(td)
+
+ def testPackUnicode(self):
+ test_data = [u(""), u("abcd"), [u("defgh")], u("Русский текст"), ]
+ for td in test_data:
+ re = unpackb(
+ packb(td, encoding='utf-8'), use_list=1, encoding='utf-8')
+ assert re == td
+ packer = Packer(encoding='utf-8')
+ data = packer.pack(td)
+ re = Unpacker(
+ compat.BytesIO(data), encoding='utf-8', use_list=1).unpack()
+ assert re == td
+
+ def testPackUTF32(self):
+ test_data = [
+ compat.u(""),
+ compat.u("abcd"),
+ [compat.u("defgh")],
+ compat.u("Русский текст"),
+ ]
+ for td in test_data:
+ re = unpackb(
+ packb(td, encoding='utf-32'), use_list=1, encoding='utf-32')
+ assert re == td
+
+ def testPackBytes(self):
+ test_data = [b"", b"abcd", (b"defgh", ), ]
+ for td in test_data:
+ self.check(td)
+
+ def testIgnoreUnicodeErrors(self):
+ re = unpackb(
+ packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore',
+ use_list=1)
+ assert re == "abcdef"
+
+ def testStrictUnicodeUnpack(self):
+ msg = (r"'utf-*8' codec can't decode byte 0xed in position 3:"
+ " invalid continuation byte")
+ with pytest.raises(UnicodeDecodeError, match=msg):
+ unpackb(packb(b'abc\xeddef'), encoding='utf-8', use_list=1)
+
+ def testStrictUnicodePack(self):
+ msg = (r"'ascii' codec can't encode character u*'\\xed' in position 3:"
+ r" ordinal not in range\(128\)")
+ with pytest.raises(UnicodeEncodeError, match=msg):
+ packb(compat.u("abc\xeddef"), encoding='ascii',
+ unicode_errors='strict')
+
+ def testIgnoreErrorsPack(self):
+ re = unpackb(
+ packb(
+ compat.u("abcФФФdef"), encoding='ascii',
+ unicode_errors='ignore'), encoding='utf-8', use_list=1)
+ assert re == compat.u("abcdef")
+
+ def testNoEncoding(self):
+ msg = "Can't encode unicode string: no encoding is specified"
+ with pytest.raises(TypeError, match=msg):
+ packb(compat.u("abc"), encoding=None)
+
+ def testDecodeBinary(self):
+ re = unpackb(packb("abc"), encoding=None, use_list=1)
+ assert re == b"abc"
+
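+    # Floats are tagged 0xca (IEEE-754 single precision) or 0xcb (double
+    # precision) with a big-endian payload, matching struct's '>f'/'>d'.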
+ def testPackFloat(self):
+ assert packb(1.0,
+ use_single_float=True) == b'\xca' + struct.pack('>f', 1.0)
+ assert packb(
+ 1.0, use_single_float=False) == b'\xcb' + struct.pack('>d', 1.0)
+
+ def testArraySize(self, sizes=[0, 5, 50, 1000]):
+ bio = compat.BytesIO()
+ packer = Packer()
+ for size in sizes:
+ bio.write(packer.pack_array_header(size))
+ for i in range(size):
+ bio.write(packer.pack(i))
+
+ bio.seek(0)
+ unpacker = Unpacker(bio, use_list=1)
+ for size in sizes:
+ assert unpacker.unpack() == list(range(size))
+
+ def test_manualreset(self, sizes=[0, 5, 50, 1000]):
+ packer = Packer(autoreset=False)
+ for size in sizes:
+ packer.pack_array_header(size)
+ for i in range(size):
+ packer.pack(i)
+
+ bio = compat.BytesIO(packer.bytes())
+ unpacker = Unpacker(bio, use_list=1)
+ for size in sizes:
+ assert unpacker.unpack() == list(range(size))
+
+ packer.reset()
+ assert packer.bytes() == b''
+
+ def testMapSize(self, sizes=[0, 5, 50, 1000]):
+ bio = compat.BytesIO()
+ packer = Packer()
+ for size in sizes:
+ bio.write(packer.pack_map_header(size))
+ for i in range(size):
+ bio.write(packer.pack(i)) # key
+ bio.write(packer.pack(i * 2)) # value
+
+ bio.seek(0)
+ unpacker = Unpacker(bio)
+ for size in sizes:
+ assert unpacker.unpack() == {i: i * 2 for i in range(size)}
+
+ def test_odict(self):
+ seq = [(b'one', 1), (b'two', 2), (b'three', 3), (b'four', 4)]
+ od = OrderedDict(seq)
+ assert unpackb(packb(od), use_list=1) == dict(seq)
+
+ def pair_hook(seq):
+ return list(seq)
+
+ assert unpackb(
+ packb(od), object_pairs_hook=pair_hook, use_list=1) == seq
+
+ def test_pairlist(self):
+ pairlist = [(b'a', 1), (2, b'b'), (b'foo', b'bar')]
+ packer = Packer()
+ packed = packer.pack_map_pairs(pairlist)
+ unpacked = unpackb(packed, object_pairs_hook=list)
+ assert pairlist == unpacked
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_read_size.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_read_size.py
new file mode 100644
index 00000000000..42791b571e8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_read_size.py
@@ -0,0 +1,71 @@
+"""Test Unpacker's read_array_header and read_map_header methods"""
+from pandas.io.msgpack import OutOfData, Unpacker, packb
+
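+# pandas.io.msgpack raises a plain ValueError when a header of the wrong
+# container type is read; alias it for readability below.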
+UnexpectedTypeException = ValueError
+
+
+def test_read_array_header():
+ unpacker = Unpacker()
+ unpacker.feed(packb(['a', 'b', 'c']))
+ assert unpacker.read_array_header() == 3
+ assert unpacker.unpack() == b'a'
+ assert unpacker.unpack() == b'b'
+ assert unpacker.unpack() == b'c'
+ try:
+ unpacker.unpack()
+ assert 0, 'should raise exception'
+ except OutOfData:
+ assert 1, 'okay'
+
+
+def test_read_map_header():
+ unpacker = Unpacker()
+ unpacker.feed(packb({'a': 'A'}))
+ assert unpacker.read_map_header() == 1
+    assert unpacker.unpack() == b'a'
+    assert unpacker.unpack() == b'A'
+ try:
+ unpacker.unpack()
+ assert 0, 'should raise exception'
+ except OutOfData:
+ assert 1, 'okay'
+
+
+def test_incorrect_type_array():
+ unpacker = Unpacker()
+ unpacker.feed(packb(1))
+ try:
+ unpacker.read_array_header()
+ assert 0, 'should raise exception'
+ except UnexpectedTypeException:
+ assert 1, 'okay'
+
+
+def test_incorrect_type_map():
+ unpacker = Unpacker()
+ unpacker.feed(packb(1))
+ try:
+ unpacker.read_map_header()
+ assert 0, 'should raise exception'
+ except UnexpectedTypeException:
+ assert 1, 'okay'
+
+
+def test_correct_type_nested_array():
+ unpacker = Unpacker()
+ unpacker.feed(packb({'a': ['b', 'c', 'd']}))
+ try:
+ unpacker.read_array_header()
+ assert 0, 'should raise exception'
+ except UnexpectedTypeException:
+ assert 1, 'okay'
+
+
+def test_incorrect_type_nested_map():
+ unpacker = Unpacker()
+ unpacker.feed(packb([{'a': 'b'}]))
+ try:
+ unpacker.read_map_header()
+ assert 0, 'should raise exception'
+ except UnexpectedTypeException:
+ assert 1, 'okay'
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_seq.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_seq.py
new file mode 100644
index 00000000000..68be8c2d975
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_seq.py
@@ -0,0 +1,47 @@
+# coding: utf-8
+
+import io
+
+import pandas.io.msgpack as msgpack
+
+binarydata = bytes(bytearray(range(256)))
+
+
+def gen_binary_data(idx):
+ return binarydata[:idx % 300]
+
+
+def test_exceeding_unpacker_read_size():
+ dumpf = io.BytesIO()
+
+ packer = msgpack.Packer()
+
+ NUMBER_OF_STRINGS = 6
+ read_size = 16
+
+    # Crash thresholds observed with the unpatched unpacker:
+    #   read_size=16:        5 strings ok, 6 -> glibc "double free or
+    #                        corruption (fasttop)"
+    #   read_size=256:       20 ok, 25 -> segfault / "double free or
+    #                        corruption (!prev)"
+    #   read_size=1024:      40 ok, 50 -> errors
+    #   read_size=1024*1024: 7000 ok, 8000 -> glibc "double free or
+    #                        corruption (!prev)"
+
+ for idx in range(NUMBER_OF_STRINGS):
+ data = gen_binary_data(idx)
+ dumpf.write(packer.pack(data))
+
+ f = io.BytesIO(dumpf.getvalue())
+ dumpf.close()
+
+ unpacker = msgpack.Unpacker(f, read_size=read_size, use_list=1)
+
+ read_count = 0
+ for idx, o in enumerate(unpacker):
+ assert type(o) == bytes
+ assert o == gen_binary_data(idx)
+ read_count += 1
+
+ assert read_count == NUMBER_OF_STRINGS
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_sequnpack.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_sequnpack.py
new file mode 100644
index 00000000000..91f5778a7ce
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_sequnpack.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+
+import pytest
+
+from pandas import compat
+
+from pandas.io.msgpack import BufferFull, OutOfData, Unpacker
+
+
+class TestPack(object):
+
+ def test_partial_data(self):
+ unpacker = Unpacker()
+ msg = "No more data to unpack"
+
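+        # b"\xa5" announces a 5-byte fixstr, so iterating raises
+        # StopIteration until all five payload bytes of b"hallo" arrive.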
+ for data in [b"\xa5", b"h", b"a", b"l", b"l"]:
+ unpacker.feed(data)
+ with pytest.raises(StopIteration, match=msg):
+ next(iter(unpacker))
+
+ unpacker.feed(b"o")
+ assert next(iter(unpacker)) == b"hallo"
+
+ def test_foobar(self):
+ unpacker = Unpacker(read_size=3, use_list=1)
+ unpacker.feed(b'foobar')
+ assert unpacker.unpack() == ord(b'f')
+ assert unpacker.unpack() == ord(b'o')
+ assert unpacker.unpack() == ord(b'o')
+ assert unpacker.unpack() == ord(b'b')
+ assert unpacker.unpack() == ord(b'a')
+ assert unpacker.unpack() == ord(b'r')
+ msg = "No more data to unpack"
+ with pytest.raises(OutOfData, match=msg):
+ unpacker.unpack()
+
+ unpacker.feed(b'foo')
+ unpacker.feed(b'bar')
+
+ k = 0
+ for o, e in zip(unpacker, 'foobarbaz'):
+ assert o == ord(e)
+ k += 1
+ assert k == len(b'foobar')
+
+ def test_foobar_skip(self):
+ unpacker = Unpacker(read_size=3, use_list=1)
+ unpacker.feed(b'foobar')
+ assert unpacker.unpack() == ord(b'f')
+ unpacker.skip()
+ assert unpacker.unpack() == ord(b'o')
+ unpacker.skip()
+ assert unpacker.unpack() == ord(b'a')
+ unpacker.skip()
+ msg = "No more data to unpack"
+ with pytest.raises(OutOfData, match=msg):
+ unpacker.unpack()
+
+ def test_maxbuffersize_read_size_exceeds_max_buffer_size(self):
+ msg = "read_size should be less or equal to max_buffer_size"
+ with pytest.raises(ValueError, match=msg):
+ Unpacker(read_size=5, max_buffer_size=3)
+
+ def test_maxbuffersize_bufferfull(self):
+ unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1)
+ unpacker.feed(b'foo')
+ with pytest.raises(BufferFull, match=r'^$'):
+ unpacker.feed(b'b')
+
+ def test_maxbuffersize(self):
+ unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1)
+ unpacker.feed(b'foo')
+ assert ord('f') == next(unpacker)
+ unpacker.feed(b'b')
+ assert ord('o') == next(unpacker)
+ assert ord('o') == next(unpacker)
+ assert ord('b') == next(unpacker)
+
+ def test_readbytes(self):
+ unpacker = Unpacker(read_size=3)
+ unpacker.feed(b'foobar')
+ assert unpacker.unpack() == ord(b'f')
+ assert unpacker.read_bytes(3) == b'oob'
+ assert unpacker.unpack() == ord(b'a')
+ assert unpacker.unpack() == ord(b'r')
+
+ # Test buffer refill
+ unpacker = Unpacker(compat.BytesIO(b'foobar'), read_size=3)
+ assert unpacker.unpack() == ord(b'f')
+ assert unpacker.read_bytes(3) == b'oob'
+ assert unpacker.unpack() == ord(b'a')
+ assert unpacker.unpack() == ord(b'r')
+
+ def test_issue124(self):
+ unpacker = Unpacker()
+ unpacker.feed(b'\xa1?\xa1!')
+ assert tuple(unpacker) == (b'?', b'!')
+ assert tuple(unpacker) == ()
+ unpacker.feed(b"\xa1?\xa1")
+ assert tuple(unpacker) == (b'?', )
+ assert tuple(unpacker) == ()
+ unpacker.feed(b"!")
+ assert tuple(unpacker) == (b'!', )
+ assert tuple(unpacker) == ()
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_subtype.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_subtype.py
new file mode 100644
index 00000000000..8af7e0b91d9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_subtype.py
@@ -0,0 +1,26 @@
+# coding: utf-8
+
+from collections import namedtuple
+
+from pandas.io.msgpack import packb
+
+
+class MyList(list):
+ pass
+
+
+class MyDict(dict):
+ pass
+
+
+class MyTuple(tuple):
+ pass
+
+
+MyNamedTuple = namedtuple('MyNamedTuple', 'x y')
+
+
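+# packb serializes dict/list/tuple subclasses exactly like their base
+# types, so the packed bytes compare equal below.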
+def test_types():
+ assert packb(MyDict()) == packb(dict())
+ assert packb(MyList()) == packb(list())
+ assert packb(MyNamedTuple(1, 2)) == packb((1, 2))
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_unpack.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_unpack.py
new file mode 100644
index 00000000000..356156296c0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_unpack.py
@@ -0,0 +1,67 @@
+from io import BytesIO
+import sys
+
+import pytest
+
+from pandas.io.msgpack import ExtType, OutOfData, Unpacker, packb
+
+
+class TestUnpack(object):
+
+ def test_unpack_array_header_from_file(self):
+ f = BytesIO(packb([1, 2, 3, 4]))
+ unpacker = Unpacker(f)
+ assert unpacker.read_array_header() == 4
+ assert unpacker.unpack() == 1
+ assert unpacker.unpack() == 2
+ assert unpacker.unpack() == 3
+ assert unpacker.unpack() == 4
+ msg = "No more data to unpack"
+ with pytest.raises(OutOfData, match=msg):
+ unpacker.unpack()
+
+ def test_unpacker_hook_refcnt(self):
+ if not hasattr(sys, 'getrefcount'):
+ pytest.skip('no sys.getrefcount()')
+ result = []
+
+ def hook(x):
+ result.append(x)
+ return x
+
+ basecnt = sys.getrefcount(hook)
+
+ up = Unpacker(object_hook=hook, list_hook=hook)
+
+ assert sys.getrefcount(hook) >= basecnt + 2
+
+ up.feed(packb([{}]))
+ up.feed(packb([{}]))
+ assert up.unpack() == [{}]
+ assert up.unpack() == [{}]
+ assert result == [{}, [{}], {}, [{}]]
+
+ del up
+
+ assert sys.getrefcount(hook) == basecnt
+
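+    # ext_hook receives each ExtType's integer code and raw payload bytes
+    # and may return any substitute object (here, code 1 decodes to int).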
+ def test_unpacker_ext_hook(self):
+ class MyUnpacker(Unpacker):
+
+ def __init__(self):
+ super(MyUnpacker, self).__init__(ext_hook=self._hook,
+ encoding='utf-8')
+
+ def _hook(self, code, data):
+ if code == 1:
+ return int(data)
+ else:
+ return ExtType(code, data)
+
+ unpacker = MyUnpacker()
+ unpacker.feed(packb({'a': 1}, encoding='utf-8'))
+ assert unpacker.unpack() == {'a': 1}
+ unpacker.feed(packb({'a': ExtType(1, b'123')}, encoding='utf-8'))
+ assert unpacker.unpack() == {'a': 123}
+ unpacker.feed(packb({'a': ExtType(2, b'321')}, encoding='utf-8'))
+ assert unpacker.unpack() == {'a': ExtType(2, b'321')}
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_unpack_raw.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_unpack_raw.py
new file mode 100644
index 00000000000..09ebb681d87
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_unpack_raw.py
@@ -0,0 +1,30 @@
+"""Tests for cases where the user seeks to obtain packed msgpack objects"""
+
+import io
+
+from pandas.io.msgpack import Unpacker, packb
+
+
+def test_write_bytes():
+ unpacker = Unpacker()
+ unpacker.feed(b'abc')
+ f = io.BytesIO()
+ assert unpacker.unpack(f.write) == ord('a')
+ assert f.getvalue() == b'a'
+ f = io.BytesIO()
+ assert unpacker.skip(f.write) is None
+ assert f.getvalue() == b'b'
+ f = io.BytesIO()
+ assert unpacker.skip() is None
+ assert f.getvalue() == b''
+
+
+def test_write_bytes_multi_buffer():
+ long_val = (5) * 100
+ expected = packb(long_val)
+ unpacker = Unpacker(io.BytesIO(expected), read_size=3, max_buffer_size=3)
+
+ f = io.BytesIO()
+ unpacked = unpacker.unpack(f.write)
+ assert unpacked == long_val
+ assert f.getvalue() == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/__init__.py b/contrib/python/pandas/py2/pandas/tests/io/parser/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/conftest.py b/contrib/python/pandas/py2/pandas/tests/io/parser/conftest.py
new file mode 100644
index 00000000000..feb6c36b517
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/conftest.py
@@ -0,0 +1,85 @@
+import os
+
+import pytest
+
+from pandas import read_csv, read_table
+
+
+class BaseParser(object):
+ engine = None
+ low_memory = True
+ float_precision_choices = []
+
+ def update_kwargs(self, kwargs):
+ kwargs = kwargs.copy()
+ kwargs.update(dict(engine=self.engine,
+ low_memory=self.low_memory))
+
+ return kwargs
+
+ def read_csv(self, *args, **kwargs):
+ kwargs = self.update_kwargs(kwargs)
+ return read_csv(*args, **kwargs)
+
+ def read_table(self, *args, **kwargs):
+ kwargs = self.update_kwargs(kwargs)
+ return read_table(*args, **kwargs)
+
+
+class CParser(BaseParser):
+ engine = "c"
+ float_precision_choices = [None, "high", "round_trip"]
+
+
+class CParserHighMemory(CParser):
+ low_memory = False
+
+
+class CParserLowMemory(CParser):
+ low_memory = True
+
+
+class PythonParser(BaseParser):
+ engine = "python"
+ float_precision_choices = [None]
+
+
[email protected]
+def csv_dir_path(datapath):
+    return datapath("io", "parser", "data")
+
+
[email protected]
+def csv1(csv_dir_path):
+    return os.path.join(csv_dir_path, "test1.csv")
+
+
+_cParserHighMemory = CParserHighMemory()
+_cParserLowMemory = CParserLowMemory()
+_pythonParser = PythonParser()
+
+_py_parsers_only = [_pythonParser]
+_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
+_all_parsers = _c_parsers_only + _py_parsers_only
+
+_py_parser_ids = ["python"]
+_c_parser_ids = ["c_high", "c_low"]
+_all_parser_ids = _c_parser_ids + _py_parser_ids
+
+
[email protected](params=_all_parsers,
+ ids=_all_parser_ids)
+def all_parsers(request):
+ return request.param
+
+
[email protected](params=_c_parsers_only,
+ ids=_c_parser_ids)
+def c_parser_only(request):
+ return request.param
+
+
[email protected](params=_py_parsers_only,
+ ids=_py_parser_ids)
+def python_parser_only(request):
+ return request.param
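+
+
+# A hypothetical test that accepts one of these fixtures runs once per
+# parser configuration above, e.g.:
+#
+#     def test_roundtrip(all_parsers):
+#         result = all_parsers.read_csv(StringIO("a,b\n1,2"))
+#         assert list(result.columns) == ["a", "b"]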
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/data/items.jsonl b/contrib/python/pandas/py2/pandas/tests/io/parser/data/items.jsonl
new file mode 100644
index 00000000000..f784d37befa
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/data/items.jsonl
@@ -0,0 +1,2 @@
+{"a": 1, "b": 2}
+{"b":2, "a" :1}
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/data/tar_csv.tar b/contrib/python/pandas/py2/pandas/tests/io/parser/data/tar_csv.tar
new file mode 100644
index 00000000000..d1819550e0a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/data/tar_csv.tar
Binary files differ
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_c_parser_only.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_c_parser_only.py
new file mode 100644
index 00000000000..c089a189ae5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_c_parser_only.py
@@ -0,0 +1,591 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that apply specifically to the CParser. Unless a test is
+specifically stated to be a CParser-specific issue, the goal is to
+eventually move as many of these tests as possible out of this module
+once the Python parser can accept further arguments when parsing.
+"""
+
+from io import TextIOWrapper
+import mmap
+import os
+import tarfile
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY3, BytesIO, StringIO, lrange, range
+from pandas.errors import ParserError
+import pandas.util._test_decorators as td
+
+from pandas import DataFrame, concat
+import pandas.util.testing as tm
+
+
+ "malformed",
+ ["1\r1\r1\r 1\r 1\r",
+ "1\r1\r1\r 1\r 1\r11\r",
+ "1\r1\r1\r 1\r 1\r11\r1\r"],
+ ids=["words pointer", "stream pointer", "lines pointer"])
+def test_buffer_overflow(c_parser_only, malformed):
+ # see gh-9205: test certain malformed input files that cause
+ # buffer overflows in tokenizer.c
+ msg = "Buffer overflow caught - possible malformed input file."
+ parser = c_parser_only
+
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(malformed))
+
+
+def test_buffer_rd_bytes(c_parser_only):
+ # see gh-12098: src->buffer in the C parser can be freed twice leading
+ # to a segfault if a corrupt gzip file is read with 'read_csv', and the
+ # buffer is filled more than once before gzip raises an Exception.
+
+ data = "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09" \
+ "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0" \
+ "\xA6\x4D" + "\x55" * 267 + \
+ "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00" \
+ "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO"
+ parser = c_parser_only
+
+ for _ in range(100):
+ try:
+ parser.read_csv(StringIO(data), compression="gzip",
+ delim_whitespace=True)
+ except Exception:
+ pass
+
+
+def test_delim_whitespace_custom_terminator(c_parser_only):
+ # See gh-12912
+ data = "a b c~1 2 3~4 5 6~7 8 9"
+ parser = c_parser_only
+
+ df = parser.read_csv(StringIO(data), lineterminator="~",
+ delim_whitespace=True)
+ expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ columns=["a", "b", "c"])
+ tm.assert_frame_equal(df, expected)
+
+
+def test_dtype_and_names_error(c_parser_only):
+ # see gh-8833: passing both dtype and names
+ # resulting in an error reporting issue
+ parser = c_parser_only
+ data = """
+1.0 1
+2.0 2
+3.0 3
+"""
+ # base cases
+ result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
+ expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
+ tm.assert_frame_equal(result, expected)
+
+ result = parser.read_csv(StringIO(data), sep=r"\s+",
+ header=None, names=["a", "b"])
+ expected = DataFrame(
+ [[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
+ tm.assert_frame_equal(result, expected)
+
+ # fallback casting
+ result = parser.read_csv(StringIO(
+ data), sep=r"\s+", header=None,
+ names=["a", "b"], dtype={"a": np.int32})
+ expected = DataFrame([[1, 1], [2, 2], [3, 3]],
+ columns=["a", "b"])
+ expected["a"] = expected["a"].astype(np.int32)
+ tm.assert_frame_equal(result, expected)
+
+ data = """
+1.0 1
+nan 2
+3.0 3
+"""
+ # fallback casting, but not castable
+ with pytest.raises(ValueError, match="cannot safely convert"):
+ parser.read_csv(StringIO(data), sep=r"\s+", header=None,
+ names=["a", "b"], dtype={"a": np.int32})
+
+
[email protected]("match,kwargs", [
+ # For each of these cases, all of the dtypes are valid, just unsupported.
+ (("the dtype datetime64 is not supported for parsing, "
+ "pass this column using parse_dates instead"),
+ dict(dtype={"A": "datetime64", "B": "float64"})),
+
+ (("the dtype datetime64 is not supported for parsing, "
+ "pass this column using parse_dates instead"),
+ dict(dtype={"A": "datetime64", "B": "float64"},
+ parse_dates=["B"])),
+
+ ("the dtype timedelta64 is not supported for parsing",
+ dict(dtype={"A": "timedelta64", "B": "float64"})),
+
+ ("the dtype <U8 is not supported for parsing",
+ dict(dtype={"A": "U8"}))
+], ids=["dt64-0", "dt64-1", "td64", "<U8"])
+def test_unsupported_dtype(c_parser_only, match, kwargs):
+ parser = c_parser_only
+ df = DataFrame(np.random.rand(5, 2), columns=list(
+ "AB"), index=["1A", "1B", "1C", "1D", "1E"])
+
+ with tm.ensure_clean("__unsupported_dtype__.csv") as path:
+ df.to_csv(path)
+
+ with pytest.raises(TypeError, match=match):
+ parser.read_csv(path, index_col=0, **kwargs)
+
+
+def test_precise_conversion(c_parser_only):
+ from decimal import Decimal
+ parser = c_parser_only
+
+ normal_errors = []
+ precise_errors = []
+
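+    # float_precision selects the C engine's float converter: None for
+    # the ordinary converter, "high" for the high-precision one, and
+    # "round_trip" for one that round-trips against Python's float().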
+ # test numbers between 1 and 2
+ for num in np.linspace(1., 2., num=500):
+ # 25 decimal digits of precision
+ text = "a\n{0:.25}".format(num)
+
+ normal_val = float(parser.read_csv(StringIO(text))["a"][0])
+ precise_val = float(parser.read_csv(
+ StringIO(text), float_precision="high")["a"][0])
+ roundtrip_val = float(parser.read_csv(
+ StringIO(text), float_precision="round_trip")["a"][0])
+ actual_val = Decimal(text[2:])
+
+ def error(val):
+ return abs(Decimal("{0:.100}".format(val)) - actual_val)
+
+ normal_errors.append(error(normal_val))
+ precise_errors.append(error(precise_val))
+
+ # round-trip should match float()
+ assert roundtrip_val == float(text[2:])
+
+ assert sum(precise_errors) <= sum(normal_errors)
+ assert max(precise_errors) <= max(normal_errors)
+
+
+def test_usecols_dtypes(c_parser_only):
+ parser = c_parser_only
+ data = """\
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+
+ result = parser.read_csv(StringIO(data), usecols=(0, 1, 2),
+ names=("a", "b", "c"),
+ header=None,
+ converters={"a": str},
+ dtype={"b": int, "c": float})
+ result2 = parser.read_csv(StringIO(data), usecols=(0, 2),
+ names=("a", "b", "c"),
+ header=None,
+ converters={"a": str},
+ dtype={"b": int, "c": float})
+
+ assert (result.dtypes == [object, np.int, np.float]).all()
+ assert (result2.dtypes == [object, np.float]).all()
+
+
+def test_disable_bool_parsing(c_parser_only):
+ # see gh-2090
+
+ parser = c_parser_only
+ data = """A,B,C
+Yes,No,Yes
+No,Yes,Yes
+Yes,,Yes
+No,No,No"""
+
+ result = parser.read_csv(StringIO(data), dtype=object)
+ assert (result.dtypes == object).all()
+
+ result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
+ assert result["B"][2] == ""
+
+
+def test_custom_lineterminator(c_parser_only):
+ parser = c_parser_only
+ data = "a,b,c~1,2,3~4,5,6"
+
+ result = parser.read_csv(StringIO(data), lineterminator="~")
+ expected = parser.read_csv(StringIO(data.replace("~", "\n")))
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_parse_ragged_csv(c_parser_only):
+ parser = c_parser_only
+ data = """1,2,3
+1,2,3,4
+1,2,3,4,5
+1,2
+1,2,3,4"""
+
+ nice_data = """1,2,3,,
+1,2,3,4,
+1,2,3,4,5
+1,2,,,
+1,2,3,4,"""
+ result = parser.read_csv(StringIO(data), header=None,
+ names=["a", "b", "c", "d", "e"])
+
+ expected = parser.read_csv(StringIO(nice_data), header=None,
+ names=["a", "b", "c", "d", "e"])
+
+ tm.assert_frame_equal(result, expected)
+
+    # too many columns; could cause a segfault if not handled carefully
+ data = "1,2\n3,4,5"
+
+ result = parser.read_csv(StringIO(data), header=None,
+ names=lrange(50))
+ expected = parser.read_csv(StringIO(data), header=None,
+ names=lrange(3)).reindex(columns=lrange(50))
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_tokenize_CR_with_quoting(c_parser_only):
+ # see gh-3453
+ parser = c_parser_only
+ data = " a,b,c\r\"a,b\",\"e,d\",\"f,f\""
+
+ result = parser.read_csv(StringIO(data), header=None)
+ expected = parser.read_csv(StringIO(data.replace("\r", "\n")),
+ header=None)
+ tm.assert_frame_equal(result, expected)
+
+ result = parser.read_csv(StringIO(data))
+ expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_grow_boundary_at_cap(c_parser_only):
+ # See gh-12494
+ #
+ # Cause of error was that the C parser
+ # was not increasing the buffer size when
+ # the desired space would fill the buffer
+ # to capacity, which would later cause a
+ # buffer overflow error when checking the
+ # EOF terminator of the CSV stream.
+ parser = c_parser_only
+
+ def test_empty_header_read(count):
+ s = StringIO("," * count)
+ expected = DataFrame(columns=[
+ "Unnamed: {i}".format(i=i)
+ for i in range(count + 1)])
+ df = parser.read_csv(s)
+ tm.assert_frame_equal(df, expected)
+
+ for cnt in range(1, 101):
+ test_empty_header_read(cnt)
+
+
+def test_parse_trim_buffers(c_parser_only):
+    # This test is part of a bugfix for gh-13703. It attempts to
+    # stress the system memory allocator, to cause it to move the
+    # stream buffer and either let the OS reclaim the region, or let
+    # other memory requests of the parser otherwise modify the contents
+    # of the memory space where it was formerly located.
+ # This test is designed to cause a `segfault` with unpatched
+ # `tokenizer.c`. Sometimes the test fails on `segfault`, other
+ # times it fails due to memory corruption, which causes the
+ # loaded DataFrame to differ from the expected one.
+
+ parser = c_parser_only
+
+ # Generate a large mixed-type CSV file on-the-fly (one record is
+ # approx 1.5KiB).
+ record_ = \
+ """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
+ """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
+ """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
+ """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
+ """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
+ """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
+ """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
+ """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
+ """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
+ """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
+ """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
+ """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
+ """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
+ """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
+ """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
+ """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
+ """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
+ """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
+ """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
+ """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
+ """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
+ """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
+ """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
+ """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
+ """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
+ """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
+ """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
+ """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
+ """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
+
+ # Set the number of lines so that a call to `parser_trim_buffers`
+ # is triggered: after a couple of full chunks are consumed a
+ # relatively small 'residual' chunk would cause reallocation
+ # within the parser.
+ chunksize, n_lines = 128, 2 * 128 + 15
+ csv_data = "\n".join([record_] * n_lines) + "\n"
+
+ # We will use StringIO to load the CSV from this text buffer.
+ # pd.read_csv() will iterate over the file in chunks and will
+ # finally read a residual chunk of really small size.
+
+ # Generate the expected output: manually create the dataframe
+ # by splitting by comma and repeating the `n_lines` times.
+ row = tuple(val_ if val_ else np.nan
+ for val_ in record_.split(","))
+ expected = DataFrame([row for _ in range(n_lines)],
+ dtype=object, columns=None, index=None)
+
+ # Iterate over the CSV file in chunks of `chunksize` lines
+ chunks_ = parser.read_csv(StringIO(csv_data), header=None,
+ dtype=object, chunksize=chunksize)
+ result = concat(chunks_, axis=0, ignore_index=True)
+
+ # Check for data corruption if there was no segfault
+ tm.assert_frame_equal(result, expected)
+
+ # This extra test was added to replicate the fault in gh-5291.
+ # Force 'utf-8' encoding, so that `_string_convert` would take
+ # a different execution branch.
+ chunks_ = parser.read_csv(StringIO(csv_data), header=None,
+ dtype=object, chunksize=chunksize,
+ encoding="utf_8")
+ result = concat(chunks_, axis=0, ignore_index=True)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_internal_null_byte(c_parser_only):
+ # see gh-14012
+ #
+ # The null byte ('\x00') should not be used as a
+ # true line terminator, escape character, or comment
+ # character, only as a placeholder to indicate that
+ # none was specified.
+ #
+ # This test should be moved to test_common.py ONLY when
+ # Python's csv class supports parsing '\x00'.
+ parser = c_parser_only
+
+ names = ["a", "b", "c"]
+ data = "1,2,3\n4,\x00,6\n7,8,9"
+ expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6],
+ [7, 8, 9]], columns=names)
+
+ result = parser.read_csv(StringIO(data), names=names)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_read_nrows_large(c_parser_only):
+ # gh-7626 - Read only nrows of data in for large inputs (>262144b)
+ parser = c_parser_only
+ header_narrow = "\t".join(["COL_HEADER_" + str(i)
+ for i in range(10)]) + "\n"
+ data_narrow = "\t".join(["somedatasomedatasomedata1"
+ for _ in range(10)]) + "\n"
+ header_wide = "\t".join(["COL_HEADER_" + str(i)
+ for i in range(15)]) + "\n"
+ data_wide = "\t".join(["somedatasomedatasomedata2"
+ for _ in range(15)]) + "\n"
+ test_input = (header_narrow + data_narrow * 1050 +
+ header_wide + data_wide * 2)
+
+ df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)
+
+ assert df.size == 1010 * 10
+
+
+def test_float_precision_round_trip_with_text(c_parser_only):
+ # see gh-15140 - This should not segfault on Python 2.7+
+ parser = c_parser_only
+ df = parser.read_csv(StringIO("a"), header=None,
+ float_precision="round_trip")
+ tm.assert_frame_equal(df, DataFrame({0: ["a"]}))
+
+
+def test_large_difference_in_columns(c_parser_only):
+ # see gh-14125
+ parser = c_parser_only
+
+ count = 10000
+ large_row = ("X," * count)[:-1] + "\n"
+ normal_row = "XXXXXX XXXXXX,111111111111111\n"
+ test_input = (large_row + normal_row * 6)[:-1]
+
+ result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])
+ rows = test_input.split("\n")
+
+ expected = DataFrame([row.split(",")[0] for row in rows])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_data_after_quote(c_parser_only):
+ # see gh-15910
+ parser = c_parser_only
+
+ data = "a\n1\n\"b\"a"
+ result = parser.read_csv(StringIO(data))
+
+ expected = DataFrame({"a": ["1", "ba"]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_comment_whitespace_delimited(c_parser_only, capsys):
+ parser = c_parser_only
+ test_input = """\
+1 2
+2 2 3
+3 2 3 # 3 fields
+4 2 3# 3 fields
+5 2 # 2 fields
+6 2# 2 fields
+7 # 1 field, NaN
+8# 1 field, NaN
+9 2 3 # skipped line
+# comment"""
+ df = parser.read_csv(StringIO(test_input), comment="#", header=None,
+ delimiter="\\s+", skiprows=0,
+ error_bad_lines=False)
+ captured = capsys.readouterr()
+ # skipped lines 2, 3, 4, 9
+ for line_num in (2, 3, 4, 9):
+ assert "Skipping line {}".format(line_num) in captured.err
+ expected = DataFrame([[1, 2],
+ [5, 2],
+ [6, 2],
+ [7, np.nan],
+ [8, np.nan]])
+ tm.assert_frame_equal(df, expected)
+
+
+def test_file_like_no_next(c_parser_only):
+ # gh-16530: the file-like need not have a "next" or "__next__"
+ # attribute despite having an "__iter__" attribute.
+ #
+ # NOTE: This is only true for the C engine, not Python engine.
+ class NoNextBuffer(StringIO):
+ def __next__(self):
+ raise AttributeError("No next method")
+
+ next = __next__
+
+ parser = c_parser_only
+ data = "a\n1"
+
+ expected = DataFrame({"a": [1]})
+ result = parser.read_csv(NoNextBuffer(data))
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_buffer_rd_bytes_bad_unicode(c_parser_only):
+ # see gh-22748
+ parser = c_parser_only
+ t = BytesIO(b"\xB0")
+
+ if PY3:
+ msg = "'utf-8' codec can't encode character"
+ t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
+ else:
+ msg = "'utf8' codec can't decode byte"
+
+ with pytest.raises(UnicodeError, match=msg):
+ parser.read_csv(t, encoding="UTF-8")
+
+
[email protected]("tar_suffix", [".tar", ".tar.gz"])
+def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
+ # see gh-16530
+ #
+ # Unfortunately, Python's CSV library can't handle
+ # tarfile objects (expects string, not bytes when
+ # iterating through a file-like).
+ parser = c_parser_only
+ tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)
+
+ with tarfile.open(tar_path, "r") as tar:
+ data_file = tar.extractfile("tar_data.csv")
+
+ out = parser.read_csv(data_file)
+ expected = DataFrame({"a": [1]})
+ tm.assert_frame_equal(out, expected)
+
+
+def test_bytes_exceed_2gb(c_parser_only):
+ # see gh-16798
+ #
+ # Read from a "CSV" that has a column larger than 2GB.
+ parser = c_parser_only
+
+ if parser.low_memory:
+ pytest.skip("not a high_memory test")
+
+ csv = StringIO("strings\n" + "\n".join(
+ ["x" * (1 << 20) for _ in range(2100)]))
+ df = parser.read_csv(csv)
+ assert not df.empty
+
+
+def test_chunk_whitespace_on_boundary(c_parser_only):
+ # see gh-9735: this issue is C parser-specific (bug when
+ # parsing whitespace and characters at chunk boundary)
+ #
+ # This test case has a field too large for the Python parser / CSV library.
+ parser = c_parser_only
+
+ chunk1 = "a" * (1024 * 256 - 2) + "\na"
+ chunk2 = "\n a"
+ result = parser.read_csv(StringIO(chunk1 + chunk2), header=None)
+
+ expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_file_handles_mmap(c_parser_only, csv1):
+ # gh-14418
+ #
+ # Don't close user provided file handles.
+ parser = c_parser_only
+
+ with open(csv1, "r") as f:
+ m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
+ parser.read_csv(m)
+
+ if PY3:
+ assert not m.closed
+ m.close()
+
+
+def test_file_binary_mode(c_parser_only):
+ # see gh-23779
+ parser = c_parser_only
+ expected = DataFrame([[1, 2, 3], [4, 5, 6]])
+
+ with tm.ensure_clean() as path:
+ with open(path, "w") as f:
+ f.write("1,2,3\n4,5,6")
+
+ with open(path, "rb") as f:
+ result = parser.read_csv(f, header=None)
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_comment.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_comment.py
new file mode 100644
index 00000000000..299a04f876b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_comment.py
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that comments are properly handled during parsing
+for all of the parsers defined in parsers.py
+"""
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO
+
+from pandas import DataFrame
+import pandas.util.testing as tm
+
+
[email protected]("na_values", [None, ["NaN"]])
+def test_comment(all_parsers, na_values):
+ parser = all_parsers
+ data = """A,B,C
+1,2.,4.#hello world
+5.,NaN,10.0
+"""
+ expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
+ columns=["A", "B", "C"])
+ result = parser.read_csv(StringIO(data), comment="#",
+ na_values=na_values)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("read_kwargs", [
+ dict(),
+ dict(lineterminator="*"),
+ dict(delim_whitespace=True),
+])
+def test_line_comment(all_parsers, read_kwargs):
+ parser = all_parsers
+ data = """# empty
+A,B,C
+1,2.,4.#hello world
+#ignore this line
+5.,NaN,10.0
+"""
+ if read_kwargs.get("delim_whitespace"):
+ data = data.replace(",", " ")
+ elif read_kwargs.get("lineterminator"):
+ if parser.engine != "c":
+ pytest.skip("Custom terminator not supported with Python engine")
+
+ data = data.replace("\n", read_kwargs.get("lineterminator"))
+
+ read_kwargs["comment"] = "#"
+ result = parser.read_csv(StringIO(data), **read_kwargs)
+
+ expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
+ columns=["A", "B", "C"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_comment_skiprows(all_parsers):
+ parser = all_parsers
+ data = """# empty
+random line
+# second empty line
+1,2,3
+A,B,C
+1,2.,4.
+5.,NaN,10.0
+"""
+ # This should ignore the first four lines (including comments).
+ expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
+ columns=["A", "B", "C"])
+ result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_comment_header(all_parsers):
+ parser = all_parsers
+ data = """# empty
+# second empty line
+1,2,3
+A,B,C
+1,2.,4.
+5.,NaN,10.0
+"""
+ # Header should begin at the second non-comment line.
+ expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
+ columns=["A", "B", "C"])
+ result = parser.read_csv(StringIO(data), comment="#", header=1)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_comment_skiprows_header(all_parsers):
+ parser = all_parsers
+ data = """# empty
+# second empty line
+# third empty line
+X,Y,Z
+1,2,3
+A,B,C
+1,2.,4.
+5.,NaN,10.0
+"""
+    # skiprows should skip the first 4 lines (including comments); the
+    # header should then come from the second non-commented line that
+    # remains, i.e. "A,B,C".
+ expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
+ columns=["A", "B", "C"])
+ result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("comment_char", ["#", "~", "&", "^", "*", "@"])
+def test_custom_comment_char(all_parsers, comment_char):
+ parser = all_parsers
+ data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
+ result = parser.read_csv(StringIO(data.replace("#", comment_char)),
+ comment=comment_char)
+
+ expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("header", ["infer", None])
+def test_comment_first_line(all_parsers, header):
+ # see gh-4623
+ parser = all_parsers
+ data = "# notes\na,b,c\n# more notes\n1,2,3"
+
+ if header is None:
+ expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
+ else:
+ expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
+
+ result = parser.read_csv(StringIO(data), comment="#", header=header)
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_common.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_common.py
new file mode 100644
index 00000000000..05da171d7dc
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_common.py
@@ -0,0 +1,1946 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that work on both the Python and C engines but do not fit the more
+specific classifications of the other test modules.
+"""
+
+import codecs
+from collections import OrderedDict
+import csv
+from datetime import datetime
+import os
+import platform
+from tempfile import TemporaryFile
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import Timestamp
+from pandas.compat import BytesIO, StringIO, lrange, range, u
+from pandas.errors import DtypeWarning, EmptyDataError, ParserError
+
+from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
+import pandas.util.testing as tm
+
+from pandas.io.common import URLError
+from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser
+
+
+def test_override_set_noconvert_columns():
+ # see gh-17351
+ #
+ # Usecols needs to be sorted in _set_noconvert_columns based
+ # on the test_usecols_with_parse_dates test from test_usecols.py
+ class MyTextFileReader(TextFileReader):
+ def __init__(self):
+ self._currow = 0
+ self.squeeze = False
+
+ class MyCParserWrapper(CParserWrapper):
+ def _set_noconvert_columns(self):
+ if self.usecols_dtype == "integer":
+ # self.usecols is a set, which is documented as unordered
+ # but in practice, a CPython set of integers is sorted.
+ # In other implementations this assumption does not hold.
+ # The following code simulates a different order, which
+ # before GH 17351 would cause the wrong columns to be
+ # converted via the parse_dates parameter
+ self.usecols = list(self.usecols)
+ self.usecols.reverse()
+ return CParserWrapper._set_noconvert_columns(self)
+
+ data = """a,b,c,d,e
+0,1,20140101,0900,4
+0,1,20140102,1000,4"""
+
+ parse_dates = [[1, 2]]
+ cols = {
+ "a": [0, 0],
+ "c_d": [
+ Timestamp("2014-01-01 09:00:00"),
+ Timestamp("2014-01-02 10:00:00")
+ ]
+ }
+ expected = DataFrame(cols, columns=["c_d", "a"])
+
+ parser = MyTextFileReader()
+ parser.options = {"usecols": [0, 2, 3],
+ "parse_dates": parse_dates,
+ "delimiter": ","}
+ parser._engine = MyCParserWrapper(StringIO(data), **parser.options)
+
+ result = parser.read()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_bytes_io_input(all_parsers):
+ if compat.PY2:
+ pytest.skip("Bytes-related test does not need to work on Python 2.x")
+
+ encoding = "cp1255"
+ parser = all_parsers
+
+ data = BytesIO("שלום:1234\n562:123".encode(encoding))
+ result = parser.read_csv(data, sep=":", encoding=encoding)
+
+ expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_decimal_marker(all_parsers):
+ data = """A|B|C
+1|2,334|5
+10|13|10.
+"""
+ # Parsers support only length-1 decimals
+ msg = "Only length-1 decimal markers supported"
+ parser = all_parsers
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), decimal="")
+
+
+def test_bad_stream_exception(all_parsers, csv_dir_path):
+ # see gh-13652
+ #
+ # This test validates that both the Python engine and C engine will
+ # raise UnicodeDecodeError instead of C engine raising ParserError
+ # and swallowing the exception that caused read to fail.
+ path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv")
+ codec = codecs.lookup("utf-8")
+ utf8 = codecs.lookup('utf-8')
+ parser = all_parsers
+
+ msg = ("'utf-8' codec can't decode byte" if compat.PY3
+ else "'utf8' codec can't decode byte")
+
+ # Stream must be binary UTF8.
+ with open(path, "rb") as handle, codecs.StreamRecoder(
+ handle, utf8.encode, utf8.decode, codec.streamreader,
+ codec.streamwriter) as stream:
+
+ with pytest.raises(UnicodeDecodeError, match=msg):
+ parser.read_csv(stream)
+
+
[email protected](compat.PY2, reason="PY3-only test")
+def test_read_csv_local(all_parsers, csv1):
+ prefix = u("file:///") if compat.is_platform_windows() else u("file://")
+ parser = all_parsers
+
+ fname = prefix + compat.text_type(os.path.abspath(csv1))
+ result = parser.read_csv(fname, index_col=0, parse_dates=True)
+
+ expected = DataFrame([[0.980269, 3.685731, -0.364216805298, -1.159738],
+ [1.047916, -0.041232, -0.16181208307, 0.212549],
+ [0.498581, 0.731168, -0.537677223318, 1.346270],
+ [1.120202, 1.567621, 0.00364077397681, 0.675253],
+ [-0.487094, 0.571455, -1.6116394093, 0.103469],
+ [0.836649, 0.246462, 0.588542635376, 1.062782],
+ [-0.157161, 1.340307, 1.1957779562, -1.097007]],
+ columns=["A", "B", "C", "D"],
+ index=Index([datetime(2000, 1, 3),
+ datetime(2000, 1, 4),
+ datetime(2000, 1, 5),
+ datetime(2000, 1, 6),
+ datetime(2000, 1, 7),
+ datetime(2000, 1, 10),
+ datetime(2000, 1, 11)], name="index"))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_1000_sep(all_parsers):
+ parser = all_parsers
+ data = """A|B|C
+1|2,334|5
+10|13|10.
+"""
+ expected = DataFrame({
+ "A": [1, 10],
+ "B": [2334, 13],
+ "C": [5, 10.]
+ })
+
+ result = parser.read_csv(StringIO(data), sep="|", thousands=",")
+ tm.assert_frame_equal(result, expected)
+
+
+def test_squeeze(all_parsers):
+ data = """\
+a,1
+b,2
+c,3
+"""
+ parser = all_parsers
+ index = Index(["a", "b", "c"], name=0)
+ expected = Series([1, 2, 3], name=1, index=index)
+
+ result = parser.read_csv(StringIO(data), index_col=0,
+ header=None, squeeze=True)
+ tm.assert_series_equal(result, expected)
+
+ # see gh-8217
+ #
+ # Series should not be a view.
+ assert not result._is_view
+
+
+def test_malformed(all_parsers):
+ # see gh-6607
+ parser = all_parsers
+ data = """ignore
+A,B,C
+1,2,3 # comment
+1,2,3,4,5
+2,3,4
+"""
+ msg = "Expected 3 fields in line 4, saw 5"
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(data), header=1, comment="#")
+
+
[email protected]("nrows", [5, 3, None])
+def test_malformed_chunks(all_parsers, nrows):
+ data = """ignore
+A,B,C
+skip
+1,2,3
+3,5,10 # comment
+1,2,3,4,5
+2,3,4
+"""
+ parser = all_parsers
+ msg = 'Expected 3 fields in line 6, saw 5'
+ reader = parser.read_csv(StringIO(data), header=1, comment="#",
+ iterator=True, chunksize=1, skiprows=[2])
+
+ with pytest.raises(ParserError, match=msg):
+ reader.read(nrows)
+
+
+def test_unnamed_columns(all_parsers):
+ data = """A,B,C,,
+1,2,3,4,5
+6,7,8,9,10
+11,12,13,14,15
+"""
+ parser = all_parsers
+ expected = DataFrame([[1, 2, 3, 4, 5],
+ [6, 7, 8, 9, 10],
+ [11, 12, 13, 14, 15]],
+ dtype=np.int64, columns=["A", "B", "C",
+ "Unnamed: 3",
+ "Unnamed: 4"])
+ result = parser.read_csv(StringIO(data))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_csv_mixed_type(all_parsers):
+ data = """A,B,C
+a,1,2
+b,3,4
+c,4,5
+"""
+ parser = all_parsers
+ expected = DataFrame({"A": ["a", "b", "c"],
+ "B": [1, 3, 4],
+ "C": [2, 4, 5]})
+ result = parser.read_csv(StringIO(data))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_read_csv_low_memory_no_rows_with_index(all_parsers):
+ # see gh-21141
+ parser = all_parsers
+
+ if not parser.low_memory:
+ pytest.skip("This is a low-memory specific test")
+
+ data = """A,B,C
+1,1,1,2
+2,2,3,4
+3,3,4,5
+"""
+ result = parser.read_csv(StringIO(data), low_memory=True,
+ index_col=0, nrows=0)
+ expected = DataFrame(columns=["A", "B", "C"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_read_csv_dataframe(all_parsers, csv1):
+ parser = all_parsers
+ result = parser.read_csv(csv1, index_col=0, parse_dates=True)
+
+ expected = DataFrame([[0.980269, 3.685731, -0.364216805298, -1.159738],
+ [1.047916, -0.041232, -0.16181208307, 0.212549],
+ [0.498581, 0.731168, -0.537677223318, 1.346270],
+ [1.120202, 1.567621, 0.00364077397681, 0.675253],
+ [-0.487094, 0.571455, -1.6116394093, 0.103469],
+ [0.836649, 0.246462, 0.588542635376, 1.062782],
+ [-0.157161, 1.340307, 1.1957779562, -1.097007]],
+ columns=["A", "B", "C", "D"],
+ index=Index([datetime(2000, 1, 3),
+ datetime(2000, 1, 4),
+ datetime(2000, 1, 5),
+ datetime(2000, 1, 6),
+ datetime(2000, 1, 7),
+ datetime(2000, 1, 10),
+ datetime(2000, 1, 11)], name="index"))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_read_csv_no_index_name(all_parsers, csv_dir_path):
+ parser = all_parsers
+ csv2 = os.path.join(csv_dir_path, "test2.csv")
+ result = parser.read_csv(csv2, index_col=0, parse_dates=True)
+
+ expected = DataFrame([[0.980269, 3.685731, -0.364216805298,
+ -1.159738, "foo"],
+ [1.047916, -0.041232, -0.16181208307,
+ 0.212549, "bar"],
+ [0.498581, 0.731168, -0.537677223318,
+ 1.346270, "baz"],
+ [1.120202, 1.567621, 0.00364077397681,
+ 0.675253, "qux"],
+ [-0.487094, 0.571455, -1.6116394093,
+ 0.103469, "foo2"]],
+ columns=["A", "B", "C", "D", "E"],
+ index=Index([datetime(2000, 1, 3),
+ datetime(2000, 1, 4),
+ datetime(2000, 1, 5),
+ datetime(2000, 1, 6),
+ datetime(2000, 1, 7)]))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_read_csv_unicode(all_parsers):
+ parser = all_parsers
+ data = BytesIO(u("\u0141aski, Jan;1").encode("utf-8"))
+
+ result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
+ expected = DataFrame([[u("\u0141aski, Jan"), 1]])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_read_csv_wrong_num_columns(all_parsers):
+ # Too few columns.
+ data = """A,B,C,D,E,F
+1,2,3,4,5,6
+6,7,8,9,10,11,12
+11,12,13,14,15,16
+"""
+ parser = all_parsers
+ msg = "Expected 6 fields in line 3, saw 7"
+
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(data))
+
+
+def test_read_duplicate_index_explicit(all_parsers):
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo,12,13,14,15
+bar,12,13,14,15
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), index_col=0)
+
+ expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10],
+ [12, 13, 14, 15], [12, 13, 14, 15],
+ [12, 13, 14, 15], [12, 13, 14, 15]],
+ columns=["A", "B", "C", "D"],
+ index=Index(["foo", "bar", "baz",
+ "qux", "foo", "bar"], name="index"))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_read_duplicate_index_implicit(all_parsers):
+ data = """A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo,12,13,14,15
+bar,12,13,14,15
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data))
+
+ expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10],
+ [12, 13, 14, 15], [12, 13, 14, 15],
+ [12, 13, 14, 15], [12, 13, 14, 15]],
+ columns=["A", "B", "C", "D"],
+ index=Index(["foo", "bar", "baz",
+ "qux", "foo", "bar"]))
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,kwargs,expected", [
+ ("A,B\nTrue,1\nFalse,2\nTrue,3", dict(),
+ DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"])),
+ ("A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
+ dict(true_values=["yes", "Yes", "YES"],
+ false_values=["no", "NO", "No"]),
+ DataFrame([[True, 1], [False, 2], [True, 3],
+ [False, 3], [True, 3]], columns=["A", "B"])),
+ ("A,B\nTRUE,1\nFALSE,2\nTRUE,3", dict(),
+ DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"])),
+ ("A,B\nfoo,bar\nbar,foo", dict(true_values=["foo"],
+ false_values=["bar"]),
+ DataFrame([[True, False], [False, True]], columns=["A", "B"]))
+])
+def test_parse_bool(all_parsers, data, kwargs, expected):
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_int_conversion(all_parsers):
+ data = """A,B
+1.0,1
+2.0,2
+3.0,3
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data))
+
+ expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("nrows", [3, 3.0])
+def test_read_nrows(all_parsers, nrows):
+ # see gh-10476
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+ expected = DataFrame([["foo", 2, 3, 4, 5],
+ ["bar", 7, 8, 9, 10],
+ ["baz", 12, 13, 14, 15]],
+ columns=["index", "A", "B", "C", "D"])
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data), nrows=nrows)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("nrows", [1.2, "foo", -1])
+def test_read_nrows_bad(all_parsers, nrows):
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+ msg = r"'nrows' must be an integer >=0"
+ parser = all_parsers
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), nrows=nrows)
+
+
[email protected]("index_col", [0, "index"])
+def test_read_chunksize_with_index(all_parsers, index_col):
+ parser = all_parsers
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+
+ reader = parser.read_csv(StringIO(data), index_col=0, chunksize=2)
+ expected = DataFrame([["foo", 2, 3, 4, 5],
+ ["bar", 7, 8, 9, 10],
+ ["baz", 12, 13, 14, 15],
+ ["qux", 12, 13, 14, 15],
+ ["foo2", 12, 13, 14, 15],
+ ["bar2", 12, 13, 14, 15]],
+ columns=["index", "A", "B", "C", "D"])
+ expected = expected.set_index("index")
+
+ chunks = list(reader)
+ tm.assert_frame_equal(chunks[0], expected[:2])
+ tm.assert_frame_equal(chunks[1], expected[2:4])
+ tm.assert_frame_equal(chunks[2], expected[4:])
+
+
[email protected]("chunksize", [1.3, "foo", 0])
+def test_read_chunksize_bad(all_parsers, chunksize):
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+ parser = all_parsers
+ msg = r"'chunksize' must be an integer >=1"
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), chunksize=chunksize)
+
+
[email protected]("chunksize", [2, 8])
+def test_read_chunksize_and_nrows(all_parsers, chunksize):
+ # see gh-15755
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+ parser = all_parsers
+ kwargs = dict(index_col=0, nrows=5)
+
+ reader = parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs)
+ expected = parser.read_csv(StringIO(data), **kwargs)
+ tm.assert_frame_equal(concat(reader), expected)
+
+
+def test_read_chunksize_and_nrows_changing_size(all_parsers):
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+ parser = all_parsers
+ kwargs = dict(index_col=0, nrows=5)
+
+ reader = parser.read_csv(StringIO(data), chunksize=8, **kwargs)
+ expected = parser.read_csv(StringIO(data), **kwargs)
+
+ tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
+ tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])
+
+ with pytest.raises(StopIteration, match=""):
+ reader.get_chunk(size=3)
+
+
+def test_get_chunk_passed_chunksize(all_parsers):
+ parser = all_parsers
+ data = """A,B,C
+1,2,3
+4,5,6
+7,8,9
+1,2,3"""
+
+ reader = parser.read_csv(StringIO(data), chunksize=2)
+ result = reader.get_chunk()
+
+ expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs", [dict(), dict(index_col=0)])
+def test_read_chunksize_compat(all_parsers, kwargs):
+ # see gh-12185
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+ parser = all_parsers
+ reader = parser.read_csv(StringIO(data), chunksize=2, **kwargs)
+
+ result = parser.read_csv(StringIO(data), **kwargs)
+ tm.assert_frame_equal(concat(reader), result)
+
+
+def test_read_chunksize_jagged_names(all_parsers):
+ # see gh-23509
+ parser = all_parsers
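+    # Seven one-field rows followed by a single ten-field row; the short
+    # rows are padded with NaN up to the ten passed names.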
+ data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
+
+ expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
+ reader = parser.read_csv(StringIO(data), names=range(10), chunksize=4)
+
+ result = concat(reader)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_read_data_list(all_parsers):
+ parser = all_parsers
+ kwargs = dict(index_col=0)
+ data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"
+
+ data_list = [["A", "B", "C"], ["foo", "1", "2", "3"],
+ ["bar", "4", "5", "6"]]
+ expected = parser.read_csv(StringIO(data), **kwargs)
+
+ parser = TextParser(data_list, chunksize=2, **kwargs)
+ result = parser.read()
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_iterator(all_parsers):
+ # see gh-6607
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+ parser = all_parsers
+ kwargs = dict(index_col=0)
+
+ expected = parser.read_csv(StringIO(data), **kwargs)
+ reader = parser.read_csv(StringIO(data), iterator=True, **kwargs)
+
+ first_chunk = reader.read(3)
+ tm.assert_frame_equal(first_chunk, expected[:3])
+
+ last_chunk = reader.read(5)
+ tm.assert_frame_equal(last_chunk, expected[3:])
+
+
+def test_iterator2(all_parsers):
+ parser = all_parsers
+ data = """A,B,C
+foo,1,2,3
+bar,4,5,6
+baz,7,8,9
+"""
+
+ reader = parser.read_csv(StringIO(data), iterator=True)
+ result = list(reader)
+
+ expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ index=["foo", "bar", "baz"],
+ columns=["A", "B", "C"])
+ tm.assert_frame_equal(result[0], expected)
+
+
+def test_reader_list(all_parsers):
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+ parser = all_parsers
+ kwargs = dict(index_col=0)
+
+ lines = list(csv.reader(StringIO(data)))
+ reader = TextParser(lines, chunksize=2, **kwargs)
+
+ expected = parser.read_csv(StringIO(data), **kwargs)
+ chunks = list(reader)
+
+ tm.assert_frame_equal(chunks[0], expected[:2])
+ tm.assert_frame_equal(chunks[1], expected[2:4])
+ tm.assert_frame_equal(chunks[2], expected[4:])
+
+
+def test_reader_list_skiprows(all_parsers):
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+ parser = all_parsers
+ kwargs = dict(index_col=0)
+
+ lines = list(csv.reader(StringIO(data)))
+ reader = TextParser(lines, chunksize=2, skiprows=[1], **kwargs)
+
+ expected = parser.read_csv(StringIO(data), **kwargs)
+ chunks = list(reader)
+
+ tm.assert_frame_equal(chunks[0], expected[1:3])
+
+
+def test_iterator_stop_on_chunksize(all_parsers):
+ # gh-3967: stopping iteration when chunksize is specified
+ parser = all_parsers
+ data = """A,B,C
+foo,1,2,3
+bar,4,5,6
+baz,7,8,9
+"""
+
+ reader = parser.read_csv(StringIO(data), chunksize=1)
+ result = list(reader)
+
+ assert len(result) == 3
+ expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ index=["foo", "bar", "baz"],
+ columns=["A", "B", "C"])
+ tm.assert_frame_equal(concat(result), expected)
+
+
[email protected]("kwargs", [
+ dict(iterator=True,
+ chunksize=1),
+ dict(iterator=True),
+ dict(chunksize=1)
+])
+def test_iterator_skipfooter_errors(all_parsers, kwargs):
+ msg = "'skipfooter' not supported for 'iteration'"
+ parser = all_parsers
+ data = "a\n1\n2"
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), skipfooter=1, **kwargs)
+
+
+def test_nrows_skipfooter_errors(all_parsers):
+ msg = "'skipfooter' not supported with 'nrows'"
+ data = "a\n1\n2\n3\n4\n5\n6"
+ parser = all_parsers
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), skipfooter=1, nrows=5)
+
+
[email protected]("data,kwargs,expected", [
+ ("""foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+""", dict(index_col=0, names=["index", "A", "B", "C", "D"]),
+ DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15],
+ [12, 13, 14, 15], [12, 13, 14, 15], [12, 13, 14, 15]],
+ index=Index(["foo", "bar", "baz", "qux",
+ "foo2", "bar2"], name="index"),
+ columns=["A", "B", "C", "D"])),
+ ("""foo,one,2,3,4,5
+foo,two,7,8,9,10
+foo,three,12,13,14,15
+bar,one,12,13,14,15
+bar,two,12,13,14,15
+""", dict(index_col=[0, 1], names=["index1", "index2", "A", "B", "C", "D"]),
+ DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15],
+ [12, 13, 14, 15], [12, 13, 14, 15]],
+ index=MultiIndex.from_tuples([
+ ("foo", "one"), ("foo", "two"), ("foo", "three"),
+ ("bar", "one"), ("bar", "two")],
+ names=["index1", "index2"]),
+ columns=["A", "B", "C", "D"])),
+])
+def test_pass_names_with_index(all_parsers, data, kwargs, expected):
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("index_col", [[0, 1], [1, 0]])
+def test_multi_index_no_level_names(all_parsers, index_col):
+ data = """index1,index2,A,B,C,D
+foo,one,2,3,4,5
+foo,two,7,8,9,10
+foo,three,12,13,14,15
+bar,one,12,13,14,15
+bar,two,12,13,14,15
+"""
+ headless_data = '\n'.join(data.split("\n")[1:])
+
+ names = ["A", "B", "C", "D"]
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(headless_data),
+ index_col=index_col,
+ header=None, names=names)
+ expected = parser.read_csv(StringIO(data), index_col=index_col)
+
+ # No index names in headless data.
+ expected.index.names = [None] * 2
+ tm.assert_frame_equal(result, expected)
+
+
+def test_multi_index_no_level_names_implicit(all_parsers):
+ parser = all_parsers
+ data = """A,B,C,D
+foo,one,2,3,4,5
+foo,two,7,8,9,10
+foo,three,12,13,14,15
+bar,one,12,13,14,15
+bar,two,12,13,14,15
+"""
+
+ result = parser.read_csv(StringIO(data))
+ expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15],
+ [12, 13, 14, 15], [12, 13, 14, 15]],
+ columns=["A", "B", "C", "D"],
+ index=MultiIndex.from_tuples([
+ ("foo", "one"), ("foo", "two"), ("foo", "three"),
+ ("bar", "one"), ("bar", "two")]))
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,expected,header", [
+ ("a,b", DataFrame(columns=["a", "b"]), [0]),
+ ("a,b\nc,d", DataFrame(columns=MultiIndex.from_tuples(
+ [("a", "c"), ("b", "d")])), [0, 1]),
+])
[email protected]("round_trip", [True, False])
+def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
+ # see gh-14545
+ parser = all_parsers
+ data = expected.to_csv(index=False) if round_trip else data
+
+ result = parser.read_csv(StringIO(data), header=header)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_no_unnamed_index(all_parsers):
+ parser = all_parsers
+ data = """ id c0 c1 c2
+0 1 0 a b
+1 2 0 c d
+2 2 2 e f
+"""
+ result = parser.read_csv(StringIO(data), sep=" ")
+ expected = DataFrame([[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"],
+ [2, 2, 2, "e", "f"]], columns=["Unnamed: 0", "id",
+ "c0", "c1", "c2"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_read_csv_parse_simple_list(all_parsers):
+ parser = all_parsers
+ data = """foo
+bar baz
+qux foo
+foo
+bar"""
+
+ result = parser.read_csv(StringIO(data), header=None)
+ expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_url(all_parsers, csv_dir_path):
+ # TODO: FTP testing
+ parser = all_parsers
+ kwargs = dict(sep="\t")
+
+ url = ("https://raw.github.com/pandas-dev/pandas/master/"
+ "pandas/tests/io/parser/data/salaries.csv")
+ url_result = parser.read_csv(url, **kwargs)
+
+ local_path = os.path.join(csv_dir_path, "salaries.csv")
+ local_result = parser.read_csv(local_path, **kwargs)
+ tm.assert_frame_equal(url_result, local_result)
+
+
+def test_local_file(all_parsers, csv_dir_path):
+ parser = all_parsers
+ kwargs = dict(sep="\t")
+
+ local_path = os.path.join(csv_dir_path, "salaries.csv")
+ local_result = parser.read_csv(local_path, **kwargs)
+ url = "file://localhost/" + local_path
+
+ try:
+ url_result = parser.read_csv(url, **kwargs)
+ tm.assert_frame_equal(url_result, local_result)
+ except URLError:
+ # Fails on some systems.
+ pytest.skip("Failing on: " + " ".join(platform.uname()))
+
+
+def test_path_path_lib(all_parsers):
+ parser = all_parsers
+ df = tm.makeDataFrame()
+ result = tm.round_trip_pathlib(
+ df.to_csv, lambda p: parser.read_csv(p, index_col=0))
+ tm.assert_frame_equal(df, result)
+
+
+def test_path_local_path(all_parsers):
+ parser = all_parsers
+ df = tm.makeDataFrame()
+ result = tm.round_trip_localpath(
+ df.to_csv, lambda p: parser.read_csv(p, index_col=0))
+ tm.assert_frame_equal(df, result)
+
+
+def test_nonexistent_path(all_parsers):
+ # gh-2428: pls no segfault
+ # gh-14086: raise more helpful FileNotFoundError
+ parser = all_parsers
+ path = "%s.csv" % tm.rands(10)
+
+ msg = ("does not exist" if parser.engine == "c"
+ else r"\[Errno 2\]")
+ with pytest.raises(compat.FileNotFoundError, match=msg) as e:
+ parser.read_csv(path)
+
+ filename = e.value.filename
+ filename = filename.decode() if isinstance(
+ filename, bytes) else filename
+
+ assert path == filename
+
+
+def test_missing_trailing_delimiters(all_parsers):
+ parser = all_parsers
+ data = """A,B,C,D
+1,2,3,4
+1,3,3,
+1,4,5"""
+
+ result = parser.read_csv(StringIO(data))
+ expected = DataFrame([[1, 2, 3, 4], [1, 3, 3, np.nan],
+ [1, 4, 5, np.nan]], columns=["A", "B", "C", "D"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_skip_initial_space(all_parsers):
+ data = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
+ '1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, '
+ '314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, '
+ '70.06056, 344.98370, 1, 1, -0.689265, -0.692787, '
+ '0.212036, 14.7674, 41.605, -9999.0, -9999.0, '
+ '-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128')
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data), names=lrange(33), header=None,
+ na_values=["-9999.0"], skipinitialspace=True)
+ expected = DataFrame([["09-Apr-2012", "01:10:18.300", 2456026.548822908,
+ 12849, 1.00361, 1.12551, 330.65659,
+ 355626618.16711, 73.48821, 314.11625, 1917.09447,
+ 179.71425, 80.0, 240.0, -350, 70.06056, 344.9837,
+ 1, 1, -0.689265, -0.692787, 0.212036, 14.7674,
+ 41.605, np.nan, np.nan, np.nan, np.nan, np.nan,
+ np.nan, 0, 12, 128]])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("sep", [",", "\t"])
[email protected]("encoding", ["utf-16", "utf-16le", "utf-16be"])
+def test_utf16_bom_skiprows(all_parsers, sep, encoding):
+ # see gh-2298
+ parser = all_parsers
+ data = u("""skip this
+skip this too
+A,B,C
+1,2,3
+4,5,6""").replace(",", sep)
+ path = "__%s__.csv" % tm.rands(10)
+ kwargs = dict(sep=sep, skiprows=2)
+ utf8 = "utf-8"
+
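+    # The UTF-16 file on disk (BOM included) must parse identically to
+    # the same content read from a UTF-8 buffer.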
+ with tm.ensure_clean(path) as path:
+ bytes_data = data.encode(encoding)
+
+ with open(path, "wb") as f:
+ f.write(bytes_data)
+
+ bytes_buffer = BytesIO(data.encode(utf8))
+
+ if compat.PY3:
+ from io import TextIOWrapper
+ bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8)
+
+ result = parser.read_csv(path, encoding=encoding, **kwargs)
+ expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
+
+ bytes_buffer.close()
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("buffer", [
+ False,
+ pytest.param(True, marks=pytest.mark.skipif(
+ compat.PY3, reason="Not supported on PY3"))])
+def test_utf16_example(all_parsers, csv_dir_path, buffer):
+ path = os.path.join(csv_dir_path, "utf16_ex.txt")
+ parser = all_parsers
+
+ src = BytesIO(open(path, "rb").read()) if buffer else path
+ result = parser.read_csv(src, encoding="utf-16", sep="\t")
+ assert len(result) == 50
+
+
+def test_unicode_encoding(all_parsers, csv_dir_path):
+ path = os.path.join(csv_dir_path, "unicode_series.csv")
+ parser = all_parsers
+
+ result = parser.read_csv(path, header=None, encoding="latin-1")
+ result = result.set_index(0)
+ got = result[1][1632]
+
+ expected = u('\xc1 k\xf6ldum klaka (Cold Fever) (1994)')
+ assert got == expected
+
+
+def test_trailing_delimiters(all_parsers):
+ # see gh-2442
+ data = """A,B,C
+1,2,3,
+4,5,6,
+7,8,9,"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), index_col=False)
+
+ expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_escapechar(all_parsers):
+ # http://stackoverflow.com/questions/13824840/feature-request-for-
+ # pandas-read-csv
+ data = '''SEARCH_TERM,ACTUAL_URL
+"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
+"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
+"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa
+
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), escapechar='\\',
+ quotechar='"', encoding='utf-8')
+
+ assert result['SEARCH_TERM'][2] == ('SLAGBORD, "Bergslagen", '
+ 'IKEA:s 1700-tals serie')
+ tm.assert_index_equal(result.columns,
+ Index(['SEARCH_TERM', 'ACTUAL_URL']))
+
+
+def test_int64_min_issues(all_parsers):
+ # see gh-2599
+ parser = all_parsers
+ data = "A,B\n0,0\n0,"
+ result = parser.read_csv(StringIO(data))
+
+ expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_parse_integers_above_fp_precision(all_parsers):
+ data = """Numbers
+17007000002000191
+17007000002000191
+17007000002000191
+17007000002000191
+17007000002000192
+17007000002000192
+17007000002000192
+17007000002000192
+17007000002000192
+17007000002000194"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data))
+ expected = DataFrame({"Numbers": [17007000002000191,
+ 17007000002000191,
+ 17007000002000191,
+ 17007000002000191,
+ 17007000002000192,
+ 17007000002000192,
+ 17007000002000192,
+ 17007000002000192,
+ 17007000002000192,
+ 17007000002000194]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_chunks_have_consistent_numerical_type(all_parsers):
+ parser = all_parsers
+ integers = [str(i) for i in range(499999)]
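+    # Two floats sandwiched between ~1M integers force the low-memory
+    # C parser to reconcile dtypes across chunks.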
+ data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)
+
+ # Coercions should work without warnings.
+ with tm.assert_produces_warning(None):
+ result = parser.read_csv(StringIO(data))
+
+ assert type(result.a[0]) is np.float64
+ assert result.a.dtype == np.float
+
+
+def test_warn_if_chunks_have_mismatched_type(all_parsers):
+ warning_type = None
+ parser = all_parsers
+ integers = [str(i) for i in range(499999)]
+ data = "a\n" + "\n".join(integers + ["a", "b"] + integers)
+
+    # see gh-3866: if chunks have different types and can't be
+    # coerced to a common numerical type, a warning is issued.
+ if parser.engine == "c" and parser.low_memory:
+ warning_type = DtypeWarning
+
+ with tm.assert_produces_warning(warning_type):
+ df = parser.read_csv(StringIO(data))
+ assert df.a.dtype == np.object
+
+
[email protected]("sep", [" ", r"\s+"])
+def test_integer_overflow_bug(all_parsers, sep):
+ # see gh-2601
+ data = "65248E10 11\n55555E55 22\n"
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data), header=None, sep=sep)
+ expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_catch_too_many_names(all_parsers):
+ # see gh-5156
+ data = """\
+1,2,3
+4,,6
+7,8,9
+10,11,12\n"""
+ parser = all_parsers
+ msg = ("Too many columns specified: "
+ "expected 4 and found 3" if parser.engine == "c"
+ else "Number of passed names did not match "
+ "number of header fields in the file")
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])
+
+
+def test_ignore_leading_whitespace(all_parsers):
+ # see gh-3374, gh-6607
+ parser = all_parsers
+ data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9"
+ result = parser.read_csv(StringIO(data), sep=r"\s+")
+
+ expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_chunk_begins_with_newline_whitespace(all_parsers):
+ # see gh-10022
+ parser = all_parsers
+ data = "\n hello\nworld\n"
+
+ result = parser.read_csv(StringIO(data), header=None)
+ expected = DataFrame([" hello", "world"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_with_index(all_parsers):
+ # see gh-10184
+ data = "x,y"
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), index_col=0)
+
+ expected = DataFrame([], columns=["y"], index=Index([], name="x"))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_with_multi_index(all_parsers):
+ # see gh-10467
+ data = "x,y,z"
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), index_col=["x", "y"])
+
+ expected = DataFrame([], columns=["z"],
+ index=MultiIndex.from_arrays(
+ [[]] * 2, names=["x", "y"]))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_with_reversed_multi_index(all_parsers):
+ data = "x,y,z"
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), index_col=[1, 0])
+
+ expected = DataFrame([], columns=["z"],
+ index=MultiIndex.from_arrays(
+ [[]] * 2, names=["y", "x"]))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_float_parser(all_parsers):
+ # see gh-9565
+ parser = all_parsers
+ data = "45e-1,4.5,45.,inf,-inf"
+ result = parser.read_csv(StringIO(data), header=None)
+
+ expected = DataFrame([[float(s) for s in data.split(",")]])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_scientific_no_exponent(all_parsers):
+ # see gh-12215
+ df = DataFrame.from_dict(OrderedDict([("w", ["2e"]), ("x", ["3E"]),
+ ("y", ["42e"]),
+ ("z", ["632E"])]))
+ data = df.to_csv(index=False)
+ parser = all_parsers
+
+ for precision in parser.float_precision_choices:
+ df_roundtrip = parser.read_csv(StringIO(data),
+ float_precision=precision)
+ tm.assert_frame_equal(df_roundtrip, df)
+
+
[email protected]("conv", [None, np.int64, np.uint64])
+def test_int64_overflow(all_parsers, conv):
+ data = """ID
+00013007854817840016671868
+00013007854817840016749251
+00013007854817840016754630
+00013007854817840016781876
+00013007854817840017028824
+00013007854817840017963235
+00013007854817840018860166"""
+ parser = all_parsers
+
+ if conv is None:
+ # 13007854817840016671868 > UINT64_MAX, so this
+ # will overflow and return object as the dtype.
+ result = parser.read_csv(StringIO(data))
+ expected = DataFrame(["00013007854817840016671868",
+ "00013007854817840016749251",
+ "00013007854817840016754630",
+ "00013007854817840016781876",
+ "00013007854817840017028824",
+ "00013007854817840017963235",
+ "00013007854817840018860166"], columns=["ID"])
+ tm.assert_frame_equal(result, expected)
+ else:
+ # 13007854817840016671868 > UINT64_MAX, so attempts
+ # to cast to either int64 or uint64 will result in
+ # an OverflowError being raised.
+ msg = ("(Python int too large to convert to C long)|"
+ "(long too big to convert)|"
+ "(int too big to convert)")
+
+ with pytest.raises(OverflowError, match=msg):
+ parser.read_csv(StringIO(data), converters={"ID": conv})
+
+
[email protected]("val", [
+    np.iinfo(np.uint64).max,
+ np.iinfo(np.int64).max,
+ np.iinfo(np.int64).min
+])
+def test_int64_uint64_range(all_parsers, val):
+    # These numbers fall right inside the int64-uint64
+    # range, so they should be parsed numerically rather
+    # than falling back to string.
+ parser = all_parsers
+ result = parser.read_csv(StringIO(str(val)), header=None)
+
+ expected = DataFrame([val])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("val", [
+    np.iinfo(np.uint64).max + 1,
+ np.iinfo(np.int64).min - 1
+])
+def test_outside_int64_uint64_range(all_parsers, val):
+ # These numbers fall just outside the int64-uint64
+ # range, so they should be parsed as string.
+ parser = all_parsers
+ result = parser.read_csv(StringIO(str(val)), header=None)
+
+ expected = DataFrame([str(val)])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("exp_data", [[str(-1), str(2**63)],
+ [str(2**63), str(-1)]])
+def test_numeric_range_too_wide(all_parsers, exp_data):
+    # No numerical dtype can hold both negative and uint64
+    # values, so the data should be cast to string.
+ parser = all_parsers
+ data = "\n".join(exp_data)
+ expected = DataFrame(exp_data)
+
+ result = parser.read_csv(StringIO(data), header=None)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("iterator", [True, False])
+def test_empty_with_nrows_chunksize(all_parsers, iterator):
+ # see gh-9535
+ parser = all_parsers
+ expected = DataFrame([], columns=["foo", "bar"])
+
+ nrows = 10
+ data = StringIO("foo,bar\n")
+
+ if iterator:
+ result = next(iter(parser.read_csv(data, chunksize=nrows)))
+ else:
+ result = parser.read_csv(data, nrows=nrows)
+
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,kwargs,expected,msg", [
+ # gh-10728: WHITESPACE_LINE
+ ("a,b,c\n4,5,6\n ", dict(),
+ DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),
+
+ # gh-10548: EAT_LINE_COMMENT
+ ("a,b,c\n4,5,6\n#comment", dict(comment="#"),
+ DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),
+
+ # EAT_CRNL_NOP
+ ("a,b,c\n4,5,6\n\r", dict(),
+ DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),
+
+ # EAT_COMMENT
+ ("a,b,c\n4,5,6#comment", dict(comment="#"),
+ DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),
+
+ # SKIP_LINE
+ ("a,b,c\n4,5,6\nskipme", dict(skiprows=[2]),
+ DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),
+
+ # EAT_LINE_COMMENT
+ ("a,b,c\n4,5,6\n#comment", dict(comment="#", skip_blank_lines=False),
+ DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),
+
+ # IN_FIELD
+ ("a,b,c\n4,5,6\n ", dict(skip_blank_lines=False),
+ DataFrame([["4", 5, 6], [" ", None, None]],
+ columns=["a", "b", "c"]), None),
+
+ # EAT_CRNL
+ ("a,b,c\n4,5,6\n\r", dict(skip_blank_lines=False),
+ DataFrame([[4, 5, 6], [None, None, None]],
+ columns=["a", "b", "c"]), None),
+
+ # ESCAPED_CHAR
+ ("a,b,c\n4,5,6\n\\", dict(escapechar="\\"),
+ None, "(EOF following escape character)|(unexpected end of data)"),
+
+ # ESCAPE_IN_QUOTED_FIELD
+ ('a,b,c\n4,5,6\n"\\', dict(escapechar="\\"),
+ None, "(EOF inside string starting at row 2)|(unexpected end of data)"),
+
+ # IN_QUOTED_FIELD
+ ('a,b,c\n4,5,6\n"', dict(escapechar="\\"),
+ None, "(EOF inside string starting at row 2)|(unexpected end of data)"),
+], ids=["whitespace-line", "eat-line-comment", "eat-crnl-nop", "eat-comment",
+ "skip-line", "eat-line-comment", "in-field", "eat-crnl",
+ "escaped-char", "escape-in-quoted-field", "in-quoted-field"])
+def test_eof_states(all_parsers, data, kwargs, expected, msg):
+ # see gh-10728, gh-10548
+ parser = all_parsers
+
+ if expected is None:
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(data), **kwargs)
+ else:
+ result = parser.read_csv(StringIO(data), **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("usecols", [None, [0, 1], ["a", "b"]])
+def test_uneven_lines_with_usecols(all_parsers, usecols):
+ # see gh-12203
+ parser = all_parsers
+ data = r"""a,b,c
+0,1,2
+3,4,5,6,7
+8,9,10"""
+
+ if usecols is None:
+ # Make sure that an error is still raised
+ # when the "usecols" parameter is not provided.
+ msg = r"Expected \d+ fields in line \d+, saw \d+"
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(data))
+ else:
+ expected = DataFrame({
+ "a": [0, 3, 8],
+ "b": [1, 4, 9]
+ })
+
+ result = parser.read_csv(StringIO(data), usecols=usecols)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,kwargs,expected", [
+    # First, check that the parser raises the correct error when
+    # given no columns to parse, with or without usecols.
+ ("", dict(), None),
+ ("", dict(usecols=["X"]), None),
+ (",,", dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]),
+ DataFrame(columns=["X"], index=[0], dtype=np.float64)),
+ ("", dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]),
+ DataFrame(columns=["X"])),
+])
+def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
+ # see gh-12493
+ parser = all_parsers
+
+ if expected is None:
+ msg = "No columns to parse from file"
+ with pytest.raises(EmptyDataError, match=msg):
+ parser.read_csv(StringIO(data), **kwargs)
+ else:
+ result = parser.read_csv(StringIO(data), **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs,expected", [
+ # gh-8661, gh-8679: this should ignore six lines, including
+ # lines with trailing whitespace and blank lines.
+ (dict(header=None, delim_whitespace=True, skiprows=[0, 1, 2, 3, 5, 6],
+ skip_blank_lines=True), DataFrame([[1., 2., 4.],
+ [5.1, np.nan, 10.]])),
+
+ # gh-8983: test skipping set of rows after a row with trailing spaces.
+ (dict(delim_whitespace=True, skiprows=[1, 2, 3, 5, 6],
+ skip_blank_lines=True), DataFrame({"A": [1., 5.1],
+ "B": [2., np.nan],
+ "C": [4., 10]})),
+])
+def test_trailing_spaces(all_parsers, kwargs, expected):
+ data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_raise_on_sep_with_delim_whitespace(all_parsers):
+ # see gh-6607
+ data = "a b c\n1 2 3"
+ parser = all_parsers
+
+ with pytest.raises(ValueError, match="you can only specify one"):
+ parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
+
+
[email protected]("delim_whitespace", [True, False])
+def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
+ # see gh-9710
+ parser = all_parsers
+ data = """\
+MyColumn
+a
+b
+a
+b\n"""
+
+ expected = DataFrame({"MyColumn": list("abab")})
+ result = parser.read_csv(StringIO(data), skipinitialspace=True,
+ delim_whitespace=delim_whitespace)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("sep,skip_blank_lines,exp_data", [
+ (",", True, [[1., 2., 4.], [5., np.nan, 10.], [-70., .4, 1.]]),
+ (r"\s+", True, [[1., 2., 4.], [5., np.nan, 10.], [-70., .4, 1.]]),
+ (",", False, [[1., 2., 4.], [np.nan, np.nan, np.nan],
+ [np.nan, np.nan, np.nan], [5., np.nan, 10.],
+ [np.nan, np.nan, np.nan], [-70., .4, 1.]]),
+])
+def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data):
+ parser = all_parsers
+ data = """\
+A,B,C
+1,2.,4.
+
+
+5.,NaN,10.0
+
+-70,.4,1
+"""
+
+ if sep == r"\s+":
+ data = data.replace(",", " ")
+
+ result = parser.read_csv(StringIO(data), sep=sep,
+ skip_blank_lines=skip_blank_lines)
+ expected = DataFrame(exp_data, columns=["A", "B", "C"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_whitespace_lines(all_parsers):
+ parser = all_parsers
+ data = """
+
+\t \t\t
+\t
+A,B,C
+\t 1,2.,4.
+5.,NaN,10.0
+"""
+ expected = DataFrame([[1, 2., 4.], [5., np.nan, 10.]],
+ columns=["A", "B", "C"])
+ result = parser.read_csv(StringIO(data))
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,expected", [
+ (""" A B C D
+a 1 2 3 4
+b 1 2 3 4
+c 1 2 3 4
+""", DataFrame([[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
+ columns=["A", "B", "C", "D"], index=["a", "b", "c"])),
+ (" a b c\n1 2 3 \n4 5 6\n 7 8 9",
+ DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])),
+])
+def test_whitespace_regex_separator(all_parsers, data, expected):
+ # see gh-6607
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), sep=r"\s+")
+ tm.assert_frame_equal(result, expected)
+
+
+def test_verbose_read(all_parsers, capsys):
+ parser = all_parsers
+ data = """a,b,c,d
+one,1,2,3
+one,1,2,3
+,1,2,3
+one,1,2,3
+,1,2,3
+,1,2,3
+one,1,2,3
+two,1,2,3"""
+
+ # Engines are verbose in different ways.
+ parser.read_csv(StringIO(data), verbose=True)
+ captured = capsys.readouterr()
+
+ if parser.engine == "c":
+ assert "Tokenization took:" in captured.out
+ assert "Parser memory cleanup took:" in captured.out
+ else: # Python engine
+ assert captured.out == "Filled 3 NA values in column a\n"
+
+
+def test_verbose_read2(all_parsers, capsys):
+ parser = all_parsers
+ data = """a,b,c,d
+one,1,2,3
+two,1,2,3
+three,1,2,3
+four,1,2,3
+five,1,2,3
+,1,2,3
+seven,1,2,3
+eight,1,2,3"""
+
+ parser.read_csv(StringIO(data), verbose=True, index_col=0)
+ captured = capsys.readouterr()
+
+ # Engines are verbose in different ways.
+ if parser.engine == "c":
+ assert "Tokenization took:" in captured.out
+ assert "Parser memory cleanup took:" in captured.out
+ else: # Python engine
+ assert captured.out == "Filled 1 NA values in column a\n"
+
+
+def test_iteration_open_handle(all_parsers):
+ parser = all_parsers
+ kwargs = dict(squeeze=True, header=None)
+
+ with tm.ensure_clean() as path:
+ with open(path, "wb" if compat.PY2 else "w") as f:
+ f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")
+
+ with open(path, "rb" if compat.PY2 else "r") as f:
+ for line in f:
+ if "CCC" in line:
+ break
+
+ if parser.engine == "c" and compat.PY2:
+ msg = "Mixing iteration and read methods would lose data"
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(f, **kwargs)
+ else:
+ result = parser.read_csv(f, **kwargs)
+ expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0)
+ tm.assert_series_equal(result, expected)
+
+
[email protected]("data,thousands,decimal", [
+ ("""A|B|C
+1|2,334.01|5
+10|13|10.
+""", ",", "."),
+ ("""A|B|C
+1|2.334,01|5
+10|13|10,
+""", ".", ","),
+])
+def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
+ parser = all_parsers
+ expected = DataFrame({
+ "A": [1, 10],
+ "B": [2334.01, 13],
+ "C": [5, 10.]
+ })
+
+ result = parser.read_csv(StringIO(data), sep="|",
+ thousands=thousands,
+ decimal=decimal)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_euro_decimal_format(all_parsers):
+ parser = all_parsers
+ data = """Id;Number1;Number2;Text1;Text2;Number3
+1;1521,1541;187101,9543;ABC;poi;4,738797819
+2;121,12;14897,76;DEF;uyt;0,377320872
+3;878,158;108013,434;GHI;rez;2,735694704"""
+
+ result = parser.read_csv(StringIO(data), sep=";", decimal=",")
+ expected = DataFrame([
+ [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
+ [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
+ [3, 878.158, 108013.434, "GHI", "rez", 2.735694704]
+ ], columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("na_filter", [True, False])
+def test_inf_parsing(all_parsers, na_filter):
+ parser = all_parsers
+ data = """\
+,A
+a,inf
+b,-inf
+c,+Inf
+d,-Inf
+e,INF
+f,-INF
+g,+INf
+h,-INf
+i,inF
+j,-inF"""
+ expected = DataFrame({"A": [float("inf"), float("-inf")] * 5},
+ index=["a", "b", "c", "d", "e",
+ "f", "g", "h", "i", "j"])
+ result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("nrows", [0, 1, 2, 3, 4, 5])
+def test_raise_on_no_columns(all_parsers, nrows):
+ parser = all_parsers
+ data = "\n" * nrows
+
+ msg = "No columns to parse from file"
+ with pytest.raises(EmptyDataError, match=msg):
+ parser.read_csv(StringIO(data))
+
+
+def test_memory_map(all_parsers, csv_dir_path):
+ mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
+ parser = all_parsers
+
+ expected = DataFrame({
+ "a": [1, 2, 3],
+ "b": ["one", "two", "three"],
+ "c": ["I", "II", "III"]
+ })
+
+ result = parser.read_csv(mmap_file, memory_map=True)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_null_byte_char(all_parsers):
+ # see gh-2741
+ data = "\x00,foo"
+ names = ["a", "b"]
+ parser = all_parsers
+
+ if parser.engine == "c":
+ expected = DataFrame([[np.nan, "foo"]], columns=names)
+ out = parser.read_csv(StringIO(data), names=names)
+ tm.assert_frame_equal(out, expected)
+ else:
+ msg = "NULL byte detected"
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(data), names=names)
+
+
[email protected]("data,kwargs,expected", [
+ # Basic test
+ ("a\n1", dict(), DataFrame({"a": [1]})),
+
+ # "Regular" quoting
+ ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})),
+
+    # BOM in a data row instead of the header.
+ ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})),
+
+    # BOM in an empty data row, with blank-line skipping.
+ ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})),
+
+    # BOM in an empty data row, without blank-line skipping.
+ ("\n1", dict(names=["a"], skip_blank_lines=False),
+ DataFrame({"a": [np.nan, 1]})),
+])
+def test_utf8_bom(all_parsers, data, kwargs, expected):
+ # see gh-4793
+ parser = all_parsers
+ bom = u("\ufeff")
+ utf8 = "utf-8"
+
+ def _encode_data_with_bom(_data):
+ bom_data = (bom + _data).encode(utf8)
+ return BytesIO(bom_data)
+
+ result = parser.read_csv(_encode_data_with_bom(data),
+ encoding=utf8, **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_temporary_file(all_parsers):
+ # see gh-13398
+ parser = all_parsers
+ data = "0 0"
+
+ new_file = TemporaryFile("w+")
+ new_file.write(data)
+ new_file.flush()
+ new_file.seek(0)
+
+ result = parser.read_csv(new_file, sep=r"\s+", header=None)
+ new_file.close()
+
+ expected = DataFrame([[0, 0]])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("byte", [8, 16])
[email protected]("fmt", ["utf-{0}", "utf_{0}",
+ "UTF-{0}", "UTF_{0}"])
+def test_read_csv_utf_aliases(all_parsers, byte, fmt):
+ # see gh-13549
+ expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
+ parser = all_parsers
+
+ encoding = fmt.format(byte)
+ data = "mb_num,multibyte\n4.8,test".encode(encoding)
+
+ result = parser.read_csv(BytesIO(data), encoding=encoding)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_internal_eof_byte(all_parsers):
+ # see gh-5500
+ parser = all_parsers
+ data = "a,b\n1\x1a,2"
+
+ expected = DataFrame([["1\x1a", 2]], columns=["a", "b"])
+ result = parser.read_csv(StringIO(data))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_internal_eof_byte_to_file(all_parsers):
+ # see gh-16559
+ parser = all_parsers
+ data = b'c1,c2\r\n"test \x1a test", test\r\n'
+ expected = DataFrame([["test \x1a test", " test"]],
+ columns=["c1", "c2"])
+ path = "__%s__.csv" % tm.rands(10)
+
+ with tm.ensure_clean(path) as path:
+ with open(path, "wb") as f:
+ f.write(data)
+
+ result = parser.read_csv(path)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_sub_character(all_parsers, csv_dir_path):
+ # see gh-16893
+ filename = os.path.join(csv_dir_path, "sub_char.csv")
+ expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
+
+ parser = all_parsers
+ result = parser.read_csv(filename)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_file_handle_string_io(all_parsers):
+ # gh-14418
+ #
+    # Don't close user-provided file handles.
+ parser = all_parsers
+ data = "a,b\n1,2"
+
+ fh = StringIO(data)
+ parser.read_csv(fh)
+ assert not fh.closed
+
+
+def test_file_handles_with_open(all_parsers, csv1):
+ # gh-14418
+ #
+    # Don't close user-provided file handles.
+ parser = all_parsers
+
+ with open(csv1, "r") as f:
+ parser.read_csv(f)
+ assert not f.closed
+
+
+def test_invalid_file_buffer_class(all_parsers):
+ # see gh-15337
+ class InvalidBuffer(object):
+ pass
+
+ parser = all_parsers
+ msg = "Invalid file path or buffer object type"
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(InvalidBuffer())
+
+
+def test_invalid_file_buffer_mock(all_parsers):
+ # see gh-15337
+ parser = all_parsers
+ msg = "Invalid file path or buffer object type"
+
+    class Foo(object):
+ pass
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(Foo())
+
+
+def test_valid_file_buffer_seems_invalid(all_parsers):
+ # gh-16135: we want to ensure that "tell" and "seek"
+ # aren't actually being used when we call `read_csv`
+ #
+ # Thus, while the object may look "invalid" (these
+ # methods are attributes of the `StringIO` class),
+ # it is still a valid file-object for our purposes.
+ class NoSeekTellBuffer(StringIO):
+ def tell(self):
+ raise AttributeError("No tell method")
+
+ def seek(self, pos, whence=0):
+ raise AttributeError("No seek method")
+
+ data = "a\n1"
+ parser = all_parsers
+ expected = DataFrame({"a": [1]})
+
+ result = parser.read_csv(NoSeekTellBuffer(data))
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs", [
+ dict(), # Default is True.
+ dict(error_bad_lines=True), # Explicitly pass in.
+])
[email protected]("warn_kwargs", [
+ dict(), dict(warn_bad_lines=True),
+ dict(warn_bad_lines=False)
+])
+def test_error_bad_lines(all_parsers, kwargs, warn_kwargs):
+ # see gh-15925
+ parser = all_parsers
+ kwargs.update(**warn_kwargs)
+ data = "a\n1\n1,2,3\n4\n5,6,7"
+
+ msg = "Expected 1 fields in line 3, saw 3"
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(data), **kwargs)
+
+
+def test_warn_bad_lines(all_parsers, capsys):
+ # see gh-15925
+ parser = all_parsers
+ data = "a\n1\n1,2,3\n4\n5,6,7"
+ expected = DataFrame({"a": [1, 4]})
+
+ result = parser.read_csv(StringIO(data),
+ error_bad_lines=False,
+ warn_bad_lines=True)
+ tm.assert_frame_equal(result, expected)
+
+ captured = capsys.readouterr()
+ assert "Skipping line 3" in captured.err
+ assert "Skipping line 5" in captured.err
+
+
+def test_suppress_error_output(all_parsers, capsys):
+ # see gh-15925
+ parser = all_parsers
+ data = "a\n1\n1,2,3\n4\n5,6,7"
+ expected = DataFrame({"a": [1, 4]})
+
+ result = parser.read_csv(StringIO(data),
+ error_bad_lines=False,
+ warn_bad_lines=False)
+ tm.assert_frame_equal(result, expected)
+
+ captured = capsys.readouterr()
+ assert captured.err == ""
+
+
+def test_filename_with_special_chars(all_parsers):
+ # see gh-15086.
+ parser = all_parsers
+ df = DataFrame({"a": [1, 2, 3]})
+
+ with tm.ensure_clean("sé-es-vé.csv") as path:
+ df.to_csv(path, index=False)
+
+ result = parser.read_csv(path)
+ tm.assert_frame_equal(result, df)
+
+
+def test_read_csv_memory_growth_chunksize(all_parsers):
+ # see gh-24805
+ #
+ # Let's just make sure that we don't crash
+ # as we iteratively process all chunks.
+ parser = all_parsers
+
+ with tm.ensure_clean() as path:
+ with open(path, "w") as f:
+ for i in range(1000):
+ f.write(str(i) + "\n")
+
+ result = parser.read_csv(path, chunksize=20)
+
+ for _ in result:
+ pass
+
+
+def test_read_table_deprecated(all_parsers):
+ # see gh-21948
+ parser = all_parsers
+ data = "a\tb\n1\t2\n3\t4"
+ expected = parser.read_csv(StringIO(data), sep="\t")
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = parser.read_table(StringIO(data))
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_compression.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_compression.py
new file mode 100644
index 00000000000..6e615e795e5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_compression.py
@@ -0,0 +1,154 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests compressed data parsing functionality for all
+of the parsers defined in parsers.py
+"""
+
+import os
+import zipfile
+
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
[email protected](params=[True, False])
+def buffer(request):
+ return request.param
+
+
[email protected]
+def parser_and_data(all_parsers, csv1):
+ parser = all_parsers
+
+ with open(csv1, "rb") as f:
+ data = f.read()
+ expected = parser.read_csv(csv1)
+
+ return parser, data, expected
+
+
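+# "zip2" exercises passing an open binary handle with an explicit
+# compression="zip" instead of inferring it from the file name.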
[email protected]("compression", ["zip", "infer", "zip2"])
+def test_zip(parser_and_data, compression):
+ parser, data, expected = parser_and_data
+
+ with tm.ensure_clean("test_file.zip") as path:
+ with zipfile.ZipFile(path, mode="w") as tmp:
+ tmp.writestr("test_file", data)
+
+ if compression == "zip2":
+ with open(path, "rb") as f:
+ result = parser.read_csv(f, compression="zip")
+ else:
+ result = parser.read_csv(path, compression=compression)
+
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("compression", ["zip", "infer"])
+def test_zip_error_multiple_files(parser_and_data, compression):
+ parser, data, expected = parser_and_data
+
+ with tm.ensure_clean("combined_zip.zip") as path:
+ inner_file_names = ["test_file", "second_file"]
+
+ with zipfile.ZipFile(path, mode="w") as tmp:
+ for file_name in inner_file_names:
+ tmp.writestr(file_name, data)
+
+ with pytest.raises(ValueError, match="Multiple files"):
+ parser.read_csv(path, compression=compression)
+
+
+def test_zip_error_no_files(parser_and_data):
+ parser, _, _ = parser_and_data
+
+ with tm.ensure_clean() as path:
+ with zipfile.ZipFile(path, mode="w"):
+ pass
+
+ with pytest.raises(ValueError, match="Zero files"):
+ parser.read_csv(path, compression="zip")
+
+
+def test_zip_error_invalid_zip(parser_and_data):
+ parser, _, _ = parser_and_data
+
+ with tm.ensure_clean() as path:
+ with open(path, "wb") as f:
+ with pytest.raises(zipfile.BadZipfile,
+ match="File is not a zip file"):
+ parser.read_csv(f, compression="zip")
+
+
[email protected]("filename", [None, "test.{ext}"])
+def test_compression(parser_and_data, compression_only, buffer, filename):
+ parser, data, expected = parser_and_data
+ compress_type = compression_only
+
+ ext = "gz" if compress_type == "gzip" else compress_type
+ filename = filename if filename is None else filename.format(ext=ext)
+
+ if filename and buffer:
+ pytest.skip("Cannot deduce compression from "
+ "buffer of compressed data.")
+
+ with tm.ensure_clean(filename=filename) as path:
+ tm.write_to_compressed(compress_type, path, data)
+ compression = "infer" if filename else compress_type
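+        # A named file lets compression be inferred from its extension;
+        # an anonymous temp file needs the type passed explicitly.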
+
+ if buffer:
+ with open(path, "rb") as f:
+ result = parser.read_csv(f, compression=compression)
+ else:
+ result = parser.read_csv(path, compression=compression)
+
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("ext", [None, "gz", "bz2"])
+def test_infer_compression(all_parsers, csv1, buffer, ext):
+ # see gh-9770
+ parser = all_parsers
+ kwargs = dict(index_col=0, parse_dates=True)
+
+ expected = parser.read_csv(csv1, **kwargs)
+ kwargs["compression"] = "infer"
+
+ if buffer:
+ with open(csv1) as f:
+ result = parser.read_csv(f, **kwargs)
+ else:
+ ext = "." + ext if ext else ""
+ result = parser.read_csv(csv1 + ext, **kwargs)
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_compression_utf16_encoding(all_parsers, csv_dir_path):
+ # see gh-18071
+ parser = all_parsers
+ path = os.path.join(csv_dir_path, "utf16_ex_small.zip")
+
+ result = parser.read_csv(path, encoding="utf-16",
+ compression="zip", sep="\t")
+ expected = pd.DataFrame({
+ u"Country": [u"Venezuela", u"Venezuela"],
+ u"Twitter": [u"Hugo Chávez Frías", u"Henrique Capriles R."]
+ })
+
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("invalid_compression", ["sfark", "bz3", "zipper"])
+def test_invalid_compression(all_parsers, invalid_compression):
+ parser = all_parsers
+ compress_kwargs = dict(compression=invalid_compression)
+
+ msg = ("Unrecognized compression "
+ "type: {compression}".format(**compress_kwargs))
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv("test_file.zip", **compress_kwargs)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_converters.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_converters.py
new file mode 100644
index 00000000000..47bbae0274f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_converters.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests column conversion functionality during parsing
+for all of the parsers defined in parsers.py
+"""
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO, lmap, parse_date
+
+import pandas as pd
+from pandas import DataFrame, Index
+import pandas.util.testing as tm
+
+
+def test_converters_type_must_be_dict(all_parsers):
+ parser = all_parsers
+ data = """index,A,B,C,D
+foo,2,3,4,5
+"""
+
+ with pytest.raises(TypeError, match="Type converters.+"):
+ parser.read_csv(StringIO(data), converters=0)
+
+
[email protected]("column", [3, "D"])
[email protected]("converter", [
+ parse_date,
+ lambda x: int(x.split("/")[2]) # Produce integer.
+])
+def test_converters(all_parsers, column, converter):
+ parser = all_parsers
+ data = """A,B,C,D
+a,1,2,01/01/2009
+b,3,4,01/02/2009
+c,4,5,01/03/2009
+"""
+ result = parser.read_csv(StringIO(data), converters={column: converter})
+
+ expected = parser.read_csv(StringIO(data))
+ expected["D"] = expected["D"].map(converter)
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_converters_no_implicit_conv(all_parsers):
+ # see gh-2184
+ parser = all_parsers
+ data = """000102,1.2,A\n001245,2,B"""
+
+ converters = {0: lambda x: x.strip()}
+ result = parser.read_csv(StringIO(data), header=None,
+ converters=converters)
+
+    # Column 0 should not be cast to numeric and should remain as object.
+ expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_converters_euro_decimal_format(all_parsers):
+ # see gh-583
+ converters = dict()
+ parser = all_parsers
+
+ data = """Id;Number1;Number2;Text1;Text2;Number3
+1;1521,1541;187101,9543;ABC;poi;4,7387
+2;121,12;14897,76;DEF;uyt;0,3773
+3;878,158;108013,434;GHI;rez;2,7356"""
+ converters["Number1"] = converters["Number2"] =\
+ converters["Number3"] = lambda x: float(x.replace(",", "."))
+
+ result = parser.read_csv(StringIO(data), sep=";", converters=converters)
+ expected = DataFrame([[1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
+ [2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
+ [3, 878.158, 108013.434, "GHI", "rez", 2.7356]],
+ columns=["Id", "Number1", "Number2",
+ "Text1", "Text2", "Number3"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_converters_corner_with_nans(all_parsers):
+ parser = all_parsers
+ data = """id,score,days
+1,2,12
+2,2-5,
+3,,14+
+4,6-12,2"""
+
+ # Example converters.
+ def convert_days(x):
+ x = x.strip()
+
+ if not x:
+ return np.nan
+
+ is_plus = x.endswith("+")
+
+ if is_plus:
+ x = int(x[:-1]) + 1
+ else:
+ x = int(x)
+
+ return x
+
+ def convert_days_sentinel(x):
+ x = x.strip()
+
+ if not x:
+ return np.nan
+
+ is_plus = x.endswith("+")
+
+ if is_plus:
+ x = int(x[:-1]) + 1
+ else:
+ x = int(x)
+
+ return x
+
+ def convert_score(x):
+ x = x.strip()
+
+ if not x:
+ return np.nan
+
+ if x.find("-") > 0:
+ val_min, val_max = lmap(int, x.split("-"))
+ val = 0.5 * (val_min + val_max)
+ else:
+ val = float(x)
+
+ return val
+
+ results = []
+
+ for day_converter in [convert_days, convert_days_sentinel]:
+ result = parser.read_csv(StringIO(data),
+ converters={"score": convert_score,
+ "days": day_converter},
+ na_values=["", None])
+ assert pd.isna(result["days"][1])
+ results.append(result)
+
+ tm.assert_frame_equal(results[0], results[1])
+
+
+def test_converter_index_col_bug(all_parsers):
+ # see gh-1835
+ parser = all_parsers
+ data = "A;B\n1;2\n3;4"
+
+ rs = parser.read_csv(StringIO(data), sep=";", index_col="A",
+ converters={"A": lambda x: x})
+
+ xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A"))
+ tm.assert_frame_equal(rs, xp)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_dialect.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_dialect.py
new file mode 100644
index 00000000000..5392f793b36
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_dialect.py
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that dialects are properly handled during parsing
+for all of the parsers defined in parsers.py
+"""
+
+import csv
+
+import pytest
+
+from pandas.compat import StringIO
+from pandas.errors import ParserWarning
+
+from pandas import DataFrame
+import pandas.util.testing as tm
+
+
[email protected]
+def custom_dialect():
+ dialect_name = "weird"
+ dialect_kwargs = dict(doublequote=False, escapechar="~", delimiter=":",
+ skipinitialspace=False, quotechar="~", quoting=3)
+ return dialect_name, dialect_kwargs
+
+
+def test_dialect(all_parsers):
+ parser = all_parsers
+ data = """\
+label1,label2,label3
+index1,"a,c,e
+index2,b,d,f
+"""
+
+ dia = csv.excel()
+ dia.quoting = csv.QUOTE_NONE
+ df = parser.read_csv(StringIO(data), dialect=dia)
+
+ data = """\
+label1,label2,label3
+index1,a,c,e
+index2,b,d,f
+"""
+ exp = parser.read_csv(StringIO(data))
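+    # With QUOTE_NONE, the stray quote survives as part of the field.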
+ exp.replace("a", "\"a", inplace=True)
+ tm.assert_frame_equal(df, exp)
+
+
+def test_dialect_str(all_parsers):
+ dialect_name = "mydialect"
+ parser = all_parsers
+ data = """\
+fruit:vegetable
+apple:broccoli
+pear:tomato
+"""
+ exp = DataFrame({
+ "fruit": ["apple", "pear"],
+ "vegetable": ["broccoli", "tomato"]
+ })
+
+ with tm.with_csv_dialect(dialect_name, delimiter=":"):
+ df = parser.read_csv(StringIO(data), dialect=dialect_name)
+ tm.assert_frame_equal(df, exp)
+
+
+def test_invalid_dialect(all_parsers):
+ class InvalidDialect(object):
+ pass
+
+ data = "a\n1"
+ parser = all_parsers
+ msg = "Invalid dialect"
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), dialect=InvalidDialect)
+
+
[email protected]("arg", [None, "doublequote", "escapechar",
+ "skipinitialspace", "quotechar", "quoting"])
[email protected]("value", ["dialect", "default", "other"])
+def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect,
+ arg, value):
+ # see gh-23761.
+ dialect_name, dialect_kwargs = custom_dialect
+ parser = all_parsers
+
+ expected = DataFrame({"a": [1], "b": [2]})
+ data = "a:b\n1:2"
+
+ warning_klass = None
+ kwds = dict()
+
+ # arg=None tests when we pass in the dialect without any other arguments.
+ if arg is not None:
+ if "value" == "dialect": # No conflict --> no warning.
+ kwds[arg] = dialect_kwargs[arg]
+ elif "value" == "default": # Default --> no warning.
+ from pandas.io.parsers import _parser_defaults
+ kwds[arg] = _parser_defaults[arg]
+ else: # Non-default + conflict with dialect --> warning.
+ warning_klass = ParserWarning
+ kwds[arg] = "blah"
+
+ with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
+ with tm.assert_produces_warning(warning_klass):
+ result = parser.read_csv(StringIO(data),
+ dialect=dialect_name, **kwds)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs,warning_klass", [
+ (dict(sep=","), None), # sep is default --> sep_override=True
+ (dict(sep="."), ParserWarning), # sep isn't default --> sep_override=False
+ (dict(delimiter=":"), None), # No conflict
+ (dict(delimiter=None), None), # Default arguments --> sep_override=True
+ (dict(delimiter=","), ParserWarning), # Conflict
+ (dict(delimiter="."), ParserWarning), # Conflict
+], ids=["sep-override-true", "sep-override-false",
+ "delimiter-no-conflict", "delimiter-default-arg",
+ "delimiter-conflict", "delimiter-conflict2"])
+def test_dialect_conflict_delimiter(all_parsers, custom_dialect,
+ kwargs, warning_klass):
+ # see gh-23761.
+ dialect_name, dialect_kwargs = custom_dialect
+ parser = all_parsers
+
+ expected = DataFrame({"a": [1], "b": [2]})
+ data = "a:b\n1:2"
+
+ with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
+ with tm.assert_produces_warning(warning_klass):
+ result = parser.read_csv(StringIO(data),
+ dialect=dialect_name, **kwargs)
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_dtypes.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_dtypes.py
new file mode 100644
index 00000000000..caa03fc3685
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_dtypes.py
@@ -0,0 +1,514 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests dtype specification during parsing
+for all of the parsers defined in parsers.py
+"""
+
+import os
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO
+from pandas.errors import ParserWarning
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat)
+import pandas.util.testing as tm
+
+
[email protected]("dtype", [str, object])
[email protected]("check_orig", [True, False])
+def test_dtype_all_columns(all_parsers, dtype, check_orig):
+ # see gh-3795, gh-6607
+ parser = all_parsers
+
+ df = DataFrame(np.random.rand(5, 2).round(4), columns=list("AB"),
+ index=["1A", "1B", "1C", "1D", "1E"])
+
+ with tm.ensure_clean("__passing_str_as_dtype__.csv") as path:
+ df.to_csv(path)
+
+ result = parser.read_csv(path, dtype=dtype, index_col=0)
+
+ if check_orig:
+ expected = df.copy()
+ result = result.astype(float)
+ else:
+ expected = df.astype(str)
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_dtype_all_columns_empty(all_parsers):
+ # see gh-12048
+ parser = all_parsers
+ result = parser.read_csv(StringIO("A,B"), dtype=str)
+
+ expected = DataFrame({"A": [], "B": []}, index=[], dtype=str)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_dtype_per_column(all_parsers):
+ parser = all_parsers
+ data = """\
+one,two
+1,2.5
+2,3.5
+3,4.5
+4,5.5"""
+ expected = DataFrame([[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]],
+ columns=["one", "two"])
+ expected["one"] = expected["one"].astype(np.float64)
+ expected["two"] = expected["two"].astype(object)
+
+ result = parser.read_csv(StringIO(data), dtype={"one": np.float64,
+ 1: str})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_invalid_dtype_per_column(all_parsers):
+ parser = all_parsers
+ data = """\
+one,two
+1,2.5
+2,3.5
+3,4.5
+4,5.5"""
+
+ with pytest.raises(TypeError, match="data type 'foo' not understood"):
+ parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})
+
+
+ "category",
+ CategoricalDtype(),
+ {"a": "category",
+ "b": "category",
+ "c": CategoricalDtype()}
+])
+def test_categorical_dtype(all_parsers, dtype):
+ # see gh-10153
+ parser = all_parsers
+ data = """a,b,c
+1,a,3.4
+1,a,3.4
+2,b,4.5"""
+ expected = DataFrame({"a": Categorical(["1", "1", "2"]),
+ "b": Categorical(["a", "a", "b"]),
+ "c": Categorical(["3.4", "3.4", "4.5"])})
+ actual = parser.read_csv(StringIO(data), dtype=dtype)
+ tm.assert_frame_equal(actual, expected)
+
+
+ {"b": "category"},
+ {1: "category"}
+])
+def test_categorical_dtype_single(all_parsers, dtype):
+ # see gh-10153
+ parser = all_parsers
+ data = """a,b,c
+1,a,3.4
+1,a,3.4
+2,b,4.5"""
+ expected = DataFrame({"a": [1, 1, 2],
+ "b": Categorical(["a", "a", "b"]),
+ "c": [3.4, 3.4, 4.5]})
+ actual = parser.read_csv(StringIO(data), dtype=dtype)
+ tm.assert_frame_equal(actual, expected)
+
+
+def test_categorical_dtype_unsorted(all_parsers):
+ # see gh-10153
+ parser = all_parsers
+ data = """a,b,c
+1,b,3.4
+1,b,3.4
+2,a,4.5"""
+ expected = DataFrame({"a": Categorical(["1", "1", "2"]),
+ "b": Categorical(["b", "b", "a"]),
+ "c": Categorical(["3.4", "3.4", "4.5"])})
+ actual = parser.read_csv(StringIO(data), dtype="category")
+ tm.assert_frame_equal(actual, expected)
+
+
+def test_categorical_dtype_missing(all_parsers):
+ # see gh-10153
+ parser = all_parsers
+ data = """a,b,c
+1,b,3.4
+1,nan,3.4
+2,a,4.5"""
+ expected = DataFrame({"a": Categorical(["1", "1", "2"]),
+ "b": Categorical(["b", np.nan, "a"]),
+ "c": Categorical(["3.4", "3.4", "4.5"])})
+ actual = parser.read_csv(StringIO(data), dtype="category")
+ tm.assert_frame_equal(actual, expected)
+
+
+def test_categorical_dtype_high_cardinality_numeric(all_parsers):
+ # see gh-18186
+ parser = all_parsers
+ data = np.sort([str(i) for i in range(524289)])
+ expected = DataFrame({"a": Categorical(data, ordered=True)})
+
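+    # read_csv infers the categories itself here, so the parsed result is
+    # normalized below to sorted, ordered categories before comparison.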
+ actual = parser.read_csv(StringIO("a\n" + "\n".join(data)),
+ dtype="category")
+ actual["a"] = actual["a"].cat.reorder_categories(
+ np.sort(actual.a.cat.categories), ordered=True)
+ tm.assert_frame_equal(actual, expected)
+
+
+def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
+ # see gh-10153
+ pth = os.path.join(csv_dir_path, "unicode_series.csv")
+ parser = all_parsers
+ encoding = "latin-1"
+
+ expected = parser.read_csv(pth, header=None, encoding=encoding)
+ expected[1] = Categorical(expected[1])
+
+ actual = parser.read_csv(pth, header=None, encoding=encoding,
+ dtype={1: "category"})
+ tm.assert_frame_equal(actual, expected)
+
+
+def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
+ # see gh-10153
+ pth = os.path.join(csv_dir_path, "utf16_ex.txt")
+ parser = all_parsers
+ encoding = "utf-16"
+ sep = ","
+
+ expected = parser.read_csv(pth, sep=sep, encoding=encoding)
+ expected = expected.apply(Categorical)
+
+ actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
+ tm.assert_frame_equal(actual, expected)
+
+
+def test_categorical_dtype_chunksize_infer_categories(all_parsers):
+ # see gh-10153
+ parser = all_parsers
+ data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+ expecteds = [DataFrame({"a": [1, 1],
+ "b": Categorical(["a", "b"])}),
+ DataFrame({"a": [1, 2],
+ "b": Categorical(["b", "c"])},
+ index=[2, 3])]
+ actuals = parser.read_csv(StringIO(data), dtype={"b": "category"},
+ chunksize=2)
+
+ for actual, expected in zip(actuals, expecteds):
+ tm.assert_frame_equal(actual, expected)
+
+
+def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
+ # see gh-10153
+ parser = all_parsers
+ data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+ cats = ["a", "b", "c"]
+ expecteds = [DataFrame({"a": [1, 1],
+ "b": Categorical(["a", "b"],
+ categories=cats)}),
+ DataFrame({"a": [1, 2],
+ "b": Categorical(["b", "c"],
+ categories=cats)},
+ index=[2, 3])]
+ dtype = CategoricalDtype(cats)
+ actuals = parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2)
+
+ for actual, expected in zip(actuals, expecteds):
+ tm.assert_frame_equal(actual, expected)
+
+
[email protected]("ordered", [False, True])
[email protected]("categories", [
+ ["a", "b", "c"],
+ ["a", "c", "b"],
+ ["a", "b", "c", "d"],
+ ["c", "b", "a"],
+])
+def test_categorical_category_dtype(all_parsers, categories, ordered):
+ parser = all_parsers
+ data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+ expected = DataFrame({
+ "a": [1, 1, 1, 2],
+ "b": Categorical(["a", "b", "b", "c"],
+ categories=categories,
+ ordered=ordered)
+ })
+
+ dtype = {"b": CategoricalDtype(categories=categories,
+ ordered=ordered)}
+ result = parser.read_csv(StringIO(data), dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_category_dtype_unsorted(all_parsers):
+ parser = all_parsers
+ data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+ dtype = CategoricalDtype(["c", "b", "a"])
+ expected = DataFrame({
+ "a": [1, 1, 1, 2],
+ "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"])
+ })
+
+ result = parser.read_csv(StringIO(data), dtype={"b": dtype})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_coerces_numeric(all_parsers):
+ parser = all_parsers
+ dtype = {"b": CategoricalDtype([1, 2, 3])}
+
+ data = "b\n1\n1\n2\n3"
+ expected = DataFrame({"b": Categorical([1, 1, 2, 3])})
+
+ result = parser.read_csv(StringIO(data), dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_coerces_datetime(all_parsers):
+ parser = all_parsers
+ dtype = {"b": CategoricalDtype(pd.date_range("2017", "2019", freq="AS"))}
+
+ data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
+ expected = DataFrame({"b": Categorical(dtype["b"].categories)})
+
+ result = parser.read_csv(StringIO(data), dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_coerces_timestamp(all_parsers):
+ parser = all_parsers
+ dtype = {"b": CategoricalDtype([Timestamp("2014")])}
+
+ data = "b\n2014-01-01\n2014-01-01T00:00:00"
+ expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})
+
+ result = parser.read_csv(StringIO(data), dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_coerces_timedelta(all_parsers):
+ parser = all_parsers
+ dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))}
+
+ data = "b\n1H\n2H\n3H"
+ expected = DataFrame({"b": Categorical(dtype["b"].categories)})
+
+ result = parser.read_csv(StringIO(data), dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+
+ "b\nTrue\nFalse\nNA\nFalse",
+ "b\ntrue\nfalse\nNA\nfalse",
+ "b\nTRUE\nFALSE\nNA\nFALSE",
+ "b\nTrue\nFalse\nNA\nFALSE",
+])
+def test_categorical_dtype_coerces_boolean(all_parsers, data):
+ # see gh-20498
+ parser = all_parsers
+ dtype = {"b": CategoricalDtype([False, True])}
+ expected = DataFrame({"b": Categorical([True, False, None, False])})
+
+ result = parser.read_csv(StringIO(data), dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_unexpected_categories(all_parsers):
+ parser = all_parsers
+ dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}
+
+ data = "b\nd\na\nc\nd" # Unexpected c
+ expected = DataFrame({"b": Categorical(list("dacd"),
+ dtype=dtype["b"])})
+
+ result = parser.read_csv(StringIO(data), dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_pass_dtype(all_parsers):
+ parser = all_parsers
+
+ data = "one,two"
+ result = parser.read_csv(StringIO(data), dtype={"one": "u1"})
+
+ expected = DataFrame({"one": np.empty(0, dtype="u1"),
+ "two": np.empty(0, dtype=np.object)},
+ index=Index([], dtype=object))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_with_index_pass_dtype(all_parsers):
+ parser = all_parsers
+
+ data = "one,two"
+ result = parser.read_csv(StringIO(data), index_col=["one"],
+ dtype={"one": "u1", 1: "f"})
+
+ expected = DataFrame({"two": np.empty(0, dtype="f")},
+ index=Index([], dtype="u1", name="one"))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_with_multi_index_pass_dtype(all_parsers):
+ parser = all_parsers
+
+ data = "one,two,three"
+ result = parser.read_csv(StringIO(data), index_col=["one", "two"],
+ dtype={"one": "u1", 1: "f8"})
+
+ exp_idx = MultiIndex.from_arrays([np.empty(0, dtype="u1"),
+ np.empty(0, dtype=np.float64)],
+ names=["one", "two"])
+ expected = DataFrame({"three": np.empty(0, dtype=np.object)},
+ index=exp_idx)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
+ parser = all_parsers
+
+ data = "one,one"
+ result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"})
+
+ expected = DataFrame({"one": np.empty(0, dtype="u1"),
+ "one.1": np.empty(0, dtype="f")},
+ index=Index([], dtype=object))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
+ parser = all_parsers
+
+ data = "one,one"
+ result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
+
+ expected = DataFrame({"one": np.empty(0, dtype="u1"),
+ "one.1": np.empty(0, dtype="f")},
+ index=Index([], dtype=object))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
+ # see gh-9424
+ parser = all_parsers
+ expected = concat([Series([], name="one", dtype="u1"),
+ Series([], name="one.1", dtype="f")], axis=1)
+ expected.index = expected.index.astype(object)
+
+ data = "one,one"
+ result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_with_dup_column_pass_dtype_by_indexes_warn(all_parsers):
+ # see gh-9424
+ parser = all_parsers
+ expected = concat([Series([], name="one", dtype="u1"),
+ Series([], name="one.1", dtype="f")], axis=1)
+ expected.index = expected.index.astype(object)
+
+ with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+ data = ""
+ result = parser.read_csv(StringIO(data), names=["one", "one"],
+ dtype={0: "u1", 1: "f"})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_raise_on_passed_int_dtype_with_nas(all_parsers):
+ # see gh-2631
+ parser = all_parsers
+ data = """YEAR, DOY, a
+2001,106380451,10
+2001,,11
+2001,106380451,67"""
+
+ msg = ("Integer column has NA values" if parser.engine == "c" else
+ "Unable to convert column DOY")
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), dtype={"DOY": np.int64},
+ skipinitialspace=True)
+
+
+def test_dtype_with_converters(all_parsers):
+ parser = all_parsers
+ data = """a,b
+1.1,2.2
+1.2,2.3"""
+
+    # Dtype spec ignored if converters specified.
+ with tm.assert_produces_warning(ParserWarning):
+ result = parser.read_csv(StringIO(data), dtype={"a": "i8"},
+ converters={"a": lambda x: str(x)})
+ expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]})
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("dtype,expected", [
+ (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
+ ("category", DataFrame({"a": Categorical([]),
+ "b": Categorical([])},
+ index=[])),
+ (dict(a="category", b="category"),
+ DataFrame({"a": Categorical([]),
+ "b": Categorical([])},
+ index=[])),
+ ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
+ ("timedelta64[ns]", DataFrame({"a": Series([], dtype="timedelta64[ns]"),
+ "b": Series([], dtype="timedelta64[ns]")},
+ index=[])),
+ (dict(a=np.int64,
+ b=np.int32), DataFrame({"a": Series([], dtype=np.int64),
+ "b": Series([], dtype=np.int32)},
+ index=[])),
+ ({0: np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64),
+ "b": Series([], dtype=np.int32)},
+ index=[])),
+ ({"a": np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64),
+ "b": Series([], dtype=np.int32)},
+ index=[])),
+])
+def test_empty_dtype(all_parsers, dtype, expected):
+ # see gh-14712
+ parser = all_parsers
+ data = "a,b"
+
+ result = parser.read_csv(StringIO(data), header=0, dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("dtype", list(np.typecodes["AllInteger"] +
+ np.typecodes["Float"]))
+def test_numeric_dtype(all_parsers, dtype):
+ data = "0\n1"
+ parser = all_parsers
+ expected = DataFrame([0, 1], dtype=dtype)
+
+ result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
+ tm.assert_frame_equal(expected, result)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_header.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_header.py
new file mode 100644
index 00000000000..38f4cc42357
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_header.py
@@ -0,0 +1,428 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that the file header is properly handled or inferred
+during parsing for all of the parsers defined in parsers.py
+"""
+
+from collections import namedtuple
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO, u
+from pandas.errors import ParserError
+
+from pandas import DataFrame, Index, MultiIndex
+import pandas.util.testing as tm
+
+
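+# Header semantics in brief (pandas 0.24-era API assumed): ``header=N`` uses
+# row N as the column names, ``header=None`` autogenerates integer names
+# (optionally prefixed via ``prefix``), and ``header=[0, 1, ...]`` builds a
+# column MultiIndex, e.g.
+#
+#     pd.read_csv(StringIO("a,b\n1,2"), header=0)     # columns ["a", "b"]
+#     pd.read_csv(StringIO("1,2\n3,4"), header=None)  # columns [0, 1]
+
+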
+def test_read_with_bad_header(all_parsers):
+ parser = all_parsers
+ msg = r"but only \d+ lines in file"
+
+ with pytest.raises(ValueError, match=msg):
+ s = StringIO(",,")
+ parser.read_csv(s, header=[10])
+
+
[email protected]("header", [True, False])
+def test_bool_header_arg(all_parsers, header):
+ # see gh-6114
+ parser = all_parsers
+ data = """\
+MyColumn
+a
+b
+a
+b"""
+ msg = "Passing a bool to header is invalid"
+ with pytest.raises(TypeError, match=msg):
+ parser.read_csv(StringIO(data), header=header)
+
+
+def test_no_header_prefix(all_parsers):
+ parser = all_parsers
+ data = """1,2,3,4,5
+6,7,8,9,10
+11,12,13,14,15
+"""
+ result = parser.read_csv(StringIO(data), prefix="Field", header=None)
+ expected = DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10],
+ [11, 12, 13, 14, 15]],
+ columns=["Field0", "Field1", "Field2",
+ "Field3", "Field4"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_header_with_index_col(all_parsers):
+ parser = all_parsers
+ data = """foo,1,2,3
+bar,4,5,6
+baz,7,8,9
+"""
+ names = ["A", "B", "C"]
+ result = parser.read_csv(StringIO(data), names=names)
+
+ expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ index=["foo", "bar", "baz"],
+ columns=["A", "B", "C"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_header_not_first_line(all_parsers):
+ parser = all_parsers
+ data = """got,to,ignore,this,line
+got,to,ignore,this,line
+index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+"""
+ data2 = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+"""
+
+ result = parser.read_csv(StringIO(data), header=2, index_col=0)
+ expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_header_multi_index(all_parsers):
+ parser = all_parsers
+ expected = tm.makeCustomDataframe(
+ 5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
+
+ data = """\
+C0,,C_l0_g0,C_l0_g1,C_l0_g2
+
+C1,,C_l1_g0,C_l1_g1,C_l1_g2
+C2,,C_l2_g0,C_l2_g1,C_l2_g2
+C3,,C_l3_g0,C_l3_g1,C_l3_g2
+R0,R1,,,
+R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
+R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
+R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
+R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
+R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
+"""
+ result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3],
+ index_col=[0, 1])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs,msg", [
+ (dict(index_col=["foo", "bar"]), ("index_col must only contain "
+ "row numbers when specifying "
+ "a multi-index header")),
+ (dict(index_col=[0, 1], names=["foo", "bar"]), ("cannot specify names "
+ "when specifying a "
+ "multi-index header")),
+ (dict(index_col=[0, 1], usecols=["foo", "bar"]), ("cannot specify "
+ "usecols when "
+ "specifying a "
+ "multi-index header")),
+])
+def test_header_multi_index_invalid(all_parsers, kwargs, msg):
+ data = """\
+C0,,C_l0_g0,C_l0_g1,C_l0_g2
+
+C1,,C_l1_g0,C_l1_g1,C_l1_g2
+C2,,C_l2_g0,C_l2_g1,C_l2_g2
+C3,,C_l3_g0,C_l3_g1,C_l3_g2
+R0,R1,,,
+R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
+R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
+R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
+R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
+R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
+"""
+ parser = all_parsers
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)
+
+
+_TestTuple = namedtuple("names", ["first", "second"])
+
+
[email protected]("kwargs", [
+ dict(header=[0, 1]),
+ dict(skiprows=3,
+ names=[("a", "q"), ("a", "r"), ("a", "s"),
+ ("b", "t"), ("c", "u"), ("c", "v")]),
+ dict(skiprows=3,
+ names=[_TestTuple("a", "q"), _TestTuple("a", "r"),
+ _TestTuple("a", "s"), _TestTuple("b", "t"),
+ _TestTuple("c", "u"), _TestTuple("c", "v")])
+])
+def test_header_multi_index_common_format1(all_parsers, kwargs):
+ parser = all_parsers
+ expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
+ index=["one", "two"],
+ columns=MultiIndex.from_tuples(
+ [("a", "q"), ("a", "r"), ("a", "s"),
+ ("b", "t"), ("c", "u"), ("c", "v")]))
+ data = """,a,a,a,b,c,c
+,q,r,s,t,u,v
+,,,,,,
+one,1,2,3,4,5,6
+two,7,8,9,10,11,12"""
+
+ result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs", [
+ dict(header=[0, 1]),
+ dict(skiprows=2,
+ names=[("a", "q"), ("a", "r"), ("a", "s"),
+ ("b", "t"), ("c", "u"), ("c", "v")]),
+ dict(skiprows=2,
+ names=[_TestTuple("a", "q"), _TestTuple("a", "r"),
+ _TestTuple("a", "s"), _TestTuple("b", "t"),
+ _TestTuple("c", "u"), _TestTuple("c", "v")])
+])
+def test_header_multi_index_common_format2(all_parsers, kwargs):
+ parser = all_parsers
+ expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
+ index=["one", "two"],
+ columns=MultiIndex.from_tuples(
+ [("a", "q"), ("a", "r"), ("a", "s"),
+ ("b", "t"), ("c", "u"), ("c", "v")]))
+ data = """,a,a,a,b,c,c
+,q,r,s,t,u,v
+one,1,2,3,4,5,6
+two,7,8,9,10,11,12"""
+
+ result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs", [
+ dict(header=[0, 1]),
+ dict(skiprows=2,
+ names=[("a", "q"), ("a", "r"), ("a", "s"),
+ ("b", "t"), ("c", "u"), ("c", "v")]),
+ dict(skiprows=2,
+ names=[_TestTuple("a", "q"), _TestTuple("a", "r"),
+ _TestTuple("a", "s"), _TestTuple("b", "t"),
+ _TestTuple("c", "u"), _TestTuple("c", "v")])
+])
+def test_header_multi_index_common_format3(all_parsers, kwargs):
+ parser = all_parsers
+ expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
+ index=["one", "two"],
+ columns=MultiIndex.from_tuples(
+ [("a", "q"), ("a", "r"), ("a", "s"),
+ ("b", "t"), ("c", "u"), ("c", "v")]))
+ expected = expected.reset_index(drop=True)
+ data = """a,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+ result = parser.read_csv(StringIO(data), index_col=None, **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_header_multi_index_common_format_malformed1(all_parsers):
+ parser = all_parsers
+ expected = DataFrame(np.array(
+ [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
+ index=Index([1, 7]),
+ columns=MultiIndex(levels=[[u("a"), u("b"), u("c")],
+ [u("r"), u("s"), u("t"),
+ u("u"), u("v")]],
+ codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
+ names=[u("a"), u("q")]))
+ data = """a,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+ result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
+ tm.assert_frame_equal(expected, result)
+
+
+def test_header_multi_index_common_format_malformed2(all_parsers):
+ parser = all_parsers
+ expected = DataFrame(np.array(
+ [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
+ index=Index([1, 7]),
+ columns=MultiIndex(levels=[[u("a"), u("b"), u("c")],
+ [u("r"), u("s"), u("t"),
+ u("u"), u("v")]],
+ codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
+ names=[None, u("q")]))
+
+ data = """,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+ result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
+ tm.assert_frame_equal(expected, result)
+
+
+def test_header_multi_index_common_format_malformed3(all_parsers):
+ parser = all_parsers
+ expected = DataFrame(np.array(
+ [[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"),
+ index=MultiIndex(levels=[[1, 7], [2, 8]],
+ codes=[[0, 1], [0, 1]]),
+ columns=MultiIndex(levels=[[u("a"), u("b"), u("c")],
+ [u("s"), u("t"), u("u"), u("v")]],
+ codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
+ names=[None, u("q")]))
+ data = """,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+ result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
+ tm.assert_frame_equal(expected, result)
+
+
[email protected]("data,header", [
+ ("1,2,3\n4,5,6", None),
+ ("foo,bar,baz\n1,2,3\n4,5,6", 0),
+])
+def test_header_names_backward_compat(all_parsers, data, header):
+ # see gh-2539
+ parser = all_parsers
+ expected = parser.read_csv(StringIO("1,2,3\n4,5,6"),
+ names=["a", "b", "c"])
+
+ result = parser.read_csv(StringIO(data), names=["a", "b", "c"],
+ header=header)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs", [
+ dict(), dict(index_col=False)
+])
+def test_read_only_header_no_rows(all_parsers, kwargs):
+ # See gh-7773
+ parser = all_parsers
+ expected = DataFrame(columns=["a", "b", "c"])
+
+ result = parser.read_csv(StringIO("a,b,c"), **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs,names", [
+ (dict(), [0, 1, 2, 3, 4]),
+ (dict(prefix="X"), ["X0", "X1", "X2", "X3", "X4"]),
+ (dict(names=["foo", "bar", "baz", "quux", "panda"]),
+ ["foo", "bar", "baz", "quux", "panda"])
+])
+def test_no_header(all_parsers, kwargs, names):
+ parser = all_parsers
+ data = """1,2,3,4,5
+6,7,8,9,10
+11,12,13,14,15
+"""
+ expected = DataFrame([[1, 2, 3, 4, 5],
+ [6, 7, 8, 9, 10],
+ [11, 12, 13, 14, 15]], columns=names)
+ result = parser.read_csv(StringIO(data), header=None, **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("header", [
+ ["a", "b"],
+ "string_header"
+])
+def test_non_int_header(all_parsers, header):
+ # see gh-16338
+ msg = "header must be integer or list of integers"
+ data = """1,2\n3,4"""
+ parser = all_parsers
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), header=header)
+
+
+def test_singleton_header(all_parsers):
+ # see gh-7757
+ data = """a,b,c\n0,1,2\n1,2,3"""
+ parser = all_parsers
+
+ expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
+ result = parser.read_csv(StringIO(data), header=[0])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,expected", [
+ ("A,A,A,B\none,one,one,two\n0,40,34,0.1",
+ DataFrame([[0, 40, 34, 0.1]],
+ columns=MultiIndex.from_tuples(
+ [("A", "one"), ("A", "one.1"),
+ ("A", "one.2"), ("B", "two")]))),
+ ("A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
+ DataFrame([[0, 40, 34, 0.1]],
+ columns=MultiIndex.from_tuples(
+ [("A", "one"), ("A", "one.1"),
+ ("A", "one.1.1"), ("B", "two")]))),
+ ("A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
+ DataFrame([[0, 40, 34, 0.1, 0.1]],
+ columns=MultiIndex.from_tuples(
+ [("A", "one"), ("A", "one.1"),
+ ("A", "one.1.1"), ("B", "two"),
+ ("B", "two.1")])))
+])
+def test_mangles_multi_index(all_parsers, data, expected):
+ # see gh-18062
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data), header=[0, 1])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("index_col", [None, [0]])
[email protected]("columns", [None,
+ (["", "Unnamed"]),
+ (["Unnamed", ""]),
+ (["Unnamed", "NotUnnamed"])])
+def test_multi_index_unnamed(all_parsers, index_col, columns):
+ # see gh-23687
+ #
+ # When specifying a multi-index header, make sure that
+ # we don't error just because one of the rows in our header
+ # has ALL column names containing the string "Unnamed". The
+ # correct condition to check is whether the row contains
+ # ALL columns that did not have names (and instead were given
+ # placeholder ones).
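+    # For instance (hypothetical values): with header=[0, 1], an empty cell
+    # in the first header row becomes the placeholder "Unnamed: 0_level_0"
+    # rather than triggering a ParserError.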
+ parser = all_parsers
+ header = [0, 1]
+
+ if index_col is None:
+ data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
+ else:
+ data = (",".join([""] + (columns or ["", ""])) +
+ "\n,0,1\n0,2,3\n1,4,5\n")
+
+ if columns is None:
+ msg = (r"Passed header=\[0,1\] are too "
+ r"many rows for this multi_index of columns")
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(data), header=header,
+ index_col=index_col)
+ else:
+ result = parser.read_csv(StringIO(data), header=header,
+ index_col=index_col)
+ template = "Unnamed: {i}_level_0"
+ exp_columns = []
+
+ for i, col in enumerate(columns):
+ if not col: # Unnamed.
+ col = template.format(i=i if index_col is None else i + 1)
+
+ exp_columns.append(col)
+
+ columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
+ expected = DataFrame([[2, 3], [4, 5]], columns=columns)
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_index_col.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_index_col.py
new file mode 100644
index 00000000000..6421afba18f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_index_col.py
@@ -0,0 +1,152 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that the specified index column (a.k.a. "index_col")
+is properly handled or inferred during parsing for all of
+the parsers defined in parsers.py
+"""
+
+import pytest
+
+from pandas.compat import StringIO
+
+from pandas import DataFrame, Index, MultiIndex
+import pandas.util.testing as tm
+
+
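+# The knob under test: ``index_col`` selects which parsed column(s) become
+# the index. A hedged illustration (pandas 0.24-era API assumed):
+#
+#     pd.read_csv(StringIO("a,b\n1,2"), index_col=0)           # "a" indexes
+#     pd.read_csv(StringIO("a,b,c\n1,2,3"), index_col=[0, 1])  # MultiIndex
+
+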
[email protected]("with_header", [True, False])
+def test_index_col_named(all_parsers, with_header):
+ parser = all_parsers
+ no_header = """\
+KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa
+ header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" # noqa
+
+ if with_header:
+ data = header + no_header
+
+ result = parser.read_csv(StringIO(data), index_col="ID")
+ expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
+ tm.assert_frame_equal(result, expected)
+ else:
+ data = no_header
+ msg = "Index ID invalid"
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), index_col="ID")
+
+
+def test_index_col_named2(all_parsers):
+ parser = all_parsers
+ data = """\
+1,2,3,4,hello
+5,6,7,8,world
+9,10,11,12,foo
+"""
+
+ expected = DataFrame({"a": [1, 5, 9], "b": [2, 6, 10],
+ "c": [3, 7, 11], "d": [4, 8, 12]},
+ index=Index(["hello", "world", "foo"],
+ name="message"))
+ names = ["a", "b", "c", "d", "message"]
+
+ result = parser.read_csv(StringIO(data), names=names,
+ index_col=["message"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_index_col_is_true(all_parsers):
+ # see gh-9798
+ data = "a,b\n1,2"
+ parser = all_parsers
+
+ with pytest.raises(ValueError, match="The value of index_col "
+ "couldn't be 'True'"):
+ parser.read_csv(StringIO(data), index_col=True)
+
+
+def test_infer_index_col(all_parsers):
+ data = """A,B,C
+foo,1,2,3
+bar,4,5,6
+baz,7,8,9
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data))
+
+ expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ index=["foo", "bar", "baz"],
+ columns=["A", "B", "C"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("index_col,kwargs", [
+ (None, dict(columns=["x", "y", "z"])),
+ (False, dict(columns=["x", "y", "z"])),
+ (0, dict(columns=["y", "z"], index=Index([], name="x"))),
+ (1, dict(columns=["x", "z"], index=Index([], name="y"))),
+ ("x", dict(columns=["y", "z"], index=Index([], name="x"))),
+ ("y", dict(columns=["x", "z"], index=Index([], name="y"))),
+ ([0, 1], dict(columns=["z"], index=MultiIndex.from_arrays(
+ [[]] * 2, names=["x", "y"]))),
+ (["x", "y"], dict(columns=["z"], index=MultiIndex.from_arrays(
+ [[]] * 2, names=["x", "y"]))),
+ ([1, 0], dict(columns=["z"], index=MultiIndex.from_arrays(
+ [[]] * 2, names=["y", "x"]))),
+ (["y", "x"], dict(columns=["z"], index=MultiIndex.from_arrays(
+ [[]] * 2, names=["y", "x"]))),
+])
+def test_index_col_empty_data(all_parsers, index_col, kwargs):
+ data = "x,y,z"
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), index_col=index_col)
+
+ expected = DataFrame([], **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_with_index_col_false(all_parsers):
+ # see gh-10413
+ data = "x,y"
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), index_col=False)
+
+ expected = DataFrame([], columns=["x", "y"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("index_names", [
+ ["", ""],
+ ["foo", ""],
+ ["", "bar"],
+ ["foo", "bar"],
+ ["NotReallyUnnamed", "Unnamed: 0"],
+])
+def test_multi_index_naming(all_parsers, index_names):
+ parser = all_parsers
+
+ # We don't want empty index names being replaced with "Unnamed: 0"
+ data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
+ result = parser.read_csv(StringIO(data), index_col=[0, 1])
+
+ expected = DataFrame({"col": [1, 2, 3, 4]},
+ index=MultiIndex.from_product([["a", "b"],
+ ["c", "d"]]))
+ expected.index.names = [name if name else None for name in index_names]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_multi_index_naming_not_all_at_beginning(all_parsers):
+ parser = all_parsers
+ data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
+ result = parser.read_csv(StringIO(data), index_col=[0, 2])
+
+ expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]},
+ index=MultiIndex(
+ levels=[['a', 'b'], [1, 2, 3, 4]],
+ codes=[[0, 0, 1, 1], [0, 1, 2, 3]]))
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_mangle_dupes.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_mangle_dupes.py
new file mode 100644
index 00000000000..0efc0c2c135
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_mangle_dupes.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that duplicate columns are handled appropriately when parsed by the
+CSV engine. In general, the expected result is that they are either thoroughly
+de-duplicated (if mangling requested) or ignored otherwise.
+"""
+
+import pytest
+
+from pandas.compat import StringIO
+
+from pandas import DataFrame
+import pandas.util.testing as tm
+
+
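+# Mangling in brief: with the default mangle_dupe_cols=True of this pandas
+# version, duplicate column names get ".N" suffixes, so a header "a,a,b"
+# parses to the columns ["a", "a.1", "b"].
+
+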
[email protected]("kwargs", [dict(), dict(mangle_dupe_cols=True)])
+def test_basic(all_parsers, kwargs):
+ # TODO: add test for condition "mangle_dupe_cols=False"
+ # once it is actually supported (gh-12935)
+ parser = all_parsers
+
+ data = "a,a,b,b,b\n1,2,3,4,5"
+ result = parser.read_csv(StringIO(data), sep=",", **kwargs)
+
+ expected = DataFrame([[1, 2, 3, 4, 5]],
+ columns=["a", "a.1", "b", "b.1", "b.2"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_basic_names(all_parsers):
+ # See gh-7160
+ parser = all_parsers
+
+ data = "a,b,a\n0,1,2\n3,4,5"
+ expected = DataFrame([[0, 1, 2], [3, 4, 5]],
+ columns=["a", "b", "a.1"])
+
+ result = parser.read_csv(StringIO(data))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_basic_names_warn(all_parsers):
+ # See gh-7160
+ parser = all_parsers
+
+ data = "0,1,2\n3,4,5"
+ expected = DataFrame([[0, 1, 2], [3, 4, 5]],
+ columns=["a", "b", "a.1"])
+
+ with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+ result = parser.read_csv(StringIO(data), names=["a", "b", "a"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,expected", [
+ ("a,a,a.1\n1,2,3",
+ DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])),
+ ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
+ DataFrame([[1, 2, 3, 4, 5, 6]], columns=["a", "a.1", "a.1.1", "a.1.1.1",
+ "a.1.1.1.1", "a.1.1.1.1.1"])),
+ ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
+ DataFrame([[1, 2, 3, 4, 5, 6, 7]], columns=["a", "a.1", "a.3", "a.1.1",
+ "a.2", "a.2.1", "a.3.1"]))
+])
+def test_thorough_mangle_columns(all_parsers, data, expected):
+ # see gh-17060
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data))
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,names,expected", [
+ ("a,b,b\n1,2,3",
+ ["a.1", "a.1", "a.1.1"],
+ DataFrame([["a", "b", "b"], ["1", "2", "3"]],
+ columns=["a.1", "a.1.1", "a.1.1.1"])),
+ ("a,b,c,d,e,f\n1,2,3,4,5,6",
+ ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
+ DataFrame([["a", "b", "c", "d", "e", "f"],
+ ["1", "2", "3", "4", "5", "6"]],
+ columns=["a", "a.1", "a.1.1", "a.1.1.1",
+ "a.1.1.1.1", "a.1.1.1.1.1"])),
+ ("a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
+ ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
+ DataFrame([["a", "b", "c", "d", "e", "f", "g"],
+ ["1", "2", "3", "4", "5", "6", "7"]],
+ columns=["a", "a.1", "a.3", "a.1.1",
+ "a.2", "a.2.1", "a.3.1"])),
+])
+def test_thorough_mangle_names(all_parsers, data, names, expected):
+ # see gh-17095
+ parser = all_parsers
+
+ with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+ result = parser.read_csv(StringIO(data), names=names)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_mangled_unnamed_placeholders(all_parsers):
+ # xref gh-13017
+ orig_key = "0"
+ parser = all_parsers
+
+ orig_value = [1, 2, 3]
+ df = DataFrame({orig_key: orig_value})
+
+    # This test iteratively round-trips `df` through to_csv/read_csv: each
+    # pass writes the unnamed index, which picks up an "Unnamed: 0"-style
+    # placeholder column that is mangled further on later passes.
+ for i in range(3):
+ expected = DataFrame()
+
+ for j in range(i + 1):
+ expected["Unnamed: 0" + ".1" * j] = [0, 1, 2]
+
+ expected[orig_key] = orig_value
+ df = parser.read_csv(StringIO(df.to_csv()))
+
+ tm.assert_frame_equal(df, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_multi_thread.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_multi_thread.py
new file mode 100644
index 00000000000..fbf23f769e2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_multi_thread.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests multithreading behaviour for reading and
+parsing files for each parser defined in parsers.py
+"""
+
+from __future__ import division
+
+from multiprocessing.pool import ThreadPool
+
+import numpy as np
+
+from pandas.compat import BytesIO, range
+
+import pandas as pd
+from pandas import DataFrame
+import pandas.util.testing as tm
+
+
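+# The pattern these tests rely on, in miniature: hand the same parser to a
+# thread pool and check that concurrent reads agree. A hedged sketch:
+#
+#     pool = ThreadPool(4)
+#     frames = pool.map(pd.read_csv, [BytesIO(b"a\n1") for _ in range(4)])
+#     # every frame in ``frames`` should be identical
+
+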
+def _construct_dataframe(num_rows):
+ """
+ Construct a DataFrame for testing.
+
+ Parameters
+ ----------
+ num_rows : int
+ The number of rows for our DataFrame.
+
+ Returns
+ -------
+ df : DataFrame
+ """
+ df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde"))
+ df["foo"] = "foo"
+ df["bar"] = "bar"
+ df["baz"] = "baz"
+ df["date"] = pd.date_range("20000101 09:00:00",
+ periods=num_rows,
+ freq="s")
+ df["int"] = np.arange(num_rows, dtype="int64")
+ return df
+
+
+def test_multi_thread_string_io_read_csv(all_parsers):
+ # see gh-11786
+ parser = all_parsers
+ max_row_range = 10000
+ num_files = 100
+
+ bytes_to_df = [
+ "\n".join(
+ ["%d,%d,%d" % (i, i, i) for i in range(max_row_range)]
+ ).encode() for _ in range(num_files)]
+ files = [BytesIO(b) for b in bytes_to_df]
+
+ # Read all files in many threads.
+ pool = ThreadPool(8)
+
+ results = pool.map(parser.read_csv, files)
+ first_result = results[0]
+
+ for result in results:
+ tm.assert_frame_equal(first_result, result)
+
+
+def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
+ """
+ Generate a DataFrame via multi-thread.
+
+ Parameters
+ ----------
+ parser : BaseParser
+ The parser object to use for reading the data.
+ path : str
+ The location of the CSV file to read.
+ num_rows : int
+ The number of rows to read per task.
+ num_tasks : int
+ The number of tasks to use for reading this DataFrame.
+
+ Returns
+ -------
+ df : DataFrame
+ """
+ def reader(arg):
+ """
+ Create a reader for part of the CSV.
+
+ Parameters
+ ----------
+ arg : tuple
+ A tuple of the following:
+
+ * start : int
+ The starting row to start for parsing CSV
+ * nrows : int
+ The number of rows to read.
+
+ Returns
+ -------
+ df : DataFrame
+ """
+ start, nrows = arg
+
+ if not start:
+ return parser.read_csv(path, index_col=0, header=0,
+ nrows=nrows, parse_dates=["date"])
+
+ return parser.read_csv(path, index_col=0, header=None,
+ skiprows=int(start) + 1,
+ nrows=nrows, parse_dates=[9])
+
+ tasks = [
+ (num_rows * i // num_tasks,
+ num_rows // num_tasks) for i in range(num_tasks)
+ ]
+
+ pool = ThreadPool(processes=num_tasks)
+ results = pool.map(reader, tasks)
+
+ header = results[0].columns
+
+ for r in results[1:]:
+ r.columns = header
+
+ final_dataframe = pd.concat(results)
+ return final_dataframe
+
+
+def test_multi_thread_path_multipart_read_csv(all_parsers):
+ # see gh-11786
+ num_tasks = 4
+ num_rows = 100000
+
+ parser = all_parsers
+ file_name = "__thread_pool_reader__.csv"
+ df = _construct_dataframe(num_rows)
+
+ with tm.ensure_clean(file_name) as path:
+ df.to_csv(path)
+
+ final_dataframe = _generate_multi_thread_dataframe(parser, path,
+ num_rows, num_tasks)
+ tm.assert_frame_equal(df, final_dataframe)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_na_values.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_na_values.py
new file mode 100644
index 00000000000..1b6d2ee8a06
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_na_values.py
@@ -0,0 +1,441 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that NA values are properly handled during
+parsing for all of the parsers defined in parsers.py
+"""
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO, range
+
+from pandas import DataFrame, Index, MultiIndex
+import pandas.util.testing as tm
+
+import pandas.io.common as com
+
+
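+# NA handling in brief (pandas 0.24-era API assumed): a fixed set of string
+# sentinels (com._NA_VALUES, checked below) parses to NaN by default;
+# ``na_values`` adds to that set, and ``keep_default_na=False`` replaces it:
+#
+#     pd.read_csv(StringIO("a\nfoo"), na_values=["foo"])  # "foo" -> NaN
+
+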
+def test_string_nas(all_parsers):
+ parser = all_parsers
+ data = """A,B,C
+a,b,c
+d,,f
+,g,h
+"""
+ result = parser.read_csv(StringIO(data))
+ expected = DataFrame([["a", "b", "c"],
+ ["d", np.nan, "f"],
+ [np.nan, "g", "h"]],
+ columns=["A", "B", "C"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_detect_string_na(all_parsers):
+ parser = all_parsers
+ data = """A,B
+foo,bar
+NA,baz
+NaN,nan
+"""
+ expected = DataFrame([["foo", "bar"], [np.nan, "baz"],
+ [np.nan, np.nan]], columns=["A", "B"])
+ result = parser.read_csv(StringIO(data))
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("na_values", [
+ ["-999.0", "-999"],
+ [-999, -999.0],
+ [-999.0, -999],
+ ["-999.0"], ["-999"],
+ [-999.0], [-999]
+])
+ """A,B
+-999,1.2
+2,-999
+3,4.5
+""",
+ """A,B
+-999,1.200
+2,-999.000
+3,4.500
+"""
+])
+def test_non_string_na_values(all_parsers, data, na_values):
+ # see gh-3611: with an odd float format, we can't match
+ # the string "999.0" exactly but still need float matching
+ parser = all_parsers
+ expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
+ [3.0, 4.5]], columns=["A", "B"])
+
+ result = parser.read_csv(StringIO(data), na_values=na_values)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_default_na_values(all_parsers):
+ _NA_VALUES = {"-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A",
+ "N/A", "n/a", "NA", "#NA", "NULL", "null", "NaN", "nan",
+ "-NaN", "-nan", "#N/A N/A", ""}
+ assert _NA_VALUES == com._NA_VALUES
+
+ parser = all_parsers
+ nv = len(_NA_VALUES)
+
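+    # Build one row per sentinel: row ``i`` places sentinel ``v`` in column
+    # ``i`` and leaves every other cell empty, so the parsed frame should be
+    # entirely NaN.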
+ def f(i, v):
+ if i == 0:
+ buf = ""
+ elif i > 0:
+ buf = "".join([","] * i)
+
+ buf = "{0}{1}".format(buf, v)
+
+ if i < nv - 1:
+ buf = "{0}{1}".format(buf, "".join([","] * (nv - i - 1)))
+
+ return buf
+
+ data = StringIO("\n".join(f(i, v) for i, v in enumerate(_NA_VALUES)))
+ expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
+
+ result = parser.read_csv(data, header=None)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("na_values", ["baz", ["baz"]])
+def test_custom_na_values(all_parsers, na_values):
+ parser = all_parsers
+ data = """A,B,C
+ignore,this,row
+1,NA,3
+-1.#IND,5,baz
+7,8,NaN
+"""
+ expected = DataFrame([[1., np.nan, 3], [np.nan, 5, np.nan],
+ [7, 8, np.nan]], columns=["A", "B", "C"])
+ result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_bool_na_values(all_parsers):
+ data = """A,B,C
+True,False,True
+NA,True,False
+False,NA,True"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data))
+ expected = DataFrame({"A": np.array([True, np.nan, False], dtype=object),
+ "B": np.array([False, True, np.nan], dtype=object),
+ "C": [True, False, True]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_na_value_dict(all_parsers):
+ data = """A,B,C
+foo,bar,NA
+bar,foo,foo
+foo,bar,NA
+bar,foo,foo"""
+ parser = all_parsers
+ df = parser.read_csv(StringIO(data),
+ na_values={"A": ["foo"], "B": ["bar"]})
+ expected = DataFrame({"A": [np.nan, "bar", np.nan, "bar"],
+ "B": [np.nan, "foo", np.nan, "foo"],
+ "C": [np.nan, "foo", np.nan, "foo"]})
+ tm.assert_frame_equal(df, expected)
+
+
[email protected]("index_col,expected", [
+ ([0], DataFrame({"b": [np.nan], "c": [1], "d": [5]},
+ index=Index([0], name="a"))),
+ ([0, 2], DataFrame({"b": [np.nan], "d": [5]},
+ index=MultiIndex.from_tuples(
+ [(0, 1)], names=["a", "c"]))),
+ (["a", "c"], DataFrame({"b": [np.nan], "d": [5]},
+ index=MultiIndex.from_tuples(
+ [(0, 1)], names=["a", "c"]))),
+])
+def test_na_value_dict_multi_index(all_parsers, index_col, expected):
+ data = """\
+a,b,c,d
+0,NA,1,5
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), na_values=set(),
+ index_col=index_col)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs,expected", [
+ (dict(), DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
+ "B": [1, 2, 3, 4, 5, 6, 7],
+ "C": ["one", "two", "three", np.nan, "five",
+ np.nan, "seven"]})),
+ (dict(na_values={"A": [], "C": []}, keep_default_na=False),
+ DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"],
+ "B": [1, 2, 3, 4, 5, 6, 7],
+ "C": ["one", "two", "three", "nan", "five", "", "seven"]})),
+ (dict(na_values=["a"], keep_default_na=False),
+ DataFrame({"A": [np.nan, "b", "", "d", "e", "nan", "g"],
+ "B": [1, 2, 3, 4, 5, 6, 7],
+ "C": ["one", "two", "three", "nan", "five", "", "seven"]})),
+ (dict(na_values={"A": [], "C": []}),
+ DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
+ "B": [1, 2, 3, 4, 5, 6, 7],
+ "C": ["one", "two", "three", np.nan,
+ "five", np.nan, "seven"]})),
+])
+def test_na_values_keep_default(all_parsers, kwargs, expected):
+ data = """\
+A,B,C
+a,1,one
+b,2,two
+,3,three
+d,4,nan
+e,5,five
+nan,6,
+g,7,seven
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_no_na_values_no_keep_default(all_parsers):
+ # see gh-4318: passing na_values=None and
+ # keep_default_na=False yields 'None" as a na_value
+ data = """\
+A,B,C
+a,1,None
+b,2,two
+,3,None
+d,4,nan
+e,5,five
+nan,6,
+g,7,seven
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), keep_default_na=False)
+
+ expected = DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"],
+ "B": [1, 2, 3, 4, 5, 6, 7],
+ "C": ["None", "two", "None", "nan",
+ "five", "", "seven"]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_no_keep_default_na_dict_na_values(all_parsers):
+ # see gh-19227
+ data = "a,b\n,2"
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), na_values={"b": ["2"]},
+ keep_default_na=False)
+ expected = DataFrame({"a": [""], "b": [np.nan]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
+ # see gh-19227
+ #
+ # Scalar values shouldn't cause the parsing to crash or fail.
+ data = "a,b\n1,2"
+ parser = all_parsers
+ df = parser.read_csv(StringIO(data), na_values={"b": 2},
+ keep_default_na=False)
+ expected = DataFrame({"a": [1], "b": [np.nan]})
+ tm.assert_frame_equal(df, expected)
+
+
[email protected]("col_zero_na_values", [
+ 113125, "113125"
+])
+def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers,
+ col_zero_na_values):
+ # see gh-19227
+ data = """\
+113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
+729639,"qwer","",asdfkj,466.681,,252.373
+"""
+ parser = all_parsers
+ expected = DataFrame({0: [np.nan, 729639.0],
+ 1: [np.nan, "qwer"],
+ 2: ["/blaha", np.nan],
+ 3: ["kjsdkj", "asdfkj"],
+ 4: [412.166, 466.681],
+ 5: ["225.874", ""],
+ 6: [np.nan, 252.373]})
+
+ result = parser.read_csv(StringIO(data), header=None,
+ keep_default_na=False,
+ na_values={2: "", 6: "214.008",
+ 1: "blah", 0: col_zero_na_values})
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("na_filter,row_data", [
+ (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
+ (False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
+])
+def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
+ data = """\
+A,B
+1,A
+nan,B
+3,C
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), na_values=["B"],
+ na_filter=na_filter)
+
+ expected = DataFrame(row_data, columns=["A", "B"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_na_trailing_columns(all_parsers):
+ parser = all_parsers
+ data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
+2012-03-14,USD,AAPL,BUY,1000
+2012-05-12,USD,SBUX,SELL,500"""
+
+ # Trailing columns should be all NaN.
+ result = parser.read_csv(StringIO(data))
+ expected = DataFrame([
+ ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
+ ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
+ ], columns=["Date", "Currency", "Symbol", "Type",
+ "Units", "UnitPrice", "Cost", "Tax"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("na_values,row_data", [
+ (1, [[np.nan, 2.0], [2.0, np.nan]]),
+ ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
+])
+def test_na_values_scalar(all_parsers, na_values, row_data):
+ # see gh-12224
+ parser = all_parsers
+ names = ["a", "b"]
+ data = "1,2\n2,1"
+
+ result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
+ expected = DataFrame(row_data, columns=names)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_na_values_dict_aliasing(all_parsers):
+ parser = all_parsers
+ na_values = {"a": 2, "b": 1}
+ na_values_copy = na_values.copy()
+
+ names = ["a", "b"]
+ data = "1,2\n2,1"
+
+ expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
+ result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
+
+ tm.assert_frame_equal(result, expected)
+ tm.assert_dict_equal(na_values, na_values_copy)
+
+
+def test_na_values_dict_col_index(all_parsers):
+ # see gh-14203
+ data = "a\nfoo\n1"
+ parser = all_parsers
+ na_values = {0: "foo"}
+
+ result = parser.read_csv(StringIO(data), na_values=na_values)
+ expected = DataFrame({"a": [np.nan, 1]})
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,kwargs,expected", [
+ (str(2**63) + "\n" + str(2**63 + 1),
+ dict(na_values=[2**63]), DataFrame([str(2**63), str(2**63 + 1)])),
+ (str(2**63) + ",1" + "\n,2",
+ dict(), DataFrame([[str(2**63), 1], ['', 2]])),
+ (str(2**63) + "\n1",
+ dict(na_values=[2**63]), DataFrame([np.nan, 1])),
+])
+def test_na_values_uint64(all_parsers, data, kwargs, expected):
+ # see gh-14983
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), header=None, **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_na_values_no_default_with_index(all_parsers):
+ # see gh-15835
+ data = "a,1\nb,2"
+ parser = all_parsers
+ expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))
+
+ result = parser.read_csv(StringIO(data), index_col=0,
+ keep_default_na=False)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("na_filter,index_data", [
+ (False, ["", "5"]),
+ (True, [np.nan, 5.0]),
+])
+def test_no_na_filter_on_index(all_parsers, na_filter, index_data):
+ # see gh-5239
+ #
+ # Don't parse NA-values in index unless na_filter=True
+ parser = all_parsers
+ data = "a,b,c\n1,,3\n4,5,6"
+
+ expected = DataFrame({"a": [1, 4], "c": [3, 6]},
+ index=Index(index_data, name="b"))
+ result = parser.read_csv(StringIO(data), index_col=[1],
+ na_filter=na_filter)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_inf_na_values_with_int_index(all_parsers):
+ # see gh-17128
+ parser = all_parsers
+ data = "idx,col1,col2\n1,3,4\n2,inf,-inf"
+
+ # Don't fail with OverflowError with inf's and integer index column.
+ out = parser.read_csv(StringIO(data), index_col=[0],
+ na_values=["inf", "-inf"])
+ expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
+ index=Index([1, 2], name="idx"))
+ tm.assert_frame_equal(out, expected)
+
+
[email protected]("na_filter", [True, False])
+def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
+ # see gh-20377
+ parser = all_parsers
+ data = "a,b,c\n1,,3\n4,5,6"
+
+ # na_filter=True --> missing value becomes NaN.
+ # na_filter=False --> missing value remains empty string.
+ empty = np.nan if na_filter else ""
+ expected = DataFrame({"a": ["1", "4"],
+ "b": [empty, "5"],
+ "c": ["3", "6"]})
+
+ result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data, na_values", [
+ ("false,1\n,1\ntrue", None),
+ ("false,1\nnull,1\ntrue", None),
+ ("false,1\nnan,1\ntrue", None),
+ ("false,1\nfoo,1\ntrue", 'foo'),
+ ("false,1\nfoo,1\ntrue", ['foo']),
+ ("false,1\nfoo,1\ntrue", {'a': 'foo'}),
+])
+def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
+ parser = all_parsers
+ msg = ("(Bool column has NA values in column [0a])|"
+ "(cannot safely convert passed user dtype of "
+ "bool for object dtyped data in column 0)")
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), header=None, names=['a', 'b'],
+ dtype={'a': 'bool'}, na_values=na_values)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_network.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_network.py
new file mode 100644
index 00000000000..e54da94089c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_network.py
@@ -0,0 +1,204 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests the parsers' ability to read and parse non-local files,
+which therefore requires a network connection.
+"""
+import logging
+
+import numpy as np
+import pytest
+
+from pandas.compat import BytesIO, StringIO
+import pandas.util._test_decorators as td
+
+from pandas import DataFrame
+import pandas.util.testing as tm
+
+from pandas.io.parsers import read_csv
+
+
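+# These tests reach real HTTP/S3 endpoints (e.g. the pandas-test bucket), so
+# they are environment-dependent by design; the parametrization below covers
+# each compression codec in both explicit and inferred modes.
+
+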
+ "compress_type, extension", [
+ ('gzip', '.gz'), ('bz2', '.bz2'), ('zip', '.zip'),
+ pytest.param('xz', '.xz', marks=td.skip_if_no_lzma)
+ ]
+)
[email protected]('mode', ['explicit', 'infer'])
[email protected]('engine', ['python', 'c'])
+def test_compressed_urls(salaries_table, compress_type, extension, mode,
+ engine):
+ check_compressed_urls(salaries_table, compress_type, extension, mode,
+ engine)
+
+
+def check_compressed_urls(salaries_table, compression, extension, mode,
+ engine):
+ # test reading compressed urls with various engines and
+ # extension inference
+ base_url = ('https://github.com/pandas-dev/pandas/raw/master/'
+ 'pandas/tests/io/parser/data/salaries.csv')
+
+ url = base_url + extension
+
+ if mode != 'explicit':
+ compression = mode
+
+ url_table = read_csv(url, sep='\t', compression=compression, engine=engine)
+ tm.assert_frame_equal(url_table, salaries_table)
+
+
[email protected]
+def tips_df(datapath):
+ """DataFrame with the tips dataset."""
+ return read_csv(datapath('io', 'parser', 'data', 'tips.csv'))
+
+
[email protected]("s3_resource")
[email protected]_if_not_us_locale()
+class TestS3(object):
+
+ def test_parse_public_s3_bucket(self, tips_df):
+ pytest.importorskip('s3fs')
+
+        # More of an integration test because of the not-public-contents
+        # portion; this could probably be mocked instead.
+ for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+ df = read_csv('s3://pandas-test/tips.csv' +
+ ext, compression=comp)
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ tm.assert_frame_equal(df, tips_df)
+
+ # Read public file from bucket with not-public contents
+ df = read_csv('s3://cant_get_it/tips.csv')
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ tm.assert_frame_equal(df, tips_df)
+
+ def test_parse_public_s3n_bucket(self, tips_df):
+
+ # Read from AWS s3 as "s3n" URL
+ df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ tm.assert_frame_equal(tips_df.iloc[:10], df)
+
+ def test_parse_public_s3a_bucket(self, tips_df):
+ # Read from AWS s3 as "s3a" URL
+ df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ tm.assert_frame_equal(tips_df.iloc[:10], df)
+
+ def test_parse_public_s3_bucket_nrows(self, tips_df):
+ for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+ df = read_csv('s3://pandas-test/tips.csv' +
+ ext, nrows=10, compression=comp)
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ tm.assert_frame_equal(tips_df.iloc[:10], df)
+
+ def test_parse_public_s3_bucket_chunked(self, tips_df):
+ # Read with a chunksize
+ chunksize = 5
+ for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+ df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
+ chunksize=chunksize, compression=comp)
+ assert df_reader.chunksize == chunksize
+ for i_chunk in [0, 1, 2]:
+ # Read a couple of chunks and make sure we see them
+ # properly.
+ df = df_reader.get_chunk()
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ true_df = tips_df.iloc[
+ chunksize * i_chunk: chunksize * (i_chunk + 1)]
+ tm.assert_frame_equal(true_df, df)
+
+ def test_parse_public_s3_bucket_chunked_python(self, tips_df):
+ # Read with a chunksize using the Python parser
+ chunksize = 5
+ for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+ df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
+ chunksize=chunksize, compression=comp,
+ engine='python')
+ assert df_reader.chunksize == chunksize
+ for i_chunk in [0, 1, 2]:
+ # Read a couple of chunks and make sure we see them properly.
+ df = df_reader.get_chunk()
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ true_df = tips_df.iloc[
+ chunksize * i_chunk: chunksize * (i_chunk + 1)]
+ tm.assert_frame_equal(true_df, df)
+
+ def test_parse_public_s3_bucket_python(self, tips_df):
+ for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+ df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
+ compression=comp)
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ tm.assert_frame_equal(df, tips_df)
+
+ def test_infer_s3_compression(self, tips_df):
+ for ext in ['', '.gz', '.bz2']:
+ df = read_csv('s3://pandas-test/tips.csv' + ext,
+ engine='python', compression='infer')
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ tm.assert_frame_equal(df, tips_df)
+
+ def test_parse_public_s3_bucket_nrows_python(self, tips_df):
+ for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+ df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
+ nrows=10, compression=comp)
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ tm.assert_frame_equal(tips_df.iloc[:10], df)
+
+ def test_s3_fails(self):
+ with pytest.raises(IOError):
+ read_csv('s3://nyqpug/asdf.csv')
+
+ # Receive a permission error when trying to read a private bucket.
+ # It's irrelevant here that this isn't actually a table.
+ with pytest.raises(IOError):
+ read_csv('s3://cant_get_it/')
+
+ def test_read_csv_handles_boto_s3_object(self,
+ s3_resource,
+ tips_file):
+ # see gh-16135
+
+ s3_object = s3_resource.meta.client.get_object(
+ Bucket='pandas-test',
+ Key='tips.csv')
+
+ result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8')
+ assert isinstance(result, DataFrame)
+ assert not result.empty
+
+ expected = read_csv(tips_file)
+ tm.assert_frame_equal(result, expected)
+
+ def test_read_csv_chunked_download(self, s3_resource, caplog):
+        # 8 MB; s3fs uses 5 MB chunks.
+ df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
+ buf = BytesIO()
+ str_buf = StringIO()
+
+ df.to_csv(str_buf)
+
+ buf = BytesIO(str_buf.getvalue().encode('utf-8'))
+
+ s3_resource.Bucket("pandas-test").put_object(
+ Key="large-file.csv",
+ Body=buf)
+
+ with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
+ read_csv("s3://pandas-test/large-file.csv", nrows=5)
+ # log of fetch_range (start, stop)
+ assert ((0, 5505024) in {x.args[-2:] for x in caplog.records})
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_parse_dates.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_parse_dates.py
new file mode 100644
index 00000000000..ffc8af09bf2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_parse_dates.py
@@ -0,0 +1,849 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests date parsing functionality for all of the
+parsers defined in parsers.py
+"""
+
+from datetime import date, datetime
+
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs.tslib import Timestamp
+from pandas._libs.tslibs import parsing
+from pandas.compat import StringIO, lrange, parse_date
+from pandas.compat.numpy import np_array_datetime64_compat
+
+import pandas as pd
+from pandas import DataFrame, DatetimeIndex, Index, MultiIndex
+from pandas.core.indexes.datetimes import date_range
+import pandas.util.testing as tm
+
+import pandas.io.date_converters as conv
+import pandas.io.parsers as parsers
+
+
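+# Date parsing in brief (pandas 0.24-era API assumed): ``parse_dates`` may
+# list columns to convert in place, or combine several raw columns into one
+# datetime column via a list-of-lists or dict spec, e.g.
+#
+#     pd.read_csv(StringIO("19990127,19:00:00,1"), header=None,
+#                 parse_dates={"when": [0, 1]})  # cols 0 + 1 -> "when"
+
+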
+def test_separator_date_conflict(all_parsers):
+ # Regression test for gh-4678
+ #
+ # Make sure thousands separator and
+ # date parsing do not conflict.
+ parser = all_parsers
+ data = "06-02-2013;13:00;1-000.215"
+ expected = DataFrame([[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
+ columns=["Date", 2])
+
+ df = parser.read_csv(StringIO(data), sep=";", thousands="-",
+ parse_dates={"Date": [0, 1]}, header=None)
+ tm.assert_frame_equal(df, expected)
+
+
[email protected]("keep_date_col", [True, False])
+def test_multiple_date_col_custom(all_parsers, keep_date_col):
+ data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+ parser = all_parsers
+
+ def date_parser(*date_cols):
+ """
+ Test date parser.
+
+ Parameters
+ ----------
+ date_cols : args
+ The list of data columns to parse.
+
+ Returns
+ -------
+ parsed : Series
+ """
+ return parsing.try_parse_dates(parsers._concat_date_cols(date_cols))
+
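+ # With parse_dates={"actual": [1, 2], "nominal": [1, 3]}, the parser
+ # invokes date_parser once per combined column, passing one array-like
+ # of raw strings per source column (here the "19990127" values and the
+ # " 19:00:00"-style values), and expects an array of datetimes back.
+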
+ result = parser.read_csv(StringIO(data), header=None,
+ date_parser=date_parser, prefix="X",
+ parse_dates={"actual": [1, 2],
+ "nominal": [1, 3]},
+ keep_date_col=keep_date_col)
+ expected = DataFrame([
+ [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
+ "KORD", "19990127", " 19:00:00", " 18:56:00",
+ 0.81, 2.81, 7.2, 0.0, 280.0],
+ [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
+ "KORD", "19990127", " 20:00:00", " 19:56:00",
+ 0.01, 2.21, 7.2, 0.0, 260.0],
+ [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
+ "KORD", "19990127", " 21:00:00", " 20:56:00",
+ -0.59, 2.21, 5.7, 0.0, 280.0],
+ [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
+ "KORD", "19990127", " 21:00:00", " 21:18:00",
+ -0.99, 2.01, 3.6, 0.0, 270.0],
+ [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
+ "KORD", "19990127", " 22:00:00", " 21:56:00",
+ -0.59, 1.71, 5.1, 0.0, 290.0],
+ [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
+ "KORD", "19990127", " 23:00:00", " 22:56:00",
+ -0.59, 1.71, 4.6, 0.0, 280.0],
+ ], columns=["actual", "nominal", "X0", "X1", "X2",
+ "X3", "X4", "X5", "X6", "X7", "X8"])
+
+ if not keep_date_col:
+ expected = expected.drop(["X1", "X2", "X3"], axis=1)
+ elif parser.engine == "python":
+ expected["X1"] = expected["X1"].astype(np.int64)
+
+ # Python can sometimes be flaky about how
+ # the aggregated columns are entered, so
+ # this standardizes the order.
+ result = result[expected.columns]
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("keep_date_col", [True, False])
+def test_multiple_date_col(all_parsers, keep_date_col):
+ data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), header=None,
+ prefix="X", parse_dates=[[1, 2], [1, 3]],
+ keep_date_col=keep_date_col)
+ expected = DataFrame([
+ [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
+ "KORD", "19990127", " 19:00:00", " 18:56:00",
+ 0.81, 2.81, 7.2, 0.0, 280.0],
+ [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
+ "KORD", "19990127", " 20:00:00", " 19:56:00",
+ 0.01, 2.21, 7.2, 0.0, 260.0],
+ [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
+ "KORD", "19990127", " 21:00:00", " 20:56:00",
+ -0.59, 2.21, 5.7, 0.0, 280.0],
+ [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
+ "KORD", "19990127", " 21:00:00", " 21:18:00",
+ -0.99, 2.01, 3.6, 0.0, 270.0],
+ [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
+ "KORD", "19990127", " 22:00:00", " 21:56:00",
+ -0.59, 1.71, 5.1, 0.0, 290.0],
+ [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
+ "KORD", "19990127", " 23:00:00", " 22:56:00",
+ -0.59, 1.71, 4.6, 0.0, 280.0],
+ ], columns=["X1_X2", "X1_X3", "X0", "X1", "X2",
+ "X3", "X4", "X5", "X6", "X7", "X8"])
+
+ if not keep_date_col:
+ expected = expected.drop(["X1", "X2", "X3"], axis=1)
+ elif parser.engine == "python":
+ expected["X1"] = expected["X1"].astype(np.int64)
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_date_col_as_index_col(all_parsers):
+ data = """\
+KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), header=None, prefix="X",
+ parse_dates=[1], index_col=1)
+
+ index = Index([datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 20, 0),
+ datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 0),
+ datetime(1999, 1, 27, 22, 0)], name="X1")
+ expected = DataFrame([
+ ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0],
+ ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0],
+ ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0],
+ ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0],
+ ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0],
+ ], columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], index=index)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_multiple_date_cols_int_cast(all_parsers):
+ data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
+ "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
+ "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
+ "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
+ "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
+ "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
+ parse_dates = {"actual": [1, 2], "nominal": [1, 3]}
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data), header=None,
+ date_parser=conv.parse_date_time,
+ parse_dates=parse_dates, prefix="X")
+ expected = DataFrame([
+ [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
+ "KORD", 0.81],
+ [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
+ "KORD", 0.01],
+ [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
+ "KORD", -0.59],
+ [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
+ "KORD", -0.99],
+ [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
+ "KORD", -0.59],
+ [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
+ "KORD", -0.59],
+ ], columns=["actual", "nominal", "X0", "X4"])
+
+ # Python can sometimes be flaky about how
+ # the aggregated columns are entered, so
+ # this standardizes the order.
+ result = result[expected.columns]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_multiple_date_col_timestamp_parse(all_parsers):
+ parser = all_parsers
+ data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
+05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""
+
+ result = parser.read_csv(StringIO(data), parse_dates=[[0, 1]],
+ header=None, date_parser=Timestamp)
+ expected = DataFrame([
+ [Timestamp("05/31/2012, 15:30:00.029"),
+ 1306.25, 1, "E", 0, np.nan, 1306.25],
+ [Timestamp("05/31/2012, 15:30:00.029"),
+ 1306.25, 8, "E", 0, np.nan, 1306.25]
+ ], columns=["0_1", 2, 3, 4, 5, 6, 7])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_multiple_date_cols_with_header(all_parsers):
+ parser = all_parsers
+ data = """\
+ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
+
+ result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]})
+ expected = DataFrame([
+ [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00",
+ 0.81, 2.81, 7.2, 0.0, 280.0],
+ [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00",
+ 0.01, 2.21, 7.2, 0.0, 260.0],
+ [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00",
+ -0.59, 2.21, 5.7, 0.0, 280.0],
+ [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00",
+ -0.99, 2.01, 3.6, 0.0, 270.0],
+ [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00",
+ -0.59, 1.71, 5.1, 0.0, 290.0],
+ [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00",
+ -0.59, 1.71, 4.6, 0.0, 280.0],
+ ], columns=["nominal", "ID", "ActualTime", "TDew",
+ "TAir", "Windspeed", "Precip", "WindDir"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,parse_dates,msg", [
+ ("""\
+date_NominalTime,date,NominalTime
+KORD1,19990127, 19:00:00
+KORD2,19990127, 20:00:00""", [[1, 2]], ("New date column already "
+ "in dict date_NominalTime")),
+ ("""\
+ID,date,nominalTime
+KORD,19990127, 19:00:00
+KORD,19990127, 20:00:00""", dict(ID=[1, 2]), "Date column ID already in dict")
+])
+def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg):
+ parser = all_parsers
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), parse_dates=parse_dates)
+
+
+def test_date_parser_int_bug(all_parsers):
+ # see gh-3071
+ parser = all_parsers
+ data = ("posix_timestamp,elapsed,sys,user,queries,query_time,rows,"
+ "accountid,userid,contactid,level,silo,method\n"
+ "1343103150,0.062353,0,4,6,0.01690,3,"
+ "12345,1,-1,3,invoice_InvoiceResource,search\n")
+
+ result = parser.read_csv(
+ StringIO(data), index_col=0, parse_dates=[0],
+ date_parser=lambda x: datetime.utcfromtimestamp(int(x)))
+ expected = DataFrame([[0.062353, 0, 4, 6, 0.01690, 3, 12345, 1, -1,
+ 3, "invoice_InvoiceResource", "search"]],
+ columns=["elapsed", "sys", "user", "queries",
+ "query_time", "rows", "accountid",
+ "userid", "contactid", "level",
+ "silo", "method"],
+ index=Index([Timestamp("2012-07-24 04:12:30")],
+ name="posix_timestamp"))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_nat_parse(all_parsers):
+ # see gh-3062
+ parser = all_parsers
+ df = DataFrame(dict({"A": np.asarray(lrange(10), dtype="float64"),
+ "B": pd.Timestamp("20010101")}))
+ df.iloc[3:6, :] = np.nan
+
+ with tm.ensure_clean("__nat_parse_.csv") as path:
+ df.to_csv(path)
+
+ result = parser.read_csv(path, index_col=0, parse_dates=["B"])
+ tm.assert_frame_equal(result, df)
+
+
+def test_csv_custom_parser(all_parsers):
+ data = """A,B,C
+20090101,a,1,2
+20090102,b,3,4
+20090103,c,4,5
+"""
+ parser = all_parsers
+ result = parser.read_csv(
+ StringIO(data),
+ date_parser=lambda x: datetime.strptime(x, "%Y%m%d"))
+ expected = parser.read_csv(StringIO(data), parse_dates=True)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_parse_dates_implicit_first_col(all_parsers):
+ data = """A,B,C
+20090101,a,1,2
+20090102,b,3,4
+20090103,c,4,5
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), parse_dates=True)
+
+ expected = parser.read_csv(StringIO(data), index_col=0,
+ parse_dates=True)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_parse_dates_string(all_parsers):
+ data = """date,A,B,C
+20090101,a,1,2
+20090102,b,3,4
+20090103,c,4,5
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), index_col="date",
+ parse_dates=["date"])
+ index = date_range("1/1/2009", periods=3)
+ index.name = "date"
+
+ expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4],
+ "C": [2, 4, 5]}, index=index)
+ tm.assert_frame_equal(result, expected)
+
+
+# Bug in https://github.com/dateutil/dateutil/issues/217
+# has been addressed, but we just don't pass in the `yearfirst`
+@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*")
+@pytest.mark.parametrize("parse_dates", [
+ [["date", "time"]],
+ [[0, 1]]
+])
+def test_yy_format_with_year_first(all_parsers, parse_dates):
+ data = """date,time,B,C
+090131,0010,1,2
+090228,1020,3,4
+090331,0830,5,6
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), index_col=0,
+ parse_dates=parse_dates)
+ index = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
+ datetime(2009, 2, 28, 10, 20, 0),
+ datetime(2009, 3, 31, 8, 30, 0)],
+ dtype=object, name="date_time")
+ expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("parse_dates", [[0, 2], ["a", "c"]])
+def test_parse_dates_column_list(all_parsers, parse_dates):
+ data = "a,b,c\n01/01/2010,1,15/02/2010"
+ parser = all_parsers
+
+ expected = DataFrame({"a": [datetime(2010, 1, 1)], "b": [1],
+ "c": [datetime(2010, 2, 15)]})
+ expected = expected.set_index(["a", "b"])
+
+ result = parser.read_csv(StringIO(data), index_col=[0, 1],
+ parse_dates=parse_dates, dayfirst=True)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("index_col", [[0, 1], [1, 0]])
+def test_multi_index_parse_dates(all_parsers, index_col):
+ data = """index1,index2,A,B,C
+20090101,one,a,1,2
+20090101,two,b,3,4
+20090101,three,c,4,5
+20090102,one,a,1,2
+20090102,two,b,3,4
+20090102,three,c,4,5
+20090103,one,a,1,2
+20090103,two,b,3,4
+20090103,three,c,4,5
+"""
+ parser = all_parsers
+ index = MultiIndex.from_product([
+ (datetime(2009, 1, 1), datetime(2009, 1, 2),
+ datetime(2009, 1, 3)), ("one", "two", "three")],
+ names=["index1", "index2"])
+
+ # Out of order.
+ if index_col == [1, 0]:
+ index = index.swaplevel(0, 1)
+
+ expected = DataFrame([["a", 1, 2], ["b", 3, 4], ["c", 4, 5],
+ ["a", 1, 2], ["b", 3, 4], ["c", 4, 5],
+ ["a", 1, 2], ["b", 3, 4], ["c", 4, 5]],
+ columns=["A", "B", "C"], index=index)
+ result = parser.read_csv(StringIO(data), index_col=index_col,
+ parse_dates=True)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs", [
+ dict(dayfirst=True), dict(day_first=True)
+])
+def test_parse_dates_custom_euro_format(all_parsers, kwargs):
+ parser = all_parsers
+ data = """foo,bar,baz
+31/01/2010,1,2
+01/02/2010,1,NA
+02/02/2010,1,2
+"""
+ if "dayfirst" in kwargs:
+ df = parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
+ date_parser=lambda d: parse_date(d, **kwargs),
+ header=0, index_col=0, parse_dates=True,
+ na_values=["NA"])
+ exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1),
+ datetime(2010, 2, 2)], name="time")
+ expected = DataFrame({"Q": [1, 1, 1], "NTU": [2, np.nan, 2]},
+ index=exp_index, columns=["Q", "NTU"])
+ tm.assert_frame_equal(df, expected)
+ else:
+ msg = "got an unexpected keyword argument 'day_first'"
+ with pytest.raises(TypeError, match=msg):
+ parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
+ date_parser=lambda d: parse_date(d, **kwargs),
+ skiprows=[0], index_col=0, parse_dates=True,
+ na_values=["NA"])
+
+
+def test_parse_tz_aware(all_parsers):
+ # See gh-1693
+ parser = all_parsers
+ data = "Date,x\n2012-06-13T01:39:00Z,0.5"
+
+ result = parser.read_csv(StringIO(data), index_col=0,
+ parse_dates=True)
+ expected = DataFrame({"x": [0.5]}, index=Index([Timestamp(
+ "2012-06-13 01:39:00+00:00")], name="Date"))
+ tm.assert_frame_equal(result, expected)
+ assert result.index.tz is pytz.utc
+
+
[email protected]("parse_dates,index_col", [
+ ({"nominal": [1, 2]}, "nominal"),
+ ({"nominal": [1, 2]}, 0),
+ ([[1, 2]], 0),
+])
+def test_multiple_date_cols_index(all_parsers, parse_dates, index_col):
+ parser = all_parsers
+ data = """
+ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
+KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+ expected = DataFrame([
+ [datetime(1999, 1, 27, 19, 0), "KORD1", " 18:56:00",
+ 0.81, 2.81, 7.2, 0.0, 280.0],
+ [datetime(1999, 1, 27, 20, 0), "KORD2", " 19:56:00",
+ 0.01, 2.21, 7.2, 0.0, 260.0],
+ [datetime(1999, 1, 27, 21, 0), "KORD3", " 20:56:00",
+ -0.59, 2.21, 5.7, 0.0, 280.0],
+ [datetime(1999, 1, 27, 21, 0), "KORD4", " 21:18:00",
+ -0.99, 2.01, 3.6, 0.0, 270.0],
+ [datetime(1999, 1, 27, 22, 0), "KORD5", " 21:56:00",
+ -0.59, 1.71, 5.1, 0.0, 290.0],
+ [datetime(1999, 1, 27, 23, 0), "KORD6", " 22:56:00",
+ -0.59, 1.71, 4.6, 0.0, 280.0],
+ ], columns=["nominal", "ID", "ActualTime", "TDew",
+ "TAir", "Windspeed", "Precip", "WindDir"])
+ expected = expected.set_index("nominal")
+
+ if not isinstance(parse_dates, dict):
+ expected.index.name = "date_NominalTime"
+
+ result = parser.read_csv(StringIO(data), parse_dates=parse_dates,
+ index_col=index_col)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_multiple_date_cols_chunked(all_parsers):
+ parser = all_parsers
+ data = """\
+ID,date,nominalTime,actualTime,A,B,C,D,E
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+
+ expected = DataFrame([
+ [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00",
+ 0.81, 2.81, 7.2, 0.0, 280.0],
+ [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00",
+ 0.01, 2.21, 7.2, 0.0, 260.0],
+ [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00",
+ -0.59, 2.21, 5.7, 0.0, 280.0],
+ [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00",
+ -0.99, 2.01, 3.6, 0.0, 270.0],
+ [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00",
+ -0.59, 1.71, 5.1, 0.0, 290.0],
+ [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00",
+ -0.59, 1.71, 4.6, 0.0, 280.0],
+ ], columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"])
+ expected = expected.set_index("nominal")
+
+ reader = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]},
+ index_col="nominal", chunksize=2)
+ chunks = list(reader)
+
+ tm.assert_frame_equal(chunks[0], expected[:2])
+ tm.assert_frame_equal(chunks[1], expected[2:4])
+ tm.assert_frame_equal(chunks[2], expected[4:])
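+
+ # For reference: when chunksize is given, read_csv returns a
+ # TextFileReader rather than a DataFrame. Iterating it (as above) or
+ # calling get_chunk() yields successive DataFrames of at most
+ # `chunksize` rows, e.g.
+ #
+ #     reader = parser.read_csv(StringIO(data), chunksize=2, ...)
+ #     first = reader.get_chunk()  # rows 0-1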
+
+
+def test_multiple_date_col_named_index_compat(all_parsers):
+ parser = all_parsers
+ data = """\
+ID,date,nominalTime,actualTime,A,B,C,D,E
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+
+ with_indices = parser.read_csv(StringIO(data),
+ parse_dates={"nominal": [1, 2]},
+ index_col="nominal")
+ with_names = parser.read_csv(StringIO(data), index_col="nominal",
+ parse_dates={"nominal": [
+ "date", "nominalTime"]})
+ tm.assert_frame_equal(with_indices, with_names)
+
+
+def test_multiple_date_col_multiple_index_compat(all_parsers):
+ parser = all_parsers
+ data = """\
+ID,date,nominalTime,actualTime,A,B,C,D,E
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+ result = parser.read_csv(StringIO(data), index_col=["nominal", "ID"],
+ parse_dates={"nominal": [1, 2]})
+ expected = parser.read_csv(StringIO(data),
+ parse_dates={"nominal": [1, 2]})
+
+ expected = expected.set_index(["nominal", "ID"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs", [dict(), dict(index_col="C")])
+def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs):
+ # see gh-5636
+ parser = all_parsers
+ msg = ("Only booleans, lists, and dictionaries "
+ "are accepted for the 'parse_dates' parameter")
+ data = """A,B,C
+ 1,2,2003-11-1"""
+
+ with pytest.raises(TypeError, match=msg):
+ parser.read_csv(StringIO(data), parse_dates="C", **kwargs)
+
+
[email protected]("parse_dates", [
+ (1,), np.array([4, 5]), {1, 3, 3}
+])
+def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
+ parser = all_parsers
+ msg = ("Only booleans, lists, and dictionaries "
+ "are accepted for the 'parse_dates' parameter")
+ data = """A,B,C
+ 1,2,2003-11-1"""
+
+ with pytest.raises(TypeError, match=msg):
+ parser.read_csv(StringIO(data), parse_dates=parse_dates)
+
+
+def test_parse_dates_empty_string(all_parsers):
+ # see gh-2263
+ parser = all_parsers
+ data = "Date,test\n2012-01-01,1\n,2"
+ result = parser.read_csv(StringIO(data), parse_dates=["Date"],
+ na_filter=False)
+
+ expected = DataFrame([[datetime(2012, 1, 1), 1], [pd.NaT, 2]],
+ columns=["Date", "test"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,kwargs,expected", [
+ ("a\n04.15.2016", dict(parse_dates=["a"]),
+ DataFrame([datetime(2016, 4, 15)], columns=["a"])),
+ ("a\n04.15.2016", dict(parse_dates=True, index_col=0),
+ DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"))),
+ ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=["a", "b"]),
+ DataFrame([[datetime(2016, 4, 15), datetime(2013, 9, 16)]],
+ columns=["a", "b"])),
+ ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=True, index_col=[0, 1]),
+ DataFrame(index=MultiIndex.from_tuples(
+ [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"]))),
+])
+def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
+ # see gh-14066
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data), thousands=".", **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_parse_date_time_multi_level_column_name(all_parsers):
+ data = """\
+D,T,A,B
+date, time,a,b
+2001-01-05, 09:00:00, 0.0, 10.
+2001-01-06, 00:00:00, 1.0, 11.
+"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), header=[0, 1],
+ parse_dates={"date_time": [0, 1]},
+ date_parser=conv.parse_date_time)
+
+ expected_data = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.],
+ [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]]
+ expected = DataFrame(expected_data,
+ columns=["date_time", ("A", "a"), ("B", "b")])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,kwargs,expected", [
+ ("""\
+date,time,a,b
+2001-01-05, 10:00:00, 0.0, 10.
+2001-01-05, 00:00:00, 1., 11.
+""", dict(header=0, parse_dates={"date_time": [0, 1]}),
+ DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10],
+ [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0]],
+ columns=["date_time", "a", "b"])),
+ (("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
+ "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
+ "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
+ "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
+ "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
+ "KORD,19990127, 23:00:00, 22:56:00, -0.5900"),
+ dict(header=None, parse_dates={"actual": [1, 2], "nominal": [1, 3]}),
+ DataFrame([
+ [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
+ "KORD", 0.81],
+ [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
+ "KORD", 0.01],
+ [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
+ "KORD", -0.59],
+ [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
+ "KORD", -0.99],
+ [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
+ "KORD", -0.59],
+ [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
+ "KORD", -0.59]], columns=["actual", "nominal", 0, 4])),
+])
+def test_parse_date_time(all_parsers, data, kwargs, expected):
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time,
+ **kwargs)
+
+ # Python can sometimes be flaky about how
+ # the aggregated columns are entered, so
+ # this standardizes the order.
+ result = result[expected.columns]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_parse_date_fields(all_parsers):
+ parser = all_parsers
+ data = ("year,month,day,a\n2001,01,10,10.\n"
+ "2001,02,1,11.")
+ result = parser.read_csv(StringIO(data), header=0,
+ parse_dates={"ymd": [0, 1, 2]},
+ date_parser=conv.parse_date_fields)
+
+ expected = DataFrame([[datetime(2001, 1, 10), 10.],
+ [datetime(2001, 2, 1), 11.]], columns=["ymd", "a"])
+ tm.assert_frame_equal(result, expected)
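+
+ # Judging from its use here, conv.parse_date_fields combines the
+ # year/month/day columns element-wise into datetimes, roughly:
+ #
+ #     parse_date_fields(["2001"], ["01"], ["10"])
+ #     # -> array containing datetime(2001, 1, 10)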
+
+
+def test_parse_date_all_fields(all_parsers):
+ parser = all_parsers
+ data = """\
+year,month,day,hour,minute,second,a,b
+2001,01,05,10,00,0,0.0,10.
+2001,01,5,10,0,00,1.,11.
+"""
+ result = parser.read_csv(StringIO(data), header=0,
+ date_parser=conv.parse_all_fields,
+ parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]})
+ expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0],
+ [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0]],
+ columns=["ymdHMS", "a", "b"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_datetime_fractional_seconds(all_parsers):
+ parser = all_parsers
+ data = """\
+year,month,day,hour,minute,second,a,b
+2001,01,05,10,00,0.123456,0.0,10.
+2001,01,5,10,0,0.500000,1.,11.
+"""
+ result = parser.read_csv(StringIO(data), header=0,
+ date_parser=conv.parse_all_fields,
+ parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]})
+ expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0,
+ microsecond=123456), 0.0, 10.0],
+ [datetime(2001, 1, 5, 10, 0, 0,
+ microsecond=500000), 1.0, 11.0]],
+ columns=["ymdHMS", "a", "b"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_generic(all_parsers):
+ parser = all_parsers
+ data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
+
+ result = parser.read_csv(StringIO(data), header=0,
+ parse_dates={"ym": [0, 1]},
+ date_parser=lambda y, m: date(year=int(y),
+ month=int(m),
+ day=1))
+ expected = DataFrame([[date(2001, 1, 1), 10, 10.],
+ [date(2001, 2, 1), 1, 11.]],
+ columns=["ym", "day", "a"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_date_parser_resolution_if_not_ns(all_parsers):
+ # see gh-10245
+ parser = all_parsers
+ data = """\
+date,time,prn,rxstatus
+2013-11-03,19:00:00,126,00E80000
+2013-11-03,19:00:00,23,00E80000
+2013-11-03,19:00:00,13,00E80000
+"""
+
+ def date_parser(dt, time):
+ return np_array_datetime64_compat(dt + "T" + time + "Z",
+ dtype="datetime64[s]")
+
+ result = parser.read_csv(StringIO(data), date_parser=date_parser,
+ parse_dates={"datetime": ["date", "time"]},
+ index_col=["datetime", "prn"])
+
+ datetimes = np_array_datetime64_compat(["2013-11-03T19:00:00Z"] * 3,
+ dtype="datetime64[s]")
+ expected = DataFrame(data={"rxstatus": ["00E80000"] * 3},
+ index=MultiIndex.from_tuples(
+ [(datetimes[0], 126), (datetimes[1], 23),
+ (datetimes[2], 13)], names=["datetime", "prn"]))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_parse_date_column_with_empty_string(all_parsers):
+ # see gh-6428
+ parser = all_parsers
+ data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, "
+ result = parser.read_csv(StringIO(data), parse_dates=["opdate"])
+
+ expected_data = [[7, "10/18/2006"],
+ [7, "10/18/2008"],
+ [621, " "]]
+ expected = DataFrame(expected_data, columns=["case", "opdate"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,expected", [
+ ("a\n135217135789158401\n1352171357E+5",
+ DataFrame({"a": [135217135789158401,
+ 135217135700000]}, dtype="float64")),
+ ("a\n99999999999\n123456789012345\n1234E+0",
+ DataFrame({"a": [99999999999,
+ 123456789012345,
+ 1234]}, dtype="float64"))
+])
[email protected]("parse_dates", [True, False])
+def test_parse_date_float(all_parsers, data, expected, parse_dates):
+ # see gh-2697
+ #
+ # Date parsing should fail, so we leave the data untouched
+ # (i.e. float precision should remain unchanged).
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data), parse_dates=parse_dates)
+ tm.assert_frame_equal(result, expected)
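+
+ # The float64 dtype is expected because values such as "1352171357E+5"
+ # and "1234E+0" only parse as floats; once date parsing fails, the
+ # usual numeric inference applies to the whole column.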
+
+
+def test_parse_timezone(all_parsers):
+ # see gh-22256
+ parser = all_parsers
+ data = """dt,val
+ 2018-01-04 09:01:00+09:00,23350
+ 2018-01-04 09:02:00+09:00,23400
+ 2018-01-04 09:03:00+09:00,23400
+ 2018-01-04 09:04:00+09:00,23400
+ 2018-01-04 09:05:00+09:00,23400"""
+ result = parser.read_csv(StringIO(data), parse_dates=["dt"])
+
+ dti = pd.date_range(start="2018-01-04 09:01:00",
+ end="2018-01-04 09:05:00", freq="1min",
+ tz=pytz.FixedOffset(540))
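+ # pytz.FixedOffset takes minutes east of UTC, so FixedOffset(540)
+ # matches the +09:00 offsets in the input (540 minutes == 9 hours).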
+ expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]}
+
+ expected = DataFrame(expected_data)
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_python_parser_only.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_python_parser_only.py
new file mode 100644
index 00000000000..c2edff258f1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_python_parser_only.py
@@ -0,0 +1,301 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that apply specifically to the Python parser. Unless specifically
+stated as a Python-specific issue, the goal is to move as many of these
+tests as possible out of this module once the C parser can accept further
+arguments when parsing.
+"""
+
+import csv
+
+import pytest
+
+import pandas.compat as compat
+from pandas.compat import BytesIO, StringIO, u
+from pandas.errors import ParserError
+
+from pandas import DataFrame, Index, MultiIndex
+import pandas.util.testing as tm
+
+
+def test_default_separator(python_parser_only):
+ # see gh-17333
+ #
+ # csv.Sniffer in Python treats "o" as separator.
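+ # A quick stdlib check of that behavior (illustrative):
+ #
+ #     import csv
+ #     csv.Sniffer().sniff("aob\n1o2\n3o4").delimiter  # -> "o"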
+ data = "aob\n1o2\n3o4"
+ parser = python_parser_only
+ expected = DataFrame({"a": [1, 3], "b": [2, 4]})
+
+ result = parser.read_csv(StringIO(data), sep=None)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("skipfooter", ["foo", 1.5, True])
+def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
+ # see gh-15925 (comment)
+ data = "a\n1\n2"
+ parser = python_parser_only
+ msg = "skipfooter must be an integer"
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), skipfooter=skipfooter)
+
+
+def test_invalid_skipfooter_negative(python_parser_only):
+ # see gh-15925 (comment)
+ data = "a\n1\n2"
+ parser = python_parser_only
+ msg = "skipfooter cannot be negative"
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), skipfooter=-1)
+
+
[email protected]("kwargs", [
+ dict(sep=None),
+ dict(delimiter="|")
+])
+def test_sniff_delimiter(python_parser_only, kwargs):
+ data = """index|A|B|C
+foo|1|2|3
+bar|4|5|6
+baz|7|8|9
+"""
+ parser = python_parser_only
+ result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
+ expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ columns=["A", "B", "C"],
+ index=Index(["foo", "bar", "baz"], name="index"))
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("encoding", [None, "utf-8"])
+def test_sniff_delimiter_encoding(python_parser_only, encoding):
+ parser = python_parser_only
+ data = """ignore this
+ignore this too
+index|A|B|C
+foo|1|2|3
+bar|4|5|6
+baz|7|8|9
+"""
+
+ if encoding is not None:
+ data = u(data).encode(encoding)
+ data = BytesIO(data)
+
+ if compat.PY3:
+ from io import TextIOWrapper
+ data = TextIOWrapper(data, encoding=encoding)
+ else:
+ data = StringIO(data)
+
+ result = parser.read_csv(data, index_col=0, sep=None,
+ skiprows=2, encoding=encoding)
+ expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ columns=["A", "B", "C"],
+ index=Index(["foo", "bar", "baz"], name="index"))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_single_line(python_parser_only):
+ # see gh-6607: sniff separator
+ parser = python_parser_only
+ result = parser.read_csv(StringIO("1,2"), names=["a", "b"],
+ header=None, sep=None)
+
+ expected = DataFrame({"a": [1], "b": [2]})
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs", [dict(skipfooter=2), dict(nrows=3)])
+def test_skipfooter(python_parser_only, kwargs):
+ # see gh-6607
+ data = """A,B,C
+1,2,3
+4,5,6
+7,8,9
+want to skip this
+also also skip this
+"""
+ parser = python_parser_only
+ result = parser.read_csv(StringIO(data), **kwargs)
+
+ expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ columns=["A", "B", "C"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("compression,klass", [
+ ("gzip", "GzipFile"),
+ ("bz2", "BZ2File"),
+])
+def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
+ # see gh-6607
+ parser = python_parser_only
+
+ with open(csv1, "rb") as f:
+ data = f.read()
+
+ data = data.replace(b",", b"::")
+ expected = parser.read_csv(csv1)
+
+ module = pytest.importorskip(compression)
+ klass = getattr(module, klass)
+
+ with tm.ensure_clean() as path:
+ tmp = klass(path, mode="wb")
+ tmp.write(data)
+ tmp.close()
+
+ result = parser.read_csv(path, sep="::",
+ compression=compression)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_read_csv_buglet_4x_multi_index(python_parser_only):
+ # see gh-6607
+ data = """ A B C D E
+one two three four
+a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
+a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
+x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
+ parser = python_parser_only
+
+ expected = DataFrame([[-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
+ [0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
+ [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838]],
+ columns=["A", "B", "C", "D", "E"],
+ index=MultiIndex.from_tuples([
+ ("a", "b", 10.0032, 5),
+ ("a", "q", 20, 4),
+ ("x", "q", 30, 3),
+ ], names=["one", "two", "three", "four"]))
+ result = parser.read_csv(StringIO(data), sep=r"\s+")
+ tm.assert_frame_equal(result, expected)
+
+
+def test_read_csv_buglet_4x_multi_index2(python_parser_only):
+ # see gh-6893
+ data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"
+ parser = python_parser_only
+
+ expected = DataFrame.from_records(
+ [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
+ columns=list("abcABC"), index=list("abc"))
+ result = parser.read_csv(StringIO(data), sep=r"\s+")
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("add_footer", [True, False])
+def test_skipfooter_with_decimal(python_parser_only, add_footer):
+ # see gh-6971
+ data = "1#2\n3#4"
+ parser = python_parser_only
+ expected = DataFrame({"a": [1.2, 3.4]})
+
+ if add_footer:
+ # The stray footer line should not mess with the
+ # casting of the first two lines if we skip it.
+ kwargs = dict(skipfooter=1)
+ data += "\nFooter"
+ else:
+ kwargs = dict()
+
+ result = parser.read_csv(StringIO(data), names=["a"],
+ decimal="#", **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("sep", ["::", "#####", "!!!", "123", "#1!c5",
+ "%!c!d", "@@#4:2", "_!pd#_"])
[email protected]("encoding", ["utf-16", "utf-16-be", "utf-16-le",
+ "utf-32", "cp037"])
+def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
+ # see gh-3404
+ expected = DataFrame({"a": [1], "b": [2]})
+ parser = python_parser_only
+
+ data = "1" + sep + "2"
+ encoded_data = data.encode(encoding)
+
+ result = parser.read_csv(BytesIO(encoded_data), sep=sep,
+ names=["a", "b"], encoding=encoding)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
+def test_multi_char_sep_quotes(python_parser_only, quoting):
+ # see gh-13374
+ kwargs = dict(sep=",,")
+ parser = python_parser_only
+
+ data = 'a,,b\n1,,a\n2,,"2,,b"'
+ msg = "ignored when a multi-char delimiter is used"
+
+ def fail_read():
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
+
+ if quoting == csv.QUOTE_NONE:
+ # We expect no match, so there should be an assertion
+ # error out of the inner context manager.
+ with pytest.raises(AssertionError):
+ fail_read()
+ else:
+ fail_read()
+
+
+def test_none_delimiter(python_parser_only, capsys):
+ # see gh-13374 and gh-17465
+ parser = python_parser_only
+ data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
+ expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})
+
+ # We expect the third line in the data to be
+ # skipped because it is malformed, but we do
+ # not expect any errors to occur.
+ result = parser.read_csv(StringIO(data), header=0,
+ sep=None, warn_bad_lines=True,
+ error_bad_lines=False)
+ tm.assert_frame_equal(result, expected)
+
+ captured = capsys.readouterr()
+ assert "Skipping line 3" in captured.err
+
+
+ 'a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
[email protected]("skipfooter", [0, 1])
+def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
+ # see gh-13879 and gh-15910
+ msg = "parsing errors in the skipped footer rows"
+ parser = python_parser_only
+
+ def fail_read():
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(data), skipfooter=skipfooter)
+
+ if skipfooter:
+ fail_read()
+ else:
+ # We expect no match, so there should be an assertion
+ # error out of the inner context manager.
+ with pytest.raises(AssertionError):
+ fail_read()
+
+
+def test_malformed_skipfooter(python_parser_only):
+ parser = python_parser_only
+ data = """ignore
+A,B,C
+1,2,3 # comment
+1,2,3,4,5
+2,3,4
+footer
+"""
+ msg = "Expected 3 fields in line 4, saw 5"
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(data), header=1,
+ comment="#", skipfooter=1)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_quoting.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_quoting.py
new file mode 100644
index 00000000000..b33a1b8448b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_quoting.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that quoting specifications are properly handled
+during parsing for all of the parsers defined in parsers.py
+"""
+
+import csv
+
+import pytest
+
+from pandas.compat import PY2, StringIO, u
+from pandas.errors import ParserError
+
+from pandas import DataFrame
+import pandas.util.testing as tm
+
+
[email protected]("kwargs,msg", [
+ (dict(quotechar="foo"), '"quotechar" must be a(n)? 1-character string'),
+ (dict(quotechar=None, quoting=csv.QUOTE_MINIMAL),
+ "quotechar must be set if quoting enabled"),
+ (dict(quotechar=2), '"quotechar" must be string, not int')
+])
+def test_bad_quote_char(all_parsers, kwargs, msg):
+ data = "1,2,3"
+ parser = all_parsers
+
+ with pytest.raises(TypeError, match=msg):
+ parser.read_csv(StringIO(data), **kwargs)
+
+
[email protected]("quoting,msg", [
+ ("foo", '"quoting" must be an integer'),
+ (5, 'bad "quoting" value'), # quoting must be in the range [0, 3]
+])
+def test_bad_quoting(all_parsers, quoting, msg):
+ data = "1,2,3"
+ parser = all_parsers
+
+ with pytest.raises(TypeError, match=msg):
+ parser.read_csv(StringIO(data), quoting=quoting)
+
+
+def test_quote_char_basic(all_parsers):
+ parser = all_parsers
+ data = 'a,b,c\n1,2,"cat"'
+ expected = DataFrame([[1, 2, "cat"]],
+ columns=["a", "b", "c"])
+
+ result = parser.read_csv(StringIO(data), quotechar='"')
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("quote_char", ["~", "*", "%", "$", "@", "P"])
+def test_quote_char_various(all_parsers, quote_char):
+ parser = all_parsers
+ expected = DataFrame([[1, 2, "cat"]],
+ columns=["a", "b", "c"])
+
+ data = 'a,b,c\n1,2,"cat"'
+ new_data = data.replace('"', quote_char)
+
+ result = parser.read_csv(StringIO(new_data), quotechar=quote_char)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
[email protected]("quote_char", ["", None])
+def test_null_quote_char(all_parsers, quoting, quote_char):
+ kwargs = dict(quotechar=quote_char, quoting=quoting)
+ data = "a,b,c\n1,2,3"
+ parser = all_parsers
+
+ if quoting != csv.QUOTE_NONE:
+ # Sanity checking.
+ msg = "quotechar must be set if quoting enabled"
+
+ with pytest.raises(TypeError, match=msg):
+ parser.read_csv(StringIO(data), **kwargs)
+ else:
+ expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
+ result = parser.read_csv(StringIO(data), **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs,exp_data", [
+ (dict(), [[1, 2, "foo"]]), # Test default.
+
+ # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
+ (dict(quotechar='"', quoting=csv.QUOTE_MINIMAL), [[1, 2, "foo"]]),
+
+ # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
+ (dict(quotechar='"', quoting=csv.QUOTE_ALL), [[1, 2, "foo"]]),
+
+ # QUOTE_NONE tells the reader to do no special handling
+ # of quote characters and leave them alone.
+ (dict(quotechar='"', quoting=csv.QUOTE_NONE), [[1, 2, '"foo"']]),
+
+ # QUOTE_NONNUMERIC tells the reader to cast
+ # all non-quoted fields to float
+ (dict(quotechar='"', quoting=csv.QUOTE_NONNUMERIC), [[1.0, 2.0, "foo"]])
+])
+def test_quoting_various(all_parsers, kwargs, exp_data):
+ data = '1,2,"foo"'
+ parser = all_parsers
+ columns = ["a", "b", "c"]
+
+ result = parser.read_csv(StringIO(data), names=columns, **kwargs)
+ expected = DataFrame(exp_data, columns=columns)
+ tm.assert_frame_equal(result, expected)
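+
+ # These expectations mirror the stdlib csv module; for instance, with
+ # QUOTE_NONNUMERIC every unquoted field is cast to float:
+ #
+ #     import csv
+ #     next(csv.reader(['1,2,"foo"'], quoting=csv.QUOTE_NONNUMERIC))
+ #     # -> [1.0, 2.0, 'foo']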
+
+
[email protected]("doublequote,exp_data", [
+ (True, [[3, '4 " 5']]),
+ (False, [[3, '4 " 5"']]),
+])
+def test_double_quote(all_parsers, doublequote, exp_data):
+ parser = all_parsers
+ data = 'a,b\n3,"4 "" 5"'
+
+ result = parser.read_csv(StringIO(data), quotechar='"',
+ doublequote=doublequote)
+ expected = DataFrame(exp_data, columns=["a", "b"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("quotechar", [
+ u('"'),
+ pytest.param(u('\u0001'), marks=pytest.mark.skipif(
+ PY2, reason="Python 2.x does not handle unicode well."))])
+def test_quotechar_unicode(all_parsers, quotechar):
+ # see gh-14477
+ data = "a\n1"
+ parser = all_parsers
+ expected = DataFrame({"a": [1]})
+
+ result = parser.read_csv(StringIO(data), quotechar=quotechar)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("balanced", [True, False])
+def test_unbalanced_quoting(all_parsers, balanced):
+ # see gh-22789.
+ parser = all_parsers
+ data = "a,b,c\n1,2,\"3"
+
+ if balanced:
+ # Re-balance the quoting and read in without errors.
+ expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
+ result = parser.read_csv(StringIO(data + '"'))
+ tm.assert_frame_equal(result, expected)
+ else:
+ msg = ("EOF inside string starting at row 1" if parser.engine == "c"
+ else "unexpected end of data")
+
+ with pytest.raises(ParserError, match=msg):
+ parser.read_csv(StringIO(data))
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_read_fwf.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_read_fwf.py
new file mode 100644
index 00000000000..172bbe0bad4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_read_fwf.py
@@ -0,0 +1,580 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests the 'read_fwf' function in parsers.py. This
+test suite is independent of the others because the
+engine is set to 'python-fwf' internally.
+"""
+
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+import pandas.compat as compat
+from pandas.compat import BytesIO, StringIO
+
+import pandas as pd
+from pandas import DataFrame, DatetimeIndex
+import pandas.util.testing as tm
+
+from pandas.io.parsers import EmptyDataError, read_csv, read_fwf
+
+
+def test_basic():
+ data = """\
+A B C D
+201158 360.242940 149.910199 11950.7
+201159 444.953632 166.985655 11788.4
+201160 364.136849 183.628767 11806.2
+201161 413.836124 184.375703 11916.8
+201162 502.953953 173.237159 12468.3
+"""
+ result = read_fwf(StringIO(data))
+ expected = DataFrame([[201158, 360.242940, 149.910199, 11950.7],
+ [201159, 444.953632, 166.985655, 11788.4],
+ [201160, 364.136849, 183.628767, 11806.2],
+ [201161, 413.836124, 184.375703, 11916.8],
+ [201162, 502.953953, 173.237159, 12468.3]],
+ columns=["A", "B", "C", "D"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_colspecs():
+ data = """\
+A B C D E
+201158 360.242940 149.910199 11950.7
+201159 444.953632 166.985655 11788.4
+201160 364.136849 183.628767 11806.2
+201161 413.836124 184.375703 11916.8
+201162 502.953953 173.237159 12468.3
+"""
+ colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
+ result = read_fwf(StringIO(data), colspecs=colspecs)
+
+ expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7],
+ [2011, 59, 444.953632, 166.985655, 11788.4],
+ [2011, 60, 364.136849, 183.628767, 11806.2],
+ [2011, 61, 413.836124, 184.375703, 11916.8],
+ [2011, 62, 502.953953, 173.237159, 12468.3]],
+ columns=["A", "B", "C", "D", "E"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_widths():
+ data = """\
+A B C D E
+2011 58 360.242940 149.910199 11950.7
+2011 59 444.953632 166.985655 11788.4
+2011 60 364.136849 183.628767 11806.2
+2011 61 413.836124 184.375703 11916.8
+2011 62 502.953953 173.237159 12468.3
+"""
+ result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7])
+
+ expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7],
+ [2011, 59, 444.953632, 166.985655, 11788.4],
+ [2011, 60, 364.136849, 183.628767, 11806.2],
+ [2011, 61, 413.836124, 184.375703, 11916.8],
+ [2011, 62, 502.953953, 173.237159, 12468.3]],
+ columns=["A", "B", "C", "D", "E"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_non_space_filler():
+ # From Thomas Kluyver:
+ #
+ # Apparently, some files use non-space filler characters; this is
+ # supported by specifying the 'delimiter' character:
+ #
+ # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
+ data = """\
+A~~~~B~~~~C~~~~~~~~~~~~D~~~~~~~~~~~~E
+201158~~~~360.242940~~~149.910199~~~11950.7
+201159~~~~444.953632~~~166.985655~~~11788.4
+201160~~~~364.136849~~~183.628767~~~11806.2
+201161~~~~413.836124~~~184.375703~~~11916.8
+201162~~~~502.953953~~~173.237159~~~12468.3
+"""
+ colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
+ result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~")
+
+ expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7],
+ [2011, 59, 444.953632, 166.985655, 11788.4],
+ [2011, 60, 364.136849, 183.628767, 11806.2],
+ [2011, 61, 413.836124, 184.375703, 11916.8],
+ [2011, 62, 502.953953, 173.237159, 12468.3]],
+ columns=["A", "B", "C", "D", "E"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_over_specified():
+ data = """\
+A B C D E
+201158 360.242940 149.910199 11950.7
+201159 444.953632 166.985655 11788.4
+201160 364.136849 183.628767 11806.2
+201161 413.836124 184.375703 11916.8
+201162 502.953953 173.237159 12468.3
+"""
+ colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
+
+ with pytest.raises(ValueError, match="must specify only one of"):
+ read_fwf(StringIO(data), colspecs=colspecs, widths=[6, 10, 10, 7])
+
+
+def test_under_specified():
+ data = """\
+A B C D E
+201158 360.242940 149.910199 11950.7
+201159 444.953632 166.985655 11788.4
+201160 364.136849 183.628767 11806.2
+201161 413.836124 184.375703 11916.8
+201162 502.953953 173.237159 12468.3
+"""
+ with pytest.raises(ValueError, match="Must specify either"):
+ read_fwf(StringIO(data), colspecs=None, widths=None)
+
+
+def test_read_csv_compat():
+ csv_data = """\
+A,B,C,D,E
+2011,58,360.242940,149.910199,11950.7
+2011,59,444.953632,166.985655,11788.4
+2011,60,364.136849,183.628767,11806.2
+2011,61,413.836124,184.375703,11916.8
+2011,62,502.953953,173.237159,12468.3
+"""
+ expected = read_csv(StringIO(csv_data), engine="python")
+
+ fwf_data = """\
+A B C D E
+201158 360.242940 149.910199 11950.7
+201159 444.953632 166.985655 11788.4
+201160 364.136849 183.628767 11806.2
+201161 413.836124 184.375703 11916.8
+201162 502.953953 173.237159 12468.3
+"""
+ colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
+ result = read_fwf(StringIO(fwf_data), colspecs=colspecs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_bytes_io_input():
+ if not compat.PY3:
+ pytest.skip("Bytes-related test - only needs to work on Python 3")
+
+ result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')),
+ widths=[2, 2], encoding="utf8")
+ expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_fwf_colspecs_is_list_or_tuple():
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+
+ msg = "column specifications must be a list or tuple.+"
+
+ with pytest.raises(TypeError, match=msg):
+ read_fwf(StringIO(data), colspecs={"a": 1}, delimiter=",")
+
+
+def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples():
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+
+ msg = "Each column specification must be.+"
+
+ with pytest.raises(TypeError, match=msg):
+ read_fwf(StringIO(data), [("a", 1)])
+
+
[email protected]("colspecs,exp_data", [
+ ([(0, 3), (3, None)], [[123, 456], [456, 789]]),
+ ([(None, 3), (3, 6)], [[123, 456], [456, 789]]),
+ ([(0, None), (3, None)], [[123456, 456], [456789, 789]]),
+ ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]),
+])
+def test_fwf_colspecs_none(colspecs, exp_data):
+ # see gh-7079
+ data = """\
+123456
+456789
+"""
+ expected = DataFrame(exp_data)
+
+ result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("infer_nrows,exp_data", [
+ # infer_nrows --> colspec == [(2, 3), (5, 6)]
+ (1, [[1, 2], [3, 8]]),
+
+ # infer_nrows > number of rows
+ (10, [[1, 2], [123, 98]]),
+])
+def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data):
+ # see gh-15138
+ data = """\
+ 1 2
+123 98
+"""
+ expected = DataFrame(exp_data)
+
+ result = read_fwf(StringIO(data), infer_nrows=infer_nrows, header=None)
+ tm.assert_frame_equal(result, expected)
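+
+ # Worked example for infer_nrows=1: only the first data row " 1 2" is
+ # used to infer column widths, giving colspecs [(2, 3), (5, 6)]; applied
+ # to the second row "123 98", those slices pick out "3" and "8", which
+ # is why that case expects [[1, 2], [3, 8]].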
+
+
+def test_fwf_regression():
+ # see gh-3594
+ #
+ # Turns out "T060" is parsable as a datetime slice!
+ tz_list = [1, 10, 20, 30, 60, 80, 100]
+ widths = [16] + [8] * len(tz_list)
+ names = ["SST"] + ["T%03d" % z for z in tz_list[1:]]
+
+ data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192
+2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869
+2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657
+2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379
+2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039
+"""
+
+ result = read_fwf(StringIO(data), index_col=0, header=None, names=names,
+ widths=widths, parse_dates=True,
+ date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"))
+ expected = DataFrame([
+ [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192],
+ [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869],
+ [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657],
+ [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379],
+ [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039],
+ ], index=DatetimeIndex(["2009-06-13 20:20:00", "2009-06-13 20:30:00",
+ "2009-06-13 20:40:00", "2009-06-13 20:50:00",
+ "2009-06-13 21:00:00"]),
+ columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"])
+ tm.assert_frame_equal(result, expected)
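+
+ # "%Y%j%H%M%S" uses %j (day of year): for example,
+ # datetime.strptime("2009164202000", "%Y%j%H%M%S") gives
+ # 2009-06-13 20:20:00, since day 164 of 2009 is June 13.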
+
+
+def test_fwf_for_uint8():
+ data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127
+1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa
+ df = read_fwf(StringIO(data),
+ colspecs=[(0, 17), (25, 26), (33, 37),
+ (49, 51), (58, 62), (63, 1000)],
+ names=["time", "pri", "pgn", "dst", "src", "data"],
+ converters={
+ "pgn": lambda x: int(x, 16),
+ "src": lambda x: int(x, 16),
+ "dst": lambda x: int(x, 16),
+ "data": lambda x: len(x.split(" "))})
+
+ expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8],
+ [1421302964.226776, 6, 61442, None, 71, 8]],
+ columns=["time", "pri", "pgn",
+ "dst", "src", "data"])
+ expected["dst"] = expected["dst"].astype(object)
+ tm.assert_frame_equal(df, expected)
+
+
[email protected]("comment", ["#", "~", "!"])
+def test_fwf_comment(comment):
+ data = """\
+ 1 2. 4 #hello world
+ 5 NaN 10.0
+"""
+ data = data.replace("#", comment)
+
+ colspecs = [(0, 3), (4, 9), (9, 25)]
+ expected = DataFrame([[1, 2., 4], [5, np.nan, 10.]])
+
+ result = read_fwf(StringIO(data), colspecs=colspecs,
+ header=None, comment=comment)
+ tm.assert_almost_equal(result, expected)
+
+
[email protected]("thousands", [",", "#", "~"])
+def test_fwf_thousands(thousands):
+ data = """\
+ 1 2,334.0 5
+10 13 10.
+"""
+ data = data.replace(",", thousands)
+
+ colspecs = [(0, 3), (3, 11), (12, 16)]
+ expected = DataFrame([[1, 2334., 5], [10, 13, 10.]])
+
+ result = read_fwf(StringIO(data), header=None,
+ colspecs=colspecs, thousands=thousands)
+ tm.assert_almost_equal(result, expected)
+
+
[email protected]("header", [True, False])
+def test_bool_header_arg(header):
+ # see gh-6114
+ data = """\
+MyColumn
+ a
+ b
+ a
+ b"""
+
+ msg = "Passing a bool to header is invalid"
+ with pytest.raises(TypeError, match=msg):
+ read_fwf(StringIO(data), header=header)
+
+
+def test_full_file():
+ # File with all values.
+ test = """index A B C
+2000-01-03T00:00:00 0.980268513777 3 foo
+2000-01-04T00:00:00 1.04791624281 -4 bar
+2000-01-05T00:00:00 0.498580885705 73 baz
+2000-01-06T00:00:00 1.12020151869 1 foo
+2000-01-07T00:00:00 0.487094399463 0 bar
+2000-01-10T00:00:00 0.836648671666 2 baz
+2000-01-11T00:00:00 0.157160753327 34 foo"""
+ colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
+ expected = read_fwf(StringIO(test), colspecs=colspecs)
+
+ result = read_fwf(StringIO(test))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_full_file_with_missing():
+ # File with missing values.
+ test = """index A B C
+2000-01-03T00:00:00 0.980268513777 3 foo
+2000-01-04T00:00:00 1.04791624281 -4 bar
+ 0.498580885705 73 baz
+2000-01-06T00:00:00 1.12020151869 1 foo
+2000-01-07T00:00:00 0 bar
+2000-01-10T00:00:00 0.836648671666 2 baz
+ 34"""
+ colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
+ expected = read_fwf(StringIO(test), colspecs=colspecs)
+
+ result = read_fwf(StringIO(test))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_full_file_with_spaces():
+ # File with spaces in columns.
+ test = """
+Account Name Balance CreditLimit AccountCreated
+101 Keanu Reeves 9315.45 10000.00 1/17/1998
+312 Gerard Butler 90.00 1000.00 8/6/2003
+868 Jennifer Love Hewitt 0 17000.00 5/25/1985
+761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
+317 Bill Murray 789.65 5000.00 2/5/2007
+""".strip("\r\n")
+ colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
+ expected = read_fwf(StringIO(test), colspecs=colspecs)
+
+ result = read_fwf(StringIO(test))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_full_file_with_spaces_and_missing():
+ # File with spaces and missing values in columns.
+ test = """
+Account Name Balance CreditLimit AccountCreated
+101 10000.00 1/17/1998
+312 Gerard Butler 90.00 1000.00 8/6/2003
+868 5/25/1985
+761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
+317 Bill Murray 789.65
+""".strip("\r\n")
+ colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
+ expected = read_fwf(StringIO(test), colspecs=colspecs)
+
+ result = read_fwf(StringIO(test))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_messed_up_data():
+ # Completely messed up file.
+ test = """
+ Account Name Balance Credit Limit Account Created
+ 101 10000.00 1/17/1998
+ 312 Gerard Butler 90.00 1000.00
+
+ 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
+ 317 Bill Murray 789.65
+""".strip("\r\n")
+ colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
+ expected = read_fwf(StringIO(test), colspecs=colspecs)
+
+ result = read_fwf(StringIO(test))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_multiple_delimiters():
+ test = r"""
+col1~~~~~col2 col3++++++++++++++++++col4
+~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
+ 33+++122.33\\\bar.........Gerard Butler
+++44~~~~12.01 baz~~Jennifer Love Hewitt
+~~55 11+++foo++++Jada Pinkett-Smith
+..66++++++.03~~~bar Bill Murray
+""".strip("\r\n")
+ delimiter = " +~.\\"
+ colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
+ expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter)
+
+ result = read_fwf(StringIO(test), delimiter=delimiter)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_variable_width_unicode():
+ if not compat.PY3:
+ pytest.skip("Bytes-related test - only needs to work on Python 3")
+
+ data = """
+שלום שלום
+ום שלל
+של ום
+""".strip("\r\n")
+ encoding = "utf8"
+ kwargs = dict(header=None, encoding=encoding)
+
+ expected = read_fwf(BytesIO(data.encode(encoding)),
+ colspecs=[(0, 4), (5, 9)], **kwargs)
+ result = read_fwf(BytesIO(data.encode(encoding)), **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+ dict(), {"a": "float64", "b": str, "c": "int32"}
+])
+def test_dtype(dtype):
+ data = """ a b c
+1 2 3.2
+3 4 5.2
+"""
+ colspecs = [(0, 5), (5, 10), (10, None)]
+ result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype)
+
+ expected = pd.DataFrame({
+ "a": [1, 3], "b": [2, 4],
+ "c": [3.2, 5.2]}, columns=["a", "b", "c"])
+
+ for col, dt in dtype.items():
+ expected[col] = expected[col].astype(dt)
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_skiprows_inference():
+ # see gh-11256
+ data = """
+Text contained in the file header
+
+DataCol1 DataCol2
+ 0.0 1.0
+ 101.6 956.1
+""".strip()
+ skiprows = 2
+ expected = read_csv(StringIO(data), skiprows=skiprows,
+ delim_whitespace=True)
+
+ result = read_fwf(StringIO(data), skiprows=skiprows)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_skiprows_by_index_inference():
+ data = """
+To be skipped
+Not To Be Skipped
+Once more to be skipped
+123 34 8 123
+456 78 9 456
+""".strip()
+ skiprows = [0, 2]
+ expected = read_csv(StringIO(data), skiprows=skiprows,
+ delim_whitespace=True)
+
+ result = read_fwf(StringIO(data), skiprows=skiprows)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_skiprows_inference_empty():
+ data = """
+AA BBB C
+12 345 6
+78 901 2
+""".strip()
+
+ msg = "No rows from which to infer column width"
+ with pytest.raises(EmptyDataError, match=msg):
+ read_fwf(StringIO(data), skiprows=3)
+
+
+def test_whitespace_preservation():
+ # see gh-16772
+ header = None
+ csv_data = """
+ a ,bbb
+ cc,dd """
+
+ fwf_data = """
+ a bbb
+ ccdd """
+ result = read_fwf(StringIO(fwf_data), widths=[3, 3],
+ header=header, skiprows=[0], delimiter="\n\t")
+ expected = read_csv(StringIO(csv_data), header=header)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_default_delimiter():
+ header = None
+ csv_data = """
+a,bbb
+cc,dd"""
+
+ fwf_data = """
+a \tbbb
+cc\tdd """
+ result = read_fwf(StringIO(fwf_data), widths=[3, 3],
+ header=header, skiprows=[0])
+ expected = read_csv(StringIO(csv_data), header=header)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("infer", [True, False, None])
+def test_fwf_compression(compression_only, infer):
+ data = """1111111111
+ 2222222222
+ 3333333333""".strip()
+
+ compression = compression_only
+ extension = "gz" if compression == "gzip" else compression
+
+ kwargs = dict(widths=[5, 5], names=["one", "two"])
+ expected = read_fwf(StringIO(data), **kwargs)
+
+ if compat.PY3:
+ data = bytes(data, encoding="utf-8")
+
+ with tm.ensure_clean(filename="tmp." + extension) as path:
+ tm.write_to_compressed(compression, path, data)
+
+ if infer is not None:
+ kwargs["compression"] = "infer" if infer else compression
+
+ result = read_fwf(path, **kwargs)
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_skiprows.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_skiprows.py
new file mode 100644
index 00000000000..1df2ca4fad4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_skiprows.py
@@ -0,0 +1,222 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that skipped rows are properly handled during
+parsing for all of the parsers defined in parsers.py
+"""
+
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO, lrange, range
+from pandas.errors import EmptyDataError
+
+from pandas import DataFrame, Index
+import pandas.util.testing as tm
+
+
[email protected]("skiprows", [lrange(6), 6])
+def test_skip_rows_bug(all_parsers, skiprows):
+ # see gh-505
+ parser = all_parsers
+ text = """#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+1/1/2000,1.,2.,3.
+1/2/2000,4,5,6
+1/3/2000,7,8,9
+"""
+ result = parser.read_csv(StringIO(text), skiprows=skiprows, header=None,
+ index_col=0, parse_dates=True)
+ index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2),
+ datetime(2000, 1, 3)], name=0)
+
+ expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
+ columns=[1, 2, 3], index=index)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_deep_skip_rows(all_parsers):
+ # see gh-4382
+ parser = all_parsers
+ data = "a,b,c\n" + "\n".join([",".join([str(i), str(i + 1), str(i + 2)])
+ for i in range(10)])
+ condensed_data = "a,b,c\n" + "\n".join([
+ ",".join([str(i), str(i + 1), str(i + 2)])
+ for i in [0, 1, 2, 3, 4, 6, 8, 9]])
+
+ result = parser.read_csv(StringIO(data), skiprows=[6, 8])
+ condensed_result = parser.read_csv(StringIO(condensed_data))
+ tm.assert_frame_equal(result, condensed_result)
+
+
+def test_skip_rows_blank(all_parsers):
+ # see gh-9832
+ parser = all_parsers
+ text = """#foo,a,b,c
+#foo,a,b,c
+
+#foo,a,b,c
+#foo,a,b,c
+
+1/1/2000,1.,2.,3.
+1/2/2000,4,5,6
+1/3/2000,7,8,9
+"""
+ data = parser.read_csv(StringIO(text), skiprows=6, header=None,
+ index_col=0, parse_dates=True)
+ index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2),
+ datetime(2000, 1, 3)], name=0)
+
+ expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
+ columns=[1, 2, 3],
+ index=index)
+ tm.assert_frame_equal(data, expected)
+
+
[email protected]("data,kwargs,expected", [
+ ("""id,text,num_lines
+1,"line 11
+line 12",2
+2,"line 21
+line 22",2
+3,"line 31",1""",
+ dict(skiprows=[1]),
+ DataFrame([[2, "line 21\nline 22", 2],
+ [3, "line 31", 1]], columns=["id", "text", "num_lines"])),
+ ("a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
+ dict(quotechar="~", skiprows=[2]),
+ DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"])),
+ (("Text,url\n~example\n "
+ "sentence\n one~,url1\n~"
+ "example\n sentence\n two~,url2\n~"
+ "example\n sentence\n three~,url3"),
+ dict(quotechar="~", skiprows=[1, 3]),
+ DataFrame([['example\n sentence\n two', 'url2']],
+ columns=["Text", "url"]))
+])
+def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
+ # see gh-12775 and gh-10911
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_skip_row_with_quote(all_parsers):
+ # see gh-12775 and gh-10911
+ parser = all_parsers
+ data = """id,text,num_lines
+1,"line '11' line 12",2
+2,"line '21' line 22",2
+3,"line '31' line 32",1"""
+
+ exp_data = [[2, "line '21' line 22", 2],
+ [3, "line '31' line 32", 1]]
+ expected = DataFrame(exp_data, columns=[
+ "id", "text", "num_lines"])
+
+ result = parser.read_csv(StringIO(data), skiprows=[1])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,exp_data", [
+ ("""id,text,num_lines
+1,"line \n'11' line 12",2
+2,"line \n'21' line 22",2
+3,"line \n'31' line 32",1""",
+ [[2, "line \n'21' line 22", 2],
+ [3, "line \n'31' line 32", 1]]),
+ ("""id,text,num_lines
+1,"line '11\n' line 12",2
+2,"line '21\n' line 22",2
+3,"line '31\n' line 32",1""",
+ [[2, "line '21\n' line 22", 2],
+ [3, "line '31\n' line 32", 1]]),
+ ("""id,text,num_lines
+1,"line '11\n' \r\tline 12",2
+2,"line '21\n' \r\tline 22",2
+3,"line '31\n' \r\tline 32",1""",
+ [[2, "line '21\n' \r\tline 22", 2],
+ [3, "line '31\n' \r\tline 32", 1]]),
+])
+def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
+ # see gh-12775 and gh-10911
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), skiprows=[1])
+
+ expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("line_terminator", [
+ "\n", # "LF"
+ "\r\n", # "CRLF"
+ "\r" # "CR"
+])
+def test_skiprows_lineterminator(all_parsers, line_terminator):
+ # see gh-9079
+ parser = all_parsers
+ data = "\n".join(["SMOSMANIA ThetaProbe-ML2X ",
+ "2007/01/01 01:00 0.2140 U M ",
+ "2007/01/01 02:00 0.2141 M O ",
+ "2007/01/01 04:00 0.2142 D M "])
+ expected = DataFrame([["2007/01/01", "01:00", 0.2140, "U", "M"],
+ ["2007/01/01", "02:00", 0.2141, "M", "O"],
+ ["2007/01/01", "04:00", 0.2142, "D", "M"]],
+ columns=["date", "time", "var", "flag",
+ "oflag"])
+
+ if parser.engine == "python" and line_terminator == "\r":
+ pytest.skip("'CR' not respect with the Python parser yet")
+
+ data = data.replace("\n", line_terminator)
+ result = parser.read_csv(StringIO(data), skiprows=1, delim_whitespace=True,
+ names=["date", "time", "var", "flag", "oflag"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_skiprows_infield_quote(all_parsers):
+ # see gh-14459
+ parser = all_parsers
+ data = "a\"\nb\"\na\n1"
+ expected = DataFrame({"a": [1]})
+
+ result = parser.read_csv(StringIO(data), skiprows=2)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("kwargs,expected", [
+ (dict(), DataFrame({"1": [3, 5]})),
+ (dict(header=0, names=["foo"]), DataFrame({"foo": [3, 5]}))
+])
+def test_skip_rows_callable(all_parsers, kwargs, expected):
+ parser = all_parsers
+ data = "a\n1\n2\n3\n4\n5"
+
+ result = parser.read_csv(StringIO(data),
+ skiprows=lambda x: x % 2 == 0,
+ **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_skip_rows_skip_all(all_parsers):
+ parser = all_parsers
+ data = "a\n1\n2\n3\n4\n5"
+ msg = "No columns to parse from file"
+
+ with pytest.raises(EmptyDataError, match=msg):
+ parser.read_csv(StringIO(data), skiprows=lambda x: True)
+
+
+def test_skip_rows_bad_callable(all_parsers):
+ msg = "by zero"
+ parser = all_parsers
+ data = "a\n1\n2\n3\n4\n5"
+
+ with pytest.raises(ZeroDivisionError, match=msg):
+ parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_textreader.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_textreader.py
new file mode 100644
index 00000000000..8119de67890
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_textreader.py
@@ -0,0 +1,353 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests the TextReader class in parsers.pyx, which
+is integral to the C engine in parsers.py
+"""
+
+import os
+
+import numpy as np
+from numpy import nan
+import pytest
+
+import pandas._libs.parsers as parser
+from pandas._libs.parsers import TextReader
+import pandas.compat as compat
+from pandas.compat import BytesIO, StringIO, map
+
+from pandas import DataFrame
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal
+
+from pandas.io.parsers import TextFileReader, read_csv
+
+
+class TestTextReader(object):
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, datapath):
+ self.dirpath = datapath('io', 'parser', 'data')
+ self.csv1 = os.path.join(self.dirpath, 'test1.csv')
+ self.csv2 = os.path.join(self.dirpath, 'test2.csv')
+ self.xls1 = os.path.join(self.dirpath, 'test.xls')
+
+ def test_file_handle(self):
+ with open(self.csv1, 'rb') as f:
+ reader = TextReader(f)
+ reader.read()
+
+ def test_string_filename(self):
+ reader = TextReader(self.csv1, header=None)
+ reader.read()
+
+ def test_file_handle_mmap(self):
+ with open(self.csv1, 'rb') as f:
+ reader = TextReader(f, memory_map=True, header=None)
+ reader.read()
+
+ def test_StringIO(self):
+ with open(self.csv1, 'rb') as f:
+ text = f.read()
+ src = BytesIO(text)
+ reader = TextReader(src, header=None)
+ reader.read()
+
+ def test_string_factorize(self):
+ # should this be optional?
+ data = 'a\nb\na\nb\na'
+ reader = TextReader(StringIO(data), header=None)
+ result = reader.read()
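+ # Five rows but only two distinct strings: identical values should share one object.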
+ assert len(set(map(id, result[0]))) == 2
+
+ def test_skipinitialspace(self):
+ data = ('a, b\n'
+ 'a, b\n'
+ 'a, b\n'
+ 'a, b')
+
+ reader = TextReader(StringIO(data), skipinitialspace=True,
+ header=None)
+ result = reader.read()
+
+ tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a', 'a'],
+ dtype=np.object_))
+ tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b', 'b'],
+ dtype=np.object_))
+
+ def test_parse_booleans(self):
+ data = 'True\nFalse\nTrue\nTrue'
+
+ reader = TextReader(StringIO(data), header=None)
+ result = reader.read()
+
+ assert result[0].dtype == np.bool_
+
+ def test_delimit_whitespace(self):
+ data = 'a b\na\t\t "b"\n"a"\t \t b'
+
+ reader = TextReader(StringIO(data), delim_whitespace=True,
+ header=None)
+ result = reader.read()
+
+ tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a'],
+ dtype=np.object_))
+ tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b'],
+ dtype=np.object_))
+
+ def test_embedded_newline(self):
+ data = 'a\n"hello\nthere"\nthis'
+
+ reader = TextReader(StringIO(data), header=None)
+ result = reader.read()
+
+ expected = np.array(['a', 'hello\nthere', 'this'], dtype=np.object_)
+ tm.assert_numpy_array_equal(result[0], expected)
+
+ def test_euro_decimal(self):
+ data = '12345,67\n345,678'
+
+ reader = TextReader(StringIO(data), delimiter=':',
+ decimal=',', header=None)
+ result = reader.read()
+
+ expected = np.array([12345.67, 345.678])
+ tm.assert_almost_equal(result[0], expected)
+
+ def test_integer_thousands(self):
+ data = '123,456\n12,500'
+
+ reader = TextReader(StringIO(data), delimiter=':',
+ thousands=',', header=None)
+ result = reader.read()
+
+ expected = np.array([123456, 12500], dtype=np.int64)
+ tm.assert_almost_equal(result[0], expected)
+
+ def test_integer_thousands_alt(self):
+ data = '123.456\n12.500'
+
+ reader = TextFileReader(StringIO(data), delimiter=':',
+ thousands='.', header=None)
+ result = reader.read()
+
+ expected = DataFrame([123456, 12500])
+ tm.assert_frame_equal(result, expected)
+
+ def test_skip_bad_lines(self, capsys):
+ # too many lines, see #2430 for why
+ data = ('a:b:c\n'
+ 'd:e:f\n'
+ 'g:h:i\n'
+ 'j:k:l:m\n'
+ 'l:m:n\n'
+ 'o:p:q:r')
+
+ reader = TextReader(StringIO(data), delimiter=':',
+ header=None)
+ msg = (r"Error tokenizing data\. C error: Expected 3 fields in"
+ " line 4, saw 4")
+ with pytest.raises(parser.ParserError, match=msg):
+ reader.read()
+
+ reader = TextReader(StringIO(data), delimiter=':',
+ header=None,
+ error_bad_lines=False,
+ warn_bad_lines=False)
+ result = reader.read()
+ expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object),
+ 1: np.array(['b', 'e', 'h', 'm'], dtype=object),
+ 2: np.array(['c', 'f', 'i', 'n'], dtype=object)}
+ assert_array_dicts_equal(result, expected)
+
+ reader = TextReader(StringIO(data), delimiter=':',
+ header=None,
+ error_bad_lines=False,
+ warn_bad_lines=True)
+ reader.read()
+ captured = capsys.readouterr()
+
+ assert 'Skipping line 4' in captured.err
+ assert 'Skipping line 6' in captured.err
+
+ def test_header_not_enough_lines(self):
+ data = ('skip this\n'
+ 'skip this\n'
+ 'a,b,c\n'
+ '1,2,3\n'
+ '4,5,6')
+
+ reader = TextReader(StringIO(data), delimiter=',', header=2)
+ header = reader.header
+ expected = [['a', 'b', 'c']]
+ assert header == expected
+
+ recs = reader.read()
+ expected = {0: np.array([1, 4], dtype=np.int64),
+ 1: np.array([2, 5], dtype=np.int64),
+ 2: np.array([3, 6], dtype=np.int64)}
+ assert_array_dicts_equal(recs, expected)
+
+ def test_escapechar(self):
+ data = ('\\"hello world\"\n'
+ '\\"hello world\"\n'
+ '\\"hello world\"')
+
+ reader = TextReader(StringIO(data), delimiter=',', header=None,
+ escapechar='\\')
+ result = reader.read()
+ expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
+ assert_array_dicts_equal(result, expected)
+
+ def test_eof_has_eol(self):
+ # handling of new line at EOF
+ pass
+
+ def test_na_substitution(self):
+ pass
+
+ def test_numpy_string_dtype(self):
+ data = """\
+a,1
+aa,2
+aaa,3
+aaaa,4
+aaaaa,5"""
+
+ def _make_reader(**kwds):
+ return TextReader(StringIO(data), delimiter=',', header=None,
+ **kwds)
+
+ reader = _make_reader(dtype='S5,i4')
+ result = reader.read()
+
+ assert result[0].dtype == 'S5'
+
+ ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], dtype='S5')
+ assert (result[0] == ex_values).all()
+ assert result[1].dtype == 'i4'
+
+ reader = _make_reader(dtype='S4')
+ result = reader.read()
+ assert result[0].dtype == 'S4'
+ ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
+ assert (result[0] == ex_values).all()
+ assert result[1].dtype == 'S4'
+
+ def test_pass_dtype(self):
+ data = """\
+one,two
+1,a
+2,b
+3,c
+4,d"""
+
+ def _make_reader(**kwds):
+ return TextReader(StringIO(data), delimiter=',', **kwds)
+
+ reader = _make_reader(dtype={'one': 'u1', 1: 'S1'})
+ result = reader.read()
+ assert result[0].dtype == 'u1'
+ assert result[1].dtype == 'S1'
+
+ reader = _make_reader(dtype={'one': np.uint8, 1: object})
+ result = reader.read()
+ assert result[0].dtype == 'u1'
+ assert result[1].dtype == 'O'
+
+ reader = _make_reader(dtype={'one': np.dtype('u1'),
+ 1: np.dtype('O')})
+ result = reader.read()
+ assert result[0].dtype == 'u1'
+ assert result[1].dtype == 'O'
+
+ def test_usecols(self):
+ data = """\
+a,b,c
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+
+ def _make_reader(**kwds):
+ return TextReader(StringIO(data), delimiter=',', **kwds)
+
+ reader = _make_reader(usecols=(1, 2))
+ result = reader.read()
+
+ exp = _make_reader().read()
+ assert len(result) == 2
+ assert (result[1] == exp[1]).all()
+ assert (result[2] == exp[2]).all()
+
+ def test_cr_delimited(self):
+ def _test(text, **kwargs):
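+ # Parsing bare-CR input must match parsing the same data with CRLF line endings.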
+ nice_text = text.replace('\r', '\r\n')
+ result = TextReader(StringIO(text), **kwargs).read()
+ expected = TextReader(StringIO(nice_text), **kwargs).read()
+ assert_array_dicts_equal(result, expected)
+
+ data = 'a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12'
+ _test(data, delimiter=',')
+
+ data = 'a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12'
+ _test(data, delim_whitespace=True)
+
+ data = 'a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12'
+ _test(data, delimiter=',')
+
+ sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r'
+ 'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r'
+ ',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0')
+ _test(sample, delimiter=',')
+
+ data = 'A B C\r 2 3\r4 5 6'
+ _test(data, delim_whitespace=True)
+
+ data = 'A B C\r2 3\r4 5 6'
+ _test(data, delim_whitespace=True)
+
+ def test_empty_field_eof(self):
+ data = 'a,b,c\n1,2,3\n4,,'
+
+ result = TextReader(StringIO(data), delimiter=',').read()
+
+ expected = {0: np.array([1, 4], dtype=np.int64),
+ 1: np.array(['2', ''], dtype=object),
+ 2: np.array(['3', ''], dtype=object)}
+ assert_array_dicts_equal(result, expected)
+
+ # GH5664
+ a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
+ b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
+ columns=list('abcd'),
+ index=[1, 1])
+ c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
+ [8, 9, 10, 11], [13, 14, nan, nan]],
+ columns=list('abcd'),
+ index=[0, 5, 7, 12])
+
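+ # Repeated many times to catch intermittent failures in EOF handling (see GH5664).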
+ for _ in range(100):
+ df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
+ names=['a'], engine='c')
+ assert_frame_equal(df, a)
+
+ df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2),
+ names=list("abcd"), engine='c')
+ assert_frame_equal(df, b)
+
+ df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
+ names=list('abcd'), engine='c')
+ assert_frame_equal(df, c)
+
+ def test_empty_csv_input(self):
+ # GH14867
+ df = read_csv(StringIO(), chunksize=20, header=None,
+ names=['a', 'b', 'c'])
+ assert isinstance(df, TextFileReader)
+
+
+def assert_array_dicts_equal(left, right):
+ for k, v in compat.iteritems(left):
+ tm.assert_numpy_array_equal(np.asarray(v),
+ np.asarray(right[k]))
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_unsupported.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_unsupported.py
new file mode 100644
index 00000000000..8c6dbd64c78
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_unsupported.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that features that are currently unsupported in
+either the Python or C parser are actually enforced
+and are clearly communicated to the user.
+
+Ultimately, the goal is to remove test cases from this
+test suite as new feature support is added to the parsers.
+"""
+
+import pytest
+
+from pandas.compat import StringIO
+from pandas.errors import ParserError
+
+import pandas.util.testing as tm
+
+import pandas.io.parsers as parsers
+from pandas.io.parsers import read_csv
+
+
[email protected](params=["python", "python-fwf"], ids=lambda val: val)
+def python_engine(request):
+ return request.param
+
+
+class TestUnsupportedFeatures(object):
+
+ def test_mangle_dupe_cols_false(self):
+ # see gh-12935
+ data = 'a b c\n1 2 3'
+ msg = 'is not supported'
+
+ for engine in ('c', 'python'):
+ with pytest.raises(ValueError, match=msg):
+ read_csv(StringIO(data), engine=engine,
+ mangle_dupe_cols=False)
+
+ def test_c_engine(self):
+ # see gh-6607
+ data = 'a b c\n1 2 3'
+ msg = 'does not support'
+
+ # specify C engine with unsupported options (raise)
+ with pytest.raises(ValueError, match=msg):
+ read_csv(StringIO(data), engine='c',
+ sep=None, delim_whitespace=False)
+ with pytest.raises(ValueError, match=msg):
+ read_csv(StringIO(data), engine='c', sep=r'\s')
+ with pytest.raises(ValueError, match=msg):
+ read_csv(StringIO(data), engine='c', sep='\t', quotechar=chr(128))
+ with pytest.raises(ValueError, match=msg):
+ read_csv(StringIO(data), engine='c', skipfooter=1)
+
+ # specify C-unsupported options without python-unsupported options
+ with tm.assert_produces_warning(parsers.ParserWarning):
+ read_csv(StringIO(data), sep=None, delim_whitespace=False)
+ with tm.assert_produces_warning(parsers.ParserWarning):
+ read_csv(StringIO(data), sep=r'\s')
+ with tm.assert_produces_warning(parsers.ParserWarning):
+ read_csv(StringIO(data), sep='\t', quotechar=chr(128))
+ with tm.assert_produces_warning(parsers.ParserWarning):
+ read_csv(StringIO(data), skipfooter=1)
+
+ text = """ A B C D E
+one two three four
+a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
+a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
+x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
+ msg = 'Error tokenizing data'
+
+ with pytest.raises(ParserError, match=msg):
+ read_csv(StringIO(text), sep='\\s+')
+ with pytest.raises(ParserError, match=msg):
+ read_csv(StringIO(text), engine='c', sep='\\s+')
+
+ msg = "Only length-1 thousands markers supported"
+ data = """A|B|C
+1|2,334|5
+10|13|10.
+"""
+ with pytest.raises(ValueError, match=msg):
+ read_csv(StringIO(data), thousands=',,')
+ with pytest.raises(ValueError, match=msg):
+ read_csv(StringIO(data), thousands='')
+
+ msg = "Only length-1 line terminators supported"
+ data = 'a,b,c~~1,2,3~~4,5,6'
+ with pytest.raises(ValueError, match=msg):
+ read_csv(StringIO(data), lineterminator='~~')
+
+ def test_python_engine(self, python_engine):
+ from pandas.io.parsers import _python_unsupported as py_unsupported
+
+ data = """1,2,3,,
+1,2,3,4,
+1,2,3,4,5
+1,2,,,
+1,2,3,4,"""
+
+ for default in py_unsupported:
+ msg = ('The %r option is not supported '
+ 'with the %r engine' % (default, python_engine))
+
+ kwargs = {default: object()}
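+ # object() is guaranteed to differ from the option's default, so the engine must reject it.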
+ with pytest.raises(ValueError, match=msg):
+ read_csv(StringIO(data), engine=python_engine, **kwargs)
+
+ def test_python_engine_file_no_next(self, python_engine):
+ # see gh-16530
+ class NoNextBuffer(object):
+ def __init__(self, csv_data):
+ self.data = csv_data
+
+ def __iter__(self):
+ return self
+
+ def read(self):
+ return self.data
+
+ data = "a\n1"
+ msg = "The 'python' engine cannot iterate"
+
+ with pytest.raises(ValueError, match=msg):
+ read_csv(NoNextBuffer(data), engine=python_engine)
+
+
+class TestDeprecatedFeatures(object):
+
+ @pytest.mark.parametrize("engine", ["c", "python"])
+ @pytest.mark.parametrize("kwargs", [{"tupleize_cols": True},
+ {"tupleize_cols": False}])
+ def test_deprecated_args(self, engine, kwargs):
+ data = "1,2,3"
+ arg, _ = list(kwargs.items())[0]
+
+ with tm.assert_produces_warning(
+ FutureWarning, check_stacklevel=False):
+ read_csv(StringIO(data), engine=engine, **kwargs)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/test_usecols.py b/contrib/python/pandas/py2/pandas/tests/io/parser/test_usecols.py
new file mode 100644
index 00000000000..652f78d198e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/test_usecols.py
@@ -0,0 +1,534 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests the usecols functionality during parsing
+for all of the parsers defined in parsers.py
+"""
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import Timestamp
+from pandas.compat import StringIO
+
+from pandas import DataFrame, Index
+import pandas.util.testing as tm
+
+_msg_validate_usecols_arg = ("'usecols' must either be list-like "
+ "of all strings, all unicode, all "
+ "integers or a callable.")
+_msg_validate_usecols_names = ("Usecols do not match columns, columns "
+ "expected but not found: {0}")
+
+
+def test_raise_on_mixed_dtype_usecols(all_parsers):
+ # See gh-12678
+ data = """a,b,c
+ 1000,2000,3000
+ 4000,5000,6000
+ """
+ usecols = [0, "b", 2]
+ parser = all_parsers
+
+ with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
+ parser.read_csv(StringIO(data), usecols=usecols)
+
+
[email protected]("usecols", [(1, 2), ("b", "c")])
+def test_usecols(all_parsers, usecols):
+ data = """\
+a,b,c
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), usecols=usecols)
+
+ expected = DataFrame([[2, 3], [5, 6], [8, 9],
+ [11, 12]], columns=["b", "c"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_with_names(all_parsers):
+ data = """\
+a,b,c
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+ parser = all_parsers
+ names = ["foo", "bar"]
+ result = parser.read_csv(StringIO(data), names=names,
+ usecols=[1, 2], header=0)
+
+ expected = DataFrame([[2, 3], [5, 6], [8, 9],
+ [11, 12]], columns=names)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("names,usecols", [
+ (["b", "c"], [1, 2]),
+ (["a", "b", "c"], ["b", "c"])
+])
+def test_usecols_relative_to_names(all_parsers, names, usecols):
+ data = """\
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), names=names,
+ header=None, usecols=usecols)
+
+ expected = DataFrame([[2, 3], [5, 6], [8, 9],
+ [11, 12]], columns=["b", "c"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_relative_to_names2(all_parsers):
+ # see gh-5766
+ data = """\
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), names=["a", "b"],
+ header=None, usecols=[0, 1])
+
+ expected = DataFrame([[1, 2], [4, 5], [7, 8],
+ [10, 11]], columns=["a", "b"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_name_length_conflict(all_parsers):
+ data = """\
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+ parser = all_parsers
+ msg = ("Number of passed names did not "
+ "match number of header fields in the file"
+ if parser.engine == "python" else
+ "Passed header names mismatches usecols")
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), names=["a", "b"],
+ header=None, usecols=[1])
+
+
+def test_usecols_single_string(all_parsers):
+ # see gh-20558
+ parser = all_parsers
+ data = """foo, bar, baz
+1000, 2000, 3000
+4000, 5000, 6000"""
+
+ with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
+ parser.read_csv(StringIO(data), usecols="foo")
+
+
[email protected]("data", ["a,b,c,d\n1,2,3,4\n5,6,7,8",
+ "a,b,c,d\n1,2,3,4,\n5,6,7,8,"])
+def test_usecols_index_col_false(all_parsers, data):
+ # see gh-9082
+ parser = all_parsers
+ usecols = ["a", "c", "d"]
+ expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})
+
+ result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("index_col", ["b", 0])
[email protected]("usecols", [["b", "c"], [1, 2]])
+def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
+ # see gh-4201: test that index_col as integer reflects usecols
+ parser = all_parsers
+ data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
+ expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))
+
+ result = parser.read_csv(StringIO(data), usecols=usecols,
+ index_col=index_col)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_index_col_conflict2(all_parsers):
+ # see gh-4201: test that index_col as integer reflects usecols
+ parser = all_parsers
+ data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
+
+ expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
+ expected = expected.set_index(["b", "c"])
+
+ result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"],
+ index_col=["b", "c"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_implicit_index_col(all_parsers):
+ # see gh-2654
+ parser = all_parsers
+ data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"
+
+ result = parser.read_csv(StringIO(data), usecols=["a", "b"])
+ expected = DataFrame({"a": ["apple", "orange"],
+ "b": ["bat", "cow"]}, index=[4, 8])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_regex_sep(all_parsers):
+ # see gh-2733
+ parser = all_parsers
+ data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
+ result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
+
+ expected = DataFrame({"a": ["apple", "orange"],
+ "b": ["bat", "cow"]}, index=[4, 8])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_with_whitespace(all_parsers):
+ parser = all_parsers
+ data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
+
+ result = parser.read_csv(StringIO(data), delim_whitespace=True,
+ usecols=("a", "b"))
+ expected = DataFrame({"a": ["apple", "orange"],
+ "b": ["bat", "cow"]}, index=[4, 8])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("usecols,expected", [
+ # Column selection by index.
+ ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]],
+ columns=["2", "0"])),
+
+ # Column selection by name.
+ (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]],
+ columns=["0", "1"])),
+])
+def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
+ parser = all_parsers
+ data = """2,0,1
+1000,2000,3000
+4000,5000,6000"""
+
+ result = parser.read_csv(StringIO(data), usecols=usecols)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("usecols", [[0, 2, 3], [3, 0, 2]])
+def test_usecols_with_parse_dates(all_parsers, usecols):
+ # see gh-9755
+ data = """a,b,c,d,e
+0,1,20140101,0900,4
+0,1,20140102,1000,4"""
+ parser = all_parsers
+ parse_dates = [[1, 2]]
+
+ cols = {
+ "a": [0, 0],
+ "c_d": [
+ Timestamp("2014-01-01 09:00:00"),
+ Timestamp("2014-01-02 10:00:00")
+ ]
+ }
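+ # The merged date column "c_d" is placed ahead of the remaining selected columns.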
+ expected = DataFrame(cols, columns=["c_d", "a"])
+ result = parser.read_csv(StringIO(data), usecols=usecols,
+ parse_dates=parse_dates)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_with_parse_dates2(all_parsers):
+ # see gh-13604
+ parser = all_parsers
+ data = """2008-02-07 09:40,1032.43
+2008-02-07 09:50,1042.54
+2008-02-07 10:00,1051.65"""
+
+ names = ["date", "values"]
+ usecols = names[:]
+ parse_dates = [0]
+
+ index = Index([Timestamp("2008-02-07 09:40"),
+ Timestamp("2008-02-07 09:50"),
+ Timestamp("2008-02-07 10:00")],
+ name="date")
+ cols = {"values": [1032.43, 1042.54, 1051.65]}
+ expected = DataFrame(cols, index=index)
+
+ result = parser.read_csv(StringIO(data), parse_dates=parse_dates,
+ index_col=0, usecols=usecols,
+ header=None, names=names)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_with_parse_dates3(all_parsers):
+ # see gh-14792
+ parser = all_parsers
+ data = """a,b,c,d,e,f,g,h,i,j
+2016/09/21,1,1,2,3,4,5,6,7,8"""
+
+ usecols = list("abcdefghij")
+ parse_dates = [0]
+
+ cols = {"a": Timestamp("2016-09-21"),
+ "b": [1], "c": [1], "d": [2],
+ "e": [3], "f": [4], "g": [5],
+ "h": [6], "i": [7], "j": [8]}
+ expected = DataFrame(cols, columns=usecols)
+
+ result = parser.read_csv(StringIO(data), usecols=usecols,
+ parse_dates=parse_dates)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_with_parse_dates4(all_parsers):
+ data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
+ usecols = list("abcdefghij")
+ parse_dates = [[0, 1]]
+ parser = all_parsers
+
+ cols = {"a_b": "2016/09/21 1",
+ "c": [1], "d": [2], "e": [3], "f": [4],
+ "g": [5], "h": [6], "i": [7], "j": [8]}
+ expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))
+
+ result = parser.read_csv(StringIO(data), usecols=usecols,
+ parse_dates=parse_dates)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("usecols", [[0, 2, 3], [3, 0, 2]])
+ list("abcde"), # Names span all columns in original data.
+ list("acd"), # Names span only the selected columns.
+])
+def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
+ # see gh-9755
+ s = """0,1,20140101,0900,4
+0,1,20140102,1000,4"""
+ parse_dates = [[1, 2]]
+ parser = all_parsers
+
+ cols = {
+ "a": [0, 0],
+ "c_d": [
+ Timestamp("2014-01-01 09:00:00"),
+ Timestamp("2014-01-02 10:00:00")
+ ]
+ }
+ expected = DataFrame(cols, columns=["c_d", "a"])
+
+ result = parser.read_csv(StringIO(s), names=names,
+ parse_dates=parse_dates,
+ usecols=usecols)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_with_unicode_strings(all_parsers):
+ # see gh-13219
+ data = """AAA,BBB,CCC,DDD
+0.056674973,8,True,a
+2.613230982,2,False,b
+3.568935038,7,False,a"""
+ parser = all_parsers
+
+ exp_data = {
+ "AAA": {
+ 0: 0.056674972999999997,
+ 1: 2.6132309819999997,
+ 2: 3.5689350380000002
+ },
+ "BBB": {0: 8, 1: 2, 2: 7}
+ }
+ expected = DataFrame(exp_data)
+
+ result = parser.read_csv(StringIO(data), usecols=[u"AAA", u"BBB"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_with_single_byte_unicode_strings(all_parsers):
+ # see gh-13219
+ data = """A,B,C,D
+0.056674973,8,True,a
+2.613230982,2,False,b
+3.568935038,7,False,a"""
+ parser = all_parsers
+
+ exp_data = {
+ "A": {
+ 0: 0.056674972999999997,
+ 1: 2.6132309819999997,
+ 2: 3.5689350380000002
+ },
+ "B": {0: 8, 1: 2, 2: 7}
+ }
+ expected = DataFrame(exp_data)
+
+ result = parser.read_csv(StringIO(data), usecols=[u"A", u"B"])
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("usecols", [[u"AAA", b"BBB"], [b"AAA", u"BBB"]])
+def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
+ data = """AAA,BBB,CCC,DDD
+0.056674973,8,True,a
+2.613230982,2,False,b
+3.568935038,7,False,a"""
+ parser = all_parsers
+
+ with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
+ parser.read_csv(StringIO(data), usecols=usecols)
+
+
[email protected]("usecols", [
+ ["あああ", "いい"],
+ [u"あああ", u"いい"]
+])
+def test_usecols_with_multi_byte_characters(all_parsers, usecols):
+ data = """あああ,いい,ううう,ええええ
+0.056674973,8,True,a
+2.613230982,2,False,b
+3.568935038,7,False,a"""
+ parser = all_parsers
+
+ exp_data = {
+ "あああ": {
+ 0: 0.056674972999999997,
+ 1: 2.6132309819999997,
+ 2: 3.5689350380000002
+ },
+ "いい": {0: 8, 1: 2, 2: 7}
+ }
+ expected = DataFrame(exp_data)
+
+ result = parser.read_csv(StringIO(data), usecols=usecols)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_empty_usecols(all_parsers):
+ data = "a,b,c\n1,2,3\n4,5,6"
+ expected = DataFrame()
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data), usecols=set())
+ tm.assert_frame_equal(result, expected)
+
+
+def test_np_array_usecols(all_parsers):
+ # see gh-12546
+ parser = all_parsers
+ data = "a,b,c\n1,2,3"
+ usecols = np.array(["a", "b"])
+
+ expected = DataFrame([[1, 2]], columns=usecols)
+ result = parser.read_csv(StringIO(data), usecols=usecols)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("usecols,expected", [
+ (lambda x: x.upper() in ["AAA", "BBB", "DDD"],
+ DataFrame({
+ "AaA": {
+ 0: 0.056674972999999997,
+ 1: 2.6132309819999997,
+ 2: 3.5689350380000002
+ },
+ "bBb": {0: 8, 1: 2, 2: 7},
+ "ddd": {0: "a", 1: "b", 2: "a"}
+ })),
+ (lambda x: False, DataFrame()),
+])
+def test_callable_usecols(all_parsers, usecols, expected):
+ # see gh-14154
+ data = """AaA,bBb,CCC,ddd
+0.056674973,8,True,a
+2.613230982,2,False,b
+3.568935038,7,False,a"""
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data), usecols=usecols)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
+def test_incomplete_first_row(all_parsers, usecols):
+ # see gh-6710
+ data = "1,2\n1,2,3"
+ parser = all_parsers
+ names = ["a", "b", "c"]
+ expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})
+
+ result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("data,usecols,kwargs,expected", [
+ # see gh-8985
+ ("19,29,39\n" * 2 + "10,20,30,40", [0, 1, 2],
+ dict(header=None), DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]])),
+
+ # see gh-9549
+ (("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n"
+ "1,2,3,,,1,\n1,2,3\n5,6,7"), ["A", "B", "C"],
+ dict(), DataFrame({"A": [1, 3, 1, 1, 1, 5],
+ "B": [2, 4, 2, 2, 2, 6],
+ "C": [3, 5, 4, 3, 3, 7]})),
+])
+def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
+ # see gh-8985
+ parser = all_parsers
+ result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("usecols,kwargs,expected,msg", [
+ (["a", "b", "c", "d"], dict(),
+ DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), None),
+ (["a", "b", "c", "f"], dict(), None,
+ _msg_validate_usecols_names.format(r"\['f'\]")),
+ (["a", "b", "f"], dict(), None,
+ _msg_validate_usecols_names.format(r"\['f'\]")),
+ (["a", "b", "f", "g"], dict(), None,
+ _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]")),
+
+ # see gh-14671
+ (None, dict(header=0, names=["A", "B", "C", "D"]),
+ DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7],
+ "D": [4, 8]}), None),
+ (["A", "B", "C", "f"], dict(header=0, names=["A", "B", "C", "D"]),
+ None, _msg_validate_usecols_names.format(r"\['f'\]")),
+ (["A", "B", "f"], dict(names=["A", "B", "C", "D"]),
+ None, _msg_validate_usecols_names.format(r"\['f'\]")),
+])
+def test_raises_on_usecols_names_mismatch(all_parsers, usecols,
+ kwargs, expected, msg):
+ data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
+ kwargs.update(usecols=usecols)
+ parser = all_parsers
+
+ if expected is None:
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(data), **kwargs)
+ else:
+ result = parser.read_csv(StringIO(data), **kwargs)
+ tm.assert_frame_equal(result, expected)
+
+
+ reason="see gh-16469: works on the C engine but not the Python engine",
+ strict=False)
[email protected]("usecols", [["A", "C"], [0, 2]])
+def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
+ data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
+ names = ["A", "B", "C", "D"]
+ parser = all_parsers
+
+ result = parser.read_csv(StringIO(data), header=0,
+ names=names, usecols=usecols)
+ expected = DataFrame({"A": [1, 5], "C": [3, 7]})
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/sas/__init__.py b/contrib/python/pandas/py2/pandas/tests/io/sas/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/sas/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/io/sas/test_sas.py b/contrib/python/pandas/py2/pandas/tests/io/sas/test_sas.py
new file mode 100644
index 00000000000..34bca1e5b74
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/sas/test_sas.py
@@ -0,0 +1,25 @@
+import pytest
+
+from pandas.compat import StringIO
+
+from pandas import read_sas
+import pandas.util.testing as tm
+
+
+class TestSas(object):
+
+ def test_sas_buffer_format(self):
+ # see gh-14947
+ b = StringIO("")
+
+ msg = ("If this is a buffer object rather than a string "
+ "name, you must specify a format string")
+ with pytest.raises(ValueError, match=msg):
+ read_sas(b)
+
+ def test_sas_read_no_format_or_extension(self):
+ # see gh-24548
+ msg = ("unable to infer format of SAS file")
+ with tm.ensure_clean('test_file_no_extension') as path:
+ with pytest.raises(ValueError, match=msg):
+ read_sas(path)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/sas/test_sas7bdat.py b/contrib/python/pandas/py2/pandas/tests/io/sas/test_sas7bdat.py
new file mode 100644
index 00000000000..3dd8d0449ef
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/sas/test_sas7bdat.py
@@ -0,0 +1,227 @@
+import io
+import os
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY2
+from pandas.errors import EmptyDataError
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+# https://github.com/cython/cython/issues/1720
[email protected]("ignore:can't resolve package:ImportWarning")
+class TestSAS7BDAT(object):
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, datapath):
+ self.dirpath = datapath("io", "sas", "data")
+ self.data = []
+ self.test_ix = [list(range(1, 16)), [16]]
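+ # The two reference CSVs correspond to test<k>.sas7bdat files: k in 1..15 for the first, k == 16 for the second.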
+ for j in 1, 2:
+ fname = os.path.join(
+ self.dirpath, "test_sas7bdat_{j}.csv".format(j=j))
+ df = pd.read_csv(fname)
+ epoch = pd.datetime(1960, 1, 1)
+ t1 = pd.to_timedelta(df["Column4"], unit='d')
+ df["Column4"] = epoch + t1
+ t2 = pd.to_timedelta(df["Column12"], unit='d')
+ df["Column12"] = epoch + t2
+ for k in range(df.shape[1]):
+ col = df.iloc[:, k]
+ if col.dtype == np.int64:
+ df.iloc[:, k] = df.iloc[:, k].astype(np.float64)
+ elif col.dtype == np.dtype('O'):
+ if PY2:
+ f = lambda x: (x.decode('utf-8') if
+ isinstance(x, str) else x)
+ df.iloc[:, k] = df.iloc[:, k].apply(f)
+ self.data.append(df)
+
+ def test_from_file(self):
+ for j in 0, 1:
+ df0 = self.data[j]
+ for k in self.test_ix[j]:
+ fname = os.path.join(
+ self.dirpath, "test{k}.sas7bdat".format(k=k))
+ df = pd.read_sas(fname, encoding='utf-8')
+ tm.assert_frame_equal(df, df0)
+
+ def test_from_buffer(self):
+ for j in 0, 1:
+ df0 = self.data[j]
+ for k in self.test_ix[j]:
+ fname = os.path.join(
+ self.dirpath, "test{k}.sas7bdat".format(k=k))
+ with open(fname, 'rb') as f:
+ byts = f.read()
+ buf = io.BytesIO(byts)
+ rdr = pd.read_sas(buf, format="sas7bdat",
+ iterator=True, encoding='utf-8')
+ df = rdr.read()
+ tm.assert_frame_equal(df, df0, check_exact=False)
+ rdr.close()
+
+ def test_from_iterator(self):
+ for j in 0, 1:
+ df0 = self.data[j]
+ for k in self.test_ix[j]:
+ fname = os.path.join(
+ self.dirpath, "test{k}.sas7bdat".format(k=k))
+ rdr = pd.read_sas(fname, iterator=True, encoding='utf-8')
+ df = rdr.read(2)
+ tm.assert_frame_equal(df, df0.iloc[0:2, :])
+ df = rdr.read(3)
+ tm.assert_frame_equal(df, df0.iloc[2:5, :])
+ rdr.close()
+
+ @td.skip_if_no('pathlib')
+ def test_path_pathlib(self):
+ from pathlib import Path
+ for j in 0, 1:
+ df0 = self.data[j]
+ for k in self.test_ix[j]:
+ fname = Path(os.path.join(
+ self.dirpath, "test{k}.sas7bdat".format(k=k)))
+ df = pd.read_sas(fname, encoding='utf-8')
+ tm.assert_frame_equal(df, df0)
+
+ @td.skip_if_no('py.path')
+ def test_path_localpath(self):
+ from py.path import local as LocalPath
+ for j in 0, 1:
+ df0 = self.data[j]
+ for k in self.test_ix[j]:
+ fname = LocalPath(os.path.join(
+ self.dirpath, "test{k}.sas7bdat".format(k=k)))
+ df = pd.read_sas(fname, encoding='utf-8')
+ tm.assert_frame_equal(df, df0)
+
+ def test_iterator_loop(self):
+ # github #13654
+ for j in 0, 1:
+ for k in self.test_ix[j]:
+ for chunksize in 3, 5, 10, 11:
+ fname = os.path.join(
+ self.dirpath, "test{k}.sas7bdat".format(k=k))
+ rdr = pd.read_sas(fname, chunksize=chunksize,
+ encoding='utf-8')
+ y = 0
+ for x in rdr:
+ y += x.shape[0]
+ assert y == rdr.row_count
+ rdr.close()
+
+ def test_iterator_read_too_much(self):
+ # github #14734
+ k = self.test_ix[0][0]
+ fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))
+ rdr = pd.read_sas(fname, format="sas7bdat",
+ iterator=True, encoding='utf-8')
+ d1 = rdr.read(rdr.row_count + 20)
+ rdr.close()
+
+ rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
+ d2 = rdr.read(rdr.row_count + 20)
+ tm.assert_frame_equal(d1, d2)
+ rdr.close()
+
+
+def test_encoding_options(datapath):
+ fname = datapath("io", "sas", "data", "test1.sas7bdat")
+ df1 = pd.read_sas(fname)
+ df2 = pd.read_sas(fname, encoding='utf-8')
+ for col in df1.columns:
+ try:
+ df1[col] = df1[col].str.decode('utf-8')
+ except AttributeError:
+ pass
+ tm.assert_frame_equal(df1, df2)
+
+ from pandas.io.sas.sas7bdat import SAS7BDATReader
+ rdr = SAS7BDATReader(fname, convert_header_text=False)
+ df3 = rdr.read()
+ rdr.close()
+ for x, y in zip(df1.columns, df3.columns):
+ assert x == y.decode()
+
+
+def test_productsales(datapath):
+ fname = datapath("io", "sas", "data", "productsales.sas7bdat")
+ df = pd.read_sas(fname, encoding='utf-8')
+ fname = datapath("io", "sas", "data", "productsales.csv")
+ df0 = pd.read_csv(fname, parse_dates=['MONTH'])
+ vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
+ df0[vn] = df0[vn].astype(np.float64)
+ tm.assert_frame_equal(df, df0)
+
+
+def test_12659(datapath):
+ fname = datapath("io", "sas", "data", "test_12659.sas7bdat")
+ df = pd.read_sas(fname)
+ fname = datapath("io", "sas", "data", "test_12659.csv")
+ df0 = pd.read_csv(fname)
+ df0 = df0.astype(np.float64)
+ tm.assert_frame_equal(df, df0)
+
+
+def test_airline(datapath):
+ fname = datapath("io", "sas", "data", "airline.sas7bdat")
+ df = pd.read_sas(fname)
+ fname = datapath("io", "sas", "data", "airline.csv")
+ df0 = pd.read_csv(fname)
+ df0 = df0.astype(np.float64)
+ tm.assert_frame_equal(df, df0, check_exact=False)
+
+
+def test_date_time(datapath):
+ # Support of different SAS date/datetime formats (PR #15871)
+ fname = datapath("io", "sas", "data", "datetime.sas7bdat")
+ df = pd.read_sas(fname)
+ fname = datapath("io", "sas", "data", "datetime.csv")
+ df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime',
+ 'DateTimeHi', 'Taiw'])
+ # GH 19732: Timestamps imported from sas will incur floating point errors
+ df.iloc[:, 3] = df.iloc[:, 3].dt.round('us')
+ tm.assert_frame_equal(df, df0)
+
+
+def test_compact_numerical_values(datapath):
+ # Regression test for #21616
+ fname = datapath("io", "sas", "data", "cars.sas7bdat")
+ df = pd.read_sas(fname, encoding='latin-1')
+ # The two columns CYL and WGT in cars.sas7bdat have column
+ # width < 8 and only contain integral values.
+ # Test that pandas doesn't corrupt the numbers by adding
+ # decimals.
+ result = df['WGT']
+ expected = df['WGT'].round()
+ tm.assert_series_equal(result, expected, check_exact=True)
+ result = df['CYL']
+ expected = df['CYL'].round()
+ tm.assert_series_equal(result, expected, check_exact=True)
+
+
+def test_many_columns(datapath):
+ # Test for looking for column information in more places (PR #22628)
+ fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
+ df = pd.read_sas(fname, encoding='latin-1')
+ fname = datapath("io", "sas", "data", "many_columns.csv")
+ df0 = pd.read_csv(fname, encoding='latin-1')
+ tm.assert_frame_equal(df, df0)
+
+
+def test_inconsistent_number_of_rows(datapath):
+ # Regression test for issue #16615. (PR #22628)
+ fname = datapath("io", "sas", "data", "load_log.sas7bdat")
+ df = pd.read_sas(fname, encoding='latin-1')
+ assert len(df) == 2097
+
+
+def test_zero_variables(datapath):
+ # Check if the SAS file has zero variables (PR #18184)
+ fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
+ with pytest.raises(EmptyDataError):
+ pd.read_sas(fname)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/sas/test_xport.py b/contrib/python/pandas/py2/pandas/tests/io/sas/test_xport.py
new file mode 100644
index 00000000000..1b086daf51c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/sas/test_xport.py
@@ -0,0 +1,146 @@
+import os
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+from pandas.io.sas.sasreader import read_sas
+
+# CSV versions of test xpt files were obtained using the R foreign library
+
+# Numbers in a SAS xport file are always float64, so need to convert
+# before making comparisons.
+
+
+def numeric_as_float(data):
+ for v in data.columns:
+ if data[v].dtype is np.dtype('int64'):
+ data[v] = data[v].astype(np.float64)
+
+
+class TestXport(object):
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, datapath):
+ self.dirpath = datapath("io", "sas", "data")
+ self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt")
+ self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt")
+ self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt")
+ self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt")
+
+ def test1_basic(self):
+ # Tests with DEMO_G.xpt (all numeric file)
+
+ # Compare to this
+ data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
+ numeric_as_float(data_csv)
+
+ # Read full file
+ data = read_sas(self.file01, format="xport")
+ tm.assert_frame_equal(data, data_csv)
+ num_rows = data.shape[0]
+
+ # Test reading beyond end of file
+ reader = read_sas(self.file01, format="xport", iterator=True)
+ data = reader.read(num_rows + 100)
+ assert data.shape[0] == num_rows
+ reader.close()
+
+ # Test incremental read with `read` method.
+ reader = read_sas(self.file01, format="xport", iterator=True)
+ data = reader.read(10)
+ reader.close()
+ tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
+
+ # Test incremental read with `get_chunk` method.
+ reader = read_sas(self.file01, format="xport", chunksize=10)
+ data = reader.get_chunk()
+ reader.close()
+ tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
+
+ # Test read in loop
+ m = 0
+ reader = read_sas(self.file01, format="xport", chunksize=100)
+ for x in reader:
+ m += x.shape[0]
+ reader.close()
+ assert m == num_rows
+
+ # Read full file with `read_sas` method
+ data = read_sas(self.file01)
+ tm.assert_frame_equal(data, data_csv)
+
+ def test1_index(self):
+ # Tests with DEMO_G.xpt using index (all numeric file)
+
+ # Compare to this
+ data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
+ data_csv = data_csv.set_index("SEQN")
+ numeric_as_float(data_csv)
+
+ # Read full file
+ data = read_sas(self.file01, index="SEQN", format="xport")
+ tm.assert_frame_equal(data, data_csv, check_index_type=False)
+
+ # Test incremental read with `read` method.
+ reader = read_sas(self.file01, index="SEQN", format="xport",
+ iterator=True)
+ data = reader.read(10)
+ reader.close()
+ tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
+ check_index_type=False)
+
+ # Test incremental read with `get_chunk` method.
+ reader = read_sas(self.file01, index="SEQN", format="xport",
+ chunksize=10)
+ data = reader.get_chunk()
+ reader.close()
+ tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
+ check_index_type=False)
+
+ def test1_incremental(self):
+ # Test with DEMO_G.xpt, reading full file incrementally
+
+ data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
+ data_csv = data_csv.set_index("SEQN")
+ numeric_as_float(data_csv)
+
+ reader = read_sas(self.file01, index="SEQN", chunksize=1000)
+
+ all_data = [x for x in reader]
+ data = pd.concat(all_data, axis=0)
+
+ tm.assert_frame_equal(data, data_csv, check_index_type=False)
+
+ def test2(self):
+ # Test with SSHSV1_A.xpt
+
+ # Compare to this
+ data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv"))
+ numeric_as_float(data_csv)
+
+ data = read_sas(self.file02)
+ tm.assert_frame_equal(data, data_csv)
+
+ def test_multiple_types(self):
+ # Test with DRXFCD_G.xpt (contains text and numeric variables)
+
+ # Compare to this
+ data_csv = pd.read_csv(self.file03.replace(".xpt", ".csv"))
+
+ data = read_sas(self.file03, encoding="utf-8")
+ tm.assert_frame_equal(data, data_csv)
+
+ def test_truncated_float_support(self):
+ # Test with paxraw_d_short.xpt, a shortened version of:
+ # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP
+ # This file has truncated floats (5 bytes in this case).
+
+ # GH 11713
+
+ data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv"))
+
+ data = read_sas(self.file04, format="xport")
+ tm.assert_frame_equal(data.astype('int64'), data_csv)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_clipboard.py b/contrib/python/pandas/py2/pandas/tests/io/test_clipboard.py
new file mode 100644
index 00000000000..8eb26d9f3de
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_clipboard.py
@@ -0,0 +1,227 @@
+# -*- coding: utf-8 -*-
+from textwrap import dedent
+
+import numpy as np
+from numpy.random import randint
+import pytest
+
+from pandas.compat import PY2
+
+import pandas as pd
+from pandas import DataFrame, get_option, read_clipboard
+from pandas.util import testing as tm
+from pandas.util.testing import makeCustomDataframe as mkdf
+
+from pandas.io.clipboard.exceptions import PyperclipException
+
+try:
+ DataFrame({'A': [1, 2]}).to_clipboard()
+ _DEPS_INSTALLED = 1
+except (PyperclipException, RuntimeError):
+ _DEPS_INSTALLED = 0
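+# Round-trip probe: when no system clipboard is available, the tests below are skipped.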
+
+
+def build_kwargs(sep, excel):
+ kwargs = {}
+ if excel != 'default':
+ kwargs['excel'] = excel
+ if sep != 'default':
+ kwargs['sep'] = sep
+ return kwargs
+
+
[email protected](params=['delims', 'utf8', 'string', 'long', 'nonascii',
+ 'colwidth', 'mixed', 'float', 'int'])
+def df(request):
+ data_type = request.param
+
+ if data_type == 'delims':
+ return pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'],
+ 'b': ['hi\'j', 'k\'\'lm']})
+ elif data_type == 'utf8':
+ return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'],
+ 'b': ['øπ∆˚¬', 'œ∑´®']})
+ elif data_type == 'string':
+ return mkdf(5, 3, c_idx_type='s', r_idx_type='i',
+ c_idx_names=[None], r_idx_names=[None])
+ elif data_type == 'long':
+ max_rows = get_option('display.max_rows')
+ return mkdf(max_rows + 1, 3,
+ data_gen_f=lambda *args: randint(2),
+ c_idx_type='s', r_idx_type='i',
+ c_idx_names=[None], r_idx_names=[None])
+ elif data_type == 'nonascii':
+ return pd.DataFrame({'en': 'in English'.split(),
+ 'es': 'en español'.split()})
+ elif data_type == 'colwidth':
+ _cw = get_option('display.max_colwidth') + 1
+ return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw,
+ c_idx_type='s', r_idx_type='i',
+ c_idx_names=[None], r_idx_names=[None])
+ elif data_type == 'mixed':
+ return DataFrame({'a': np.arange(1.0, 6.0) + 0.01,
+ 'b': np.arange(1, 6),
+ 'c': list('abcde')})
+ elif data_type == 'float':
+ return mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01,
+ c_idx_type='s', r_idx_type='i',
+ c_idx_names=[None], r_idx_names=[None])
+ elif data_type == 'int':
+ return mkdf(5, 3, data_gen_f=lambda *args: randint(2),
+ c_idx_type='s', r_idx_type='i',
+ c_idx_names=[None], r_idx_names=[None])
+ else:
+ raise ValueError
+
+
[email protected]
+def mock_clipboard(monkeypatch, request):
+ """Fixture mocking clipboard IO.
+
+ This mocks pandas.io.clipboard.clipboard_get and
+ pandas.io.clipboard.clipboard_set.
+
+ This uses a local dict for storing data. The dictionary
+ key used is the test ID, available with ``request.node.name``.
+
+ This returns the local dictionary, for direct manipulation by
+ tests.
+ """
+
+ # our local clipboard for tests
+ _mock_data = {}
+
+ def _mock_set(data):
+ _mock_data[request.node.name] = data
+
+ def _mock_get():
+ return _mock_data[request.node.name]
+
+ monkeypatch.setattr("pandas.io.clipboard.clipboard_set", _mock_set)
+ monkeypatch.setattr("pandas.io.clipboard.clipboard_get", _mock_get)
+
+ yield _mock_data
+
+
+def test_mock_clipboard(mock_clipboard):
+ import pandas.io.clipboard
+ pandas.io.clipboard.clipboard_set("abc")
+ assert "abc" in set(mock_clipboard.values())
+ result = pandas.io.clipboard.clipboard_get()
+ assert result == "abc"
+
+
[email protected](not _DEPS_INSTALLED,
+ reason="clipboard primitives not installed")
[email protected]("mock_clipboard")
+class TestClipboard(object):
+
+ def check_round_trip_frame(self, data, excel=None, sep=None,
+ encoding=None):
+ data.to_clipboard(excel=excel, sep=sep, encoding=encoding)
+ result = read_clipboard(sep=sep or '\t', index_col=0,
+ encoding=encoding)
+ tm.assert_frame_equal(data, result, check_dtype=False)
+
+ # Test that default arguments copy as tab delimited
+ def test_round_trip_frame(self, df):
+ self.check_round_trip_frame(df)
+
+ # Test that explicit delimiters are respected
+ @pytest.mark.parametrize('sep', ['\t', ',', '|'])
+ def test_round_trip_frame_sep(self, df, sep):
+ self.check_round_trip_frame(df, sep=sep)
+
+ # Test white space separator
+ def test_round_trip_frame_string(self, df):
+ df.to_clipboard(excel=False, sep=None)
+ result = read_clipboard()
+ assert df.to_string() == result.to_string()
+ assert df.shape == result.shape
+
+ # Two character separator is not supported in to_clipboard
+ # Test that multi-character separators are not silently passed
+ def test_excel_sep_warning(self, df):
+ with tm.assert_produces_warning():
+ df.to_clipboard(excel=True, sep=r'\t')
+
+ # Separator is ignored when excel=False and should produce a warning
+ def test_copy_delim_warning(self, df):
+ with tm.assert_produces_warning():
+ df.to_clipboard(excel=False, sep='\t')
+
+ # Tests that the default behavior of to_clipboard is tab
+    # delimited and excel=True
+ @pytest.mark.parametrize('sep', ['\t', None, 'default'])
+ @pytest.mark.parametrize('excel', [True, None, 'default'])
+ def test_clipboard_copy_tabs_default(self, sep, excel, df, request,
+ mock_clipboard):
+ kwargs = build_kwargs(sep, excel)
+ df.to_clipboard(**kwargs)
+ if PY2:
+ # to_clipboard copies unicode, to_csv produces bytes. This is
+ # expected behavior
+ result = mock_clipboard[request.node.name].encode('utf-8')
+ expected = df.to_csv(sep='\t')
+ assert result == expected
+ else:
+ assert mock_clipboard[request.node.name] == df.to_csv(sep='\t')
+
+ # Tests reading of white space separated tables
+ @pytest.mark.parametrize('sep', [None, 'default'])
+ @pytest.mark.parametrize('excel', [False])
+ def test_clipboard_copy_strings(self, sep, excel, df):
+ kwargs = build_kwargs(sep, excel)
+ df.to_clipboard(**kwargs)
+ result = read_clipboard(sep=r'\s+')
+ assert result.to_string() == df.to_string()
+ assert df.shape == result.shape
+
+ def test_read_clipboard_infer_excel(self, request,
+ mock_clipboard):
+ # gh-19010: avoid warnings
+ clip_kwargs = dict(engine="python")
+
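+        # tab-delimited text, as produced when copying a range from Excel;
+        # read_clipboard should infer the tab separator from it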
+ text = dedent("""
+ John James Charlie Mingus
+ 1 2
+ 4 Harry Carney
+ """.strip())
+ mock_clipboard[request.node.name] = text
+ df = pd.read_clipboard(**clip_kwargs)
+
+ # excel data is parsed correctly
+ assert df.iloc[1][1] == 'Harry Carney'
+
+        # rows with differing tab counts don't trigger Excel-style parsing
+ text = dedent("""
+ a\t b
+ 1 2
+ 3 4
+ """.strip())
+ mock_clipboard[request.node.name] = text
+ res = pd.read_clipboard(**clip_kwargs)
+
+ text = dedent("""
+ a b
+ 1 2
+ 3 4
+ """.strip())
+ mock_clipboard[request.node.name] = text
+ exp = pd.read_clipboard(**clip_kwargs)
+
+ tm.assert_frame_equal(res, exp)
+
+ def test_invalid_encoding(self, df):
+        # unsupported encodings should raise
+ with pytest.raises(ValueError):
+ df.to_clipboard(encoding='ascii')
+ with pytest.raises(NotImplementedError):
+ pd.read_clipboard(encoding='ascii')
+
+ @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8'])
+ def test_round_trip_valid_encodings(self, enc, df):
+ self.check_round_trip_frame(df, encoding=enc)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_common.py b/contrib/python/pandas/py2/pandas/tests/io/test_common.py
new file mode 100644
index 00000000000..3354bca63be
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_common.py
@@ -0,0 +1,357 @@
+"""
+Tests for the pandas.io.common functionality
+"""
+import mmap
+import os
+
+import pytest
+
+from pandas.compat import FileNotFoundError, StringIO, is_platform_windows
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas.util.testing as tm
+
+import pandas.io.common as icom
+
+
+class CustomFSPath(object):
+ """For testing fspath on unknown objects"""
+ def __init__(self, path):
+ self.path = path
+
+ def __fspath__(self):
+ return self.path
+
+
+# Functions that consume a string path and return a string or path-like object
+path_types = [str, CustomFSPath]
+
+try:
+ from pathlib import Path
+ path_types.append(Path)
+except ImportError:
+ pass
+
+try:
+ from py.path import local as LocalPath
+ path_types.append(LocalPath)
+except ImportError:
+ pass
+
+HERE = os.path.abspath(os.path.dirname(__file__))
+
+
+# https://github.com/cython/cython/issues/1720
[email protected]("ignore:can't resolve package:ImportWarning")
+class TestCommonIOCapabilities(object):
+ data1 = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+
+ def test_expand_user(self):
+ filename = '~/sometest'
+ expanded_name = icom._expand_user(filename)
+
+ assert expanded_name != filename
+ assert os.path.isabs(expanded_name)
+ assert os.path.expanduser(filename) == expanded_name
+
+ def test_expand_user_normal_path(self):
+ filename = '/somefolder/sometest'
+ expanded_name = icom._expand_user(filename)
+
+ assert expanded_name == filename
+ assert os.path.expanduser(filename) == expanded_name
+
+ @td.skip_if_no('pathlib')
+ def test_stringify_path_pathlib(self):
+ rel_path = icom._stringify_path(Path('.'))
+ assert rel_path == '.'
+ redundant_path = icom._stringify_path(Path('foo//bar'))
+ assert redundant_path == os.path.join('foo', 'bar')
+
+ @td.skip_if_no('py.path')
+ def test_stringify_path_localpath(self):
+ path = os.path.join('foo', 'bar')
+ abs_path = os.path.abspath(path)
+ lpath = LocalPath(path)
+ assert icom._stringify_path(lpath) == abs_path
+
+ def test_stringify_path_fspath(self):
+ p = CustomFSPath('foo/bar.csv')
+ result = icom._stringify_path(p)
+ assert result == 'foo/bar.csv'
+
+ @pytest.mark.parametrize('extension,expected', [
+ ('', None),
+ ('.gz', 'gzip'),
+ ('.bz2', 'bz2'),
+ ('.zip', 'zip'),
+ ('.xz', 'xz'),
+ ])
+ @pytest.mark.parametrize('path_type', path_types)
+ def test_infer_compression_from_path(self, extension, expected, path_type):
+ path = path_type('foo/bar.csv' + extension)
+ compression = icom._infer_compression(path, compression='infer')
+ assert compression == expected
+
+ def test_get_filepath_or_buffer_with_path(self):
+ filename = '~/sometest'
+ filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
+ filename)
+ assert filepath_or_buffer != filename
+ assert os.path.isabs(filepath_or_buffer)
+ assert os.path.expanduser(filename) == filepath_or_buffer
+ assert not should_close
+
+ def test_get_filepath_or_buffer_with_buffer(self):
+ input_buffer = StringIO()
+ filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
+ input_buffer)
+ assert filepath_or_buffer == input_buffer
+ assert not should_close
+
+ def test_iterator(self):
+ reader = pd.read_csv(StringIO(self.data1), chunksize=1)
+ result = pd.concat(reader, ignore_index=True)
+ expected = pd.read_csv(StringIO(self.data1))
+ tm.assert_frame_equal(result, expected)
+
+ # GH12153
+ it = pd.read_csv(StringIO(self.data1), chunksize=1)
+ first = next(it)
+ tm.assert_frame_equal(first, expected.iloc[[0]])
+ tm.assert_frame_equal(pd.concat(it), expected.iloc[1:])
+
+ @pytest.mark.parametrize('reader, module, error_class, fn_ext', [
+ (pd.read_csv, 'os', FileNotFoundError, 'csv'),
+ (pd.read_fwf, 'os', FileNotFoundError, 'txt'),
+ (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'),
+ (pd.read_feather, 'feather', Exception, 'feather'),
+ (pd.read_hdf, 'tables', FileNotFoundError, 'h5'),
+ (pd.read_stata, 'os', FileNotFoundError, 'dta'),
+ (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'),
+ (pd.read_json, 'os', ValueError, 'json'),
+ (pd.read_msgpack, 'os', ValueError, 'mp'),
+ (pd.read_pickle, 'os', FileNotFoundError, 'pickle'),
+ ])
+ def test_read_non_existant(self, reader, module, error_class, fn_ext):
+ pytest.importorskip(module)
+
+ path = os.path.join(HERE, 'data', 'does_not_exist.' + fn_ext)
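+        # the "missing file" message varies across readers, Python
+        # versions and platforms, so accept any of the known variants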
+ msg1 = (r"File (b')?.+does_not_exist\.{}'? does not exist"
+ .format(fn_ext))
+ msg2 = (r"\[Errno 2\] No such file or directory: '.+does_not_exist"
+ r"\.{}'").format(fn_ext)
+ msg3 = "Expected object or value"
+ msg4 = "path_or_buf needs to be a string file path or file-like"
+ msg5 = (r"\[Errno 2\] File .+does_not_exist\.{} does not exist:"
+ r" '.+does_not_exist\.{}'").format(fn_ext, fn_ext)
+ with pytest.raises(error_class, match=r"({}|{}|{}|{}|{})".format(
+ msg1, msg2, msg3, msg4, msg5)):
+ reader(path)
+
+ @pytest.mark.parametrize('reader, module, error_class, fn_ext', [
+ (pd.read_csv, 'os', FileNotFoundError, 'csv'),
+ (pd.read_fwf, 'os', FileNotFoundError, 'txt'),
+ (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'),
+ (pd.read_feather, 'feather', Exception, 'feather'),
+ (pd.read_hdf, 'tables', FileNotFoundError, 'h5'),
+ (pd.read_stata, 'os', FileNotFoundError, 'dta'),
+ (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'),
+ (pd.read_json, 'os', ValueError, 'json'),
+ (pd.read_msgpack, 'os', ValueError, 'mp'),
+ (pd.read_pickle, 'os', FileNotFoundError, 'pickle'),
+ ])
+ def test_read_expands_user_home_dir(self, reader, module,
+ error_class, fn_ext, monkeypatch):
+ pytest.importorskip(module)
+
+ path = os.path.join('~', 'does_not_exist.' + fn_ext)
+ monkeypatch.setattr(icom, '_expand_user',
+ lambda x: os.path.join('foo', x))
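+        # _expand_user is patched to prepend a non-existent directory, so
+        # any reader that honours it fails with a predictable error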
+
+ msg1 = (r"File (b')?.+does_not_exist\.{}'? does not exist"
+ .format(fn_ext))
+ msg2 = (r"\[Errno 2\] No such file or directory:"
+ r" '.+does_not_exist\.{}'").format(fn_ext)
+ msg3 = "Unexpected character found when decoding 'false'"
+ msg4 = "path_or_buf needs to be a string file path or file-like"
+ msg5 = (r"\[Errno 2\] File .+does_not_exist\.{} does not exist:"
+ r" '.+does_not_exist\.{}'").format(fn_ext, fn_ext)
+
+ with pytest.raises(error_class, match=r"({}|{}|{}|{}|{})".format(
+ msg1, msg2, msg3, msg4, msg5)):
+ reader(path)
+
+ def test_read_non_existant_read_table(self):
+ path = os.path.join(HERE, 'data', 'does_not_exist.' + 'csv')
+ msg1 = r"File b'.+does_not_exist\.csv' does not exist"
+ msg2 = (r"\[Errno 2\] File .+does_not_exist\.csv does not exist:"
+ r" '.+does_not_exist\.csv'")
+ with pytest.raises(FileNotFoundError, match=r"({}|{})".format(
+ msg1, msg2)):
+ with tm.assert_produces_warning(FutureWarning):
+ pd.read_table(path)
+
+ @pytest.mark.parametrize('reader, module, path', [
+ (pd.read_csv, 'os', ('io', 'data', 'iris.csv')),
+ (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')),
+ (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')),
+ (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')),
+ (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf',
+ 'datetimetz_object.h5')),
+ (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')),
+ (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')),
+ (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')),
+ (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')),
+ (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')),
+ ])
+ def test_read_fspath_all(self, reader, module, path, datapath):
+ pytest.importorskip(module)
+ path = datapath(*path)
+
+ mypath = CustomFSPath(path)
+ result = reader(mypath)
+ expected = reader(path)
+
+ if path.endswith('.pickle'):
+ # categorical
+ tm.assert_categorical_equal(result, expected)
+ else:
+ tm.assert_frame_equal(result, expected)
+
+ def test_read_fspath_all_read_table(self, datapath):
+ path = datapath('io', 'data', 'iris.csv')
+
+ mypath = CustomFSPath(path)
+ with tm.assert_produces_warning(FutureWarning):
+ result = pd.read_table(mypath)
+ with tm.assert_produces_warning(FutureWarning):
+ expected = pd.read_table(path)
+
+ if path.endswith('.pickle'):
+ # categorical
+ tm.assert_categorical_equal(result, expected)
+ else:
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('writer_name, writer_kwargs, module', [
+ ('to_csv', {}, 'os'),
+ ('to_excel', {'engine': 'xlwt'}, 'xlwt'),
+ ('to_feather', {}, 'feather'),
+ ('to_html', {}, 'os'),
+ ('to_json', {}, 'os'),
+ ('to_latex', {}, 'os'),
+ ('to_msgpack', {}, 'os'),
+ ('to_pickle', {}, 'os'),
+ ('to_stata', {}, 'os'),
+ ])
+ def test_write_fspath_all(self, writer_name, writer_kwargs, module):
+ p1 = tm.ensure_clean('string')
+ p2 = tm.ensure_clean('fspath')
+ df = pd.DataFrame({"A": [1, 2]})
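+        # write once through a plain string path and once through an
+        # object implementing __fspath__; the resulting bytes must match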
+
+ with p1 as string, p2 as fspath:
+ pytest.importorskip(module)
+ mypath = CustomFSPath(fspath)
+ writer = getattr(df, writer_name)
+
+ writer(string, **writer_kwargs)
+ with open(string, 'rb') as f:
+ expected = f.read()
+
+ writer(mypath, **writer_kwargs)
+ with open(fspath, 'rb') as f:
+ result = f.read()
+
+ assert result == expected
+
+ def test_write_fspath_hdf5(self):
+ # Same test as write_fspath_all, except HDF5 files aren't
+ # necessarily byte-for-byte identical for a given dataframe, so we'll
+ # have to read and compare equality
+ pytest.importorskip('tables')
+
+ df = pd.DataFrame({"A": [1, 2]})
+ p1 = tm.ensure_clean('string')
+ p2 = tm.ensure_clean('fspath')
+
+ with p1 as string, p2 as fspath:
+ mypath = CustomFSPath(fspath)
+ df.to_hdf(mypath, key='bar')
+ df.to_hdf(string, key='bar')
+
+ result = pd.read_hdf(fspath, key='bar')
+ expected = pd.read_hdf(string, key='bar')
+
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]
+def mmap_file(datapath):
+ return datapath('io', 'data', 'test_mmap.csv')
+
+
+class TestMMapWrapper(object):
+
+ def test_constructor_bad_file(self, mmap_file):
+ non_file = StringIO('I am not a file')
+ non_file.fileno = lambda: -1
+
+ # the error raised is different on Windows
+ if is_platform_windows():
+ msg = "The parameter is incorrect"
+ err = OSError
+ else:
+ msg = "[Errno 22]"
+ err = mmap.error
+
+ with pytest.raises(err, match=msg):
+ icom.MMapWrapper(non_file)
+
+ target = open(mmap_file, 'r')
+ target.close()
+
+ msg = "I/O operation on closed file"
+ with pytest.raises(ValueError, match=msg):
+ icom.MMapWrapper(target)
+
+ def test_get_attr(self, mmap_file):
+ with open(mmap_file, 'r') as target:
+ wrapper = icom.MMapWrapper(target)
+
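+        # every public attribute of the underlying mmap object should be
+        # reachable through the wrapper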
+ attrs = dir(wrapper.mmap)
+ attrs = [attr for attr in attrs
+ if not attr.startswith('__')]
+ attrs.append('__next__')
+
+ for attr in attrs:
+ assert hasattr(wrapper, attr)
+
+ assert not hasattr(wrapper, 'foo')
+
+ def test_next(self, mmap_file):
+ with open(mmap_file, 'r') as target:
+ wrapper = icom.MMapWrapper(target)
+ lines = target.readlines()
+
+ for line in lines:
+ next_line = next(wrapper)
+ assert next_line.strip() == line.strip()
+
+ with pytest.raises(StopIteration, match=r'^$'):
+ next(wrapper)
+
+ def test_unknown_engine(self):
+ with tm.ensure_clean() as path:
+ df = tm.makeDataFrame()
+ df.to_csv(path)
+ with pytest.raises(ValueError, match='Unknown engine'):
+ pd.read_csv(path, engine='pyt')
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_compression.py b/contrib/python/pandas/py2/pandas/tests/io/test_compression.py
new file mode 100644
index 00000000000..a3fb35f9f01
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_compression.py
@@ -0,0 +1,116 @@
+import contextlib
+import os
+import warnings
+
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+import pandas.io.common as icom
+
+
[email protected]
+def catch_to_csv_depr():
+ # Catching warnings because Series.to_csv has
+ # been deprecated. Remove this context when
+ # Series.to_csv has been aligned.
+
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", FutureWarning)
+ yield
+
+
[email protected]('obj', [
+    pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567],
+                        [12.32112, 123123.2, 321321.2]],
+                 columns=['X', 'Y', 'Z']),
+    pd.Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
[email protected]('method', ['to_pickle', 'to_json', 'to_csv'])
+def test_compression_size(obj, method, compression_only):
+ with tm.ensure_clean() as path:
+ with catch_to_csv_depr():
+ getattr(obj, method)(path, compression=compression_only)
+ compressed_size = os.path.getsize(path)
+ getattr(obj, method)(path, compression=None)
+ uncompressed_size = os.path.getsize(path)
+ assert uncompressed_size > compressed_size
+
+
[email protected]('obj', [
+    pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567],
+                        [12.32112, 123123.2, 321321.2]],
+                 columns=['X', 'Y', 'Z']),
+    pd.Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
[email protected]('method', ['to_csv', 'to_json'])
+def test_compression_size_fh(obj, method, compression_only):
+ with tm.ensure_clean() as path:
+ f, handles = icom._get_handle(path, 'w', compression=compression_only)
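+        # _get_handle returns the (possibly compression-wrapping) file
+        # object plus any additional handles it opened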
+ with catch_to_csv_depr():
+ with f:
+ getattr(obj, method)(f)
+ assert not f.closed
+ assert f.closed
+ compressed_size = os.path.getsize(path)
+ with tm.ensure_clean() as path:
+ f, handles = icom._get_handle(path, 'w', compression=None)
+ with catch_to_csv_depr():
+ with f:
+ getattr(obj, method)(f)
+ assert not f.closed
+ assert f.closed
+ uncompressed_size = os.path.getsize(path)
+ assert uncompressed_size > compressed_size
+
+
[email protected]('write_method, write_kwargs, read_method', [
+ ('to_csv', {'index': False}, pd.read_csv),
+ ('to_json', {}, pd.read_json),
+ ('to_pickle', {}, pd.read_pickle),
+])
+def test_dataframe_compression_defaults_to_infer(
+ write_method, write_kwargs, read_method, compression_only):
+ # GH22004
+ input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=['X', 'Y', 'Z'])
+ extension = icom._compression_to_extension[compression_only]
+ with tm.ensure_clean('compressed' + extension) as path:
+ getattr(input, write_method)(path, **write_kwargs)
+ output = read_method(path, compression=compression_only)
+ tm.assert_frame_equal(output, input)
+
+
[email protected]('write_method,write_kwargs,read_method,read_kwargs', [
+ ('to_csv', {'index': False, 'header': True},
+ pd.read_csv, {'squeeze': True}),
+ ('to_json', {}, pd.read_json, {'typ': 'series'}),
+ ('to_pickle', {}, pd.read_pickle, {}),
+])
+def test_series_compression_defaults_to_infer(
+ write_method, write_kwargs, read_method, read_kwargs,
+ compression_only):
+ # GH22004
+ input = pd.Series([0, 5, -2, 10], name='X')
+ extension = icom._compression_to_extension[compression_only]
+ with tm.ensure_clean('compressed' + extension) as path:
+ getattr(input, write_method)(path, **write_kwargs)
+ output = read_method(path, compression=compression_only, **read_kwargs)
+ tm.assert_series_equal(output, input, check_names=False)
+
+
+def test_compression_warning(compression_only):
+ # Assert that passing a file object to to_csv while explicitly specifying a
+ # compression protocol triggers a RuntimeWarning, as per GH21227.
+ # Note that pytest has an issue that causes assert_produces_warning to fail
+ # in Python 2 if the warning has occurred in previous tests
+ # (see https://git.io/fNEBm & https://git.io/fNEBC). Hence, should this
+ # test fail in just Python 2 builds, it likely indicates that other tests
+ # are producing RuntimeWarnings, thereby triggering the pytest bug.
+ df = pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567],
+ [12.32112, 123123.2, 321321.2]],
+ columns=['X', 'Y', 'Z'])
+ with tm.ensure_clean() as path:
+ f, handles = icom._get_handle(path, 'w', compression=compression_only)
+ with tm.assert_produces_warning(RuntimeWarning,
+ check_stacklevel=False):
+ with f:
+ df.to_csv(f, compression=compression_only)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_date_converters.py b/contrib/python/pandas/py2/pandas/tests/io/test_date_converters.py
new file mode 100644
index 00000000000..c5a94883aa6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_date_converters.py
@@ -0,0 +1,43 @@
+from datetime import datetime
+
+import numpy as np
+
+import pandas.util.testing as tm
+
+import pandas.io.date_converters as conv
+
+
+def test_parse_date_time():
+ dates = np.array(['2007/1/3', '2008/2/4'], dtype=object)
+ times = np.array(['05:07:09', '06:08:00'], dtype=object)
+ expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
+ datetime(2008, 2, 4, 6, 8, 0)])
+
+ result = conv.parse_date_time(dates, times)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_parse_date_fields():
+ days = np.array([3, 4])
+ months = np.array([1, 2])
+ years = np.array([2007, 2008])
+ result = conv.parse_date_fields(years, months, days)
+
+ expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)])
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_parse_all_fields():
+ hours = np.array([5, 6])
+ minutes = np.array([7, 8])
+ seconds = np.array([9, 0])
+
+ days = np.array([3, 4])
+ years = np.array([2007, 2008])
+ months = np.array([1, 2])
+
+ result = conv.parse_all_fields(years, months, days,
+ hours, minutes, seconds)
+ expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
+ datetime(2008, 2, 4, 6, 8, 0)])
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_excel.py b/contrib/python/pandas/py2/pandas/tests/io/test_excel.py
new file mode 100644
index 00000000000..e4dd18db37e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_excel.py
@@ -0,0 +1,2566 @@
+from collections import OrderedDict
+import contextlib
+from datetime import date, datetime, time, timedelta
+from distutils.version import LooseVersion
+from functools import partial
+import os
+import warnings
+from warnings import catch_warnings
+
+import numpy as np
+from numpy import nan
+import pytest
+
+from pandas.compat import PY36, BytesIO, iteritems, map, range, u
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series
+from pandas.core.config import get_option, set_option
+import pandas.util.testing as tm
+from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf
+
+from pandas.io.common import URLError
+from pandas.io.excel import (
+ ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter,
+ read_excel, register_writer)
+from pandas.io.formats.excel import ExcelFormatter
+from pandas.io.parsers import read_csv
+
+_seriesd = tm.getSeriesData()
+_tsd = tm.getTimeSeriesData()
+_frame = DataFrame(_seriesd)[:10]
+_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10]
+_tsframe = tm.makeTimeDataFrame()[:5]
+_mixed_frame = _frame.copy()
+_mixed_frame['foo'] = 'bar'
+
+
[email protected]
+def ignore_xlrd_time_clock_warning():
+ """
+ Context manager to ignore warnings raised by the xlrd library,
+ regarding the deprecation of `time.clock` in Python 3.7.
+ """
+ with warnings.catch_warnings():
+ warnings.filterwarnings(
+ action='ignore',
+ message='time.clock has been deprecated',
+ category=DeprecationWarning)
+ yield
+
+
[email protected]_if_no('xlrd', '1.0.0')
+class SharedItems(object):
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, datapath):
+ self.dirpath = datapath("io", "data")
+ self.frame = _frame.copy()
+ self.frame2 = _frame2.copy()
+ self.tsframe = _tsframe.copy()
+ self.mixed_frame = _mixed_frame.copy()
+
+ def get_csv_refdf(self, basename):
+ """
+ Obtain the reference data from read_csv with the Python engine.
+
+ Parameters
+ ----------
+
+ basename : str
+ File base name, excluding file extension.
+
+ Returns
+ -------
+
+ dfref : DataFrame
+ """
+ pref = os.path.join(self.dirpath, basename + '.csv')
+ dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python')
+ return dfref
+
+ def get_excelfile(self, basename, ext):
+ """
+ Return test data ExcelFile instance.
+
+ Parameters
+ ----------
+
+ basename : str
+ File base name, excluding file extension.
+
+ Returns
+ -------
+
+ excel : io.excel.ExcelFile
+ """
+ return ExcelFile(os.path.join(self.dirpath, basename + ext))
+
+ def get_exceldf(self, basename, ext, *args, **kwds):
+ """
+ Return test data DataFrame.
+
+ Parameters
+ ----------
+
+ basename : str
+ File base name, excluding file extension.
+
+ Returns
+ -------
+
+ df : DataFrame
+ """
+ pth = os.path.join(self.dirpath, basename + ext)
+ return read_excel(pth, *args, **kwds)
+
+
+class ReadingTestsBase(SharedItems):
+ # This is based on ExcelWriterBase
+
+ @pytest.fixture(autouse=True, params=['xlrd', None])
+ def set_engine(self, request):
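+        # temporarily rebind get_exceldf so every call in the test uses
+        # the parametrized engine; the original method is restored after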
+ func_name = "get_exceldf"
+ old_func = getattr(self, func_name)
+ new_func = partial(old_func, engine=request.param)
+ setattr(self, func_name, new_func)
+ yield
+ setattr(self, func_name, old_func)
+
+ @td.skip_if_no("xlrd", "1.0.1") # see gh-22682
+ def test_usecols_int(self, ext):
+
+ df_ref = self.get_csv_refdf("test1")
+ df_ref = df_ref.reindex(columns=["A", "B", "C"])
+
+ # usecols as int
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ with ignore_xlrd_time_clock_warning():
+ df1 = self.get_exceldf("test1", ext, "Sheet1",
+ index_col=0, usecols=3)
+
+ # usecols as int
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ with ignore_xlrd_time_clock_warning():
+ df2 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1],
+ index_col=0, usecols=3)
+
+ # parse_cols instead of usecols, usecols as int
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ with ignore_xlrd_time_clock_warning():
+ df3 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1],
+ index_col=0, parse_cols=3)
+
+        # TODO add index to xls file
+ tm.assert_frame_equal(df1, df_ref, check_names=False)
+ tm.assert_frame_equal(df2, df_ref, check_names=False)
+ tm.assert_frame_equal(df3, df_ref, check_names=False)
+
+ @td.skip_if_no('xlrd', '1.0.1') # GH-22682
+ def test_usecols_list(self, ext):
+
+ dfref = self.get_csv_refdf('test1')
+ dfref = dfref.reindex(columns=['B', 'C'])
+ df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+ usecols=[0, 2, 3])
+ df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+ index_col=0, usecols=[0, 2, 3])
+
+ with tm.assert_produces_warning(FutureWarning):
+ with ignore_xlrd_time_clock_warning():
+ df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+ index_col=0, parse_cols=[0, 2, 3])
+
+        # TODO add index to xls file
+ tm.assert_frame_equal(df1, dfref, check_names=False)
+ tm.assert_frame_equal(df2, dfref, check_names=False)
+ tm.assert_frame_equal(df3, dfref, check_names=False)
+
+ @td.skip_if_no('xlrd', '1.0.1') # GH-22682
+ def test_usecols_str(self, ext):
+
+ dfref = self.get_csv_refdf('test1')
+
+ df1 = dfref.reindex(columns=['A', 'B', 'C'])
+ df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+ usecols='A:D')
+ df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+ index_col=0, usecols='A:D')
+
+ with tm.assert_produces_warning(FutureWarning):
+ with ignore_xlrd_time_clock_warning():
+ df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+ index_col=0, parse_cols='A:D')
+
+        # TODO add index to xls; read xls ignores index name?
+ tm.assert_frame_equal(df2, df1, check_names=False)
+ tm.assert_frame_equal(df3, df1, check_names=False)
+ tm.assert_frame_equal(df4, df1, check_names=False)
+
+ df1 = dfref.reindex(columns=['B', 'C'])
+ df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+ usecols='A,C,D')
+ df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+ index_col=0, usecols='A,C,D')
+ # TODO add index to xls file
+ tm.assert_frame_equal(df2, df1, check_names=False)
+ tm.assert_frame_equal(df3, df1, check_names=False)
+
+ df1 = dfref.reindex(columns=['B', 'C'])
+ df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+ usecols='A,C:D')
+ df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+ index_col=0, usecols='A,C:D')
+ tm.assert_frame_equal(df2, df1, check_names=False)
+ tm.assert_frame_equal(df3, df1, check_names=False)
+
+ @pytest.mark.parametrize("usecols", [
+ [0, 1, 3], [0, 3, 1],
+ [1, 0, 3], [1, 3, 0],
+ [3, 0, 1], [3, 1, 0],
+ ])
+ def test_usecols_diff_positional_int_columns_order(self, ext, usecols):
+ expected = self.get_csv_refdf("test1")[["A", "C"]]
+ result = self.get_exceldf("test1", ext, "Sheet1",
+ index_col=0, usecols=usecols)
+ tm.assert_frame_equal(result, expected, check_names=False)
+
+ @pytest.mark.parametrize("usecols", [
+ ["B", "D"], ["D", "B"]
+ ])
+ def test_usecols_diff_positional_str_columns_order(self, ext, usecols):
+ expected = self.get_csv_refdf("test1")[["B", "D"]]
+ expected.index = range(len(expected))
+
+ result = self.get_exceldf("test1", ext, "Sheet1", usecols=usecols)
+ tm.assert_frame_equal(result, expected, check_names=False)
+
+ def test_read_excel_without_slicing(self, ext):
+ expected = self.get_csv_refdf("test1")
+ result = self.get_exceldf("test1", ext, "Sheet1", index_col=0)
+ tm.assert_frame_equal(result, expected, check_names=False)
+
+ def test_usecols_excel_range_str(self, ext):
+ expected = self.get_csv_refdf("test1")[["C", "D"]]
+ result = self.get_exceldf("test1", ext, "Sheet1",
+ index_col=0, usecols="A,D:E")
+ tm.assert_frame_equal(result, expected, check_names=False)
+
+ def test_usecols_excel_range_str_invalid(self, ext):
+ msg = "Invalid column name: E1"
+
+ with pytest.raises(ValueError, match=msg):
+ self.get_exceldf("test1", ext, "Sheet1", usecols="D:E1")
+
+ def test_index_col_label_error(self, ext):
+ msg = "list indices must be integers.*, not str"
+
+ with pytest.raises(TypeError, match=msg):
+ self.get_exceldf("test1", ext, "Sheet1", index_col=["A"],
+ usecols=["A", "C"])
+
+ def test_index_col_empty(self, ext):
+ # see gh-9208
+ result = self.get_exceldf("test1", ext, "Sheet3",
+ index_col=["A", "B", "C"])
+ expected = DataFrame(columns=["D", "E", "F"],
+ index=MultiIndex(levels=[[]] * 3,
+ codes=[[]] * 3,
+ names=["A", "B", "C"]))
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("index_col", [None, 2])
+ def test_index_col_with_unnamed(self, ext, index_col):
+ # see gh-18792
+ result = self.get_exceldf("test1", ext, "Sheet4",
+ index_col=index_col)
+ expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]],
+ columns=["Unnamed: 0", "col1", "col2"])
+ if index_col:
+ expected = expected.set_index(expected.columns[index_col])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_usecols_pass_non_existent_column(self, ext):
+ msg = ("Usecols do not match columns, "
+ "columns expected but not found: " + r"\['E'\]")
+
+ with pytest.raises(ValueError, match=msg):
+ self.get_exceldf("test1", ext, usecols=["E"])
+
+ def test_usecols_wrong_type(self, ext):
+ msg = ("'usecols' must either be list-like of "
+ "all strings, all unicode, all integers or a callable.")
+
+ with pytest.raises(ValueError, match=msg):
+ self.get_exceldf("test1", ext, usecols=["E1", 0])
+
+ def test_excel_stop_iterator(self, ext):
+
+ parsed = self.get_exceldf('test2', ext, 'Sheet1')
+ expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])
+ tm.assert_frame_equal(parsed, expected)
+
+ def test_excel_cell_error_na(self, ext):
+
+ parsed = self.get_exceldf('test3', ext, 'Sheet1')
+ expected = DataFrame([[np.nan]], columns=['Test'])
+ tm.assert_frame_equal(parsed, expected)
+
+ def test_excel_passes_na(self, ext):
+
+ excel = self.get_excelfile('test4', ext)
+
+ parsed = read_excel(excel, 'Sheet1', keep_default_na=False,
+ na_values=['apple'])
+ expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']],
+ columns=['Test'])
+ tm.assert_frame_equal(parsed, expected)
+
+ parsed = read_excel(excel, 'Sheet1', keep_default_na=True,
+ na_values=['apple'])
+ expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
+ columns=['Test'])
+ tm.assert_frame_equal(parsed, expected)
+
+ # 13967
+ excel = self.get_excelfile('test5', ext)
+
+ parsed = read_excel(excel, 'Sheet1', keep_default_na=False,
+ na_values=['apple'])
+ expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']],
+ columns=['Test'])
+ tm.assert_frame_equal(parsed, expected)
+
+ parsed = read_excel(excel, 'Sheet1', keep_default_na=True,
+ na_values=['apple'])
+ expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
+ columns=['Test'])
+ tm.assert_frame_equal(parsed, expected)
+
+ @td.skip_if_no('xlrd', '1.0.1') # GH-22682
+ def test_deprecated_sheetname(self, ext):
+ # gh-17964
+ excel = self.get_excelfile('test1', ext)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ read_excel(excel, sheetname='Sheet1')
+
+ with pytest.raises(TypeError):
+ read_excel(excel, sheet='Sheet1')
+
+ @td.skip_if_no('xlrd', '1.0.1') # GH-22682
+ def test_excel_table_sheet_by_index(self, ext):
+
+ excel = self.get_excelfile('test1', ext)
+ dfref = self.get_csv_refdf('test1')
+
+ df1 = read_excel(excel, 0, index_col=0)
+ df2 = read_excel(excel, 1, skiprows=[1], index_col=0)
+ tm.assert_frame_equal(df1, dfref, check_names=False)
+ tm.assert_frame_equal(df2, dfref, check_names=False)
+
+ df1 = excel.parse(0, index_col=0)
+ df2 = excel.parse(1, skiprows=[1], index_col=0)
+ tm.assert_frame_equal(df1, dfref, check_names=False)
+ tm.assert_frame_equal(df2, dfref, check_names=False)
+
+ df3 = read_excel(excel, 0, index_col=0, skipfooter=1)
+ tm.assert_frame_equal(df3, df1.iloc[:-1])
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ df4 = read_excel(excel, 0, index_col=0, skip_footer=1)
+ tm.assert_frame_equal(df3, df4)
+
+ df3 = excel.parse(0, index_col=0, skipfooter=1)
+ tm.assert_frame_equal(df3, df1.iloc[:-1])
+
+ import xlrd
+ with pytest.raises(xlrd.XLRDError):
+ read_excel(excel, 'asdf')
+
+ def test_excel_table(self, ext):
+
+ dfref = self.get_csv_refdf('test1')
+
+ df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0)
+ df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+ index_col=0)
+ # TODO add index to file
+ tm.assert_frame_equal(df1, dfref, check_names=False)
+ tm.assert_frame_equal(df2, dfref, check_names=False)
+
+ df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+ skipfooter=1)
+ tm.assert_frame_equal(df3, df1.iloc[:-1])
+
+ def test_reader_special_dtypes(self, ext):
+
+ expected = DataFrame.from_dict(OrderedDict([
+ ("IntCol", [1, 2, -3, 4, 0]),
+ ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
+ ("BoolCol", [True, False, True, True, False]),
+ ("StrCol", [1, 2, 3, 4, 5]),
+ # GH5394 - this is why convert_float isn't vectorized
+ ("Str2Col", ["a", 3, "c", "d", "e"]),
+ ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31),
+ datetime(1905, 1, 1), datetime(2013, 12, 14),
+ datetime(2015, 3, 14)])
+ ]))
+ basename = 'test_types'
+
+ # should read in correctly and infer types
+ actual = self.get_exceldf(basename, ext, 'Sheet1')
+ tm.assert_frame_equal(actual, expected)
+
+ # if not coercing number, then int comes in as float
+ float_expected = expected.copy()
+ float_expected["IntCol"] = float_expected["IntCol"].astype(float)
+ float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
+ actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False)
+ tm.assert_frame_equal(actual, float_expected)
+
+ # check setting Index (assuming xls and xlsx are the same here)
+ for icol, name in enumerate(expected.columns):
+ actual = self.get_exceldf(basename, ext, 'Sheet1', index_col=icol)
+ exp = expected.set_index(name)
+ tm.assert_frame_equal(actual, exp)
+
+ # convert_float and converters should be different but both accepted
+ expected["StrCol"] = expected["StrCol"].apply(str)
+ actual = self.get_exceldf(
+ basename, ext, 'Sheet1', converters={"StrCol": str})
+ tm.assert_frame_equal(actual, expected)
+
+ no_convert_float = float_expected.copy()
+ no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
+ actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False,
+ converters={"StrCol": str})
+ tm.assert_frame_equal(actual, no_convert_float)
+
+ # GH8212 - support for converters and missing values
+ def test_reader_converters(self, ext):
+
+ basename = 'test_converters'
+
+ expected = DataFrame.from_dict(OrderedDict([
+ ("IntCol", [1, 2, -3, -1000, 0]),
+ ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]),
+ ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']),
+ ("StrCol", ['1', np.nan, '3', '4', '5']),
+ ]))
+
+ converters = {'IntCol': lambda x: int(x) if x != '' else -1000,
+ 'FloatCol': lambda x: 10 * x if x else np.nan,
+ 2: lambda x: 'Found' if x != '' else 'Not found',
+ 3: lambda x: str(x) if x else '',
+ }
+
+ # should read in correctly and set types of single cells (not array
+ # dtypes)
+ actual = self.get_exceldf(basename, ext, 'Sheet1',
+ converters=converters)
+ tm.assert_frame_equal(actual, expected)
+
+ def test_reader_dtype(self, ext):
+ # GH 8212
+ basename = 'testdtype'
+ actual = self.get_exceldf(basename, ext)
+
+ expected = DataFrame({
+ 'a': [1, 2, 3, 4],
+ 'b': [2.5, 3.5, 4.5, 5.5],
+ 'c': [1, 2, 3, 4],
+ 'd': [1.0, 2.0, np.nan, 4.0]}).reindex(
+ columns=['a', 'b', 'c', 'd'])
+
+ tm.assert_frame_equal(actual, expected)
+
+ actual = self.get_exceldf(basename, ext,
+ dtype={'a': 'float64',
+ 'b': 'float32',
+ 'c': str})
+
+ expected['a'] = expected['a'].astype('float64')
+ expected['b'] = expected['b'].astype('float32')
+ expected['c'] = ['001', '002', '003', '004']
+ tm.assert_frame_equal(actual, expected)
+
+ with pytest.raises(ValueError):
+ self.get_exceldf(basename, ext, dtype={'d': 'int64'})
+
+ @pytest.mark.parametrize("dtype,expected", [
+ (None,
+ DataFrame({
+ "a": [1, 2, 3, 4],
+ "b": [2.5, 3.5, 4.5, 5.5],
+ "c": [1, 2, 3, 4],
+ "d": [1.0, 2.0, np.nan, 4.0]
+ })),
+ ({"a": "float64",
+ "b": "float32",
+ "c": str,
+ "d": str
+ },
+ DataFrame({
+ "a": Series([1, 2, 3, 4], dtype="float64"),
+ "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
+ "c": ["001", "002", "003", "004"],
+ "d": ["1", "2", np.nan, "4"]
+ })),
+ ])
+ def test_reader_dtype_str(self, ext, dtype, expected):
+ # see gh-20377
+ basename = "testdtype"
+
+ actual = self.get_exceldf(basename, ext, dtype=dtype)
+ tm.assert_frame_equal(actual, expected)
+
+ def test_reading_all_sheets(self, ext):
+        # Test reading all sheet names by setting sheet_name to None,
+        # ensuring a dict is returned.
+ # See PR #9450
+ basename = 'test_multisheet'
+ dfs = self.get_exceldf(basename, ext, sheet_name=None)
+ # ensure this is not alphabetical to test order preservation
+ expected_keys = ['Charlie', 'Alpha', 'Beta']
+ tm.assert_contains_all(expected_keys, dfs.keys())
+ # Issue 9930
+ # Ensure sheet order is preserved
+ assert expected_keys == list(dfs.keys())
+
+ def test_reading_multiple_specific_sheets(self, ext):
+ # Test reading specific sheetnames by specifying a mixed list
+ # of integers and strings, and confirm that duplicated sheet
+ # references (positions/names) are removed properly.
+ # Ensure a dict is returned
+ # See PR #9450
+ basename = 'test_multisheet'
+ # Explicitly request duplicates. Only the set should be returned.
+ expected_keys = [2, 'Charlie', 'Charlie']
+ dfs = self.get_exceldf(basename, ext, sheet_name=expected_keys)
+ expected_keys = list(set(expected_keys))
+ tm.assert_contains_all(expected_keys, dfs.keys())
+ assert len(expected_keys) == len(dfs.keys())
+
+ def test_reading_all_sheets_with_blank(self, ext):
+        # Test reading all sheet names by setting sheet_name to None,
+        # in the case where some sheets are blank.
+ # Issue #11711
+ basename = 'blank_with_header'
+ dfs = self.get_exceldf(basename, ext, sheet_name=None)
+ expected_keys = ['Sheet1', 'Sheet2', 'Sheet3']
+ tm.assert_contains_all(expected_keys, dfs.keys())
+
+ # GH6403
+ def test_read_excel_blank(self, ext):
+ actual = self.get_exceldf('blank', ext, 'Sheet1')
+ tm.assert_frame_equal(actual, DataFrame())
+
+ def test_read_excel_blank_with_header(self, ext):
+ expected = DataFrame(columns=['col_1', 'col_2'])
+ actual = self.get_exceldf('blank_with_header', ext, 'Sheet1')
+ tm.assert_frame_equal(actual, expected)
+
+ @td.skip_if_no("xlwt")
+ @td.skip_if_no("openpyxl")
+ @pytest.mark.parametrize("header,expected", [
+ (None, DataFrame([np.nan] * 4)),
+ (0, DataFrame({"Unnamed: 0": [np.nan] * 3}))
+ ])
+ def test_read_one_empty_col_no_header(self, ext, header, expected):
+ # xref gh-12292
+ filename = "no_header"
+ df = pd.DataFrame(
+ [["", 1, 100],
+ ["", 2, 200],
+ ["", 3, 300],
+ ["", 4, 400]]
+ )
+
+ with ensure_clean(ext) as path:
+ df.to_excel(path, filename, index=False, header=False)
+ result = read_excel(path, filename, usecols=[0], header=header)
+
+ tm.assert_frame_equal(result, expected)
+
+ @td.skip_if_no("xlwt")
+ @td.skip_if_no("openpyxl")
+ @pytest.mark.parametrize("header,expected", [
+ (None, DataFrame([0] + [np.nan] * 4)),
+ (0, DataFrame([np.nan] * 4))
+ ])
+ def test_read_one_empty_col_with_header(self, ext, header, expected):
+ filename = "with_header"
+ df = pd.DataFrame(
+ [["", 1, 100],
+ ["", 2, 200],
+ ["", 3, 300],
+ ["", 4, 400]]
+ )
+
+ with ensure_clean(ext) as path:
+ df.to_excel(path, 'with_header', index=False, header=True)
+ result = read_excel(path, filename, usecols=[0], header=header)
+
+ tm.assert_frame_equal(result, expected)
+
+ @td.skip_if_no('openpyxl')
+ @td.skip_if_no('xlwt')
+ def test_set_column_names_in_parameter(self, ext):
+ # GH 12870 : pass down column names associated with
+ # keyword argument names
+ refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'],
+ [3, 'baz']], columns=['a', 'b'])
+
+ with ensure_clean(ext) as pth:
+ with ExcelWriter(pth) as writer:
+ refdf.to_excel(writer, 'Data_no_head',
+ header=False, index=False)
+ refdf.to_excel(writer, 'Data_with_head', index=False)
+
+ refdf.columns = ['A', 'B']
+
+ with ExcelFile(pth) as reader:
+ xlsdf_no_head = read_excel(reader, 'Data_no_head',
+ header=None, names=['A', 'B'])
+ xlsdf_with_head = read_excel(reader, 'Data_with_head',
+ index_col=None, names=['A', 'B'])
+
+ tm.assert_frame_equal(xlsdf_no_head, refdf)
+ tm.assert_frame_equal(xlsdf_with_head, refdf)
+
+ def test_date_conversion_overflow(self, ext):
+ # GH 10001 : pandas.ExcelFile ignore parse_dates=False
+ expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'],
+ [pd.Timestamp('2016-03-16'), 'Jack Black'],
+ [1e+20, 'Timothy Brown']],
+ columns=['DateColWithBigInt', 'StringCol'])
+
+ result = self.get_exceldf('testdateoverflow', ext)
+ tm.assert_frame_equal(result, expected)
+
+ @td.skip_if_no("xlrd", "1.0.1") # see gh-22682
+ def test_sheet_name_and_sheetname(self, ext):
+ # gh-10559: Minor improvement: Change "sheet_name" to "sheetname"
+ # gh-10969: DOC: Consistent var names (sheetname vs sheet_name)
+ # gh-12604: CLN GH10559 Rename sheetname variable to sheet_name
+ # gh-20920: ExcelFile.parse() and pd.read_xlsx() have different
+ # behavior for "sheetname" argument
+ filename = "test1"
+ sheet_name = "Sheet1"
+
+ df_ref = self.get_csv_refdf(filename)
+ df1 = self.get_exceldf(filename, ext,
+ sheet_name=sheet_name, index_col=0) # doc
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ with ignore_xlrd_time_clock_warning():
+ df2 = self.get_exceldf(filename, ext, index_col=0,
+ sheetname=sheet_name) # backward compat
+
+ excel = self.get_excelfile(filename, ext)
+ df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ df2_parse = excel.parse(index_col=0,
+ sheetname=sheet_name) # backward compat
+
+ tm.assert_frame_equal(df1, df_ref, check_names=False)
+ tm.assert_frame_equal(df2, df_ref, check_names=False)
+ tm.assert_frame_equal(df1_parse, df_ref, check_names=False)
+ tm.assert_frame_equal(df2_parse, df_ref, check_names=False)
+
+ def test_sheet_name_both_raises(self, ext):
+ with pytest.raises(TypeError, match="Cannot specify both"):
+ self.get_exceldf('test1', ext, sheetname='Sheet1',
+ sheet_name='Sheet1')
+
+ excel = self.get_excelfile('test1', ext)
+ with pytest.raises(TypeError, match="Cannot specify both"):
+ excel.parse(sheetname='Sheet1',
+ sheet_name='Sheet1')
+
+ def test_excel_read_buffer(self, ext):
+
+ pth = os.path.join(self.dirpath, 'test1' + ext)
+ expected = read_excel(pth, 'Sheet1', index_col=0)
+ with open(pth, 'rb') as f:
+ actual = read_excel(f, 'Sheet1', index_col=0)
+ tm.assert_frame_equal(expected, actual)
+
+ with open(pth, 'rb') as f:
+ xls = ExcelFile(f)
+ actual = read_excel(xls, 'Sheet1', index_col=0)
+ tm.assert_frame_equal(expected, actual)
+
+ def test_bad_engine_raises(self, ext):
+ bad_engine = 'foo'
+ with pytest.raises(ValueError, match="Unknown engine: foo"):
+ read_excel('', engine=bad_engine)
+
+ @tm.network
+ def test_read_from_http_url(self, ext):
+ url = ('https://raw.github.com/pandas-dev/pandas/master/'
+ 'pandas/tests/io/data/test1' + ext)
+ url_table = read_excel(url)
+ local_table = self.get_exceldf('test1', ext)
+ tm.assert_frame_equal(url_table, local_table)
+
+ @td.skip_if_not_us_locale
+ def test_read_from_s3_url(self, ext, s3_resource):
+ # Bucket "pandas-test" created in tests/io/conftest.py
+ file_name = os.path.join(self.dirpath, 'test1' + ext)
+
+ with open(file_name, "rb") as f:
+ s3_resource.Bucket("pandas-test").put_object(Key="test1" + ext,
+ Body=f)
+
+ url = ('s3://pandas-test/test1' + ext)
+ url_table = read_excel(url)
+ local_table = self.get_exceldf('test1', ext)
+ tm.assert_frame_equal(url_table, local_table)
+
+ @pytest.mark.slow
+ # ignore warning from old xlrd
+ @pytest.mark.filterwarnings("ignore:This metho:PendingDeprecationWarning")
+ def test_read_from_file_url(self, ext):
+
+ # FILE
+ localtable = os.path.join(self.dirpath, 'test1' + ext)
+ local_table = read_excel(localtable)
+
+ try:
+ url_table = read_excel('file://localhost/' + localtable)
+ except URLError:
+ # fails on some systems
+ import platform
+ pytest.skip("failing on %s" %
+ ' '.join(platform.uname()).strip())
+
+ tm.assert_frame_equal(url_table, local_table)
+
+ @td.skip_if_no('pathlib')
+ def test_read_from_pathlib_path(self, ext):
+
+ # GH12655
+ from pathlib import Path
+
+ str_path = os.path.join(self.dirpath, 'test1' + ext)
+ expected = read_excel(str_path, 'Sheet1', index_col=0)
+
+ path_obj = Path(self.dirpath, 'test1' + ext)
+ actual = read_excel(path_obj, 'Sheet1', index_col=0)
+
+ tm.assert_frame_equal(expected, actual)
+
+ @td.skip_if_no('py.path')
+ def test_read_from_py_localpath(self, ext):
+
+ # GH12655
+ from py.path import local as LocalPath
+
+ str_path = os.path.join(self.dirpath, 'test1' + ext)
+ expected = read_excel(str_path, 'Sheet1', index_col=0)
+
+ abs_dir = os.path.abspath(self.dirpath)
+ path_obj = LocalPath(abs_dir).join('test1' + ext)
+ actual = read_excel(path_obj, 'Sheet1', index_col=0)
+
+ tm.assert_frame_equal(expected, actual)
+
+ def test_reader_closes_file(self, ext):
+
+ pth = os.path.join(self.dirpath, 'test1' + ext)
+ f = open(pth, 'rb')
+ with ExcelFile(f) as xlsx:
+ # parses okay
+ read_excel(xlsx, 'Sheet1', index_col=0)
+
+ assert f.closed
+
+ @td.skip_if_no("xlwt")
+ @td.skip_if_no("openpyxl")
+ def test_creating_and_reading_multiple_sheets(self, ext):
+ # see gh-9450
+ #
+        # Test reading multiple sheets from a runtime-created
+        # Excel file with multiple sheets.
+ def tdf(col_sheet_name):
+ d, i = [11, 22, 33], [1, 2, 3]
+ return DataFrame(d, i, columns=[col_sheet_name])
+
+ sheets = ["AAA", "BBB", "CCC"]
+
+ dfs = [tdf(s) for s in sheets]
+ dfs = dict(zip(sheets, dfs))
+
+ with ensure_clean(ext) as pth:
+ with ExcelWriter(pth) as ew:
+ for sheetname, df in iteritems(dfs):
+ df.to_excel(ew, sheetname)
+
+ dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0)
+
+ for s in sheets:
+ tm.assert_frame_equal(dfs[s], dfs_returned[s])
+
+ def test_reader_seconds(self, ext):
+
+ # Test reading times with and without milliseconds. GH5945.
+ expected = DataFrame.from_dict({"Time": [time(1, 2, 3),
+ time(2, 45, 56, 100000),
+ time(4, 29, 49, 200000),
+ time(6, 13, 42, 300000),
+ time(7, 57, 35, 400000),
+ time(9, 41, 28, 500000),
+ time(11, 25, 21, 600000),
+ time(13, 9, 14, 700000),
+ time(14, 53, 7, 800000),
+ time(16, 37, 0, 900000),
+ time(18, 20, 54)]})
+
+ actual = self.get_exceldf('times_1900', ext, 'Sheet1')
+ tm.assert_frame_equal(actual, expected)
+
+ actual = self.get_exceldf('times_1904', ext, 'Sheet1')
+ tm.assert_frame_equal(actual, expected)
+
+ def test_read_excel_multiindex(self, ext):
+ # see gh-4679
+ mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
+ mi_file = os.path.join(self.dirpath, "testmultiindex" + ext)
+
+ # "mi_column" sheet
+ expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True],
+ [2, 3.5, pd.Timestamp("2015-01-02"), False],
+ [3, 4.5, pd.Timestamp("2015-01-03"), False],
+ [4, 5.5, pd.Timestamp("2015-01-04"), True]],
+ columns=mi)
+
+ actual = read_excel(mi_file, "mi_column", header=[0, 1], index_col=0)
+ tm.assert_frame_equal(actual, expected)
+
+ # "mi_index" sheet
+ expected.index = mi
+ expected.columns = ["a", "b", "c", "d"]
+
+ actual = read_excel(mi_file, "mi_index", index_col=[0, 1])
+ tm.assert_frame_equal(actual, expected, check_names=False)
+
+ # "both" sheet
+ expected.columns = mi
+
+ actual = read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1])
+ tm.assert_frame_equal(actual, expected, check_names=False)
+
+ # "mi_index_name" sheet
+ expected.columns = ["a", "b", "c", "d"]
+ expected.index = mi.set_names(["ilvl1", "ilvl2"])
+
+ actual = read_excel(mi_file, "mi_index_name", index_col=[0, 1])
+ tm.assert_frame_equal(actual, expected)
+
+ # "mi_column_name" sheet
+ expected.index = list(range(4))
+ expected.columns = mi.set_names(["c1", "c2"])
+ actual = read_excel(mi_file, "mi_column_name",
+ header=[0, 1], index_col=0)
+ tm.assert_frame_equal(actual, expected)
+
+ # see gh-11317
+ # "name_with_int" sheet
+ expected.columns = mi.set_levels(
+ [1, 2], level=1).set_names(["c1", "c2"])
+
+ actual = read_excel(mi_file, "name_with_int",
+ index_col=0, header=[0, 1])
+ tm.assert_frame_equal(actual, expected)
+
+ # "both_name" sheet
+ expected.columns = mi.set_names(["c1", "c2"])
+ expected.index = mi.set_names(["ilvl1", "ilvl2"])
+
+ actual = read_excel(mi_file, "both_name",
+ index_col=[0, 1], header=[0, 1])
+ tm.assert_frame_equal(actual, expected)
+
+ # "both_skiprows" sheet
+ actual = read_excel(mi_file, "both_name_skiprows", index_col=[0, 1],
+ header=[0, 1], skiprows=2)
+ tm.assert_frame_equal(actual, expected)
+
+ def test_read_excel_multiindex_header_only(self, ext):
+ # see gh-11733.
+ #
+ # Don't try to parse a header name if there isn't one.
+ mi_file = os.path.join(self.dirpath, "testmultiindex" + ext)
+ result = read_excel(mi_file, "index_col_none", header=[0, 1])
+
+ exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
+ expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns)
+ tm.assert_frame_equal(result, expected)
+
+ @td.skip_if_no("xlsxwriter")
+ def test_read_excel_multiindex_empty_level(self, ext):
+ # see gh-12453
+ with ensure_clean(ext) as path:
+ df = DataFrame({
+ ("One", "x"): {0: 1},
+ ("Two", "X"): {0: 3},
+ ("Two", "Y"): {0: 7},
+ ("Zero", ""): {0: 0}
+ })
+
+ expected = DataFrame({
+ ("One", "x"): {0: 1},
+ ("Two", "X"): {0: 3},
+ ("Two", "Y"): {0: 7},
+ ("Zero", "Unnamed: 4_level_1"): {0: 0}
+ })
+
+ df.to_excel(path)
+ actual = pd.read_excel(path, header=[0, 1], index_col=0)
+ tm.assert_frame_equal(actual, expected)
+
+ df = pd.DataFrame({
+ ("Beg", ""): {0: 0},
+ ("Middle", "x"): {0: 1},
+ ("Tail", "X"): {0: 3},
+ ("Tail", "Y"): {0: 7}
+ })
+
+ expected = pd.DataFrame({
+ ("Beg", "Unnamed: 1_level_1"): {0: 0},
+ ("Middle", "x"): {0: 1},
+ ("Tail", "X"): {0: 3},
+ ("Tail", "Y"): {0: 7}
+ })
+
+ df.to_excel(path)
+ actual = pd.read_excel(path, header=[0, 1], index_col=0)
+ tm.assert_frame_equal(actual, expected)
+
+ @td.skip_if_no("xlsxwriter")
+ @pytest.mark.parametrize("c_idx_names", [True, False])
+ @pytest.mark.parametrize("r_idx_names", [True, False])
+ @pytest.mark.parametrize("c_idx_levels", [1, 3])
+ @pytest.mark.parametrize("r_idx_levels", [1, 3])
+ def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names,
+ c_idx_levels, r_idx_levels):
+ # see gh-4679
+ with ensure_clean(ext) as pth:
+ if c_idx_levels == 1 and c_idx_names:
+ pytest.skip("Column index name cannot be "
+ "serialized unless it's a MultiIndex")
+
+            # The empty-name case is currently read in as
+            # unnamed levels, not Nones.
+ check_names = r_idx_names or r_idx_levels <= 1
+
+ df = mkdf(5, 5, c_idx_names, r_idx_names,
+ c_idx_levels, r_idx_levels)
+ df.to_excel(pth)
+
+ act = pd.read_excel(pth, index_col=list(range(r_idx_levels)),
+ header=list(range(c_idx_levels)))
+ tm.assert_frame_equal(df, act, check_names=check_names)
+
+ df.iloc[0, :] = np.nan
+ df.to_excel(pth)
+
+ act = pd.read_excel(pth, index_col=list(range(r_idx_levels)),
+ header=list(range(c_idx_levels)))
+ tm.assert_frame_equal(df, act, check_names=check_names)
+
+ df.iloc[-1, :] = np.nan
+ df.to_excel(pth)
+ act = pd.read_excel(pth, index_col=list(range(r_idx_levels)),
+ header=list(range(c_idx_levels)))
+ tm.assert_frame_equal(df, act, check_names=check_names)
+
+ def test_excel_old_index_format(self, ext):
+ # see gh-4679
+ filename = "test_index_name_pre17" + ext
+ in_file = os.path.join(self.dirpath, filename)
+
+ # We detect headers to determine if index names exist, so
+ # that "index" name in the "names" version of the data will
+ # now be interpreted as rows that include null data.
+ data = np.array([[None, None, None, None, None],
+ ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
+ ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
+ ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
+ ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
+ ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]])
+ columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
+ mi = MultiIndex(levels=[["R0", "R_l0_g0", "R_l0_g1",
+ "R_l0_g2", "R_l0_g3", "R_l0_g4"],
+ ["R1", "R_l1_g0", "R_l1_g1",
+ "R_l1_g2", "R_l1_g3", "R_l1_g4"]],
+ codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]],
+ names=[None, None])
+ si = Index(["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2",
+ "R_l0_g3", "R_l0_g4"], name=None)
+
+ expected = pd.DataFrame(data, index=si, columns=columns)
+
+ actual = pd.read_excel(in_file, "single_names", index_col=0)
+ tm.assert_frame_equal(actual, expected)
+
+ expected.index = mi
+
+ actual = pd.read_excel(in_file, "multi_names", index_col=[0, 1])
+ tm.assert_frame_equal(actual, expected)
+
+ # The analogous versions of the "names" version data
+ # where there are explicitly no names for the indices.
+ data = np.array([["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
+ ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
+ ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
+ ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
+ ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]])
+ columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
+ mi = MultiIndex(levels=[["R_l0_g0", "R_l0_g1", "R_l0_g2",
+ "R_l0_g3", "R_l0_g4"],
+ ["R_l1_g0", "R_l1_g1", "R_l1_g2",
+ "R_l1_g3", "R_l1_g4"]],
+ codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]],
+ names=[None, None])
+ si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2",
+ "R_l0_g3", "R_l0_g4"], name=None)
+
+ expected = pd.DataFrame(data, index=si, columns=columns)
+
+ actual = pd.read_excel(in_file, "single_no_names", index_col=0)
+ tm.assert_frame_equal(actual, expected)
+
+ expected.index = mi
+
+ actual = pd.read_excel(in_file, "multi_no_names", index_col=[0, 1])
+ tm.assert_frame_equal(actual, expected, check_names=False)
+
+ def test_read_excel_bool_header_arg(self, ext):
+ # GH 6114
+ for arg in [True, False]:
+ with pytest.raises(TypeError):
+ pd.read_excel(os.path.join(self.dirpath, 'test1' + ext),
+ header=arg)
+
+ def test_read_excel_chunksize(self, ext):
+ # GH 8011
+ with pytest.raises(NotImplementedError):
+ pd.read_excel(os.path.join(self.dirpath, 'test1' + ext),
+ chunksize=100)
+
+ @td.skip_if_no("xlwt")
+ @td.skip_if_no("openpyxl")
+ def test_read_excel_parse_dates(self, ext):
+ # see gh-11544, gh-12051
+ df = DataFrame(
+ {"col": [1, 2, 3],
+ "date_strings": pd.date_range("2012-01-01", periods=3)})
+ df2 = df.copy()
+ df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y")
+
+ with ensure_clean(ext) as pth:
+ df2.to_excel(pth)
+
+ res = read_excel(pth, index_col=0)
+ tm.assert_frame_equal(df2, res)
+
+ res = read_excel(pth, parse_dates=["date_strings"], index_col=0)
+ tm.assert_frame_equal(df, res)
+
+ date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y")
+ res = read_excel(pth, parse_dates=["date_strings"],
+ date_parser=date_parser, index_col=0)
+ tm.assert_frame_equal(df, res)
+
+ def test_read_excel_skiprows_list(self, ext):
+ # GH 4903
+ actual = pd.read_excel(os.path.join(self.dirpath,
+ 'testskiprows' + ext),
+ 'skiprows_list', skiprows=[0, 2])
+ expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True],
+ [2, 3.5, pd.Timestamp('2015-01-02'), False],
+ [3, 4.5, pd.Timestamp('2015-01-03'), False],
+ [4, 5.5, pd.Timestamp('2015-01-04'), True]],
+ columns=['a', 'b', 'c', 'd'])
+ tm.assert_frame_equal(actual, expected)
+
+ actual = pd.read_excel(os.path.join(self.dirpath,
+ 'testskiprows' + ext),
+ 'skiprows_list', skiprows=np.array([0, 2]))
+ tm.assert_frame_equal(actual, expected)
+
+ def test_read_excel_nrows(self, ext):
+ # GH 16645
+ num_rows_to_pull = 5
+ actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext),
+ nrows=num_rows_to_pull)
+ expected = pd.read_excel(os.path.join(self.dirpath,
+ 'test1' + ext))
+ expected = expected[:num_rows_to_pull]
+ tm.assert_frame_equal(actual, expected)
+
+ def test_read_excel_nrows_greater_than_nrows_in_file(self, ext):
+ # GH 16645
+ expected = pd.read_excel(os.path.join(self.dirpath,
+ 'test1' + ext))
+ num_records_in_file = len(expected)
+ num_rows_to_pull = num_records_in_file + 10
+ actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext),
+ nrows=num_rows_to_pull)
+ tm.assert_frame_equal(actual, expected)
+
+ def test_read_excel_nrows_non_integer_parameter(self, ext):
+ # GH 16645
+ msg = "'nrows' must be an integer >=0"
+ with pytest.raises(ValueError, match=msg):
+ pd.read_excel(os.path.join(self.dirpath, 'test1' + ext),
+ nrows='5')
+
+ def test_read_excel_squeeze(self, ext):
+ # GH 12157
+ f = os.path.join(self.dirpath, 'test_squeeze' + ext)
+
+ actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True)
+ expected = pd.Series([2, 3, 4], [4, 5, 6], name='b')
+ expected.index.name = 'a'
+ tm.assert_series_equal(actual, expected)
+
+ actual = pd.read_excel(f, 'two_columns', squeeze=True)
+ expected = pd.DataFrame({'a': [4, 5, 6],
+ 'b': [2, 3, 4]})
+ tm.assert_frame_equal(actual, expected)
+
+ actual = pd.read_excel(f, 'one_column', squeeze=True)
+ expected = pd.Series([1, 2, 3], name='a')
+ tm.assert_series_equal(actual, expected)
+
+
[email protected]("ext", ['.xls', '.xlsx', '.xlsm'])
+class TestXlrdReader(ReadingTestsBase):
+ """
+ Runs the shared reading tests through the xlrd engine for each of the
+ three supported file formats: xls, xlsx and xlsm.
+ """
+
+ @td.skip_if_no("xlwt")
+ def test_read_xlrd_book(self, ext):
+ import xlrd
+ df = self.frame
+
+ engine = "xlrd"
+ sheet_name = "SheetA"
+
+ with ensure_clean(ext) as pth:
+ df.to_excel(pth, sheet_name)
+ book = xlrd.open_workbook(pth)
+
+ with ExcelFile(book, engine=engine) as xl:
+ result = read_excel(xl, sheet_name, index_col=0)
+ tm.assert_frame_equal(df, result)
+
+ result = read_excel(book, sheet_name=sheet_name,
+ engine=engine, index_col=0)
+ tm.assert_frame_equal(df, result)
+
+
+class _WriterBase(SharedItems):
+
+ @pytest.fixture(autouse=True)
+ def set_engine_and_path(self, request, merge_cells, engine, ext):
+ """Fixture to set engine and open file for use in each test case
+
+ Rather than requiring `engine=...` to be provided explicitly as an
+ argument in each test, this fixture sets a global option to dictate
+ which engine should be used to write Excel files. After executing
+ the test it rolls back said change to the global option.
+
+ It also uses a context manager to open a temporary excel file for
+ the function to write to, accessible via `self.path`
+
+ Notes
+ -----
+ This fixture will run as part of each test method defined in the
+ class and any subclasses, on account of the `autouse=True`
+ argument
+ """
+ option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.'))
+ prev_engine = get_option(option_name)
+ set_option(option_name, engine)
+ with ensure_clean(ext) as path:
+ self.path = path
+ yield
+ set_option(option_name, prev_engine) # Roll back option change
+
+
[email protected]("merge_cells", [True, False])
[email protected]("engine,ext", [
+ pytest.param('openpyxl', '.xlsx', marks=pytest.mark.skipif(
+ not td.safe_import('openpyxl'), reason='No openpyxl')),
+ pytest.param('openpyxl', '.xlsm', marks=pytest.mark.skipif(
+ not td.safe_import('openpyxl'), reason='No openpyxl')),
+ pytest.param('xlwt', '.xls', marks=pytest.mark.skipif(
+ not td.safe_import('xlwt'), reason='No xlwt')),
+ pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif(
+ not td.safe_import('xlsxwriter'), reason='No xlsxwriter'))
+])
+class TestExcelWriter(_WriterBase):
+ # Base class for test cases to run with different Excel writers.
+
+ def test_excel_sheet_by_name_raise(self, *_):
+ import xlrd
+
+ gt = DataFrame(np.random.randn(10, 2))
+ gt.to_excel(self.path)
+
+ xl = ExcelFile(self.path)
+ df = read_excel(xl, 0, index_col=0)
+
+ tm.assert_frame_equal(gt, df)
+
+ with pytest.raises(xlrd.XLRDError):
+ read_excel(xl, "0")
+
+ def test_excel_writer_context_manager(self, *_):
+ with ExcelWriter(self.path) as writer:
+ self.frame.to_excel(writer, "Data1")
+ self.frame2.to_excel(writer, "Data2")
+
+ with ExcelFile(self.path) as reader:
+ found_df = read_excel(reader, "Data1", index_col=0)
+ found_df2 = read_excel(reader, "Data2", index_col=0)
+
+ tm.assert_frame_equal(found_df, self.frame)
+ tm.assert_frame_equal(found_df2, self.frame2)
+
+ def test_roundtrip(self, merge_cells, engine, ext):
+ self.frame['A'][:5] = nan
+
+ self.frame.to_excel(self.path, 'test1')
+ self.frame.to_excel(self.path, 'test1', columns=['A', 'B'])
+ self.frame.to_excel(self.path, 'test1', header=False)
+ self.frame.to_excel(self.path, 'test1', index=False)
+
+ # test roundtrip
+ self.frame.to_excel(self.path, 'test1')
+ recons = read_excel(self.path, 'test1', index_col=0)
+ tm.assert_frame_equal(self.frame, recons)
+
+ self.frame.to_excel(self.path, 'test1', index=False)
+ recons = read_excel(self.path, 'test1', index_col=None)
+ recons.index = self.frame.index
+ tm.assert_frame_equal(self.frame, recons)
+
+ self.frame.to_excel(self.path, 'test1', na_rep='NA')
+ recons = read_excel(self.path, 'test1', index_col=0, na_values=['NA'])
+ tm.assert_frame_equal(self.frame, recons)
+
+ # GH 3611
+ self.frame.to_excel(self.path, 'test1', na_rep='88')
+ recons = read_excel(self.path, 'test1', index_col=0, na_values=['88'])
+ tm.assert_frame_equal(self.frame, recons)
+
+ self.frame.to_excel(self.path, 'test1', na_rep='88')
+ recons = read_excel(self.path, 'test1', index_col=0,
+ na_values=[88, 88.0])
+ tm.assert_frame_equal(self.frame, recons)
+
+ # GH 6573
+ self.frame.to_excel(self.path, 'Sheet1')
+ recons = read_excel(self.path, index_col=0)
+ tm.assert_frame_equal(self.frame, recons)
+
+ self.frame.to_excel(self.path, '0')
+ recons = read_excel(self.path, index_col=0)
+ tm.assert_frame_equal(self.frame, recons)
+
+ # GH 8825 Pandas Series should provide to_excel method
+ s = self.frame["A"]
+ s.to_excel(self.path)
+ recons = read_excel(self.path, index_col=0)
+ tm.assert_frame_equal(s.to_frame(), recons)
+
+ def test_mixed(self, merge_cells, engine, ext):
+ self.mixed_frame.to_excel(self.path, 'test1')
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, 'test1', index_col=0)
+ tm.assert_frame_equal(self.mixed_frame, recons)
+
+ def test_ts_frame(self, *_):
+ df = tm.makeTimeDataFrame()[:5]
+
+ df.to_excel(self.path, "test1")
+ reader = ExcelFile(self.path)
+
+ recons = read_excel(reader, "test1", index_col=0)
+ tm.assert_frame_equal(df, recons)
+
+ def test_basics_with_nan(self, merge_cells, engine, ext):
+ self.frame['A'][:5] = nan
+ self.frame.to_excel(self.path, 'test1')
+ self.frame.to_excel(self.path, 'test1', columns=['A', 'B'])
+ self.frame.to_excel(self.path, 'test1', header=False)
+ self.frame.to_excel(self.path, 'test1', index=False)
+
+ @pytest.mark.parametrize("np_type", [
+ np.int8, np.int16, np.int32, np.int64])
+ def test_int_types(self, merge_cells, engine, ext, np_type):
+ # Test that np.int values are read back as int
+ # (rather than float, which is how Excel stores numbers).
+ frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)),
+ dtype=np_type)
+ frame.to_excel(self.path, "test1")
+
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, "test1", index_col=0)
+
+ int_frame = frame.astype(np.int64)
+ tm.assert_frame_equal(int_frame, recons)
+
+ recons2 = read_excel(self.path, "test1", index_col=0)
+ tm.assert_frame_equal(int_frame, recons2)
+
+ # Test with convert_float=False comes back as float.
+ float_frame = frame.astype(float)
+ recons = read_excel(self.path, "test1",
+ convert_float=False, index_col=0)
+ tm.assert_frame_equal(recons, float_frame,
+ check_index_type=False,
+ check_column_type=False)
+
+ @pytest.mark.parametrize("np_type", [
+ np.float16, np.float32, np.float64])
+ def test_float_types(self, merge_cells, engine, ext, np_type):
+ # Test np.float values read come back as float.
+ frame = DataFrame(np.random.random_sample(10), dtype=np_type)
+ frame.to_excel(self.path, "test1")
+
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, "test1", index_col=0).astype(np_type)
+
+ tm.assert_frame_equal(frame, recons, check_dtype=False)
+
+ @pytest.mark.parametrize("np_type", [np.bool8, np.bool_])
+ def test_bool_types(self, merge_cells, engine, ext, np_type):
+ # Test np.bool values round-trip; the read result is cast back
+ # to the bool dtype before comparing.
+ frame = (DataFrame([1, 0, True, False], dtype=np_type))
+ frame.to_excel(self.path, "test1")
+
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, "test1", index_col=0).astype(np_type)
+
+ tm.assert_frame_equal(frame, recons)
+
+ def test_inf_roundtrip(self, *_):
+ frame = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)])
+ frame.to_excel(self.path, "test1")
+
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, "test1", index_col=0)
+
+ tm.assert_frame_equal(frame, recons)
+
+ def test_sheets(self, merge_cells, engine, ext):
+ self.frame['A'][:5] = nan
+
+ self.frame.to_excel(self.path, 'test1')
+ self.frame.to_excel(self.path, 'test1', columns=['A', 'B'])
+ self.frame.to_excel(self.path, 'test1', header=False)
+ self.frame.to_excel(self.path, 'test1', index=False)
+
+ # Test writing to separate sheets
+ writer = ExcelWriter(self.path)
+ self.frame.to_excel(writer, 'test1')
+ self.tsframe.to_excel(writer, 'test2')
+ writer.save()
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, 'test1', index_col=0)
+ tm.assert_frame_equal(self.frame, recons)
+ recons = read_excel(reader, 'test2', index_col=0)
+ tm.assert_frame_equal(self.tsframe, recons)
+ assert 2 == len(reader.sheet_names)
+ assert 'test1' == reader.sheet_names[0]
+ assert 'test2' == reader.sheet_names[1]
+
+ def test_colaliases(self, merge_cells, engine, ext):
+ self.frame['A'][:5] = nan
+
+ self.frame.to_excel(self.path, 'test1')
+ self.frame.to_excel(self.path, 'test1', columns=['A', 'B'])
+ self.frame.to_excel(self.path, 'test1', header=False)
+ self.frame.to_excel(self.path, 'test1', index=False)
+
+ # column aliases
+ col_aliases = Index(['AA', 'X', 'Y', 'Z'])
+ self.frame2.to_excel(self.path, 'test1', header=col_aliases)
+ reader = ExcelFile(self.path)
+ rs = read_excel(reader, 'test1', index_col=0)
+ xp = self.frame2.copy()
+ xp.columns = col_aliases
+ tm.assert_frame_equal(xp, rs)
+
+ def test_roundtrip_indexlabels(self, merge_cells, engine, ext):
+ self.frame['A'][:5] = nan
+
+ self.frame.to_excel(self.path, 'test1')
+ self.frame.to_excel(self.path, 'test1', columns=['A', 'B'])
+ self.frame.to_excel(self.path, 'test1', header=False)
+ self.frame.to_excel(self.path, 'test1', index=False)
+
+ # test index_label
+ frame = (DataFrame(np.random.randn(10, 2)) >= 0)
+ frame.to_excel(self.path, 'test1',
+ index_label=['test'],
+ merge_cells=merge_cells)
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, 'test1',
+ index_col=0,
+ ).astype(np.int64)
+ frame.index.names = ['test']
+ assert frame.index.names == recons.index.names
+
+ frame = (DataFrame(np.random.randn(10, 2)) >= 0)
+ frame.to_excel(self.path,
+ 'test1',
+ index_label=['test', 'dummy', 'dummy2'],
+ merge_cells=merge_cells)
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, 'test1',
+ index_col=0,
+ ).astype(np.int64)
+ frame.index.names = ['test']
+ assert frame.index.names == recons.index.names
+
+ frame = (DataFrame(np.random.randn(10, 2)) >= 0)
+ frame.to_excel(self.path,
+ 'test1',
+ index_label='test',
+ merge_cells=merge_cells)
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, 'test1',
+ index_col=0,
+ ).astype(np.int64)
+ frame.index.names = ['test']
+ tm.assert_frame_equal(frame, recons.astype(bool))
+
+ self.frame.to_excel(self.path,
+ 'test1',
+ columns=['A', 'B', 'C', 'D'],
+ index=False, merge_cells=merge_cells)
+ # take 'A' and 'B' as indexes (same row as cols 'C', 'D')
+ df = self.frame.copy()
+ df = df.set_index(['A', 'B'])
+
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, 'test1', index_col=[0, 1])
+ tm.assert_frame_equal(df, recons, check_less_precise=True)
+
+ def test_excel_roundtrip_indexname(self, merge_cells, engine, ext):
+ df = DataFrame(np.random.randn(10, 4))
+ df.index.name = 'foo'
+
+ df.to_excel(self.path, merge_cells=merge_cells)
+
+ xf = ExcelFile(self.path)
+ result = read_excel(xf, xf.sheet_names[0],
+ index_col=0)
+
+ tm.assert_frame_equal(result, df)
+ assert result.index.name == 'foo'
+
+ def test_excel_roundtrip_datetime(self, merge_cells, *_):
+ # Round-trip datetime.date values; they are read back as datetimes
+ # equal to the original timestamps.
+ tsf = self.tsframe.copy()
+
+ tsf.index = [x.date() for x in self.tsframe.index]
+ tsf.to_excel(self.path, "test1", merge_cells=merge_cells)
+
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, "test1", index_col=0)
+
+ tm.assert_frame_equal(self.tsframe, recons)
+
+ def test_excel_date_datetime_format(self, merge_cells, engine, ext):
+ # see gh-4133
+ #
+ # Excel output format strings
+ df = DataFrame([[date(2014, 1, 31),
+ date(1999, 9, 24)],
+ [datetime(1998, 5, 26, 23, 33, 4),
+ datetime(2014, 2, 28, 13, 5, 13)]],
+ index=["DATE", "DATETIME"], columns=["X", "Y"])
+ df_expected = DataFrame([[datetime(2014, 1, 31),
+ datetime(1999, 9, 24)],
+ [datetime(1998, 5, 26, 23, 33, 4),
+ datetime(2014, 2, 28, 13, 5, 13)]],
+ index=["DATE", "DATETIME"], columns=["X", "Y"])
+
+ with ensure_clean(ext) as filename2:
+ writer1 = ExcelWriter(self.path)
+ writer2 = ExcelWriter(filename2,
+ date_format="DD.MM.YYYY",
+ datetime_format="DD.MM.YYYY HH-MM-SS")
+
+ df.to_excel(writer1, "test1")
+ df.to_excel(writer2, "test1")
+
+ writer1.close()
+ writer2.close()
+
+ reader1 = ExcelFile(self.path)
+ reader2 = ExcelFile(filename2)
+
+ rs1 = read_excel(reader1, "test1", index_col=0)
+ rs2 = read_excel(reader2, "test1", index_col=0)
+
+ tm.assert_frame_equal(rs1, rs2)
+
+ # Since the reader returns a datetime object for dates,
+ # we need to use df_expected to check the result.
+ tm.assert_frame_equal(rs2, df_expected)
+
+ def test_to_excel_interval_no_labels(self, *_):
+ # see gh-19242
+ #
+ # Test writing Interval without labels.
+ frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
+ dtype=np.int64)
+ expected = frame.copy()
+
+ frame["new"] = pd.cut(frame[0], 10)
+ expected["new"] = pd.cut(expected[0], 10).astype(str)
+
+ frame.to_excel(self.path, "test1")
+ reader = ExcelFile(self.path)
+
+ recons = read_excel(reader, "test1", index_col=0)
+ tm.assert_frame_equal(expected, recons)
+
+ def test_to_excel_interval_labels(self, *_):
+ # see gh-19242
+ #
+ # Test writing Interval with labels.
+ frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
+ dtype=np.int64)
+ expected = frame.copy()
+ intervals = pd.cut(frame[0], 10, labels=["A", "B", "C", "D", "E",
+ "F", "G", "H", "I", "J"])
+ frame["new"] = intervals
+ expected["new"] = pd.Series(list(intervals))
+
+ frame.to_excel(self.path, "test1")
+ reader = ExcelFile(self.path)
+
+ recons = read_excel(reader, "test1", index_col=0)
+ tm.assert_frame_equal(expected, recons)
+
+ def test_to_excel_timedelta(self, *_):
+ # see gh-19242, gh-9155
+ #
+ # Test writing timedelta to xls.
+ frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
+ columns=["A"], dtype=np.int64)
+ expected = frame.copy()
+
+ frame["new"] = frame["A"].apply(lambda x: timedelta(seconds=x))
+ expected["new"] = expected["A"].apply(
+ lambda x: timedelta(seconds=x).total_seconds() / float(86400))
+
+ frame.to_excel(self.path, "test1")
+ reader = ExcelFile(self.path)
+
+ recons = read_excel(reader, "test1", index_col=0)
+ tm.assert_frame_equal(expected, recons)
+
+ def test_to_excel_periodindex(self, merge_cells, engine, ext):
+ frame = self.tsframe
+ xp = frame.resample('M', kind='period').mean()
+
+ xp.to_excel(self.path, 'sht1')
+
+ reader = ExcelFile(self.path)
+ rs = read_excel(reader, 'sht1', index_col=0)
+ tm.assert_frame_equal(xp, rs.to_period('M'))
+
+ def test_to_excel_multiindex(self, merge_cells, engine, ext):
+ frame = self.frame
+ arrays = np.arange(len(frame.index) * 2).reshape(2, -1)
+ new_index = MultiIndex.from_arrays(arrays,
+ names=['first', 'second'])
+ frame.index = new_index
+
+ frame.to_excel(self.path, 'test1', header=False)
+ frame.to_excel(self.path, 'test1', columns=['A', 'B'])
+
+ # round trip
+ frame.to_excel(self.path, 'test1', merge_cells=merge_cells)
+ reader = ExcelFile(self.path)
+ df = read_excel(reader, 'test1', index_col=[0, 1])
+ tm.assert_frame_equal(frame, df)
+
+ # GH13511
+ def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext):
+ frame = pd.DataFrame({'A': [None, 2, 3],
+ 'B': [10, 20, 30],
+ 'C': np.random.sample(3)})
+ frame = frame.set_index(['A', 'B'])
+
+ frame.to_excel(self.path, merge_cells=merge_cells)
+ df = read_excel(self.path, index_col=[0, 1])
+ tm.assert_frame_equal(frame, df)
+
+ # Test for Issue 11328. If column indices are integers, make
+ # sure they are handled correctly for either setting of
+ # merge_cells
+ def test_to_excel_multiindex_cols(self, merge_cells, engine, ext):
+ frame = self.frame
+ arrays = np.arange(len(frame.index) * 2).reshape(2, -1)
+ new_index = MultiIndex.from_arrays(arrays,
+ names=['first', 'second'])
+ frame.index = new_index
+
+ new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2),
+ (50, 1), (50, 2)])
+ frame.columns = new_cols_index
+ header = [0, 1]
+ if not merge_cells:
+ header = 0
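+ # With merge_cells=False the two column levels are flattened into a
+ # single dotted header row (e.g. "40.1"), so only one header row is
+ # expected when reading back.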
+
+ # round trip
+ frame.to_excel(self.path, 'test1', merge_cells=merge_cells)
+ reader = ExcelFile(self.path)
+ df = read_excel(reader, 'test1', header=header,
+ index_col=[0, 1])
+ if not merge_cells:
+ fm = frame.columns.format(sparsify=False,
+ adjoin=False, names=False)
+ frame.columns = [".".join(map(str, q)) for q in zip(*fm)]
+ tm.assert_frame_equal(frame, df)
+
+ def test_to_excel_multiindex_dates(self, merge_cells, engine, ext):
+ # try multiindex with dates
+ tsframe = self.tsframe.copy()
+ new_index = [tsframe.index, np.arange(len(tsframe.index))]
+ tsframe.index = MultiIndex.from_arrays(new_index)
+
+ tsframe.index.names = ['time', 'foo']
+ tsframe.to_excel(self.path, 'test1', merge_cells=merge_cells)
+ reader = ExcelFile(self.path)
+ recons = read_excel(reader, 'test1',
+ index_col=[0, 1])
+
+ tm.assert_frame_equal(tsframe, recons)
+ assert recons.index.names == ('time', 'foo')
+
+ def test_to_excel_multiindex_no_write_index(self, merge_cells, engine,
+ ext):
+ # Test writing and re-reading a MultiIndex without the index. GH 5616.
+
+ # Initial non-MI frame.
+ frame1 = DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]})
+
+ # Add a MI.
+ frame2 = frame1.copy()
+ multi_index = MultiIndex.from_tuples([(70, 80), (90, 100)])
+ frame2.index = multi_index
+
+ # Write out to Excel without the index.
+ frame2.to_excel(self.path, 'test1', index=False)
+
+ # Read it back in.
+ reader = ExcelFile(self.path)
+ frame3 = read_excel(reader, 'test1')
+
+ # Test that it is the same as the initial frame.
+ tm.assert_frame_equal(frame1, frame3)
+
+ def test_to_excel_float_format(self, *_):
+ df = DataFrame([[0.123456, 0.234567, 0.567567],
+ [12.32112, 123123.2, 321321.2]],
+ index=["A", "B"], columns=["X", "Y", "Z"])
+ df.to_excel(self.path, "test1", float_format="%.2f")
+
+ reader = ExcelFile(self.path)
+ result = read_excel(reader, "test1", index_col=0)
+
+ expected = DataFrame([[0.12, 0.23, 0.57],
+ [12.32, 123123.20, 321321.20]],
+ index=["A", "B"], columns=["X", "Y", "Z"])
+ tm.assert_frame_equal(result, expected)
+
+ def test_to_excel_output_encoding(self, merge_cells, engine, ext):
+ # Avoid mixed inferred_type.
+ df = DataFrame([[u"\u0192", u"\u0193", u"\u0194"],
+ [u"\u0195", u"\u0196", u"\u0197"]],
+ index=[u"A\u0192", u"B"],
+ columns=[u"X\u0193", u"Y", u"Z"])
+
+ with ensure_clean("__tmp_to_excel_float_format__." + ext) as filename:
+ df.to_excel(filename, sheet_name="TestSheet", encoding="utf8")
+ result = read_excel(filename, "TestSheet",
+ encoding="utf8", index_col=0)
+ tm.assert_frame_equal(result, df)
+
+ def test_to_excel_unicode_filename(self, merge_cells, engine, ext):
+ with ensure_clean(u("\u0192u.") + ext) as filename:
+ try:
+ f = open(filename, "wb")
+ except UnicodeEncodeError:
+ pytest.skip("No unicode file names on this system")
+ else:
+ f.close()
+
+ df = DataFrame([[0.123456, 0.234567, 0.567567],
+ [12.32112, 123123.2, 321321.2]],
+ index=["A", "B"], columns=["X", "Y", "Z"])
+ df.to_excel(filename, "test1", float_format="%.2f")
+
+ reader = ExcelFile(filename)
+ result = read_excel(reader, "test1", index_col=0)
+
+ expected = DataFrame([[0.12, 0.23, 0.57],
+ [12.32, 123123.20, 321321.20]],
+ index=["A", "B"], columns=["X", "Y", "Z"])
+ tm.assert_frame_equal(result, expected)
+
+ # def test_to_excel_header_styling_xls(self, merge_cells, engine, ext):
+
+ # import StringIO
+ # s = StringIO(
+ # """Date,ticker,type,value
+ # 2001-01-01,x,close,12.2
+ # 2001-01-01,x,open ,12.1
+ # 2001-01-01,y,close,12.2
+ # 2001-01-01,y,open ,12.1
+ # 2001-02-01,x,close,12.2
+ # 2001-02-01,x,open ,12.1
+ # 2001-02-01,y,close,12.2
+ # 2001-02-01,y,open ,12.1
+ # 2001-03-01,x,close,12.2
+ # 2001-03-01,x,open ,12.1
+ # 2001-03-01,y,close,12.2
+ # 2001-03-01,y,open ,12.1""")
+ # df = read_csv(s, parse_dates=["Date"])
+ # pdf = df.pivot_table(values="value", rows=["ticker"],
+ # cols=["Date", "type"])
+
+ # try:
+ # import xlwt
+ # import xlrd
+ # except ImportError:
+ # pytest.skip
+
+ # filename = '__tmp_to_excel_header_styling_xls__.xls'
+ # pdf.to_excel(filename, 'test1')
+
+ # wbk = xlrd.open_workbook(filename,
+ # formatting_info=True)
+ # assert ["test1"] == wbk.sheet_names()
+ # ws = wbk.sheet_by_name('test1')
+ # assert [(0, 1, 5, 7), (0, 1, 3, 5), (0, 1, 1, 3)] == ws.merged_cells
+ # for i in range(0, 2):
+ # for j in range(0, 7):
+ # xfx = ws.cell_xf_index(0, 0)
+ # cell_xf = wbk.xf_list[xfx]
+ # font = wbk.font_list
+ # assert 1 == font[cell_xf.font_index].bold
+ # assert 1 == cell_xf.border.top_line_style
+ # assert 1 == cell_xf.border.right_line_style
+ # assert 1 == cell_xf.border.bottom_line_style
+ # assert 1 == cell_xf.border.left_line_style
+ # assert 2 == cell_xf.alignment.hor_align
+ # os.remove(filename)
+ # def test_to_excel_header_styling_xlsx(self, merge_cells, engine, ext):
+ # import StringIO
+ # s = StringIO(
+ # """Date,ticker,type,value
+ # 2001-01-01,x,close,12.2
+ # 2001-01-01,x,open ,12.1
+ # 2001-01-01,y,close,12.2
+ # 2001-01-01,y,open ,12.1
+ # 2001-02-01,x,close,12.2
+ # 2001-02-01,x,open ,12.1
+ # 2001-02-01,y,close,12.2
+ # 2001-02-01,y,open ,12.1
+ # 2001-03-01,x,close,12.2
+ # 2001-03-01,x,open ,12.1
+ # 2001-03-01,y,close,12.2
+ # 2001-03-01,y,open ,12.1""")
+ # df = read_csv(s, parse_dates=["Date"])
+ # pdf = df.pivot_table(values="value", rows=["ticker"],
+ # cols=["Date", "type"])
+ # try:
+ # import openpyxl
+ # from openpyxl.cell import get_column_letter
+ # except ImportError:
+ # pytest.skip
+ # if openpyxl.__version__ < '1.6.1':
+ # pytest.skip
+ # # test xlsx_styling
+ # filename = '__tmp_to_excel_header_styling_xlsx__.xlsx'
+ # pdf.to_excel(filename, 'test1')
+ # wbk = openpyxl.load_workbook(filename)
+ # assert ["test1"] == wbk.get_sheet_names()
+ # ws = wbk.get_sheet_by_name('test1')
+ # xlsaddrs = ["%s2" % chr(i) for i in range(ord('A'), ord('H'))]
+ # xlsaddrs += ["A%s" % i for i in range(1, 6)]
+ # xlsaddrs += ["B1", "D1", "F1"]
+ # for xlsaddr in xlsaddrs:
+ # cell = ws.cell(xlsaddr)
+ # assert cell.style.font.bold
+ # assert (openpyxl.style.Border.BORDER_THIN ==
+ # cell.style.borders.top.border_style)
+ # assert (openpyxl.style.Border.BORDER_THIN ==
+ # cell.style.borders.right.border_style)
+ # assert (openpyxl.style.Border.BORDER_THIN ==
+ # cell.style.borders.bottom.border_style)
+ # assert (openpyxl.style.Border.BORDER_THIN ==
+ # cell.style.borders.left.border_style)
+ # assert (openpyxl.style.Alignment.HORIZONTAL_CENTER ==
+ # cell.style.alignment.horizontal)
+ # mergedcells_addrs = ["C1", "E1", "G1"]
+ # for maddr in mergedcells_addrs:
+ # assert ws.cell(maddr).merged
+ # os.remove(filename)
+
+ @pytest.mark.parametrize("use_headers", [True, False])
+ @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3])
+ @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3])
+ def test_excel_010_hemstring(self, merge_cells, engine, ext,
+ c_idx_nlevels, r_idx_nlevels, use_headers):
+
+ def roundtrip(data, header=True, parser_hdr=0, index=True):
+ data.to_excel(self.path, header=header,
+ merge_cells=merge_cells, index=index)
+
+ xf = ExcelFile(self.path)
+ return read_excel(xf, xf.sheet_names[0], header=parser_hdr)
+
+ # Basic test.
+ parser_header = 0 if use_headers else None
+ res = roundtrip(DataFrame([0]), use_headers, parser_header)
+
+ assert res.shape == (1, 2)
+ assert res.iloc[0, 0] is not np.nan
+
+ # More complex tests with multi-index.
+ nrows = 5
+ ncols = 3
+
+ from pandas.util.testing import makeCustomDataframe as mkdf
+ # ensure limited functionality in 0.10
+ # override of gh-2370 until sorted out in 0.11
+
+ df = mkdf(nrows, ncols, r_idx_nlevels=r_idx_nlevels,
+ c_idx_nlevels=c_idx_nlevels)
+
+ # This branch can be removed once multi-column Excel writing is
+ # implemented; for now it covers gh-9794.
+ if c_idx_nlevels > 1:
+ with pytest.raises(NotImplementedError):
+ roundtrip(df, use_headers, index=False)
+ else:
+ res = roundtrip(df, use_headers)
+
+ if use_headers:
+ assert res.shape == (nrows, ncols + r_idx_nlevels)
+ else:
+ # First row taken as columns.
+ assert res.shape == (nrows - 1, ncols + r_idx_nlevels)
+
+ # No NaNs.
+ for r in range(len(res.index)):
+ for c in range(len(res.columns)):
+ assert res.iloc[r, c] is not np.nan
+
+ def test_duplicated_columns(self, *_):
+ # see gh-5235
+ df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]],
+ columns=["A", "B", "B"])
+ df.to_excel(self.path, "test1")
+ expected = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]],
+ columns=["A", "B", "B.1"])
+
+ # By default, we mangle.
+ result = read_excel(self.path, "test1", index_col=0)
+ tm.assert_frame_equal(result, expected)
+
+ # Explicitly, we pass in the parameter.
+ result = read_excel(self.path, "test1", index_col=0,
+ mangle_dupe_cols=True)
+ tm.assert_frame_equal(result, expected)
+
+ # see gh-11007, gh-10970
+ df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]],
+ columns=["A", "B", "A", "B"])
+ df.to_excel(self.path, "test1")
+
+ result = read_excel(self.path, "test1", index_col=0)
+ expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]],
+ columns=["A", "B", "A.1", "B.1"])
+ tm.assert_frame_equal(result, expected)
+
+ # see gh-10982
+ df.to_excel(self.path, "test1", index=False, header=False)
+ result = read_excel(self.path, "test1", header=None)
+
+ expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
+ tm.assert_frame_equal(result, expected)
+
+ msg = "Setting mangle_dupe_cols=False is not supported yet"
+ with pytest.raises(ValueError, match=msg):
+ read_excel(self.path, "test1", header=None, mangle_dupe_cols=False)
+
+ def test_swapped_columns(self, merge_cells, engine, ext):
+ # Test for issue #5427.
+ write_frame = DataFrame({'A': [1, 1, 1],
+ 'B': [2, 2, 2]})
+ write_frame.to_excel(self.path, 'test1', columns=['B', 'A'])
+
+ read_frame = read_excel(self.path, 'test1', header=0)
+
+ tm.assert_series_equal(write_frame['A'], read_frame['A'])
+ tm.assert_series_equal(write_frame['B'], read_frame['B'])
+
+ def test_invalid_columns(self, *_):
+ # see gh-10982
+ write_frame = DataFrame({"A": [1, 1, 1],
+ "B": [2, 2, 2]})
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ write_frame.to_excel(self.path, "test1", columns=["B", "C"])
+
+ expected = write_frame.reindex(columns=["B", "C"])
+ read_frame = read_excel(self.path, "test1", index_col=0)
+ tm.assert_frame_equal(expected, read_frame)
+
+ with pytest.raises(KeyError):
+ write_frame.to_excel(self.path, "test1", columns=["C", "D"])
+
+ def test_comment_arg(self, *_):
+ # see gh-18735
+ #
+ # Test the comment argument functionality to read_excel.
+
+ # Create file to read in.
+ df = DataFrame({"A": ["one", "#one", "one"],
+ "B": ["two", "two", "#two"]})
+ df.to_excel(self.path, "test_c")
+
+ # Read file without comment arg.
+ result1 = read_excel(self.path, "test_c", index_col=0)
+
+ result1.iloc[1, 0] = None
+ result1.iloc[1, 1] = None
+ result1.iloc[2, 1] = None
+
+ result2 = read_excel(self.path, "test_c", comment="#", index_col=0)
+ tm.assert_frame_equal(result1, result2)
+
+ def test_comment_default(self, merge_cells, engine, ext):
+ # see gh-18735
+ # Test the default of the comment argument to read_excel.
+
+ # Create file to read in
+ df = DataFrame({'A': ['one', '#one', 'one'],
+ 'B': ['two', 'two', '#two']})
+ df.to_excel(self.path, 'test_c')
+
+ # Read file with default and explicit comment=None
+ result1 = read_excel(self.path, 'test_c')
+ result2 = read_excel(self.path, 'test_c', comment=None)
+ tm.assert_frame_equal(result1, result2)
+
+ def test_comment_used(self, *_):
+ # see gh-18735
+ #
+ # Test the comment argument is working as expected when used.
+
+ # Create file to read in.
+ df = DataFrame({"A": ["one", "#one", "one"],
+ "B": ["two", "two", "#two"]})
+ df.to_excel(self.path, "test_c")
+
+ # Test read_frame_comment against manually produced expected output.
+ expected = DataFrame({"A": ["one", None, "one"],
+ "B": ["two", None, None]})
+ result = read_excel(self.path, "test_c", comment="#", index_col=0)
+ tm.assert_frame_equal(result, expected)
+
+ def test_comment_empty_line(self, merge_cells, engine, ext):
+ # see gh-18735
+ # Test that read_excel ignores commented lines at the end of the file.
+
+ df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']})
+ df.to_excel(self.path, index=False)
+
+ # Test that all-comment lines at EOF are ignored
+ expected = DataFrame({'a': [1], 'b': [2]})
+ result = read_excel(self.path, comment='#')
+ tm.assert_frame_equal(result, expected)
+
+ def test_datetimes(self, merge_cells, engine, ext):
+
+ # Test writing and reading datetimes. For issue #9139. (xref #9185)
+ datetimes = [datetime(2013, 1, 13, 1, 2, 3),
+ datetime(2013, 1, 13, 2, 45, 56),
+ datetime(2013, 1, 13, 4, 29, 49),
+ datetime(2013, 1, 13, 6, 13, 42),
+ datetime(2013, 1, 13, 7, 57, 35),
+ datetime(2013, 1, 13, 9, 41, 28),
+ datetime(2013, 1, 13, 11, 25, 21),
+ datetime(2013, 1, 13, 13, 9, 14),
+ datetime(2013, 1, 13, 14, 53, 7),
+ datetime(2013, 1, 13, 16, 37, 0),
+ datetime(2013, 1, 13, 18, 20, 52)]
+
+ write_frame = DataFrame({'A': datetimes})
+ write_frame.to_excel(self.path, 'Sheet1')
+ read_frame = read_excel(self.path, 'Sheet1', header=0)
+
+ tm.assert_series_equal(write_frame['A'], read_frame['A'])
+
+ def test_bytes_io(self, merge_cells, engine, ext):
+ # see gh-7074
+ bio = BytesIO()
+ df = DataFrame(np.random.randn(10, 2))
+
+ # Pass engine explicitly, as there is no file path to infer from.
+ writer = ExcelWriter(bio, engine=engine)
+ df.to_excel(writer)
+ writer.save()
+
+ bio.seek(0)
+ reread_df = read_excel(bio, index_col=0)
+ tm.assert_frame_equal(df, reread_df)
+
+ def test_write_lists_dict(self, *_):
+ # see gh-8188.
+ df = DataFrame({"mixed": ["a", ["b", "c"], {"d": "e", "f": 2}],
+ "numeric": [1, 2, 3.0],
+ "str": ["apple", "banana", "cherry"]})
+ df.to_excel(self.path, "Sheet1")
+ read = read_excel(self.path, "Sheet1", header=0, index_col=0)
+
+ expected = df.copy()
+ expected.mixed = expected.mixed.apply(str)
+ expected.numeric = expected.numeric.astype("int64")
+
+ tm.assert_frame_equal(read, expected)
+
+ def test_true_and_false_value_options(self, *_):
+ # see gh-13347
+ df = pd.DataFrame([["foo", "bar"]], columns=["col1", "col2"])
+ expected = df.replace({"foo": True, "bar": False})
+
+ df.to_excel(self.path)
+ read_frame = read_excel(self.path, true_values=["foo"],
+ false_values=["bar"], index_col=0)
+ tm.assert_frame_equal(read_frame, expected)
+
+ def test_freeze_panes(self, *_):
+ # see gh-15160
+ expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"])
+ expected.to_excel(self.path, "Sheet1", freeze_panes=(1, 1))
+
+ result = read_excel(self.path, index_col=0)
+ tm.assert_frame_equal(result, expected)
+
+ def test_path_path_lib(self, merge_cells, engine, ext):
+ df = tm.makeDataFrame()
+ writer = partial(df.to_excel, engine=engine)
+
+ reader = partial(pd.read_excel, index_col=0)
+ result = tm.round_trip_pathlib(writer, reader,
+ path="foo.{ext}".format(ext=ext))
+ tm.assert_frame_equal(result, df)
+
+ def test_path_local_path(self, merge_cells, engine, ext):
+ df = tm.makeDataFrame()
+ writer = partial(df.to_excel, engine=engine)
+
+ reader = partial(pd.read_excel, index_col=0)
+ result = tm.round_trip_pathlib(writer, reader,
+ path="foo.{ext}".format(ext=ext))
+ tm.assert_frame_equal(result, df)
+
+
[email protected]_if_no('openpyxl')
[email protected]("merge_cells,ext,engine", [
+ (None, '.xlsx', 'openpyxl')])
+class TestOpenpyxlTests(_WriterBase):
+
+ def test_to_excel_styleconverter(self, merge_cells, ext, engine):
+ from openpyxl import styles
+
+ hstyle = {
+ "font": {
+ "color": '00FF0000',
+ "bold": True,
+ },
+ "borders": {
+ "top": "thin",
+ "right": "thin",
+ "bottom": "thin",
+ "left": "thin",
+ },
+ "alignment": {
+ "horizontal": "center",
+ "vertical": "top",
+ },
+ "fill": {
+ "patternType": 'solid',
+ 'fgColor': {
+ 'rgb': '006666FF',
+ 'tint': 0.3,
+ },
+ },
+ "number_format": {
+ "format_code": "0.00"
+ },
+ "protection": {
+ "locked": True,
+ "hidden": False,
+ },
+ }
+
+ font_color = styles.Color('00FF0000')
+ font = styles.Font(bold=True, color=font_color)
+ side = styles.Side(style=styles.borders.BORDER_THIN)
+ border = styles.Border(top=side, right=side, bottom=side, left=side)
+ alignment = styles.Alignment(horizontal='center', vertical='top')
+ fill_color = styles.Color(rgb='006666FF', tint=0.3)
+ fill = styles.PatternFill(patternType='solid', fgColor=fill_color)
+
+ number_format = '0.00'
+
+ protection = styles.Protection(locked=True, hidden=False)
+
+ kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle)
+ assert kw['font'] == font
+ assert kw['border'] == border
+ assert kw['alignment'] == alignment
+ assert kw['fill'] == fill
+ assert kw['number_format'] == number_format
+ assert kw['protection'] == protection
+
+ def test_write_cells_merge_styled(self, merge_cells, ext, engine):
+ from pandas.io.formats.excel import ExcelCell
+
+ sheet_name = 'merge_styled'
+
+ sty_b1 = {'font': {'color': '00FF0000'}}
+ sty_a2 = {'font': {'color': '0000FF00'}}
+
+ initial_cells = [
+ ExcelCell(col=1, row=0, val=42, style=sty_b1),
+ ExcelCell(col=0, row=1, val=99, style=sty_a2),
+ ]
+
+ sty_merged = {'font': {'color': '000000FF', 'bold': True}}
+ sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged)
+ openpyxl_sty_merged = sty_kwargs['font']
+ merge_cells = [
+ ExcelCell(col=0, row=0, val='pandas',
+ mergestart=1, mergeend=1, style=sty_merged),
+ ]
+
+ with ensure_clean(ext) as path:
+ writer = _OpenpyxlWriter(path)
+ writer.write_cells(initial_cells, sheet_name=sheet_name)
+ writer.write_cells(merge_cells, sheet_name=sheet_name)
+
+ wks = writer.sheets[sheet_name]
+ xcell_b1 = wks['B1']
+ xcell_a2 = wks['A2']
+ assert xcell_b1.font == openpyxl_sty_merged
+ assert xcell_a2.font == openpyxl_sty_merged
+
+ @pytest.mark.parametrize("mode,expected", [
+ ('w', ['baz']), ('a', ['foo', 'bar', 'baz'])])
+ def test_write_append_mode(self, merge_cells, ext, engine, mode, expected):
+ import openpyxl
+ df = DataFrame([1], columns=['baz'])
+
+ with ensure_clean(ext) as f:
+ wb = openpyxl.Workbook()
+ wb.worksheets[0].title = 'foo'
+ wb.worksheets[0]['A1'].value = 'foo'
+ wb.create_sheet('bar')
+ wb.worksheets[1]['A1'].value = 'bar'
+ wb.save(f)
+
+ writer = ExcelWriter(f, engine=engine, mode=mode)
+ df.to_excel(writer, sheet_name='baz', index=False)
+ writer.save()
+
+ wb2 = openpyxl.load_workbook(f)
+ result = [sheet.title for sheet in wb2.worksheets]
+ assert result == expected
+
+ for index, cell_value in enumerate(expected):
+ assert wb2.worksheets[index]['A1'].value == cell_value
+
+
[email protected]_if_no('xlwt')
[email protected]("merge_cells,ext,engine", [
+ (None, '.xls', 'xlwt')])
+class TestXlwtTests(_WriterBase):
+
+ def test_excel_raise_error_on_multiindex_columns_and_no_index(
+ self, merge_cells, ext, engine):
+ # MultiIndex as columns is not yet implemented 9794
+ cols = MultiIndex.from_tuples([('site', ''),
+ ('2014', 'height'),
+ ('2014', 'weight')])
+ df = DataFrame(np.random.randn(10, 3), columns=cols)
+ with pytest.raises(NotImplementedError):
+ with ensure_clean(ext) as path:
+ df.to_excel(path, index=False)
+
+ def test_excel_multiindex_columns_and_index_true(self, merge_cells, ext,
+ engine):
+ cols = MultiIndex.from_tuples([('site', ''),
+ ('2014', 'height'),
+ ('2014', 'weight')])
+ df = pd.DataFrame(np.random.randn(10, 3), columns=cols)
+ with ensure_clean(ext) as path:
+ df.to_excel(path, index=True)
+
+ def test_excel_multiindex_index(self, merge_cells, ext, engine):
+ # MultiIndex as index works so assert no error #9794
+ cols = MultiIndex.from_tuples([('site', ''),
+ ('2014', 'height'),
+ ('2014', 'weight')])
+ df = DataFrame(np.random.randn(3, 10), index=cols)
+ with ensure_clean(ext) as path:
+ df.to_excel(path, index=False)
+
+ def test_to_excel_styleconverter(self, merge_cells, ext, engine):
+ import xlwt
+
+ hstyle = {"font": {"bold": True},
+ "borders": {"top": "thin",
+ "right": "thin",
+ "bottom": "thin",
+ "left": "thin"},
+ "alignment": {"horizontal": "center", "vertical": "top"}}
+
+ xls_style = _XlwtWriter._convert_to_style(hstyle)
+ assert xls_style.font.bold
+ assert xlwt.Borders.THIN == xls_style.borders.top
+ assert xlwt.Borders.THIN == xls_style.borders.right
+ assert xlwt.Borders.THIN == xls_style.borders.bottom
+ assert xlwt.Borders.THIN == xls_style.borders.left
+ assert xlwt.Alignment.HORZ_CENTER == xls_style.alignment.horz
+ assert xlwt.Alignment.VERT_TOP == xls_style.alignment.vert
+
+ def test_write_append_mode_raises(self, merge_cells, ext, engine):
+ msg = "Append mode is not supported with xlwt!"
+
+ with ensure_clean(ext) as f:
+ with pytest.raises(ValueError, match=msg):
+ ExcelWriter(f, engine=engine, mode='a')
+
+
[email protected]_if_no('xlsxwriter')
[email protected]("merge_cells,ext,engine", [
+ (None, '.xlsx', 'xlsxwriter')])
+class TestXlsxWriterTests(_WriterBase):
+
+ @td.skip_if_no('openpyxl')
+ def test_column_format(self, merge_cells, ext, engine):
+ # Test that column formats are applied to cells. Test for issue #9167.
+ # Applicable to xlsxwriter only.
+ with warnings.catch_warnings():
+ # Ignore the openpyxl lxml warning.
+ warnings.simplefilter("ignore")
+ import openpyxl
+
+ with ensure_clean(ext) as path:
+ frame = DataFrame({'A': [123456, 123456],
+ 'B': [123456, 123456]})
+
+ writer = ExcelWriter(path)
+ frame.to_excel(writer)
+
+ # Add a number format to col B and ensure it is applied to cells.
+ num_format = '#,##0'
+ write_workbook = writer.book
+ write_worksheet = write_workbook.worksheets()[0]
+ col_format = write_workbook.add_format({'num_format': num_format})
+ write_worksheet.set_column('B:B', None, col_format)
+ writer.save()
+
+ read_workbook = openpyxl.load_workbook(path)
+ try:
+ read_worksheet = read_workbook['Sheet1']
+ except TypeError:
+ # compat
+ read_worksheet = read_workbook.get_sheet_by_name(name='Sheet1')
+
+ # Get the number format from the cell.
+ try:
+ cell = read_worksheet['B2']
+ except TypeError:
+ # compat
+ cell = read_worksheet.cell('B2')
+
+ try:
+ read_num_format = cell.number_format
+ except Exception:
+ read_num_format = cell.style.number_format._format_code
+
+ assert read_num_format == num_format
+
+ def test_write_append_mode_raises(self, merge_cells, ext, engine):
+ msg = "Append mode is not supported with xlsxwriter!"
+
+ with ensure_clean(ext) as f:
+ with pytest.raises(ValueError, match=msg):
+ ExcelWriter(f, engine=engine, mode='a')
+
+
+class TestExcelWriterEngineTests(object):
+
+ @pytest.mark.parametrize('klass,ext', [
+ pytest.param(_XlsxWriter, '.xlsx', marks=pytest.mark.skipif(
+ not td.safe_import('xlsxwriter'), reason='No xlsxwriter')),
+ pytest.param(_OpenpyxlWriter, '.xlsx', marks=pytest.mark.skipif(
+ not td.safe_import('openpyxl'), reason='No openpyxl')),
+ pytest.param(_XlwtWriter, '.xls', marks=pytest.mark.skipif(
+ not td.safe_import('xlwt'), reason='No xlwt'))
+ ])
+ def test_ExcelWriter_dispatch(self, klass, ext):
+ with ensure_clean(ext) as path:
+ writer = ExcelWriter(path)
+ if ext == '.xlsx' and td.safe_import('xlsxwriter'):
+ # xlsxwriter has preference over openpyxl if both installed
+ assert isinstance(writer, _XlsxWriter)
+ else:
+ assert isinstance(writer, klass)
+
+ def test_ExcelWriter_dispatch_raises(self):
+ with pytest.raises(ValueError, match='No engine'):
+ ExcelWriter('nothing')
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_register_writer(self):
+ # some awkward mocking to test out dispatch and such actually works
+ called_save = []
+ called_write_cells = []
+
+ class DummyClass(ExcelWriter):
+ called_save = False
+ called_write_cells = False
+ supported_extensions = ['test', 'xlsx', 'xls']
+ engine = 'dummy'
+
+ def save(self):
+ called_save.append(True)
+
+ def write_cells(self, *args, **kwargs):
+ called_write_cells.append(True)
+
+ def check_called(func):
+ func()
+ assert len(called_save) >= 1
+ assert len(called_write_cells) >= 1
+ del called_save[:]
+ del called_write_cells[:]
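+ # Dispatch sketch: register_writer(DummyClass) registers the 'dummy'
+ # engine, and the io.excel.xlsx.writer option below routes plain
+ # .xlsx paths to it in addition to the fake '.test' extension.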
+
+ with pd.option_context('io.excel.xlsx.writer', 'dummy'):
+ register_writer(DummyClass)
+ writer = ExcelWriter('something.test')
+ assert isinstance(writer, DummyClass)
+ df = tm.makeCustomDataframe(1, 1)
+
+ with catch_warnings(record=True):
+ panel = tm.makePanel()
+ func = lambda: df.to_excel('something.test')
+ check_called(func)
+ check_called(lambda: panel.to_excel('something.test'))
+ check_called(lambda: df.to_excel('something.xlsx'))
+ check_called(
+ lambda: df.to_excel(
+ 'something.xls', engine='dummy'))
+
+
[email protected]('engine', [
+ pytest.param('xlwt',
+ marks=pytest.mark.xfail(reason='xlwt does not support '
+ 'openpyxl-compatible '
+ 'style dicts')),
+ 'xlsxwriter',
+ 'openpyxl',
+])
+def test_styler_to_excel(engine):
+ def style(df):
+ # XXX: RGB colors not supported in xlwt
+ return DataFrame([['font-weight: bold', '', ''],
+ ['', 'color: blue', ''],
+ ['', '', 'text-decoration: underline'],
+ ['border-style: solid', '', ''],
+ ['', 'font-style: italic', ''],
+ ['', '', 'text-align: right'],
+ ['background-color: red', '', ''],
+ ['number-format: 0%', '', ''],
+ ['', '', ''],
+ ['', '', ''],
+ ['', '', '']],
+ index=df.index, columns=df.columns)
+
+ def assert_equal_style(cell1, cell2, engine):
+ if engine in ['xlsxwriter', 'openpyxl']:
+ pytest.xfail(reason=("GH25351: failing on some attribute "
+ "comparisons in {}".format(engine)))
+ # XXX: should find a better way to check equality
+ assert cell1.alignment.__dict__ == cell2.alignment.__dict__
+ assert cell1.border.__dict__ == cell2.border.__dict__
+ assert cell1.fill.__dict__ == cell2.fill.__dict__
+ assert cell1.font.__dict__ == cell2.font.__dict__
+ assert cell1.number_format == cell2.number_format
+ assert cell1.protection.__dict__ == cell2.protection.__dict__
+
+ def custom_converter(css):
+ # use bold iff there is custom style attached to the cell
+ if css.strip(' \n;'):
+ return {'font': {'bold': True}}
+ return {}
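+ # A style converter maps the CSS declaration string of a cell to a
+ # dict of engine style kwargs; custom_converter is wired in below via
+ # ExcelFormatter(styled, style_converter=custom_converter).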
+
+ pytest.importorskip('jinja2')
+ pytest.importorskip(engine)
+
+ # Prepare spreadsheets
+
+ df = DataFrame(np.random.randn(11, 3))
+ with ensure_clean('.xlsx' if engine != 'xlwt' else '.xls') as path:
+ writer = ExcelWriter(path, engine=engine)
+ df.to_excel(writer, sheet_name='frame')
+ df.style.to_excel(writer, sheet_name='unstyled')
+ styled = df.style.apply(style, axis=None)
+ styled.to_excel(writer, sheet_name='styled')
+ ExcelFormatter(styled, style_converter=custom_converter).write(
+ writer, sheet_name='custom')
+ writer.save()
+
+ if engine not in ('openpyxl', 'xlsxwriter'):
+ # For other engines, we only smoke test
+ return
+ openpyxl = pytest.importorskip('openpyxl')
+ wb = openpyxl.load_workbook(path)
+
+ # (1) compare DataFrame.to_excel and Styler.to_excel when unstyled
+ n_cells = 0
+ for col1, col2 in zip(wb['frame'].columns,
+ wb['unstyled'].columns):
+ assert len(col1) == len(col2)
+ for cell1, cell2 in zip(col1, col2):
+ assert cell1.value == cell2.value
+ assert_equal_style(cell1, cell2, engine)
+ n_cells += 1
+
+ # ensure iteration actually happened:
+ assert n_cells == (11 + 1) * (3 + 1)
+
+ # (2) check styling with default converter
+
+ # XXX: openpyxl (as at 2.4) prefixes colors with 00, xlsxwriter with FF
+ alpha = '00' if engine == 'openpyxl' else 'FF'
+
+ n_cells = 0
+ for col1, col2 in zip(wb['frame'].columns,
+ wb['styled'].columns):
+ assert len(col1) == len(col2)
+ for cell1, cell2 in zip(col1, col2):
+ ref = '%s%d' % (cell2.column, cell2.row)
+ # XXX: this isn't as strong a test as ideal; we should
+ # confirm that differences are exclusive
+ if ref == 'B2':
+ assert not cell1.font.bold
+ assert cell2.font.bold
+ elif ref == 'C3':
+ assert cell1.font.color.rgb != cell2.font.color.rgb
+ assert cell2.font.color.rgb == alpha + '0000FF'
+ elif ref == 'D4':
+ # This fails with engine=xlsxwriter due to
+ # https://bitbucket.org/openpyxl/openpyxl/issues/800
+ if engine == 'xlsxwriter' \
+ and (LooseVersion(openpyxl.__version__) <
+ LooseVersion('2.4.6')):
+ pass
+ else:
+ assert cell1.font.underline != cell2.font.underline
+ assert cell2.font.underline == 'single'
+ elif ref == 'B5':
+ assert not cell1.border.left.style
+ assert (cell2.border.top.style ==
+ cell2.border.right.style ==
+ cell2.border.bottom.style ==
+ cell2.border.left.style ==
+ 'medium')
+ elif ref == 'C6':
+ assert not cell1.font.italic
+ assert cell2.font.italic
+ elif ref == 'D7':
+ assert (cell1.alignment.horizontal !=
+ cell2.alignment.horizontal)
+ assert cell2.alignment.horizontal == 'right'
+ elif ref == 'B8':
+ assert cell1.fill.fgColor.rgb != cell2.fill.fgColor.rgb
+ assert cell1.fill.patternType != cell2.fill.patternType
+ assert cell2.fill.fgColor.rgb == alpha + 'FF0000'
+ assert cell2.fill.patternType == 'solid'
+ elif ref == 'B9':
+ assert cell1.number_format == 'General'
+ assert cell2.number_format == '0%'
+ else:
+ assert_equal_style(cell1, cell2, engine)
+
+ assert cell1.value == cell2.value
+ n_cells += 1
+
+ assert n_cells == (11 + 1) * (3 + 1)
+
+ # (3) check styling with custom converter
+ n_cells = 0
+ for col1, col2 in zip(wb['frame'].columns,
+ wb['custom'].columns):
+ assert len(col1) == len(col2)
+ for cell1, cell2 in zip(col1, col2):
+ ref = '%s%d' % (cell2.column, cell2.row)
+ if ref in ('B2', 'C3', 'D4', 'B5', 'C6', 'D7', 'B8', 'B9'):
+ assert not cell1.font.bold
+ assert cell2.font.bold
+ else:
+ assert_equal_style(cell1, cell2, engine)
+
+ assert cell1.value == cell2.value
+ n_cells += 1
+
+ assert n_cells == (11 + 1) * (3 + 1)
+
+
[email protected]_if_no('openpyxl')
[email protected](not PY36, reason='requires fspath')
+class TestFSPath(object):
+
+ def test_excelfile_fspath(self):
+ with tm.ensure_clean('foo.xlsx') as path:
+ df = DataFrame({"A": [1, 2]})
+ df.to_excel(path)
+ xl = ExcelFile(path)
+ result = os.fspath(xl)
+ assert result == path
+
+ def test_excelwriter_fspath(self):
+ with tm.ensure_clean('foo.xlsx') as path:
+ writer = ExcelWriter(path)
+ assert os.fspath(writer) == str(path)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_feather.py b/contrib/python/pandas/py2/pandas/tests/io/test_feather.py
new file mode 100644
index 00000000000..d170e4c43fe
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_feather.py
@@ -0,0 +1,158 @@
+""" test feather-format compat """
+from distutils.version import LooseVersion
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, ensure_clean
+
+from pandas.io.feather_format import read_feather, to_feather # noqa:E402
+
+pyarrow = pytest.importorskip('pyarrow')
+
+
+pyarrow_version = LooseVersion(pyarrow.__version__)
+
+
+class TestFeather(object):
+
+ def check_error_on_write(self, df, exc):
+ # check that we are raising the exception
+ # on writing
+
+ with pytest.raises(exc):
+ with ensure_clean() as path:
+ to_feather(df, path)
+
+ def check_round_trip(self, df, expected=None, **kwargs):
+
+ if expected is None:
+ expected = df
+
+ with ensure_clean() as path:
+ to_feather(df, path)
+
+ result = read_feather(path, **kwargs)
+ assert_frame_equal(result, expected)
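+ # Usage: check_round_trip(df) asserts that to_feather/read_feather
+ # round-trips df unchanged; any extra kwargs (e.g. columns=[...]) are
+ # forwarded to read_feather.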
+
+ def test_error(self):
+
+ for obj in [pd.Series([1, 2, 3]), 1, 'foo', pd.Timestamp('20130101'),
+ np.array([1, 2, 3])]:
+ self.check_error_on_write(obj, ValueError)
+
+ def test_basic(self):
+
+ df = pd.DataFrame({'string': list('abc'),
+ 'int': list(range(1, 4)),
+ 'uint': np.arange(3, 6).astype('u1'),
+ 'float': np.arange(4.0, 7.0, dtype='float64'),
+ 'float_with_null': [1., np.nan, 3],
+ 'bool': [True, False, True],
+ 'bool_with_null': [True, np.nan, False],
+ 'cat': pd.Categorical(list('abc')),
+ 'dt': pd.date_range('20130101', periods=3),
+ 'dttz': pd.date_range('20130101', periods=3,
+ tz='US/Eastern'),
+ 'dt_with_null': [pd.Timestamp('20130101'), pd.NaT,
+ pd.Timestamp('20130103')],
+ 'dtns': pd.date_range('20130101', periods=3,
+ freq='ns')})
+
+ assert df.dttz.dtype.tz.zone == 'US/Eastern'
+ self.check_round_trip(df)
+
+ def test_duplicate_columns(self):
+
+ # https://github.com/wesm/feather/issues/53
+ # not currently able to handle duplicate columns
+ df = pd.DataFrame(np.arange(12).reshape(4, 3),
+ columns=list('aaa')).copy()
+ self.check_error_on_write(df, ValueError)
+
+ def test_stringify_columns(self):
+
+ df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy()
+ self.check_error_on_write(df, ValueError)
+
+ def test_read_columns(self):
+ # GH 24025
+ df = pd.DataFrame({'col1': list('abc'),
+ 'col2': list(range(1, 4)),
+ 'col3': list('xyz'),
+ 'col4': list(range(4, 7))})
+ columns = ['col1', 'col3']
+ self.check_round_trip(df, expected=df[columns],
+ columns=columns)
+
+ def test_unsupported_other(self):
+
+ # period
+ df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
+ # Some versions raise ValueError, others raise ArrowInvalid.
+ self.check_error_on_write(df, Exception)
+
+ def test_rw_nthreads(self):
+ df = pd.DataFrame({'A': np.arange(100000)})
+ expected_warning = (
+ "the 'nthreads' keyword is deprecated, "
+ "use 'use_threads' instead"
+ )
+ # TODO: make the warning work with check_stacklevel=True
+ with tm.assert_produces_warning(
+ FutureWarning, check_stacklevel=False) as w:
+ self.check_round_trip(df, nthreads=2)
+ # we have an extra FutureWarning because of gh-23752
+ assert any(expected_warning in str(x) for x in w)
+
+ # TODO: make the warning work with check_stacklevel=True
+ with tm.assert_produces_warning(
+ FutureWarning, check_stacklevel=False) as w:
+ self.check_round_trip(df, nthreads=1)
+ # we have an extra FutureWarning because of gh-23752
+ assert any(expected_warning in str(x) for x in w)
+
+ def test_rw_use_threads(self):
+ df = pd.DataFrame({'A': np.arange(100000)})
+ self.check_round_trip(df, use_threads=True)
+ self.check_round_trip(df, use_threads=False)
+
+ def test_write_with_index(self):
+
+ df = pd.DataFrame({'A': [1, 2, 3]})
+ self.check_round_trip(df)
+
+ # non-default index
+ for index in [[2, 3, 4],
+ pd.date_range('20130101', periods=3),
+ list('abc'),
+ [1, 3, 4],
+ pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
+ ('b', 1)]),
+ ]:
+
+ df.index = index
+ self.check_error_on_write(df, ValueError)
+
+ # index with meta-data
+ df.index = [0, 1, 2]
+ df.index.name = 'foo'
+ self.check_error_on_write(df, ValueError)
+
+ # column multi-index
+ df.index = [0, 1, 2]
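+ # Note the trailing comma below: df.columns is assigned a 1-tuple
+ # containing the MultiIndex, and the write is still expected to raise.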
+ df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
+ self.check_error_on_write(df, ValueError)
+
+ def test_path_pathlib(self):
+ df = tm.makeDataFrame().reset_index()
+ result = tm.round_trip_pathlib(df.to_feather, pd.read_feather)
+ tm.assert_frame_equal(df, result)
+
+ def test_path_localpath(self):
+ df = tm.makeDataFrame().reset_index()
+ result = tm.round_trip_localpath(df.to_feather, pd.read_feather)
+ tm.assert_frame_equal(df, result)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_gbq.py b/contrib/python/pandas/py2/pandas/tests/io/test_gbq.py
new file mode 100644
index 00000000000..d3569af8d77
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_gbq.py
@@ -0,0 +1,153 @@
+from datetime import datetime
+import os
+import platform
+
+import numpy as np
+import pytest
+import pytz
+
+from pandas.compat import range
+
+import pandas as pd
+from pandas import DataFrame, compat
+import pandas.util.testing as tm
+
+api_exceptions = pytest.importorskip("google.api_core.exceptions")
+bigquery = pytest.importorskip("google.cloud.bigquery")
+service_account = pytest.importorskip("google.oauth2.service_account")
+pandas_gbq = pytest.importorskip("pandas_gbq")
+
+PROJECT_ID = None
+PRIVATE_KEY_JSON_PATH = None
+PRIVATE_KEY_JSON_CONTENTS = None
+
+if compat.PY3:
+ DATASET_ID = 'pydata_pandas_bq_testing_py3'
+else:
+ DATASET_ID = 'pydata_pandas_bq_testing_py2'
+
+TABLE_ID = 'new_test'
+DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID)
+
+VERSION = platform.python_version()
+
+
+def _skip_if_no_project_id():
+ if not _get_project_id():
+ pytest.skip(
+ "Cannot run integration tests without a project id")
+
+
+def _skip_if_no_private_key_path():
+ if not _get_private_key_path():
+ pytest.skip("Cannot run integration tests without a "
+ "private key json file path")
+
+
+def _in_travis_environment():
+ return 'TRAVIS_BUILD_DIR' in os.environ and \
+ 'GBQ_PROJECT_ID' in os.environ
+
+
+def _get_project_id():
+ if _in_travis_environment():
+ return os.environ.get('GBQ_PROJECT_ID')
+ return PROJECT_ID or os.environ.get('GBQ_PROJECT_ID')
+
+
+def _get_private_key_path():
+ if _in_travis_environment():
+ return os.path.join(*[os.environ.get('TRAVIS_BUILD_DIR'), 'ci',
+ 'travis_gbq.json'])
+
+ private_key_path = PRIVATE_KEY_JSON_PATH
+ if not private_key_path:
+ private_key_path = os.environ.get('GBQ_GOOGLE_APPLICATION_CREDENTIALS')
+ return private_key_path
+
+
+def _get_credentials():
+ private_key_path = _get_private_key_path()
+ if private_key_path:
+ return service_account.Credentials.from_service_account_file(
+ private_key_path)
+
+
+def _get_client():
+ project_id = _get_project_id()
+ credentials = _get_credentials()
+ return bigquery.Client(project=project_id, credentials=credentials)
+
+
+def make_mixed_dataframe_v2(test_size):
+ # create df to test for all BQ datatypes except RECORD
+ bools = np.random.randint(2, size=(1, test_size)).astype(bool)
+ flts = np.random.randn(1, test_size)
+ ints = np.random.randint(1, 10, size=(1, test_size))
+ strs = np.random.randint(1, 10, size=(1, test_size)).astype(str)
+ times = [datetime.now(pytz.timezone('US/Arizona'))
+ for t in range(test_size)]
+ return DataFrame({'bools': bools[0],
+ 'flts': flts[0],
+ 'ints': ints[0],
+ 'strs': strs[0],
+ 'times': times[0]},
+ index=range(test_size))
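+ # The (1, test_size)-shaped arrays are unpacked with [0] to obtain 1-D
+ # columns; times[0] is a single timestamp that is broadcast down the
+ # whole column.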
+
+
+def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
+ # Default dialect is changing to standard SQL. See:
+ # https://github.com/pydata/pandas-gbq/issues/195
+
+ def mock_read_gbq(*args, **kwargs):
+ return DataFrame([[1.0]])
+
+ monkeypatch.setattr(pandas_gbq, 'read_gbq', mock_read_gbq)
+ with tm.assert_produces_warning(FutureWarning):
+ pd.read_gbq("SELECT 1")
+
+
+class TestToGBQIntegrationWithServiceAccountKeyPath(object):
+
+ @classmethod
+ def setup_class(cls):
+ # - GLOBAL CLASS FIXTURES -
+ # put here any instruction you want to execute only *ONCE* *BEFORE*
+ # executing *ALL* tests described below.
+
+ _skip_if_no_project_id()
+ _skip_if_no_private_key_path()
+
+ cls.client = _get_client()
+ cls.dataset = cls.client.dataset(DATASET_ID + "1")
+ try:
+ # Clean-up previous test runs.
+ cls.client.delete_dataset(cls.dataset, delete_contents=True)
+ except api_exceptions.NotFound:
+ pass # It's OK if the dataset doesn't already exist.
+
+ cls.client.create_dataset(bigquery.Dataset(cls.dataset))
+
+ @classmethod
+ def teardown_class(cls):
+ # - GLOBAL CLASS FIXTURES -
+ # put here any instruction you want to execute only *ONCE* *AFTER*
+ # executing all tests.
+ cls.client.delete_dataset(cls.dataset, delete_contents=True)
+
+ def test_roundtrip(self):
+ destination_table = DESTINATION_TABLE + "1"
+
+ test_size = 20001
+ df = make_mixed_dataframe_v2(test_size)
+
+ df.to_gbq(destination_table, _get_project_id(), chunksize=None,
+ credentials=_get_credentials())
+
+ result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
+ .format(destination_table),
+ project_id=_get_project_id(),
+ credentials=_get_credentials(),
+ dialect="standard")
+ assert result['num_rows'][0] == test_size
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_gcs.py b/contrib/python/pandas/py2/pandas/tests/io/test_gcs.py
new file mode 100644
index 00000000000..ec0631e748d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_gcs.py
@@ -0,0 +1,72 @@
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO
+
+from pandas import DataFrame, date_range, read_csv
+from pandas.util import _test_decorators as td
+from pandas.util.testing import assert_frame_equal
+
+from pandas.io.common import is_gcs_url
+
+
+def test_is_gcs_url():
+ assert is_gcs_url("gcs://pandas/somethingelse.com")
+ assert is_gcs_url("gs://pandas/somethingelse.com")
+ assert not is_gcs_url("s3://pandas/somethingelse.com")
+
+
[email protected]_if_no('gcsfs')
+def test_read_csv_gcs(monkeypatch):
+ df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
+ 'dt': date_range('2018-06-18', periods=2)})
+
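+ # The stub below replaces gcsfs.GCSFileSystem so that read_csv on a
+ # gs:// URL is served from an in-memory buffer, without any network
+ # access or credentials.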
+ class MockGCSFileSystem():
+ def open(*args):
+ return StringIO(df1.to_csv(index=False))
+
+ monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem)
+ df2 = read_csv('gs://test/test.csv', parse_dates=['dt'])
+
+ assert_frame_equal(df1, df2)
+
+
[email protected]_if_no('gcsfs')
+def test_to_csv_gcs(monkeypatch):
+ df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
+ 'dt': date_range('2018-06-18', periods=2)})
+ s = StringIO()
+
+ class MockGCSFileSystem():
+ def open(*args):
+ return s
+
+ monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem)
+ df1.to_csv('gs://test/test.csv', index=True)
+ df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)
+
+ assert_frame_equal(df1, df2)
+
+
[email protected]_if_no('gcsfs')
+def test_gcs_get_filepath_or_buffer(monkeypatch):
+ df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
+ 'dt': date_range('2018-06-18', periods=2)})
+
+ def mock_get_filepath_or_buffer(*args, **kwargs):
+ return (StringIO(df1.to_csv(index=False)),
+ None, None, False)
+
+ monkeypatch.setattr('pandas.io.gcs.get_filepath_or_buffer',
+ mock_get_filepath_or_buffer)
+ df2 = read_csv('gs://test/test.csv', parse_dates=['dt'])
+
+ assert_frame_equal(df1, df2)
+
+
[email protected](td.safe_import('gcsfs'),
+ reason='Only check when gcsfs not installed')
+def test_gcs_not_present_exception():
+ with pytest.raises(ImportError) as e:
+ read_csv('gs://test/test.csv')
+ assert 'gcsfs library is required' in str(e.value)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_html.py b/contrib/python/pandas/py2/pandas/tests/io/test_html.py
new file mode 100644
index 00000000000..b2b0c21c812
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_html.py
@@ -0,0 +1,1161 @@
+from __future__ import print_function
+
+from functools import partial
+import os
+import re
+import threading
+
+import numpy as np
+from numpy.random import rand
+import pytest
+
+from pandas.compat import (
+ PY3, BytesIO, StringIO, is_platform_windows, map, reload, zip)
+from pandas.errors import ParserError
+import pandas.util._test_decorators as td
+
+from pandas import (
+ DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv)
+import pandas.util.testing as tm
+from pandas.util.testing import makeCustomDataframe as mkdf, network
+
+from pandas.io.common import URLError, file_path_to_url
+import pandas.io.html
+from pandas.io.html import read_html
+
+HERE = os.path.dirname(__file__)
+
+
[email protected](params=[
+    'chinese_utf-16.html',
+ 'chinese_utf-32.html',
+ 'chinese_utf-8.html',
+ 'letz_latin1.html',
+])
+def html_encoding_file(request, datapath):
+ """Parametrized fixture for HTML encoding test filenames."""
+ return datapath('io', 'data', 'html_encoding', request.param)
+
+
+def assert_framelist_equal(list1, list2, *args, **kwargs):
+ assert len(list1) == len(list2), ('lists are not of equal size '
+ 'len(list1) == {0}, '
+ 'len(list2) == {1}'.format(len(list1),
+ len(list2)))
+ msg = 'not all list elements are DataFrames'
+ both_frames = all(map(lambda x, y: isinstance(x, DataFrame) and
+ isinstance(y, DataFrame), list1, list2))
+ assert both_frames, msg
+ for frame_i, frame_j in zip(list1, list2):
+ tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs)
+ assert not frame_i.empty, 'frames are both empty'
+
+
[email protected]_if_no('bs4')
+def test_bs4_version_fails(monkeypatch, datapath):
+ import bs4
+ monkeypatch.setattr(bs4, '__version__', '4.2')
+ with pytest.raises(ValueError, match="minimum version"):
+ read_html(datapath("io", "data", "spam.html"), flavor='bs4')
+
+
+def test_invalid_flavor():
+ url = "google.com"
+ flavor = "invalid flavor"
+ msg = r"\{" + flavor + r"\} is not a valid set of flavors"
+
+ with pytest.raises(ValueError, match=msg):
+ read_html(url, "google", flavor=flavor)
+
+
[email protected]_if_no('bs4')
[email protected]_if_no('lxml')
+def test_same_ordering(datapath):
+ filename = datapath('io', 'data', 'valid_markup.html')
+ dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
+ dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
+ assert_framelist_equal(dfs_lxml, dfs_bs4)
+
+
[email protected]("flavor", [
+    pytest.param('bs4', marks=pytest.mark.skipif(
+        not td.safe_import('bs4'), reason='No bs4')),
+ pytest.param('lxml', marks=pytest.mark.skipif(
+ not td.safe_import('lxml'), reason='No lxml'))], scope="class")
+class TestReadHtml(object):
+
+ @pytest.fixture(autouse=True)
+ def set_files(self, datapath):
+ self.spam_data = datapath('io', 'data', 'spam.html')
+ self.spam_data_kwargs = {}
+ if PY3:
+ self.spam_data_kwargs['encoding'] = 'UTF-8'
+ self.banklist_data = datapath("io", "data", "banklist.html")
+
+ @pytest.fixture(autouse=True, scope="function")
+ def set_defaults(self, flavor, request):
+ self.read_html = partial(read_html, flavor=flavor)
+ yield
+
+ def test_to_html_compat(self):
+ df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False,
+ r_idx_names=False).applymap('{0:.3f}'.format).astype(float)
+ out = df.to_html()
+ res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0]
+ tm.assert_frame_equal(res, df)
+
+ @network
+ def test_banklist_url(self):
+ url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
+ df1 = self.read_html(url, 'First Federal Bank of Florida',
+ attrs={"id": 'table'})
+ df2 = self.read_html(url, 'Metcalf Bank', attrs={'id': 'table'})
+
+ assert_framelist_equal(df1, df2)
+
+ @network
+ def test_spam_url(self):
+ url = ('http://ndb.nal.usda.gov/ndb/foods/show/300772?fg=&man=&'
+ 'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam')
+ df1 = self.read_html(url, '.*Water.*')
+ df2 = self.read_html(url, 'Unit')
+
+ assert_framelist_equal(df1, df2)
+
+ @pytest.mark.slow
+ def test_banklist(self):
+ df1 = self.read_html(self.banklist_data, '.*Florida.*',
+ attrs={'id': 'table'})
+ df2 = self.read_html(self.banklist_data, 'Metcalf Bank',
+ attrs={'id': 'table'})
+
+ assert_framelist_equal(df1, df2)
+
+ def test_spam(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*')
+ df2 = self.read_html(self.spam_data, 'Unit')
+ assert_framelist_equal(df1, df2)
+
+ assert df1[0].iloc[0, 0] == 'Proximates'
+ assert df1[0].columns[0] == 'Nutrient'
+
+ def test_spam_no_match(self):
+ dfs = self.read_html(self.spam_data)
+ for df in dfs:
+ assert isinstance(df, DataFrame)
+
+ def test_banklist_no_match(self):
+ dfs = self.read_html(self.banklist_data, attrs={'id': 'table'})
+ for df in dfs:
+ assert isinstance(df, DataFrame)
+
+ def test_spam_header(self):
+ df = self.read_html(self.spam_data, '.*Water.*', header=2)[0]
+ assert df.columns[0] == 'Proximates'
+ assert not df.empty
+
+ def test_skiprows_int(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1)
+ df2 = self.read_html(self.spam_data, 'Unit', skiprows=1)
+
+ assert_framelist_equal(df1, df2)
+
+ def test_skiprows_xrange(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=range(2))[0]
+ df2 = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0]
+ tm.assert_frame_equal(df1, df2)
+
+ def test_skiprows_list(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=[1, 2])
+ df2 = self.read_html(self.spam_data, 'Unit', skiprows=[2, 1])
+
+ assert_framelist_equal(df1, df2)
+
+ def test_skiprows_set(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*', skiprows={1, 2})
+ df2 = self.read_html(self.spam_data, 'Unit', skiprows={2, 1})
+
+ assert_framelist_equal(df1, df2)
+
+ def test_skiprows_slice(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1)
+ df2 = self.read_html(self.spam_data, 'Unit', skiprows=1)
+
+ assert_framelist_equal(df1, df2)
+
+ def test_skiprows_slice_short(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2))
+ df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(2))
+
+ assert_framelist_equal(df1, df2)
+
+ def test_skiprows_slice_long(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2, 5))
+ df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(4, 1, -1))
+
+ assert_framelist_equal(df1, df2)
+
+ def test_skiprows_ndarray(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*',
+ skiprows=np.arange(2))
+ df2 = self.read_html(self.spam_data, 'Unit', skiprows=np.arange(2))
+
+ assert_framelist_equal(df1, df2)
+
+ def test_skiprows_invalid(self):
+ with pytest.raises(TypeError, match=('is not a valid type '
+ 'for skipping rows')):
+ self.read_html(self.spam_data, '.*Water.*', skiprows='asdf')
+
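+    # A rough sketch (hypothetical helper, not pandas' internal code) of the
+    # normalisation the skiprows tests above exercise: read_html accepts an
+    # int, an iterable, or a slice, and rejects anything else with the
+    # TypeError asserted in test_skiprows_invalid.
+    def _normalise_skiprows_sketch(self, skiprows):
+        if skiprows is None:
+            return 0
+        if isinstance(skiprows, slice):
+            return range(skiprows.start or 0, skiprows.stop or 0,
+                         skiprows.step or 1)
+        # py2 str defines no __iter__, so 'asdf' falls through to the error
+        if isinstance(skiprows, int) or hasattr(skiprows, '__iter__'):
+            return skiprows
+        raise TypeError('%r is not a valid type for skipping rows'
+                        % type(skiprows).__name__)
+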
+ def test_index(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0)
+ df2 = self.read_html(self.spam_data, 'Unit', index_col=0)
+ assert_framelist_equal(df1, df2)
+
+ def test_header_and_index_no_types(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*', header=1,
+ index_col=0)
+ df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0)
+ assert_framelist_equal(df1, df2)
+
+ def test_header_and_index_with_types(self):
+ df1 = self.read_html(self.spam_data, '.*Water.*', header=1,
+ index_col=0)
+ df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0)
+ assert_framelist_equal(df1, df2)
+
+ def test_infer_types(self):
+
+ # 10892 infer_types removed
+ df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0)
+ df2 = self.read_html(self.spam_data, 'Unit', index_col=0)
+ assert_framelist_equal(df1, df2)
+
+ def test_string_io(self):
+ with open(self.spam_data, **self.spam_data_kwargs) as f:
+ data1 = StringIO(f.read())
+
+ with open(self.spam_data, **self.spam_data_kwargs) as f:
+ data2 = StringIO(f.read())
+
+ df1 = self.read_html(data1, '.*Water.*')
+ df2 = self.read_html(data2, 'Unit')
+ assert_framelist_equal(df1, df2)
+
+ def test_string(self):
+ with open(self.spam_data, **self.spam_data_kwargs) as f:
+ data = f.read()
+
+ df1 = self.read_html(data, '.*Water.*')
+ df2 = self.read_html(data, 'Unit')
+
+ assert_framelist_equal(df1, df2)
+
+ def test_file_like(self):
+ with open(self.spam_data, **self.spam_data_kwargs) as f:
+ df1 = self.read_html(f, '.*Water.*')
+
+ with open(self.spam_data, **self.spam_data_kwargs) as f:
+ df2 = self.read_html(f, 'Unit')
+
+ assert_framelist_equal(df1, df2)
+
+ @network
+ def test_bad_url_protocol(self):
+ with pytest.raises(URLError):
+ self.read_html('git://github.com', match='.*Water.*')
+
+ @network
+ def test_invalid_url(self):
+ try:
+ with pytest.raises(URLError):
+ self.read_html('http://www.a23950sdfa908sd.com',
+ match='.*Water.*')
+ except ValueError as e:
+ assert 'No tables found' in str(e)
+
+ @pytest.mark.slow
+ def test_file_url(self):
+ url = self.banklist_data
+ dfs = self.read_html(file_path_to_url(os.path.abspath(url)),
+ 'First',
+ attrs={'id': 'table'})
+ assert isinstance(dfs, list)
+ for df in dfs:
+ assert isinstance(df, DataFrame)
+
+ @pytest.mark.slow
+ def test_invalid_table_attrs(self):
+ url = self.banklist_data
+ with pytest.raises(ValueError, match='No tables found'):
+ self.read_html(url, 'First Federal Bank of Florida',
+ attrs={'id': 'tasdfable'})
+
+ def _bank_data(self, *args, **kwargs):
+ return self.read_html(self.banklist_data, 'Metcalf',
+ attrs={'id': 'table'}, *args, **kwargs)
+
+ @pytest.mark.slow
+ def test_multiindex_header(self):
+ df = self._bank_data(header=[0, 1])[0]
+ assert isinstance(df.columns, MultiIndex)
+
+ @pytest.mark.slow
+ def test_multiindex_index(self):
+ df = self._bank_data(index_col=[0, 1])[0]
+ assert isinstance(df.index, MultiIndex)
+
+ @pytest.mark.slow
+ def test_multiindex_header_index(self):
+ df = self._bank_data(header=[0, 1], index_col=[0, 1])[0]
+ assert isinstance(df.columns, MultiIndex)
+ assert isinstance(df.index, MultiIndex)
+
+ @pytest.mark.slow
+ def test_multiindex_header_skiprows_tuples(self):
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ df = self._bank_data(header=[0, 1], skiprows=1,
+ tupleize_cols=True)[0]
+ assert isinstance(df.columns, Index)
+
+ @pytest.mark.slow
+ def test_multiindex_header_skiprows(self):
+ df = self._bank_data(header=[0, 1], skiprows=1)[0]
+ assert isinstance(df.columns, MultiIndex)
+
+ @pytest.mark.slow
+ def test_multiindex_header_index_skiprows(self):
+ df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0]
+ assert isinstance(df.index, MultiIndex)
+ assert isinstance(df.columns, MultiIndex)
+
+ @pytest.mark.slow
+ def test_regex_idempotency(self):
+ url = self.banklist_data
+ dfs = self.read_html(file_path_to_url(os.path.abspath(url)),
+ match=re.compile(re.compile('Florida')),
+ attrs={'id': 'table'})
+ assert isinstance(dfs, list)
+ for df in dfs:
+ assert isinstance(df, DataFrame)
+
+ def test_negative_skiprows(self):
+ msg = r'\(you passed a negative value\)'
+ with pytest.raises(ValueError, match=msg):
+ self.read_html(self.spam_data, 'Water', skiprows=-1)
+
+ @network
+ def test_multiple_matches(self):
+ url = 'https://docs.python.org/2/'
+ dfs = self.read_html(url, match='Python')
+ assert len(dfs) > 1
+
+ @network
+ def test_python_docs_table(self):
+ url = 'https://docs.python.org/2/'
+ dfs = self.read_html(url, match='Python')
+ zz = [df.iloc[0, 0][0:4] for df in dfs]
+ assert sorted(zz) == sorted(['Repo', 'What'])
+
+ @pytest.mark.slow
+ def test_thousands_macau_stats(self, datapath):
+ all_non_nan_table_index = -2
+ macau_data = datapath("io", "data", "macau.html")
+ dfs = self.read_html(macau_data, index_col=0,
+ attrs={'class': 'style1'})
+ df = dfs[all_non_nan_table_index]
+
+ assert not any(s.isna().any() for _, s in df.iteritems())
+
+ @pytest.mark.slow
+ def test_thousands_macau_index_col(self, datapath):
+ all_non_nan_table_index = -2
+ macau_data = datapath('io', 'data', 'macau.html')
+ dfs = self.read_html(macau_data, index_col=0, header=0)
+ df = dfs[all_non_nan_table_index]
+
+ assert not any(s.isna().any() for _, s in df.iteritems())
+
+ def test_empty_tables(self):
+ """
+ Make sure that read_html ignores empty tables.
+ """
+ result = self.read_html('''
+ <table>
+ <thead>
+ <tr>
+ <th>A</th>
+ <th>B</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>1</td>
+ <td>2</td>
+ </tr>
+ </tbody>
+ </table>
+ <table>
+ <tbody>
+ </tbody>
+ </table>
+ ''')
+
+ assert len(result) == 1
+
+ def test_multiple_tbody(self):
+ # GH-20690
+ # Read all tbody tags within a single table.
+ result = self.read_html('''<table>
+ <thead>
+ <tr>
+ <th>A</th>
+ <th>B</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>1</td>
+ <td>2</td>
+ </tr>
+ </tbody>
+ <tbody>
+ <tr>
+ <td>3</td>
+ <td>4</td>
+ </tr>
+ </tbody>
+ </table>''')[0]
+
+ expected = DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B'])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_header_and_one_column(self):
+ """
+ Don't fail with bs4 when there is a header and only one column
+ as described in issue #9178
+ """
+ result = self.read_html('''<table>
+ <thead>
+ <tr>
+ <th>Header</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>first</td>
+ </tr>
+ </tbody>
+ </table>''')[0]
+
+ expected = DataFrame(data={'Header': 'first'}, index=[0])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_thead_without_tr(self):
+ """
+ Ensure parser adds <tr> within <thead> on malformed HTML.
+ """
+ result = self.read_html('''<table>
+ <thead>
+ <tr>
+ <th>Country</th>
+ <th>Municipality</th>
+ <th>Year</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Ukraine</td>
+ <th>Odessa</th>
+ <td>1944</td>
+ </tr>
+ </tbody>
+ </table>''')[0]
+
+ expected = DataFrame(data=[['Ukraine', 'Odessa', 1944]],
+ columns=['Country', 'Municipality', 'Year'])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_tfoot_read(self):
+ """
+ Make sure that read_html reads tfoot, containing td or th.
+ Ignores empty tfoot
+ """
+ data_template = '''<table>
+ <thead>
+ <tr>
+ <th>A</th>
+ <th>B</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>bodyA</td>
+ <td>bodyB</td>
+ </tr>
+ </tbody>
+ <tfoot>
+ {footer}
+ </tfoot>
+ </table>'''
+
+ expected1 = DataFrame(data=[['bodyA', 'bodyB']], columns=['A', 'B'])
+
+ expected2 = DataFrame(data=[['bodyA', 'bodyB'], ['footA', 'footB']],
+ columns=['A', 'B'])
+
+ data1 = data_template.format(footer="")
+ data2 = data_template.format(
+ footer="<tr><td>footA</td><th>footB</th></tr>")
+
+ result1 = self.read_html(data1)[0]
+ result2 = self.read_html(data2)[0]
+
+ tm.assert_frame_equal(result1, expected1)
+ tm.assert_frame_equal(result2, expected2)
+
+ def test_parse_header_of_non_string_column(self):
+ # GH5048: if header is specified explicitly, an int column should be
+ # parsed as int while its header is parsed as str
+ result = self.read_html('''
+ <table>
+ <tr>
+ <td>S</td>
+ <td>I</td>
+ </tr>
+ <tr>
+ <td>text</td>
+ <td>1944</td>
+ </tr>
+ </table>
+ ''', header=0)[0]
+
+ expected = DataFrame([['text', 1944]], columns=('S', 'I'))
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_nyse_wsj_commas_table(self, datapath):
+ data = datapath('io', 'data', 'nyse_wsj.html')
+ df = self.read_html(data, index_col=0, header=0,
+ attrs={'class': 'mdcTable'})[0]
+
+ expected = Index(['Issue(Roll over for charts and headlines)',
+ 'Volume', 'Price', 'Chg', '% Chg'])
+ nrows = 100
+ assert df.shape[0] == nrows
+ tm.assert_index_equal(df.columns, expected)
+
+ @pytest.mark.slow
+ def test_banklist_header(self, datapath):
+ from pandas.io.html import _remove_whitespace
+
+ def try_remove_ws(x):
+ try:
+ return _remove_whitespace(x)
+ except AttributeError:
+ return x
+
+ df = self.read_html(self.banklist_data, 'Metcalf',
+ attrs={'id': 'table'})[0]
+ ground_truth = read_csv(datapath('io', 'data', 'banklist.csv'),
+ converters={'Updated Date': Timestamp,
+ 'Closing Date': Timestamp})
+ assert df.shape == ground_truth.shape
+ old = ['First Vietnamese American BankIn Vietnamese',
+ 'Westernbank Puerto RicoEn Espanol',
+ 'R-G Premier Bank of Puerto RicoEn Espanol',
+ 'EurobankEn Espanol', 'Sanderson State BankEn Espanol',
+ 'Washington Mutual Bank(Including its subsidiary Washington '
+ 'Mutual Bank FSB)',
+ 'Silver State BankEn Espanol',
+ 'AmTrade International BankEn Espanol',
+ 'Hamilton Bank, NAEn Espanol',
+ 'The Citizens Savings BankPioneer Community Bank, Inc.']
+ new = ['First Vietnamese American Bank', 'Westernbank Puerto Rico',
+ 'R-G Premier Bank of Puerto Rico', 'Eurobank',
+ 'Sanderson State Bank', 'Washington Mutual Bank',
+ 'Silver State Bank', 'AmTrade International Bank',
+ 'Hamilton Bank, NA', 'The Citizens Savings Bank']
+ dfnew = df.applymap(try_remove_ws).replace(old, new)
+ gtnew = ground_truth.applymap(try_remove_ws)
+ converted = dfnew._convert(datetime=True, numeric=True)
+ date_cols = ['Closing Date', 'Updated Date']
+ converted[date_cols] = converted[date_cols]._convert(datetime=True,
+ coerce=True)
+ tm.assert_frame_equal(converted, gtnew)
+
+ @pytest.mark.slow
+ def test_gold_canyon(self):
+ gc = 'Gold Canyon'
+ with open(self.banklist_data, 'r') as f:
+ raw_text = f.read()
+
+ assert gc in raw_text
+ df = self.read_html(self.banklist_data, 'Gold Canyon',
+ attrs={'id': 'table'})[0]
+ assert gc in df.to_string()
+
+ def test_different_number_of_cols(self):
+ expected = self.read_html("""<table>
+ <thead>
+ <tr style="text-align: right;">
+ <th></th>
+ <th>C_l0_g0</th>
+ <th>C_l0_g1</th>
+ <th>C_l0_g2</th>
+ <th>C_l0_g3</th>
+ <th>C_l0_g4</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <th>R_l0_g0</th>
+ <td> 0.763</td>
+ <td> 0.233</td>
+ <td> nan</td>
+ <td> nan</td>
+ <td> nan</td>
+ </tr>
+ <tr>
+ <th>R_l0_g1</th>
+ <td> 0.244</td>
+ <td> 0.285</td>
+ <td> 0.392</td>
+ <td> 0.137</td>
+ <td> 0.222</td>
+ </tr>
+ </tbody>
+ </table>""", index_col=0)[0]
+
+ result = self.read_html("""<table>
+ <thead>
+ <tr style="text-align: right;">
+ <th></th>
+ <th>C_l0_g0</th>
+ <th>C_l0_g1</th>
+ <th>C_l0_g2</th>
+ <th>C_l0_g3</th>
+ <th>C_l0_g4</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <th>R_l0_g0</th>
+ <td> 0.763</td>
+ <td> 0.233</td>
+ </tr>
+ <tr>
+ <th>R_l0_g1</th>
+ <td> 0.244</td>
+ <td> 0.285</td>
+ <td> 0.392</td>
+ <td> 0.137</td>
+ <td> 0.222</td>
+ </tr>
+ </tbody>
+ </table>""", index_col=0)[0]
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_colspan_rowspan_1(self):
+ # GH17054
+ result = self.read_html("""
+ <table>
+ <tr>
+ <th>A</th>
+ <th colspan="1">B</th>
+ <th rowspan="1">C</th>
+ </tr>
+ <tr>
+ <td>a</td>
+ <td>b</td>
+ <td>c</td>
+ </tr>
+ </table>
+ """)[0]
+
+ expected = DataFrame([['a', 'b', 'c']], columns=['A', 'B', 'C'])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_colspan_rowspan_copy_values(self):
+ # GH17054
+
+ # In ASCII, with lowercase letters being copies:
+ #
+ # X x Y Z W
+ # A B b z C
+
+ result = self.read_html("""
+ <table>
+ <tr>
+ <td colspan="2">X</td>
+ <td>Y</td>
+ <td rowspan="2">Z</td>
+ <td>W</td>
+ </tr>
+ <tr>
+ <td>A</td>
+ <td colspan="2">B</td>
+ <td>C</td>
+ </tr>
+ </table>
+ """, header=0)[0]
+
+ expected = DataFrame(data=[['A', 'B', 'B', 'Z', 'C']],
+ columns=['X', 'X.1', 'Y', 'Z', 'W'])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_colspan_rowspan_both_not_1(self):
+ # GH17054
+
+ # In ASCII, with lowercase letters being copies:
+ #
+ # A B b b C
+ # a b b b D
+
+ result = self.read_html("""
+ <table>
+ <tr>
+ <td rowspan="2">A</td>
+ <td rowspan="2" colspan="3">B</td>
+ <td>C</td>
+ </tr>
+ <tr>
+ <td>D</td>
+ </tr>
+ </table>
+ """, header=0)[0]
+
+ expected = DataFrame(data=[['A', 'B', 'B', 'B', 'D']],
+ columns=['A', 'B', 'B.1', 'B.2', 'C'])
+
+ tm.assert_frame_equal(result, expected)
+
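+    # A rough sketch (illustrative only, not the actual parser) of the
+    # copy-on-span behaviour the ASCII diagrams above describe: each
+    # (text, colspan, rowspan) cell is written into every grid position it
+    # spans, skipping positions already claimed by an earlier rowspan.
+    def _expand_spans_sketch(self, rows):
+        grid = {}
+        for r, row in enumerate(rows):
+            c = 0
+            for text, colspan, rowspan in row:
+                while (r, c) in grid:  # filled by a rowspan from above
+                    c += 1
+                for dr in range(rowspan):
+                    for dc in range(colspan):
+                        grid[(r + dr, c + dc)] = text
+                c += colspan
+        return grid
+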
+ def test_rowspan_at_end_of_row(self):
+ # GH17054
+
+ # In ASCII, with lowercase letters being copies:
+ #
+ # A B
+ # C b
+
+ result = self.read_html("""
+ <table>
+ <tr>
+ <td>A</td>
+ <td rowspan="2">B</td>
+ </tr>
+ <tr>
+ <td>C</td>
+ </tr>
+ </table>
+ """, header=0)[0]
+
+ expected = DataFrame(data=[['C', 'B']], columns=['A', 'B'])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_rowspan_only_rows(self):
+ # GH17054
+
+ result = self.read_html("""
+ <table>
+ <tr>
+ <td rowspan="3">A</td>
+ <td rowspan="3">B</td>
+ </tr>
+ </table>
+ """, header=0)[0]
+
+ expected = DataFrame(data=[['A', 'B'], ['A', 'B']],
+ columns=['A', 'B'])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_header_inferred_from_rows_with_only_th(self):
+ # GH17054
+ result = self.read_html("""
+ <table>
+ <tr>
+ <th>A</th>
+ <th>B</th>
+ </tr>
+ <tr>
+ <th>a</th>
+ <th>b</th>
+ </tr>
+ <tr>
+ <td>1</td>
+ <td>2</td>
+ </tr>
+ </table>
+ """)[0]
+
+ columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
+ codes=[[0, 1], [0, 1]])
+ expected = DataFrame(data=[[1, 2]], columns=columns)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_parse_dates_list(self):
+ df = DataFrame({'date': date_range('1/1/2001', periods=10)})
+ expected = df.to_html()
+ res = self.read_html(expected, parse_dates=[1], index_col=0)
+ tm.assert_frame_equal(df, res[0])
+ res = self.read_html(expected, parse_dates=['date'], index_col=0)
+ tm.assert_frame_equal(df, res[0])
+
+ def test_parse_dates_combine(self):
+ raw_dates = Series(date_range('1/1/2001', periods=10))
+ df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())),
+ 'time': raw_dates.map(lambda x: str(x.time()))})
+ res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]},
+ index_col=1)
+ newdf = DataFrame({'datetime': raw_dates})
+ tm.assert_frame_equal(newdf, res[0])
+
+ def test_computer_sales_page(self, datapath):
+ data = datapath('io', 'data', 'computer_sales_page.html')
+ msg = (r"Passed header=\[0,1\] are too many "
+ r"rows for this multi_index of columns")
+ with pytest.raises(ParserError, match=msg):
+ self.read_html(data, header=[0, 1])
+
+ data = datapath('io', 'data', 'computer_sales_page.html')
+ assert self.read_html(data, header=[1, 2])
+
+ def test_wikipedia_states_table(self, datapath):
+ data = datapath('io', 'data', 'wikipedia_states.html')
+ assert os.path.isfile(data), '%r is not a file' % data
+ assert os.path.getsize(data), '%r is an empty file' % data
+ result = self.read_html(data, 'Arizona', header=1)[0]
+ assert result['sq mi'].dtype == np.dtype('float64')
+
+ def test_parser_error_on_empty_header_row(self):
+ msg = (r"Passed header=\[0,1\] are too many "
+ r"rows for this multi_index of columns")
+ with pytest.raises(ParserError, match=msg):
+ self.read_html("""
+ <table>
+ <thead>
+ <tr><th></th><th></tr>
+ <tr><th>A</th><th>B</th></tr>
+ </thead>
+ <tbody>
+ <tr><td>a</td><td>b</td></tr>
+ </tbody>
+ </table>
+ """, header=[0, 1])
+
+ def test_decimal_rows(self):
+ # GH 12907
+ result = self.read_html('''<html>
+ <body>
+ <table>
+ <thead>
+ <tr>
+ <th>Header</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>1100#101</td>
+ </tr>
+ </tbody>
+ </table>
+ </body>
+ </html>''', decimal='#')[0]
+
+ expected = DataFrame(data={'Header': 1100.101}, index=[0])
+
+ assert result['Header'].dtype == np.dtype('float64')
+ tm.assert_frame_equal(result, expected)
+
+ def test_bool_header_arg(self):
+ # GH 6114
+ for arg in [True, False]:
+ with pytest.raises(TypeError):
+ self.read_html(self.spam_data, header=arg)
+
+ def test_converters(self):
+ # GH 13461
+ result = self.read_html(
+ """<table>
+ <thead>
+ <tr>
+ <th>a</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td> 0.763</td>
+ </tr>
+ <tr>
+ <td> 0.244</td>
+ </tr>
+ </tbody>
+ </table>""",
+ converters={'a': str}
+ )[0]
+
+ expected = DataFrame({'a': ['0.763', '0.244']})
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_na_values(self):
+ # GH 13461
+ result = self.read_html(
+ """<table>
+ <thead>
+ <tr>
+ <th>a</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td> 0.763</td>
+ </tr>
+ <tr>
+ <td> 0.244</td>
+ </tr>
+ </tbody>
+ </table>""",
+ na_values=[0.244])[0]
+
+ expected = DataFrame({'a': [0.763, np.nan]})
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_keep_default_na(self):
+ html_data = """<table>
+ <thead>
+ <tr>
+ <th>a</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td> N/A</td>
+ </tr>
+ <tr>
+ <td> NA</td>
+ </tr>
+ </tbody>
+ </table>"""
+
+ expected_df = DataFrame({'a': ['N/A', 'NA']})
+ html_df = self.read_html(html_data, keep_default_na=False)[0]
+ tm.assert_frame_equal(expected_df, html_df)
+
+ expected_df = DataFrame({'a': [np.nan, np.nan]})
+ html_df = self.read_html(html_data, keep_default_na=True)[0]
+ tm.assert_frame_equal(expected_df, html_df)
+
+ def test_preserve_empty_rows(self):
+ result = self.read_html("""
+ <table>
+ <tr>
+ <th>A</th>
+ <th>B</th>
+ </tr>
+ <tr>
+ <td>a</td>
+ <td>b</td>
+ </tr>
+ <tr>
+ <td></td>
+ <td></td>
+ </tr>
+ </table>
+ """)[0]
+
+ expected = DataFrame(data=[['a', 'b'], [np.nan, np.nan]],
+ columns=['A', 'B'])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_ignore_empty_rows_when_inferring_header(self):
+ result = self.read_html("""
+ <table>
+ <thead>
+ <tr><th></th><th></tr>
+ <tr><th>A</th><th>B</th></tr>
+ <tr><th>a</th><th>b</th></tr>
+ </thead>
+ <tbody>
+ <tr><td>1</td><td>2</td></tr>
+ </tbody>
+ </table>
+ """)[0]
+
+ columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
+ codes=[[0, 1], [0, 1]])
+ expected = DataFrame(data=[[1, 2]], columns=columns)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_multiple_header_rows(self):
+ # Issue #13434
+ expected_df = DataFrame(data=[("Hillary", 68, "D"),
+ ("Bernie", 74, "D"),
+ ("Donald", 69, "R")])
+ expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"],
+ ["Name", "Unnamed: 1_level_1",
+ "Unnamed: 2_level_1"]]
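+        # the "Unnamed: <col>_level_<row>" labels mirror the placeholder
+        # names pandas parsers generate for blank header cells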
+ html = expected_df.to_html(index=False)
+        html_df = self.read_html(html)[0]
+ tm.assert_frame_equal(expected_df, html_df)
+
+ def test_works_on_valid_markup(self, datapath):
+ filename = datapath('io', 'data', 'valid_markup.html')
+ dfs = self.read_html(filename, index_col=0)
+ assert isinstance(dfs, list)
+ assert isinstance(dfs[0], DataFrame)
+
+ @pytest.mark.slow
+ def test_fallback_success(self, datapath):
+ banklist_data = datapath('io', 'data', 'banklist.html')
+ self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib'])
+
+ def test_to_html_timestamp(self):
+ rng = date_range('2000-01-01', periods=10)
+ df = DataFrame(np.random.randn(10, 4), index=rng)
+
+ result = df.to_html()
+ assert '2000-01-01' in result
+
+ @pytest.mark.parametrize("displayed_only,exp0,exp1", [
+ (True, DataFrame(["foo"]), None),
+ (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))])
+ def test_displayed_only(self, displayed_only, exp0, exp1):
+ # GH 20027
+ data = StringIO("""<html>
+ <body>
+ <table>
+ <tr>
+ <td>
+ foo
+ <span style="display:none;text-align:center">bar</span>
+ <span style="display:none">baz</span>
+ <span style="display: none">qux</span>
+ </td>
+ </tr>
+ </table>
+ <table style="display: none">
+ <tr>
+ <td>foo</td>
+ </tr>
+ </table>
+ </body>
+ </html>""")
+
+ dfs = self.read_html(data, displayed_only=displayed_only)
+ tm.assert_frame_equal(dfs[0], exp0)
+
+ if exp1 is not None:
+ tm.assert_frame_equal(dfs[1], exp1)
+ else:
+ assert len(dfs) == 1 # Should not parse hidden table
+
+ def test_encode(self, html_encoding_file):
+ _, encoding = os.path.splitext(
+ os.path.basename(html_encoding_file)
+ )[0].split('_')
+
+ try:
+ with open(html_encoding_file, 'rb') as fobj:
+ from_string = self.read_html(fobj.read(), encoding=encoding,
+ index_col=0).pop()
+
+ with open(html_encoding_file, 'rb') as fobj:
+ from_file_like = self.read_html(BytesIO(fobj.read()),
+ encoding=encoding,
+ index_col=0).pop()
+
+ from_filename = self.read_html(html_encoding_file,
+ encoding=encoding,
+ index_col=0).pop()
+ tm.assert_frame_equal(from_string, from_file_like)
+ tm.assert_frame_equal(from_string, from_filename)
+ except Exception:
+ # seems utf-16/32 fail on windows
+ if is_platform_windows():
+ if '16' in encoding or '32' in encoding:
+ pytest.skip()
+ raise
+
+ def test_parse_failure_unseekable(self):
+ # Issue #17975
+
+ if self.read_html.keywords.get('flavor') == 'lxml':
+ pytest.skip("Not applicable for lxml")
+
+ class UnseekableStringIO(StringIO):
+ def seekable(self):
+ return False
+
+ bad = UnseekableStringIO('''
+ <table><tr><td>spam<foobr />eggs</td></tr></table>''')
+
+ assert self.read_html(bad)
+
+ with pytest.raises(ValueError,
+ match='passed a non-rewindable file object'):
+ self.read_html(bad)
+
+ def test_parse_failure_rewinds(self):
+ # Issue #17975
+
+ class MockFile(object):
+ def __init__(self, data):
+ self.data = data
+ self.at_end = False
+
+ def read(self, size=None):
+ data = '' if self.at_end else self.data
+ self.at_end = True
+ return data
+
+ def seek(self, offset):
+ self.at_end = False
+
+ def seekable(self):
+ return True
+
+ good = MockFile('<table><tr><td>spam<br />eggs</td></tr></table>')
+ bad = MockFile('<table><tr><td>spam<foobr />eggs</td></tr></table>')
+
+ assert self.read_html(good)
+ assert self.read_html(bad)
+
+ @pytest.mark.slow
+ def test_importcheck_thread_safety(self, datapath):
+ # see gh-16928
+
+ class ErrorThread(threading.Thread):
+ def run(self):
+ try:
+ super(ErrorThread, self).run()
+ except Exception as e:
+ self.err = e
+ else:
+ self.err = None
+
+        # force import check by reinitialising global vars in html.py
+ reload(pandas.io.html)
+
+ filename = datapath('io', 'data', 'valid_markup.html')
+ helper_thread1 = ErrorThread(target=self.read_html, args=(filename,))
+ helper_thread2 = ErrorThread(target=self.read_html, args=(filename,))
+
+ helper_thread1.start()
+ helper_thread2.start()
+
+ while helper_thread1.is_alive() or helper_thread2.is_alive():
+ pass
+ assert None is helper_thread1.err is helper_thread2.err
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_packers.py b/contrib/python/pandas/py2/pandas/tests/io/test_packers.py
new file mode 100644
index 00000000000..9eb6d327be0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_packers.py
@@ -0,0 +1,954 @@
+import datetime
+from distutils.version import LooseVersion
+import glob
+import os
+from warnings import catch_warnings
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import iNaT
+from pandas.compat import PY3, u
+from pandas.errors import PerformanceWarning
+
+import pandas
+from pandas import (
+ Categorical, DataFrame, Index, Interval, MultiIndex, NaT, Panel, Period,
+ Series, Timestamp, bdate_range, compat, date_range, period_range)
+from pandas.tests.test_panel import assert_panel_equal
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_categorical_equal, assert_frame_equal, assert_index_equal,
+ assert_series_equal, ensure_clean)
+
+from pandas.io.packers import read_msgpack, to_msgpack
+
+nan = np.nan
+
+try:
+ import blosc # NOQA
+except ImportError:
+ _BLOSC_INSTALLED = False
+else:
+ _BLOSC_INSTALLED = True
+
+try:
+ import zlib # NOQA
+except ImportError:
+ _ZLIB_INSTALLED = False
+else:
+ _ZLIB_INSTALLED = True
+
+
[email protected](scope='module')
+def current_packers_data():
+ # our current version packers data
+ from pandas.tests.io.generate_legacy_storage_files import (
+ create_msgpack_data)
+ return create_msgpack_data()
+
+
[email protected](scope='module')
+def all_packers_data():
+    # all of our current version packers data
+ from pandas.tests.io.generate_legacy_storage_files import (
+ create_data)
+ return create_data()
+
+
+def check_arbitrary(a, b):
+
+ if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)):
+ assert(len(a) == len(b))
+ for a_, b_ in zip(a, b):
+ check_arbitrary(a_, b_)
+ elif isinstance(a, Panel):
+ assert_panel_equal(a, b)
+ elif isinstance(a, DataFrame):
+ assert_frame_equal(a, b)
+ elif isinstance(a, Series):
+ assert_series_equal(a, b)
+ elif isinstance(a, Index):
+ assert_index_equal(a, b)
+ elif isinstance(a, Categorical):
+ # Temp,
+ # Categorical.categories is changed from str to bytes in PY3
+ # maybe the same as GH 13591
+ if PY3 and b.categories.inferred_type == 'string':
+ pass
+ else:
+ tm.assert_categorical_equal(a, b)
+ elif a is NaT:
+ assert b is NaT
+ elif isinstance(a, Timestamp):
+ assert a == b
+ assert a.freq == b.freq
+ else:
+ assert(a == b)
+
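+# check_arbitrary recurses into lists/tuples and dispatches on pandas type,
+# so e.g. check_arbitrary((df, s), (df_rec, s_rec)) asserts frame and series
+# equality element-wise.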
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class TestPackers(object):
+
+ def setup_method(self, method):
+ self.path = '__%s__.msg' % tm.rands(10)
+
+ def teardown_method(self, method):
+ pass
+
+ def encode_decode(self, x, compress=None, **kwargs):
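+        # round-trip harness used by all packers tests below: write ``x``
+        # to a temporary msgpack file and immediately read it back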
+ with ensure_clean(self.path) as p:
+ to_msgpack(p, x, compress=compress, **kwargs)
+ return read_msgpack(p, **kwargs)
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class TestAPI(TestPackers):
+
+ def test_string_io(self):
+
+ df = DataFrame(np.random.randn(10, 2))
+ s = df.to_msgpack(None)
+ result = read_msgpack(s)
+ tm.assert_frame_equal(result, df)
+
+ s = df.to_msgpack()
+ result = read_msgpack(s)
+ tm.assert_frame_equal(result, df)
+
+ s = df.to_msgpack()
+ result = read_msgpack(compat.BytesIO(s))
+ tm.assert_frame_equal(result, df)
+
+ s = to_msgpack(None, df)
+ result = read_msgpack(s)
+ tm.assert_frame_equal(result, df)
+
+ with ensure_clean(self.path) as p:
+
+ s = df.to_msgpack()
+ with open(p, 'wb') as fh:
+ fh.write(s)
+ result = read_msgpack(p)
+ tm.assert_frame_equal(result, df)
+
+ def test_path_pathlib(self):
+ df = tm.makeDataFrame()
+ result = tm.round_trip_pathlib(df.to_msgpack, read_msgpack)
+ tm.assert_frame_equal(df, result)
+
+ def test_path_localpath(self):
+ df = tm.makeDataFrame()
+ result = tm.round_trip_localpath(df.to_msgpack, read_msgpack)
+ tm.assert_frame_equal(df, result)
+
+ def test_iterator_with_string_io(self):
+
+ dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)]
+ s = to_msgpack(None, *dfs)
+ for i, result in enumerate(read_msgpack(s, iterator=True)):
+ tm.assert_frame_equal(result, dfs[i])
+
+ def test_invalid_arg(self):
+ # GH10369
+ class A(object):
+
+ def __init__(self):
+ self.read = 0
+
+ msg = (r"Invalid file path or buffer object type: <(class|type)"
+ r" '{}'>")
+ with pytest.raises(ValueError, match=msg.format('NoneType')):
+ read_msgpack(path_or_buf=None)
+ with pytest.raises(ValueError, match=msg.format('dict')):
+ read_msgpack(path_or_buf={})
+ with pytest.raises(ValueError, match=msg.format(r'.*\.A')):
+ read_msgpack(path_or_buf=A())
+
+
+class TestNumpy(TestPackers):
+
+ def test_numpy_scalar_float(self):
+ x = np.float32(np.random.rand())
+ x_rec = self.encode_decode(x)
+ tm.assert_almost_equal(x, x_rec)
+
+ def test_numpy_scalar_complex(self):
+ x = np.complex64(np.random.rand() + 1j * np.random.rand())
+ x_rec = self.encode_decode(x)
+ assert np.allclose(x, x_rec)
+
+ def test_scalar_float(self):
+ x = np.random.rand()
+ x_rec = self.encode_decode(x)
+ tm.assert_almost_equal(x, x_rec)
+
+ def test_scalar_bool(self):
+ x = np.bool_(1)
+ x_rec = self.encode_decode(x)
+ tm.assert_almost_equal(x, x_rec)
+
+ x = np.bool_(0)
+ x_rec = self.encode_decode(x)
+ tm.assert_almost_equal(x, x_rec)
+
+ def test_scalar_complex(self):
+ x = np.random.rand() + 1j * np.random.rand()
+ x_rec = self.encode_decode(x)
+ assert np.allclose(x, x_rec)
+
+ def test_list_numpy_float(self):
+ x = [np.float32(np.random.rand()) for i in range(5)]
+ x_rec = self.encode_decode(x)
+ # current msgpack cannot distinguish list/tuple
+ tm.assert_almost_equal(tuple(x), x_rec)
+
+ x_rec = self.encode_decode(tuple(x))
+ tm.assert_almost_equal(tuple(x), x_rec)
+
+ def test_list_numpy_float_complex(self):
+ if not hasattr(np, 'complex128'):
+ pytest.skip('numpy can not handle complex128')
+
+ x = [np.float32(np.random.rand()) for i in range(5)] + \
+ [np.complex128(np.random.rand() + 1j * np.random.rand())
+ for i in range(5)]
+ x_rec = self.encode_decode(x)
+ assert np.allclose(x, x_rec)
+
+ def test_list_float(self):
+ x = [np.random.rand() for i in range(5)]
+ x_rec = self.encode_decode(x)
+ # current msgpack cannot distinguish list/tuple
+ tm.assert_almost_equal(tuple(x), x_rec)
+
+ x_rec = self.encode_decode(tuple(x))
+ tm.assert_almost_equal(tuple(x), x_rec)
+
+ def test_list_float_complex(self):
+ x = [np.random.rand() for i in range(5)] + \
+ [(np.random.rand() + 1j * np.random.rand()) for i in range(5)]
+ x_rec = self.encode_decode(x)
+ assert np.allclose(x, x_rec)
+
+ def test_dict_float(self):
+ x = {'foo': 1.0, 'bar': 2.0}
+ x_rec = self.encode_decode(x)
+ tm.assert_almost_equal(x, x_rec)
+
+ def test_dict_complex(self):
+ x = {'foo': 1.0 + 1.0j, 'bar': 2.0 + 2.0j}
+ x_rec = self.encode_decode(x)
+ tm.assert_dict_equal(x, x_rec)
+
+ for key in x:
+ tm.assert_class_equal(x[key], x_rec[key], obj="complex value")
+
+ def test_dict_numpy_float(self):
+ x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)}
+ x_rec = self.encode_decode(x)
+ tm.assert_almost_equal(x, x_rec)
+
+ def test_dict_numpy_complex(self):
+ x = {'foo': np.complex128(1.0 + 1.0j),
+ 'bar': np.complex128(2.0 + 2.0j)}
+ x_rec = self.encode_decode(x)
+ tm.assert_dict_equal(x, x_rec)
+
+ for key in x:
+ tm.assert_class_equal(x[key], x_rec[key], obj="numpy complex128")
+
+ def test_numpy_array_float(self):
+
+ # run multiple times
+ for n in range(10):
+ x = np.random.rand(10)
+ for dtype in ['float32', 'float64']:
+ x = x.astype(dtype)
+ x_rec = self.encode_decode(x)
+ tm.assert_almost_equal(x, x_rec)
+
+ def test_numpy_array_complex(self):
+ x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
+ x_rec = self.encode_decode(x)
+ assert (all(map(lambda x, y: x == y, x, x_rec)) and
+ x.dtype == x_rec.dtype)
+
+ def test_list_mixed(self):
+ x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo'), np.bool_(1)]
+ x_rec = self.encode_decode(x)
+ # current msgpack cannot distinguish list/tuple
+ tm.assert_almost_equal(tuple(x), x_rec)
+
+ x_rec = self.encode_decode(tuple(x))
+ tm.assert_almost_equal(tuple(x), x_rec)
+
+
+class TestBasic(TestPackers):
+
+ def test_timestamp(self):
+
+ for i in [Timestamp(
+ '20130101'), Timestamp('20130101', tz='US/Eastern'),
+ Timestamp('201301010501')]:
+ i_rec = self.encode_decode(i)
+ assert i == i_rec
+
+ def test_nat(self):
+ nat_rec = self.encode_decode(NaT)
+ assert NaT is nat_rec
+
+ def test_datetimes(self):
+
+ for i in [datetime.datetime(2013, 1, 1),
+ datetime.datetime(2013, 1, 1, 5, 1),
+ datetime.date(2013, 1, 1),
+ np.datetime64(datetime.datetime(2013, 1, 5, 2, 15))]:
+ i_rec = self.encode_decode(i)
+ assert i == i_rec
+
+ def test_timedeltas(self):
+
+ for i in [datetime.timedelta(days=1),
+ datetime.timedelta(days=1, seconds=10),
+ np.timedelta64(1000000)]:
+ i_rec = self.encode_decode(i)
+ assert i == i_rec
+
+ def test_periods(self):
+ # 13463
+ for i in [Period('2010-09', 'M'), Period('2014-Q1', 'Q')]:
+ i_rec = self.encode_decode(i)
+ assert i == i_rec
+
+ def test_intervals(self):
+ # 19967
+ for i in [Interval(0, 1), Interval(0, 1, 'left'),
+ Interval(10, 25., 'right')]:
+ i_rec = self.encode_decode(i)
+ assert i == i_rec
+
+
+class TestIndex(TestPackers):
+
+ def setup_method(self, method):
+ super(TestIndex, self).setup_method(method)
+
+ self.d = {
+ 'string': tm.makeStringIndex(100),
+ 'date': tm.makeDateIndex(100),
+ 'int': tm.makeIntIndex(100),
+ 'rng': tm.makeRangeIndex(100),
+ 'float': tm.makeFloatIndex(100),
+ 'empty': Index([]),
+ 'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])),
+ 'period': Index(period_range('2012-1-1', freq='M', periods=3)),
+ 'date2': Index(date_range('2013-01-1', periods=10)),
+ 'bdate': Index(bdate_range('2013-01-02', periods=10)),
+ 'cat': tm.makeCategoricalIndex(100),
+ 'interval': tm.makeIntervalIndex(100),
+ 'timedelta': tm.makeTimedeltaIndex(100, 'H')
+ }
+
+ self.mi = {
+ 'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'),
+ ('foo', 'two'),
+ ('qux', 'one'), ('qux', 'two')],
+ names=['first', 'second']),
+ }
+
+ def test_basic_index(self):
+
+ for s, i in self.d.items():
+ i_rec = self.encode_decode(i)
+ tm.assert_index_equal(i, i_rec)
+
+ # datetime with no freq (GH5506)
+ i = Index([Timestamp('20130101'), Timestamp('20130103')])
+ i_rec = self.encode_decode(i)
+ tm.assert_index_equal(i, i_rec)
+
+ # datetime with timezone
+ i = Index([Timestamp('20130101 9:00:00'), Timestamp(
+ '20130103 11:00:00')]).tz_localize('US/Eastern')
+ i_rec = self.encode_decode(i)
+ tm.assert_index_equal(i, i_rec)
+
+ def test_multi_index(self):
+
+ for s, i in self.mi.items():
+ i_rec = self.encode_decode(i)
+ tm.assert_index_equal(i, i_rec)
+
+ def test_unicode(self):
+ i = tm.makeUnicodeIndex(100)
+
+ i_rec = self.encode_decode(i)
+ tm.assert_index_equal(i, i_rec)
+
+    def test_categorical_index(self):
+ # GH15487
+ df = DataFrame(np.random.randn(10, 2))
+ df = df.astype({0: 'category'}).set_index(0)
+ result = self.encode_decode(df)
+ tm.assert_frame_equal(result, df)
+
+
+class TestSeries(TestPackers):
+
+ def setup_method(self, method):
+ super(TestSeries, self).setup_method(method)
+
+ self.d = {}
+
+ s = tm.makeStringSeries()
+ s.name = 'string'
+ self.d['string'] = s
+
+ s = tm.makeObjectSeries()
+ s.name = 'object'
+ self.d['object'] = s
+
+ s = Series(iNaT, dtype='M8[ns]', index=range(5))
+ self.d['date'] = s
+
+ data = {
+ 'A': [0., 1., 2., 3., np.nan],
+ 'B': [0, 1, 0, 1, 0],
+ 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
+ 'D': date_range('1/1/2009', periods=5),
+ 'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
+ 'F': [Timestamp('20130102', tz='US/Eastern')] * 2 +
+ [Timestamp('20130603', tz='CET')] * 3,
+ 'G': [Timestamp('20130102', tz='US/Eastern')] * 5,
+ 'H': Categorical([1, 2, 3, 4, 5]),
+ 'I': Categorical([1, 2, 3, 4, 5], ordered=True),
+ 'J': (np.bool_(1), 2, 3, 4, 5),
+ }
+
+ self.d['float'] = Series(data['A'])
+ self.d['int'] = Series(data['B'])
+ self.d['mixed'] = Series(data['E'])
+ self.d['dt_tz_mixed'] = Series(data['F'])
+ self.d['dt_tz'] = Series(data['G'])
+        self.d['cat_unordered'] = Series(data['H'])
+        self.d['cat_ordered'] = Series(data['I'])
+ self.d['numpy_bool_mixed'] = Series(data['J'])
+
+ def test_basic(self):
+
+ # run multiple times here
+ for n in range(10):
+ for s, i in self.d.items():
+ i_rec = self.encode_decode(i)
+ assert_series_equal(i, i_rec)
+
+
+class TestCategorical(TestPackers):
+
+ def setup_method(self, method):
+ super(TestCategorical, self).setup_method(method)
+
+ self.d = {}
+
+ self.d['plain_str'] = Categorical(['a', 'b', 'c', 'd', 'e'])
+ self.d['plain_str_ordered'] = Categorical(['a', 'b', 'c', 'd', 'e'],
+ ordered=True)
+
+ self.d['plain_int'] = Categorical([5, 6, 7, 8])
+ self.d['plain_int_ordered'] = Categorical([5, 6, 7, 8], ordered=True)
+
+ def test_basic(self):
+
+ # run multiple times here
+ for n in range(10):
+ for s, i in self.d.items():
+ i_rec = self.encode_decode(i)
+ assert_categorical_equal(i, i_rec)
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class TestNDFrame(TestPackers):
+
+ def setup_method(self, method):
+ super(TestNDFrame, self).setup_method(method)
+
+ data = {
+ 'A': [0., 1., 2., 3., np.nan],
+ 'B': [0, 1, 0, 1, 0],
+ 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
+ 'D': date_range('1/1/2009', periods=5),
+ 'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
+ 'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
+ 'G': [Timestamp('20130603', tz='CET')] * 5,
+ 'H': Categorical(['a', 'b', 'c', 'd', 'e']),
+ 'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
+ }
+
+ self.frame = {
+ 'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)),
+ 'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)),
+ 'mixed': DataFrame(data)}
+
+ self.panel = {
+ 'float': Panel(dict(ItemA=self.frame['float'],
+ ItemB=self.frame['float'] + 1))}
+
+ def test_basic_frame(self):
+
+ for s, i in self.frame.items():
+ i_rec = self.encode_decode(i)
+ assert_frame_equal(i, i_rec)
+
+ def test_basic_panel(self):
+
+ with catch_warnings(record=True):
+ for s, i in self.panel.items():
+ i_rec = self.encode_decode(i)
+ assert_panel_equal(i, i_rec)
+
+ def test_multi(self):
+
+ i_rec = self.encode_decode(self.frame)
+ for k in self.frame.keys():
+ assert_frame_equal(self.frame[k], i_rec[k])
+
+ packed_items = tuple([self.frame['float'], self.frame['float'].A,
+ self.frame['float'].B, None])
+ l_rec = self.encode_decode(packed_items)
+ check_arbitrary(packed_items, l_rec)
+
+ # this is an oddity in that packed lists will be returned as tuples
+ packed_items = [self.frame['float'], self.frame['float'].A,
+ self.frame['float'].B, None]
+ l_rec = self.encode_decode(packed_items)
+ assert isinstance(l_rec, tuple)
+ check_arbitrary(packed_items, l_rec)
+
+ def test_iterator(self):
+
+ packed_items = [self.frame['float'], self.frame['float'].A,
+ self.frame['float'].B, None]
+
+ with ensure_clean(self.path) as path:
+ to_msgpack(path, *packed_items)
+ for i, packed in enumerate(read_msgpack(path, iterator=True)):
+ check_arbitrary(packed, packed_items[i])
+
+    def test_datetimeindex_freq_issue(self):
+
+ # GH 5947
+ # inferring freq on the datetimeindex
+ df = DataFrame([1, 2, 3], index=date_range('1/1/2013', '1/3/2013'))
+ result = self.encode_decode(df)
+ assert_frame_equal(result, df)
+
+ df = DataFrame([1, 2], index=date_range('1/1/2013', '1/2/2013'))
+ result = self.encode_decode(df)
+ assert_frame_equal(result, df)
+
+ def test_dataframe_duplicate_column_names(self):
+
+ # GH 9618
+ expected_1 = DataFrame(columns=['a', 'a'])
+ expected_2 = DataFrame(columns=[1] * 100)
+ expected_2.loc[0] = np.random.randn(100)
+ expected_3 = DataFrame(columns=[1, 1])
+ expected_3.loc[0] = ['abc', np.nan]
+
+ result_1 = self.encode_decode(expected_1)
+ result_2 = self.encode_decode(expected_2)
+ result_3 = self.encode_decode(expected_3)
+
+ assert_frame_equal(result_1, expected_1)
+ assert_frame_equal(result_2, expected_2)
+ assert_frame_equal(result_3, expected_3)
+
+
+class TestSparse(TestPackers):
+
+ def _check_roundtrip(self, obj, comparator, **kwargs):
+
+        # currently these are not implemented
+ # i_rec = self.encode_decode(obj)
+ # comparator(obj, i_rec, **kwargs)
+ msg = r"msgpack sparse (series|frame) is not implemented"
+ with pytest.raises(NotImplementedError, match=msg):
+ self.encode_decode(obj)
+
+ def test_sparse_series(self):
+
+ s = tm.makeStringSeries()
+ s[3:5] = np.nan
+ ss = s.to_sparse()
+ self._check_roundtrip(ss, tm.assert_series_equal,
+ check_series_type=True)
+
+ ss2 = s.to_sparse(kind='integer')
+ self._check_roundtrip(ss2, tm.assert_series_equal,
+ check_series_type=True)
+
+ ss3 = s.to_sparse(fill_value=0)
+ self._check_roundtrip(ss3, tm.assert_series_equal,
+ check_series_type=True)
+
+ def test_sparse_frame(self):
+
+ s = tm.makeDataFrame()
+ s.loc[3:5, 1:3] = np.nan
+ s.loc[8:10, -2] = np.nan
+ ss = s.to_sparse()
+
+ self._check_roundtrip(ss, tm.assert_frame_equal,
+ check_frame_type=True)
+
+ ss2 = s.to_sparse(kind='integer')
+ self._check_roundtrip(ss2, tm.assert_frame_equal,
+ check_frame_type=True)
+
+ ss3 = s.to_sparse(fill_value=0)
+ self._check_roundtrip(ss3, tm.assert_frame_equal,
+ check_frame_type=True)
+
+
+class TestCompression(TestPackers):
+ """See https://github.com/pandas-dev/pandas/pull/9783
+ """
+
+ def setup_method(self, method):
+ try:
+ from sqlalchemy import create_engine
+ self._create_sql_engine = create_engine
+ except ImportError:
+ self._SQLALCHEMY_INSTALLED = False
+ else:
+ self._SQLALCHEMY_INSTALLED = True
+
+ super(TestCompression, self).setup_method(method)
+ data = {
+ 'A': np.arange(1000, dtype=np.float64),
+ 'B': np.arange(1000, dtype=np.int32),
+ 'C': list(100 * 'abcdefghij'),
+ 'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
+ 'E': [datetime.timedelta(days=x) for x in range(1000)],
+ }
+ self.frame = {
+ 'float': DataFrame({k: data[k] for k in ['A', 'A']}),
+ 'int': DataFrame({k: data[k] for k in ['B', 'B']}),
+ 'mixed': DataFrame(data),
+ }
+
+ def test_plain(self):
+ i_rec = self.encode_decode(self.frame)
+ for k in self.frame.keys():
+ assert_frame_equal(self.frame[k], i_rec[k])
+
+ def _test_compression(self, compress):
+ i_rec = self.encode_decode(self.frame, compress=compress)
+ for k in self.frame.keys():
+ value = i_rec[k]
+ expected = self.frame[k]
+ assert_frame_equal(value, expected)
+ # make sure that we can write to the new frames
+ for block in value._data.blocks:
+ assert block.values.flags.writeable
+
+ def test_compression_zlib(self):
+ if not _ZLIB_INSTALLED:
+ pytest.skip('no zlib')
+ self._test_compression('zlib')
+
+ def test_compression_blosc(self):
+ if not _BLOSC_INSTALLED:
+ pytest.skip('no blosc')
+ self._test_compression('blosc')
+
+ def _test_compression_warns_when_decompress_caches(
+ self, monkeypatch, compress):
+ not_garbage = []
+ control = [] # copied data
+
+ compress_module = globals()[compress]
+ real_decompress = compress_module.decompress
+
+ def decompress(ob):
+ """mock decompress function that delegates to the real
+ decompress but caches the result and a copy of the result.
+ """
+ res = real_decompress(ob)
+ not_garbage.append(res) # hold a reference to this bytes object
+ control.append(bytearray(res)) # copy the data here to check later
+ return res
+
+ # types mapped to values to add in place.
+ rhs = {
+ np.dtype('float64'): 1.0,
+ np.dtype('int32'): 1,
+ np.dtype('object'): 'a',
+ np.dtype('datetime64[ns]'): np.timedelta64(1, 'ns'),
+ np.dtype('timedelta64[ns]'): np.timedelta64(1, 'ns'),
+ }
+
+ with monkeypatch.context() as m, \
+ tm.assert_produces_warning(PerformanceWarning) as ws:
+ m.setattr(compress_module, 'decompress', decompress)
+ i_rec = self.encode_decode(self.frame, compress=compress)
+ for k in self.frame.keys():
+
+ value = i_rec[k]
+ expected = self.frame[k]
+ assert_frame_equal(value, expected)
+ # make sure that we can write to the new frames even though
+ # we needed to copy the data
+ for block in value._data.blocks:
+ assert block.values.flags.writeable
+ # mutate the data in some way
+ block.values[0] += rhs[block.dtype]
+
+ for w in ws:
+ # check the messages from our warnings
+ assert str(w.message) == ('copying data after decompressing; '
+ 'this may mean that decompress is '
+ 'caching its result')
+
+ for buf, control_buf in zip(not_garbage, control):
+ # make sure none of our mutations above affected the
+ # original buffers
+ assert buf == control_buf
+
+ def test_compression_warns_when_decompress_caches_zlib(self, monkeypatch):
+ if not _ZLIB_INSTALLED:
+ pytest.skip('no zlib')
+ self._test_compression_warns_when_decompress_caches(
+ monkeypatch, 'zlib')
+
+ def test_compression_warns_when_decompress_caches_blosc(self, monkeypatch):
+ if not _BLOSC_INSTALLED:
+ pytest.skip('no blosc')
+ self._test_compression_warns_when_decompress_caches(
+ monkeypatch, 'blosc')
+
+ def _test_small_strings_no_warn(self, compress):
+ empty = np.array([], dtype='uint8')
+ with tm.assert_produces_warning(None):
+ empty_unpacked = self.encode_decode(empty, compress=compress)
+
+ tm.assert_numpy_array_equal(empty_unpacked, empty)
+ assert empty_unpacked.flags.writeable
+
+ char = np.array([ord(b'a')], dtype='uint8')
+ with tm.assert_produces_warning(None):
+ char_unpacked = self.encode_decode(char, compress=compress)
+
+ tm.assert_numpy_array_equal(char_unpacked, char)
+ assert char_unpacked.flags.writeable
+ # if this test fails I am sorry because the interpreter is now in a
+ # bad state where b'a' points to 98 == ord(b'b').
+ char_unpacked[0] = ord(b'b')
+
+        # we compare the ord of bytes b'a' with unicode u'a' because they
+        # should always be the same (unless we were able to mutate the shared
+        # character singleton, in which case ord(b'a') == ord(b'b')).
+ assert ord(b'a') == ord(u'a')
+ tm.assert_numpy_array_equal(
+ char_unpacked,
+ np.array([ord(b'b')], dtype='uint8'),
+ )
+
+ def test_small_strings_no_warn_zlib(self):
+ if not _ZLIB_INSTALLED:
+ pytest.skip('no zlib')
+ self._test_small_strings_no_warn('zlib')
+
+ def test_small_strings_no_warn_blosc(self):
+ if not _BLOSC_INSTALLED:
+ pytest.skip('no blosc')
+ self._test_small_strings_no_warn('blosc')
+
+ def test_readonly_axis_blosc(self):
+ # GH11880
+ if not _BLOSC_INSTALLED:
+ pytest.skip('no blosc')
+ df1 = DataFrame({'A': list('abcd')})
+ df2 = DataFrame(df1, index=[1., 2., 3., 4.])
+ assert 1 in self.encode_decode(df1['A'], compress='blosc')
+ assert 1. in self.encode_decode(df2['A'], compress='blosc')
+
+ def test_readonly_axis_zlib(self):
+ # GH11880
+ df1 = DataFrame({'A': list('abcd')})
+ df2 = DataFrame(df1, index=[1., 2., 3., 4.])
+ assert 1 in self.encode_decode(df1['A'], compress='zlib')
+ assert 1. in self.encode_decode(df2['A'], compress='zlib')
+
+ def test_readonly_axis_blosc_to_sql(self):
+ # GH11880
+ if not _BLOSC_INSTALLED:
+ pytest.skip('no blosc')
+ if not self._SQLALCHEMY_INSTALLED:
+ pytest.skip('no sqlalchemy')
+ expected = DataFrame({'A': list('abcd')})
+ df = self.encode_decode(expected, compress='blosc')
+ eng = self._create_sql_engine("sqlite:///:memory:")
+ df.to_sql('test', eng, if_exists='append')
+ result = pandas.read_sql_table('test', eng, index_col='index')
+ result.index.names = [None]
+ assert_frame_equal(expected, result)
+
+ def test_readonly_axis_zlib_to_sql(self):
+ # GH11880
+ if not _ZLIB_INSTALLED:
+ pytest.skip('no zlib')
+ if not self._SQLALCHEMY_INSTALLED:
+ pytest.skip('no sqlalchemy')
+ expected = DataFrame({'A': list('abcd')})
+ df = self.encode_decode(expected, compress='zlib')
+ eng = self._create_sql_engine("sqlite:///:memory:")
+ df.to_sql('test', eng, if_exists='append')
+ result = pandas.read_sql_table('test', eng, index_col='index')
+ result.index.names = [None]
+ assert_frame_equal(expected, result)
+
+
+class TestEncoding(TestPackers):
+
+ def setup_method(self, method):
+ super(TestEncoding, self).setup_method(method)
+ data = {
+ 'A': [compat.u('\u2019')] * 1000,
+ 'B': np.arange(1000, dtype=np.int32),
+ 'C': list(100 * 'abcdefghij'),
+ 'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
+ 'E': [datetime.timedelta(days=x) for x in range(1000)],
+ 'G': [400] * 1000
+ }
+ self.frame = {
+ 'float': DataFrame({k: data[k] for k in ['A', 'A']}),
+ 'int': DataFrame({k: data[k] for k in ['B', 'B']}),
+ 'mixed': DataFrame(data),
+ }
+ self.utf_encodings = ['utf8', 'utf16', 'utf32']
+
+ def test_utf(self):
+ # GH10581
+ for encoding in self.utf_encodings:
+ for frame in compat.itervalues(self.frame):
+ result = self.encode_decode(frame, encoding=encoding)
+ assert_frame_equal(result, frame)
+
+ def test_default_encoding(self):
+ for frame in compat.itervalues(self.frame):
+ result = frame.to_msgpack()
+ expected = frame.to_msgpack(encoding='utf8')
+ assert result == expected
+ result = self.encode_decode(frame)
+ assert_frame_equal(result, frame)
+
+
+files = glob.glob(os.path.join(os.path.dirname(__file__), "data",
+ "legacy_msgpack", "*", "*.msgpack"))
+
+
[email protected](params=files)
+def legacy_packer(request, datapath):
+ return datapath(request.param)
+
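+# each discovered legacy *.msgpack file becomes one parametrised case of
+# TestMsgpack.test_msgpacks_legacy below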
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class TestMsgpack(object):
+ """
+ How to add msgpack tests:
+
+ 1. Install pandas version intended to output the msgpack.
+
+ 2. Execute "generate_legacy_storage_files.py" to create the msgpack.
+ $ python generate_legacy_storage_files.py <output_dir> msgpack
+
+    3. Move the created msgpack file to "data/legacy_msgpack/<version>"
+       directory.
+ """
+
+ minimum_structure = {'series': ['float', 'int', 'mixed',
+ 'ts', 'mi', 'dup'],
+ 'frame': ['float', 'int', 'mixed', 'mi'],
+ 'panel': ['float'],
+ 'index': ['int', 'date', 'period'],
+ 'mi': ['reg2']}
+
+ def check_min_structure(self, data, version):
+ for typ, v in self.minimum_structure.items():
+ assert typ in data, '"{0}" not found in unpacked data'.format(typ)
+ for kind in v:
+ msg = '"{0}" not found in data["{1}"]'.format(kind, typ)
+ assert kind in data[typ], msg
+
+ def compare(self, current_data, all_data, vf, version):
+ # GH12277 encoding default used to be latin-1, now utf-8
+ if LooseVersion(version) < LooseVersion('0.18.0'):
+ data = read_msgpack(vf, encoding='latin-1')
+ else:
+ data = read_msgpack(vf)
+ self.check_min_structure(data, version)
+ for typ, dv in data.items():
+ assert typ in all_data, ('unpacked data contains '
+ 'extra key "{0}"'
+ .format(typ))
+ for dt, result in dv.items():
+ assert dt in current_data[typ], ('data["{0}"] contains extra '
+ 'key "{1}"'.format(typ, dt))
+ try:
+ expected = current_data[typ][dt]
+ except KeyError:
+ continue
+
+ # use a specific comparator
+ # if available
+ comp_method = "compare_{typ}_{dt}".format(typ=typ, dt=dt)
+ comparator = getattr(self, comp_method, None)
+ if comparator is not None:
+ comparator(result, expected, typ, version)
+ else:
+ check_arbitrary(result, expected)
+
+ return data
+
+ def compare_series_dt_tz(self, result, expected, typ, version):
+ # 8260
+ # dtype is object < 0.17.0
+ if LooseVersion(version) < LooseVersion('0.17.0'):
+ expected = expected.astype(object)
+ tm.assert_series_equal(result, expected)
+ else:
+ tm.assert_series_equal(result, expected)
+
+ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version):
+ # 8260
+ # dtype is object < 0.17.0
+ if LooseVersion(version) < LooseVersion('0.17.0'):
+ expected = expected.astype(object)
+ tm.assert_frame_equal(result, expected)
+ else:
+ tm.assert_frame_equal(result, expected)
+
+ def test_msgpacks_legacy(self, current_packers_data, all_packers_data,
+ legacy_packer, datapath):
+
+ version = os.path.basename(os.path.dirname(legacy_packer))
+
+ # GH12142 0.17 files packed in P2 can't be read in P3
+ if (compat.PY3 and version.startswith('0.17.') and
+ legacy_packer.split('.')[-4][-1] == '2'):
+ msg = "Files packed in Py2 can't be read in Py3 ({})"
+ pytest.skip(msg.format(version))
+ try:
+ with catch_warnings(record=True):
+ self.compare(current_packers_data, all_packers_data,
+ legacy_packer, version)
+ except ImportError:
+ # blosc not installed
+ pass
+
+ def test_msgpack_period_freq(self):
+ # https://github.com/pandas-dev/pandas/issues/24135
+ s = Series(np.random.rand(5), index=date_range('20130101', periods=5))
+ r = read_msgpack(s.to_msgpack())
+ repr(r)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_parquet.py b/contrib/python/pandas/py2/pandas/tests/io/test_parquet.py
new file mode 100644
index 00000000000..01a47a67ad1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_parquet.py
@@ -0,0 +1,541 @@
+""" test parquet compat """
+import datetime
+from distutils.version import LooseVersion
+import os
+from warnings import catch_warnings
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY3
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas.util import testing as tm
+
+from pandas.io.parquet import (
+ FastParquetImpl, PyArrowImpl, get_engine, read_parquet, to_parquet)
+
+try:
+ import pyarrow # noqa
+ _HAVE_PYARROW = True
+except ImportError:
+ _HAVE_PYARROW = False
+
+try:
+ import fastparquet # noqa
+ _HAVE_FASTPARQUET = True
+except ImportError:
+ _HAVE_FASTPARQUET = False
+
+
+# setup engines & skips
[email protected](params=[
+ pytest.param('fastparquet',
+ marks=pytest.mark.skipif(not _HAVE_FASTPARQUET,
+ reason='fastparquet is '
+ 'not installed')),
+ pytest.param('pyarrow',
+ marks=pytest.mark.skipif(not _HAVE_PYARROW,
+ reason='pyarrow is '
+ 'not installed'))])
+def engine(request):
+ return request.param
+
+
[email protected]
+def pa():
+ if not _HAVE_PYARROW:
+ pytest.skip("pyarrow is not installed")
+ return 'pyarrow'
+
+
[email protected]
+def fp():
+ if not _HAVE_FASTPARQUET:
+ pytest.skip("fastparquet is not installed")
+ return 'fastparquet'
+
+
[email protected]
+def df_compat():
+ return pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'})
+
+
[email protected]
+def df_cross_compat():
+ df = pd.DataFrame({'a': list('abc'),
+ 'b': list(range(1, 4)),
+ # 'c': np.arange(3, 6).astype('u1'),
+ 'd': np.arange(4.0, 7.0, dtype='float64'),
+ 'e': [True, False, True],
+ 'f': pd.date_range('20130101', periods=3),
+ # 'g': pd.date_range('20130101', periods=3,
+ # tz='US/Eastern'),
+ # 'h': pd.date_range('20130101', periods=3, freq='ns')
+ })
+ return df
+
+
[email protected]
+def df_full():
+ return pd.DataFrame(
+ {'string': list('abc'),
+ 'string_with_nan': ['a', np.nan, 'c'],
+ 'string_with_none': ['a', None, 'c'],
+ 'bytes': [b'foo', b'bar', b'baz'],
+ 'unicode': [u'foo', u'bar', u'baz'],
+ 'int': list(range(1, 4)),
+ 'uint': np.arange(3, 6).astype('u1'),
+ 'float': np.arange(4.0, 7.0, dtype='float64'),
+ 'float_with_nan': [2., np.nan, 3.],
+ 'bool': [True, False, True],
+ 'datetime': pd.date_range('20130101', periods=3),
+ 'datetime_with_nat': [pd.Timestamp('20130101'),
+ pd.NaT,
+ pd.Timestamp('20130103')]})
+
+
+def check_round_trip(df, engine=None, path=None,
+ write_kwargs=None, read_kwargs=None,
+ expected=None, check_names=True,
+ repeat=2):
+ """Verify parquet serializer and deserializer produce the same results.
+
+ Performs a pandas to disk and disk to pandas round trip,
+    then compares the two resulting DataFrames to verify equality.
+
+ Parameters
+ ----------
+    df: DataFrame
+ engine: str, optional
+ 'pyarrow' or 'fastparquet'
+ path: str, optional
+ write_kwargs: dict of str:str, optional
+ read_kwargs: dict of str:str, optional
+ expected: DataFrame, optional
+ Expected deserialization result, otherwise will be equal to `df`
+    check_names: bool, optional
+        Whether to also compare index and column names; passed through
+        to tm.assert_frame_equal
+ repeat: int, optional
+ How many times to repeat the test
+ """
+
+ write_kwargs = write_kwargs or {'compression': None}
+ read_kwargs = read_kwargs or {}
+
+ if expected is None:
+ expected = df
+
+ if engine:
+ write_kwargs['engine'] = engine
+ read_kwargs['engine'] = engine
+
+ def compare(repeat):
+ for _ in range(repeat):
+ df.to_parquet(path, **write_kwargs)
+ with catch_warnings(record=True):
+ actual = read_parquet(path, **read_kwargs)
+ tm.assert_frame_equal(expected, actual,
+ check_names=check_names)
+
+ if path is None:
+ with tm.ensure_clean() as path:
+ compare(repeat)
+ else:
+ compare(repeat)
+
+
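+# Illustrative usage of check_round_trip (a sketch, not an upstream test):
+# `expected` lets the comparison target differ from the input, e.g. when
+# reading back only a column subset.
+def test_check_round_trip_sketch(pa):
+    df = pd.DataFrame({'x': [1, 2, 3], 'y': list('abc')})
+    check_round_trip(df, engine=pa,
+                     read_kwargs={'columns': ['x']},
+                     expected=df[['x']])
+
+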
+def test_invalid_engine(df_compat):
+ with pytest.raises(ValueError):
+ check_round_trip(df_compat, 'foo', 'bar')
+
+
+def test_options_py(df_compat, pa):
+ # use the set option
+
+ with pd.option_context('io.parquet.engine', 'pyarrow'):
+ check_round_trip(df_compat)
+
+
+def test_options_fp(df_compat, fp):
+ # use the set option
+
+ with pd.option_context('io.parquet.engine', 'fastparquet'):
+ check_round_trip(df_compat)
+
+
+def test_options_auto(df_compat, fp, pa):
+ # use the set option
+
+ with pd.option_context('io.parquet.engine', 'auto'):
+ check_round_trip(df_compat)
+
+
+def test_options_get_engine(fp, pa):
+ assert isinstance(get_engine('pyarrow'), PyArrowImpl)
+ assert isinstance(get_engine('fastparquet'), FastParquetImpl)
+
+ with pd.option_context('io.parquet.engine', 'pyarrow'):
+ assert isinstance(get_engine('auto'), PyArrowImpl)
+ assert isinstance(get_engine('pyarrow'), PyArrowImpl)
+ assert isinstance(get_engine('fastparquet'), FastParquetImpl)
+
+ with pd.option_context('io.parquet.engine', 'fastparquet'):
+ assert isinstance(get_engine('auto'), FastParquetImpl)
+ assert isinstance(get_engine('pyarrow'), PyArrowImpl)
+ assert isinstance(get_engine('fastparquet'), FastParquetImpl)
+
+ with pd.option_context('io.parquet.engine', 'auto'):
+ assert isinstance(get_engine('auto'), PyArrowImpl)
+ assert isinstance(get_engine('pyarrow'), PyArrowImpl)
+ assert isinstance(get_engine('fastparquet'), FastParquetImpl)
+
+
+def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
+ # cross-compat with differing reading/writing engines
+
+ df = df_cross_compat
+ with tm.ensure_clean() as path:
+ df.to_parquet(path, engine=pa, compression=None)
+
+ result = read_parquet(path, engine=fp)
+ tm.assert_frame_equal(result, df)
+
+ result = read_parquet(path, engine=fp, columns=['a', 'd'])
+ tm.assert_frame_equal(result, df[['a', 'd']])
+
+
+def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
+ # cross-compat with differing reading/writing engines
+
+ df = df_cross_compat
+ with tm.ensure_clean() as path:
+ df.to_parquet(path, engine=fp, compression=None)
+
+ with catch_warnings(record=True):
+ result = read_parquet(path, engine=pa)
+ tm.assert_frame_equal(result, df)
+
+ result = read_parquet(path, engine=pa, columns=['a', 'd'])
+ tm.assert_frame_equal(result, df[['a', 'd']])
+
+
+class Base(object):
+
+ def check_error_on_write(self, df, engine, exc):
+ # check that we are raising the exception on writing
+ with tm.ensure_clean() as path:
+ with pytest.raises(exc):
+ to_parquet(df, path, engine, compression=None)
+
+
+class TestBasic(Base):
+
+ def test_error(self, engine):
+ for obj in [pd.Series([1, 2, 3]), 1, 'foo', pd.Timestamp('20130101'),
+ np.array([1, 2, 3])]:
+ self.check_error_on_write(obj, engine, ValueError)
+
+ def test_columns_dtypes(self, engine):
+ df = pd.DataFrame({'string': list('abc'),
+ 'int': list(range(1, 4))})
+
+ # unicode
+ df.columns = [u'foo', u'bar']
+ check_round_trip(df, engine)
+
+ def test_columns_dtypes_invalid(self, engine):
+ df = pd.DataFrame({'string': list('abc'),
+ 'int': list(range(1, 4))})
+
+ # numeric
+ df.columns = [0, 1]
+ self.check_error_on_write(df, engine, ValueError)
+
+ if PY3:
+ # bytes on PY3, on PY2 these are str
+ df.columns = [b'foo', b'bar']
+ self.check_error_on_write(df, engine, ValueError)
+
+ # python object
+ df.columns = [datetime.datetime(2011, 1, 1, 0, 0),
+ datetime.datetime(2011, 1, 1, 1, 1)]
+ self.check_error_on_write(df, engine, ValueError)
+
+ @pytest.mark.parametrize('compression', [None, 'gzip', 'snappy', 'brotli'])
+ def test_compression(self, engine, compression):
+
+ if compression == 'snappy':
+ pytest.importorskip('snappy')
+
+ elif compression == 'brotli':
+ pytest.importorskip('brotli')
+
+ df = pd.DataFrame({'A': [1, 2, 3]})
+ check_round_trip(df, engine, write_kwargs={'compression': compression})
+
+ def test_read_columns(self, engine):
+ # GH18154
+ df = pd.DataFrame({'string': list('abc'),
+ 'int': list(range(1, 4))})
+
+ expected = pd.DataFrame({'string': list('abc')})
+ check_round_trip(df, engine, expected=expected,
+ read_kwargs={'columns': ['string']})
+
+ def test_write_index(self, engine):
+ check_names = engine != 'fastparquet'
+
+ df = pd.DataFrame({'A': [1, 2, 3]})
+ check_round_trip(df, engine)
+
+ indexes = [
+ [2, 3, 4],
+ pd.date_range('20130101', periods=3),
+ list('abc'),
+ [1, 3, 4],
+ ]
+ # non-default index
+ for index in indexes:
+ df.index = index
+ check_round_trip(df, engine, check_names=check_names)
+
+ # index with meta-data
+ df.index = [0, 1, 2]
+ df.index.name = 'foo'
+ check_round_trip(df, engine)
+
+ def test_write_multiindex(self, pa):
+        # Not supported in fastparquet as of 0.1.3, nor in older pyarrow
+        # versions
+ engine = pa
+
+ df = pd.DataFrame({'A': [1, 2, 3]})
+ index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+ df.index = index
+ check_round_trip(df, engine)
+
+ def test_write_column_multiindex(self, engine):
+ # column multi-index
+ mi_columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+ df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
+ self.check_error_on_write(df, engine, ValueError)
+
+ def test_multiindex_with_columns(self, pa):
+ engine = pa
+ dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
+ df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
+ columns=list('ABC'))
+ index1 = pd.MultiIndex.from_product(
+ [['Level1', 'Level2'], dates],
+ names=['level', 'date'])
+ index2 = index1.copy(names=None)
+ for index in [index1, index2]:
+ df.index = index
+
+ check_round_trip(df, engine)
+ check_round_trip(df, engine, read_kwargs={'columns': ['A', 'B']},
+ expected=df[['A', 'B']])
+
+ def test_write_ignoring_index(self, engine):
+ # ENH 20768
+ # Ensure index=False omits the index from the written Parquet file.
+ df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']})
+
+ write_kwargs = {
+ 'compression': None,
+ 'index': False,
+ }
+
+ # Because we're dropping the index, we expect the loaded dataframe to
+ # have the default integer index.
+ expected = df.reset_index(drop=True)
+
+ check_round_trip(df, engine, write_kwargs=write_kwargs,
+ expected=expected)
+
+ # Ignore custom index
+ df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']},
+ index=['zyx', 'wvu', 'tsr'])
+
+ check_round_trip(df, engine, write_kwargs=write_kwargs,
+ expected=expected)
+
+ # Ignore multi-indexes as well.
+ arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+ ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+ df = pd.DataFrame({'one': [i for i in range(8)],
+ 'two': [-i for i in range(8)]}, index=arrays)
+
+ expected = df.reset_index(drop=True)
+ check_round_trip(df, engine, write_kwargs=write_kwargs,
+ expected=expected)
+
+
+class TestParquetPyArrow(Base):
+
+ def test_basic(self, pa, df_full):
+
+ df = df_full
+
+ # additional supported types for pyarrow
+ df['datetime_tz'] = pd.date_range('20130101', periods=3,
+ tz='Europe/Brussels')
+ df['bool_with_none'] = [True, None, True]
+
+ check_round_trip(df, pa)
+
+ # TODO: This doesn't fail on all systems; track down which
+ @pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)",
+ strict=False)
+ def test_basic_subset_columns(self, pa, df_full):
+ # GH18628
+
+ df = df_full
+ # additional supported types for pyarrow
+ df['datetime_tz'] = pd.date_range('20130101', periods=3,
+ tz='Europe/Brussels')
+
+ check_round_trip(df, pa, expected=df[['string', 'int']],
+ read_kwargs={'columns': ['string', 'int']})
+
+ def test_duplicate_columns(self, pa):
+ # not currently able to handle duplicate columns
+ df = pd.DataFrame(np.arange(12).reshape(4, 3),
+ columns=list('aaa')).copy()
+ self.check_error_on_write(df, pa, ValueError)
+
+ def test_unsupported(self, pa):
+ # period
+ df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
+ # pyarrow 0.11 raises ArrowTypeError
+ # older pyarrows raise ArrowInvalid
+ self.check_error_on_write(df, pa, Exception)
+
+ # timedelta
+ df = pd.DataFrame({'a': pd.timedelta_range('1 day',
+ periods=3)})
+ self.check_error_on_write(df, pa, NotImplementedError)
+
+ # mixed python objects
+ df = pd.DataFrame({'a': ['a', 1, 2.0]})
+ # pyarrow 0.11 raises ArrowTypeError
+ # older pyarrows raise ArrowInvalid
+ self.check_error_on_write(df, pa, Exception)
+
+ def test_categorical(self, pa):
+
+ # supported in >= 0.7.0
+ df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
+
+ # de-serialized as object
+ expected = df.assign(a=df.a.astype(object))
+ check_round_trip(df, pa, expected=expected)
+
+ def test_s3_roundtrip(self, df_compat, s3_resource, pa):
+ # GH #19134
+ check_round_trip(df_compat, pa,
+ path='s3://pandas-test/pyarrow.parquet')
+
+ def test_partition_cols_supported(self, pa, df_full):
+ # GH #23283
+ partition_cols = ['bool', 'int']
+ df = df_full
+ with tm.ensure_clean_dir() as path:
+ df.to_parquet(path, partition_cols=partition_cols,
+ compression=None)
+ import pyarrow.parquet as pq
+ dataset = pq.ParquetDataset(path, validate_schema=False)
+ assert len(dataset.partitions.partition_names) == 2
+ assert dataset.partitions.partition_names == set(partition_cols)
+
+
+class TestParquetFastParquet(Base):
+
+ @td.skip_if_no('fastparquet', min_version="0.2.1")
+ def test_basic(self, fp, df_full):
+ df = df_full
+
+ # additional supported types for fastparquet
+ if LooseVersion(fastparquet.__version__) >= LooseVersion('0.1.4'):
+ df['datetime_tz'] = pd.date_range('20130101', periods=3,
+ tz='US/Eastern')
+ df['timedelta'] = pd.timedelta_range('1 day', periods=3)
+ check_round_trip(df, fp)
+
+ @pytest.mark.skip(reason="not supported")
+ def test_duplicate_columns(self, fp):
+
+ # not currently able to handle duplicate columns
+ df = pd.DataFrame(np.arange(12).reshape(4, 3),
+ columns=list('aaa')).copy()
+ self.check_error_on_write(df, fp, ValueError)
+
+ def test_bool_with_none(self, fp):
+ df = pd.DataFrame({'a': [True, None, False]})
+ expected = pd.DataFrame({'a': [1.0, np.nan, 0.0]}, dtype='float16')
+ check_round_trip(df, fp, expected=expected)
+
+ def test_unsupported(self, fp):
+
+ # period
+ df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
+ self.check_error_on_write(df, fp, ValueError)
+
+ # mixed
+ df = pd.DataFrame({'a': ['a', 1, 2.0]})
+ self.check_error_on_write(df, fp, ValueError)
+
+ def test_categorical(self, fp):
+ if LooseVersion(fastparquet.__version__) < LooseVersion("0.1.3"):
+ pytest.skip("CategoricalDtype not supported for older fp")
+ df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
+ check_round_trip(df, fp)
+
+ def test_filter_row_groups(self, fp):
+ d = {'a': list(range(0, 3))}
+ df = pd.DataFrame(d)
+ with tm.ensure_clean() as path:
+ df.to_parquet(path, fp, compression=None,
+ row_group_offsets=1)
+ result = read_parquet(path, fp, filters=[('a', '==', 0)])
+ assert len(result) == 1
+
+ def test_s3_roundtrip(self, df_compat, s3_resource, fp):
+ # GH #19134
+ check_round_trip(df_compat, fp,
+ path='s3://pandas-test/fastparquet.parquet')
+
+ def test_partition_cols_supported(self, fp, df_full):
+ # GH #23283
+ partition_cols = ['bool', 'int']
+ df = df_full
+ with tm.ensure_clean_dir() as path:
+ df.to_parquet(path, engine="fastparquet",
+ partition_cols=partition_cols, compression=None)
+ assert os.path.exists(path)
+ import fastparquet
+ actual_partition_cols = fastparquet.ParquetFile(path, False).cats
+ assert len(actual_partition_cols) == 2
+
+ def test_partition_on_supported(self, fp, df_full):
+ # GH #23283
+ partition_cols = ['bool', 'int']
+ df = df_full
+ with tm.ensure_clean_dir() as path:
+ df.to_parquet(path, engine="fastparquet", compression=None,
+ partition_on=partition_cols)
+ assert os.path.exists(path)
+ import fastparquet
+ actual_partition_cols = fastparquet.ParquetFile(path, False).cats
+ assert len(actual_partition_cols) == 2
+
+ def test_error_on_using_partition_cols_and_partition_on(self, fp, df_full):
+ # GH #23283
+ partition_cols = ['bool', 'int']
+ df = df_full
+ with pytest.raises(ValueError):
+ with tm.ensure_clean_dir() as path:
+ df.to_parquet(path, engine="fastparquet", compression=None,
+ partition_on=partition_cols,
+ partition_cols=partition_cols)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_pickle.py b/contrib/python/pandas/py2/pandas/tests/io/test_pickle.py
new file mode 100644
index 00000000000..7f3fe1aa401
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_pickle.py
@@ -0,0 +1,481 @@
+# pylint: disable=E1101,E1103,W0232
+
+"""
+manage legacy pickle tests
+
+How to add pickle tests:
+
+1. Install pandas version intended to output the pickle.
+
+2. Execute "generate_legacy_storage_files.py" to create the pickle.
+$ python generate_legacy_storage_files.py <output_dir> pickle
+
+3. Move the created pickle to "data/legacy_pickle/<version>" directory.
+"""
+from distutils.version import LooseVersion
+import glob
+import os
+import shutil
+from warnings import catch_warnings, simplefilter
+
+import pytest
+
+from pandas.compat import PY3, is_platform_little_endian
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import Index
+import pandas.util.testing as tm
+
+from pandas.tseries.offsets import Day, MonthEnd
+
+
[email protected](scope='module')
+def current_pickle_data():
+ # our current version pickle data
+ from pandas.tests.io.generate_legacy_storage_files import (
+ create_pickle_data)
+ return create_pickle_data()
+
+
+# ---------------------
+# comparison functions
+# ---------------------
+def compare_element(result, expected, typ, version=None):
+ if isinstance(expected, Index):
+ tm.assert_index_equal(expected, result)
+ return
+
+ if typ.startswith('sp_'):
+ comparator = getattr(tm, "assert_%s_equal" % typ)
+ comparator(result, expected, exact_indices=False)
+ elif typ == 'timestamp':
+ if expected is pd.NaT:
+ assert result is pd.NaT
+ else:
+ assert result == expected
+ assert result.freq == expected.freq
+ else:
+ comparator = getattr(tm, "assert_%s_equal" %
+ typ, tm.assert_almost_equal)
+ comparator(result, expected)
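+
+# Illustrative only: compare_element falls back to tm.assert_almost_equal
+# when no tm.assert_<typ>_equal comparator exists, so e.g.
+#   compare_element(np.float64(1.0), 1.0, 'scalar')
+# succeeds via the fallback path ('scalar' is a hypothetical typ name).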
+
+
+def compare(data, vf, version):
+
+ # py3 compat when reading py2 pickle
+ try:
+ data = pd.read_pickle(vf)
+    except ValueError as e:
+ if 'unsupported pickle protocol:' in str(e):
+ # trying to read a py3 pickle in py2
+ return
+ else:
+ raise
+
+ m = globals()
+ for typ, dv in data.items():
+ for dt, result in dv.items():
+ try:
+ expected = data[typ][dt]
+            except KeyError:
+ if version in ('0.10.1', '0.11.0') and dt == 'reg':
+ break
+ else:
+ raise
+
+ # use a specific comparator
+ # if available
+ comparator = "compare_{typ}_{dt}".format(typ=typ, dt=dt)
+
+ comparator = m.get(comparator, m['compare_element'])
+ comparator(result, expected, typ, version)
+ return data
+
+
+def compare_sp_series_ts(res, exp, typ, version):
+ # SparseTimeSeries integrated into SparseSeries in 0.12.0
+ # and deprecated in 0.17.0
+ if version and LooseVersion(version) <= LooseVersion("0.12.0"):
+ tm.assert_sp_series_equal(res, exp, check_series_type=False)
+ else:
+ tm.assert_sp_series_equal(res, exp)
+
+
+def compare_series_ts(result, expected, typ, version):
+ # GH 7748
+ tm.assert_series_equal(result, expected)
+ assert result.index.freq == expected.index.freq
+ assert not result.index.freq.normalize
+ tm.assert_series_equal(result > 0, expected > 0)
+
+ # GH 9291
+ freq = result.index.freq
+ assert freq + Day(1) == Day(2)
+
+ res = freq + pd.Timedelta(hours=1)
+ assert isinstance(res, pd.Timedelta)
+ assert res == pd.Timedelta(days=1, hours=1)
+
+ res = freq + pd.Timedelta(nanoseconds=1)
+ assert isinstance(res, pd.Timedelta)
+ assert res == pd.Timedelta(days=1, nanoseconds=1)
+
+
+def compare_series_dt_tz(result, expected, typ, version):
+ # 8260
+ # dtype is object < 0.17.0
+ if LooseVersion(version) < LooseVersion('0.17.0'):
+ expected = expected.astype(object)
+ tm.assert_series_equal(result, expected)
+ else:
+ tm.assert_series_equal(result, expected)
+
+
+def compare_series_cat(result, expected, typ, version):
+ # Categorical dtype is added in 0.15.0
+ # ordered is changed in 0.16.0
+ if LooseVersion(version) < LooseVersion('0.15.0'):
+ tm.assert_series_equal(result, expected, check_dtype=False,
+ check_categorical=False)
+ elif LooseVersion(version) < LooseVersion('0.16.0'):
+ tm.assert_series_equal(result, expected, check_categorical=False)
+ else:
+ tm.assert_series_equal(result, expected)
+
+
+def compare_frame_dt_mixed_tzs(result, expected, typ, version):
+ # 8260
+ # dtype is object < 0.17.0
+ if LooseVersion(version) < LooseVersion('0.17.0'):
+ expected = expected.astype(object)
+ tm.assert_frame_equal(result, expected)
+ else:
+ tm.assert_frame_equal(result, expected)
+
+
+def compare_frame_cat_onecol(result, expected, typ, version):
+ # Categorical dtype is added in 0.15.0
+ # ordered is changed in 0.16.0
+ if LooseVersion(version) < LooseVersion('0.15.0'):
+ tm.assert_frame_equal(result, expected, check_dtype=False,
+ check_categorical=False)
+ elif LooseVersion(version) < LooseVersion('0.16.0'):
+ tm.assert_frame_equal(result, expected, check_categorical=False)
+ else:
+ tm.assert_frame_equal(result, expected)
+
+
+def compare_frame_cat_and_float(result, expected, typ, version):
+ compare_frame_cat_onecol(result, expected, typ, version)
+
+
+def compare_index_period(result, expected, typ, version):
+ tm.assert_index_equal(result, expected)
+ assert isinstance(result.freq, MonthEnd)
+ assert result.freq == MonthEnd()
+ assert result.freqstr == 'M'
+ tm.assert_index_equal(result.shift(2), expected.shift(2))
+
+
+def compare_sp_frame_float(result, expected, typ, version):
+ if LooseVersion(version) <= LooseVersion('0.18.1'):
+ tm.assert_sp_frame_equal(result, expected, exact_indices=False,
+ check_dtype=False)
+ else:
+ tm.assert_sp_frame_equal(result, expected)
+
+
+files = glob.glob(os.path.join(os.path.dirname(__file__), "data",
+ "legacy_pickle", "*", "*.pickle"))
+
+
[email protected](params=files)
+def legacy_pickle(request, datapath):
+ return datapath(request.param)
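+
+
+# For example (file name is hypothetical), a generated legacy pickle lives
+# at
+#   data/legacy_pickle/0.20.3/0.20.3_x86_64_linux_2.7.12.pickle
+# and each such file becomes one `legacy_pickle` parametrization above.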
+
+
+# ---------------------
+# tests
+# ---------------------
+def test_pickles(current_pickle_data, legacy_pickle):
+ if not is_platform_little_endian():
+ pytest.skip("known failure on non-little endian")
+
+ version = os.path.basename(os.path.dirname(legacy_pickle))
+ with catch_warnings(record=True):
+ simplefilter("ignore")
+ compare(current_pickle_data, legacy_pickle, version)
+
+
+def test_round_trip_current(current_pickle_data):
+
+ try:
+ import cPickle as c_pickle
+
+ def c_pickler(obj, path):
+ with open(path, 'wb') as fh:
+ c_pickle.dump(obj, fh, protocol=-1)
+
+ def c_unpickler(path):
+ with open(path, 'rb') as fh:
+ fh.seek(0)
+ return c_pickle.load(fh)
+ except ImportError:
+ c_pickler = None
+ c_unpickler = None
+
+ import pickle as python_pickle
+
+ def python_pickler(obj, path):
+ with open(path, 'wb') as fh:
+ python_pickle.dump(obj, fh, protocol=-1)
+
+ def python_unpickler(path):
+ with open(path, 'rb') as fh:
+ fh.seek(0)
+ return python_pickle.load(fh)
+
+ data = current_pickle_data
+ for typ, dv in data.items():
+ for dt, expected in dv.items():
+
+ for writer in [pd.to_pickle, c_pickler, python_pickler]:
+ if writer is None:
+ continue
+
+ with tm.ensure_clean() as path:
+
+ # test writing with each pickler
+ writer(expected, path)
+
+ # test reading with each unpickler
+ result = pd.read_pickle(path)
+ compare_element(result, expected, typ)
+
+ if c_unpickler is not None:
+ result = c_unpickler(path)
+ compare_element(result, expected, typ)
+
+ result = python_unpickler(path)
+ compare_element(result, expected, typ)
+
+
+def test_pickle_v0_14_1(datapath):
+
+ cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False,
+ categories=['a', 'b', 'c', 'd'])
+ pickle_path = datapath('io', 'data', 'categorical_0_14_1.pickle')
+ # This code was executed once on v0.14.1 to generate the pickle:
+ #
+ # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
+ # name='foobar')
+ # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
+ #
+ tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
+
+
+def test_pickle_v0_15_2(datapath):
+ # ordered -> _ordered
+ # GH 9347
+
+ cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False,
+ categories=['a', 'b', 'c', 'd'])
+ pickle_path = datapath('io', 'data', 'categorical_0_15_2.pickle')
+ # This code was executed once on v0.15.2 to generate the pickle:
+ #
+ # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
+ # name='foobar')
+ # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
+ #
+ tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
+
+
+def test_pickle_path_pathlib():
+ df = tm.makeDataFrame()
+ result = tm.round_trip_pathlib(df.to_pickle, pd.read_pickle)
+ tm.assert_frame_equal(df, result)
+
+
+def test_pickle_path_localpath():
+ df = tm.makeDataFrame()
+ result = tm.round_trip_localpath(df.to_pickle, pd.read_pickle)
+ tm.assert_frame_equal(df, result)
+
+
+# ---------------------
+# test pickle compression
+# ---------------------
+
+def get_random_path():
+ return u'__%s__.pickle' % tm.rands(10)
+
+
+class TestCompression(object):
+
+ _compression_to_extension = {
+ None: ".none",
+ 'gzip': '.gz',
+ 'bz2': '.bz2',
+ 'zip': '.zip',
+ 'xz': '.xz',
+ }
+
+ def compress_file(self, src_path, dest_path, compression):
+ if compression is None:
+ shutil.copyfile(src_path, dest_path)
+ return
+
+ if compression == 'gzip':
+ import gzip
+ f = gzip.open(dest_path, "w")
+ elif compression == 'bz2':
+ import bz2
+ f = bz2.BZ2File(dest_path, "w")
+ elif compression == 'zip':
+ import zipfile
+ with zipfile.ZipFile(dest_path, "w",
+ compression=zipfile.ZIP_DEFLATED) as f:
+ f.write(src_path, os.path.basename(src_path))
+ elif compression == 'xz':
+ lzma = pd.compat.import_lzma()
+ f = lzma.LZMAFile(dest_path, "w")
+ else:
+ msg = 'Unrecognized compression type: {}'.format(compression)
+ raise ValueError(msg)
+
+ if compression != "zip":
+ with open(src_path, "rb") as fh, f:
+ f.write(fh.read())
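+
+    # Illustrative helper usage (a sketch; file names are hypothetical):
+    #   self.compress_file('plain.pkl', 'plain.pkl.gz', compression='gzip')
+    #   pd.read_pickle('plain.pkl.gz', compression='gzip')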
+
+ def test_write_explicit(self, compression, get_random_path):
+ base = get_random_path
+ path1 = base + ".compressed"
+ path2 = base + ".raw"
+
+ with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
+ df = tm.makeDataFrame()
+
+ # write to compressed file
+ df.to_pickle(p1, compression=compression)
+
+ # decompress
+ with tm.decompress_file(p1, compression=compression) as f:
+ with open(p2, "wb") as fh:
+ fh.write(f.read())
+
+ # read decompressed file
+ df2 = pd.read_pickle(p2, compression=None)
+
+ tm.assert_frame_equal(df, df2)
+
+ @pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z'])
+ def test_write_explicit_bad(self, compression, get_random_path):
+ with pytest.raises(ValueError, match="Unrecognized compression type"):
+ with tm.ensure_clean(get_random_path) as path:
+ df = tm.makeDataFrame()
+ df.to_pickle(path, compression=compression)
+
+ @pytest.mark.parametrize('ext', [
+ '', '.gz', '.bz2', '.no_compress',
+ pytest.param('.xz', marks=td.skip_if_no_lzma)
+ ])
+ def test_write_infer(self, ext, get_random_path):
+ base = get_random_path
+ path1 = base + ext
+ path2 = base + ".raw"
+ compression = None
+ for c in self._compression_to_extension:
+ if self._compression_to_extension[c] == ext:
+ compression = c
+ break
+
+ with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
+ df = tm.makeDataFrame()
+
+ # write to compressed file by inferred compression method
+ df.to_pickle(p1)
+
+ # decompress
+ with tm.decompress_file(p1, compression=compression) as f:
+ with open(p2, "wb") as fh:
+ fh.write(f.read())
+
+ # read decompressed file
+ df2 = pd.read_pickle(p2, compression=None)
+
+ tm.assert_frame_equal(df, df2)
+
+ def test_read_explicit(self, compression, get_random_path):
+ base = get_random_path
+ path1 = base + ".raw"
+ path2 = base + ".compressed"
+
+ with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
+ df = tm.makeDataFrame()
+
+ # write to uncompressed file
+ df.to_pickle(p1, compression=None)
+
+ # compress
+ self.compress_file(p1, p2, compression=compression)
+
+ # read compressed file
+ df2 = pd.read_pickle(p2, compression=compression)
+
+ tm.assert_frame_equal(df, df2)
+
+ @pytest.mark.parametrize('ext', [
+ '', '.gz', '.bz2', '.zip', '.no_compress',
+ pytest.param('.xz', marks=td.skip_if_no_lzma)
+ ])
+ def test_read_infer(self, ext, get_random_path):
+ base = get_random_path
+ path1 = base + ".raw"
+ path2 = base + ext
+ compression = None
+ for c in self._compression_to_extension:
+ if self._compression_to_extension[c] == ext:
+ compression = c
+ break
+
+ with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
+ df = tm.makeDataFrame()
+
+ # write to uncompressed file
+ df.to_pickle(p1, compression=None)
+
+ # compress
+ self.compress_file(p1, p2, compression=compression)
+
+ # read compressed file by inferred compression method
+ df2 = pd.read_pickle(p2)
+
+ tm.assert_frame_equal(df, df2)
+
+
+# ---------------------
+# test pickle protocol
+# ---------------------
+
+class TestProtocol(object):
+
+ @pytest.mark.parametrize('protocol', [-1, 0, 1, 2])
+ def test_read(self, protocol, get_random_path):
+ with tm.ensure_clean(get_random_path) as path:
+ df = tm.makeDataFrame()
+ df.to_pickle(path, protocol=protocol)
+ df2 = pd.read_pickle(path)
+ tm.assert_frame_equal(df, df2)
+
+ @pytest.mark.parametrize('protocol', [3, 4])
+ @pytest.mark.skipif(PY3, reason="Testing invalid parameters for Python 2")
+ def test_read_bad_versions(self, protocol, get_random_path):
+ # For Python 2, HIGHEST_PROTOCOL should be 2.
+ msg = ("pickle protocol {protocol} asked for; the highest available "
+ "protocol is 2").format(protocol=protocol)
+ with pytest.raises(ValueError, match=msg):
+ with tm.ensure_clean(get_random_path) as path:
+ df = tm.makeDataFrame()
+ df.to_pickle(path, protocol=protocol)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_pytables.py b/contrib/python/pandas/py2/pandas/tests/io/test_pytables.py
new file mode 100644
index 00000000000..73e632d538f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_pytables.py
@@ -0,0 +1,5691 @@
+from contextlib import contextmanager
+import datetime
+from datetime import timedelta
+from distutils.version import LooseVersion
+import os
+import tempfile
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+import pytest
+
+from pandas.compat import (
+ PY35, PY36, BytesIO, is_platform_little_endian, is_platform_windows,
+ lrange, range, text_type, u)
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.common import is_categorical_dtype
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, DatetimeIndex, Index, Int64Index, MultiIndex,
+ Panel, RangeIndex, Series, Timestamp, bdate_range, compat, concat,
+ date_range, isna, timedelta_range)
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_frame_equal, assert_panel_equal, assert_series_equal, set_timezone)
+
+from pandas.io import pytables as pytables # noqa:E402
+from pandas.io.formats.printing import pprint_thing
+from pandas.io.pytables import (
+ ClosedFileError, HDFStore, PossibleDataLossError, Term, read_hdf)
+from pandas.io.pytables import TableIterator # noqa:E402
+
+tables = pytest.importorskip('tables')
+
+
+# TODO:
+# remove when gh-24839 is fixed; this affects numpy 1.16
+# and pytables 3.4.4
+xfail_non_writeable = pytest.mark.xfail(
+ LooseVersion(np.__version__) >= LooseVersion('1.16'),
+ reason=('gh-25511, gh-24839. pytables needs a '
+            'release beyond 3.4.4 to support numpy 1.16.x'))
+
+
+_default_compressor = ('blosc' if LooseVersion(tables.__version__) >=
+ LooseVersion('2.2') else 'zlib')
+
+
+ignore_natural_naming_warning = pytest.mark.filterwarnings(
+ "ignore:object name:tables.exceptions.NaturalNameWarning"
+)
+
+# contextmanager to ensure the file cleanup
+
+
+def safe_remove(path):
+ if path is not None:
+ try:
+ os.remove(path)
+ except OSError:
+ pass
+
+
+def safe_close(store):
+ try:
+ if store is not None:
+ store.close()
+ except IOError:
+ pass
+
+
+def create_tempfile(path):
+ """ create an unopened named temporary file """
+ return os.path.join(tempfile.gettempdir(), path)
+
+
+@contextmanager
+def ensure_clean_store(path, mode='a', complevel=None, complib=None,
+ fletcher32=False):
+
+ try:
+
+ # put in the temporary path if we don't have one already
+ if not len(os.path.dirname(path)):
+ path = create_tempfile(path)
+
+ store = HDFStore(path, mode=mode, complevel=complevel,
+ complib=complib, fletcher32=False)
+ yield store
+ finally:
+ safe_close(store)
+ if mode == 'w' or mode == 'a':
+ safe_remove(path)
+
+
+@contextmanager
+def ensure_clean_path(path):
+ """
+    return essentially a named temporary file that is not opened
+    and is deleted on exiting; if path is a list, then create and
+    return a list of filenames
+ """
+ try:
+ if isinstance(path, list):
+ filenames = [create_tempfile(p) for p in path]
+ yield filenames
+ else:
+ filenames = [create_tempfile(path)]
+ yield filenames[0]
+ finally:
+ for f in filenames:
+ safe_remove(f)
+
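+# Illustrative usage of the two context managers above (a sketch, not an
+# upstream test):
+#   with ensure_clean_store('demo.h5') as store:
+#       store['s'] = tm.makeTimeSeries()
+#   with ensure_clean_path('demo.h5') as path:
+#       tm.makeDataFrame().to_hdf(path, 'df')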
+
+# set these parameters so we don't have file sharing
+tables.parameters.MAX_NUMEXPR_THREADS = 1
+tables.parameters.MAX_BLOSC_THREADS = 1
+tables.parameters.MAX_THREADS = 1
+
+
+def _maybe_remove(store, key):
+ """For tests using tables, try removing the table to be sure there is
+ no content from previous tests using the same table name."""
+ try:
+ store.remove(key)
+ except (ValueError, KeyError):
+ pass
+
+
+class Base(object):
+
+ @classmethod
+ def setup_class(cls):
+
+ # Pytables 3.0.0 deprecates lots of things
+ tm.reset_testing_mode()
+
+ @classmethod
+ def teardown_class(cls):
+
+ # Pytables 3.0.0 deprecates lots of things
+ tm.set_testing_mode()
+
+ def setup_method(self, method):
+ self.path = 'tmp.__%s__.h5' % tm.rands(10)
+
+ def teardown_method(self, method):
+ pass
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class TestHDFStore(Base):
+
+ def test_format_kwarg_in_constructor(self):
+ # GH 13291
+ with ensure_clean_path(self.path) as path:
+ pytest.raises(ValueError, HDFStore, path, format='table')
+
+ def test_context(self):
+ path = create_tempfile(self.path)
+ try:
+ with HDFStore(path) as tbl:
+ raise ValueError('blah')
+ except ValueError:
+ pass
+ finally:
+ safe_remove(path)
+
+ try:
+ with HDFStore(path) as tbl:
+ tbl['a'] = tm.makeDataFrame()
+
+ with HDFStore(path) as tbl:
+ assert len(tbl) == 1
+ assert type(tbl['a']) == DataFrame
+ finally:
+ safe_remove(path)
+
+ def test_conv_read_write(self):
+ path = create_tempfile(self.path)
+ try:
+ def roundtrip(key, obj, **kwargs):
+ obj.to_hdf(path, key, **kwargs)
+ return read_hdf(path, key)
+
+ o = tm.makeTimeSeries()
+ assert_series_equal(o, roundtrip('series', o))
+
+ o = tm.makeStringSeries()
+ assert_series_equal(o, roundtrip('string_series', o))
+
+ o = tm.makeDataFrame()
+ assert_frame_equal(o, roundtrip('frame', o))
+
+ with catch_warnings(record=True):
+
+ o = tm.makePanel()
+ assert_panel_equal(o, roundtrip('panel', o))
+
+ # table
+ df = DataFrame(dict(A=lrange(5), B=lrange(5)))
+ df.to_hdf(path, 'table', append=True)
+ result = read_hdf(path, 'table', where=['index>2'])
+ assert_frame_equal(df[df.index > 2], result)
+
+ finally:
+ safe_remove(path)
+
+ def test_long_strings(self):
+
+ # GH6166
+ df = DataFrame({'a': tm.rands_array(100, size=10)},
+ index=tm.rands_array(100, size=10))
+
+ with ensure_clean_store(self.path) as store:
+ store.append('df', df, data_columns=['a'])
+
+ result = store.select('df')
+ assert_frame_equal(df, result)
+
+ def test_api(self):
+
+ # GH4584
+        # API issue when to_hdf doesn't accept append AND format args
+ with ensure_clean_path(self.path) as path:
+
+ df = tm.makeDataFrame()
+ df.iloc[:10].to_hdf(path, 'df', append=True, format='table')
+ df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
+ assert_frame_equal(read_hdf(path, 'df'), df)
+
+ # append to False
+ df.iloc[:10].to_hdf(path, 'df', append=False, format='table')
+ df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
+ assert_frame_equal(read_hdf(path, 'df'), df)
+
+ with ensure_clean_path(self.path) as path:
+
+ df = tm.makeDataFrame()
+ df.iloc[:10].to_hdf(path, 'df', append=True)
+ df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
+ assert_frame_equal(read_hdf(path, 'df'), df)
+
+ # append to False
+ df.iloc[:10].to_hdf(path, 'df', append=False, format='table')
+ df.iloc[10:].to_hdf(path, 'df', append=True)
+ assert_frame_equal(read_hdf(path, 'df'), df)
+
+ with ensure_clean_path(self.path) as path:
+
+ df = tm.makeDataFrame()
+ df.to_hdf(path, 'df', append=False, format='fixed')
+ assert_frame_equal(read_hdf(path, 'df'), df)
+
+ df.to_hdf(path, 'df', append=False, format='f')
+ assert_frame_equal(read_hdf(path, 'df'), df)
+
+ df.to_hdf(path, 'df', append=False)
+ assert_frame_equal(read_hdf(path, 'df'), df)
+
+ df.to_hdf(path, 'df')
+ assert_frame_equal(read_hdf(path, 'df'), df)
+
+ with ensure_clean_store(self.path) as store:
+
+ path = store._path
+ df = tm.makeDataFrame()
+
+ _maybe_remove(store, 'df')
+ store.append('df', df.iloc[:10], append=True, format='table')
+ store.append('df', df.iloc[10:], append=True, format='table')
+ assert_frame_equal(store.select('df'), df)
+
+ # append to False
+ _maybe_remove(store, 'df')
+ store.append('df', df.iloc[:10], append=False, format='table')
+ store.append('df', df.iloc[10:], append=True, format='table')
+ assert_frame_equal(store.select('df'), df)
+
+ # formats
+ _maybe_remove(store, 'df')
+ store.append('df', df.iloc[:10], append=False, format='table')
+ store.append('df', df.iloc[10:], append=True, format='table')
+ assert_frame_equal(store.select('df'), df)
+
+ _maybe_remove(store, 'df')
+ store.append('df', df.iloc[:10], append=False, format='table')
+ store.append('df', df.iloc[10:], append=True, format=None)
+ assert_frame_equal(store.select('df'), df)
+
+ with ensure_clean_path(self.path) as path:
+
+ # invalid
+ df = tm.makeDataFrame()
+ pytest.raises(ValueError, df.to_hdf, path,
+ 'df', append=True, format='f')
+ pytest.raises(ValueError, df.to_hdf, path,
+ 'df', append=True, format='fixed')
+
+ pytest.raises(TypeError, df.to_hdf, path,
+ 'df', append=True, format='foo')
+ pytest.raises(TypeError, df.to_hdf, path,
+ 'df', append=False, format='bar')
+
+ # File path doesn't exist
+ path = ""
+ pytest.raises(compat.FileNotFoundError,
+ read_hdf, path, 'df')
+
+ def test_api_default_format(self):
+
+ # default_format option
+ with ensure_clean_store(self.path) as store:
+ df = tm.makeDataFrame()
+
+ pd.set_option('io.hdf.default_format', 'fixed')
+ _maybe_remove(store, 'df')
+ store.put('df', df)
+ assert not store.get_storer('df').is_table
+ pytest.raises(ValueError, store.append, 'df2', df)
+
+ pd.set_option('io.hdf.default_format', 'table')
+ _maybe_remove(store, 'df')
+ store.put('df', df)
+ assert store.get_storer('df').is_table
+ _maybe_remove(store, 'df2')
+ store.append('df2', df)
+ assert store.get_storer('df').is_table
+
+ pd.set_option('io.hdf.default_format', None)
+
+ with ensure_clean_path(self.path) as path:
+
+ df = tm.makeDataFrame()
+
+ pd.set_option('io.hdf.default_format', 'fixed')
+ df.to_hdf(path, 'df')
+ with HDFStore(path) as store:
+ assert not store.get_storer('df').is_table
+ pytest.raises(ValueError, df.to_hdf, path, 'df2', append=True)
+
+ pd.set_option('io.hdf.default_format', 'table')
+ df.to_hdf(path, 'df3')
+ with HDFStore(path) as store:
+ assert store.get_storer('df3').is_table
+ df.to_hdf(path, 'df4', append=True)
+ with HDFStore(path) as store:
+ assert store.get_storer('df4').is_table
+
+ pd.set_option('io.hdf.default_format', None)
+
+ def test_keys(self):
+
+ with ensure_clean_store(self.path) as store:
+ store['a'] = tm.makeTimeSeries()
+ store['b'] = tm.makeStringSeries()
+ store['c'] = tm.makeDataFrame()
+ with catch_warnings(record=True):
+ store['d'] = tm.makePanel()
+ store['foo/bar'] = tm.makePanel()
+ assert len(store) == 5
+ expected = {'/a', '/b', '/c', '/d', '/foo/bar'}
+ assert set(store.keys()) == expected
+ assert set(store) == expected
+
+ def test_keys_ignore_hdf_softlink(self):
+
+ # GH 20523
+ # Puts a softlink into HDF file and rereads
+
+ with ensure_clean_store(self.path) as store:
+
+ df = DataFrame(dict(A=lrange(5), B=lrange(5)))
+ store.put("df", df)
+
+ assert store.keys() == ["/df"]
+
+ store._handle.create_soft_link(store._handle.root, "symlink", "df")
+
+ # Should ignore the softlink
+ assert store.keys() == ["/df"]
+
+ def test_iter_empty(self):
+
+ with ensure_clean_store(self.path) as store:
+ # GH 12221
+ assert list(store) == []
+
+ def test_repr(self):
+
+ with ensure_clean_store(self.path) as store:
+ repr(store)
+ store.info()
+ store['a'] = tm.makeTimeSeries()
+ store['b'] = tm.makeStringSeries()
+ store['c'] = tm.makeDataFrame()
+
+ with catch_warnings(record=True):
+ store['d'] = tm.makePanel()
+ store['foo/bar'] = tm.makePanel()
+ store.append('e', tm.makePanel())
+
+ df = tm.makeDataFrame()
+ df['obj1'] = 'foo'
+ df['obj2'] = 'bar'
+ df['bool1'] = df['A'] > 0
+ df['bool2'] = df['B'] > 0
+ df['bool3'] = True
+ df['int1'] = 1
+ df['int2'] = 2
+ df['timestamp1'] = Timestamp('20010102')
+ df['timestamp2'] = Timestamp('20010103')
+ df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
+ df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
+ df.loc[3:6, ['obj1']] = np.nan
+ df = df._consolidate()._convert(datetime=True)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", pd.errors.PerformanceWarning)
+ store['df'] = df
+
+ # make a random group in hdf space
+ store._handle.create_group(store._handle.root, 'bah')
+
+ assert store.filename in repr(store)
+ assert store.filename in str(store)
+ store.info()
+
+ # storers
+ with ensure_clean_store(self.path) as store:
+
+ df = tm.makeDataFrame()
+ store.append('df', df)
+
+ s = store.get_storer('df')
+ repr(s)
+ str(s)
+
+ @ignore_natural_naming_warning
+ def test_contains(self):
+
+ with ensure_clean_store(self.path) as store:
+ store['a'] = tm.makeTimeSeries()
+ store['b'] = tm.makeDataFrame()
+ store['foo/bar'] = tm.makeDataFrame()
+ assert 'a' in store
+ assert 'b' in store
+ assert 'c' not in store
+ assert 'foo/bar' in store
+ assert '/foo/bar' in store
+ assert '/foo/b' not in store
+ assert 'bar' not in store
+
+ # gh-2694: tables.NaturalNameWarning
+ with catch_warnings(record=True):
+ store['node())'] = tm.makeDataFrame()
+ assert 'node())' in store
+
+ def test_versioning(self):
+
+ with ensure_clean_store(self.path) as store:
+ store['a'] = tm.makeTimeSeries()
+ store['b'] = tm.makeDataFrame()
+ df = tm.makeTimeDataFrame()
+ _maybe_remove(store, 'df1')
+ store.append('df1', df[:10])
+ store.append('df1', df[10:])
+ assert store.root.a._v_attrs.pandas_version == '0.15.2'
+ assert store.root.b._v_attrs.pandas_version == '0.15.2'
+ assert store.root.df1._v_attrs.pandas_version == '0.15.2'
+
+ # write a file and wipe its versioning
+ _maybe_remove(store, 'df2')
+ store.append('df2', df)
+
+ # this is an error because its table_type is appendable, but no
+ # version info
+ store.get_node('df2')._v_attrs.pandas_version = None
+ pytest.raises(Exception, store.select, 'df2')
+
+ def test_mode(self):
+
+ df = tm.makeTimeDataFrame()
+
+ def check(mode):
+
+ with ensure_clean_path(self.path) as path:
+
+ # constructor
+ if mode in ['r', 'r+']:
+ pytest.raises(IOError, HDFStore, path, mode=mode)
+
+ else:
+ store = HDFStore(path, mode=mode)
+ assert store._handle.mode == mode
+ store.close()
+
+ with ensure_clean_path(self.path) as path:
+
+ # context
+ if mode in ['r', 'r+']:
+ def f():
+ with HDFStore(path, mode=mode) as store: # noqa
+ pass
+ pytest.raises(IOError, f)
+ else:
+ with HDFStore(path, mode=mode) as store:
+ assert store._handle.mode == mode
+
+ with ensure_clean_path(self.path) as path:
+
+ # conv write
+ if mode in ['r', 'r+']:
+ pytest.raises(IOError, df.to_hdf,
+ path, 'df', mode=mode)
+ df.to_hdf(path, 'df', mode='w')
+ else:
+ df.to_hdf(path, 'df', mode=mode)
+
+ # conv read
+ if mode in ['w']:
+ pytest.raises(ValueError, read_hdf,
+ path, 'df', mode=mode)
+ else:
+ result = read_hdf(path, 'df', mode=mode)
+ assert_frame_equal(result, df)
+
+ def check_default_mode():
+
+ # read_hdf uses default mode
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', mode='w')
+ result = read_hdf(path, 'df')
+ assert_frame_equal(result, df)
+
+ check('r')
+ check('r+')
+ check('a')
+ check('w')
+ check_default_mode()
+
+ def test_reopen_handle(self):
+
+ with ensure_clean_path(self.path) as path:
+
+ store = HDFStore(path, mode='a')
+ store['a'] = tm.makeTimeSeries()
+
+ # invalid mode change
+ pytest.raises(PossibleDataLossError, store.open, 'w')
+ store.close()
+ assert not store.is_open
+
+ # truncation ok here
+ store.open('w')
+ assert store.is_open
+ assert len(store) == 0
+ store.close()
+ assert not store.is_open
+
+ store = HDFStore(path, mode='a')
+ store['a'] = tm.makeTimeSeries()
+
+ # reopen as read
+ store.open('r')
+ assert store.is_open
+ assert len(store) == 1
+ assert store._mode == 'r'
+ store.close()
+ assert not store.is_open
+
+ # reopen as append
+ store.open('a')
+ assert store.is_open
+ assert len(store) == 1
+ assert store._mode == 'a'
+ store.close()
+ assert not store.is_open
+
+ # reopen as append (again)
+ store.open('a')
+ assert store.is_open
+ assert len(store) == 1
+ assert store._mode == 'a'
+ store.close()
+ assert not store.is_open
+
+ def test_open_args(self):
+
+ with ensure_clean_path(self.path) as path:
+
+ df = tm.makeDataFrame()
+
+ # create an in memory store
+ store = HDFStore(path, mode='a', driver='H5FD_CORE',
+ driver_core_backing_store=0)
+ store['df'] = df
+ store.append('df2', df)
+
+ tm.assert_frame_equal(store['df'], df)
+ tm.assert_frame_equal(store['df2'], df)
+
+ store.close()
+
+ # the file should not have actually been written
+ assert not os.path.exists(path)
+
+ def test_flush(self):
+
+ with ensure_clean_store(self.path) as store:
+ store['a'] = tm.makeTimeSeries()
+ store.flush()
+ store.flush(fsync=True)
+
+ def test_get(self):
+
+ with ensure_clean_store(self.path) as store:
+ store['a'] = tm.makeTimeSeries()
+ left = store.get('a')
+ right = store['a']
+ tm.assert_series_equal(left, right)
+
+ left = store.get('/a')
+ right = store['/a']
+ tm.assert_series_equal(left, right)
+
+ pytest.raises(KeyError, store.get, 'b')
+
+ @pytest.mark.parametrize('where, expected', [
+ ('/', {
+ '': ({'first_group', 'second_group'}, set()),
+ '/first_group': (set(), {'df1', 'df2'}),
+ '/second_group': ({'third_group'}, {'df3', 's1'}),
+ '/second_group/third_group': (set(), {'df4'}),
+ }),
+ ('/second_group', {
+ '/second_group': ({'third_group'}, {'df3', 's1'}),
+ '/second_group/third_group': (set(), {'df4'}),
+ })
+ ])
+ def test_walk(self, where, expected):
+ # GH10143
+ objs = {
+ 'df1': pd.DataFrame([1, 2, 3]),
+ 'df2': pd.DataFrame([4, 5, 6]),
+ 'df3': pd.DataFrame([6, 7, 8]),
+ 'df4': pd.DataFrame([9, 10, 11]),
+ 's1': pd.Series([10, 9, 8]),
+ # Next 3 items aren't pandas objects and should be ignored
+ 'a1': np.array([[1, 2, 3], [4, 5, 6]]),
+ 'tb1': np.array([(1, 2, 3), (4, 5, 6)], dtype='i,i,i'),
+ 'tb2': np.array([(7, 8, 9), (10, 11, 12)], dtype='i,i,i')
+ }
+
+ with ensure_clean_store('walk_groups.hdf', mode='w') as store:
+ store.put('/first_group/df1', objs['df1'])
+ store.put('/first_group/df2', objs['df2'])
+ store.put('/second_group/df3', objs['df3'])
+ store.put('/second_group/s1', objs['s1'])
+ store.put('/second_group/third_group/df4', objs['df4'])
+ # Create non-pandas objects
+ store._handle.create_array('/first_group', 'a1', objs['a1'])
+ store._handle.create_table('/first_group', 'tb1', obj=objs['tb1'])
+ store._handle.create_table('/second_group', 'tb2', obj=objs['tb2'])
+
+ assert len(list(store.walk(where=where))) == len(expected)
+ for path, groups, leaves in store.walk(where=where):
+ assert path in expected
+ expected_groups, expected_frames = expected[path]
+ assert expected_groups == set(groups)
+ assert expected_frames == set(leaves)
+ for leaf in leaves:
+ frame_path = '/'.join([path, leaf])
+ obj = store.get(frame_path)
+ if 'df' in leaf:
+ tm.assert_frame_equal(obj, objs[leaf])
+ else:
+ tm.assert_series_equal(obj, objs[leaf])
+
+ def test_getattr(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ s = tm.makeTimeSeries()
+ store['a'] = s
+
+ # test attribute access
+ result = store.a
+ tm.assert_series_equal(result, s)
+ result = getattr(store, 'a')
+ tm.assert_series_equal(result, s)
+
+ df = tm.makeTimeDataFrame()
+ store['df'] = df
+ result = store.df
+ tm.assert_frame_equal(result, df)
+
+ # errors
+ pytest.raises(AttributeError, getattr, store, 'd')
+
+ for x in ['mode', 'path', 'handle', 'complib']:
+ pytest.raises(AttributeError, getattr, store, x)
+
+ # not stores
+ for x in ['mode', 'path', 'handle', 'complib']:
+ getattr(store, "_%s" % x)
+
+ def test_put(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ ts = tm.makeTimeSeries()
+ df = tm.makeTimeDataFrame()
+ store['a'] = ts
+ store['b'] = df[:10]
+ store['foo/bar/bah'] = df[:10]
+ store['foo'] = df[:10]
+ store['/foo'] = df[:10]
+ store.put('c', df[:10], format='table')
+
+ # not OK, not a table
+ pytest.raises(
+ ValueError, store.put, 'b', df[10:], append=True)
+
+ # node does not currently exist, test _is_table_type returns False
+ # in this case
+ # _maybe_remove(store, 'f')
+ # pytest.raises(ValueError, store.put, 'f', df[10:],
+ # append=True)
+
+ # can't put to a table (use append instead)
+ pytest.raises(ValueError, store.put, 'c', df[10:], append=True)
+
+ # overwrite table
+ store.put('c', df[:10], format='table', append=False)
+ tm.assert_frame_equal(df[:10], store['c'])
+
+ def test_put_string_index(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ index = Index(
+ ["I am a very long string index: %s" % i for i in range(20)])
+ s = Series(np.arange(20), index=index)
+ df = DataFrame({'A': s, 'B': s})
+
+ store['a'] = s
+ tm.assert_series_equal(store['a'], s)
+
+ store['b'] = df
+ tm.assert_frame_equal(store['b'], df)
+
+ # mixed length
+ index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] +
+ ["I am a very long string index: %s" % i
+ for i in range(20)])
+ s = Series(np.arange(21), index=index)
+ df = DataFrame({'A': s, 'B': s})
+ store['a'] = s
+ tm.assert_series_equal(store['a'], s)
+
+ store['b'] = df
+ tm.assert_frame_equal(store['b'], df)
+
+ def test_put_compression(self):
+
+ with ensure_clean_store(self.path) as store:
+ df = tm.makeTimeDataFrame()
+
+ store.put('c', df, format='table', complib='zlib')
+ tm.assert_frame_equal(store['c'], df)
+
+ # can't compress if format='fixed'
+ pytest.raises(ValueError, store.put, 'b', df,
+ format='fixed', complib='zlib')
+
+ @td.skip_if_windows_python_3
+ def test_put_compression_blosc(self):
+ df = tm.makeTimeDataFrame()
+
+ with ensure_clean_store(self.path) as store:
+
+ # can't compress if format='fixed'
+ pytest.raises(ValueError, store.put, 'b', df,
+ format='fixed', complib='blosc')
+
+ store.put('c', df, format='table', complib='blosc')
+ tm.assert_frame_equal(store['c'], df)
+
+ def test_complibs_default_settings(self):
+ # GH15943
+ df = tm.makeDataFrame()
+
+ # Set complevel and check if complib is automatically set to
+ # default value
+ with ensure_clean_path(self.path) as tmpfile:
+ df.to_hdf(tmpfile, 'df', complevel=9)
+ result = pd.read_hdf(tmpfile, 'df')
+ tm.assert_frame_equal(result, df)
+
+ with tables.open_file(tmpfile, mode='r') as h5file:
+ for node in h5file.walk_nodes(where='/df', classname='Leaf'):
+ assert node.filters.complevel == 9
+ assert node.filters.complib == 'zlib'
+
+ # Set complib and check to see if compression is disabled
+ with ensure_clean_path(self.path) as tmpfile:
+ df.to_hdf(tmpfile, 'df', complib='zlib')
+ result = pd.read_hdf(tmpfile, 'df')
+ tm.assert_frame_equal(result, df)
+
+ with tables.open_file(tmpfile, mode='r') as h5file:
+ for node in h5file.walk_nodes(where='/df', classname='Leaf'):
+ assert node.filters.complevel == 0
+ assert node.filters.complib is None
+
+ # Check if not setting complib or complevel results in no compression
+ with ensure_clean_path(self.path) as tmpfile:
+ df.to_hdf(tmpfile, 'df')
+ result = pd.read_hdf(tmpfile, 'df')
+ tm.assert_frame_equal(result, df)
+
+ with tables.open_file(tmpfile, mode='r') as h5file:
+ for node in h5file.walk_nodes(where='/df', classname='Leaf'):
+ assert node.filters.complevel == 0
+ assert node.filters.complib is None
+
+ # Check if file-defaults can be overridden on a per table basis
+ with ensure_clean_path(self.path) as tmpfile:
+ store = pd.HDFStore(tmpfile)
+ store.append('dfc', df, complevel=9, complib='blosc')
+ store.append('df', df)
+ store.close()
+
+ with tables.open_file(tmpfile, mode='r') as h5file:
+ for node in h5file.walk_nodes(where='/df', classname='Leaf'):
+ assert node.filters.complevel == 0
+ assert node.filters.complib is None
+ for node in h5file.walk_nodes(where='/dfc', classname='Leaf'):
+ assert node.filters.complevel == 9
+ assert node.filters.complib == 'blosc'
+
+ def test_complibs(self):
+ # GH14478
+ df = tm.makeDataFrame()
+
+ # Building list of all complibs and complevels tuples
+ all_complibs = tables.filters.all_complibs
+ # Remove lzo if its not available on this platform
+ if not tables.which_lib_version('lzo'):
+ all_complibs.remove('lzo')
+ # Remove bzip2 if its not available on this platform
+ if not tables.which_lib_version("bzip2"):
+ all_complibs.remove("bzip2")
+
+ all_levels = range(0, 10)
+ all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels]
+
+ for (lib, lvl) in all_tests:
+ with ensure_clean_path(self.path) as tmpfile:
+ gname = 'foo'
+
+ # Write and read file to see if data is consistent
+ df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl)
+ result = pd.read_hdf(tmpfile, gname)
+ tm.assert_frame_equal(result, df)
+
+ # Open file and check metadata
+ # for correct amount of compression
+ h5table = tables.open_file(tmpfile, mode='r')
+ for node in h5table.walk_nodes(where='/' + gname,
+ classname='Leaf'):
+ assert node.filters.complevel == lvl
+ if lvl == 0:
+ assert node.filters.complib is None
+ else:
+ assert node.filters.complib == lib
+ h5table.close()
+
+ def test_put_integer(self):
+ # non-date, non-string index
+ df = DataFrame(np.random.randn(50, 100))
+ self._check_roundtrip(df, tm.assert_frame_equal)
+
+ @xfail_non_writeable
+ def test_put_mixed_type(self):
+ df = tm.makeTimeDataFrame()
+ df['obj1'] = 'foo'
+ df['obj2'] = 'bar'
+ df['bool1'] = df['A'] > 0
+ df['bool2'] = df['B'] > 0
+ df['bool3'] = True
+ df['int1'] = 1
+ df['int2'] = 2
+ df['timestamp1'] = Timestamp('20010102')
+ df['timestamp2'] = Timestamp('20010103')
+ df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
+ df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
+ df.loc[3:6, ['obj1']] = np.nan
+ df = df._consolidate()._convert(datetime=True)
+
+ with ensure_clean_store(self.path) as store:
+ _maybe_remove(store, 'df')
+
+ # PerformanceWarning
+ with catch_warnings(record=True):
+ simplefilter("ignore", pd.errors.PerformanceWarning)
+ store.put('df', df)
+
+ expected = store.get('df')
+ tm.assert_frame_equal(expected, df)
+
+ @pytest.mark.filterwarnings(
+ "ignore:object name:tables.exceptions.NaturalNameWarning"
+ )
+ def test_append(self):
+
+ with ensure_clean_store(self.path) as store:
+
+            # this is allowed but you almost always don't want to do it
+            # (tables.NaturalNameWarning)
+ with catch_warnings(record=True):
+
+ df = tm.makeTimeDataFrame()
+ _maybe_remove(store, 'df1')
+ store.append('df1', df[:10])
+ store.append('df1', df[10:])
+ tm.assert_frame_equal(store['df1'], df)
+
+ _maybe_remove(store, 'df2')
+ store.put('df2', df[:10], format='table')
+ store.append('df2', df[10:])
+ tm.assert_frame_equal(store['df2'], df)
+
+ _maybe_remove(store, 'df3')
+ store.append('/df3', df[:10])
+ store.append('/df3', df[10:])
+ tm.assert_frame_equal(store['df3'], df)
+
+                # this is allowed but you almost always don't want to do it
+                # (tables.NaturalNameWarning)
+ _maybe_remove(store, '/df3 foo')
+ store.append('/df3 foo', df[:10])
+ store.append('/df3 foo', df[10:])
+ tm.assert_frame_equal(store['df3 foo'], df)
+
+ # panel
+ wp = tm.makePanel()
+ _maybe_remove(store, 'wp1')
+ store.append('wp1', wp.iloc[:, :10, :])
+ store.append('wp1', wp.iloc[:, 10:, :])
+ assert_panel_equal(store['wp1'], wp)
+
+                # test using a different order of items on the non-index axes
+ _maybe_remove(store, 'wp1')
+ wp_append1 = wp.iloc[:, :10, :]
+ store.append('wp1', wp_append1)
+ wp_append2 = wp.iloc[:, 10:, :].reindex(items=wp.items[::-1])
+ store.append('wp1', wp_append2)
+ assert_panel_equal(store['wp1'], wp)
+
+                # dtype issues - mixed type in a single object column
+ df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
+ df['mixed_column'] = 'testing'
+ df.loc[2, 'mixed_column'] = np.nan
+ _maybe_remove(store, 'df')
+ store.append('df', df)
+ tm.assert_frame_equal(store['df'], df)
+
+ # uints - test storage of uints
+ uint_data = DataFrame({
+ 'u08': Series(np.random.randint(0, high=255, size=5),
+ dtype=np.uint8),
+ 'u16': Series(np.random.randint(0, high=65535, size=5),
+ dtype=np.uint16),
+ 'u32': Series(np.random.randint(0, high=2**30, size=5),
+ dtype=np.uint32),
+ 'u64': Series([2**58, 2**59, 2**60, 2**61, 2**62],
+ dtype=np.uint64)}, index=np.arange(5))
+ _maybe_remove(store, 'uints')
+ store.append('uints', uint_data)
+ tm.assert_frame_equal(store['uints'], uint_data)
+
+ # uints - test storage of uints in indexable columns
+ _maybe_remove(store, 'uints')
+ # 64-bit indices not yet supported
+ store.append('uints', uint_data, data_columns=[
+ 'u08', 'u16', 'u32'])
+ tm.assert_frame_equal(store['uints'], uint_data)
+
+ def test_append_series(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ # basic
+ ss = tm.makeStringSeries()
+ ts = tm.makeTimeSeries()
+ ns = Series(np.arange(100))
+
+ store.append('ss', ss)
+ result = store['ss']
+ tm.assert_series_equal(result, ss)
+ assert result.name is None
+
+ store.append('ts', ts)
+ result = store['ts']
+ tm.assert_series_equal(result, ts)
+ assert result.name is None
+
+ ns.name = 'foo'
+ store.append('ns', ns)
+ result = store['ns']
+ tm.assert_series_equal(result, ns)
+ assert result.name == ns.name
+
+ # select on the values
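+            # (the Series was stored with name 'foo', so its values are
+            # addressable as 'foo' in the where clause)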
+ expected = ns[ns > 60]
+ result = store.select('ns', 'foo>60')
+ tm.assert_series_equal(result, expected)
+
+ # select on the index and values
+ expected = ns[(ns > 70) & (ns.index < 90)]
+ result = store.select('ns', 'foo>70 and index<90')
+ tm.assert_series_equal(result, expected)
+
+ # multi-index
+ mi = DataFrame(np.random.randn(5, 1), columns=['A'])
+ mi['B'] = np.arange(len(mi))
+ mi['C'] = 'foo'
+ mi.loc[3:5, 'C'] = 'bar'
+ mi.set_index(['C', 'B'], inplace=True)
+ s = mi.stack()
+ s.index = s.index.droplevel(2)
+ store.append('mi', s)
+ tm.assert_series_equal(store['mi'], s)
+
+ def test_store_index_types(self):
+ # GH5386
+ # test storing various index types
+
+ with ensure_clean_store(self.path) as store:
+
+ def check(format, index):
+ df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
+ df.index = index(len(df))
+
+ _maybe_remove(store, 'df')
+ store.put('df', df, format=format)
+ assert_frame_equal(df, store['df'])
+
+ for index in [tm.makeFloatIndex, tm.makeStringIndex,
+ tm.makeIntIndex, tm.makeDateIndex]:
+
+ check('table', index)
+ check('fixed', index)
+
+ # period index currently broken for table
+            # see GH7796 FIXME
+ check('fixed', tm.makePeriodIndex)
+ # check('table',tm.makePeriodIndex)
+
+ # unicode
+ index = tm.makeUnicodeIndex
+ if compat.PY3:
+ check('table', index)
+ check('fixed', index)
+ else:
+
+                # only supported for the fixed format (and it emits a
+                # PerformanceWarning)
+ pytest.raises(TypeError, check, 'table', index)
+
+ # PerformanceWarning
+ with catch_warnings(record=True):
+ simplefilter("ignore", pd.errors.PerformanceWarning)
+ check('fixed', index)
+
+ @pytest.mark.skipif(not is_platform_little_endian(),
+ reason="reason platform is not little endian")
+ def test_encoding(self):
+
+ with ensure_clean_store(self.path) as store:
+ df = DataFrame(dict(A='foo', B='bar'), index=range(5))
+ df.loc[2, 'A'] = np.nan
+ df.loc[3, 'B'] = np.nan
+ _maybe_remove(store, 'df')
+ store.append('df', df, encoding='ascii')
+ tm.assert_frame_equal(store['df'], df)
+
+ expected = df.reindex(columns=['A'])
+ result = store.select('df', Term('columns=A', encoding='ascii'))
+ tm.assert_frame_equal(result, expected)
+
+ def test_latin_encoding(self):
+
+ if compat.PY2:
+ pytest.skip("[unicode] is not implemented as a table column")
+
+ values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
+ [b'E\xc9, 17', b'a', b'b', b'c'],
+ [b'EE, 17', b'', b'a', b'b', b'c'],
+ [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
+ [b'', b'a', b'b', b'c'],
+ [b'\xf8\xfc', b'a', b'b', b'c'],
+ [b'A\xf8\xfc', b'', b'a', b'b', b'c'],
+ [np.nan, b'', b'b', b'c'],
+ [b'A\xf8\xfc', np.nan, b'', b'b', b'c']]
+
+ def _try_decode(x, encoding='latin-1'):
+ try:
+ return x.decode(encoding)
+ except AttributeError:
+ return x
+ # not sure how to remove latin-1 from code in python 2 and 3
+ values = [[_try_decode(x) for x in y] for y in values]
+
+ examples = []
+ for dtype in ['category', object]:
+ for val in values:
+ examples.append(pd.Series(val, dtype=dtype))
+
+ def roundtrip(s, key='data', encoding='latin-1', nan_rep=''):
+ with ensure_clean_path(self.path) as store:
+ s.to_hdf(store, key, format='table', encoding=encoding,
+ nan_rep=nan_rep)
+ retr = read_hdf(store, key)
+ s_nan = s.replace(nan_rep, np.nan)
+ if is_categorical_dtype(s_nan):
+ assert is_categorical_dtype(retr)
+ assert_series_equal(s_nan, retr, check_dtype=False,
+ check_categorical=False)
+ else:
+ assert_series_equal(s_nan, retr)
+
+ for s in examples:
+ roundtrip(s)
+
+ # fails:
+        # for s in examples:
+ # roundtrip(s, nan_rep=b'\xf8\xfc')
+
+ def test_append_some_nans(self):
+
+ with ensure_clean_store(self.path) as store:
+ df = DataFrame({'A': Series(np.random.randn(20)).astype('int32'),
+ 'A1': np.random.randn(20),
+ 'A2': np.random.randn(20),
+ 'B': 'foo', 'C': 'bar',
+ 'D': Timestamp("20010101"),
+ 'E': datetime.datetime(2001, 1, 2, 0, 0)},
+ index=np.arange(20))
+ # some nans
+ _maybe_remove(store, 'df1')
+ df.loc[0:15, ['A1', 'B', 'D', 'E']] = np.nan
+ store.append('df1', df[:10])
+ store.append('df1', df[10:])
+ tm.assert_frame_equal(store['df1'], df)
+
+ # first column
+ df1 = df.copy()
+ df1.loc[:, 'A1'] = np.nan
+ _maybe_remove(store, 'df1')
+ store.append('df1', df1[:10])
+ store.append('df1', df1[10:])
+ tm.assert_frame_equal(store['df1'], df1)
+
+ # 2nd column
+ df2 = df.copy()
+ df2.loc[:, 'A2'] = np.nan
+ _maybe_remove(store, 'df2')
+ store.append('df2', df2[:10])
+ store.append('df2', df2[10:])
+ tm.assert_frame_equal(store['df2'], df2)
+
+ # datetimes
+ df3 = df.copy()
+ df3.loc[:, 'E'] = np.nan
+ _maybe_remove(store, 'df3')
+ store.append('df3', df3[:10])
+ store.append('df3', df3[10:])
+ tm.assert_frame_equal(store['df3'], df3)
+
+ def test_append_all_nans(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ df = DataFrame({'A1': np.random.randn(20),
+ 'A2': np.random.randn(20)},
+ index=np.arange(20))
+ df.loc[0:15, :] = np.nan
+
+ # nan some entire rows (dropna=True)
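+            # rows 0-15 are entirely NaN, so with dropna=True only the
+            # last 4 rows are written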
+ _maybe_remove(store, 'df')
+ store.append('df', df[:10], dropna=True)
+ store.append('df', df[10:], dropna=True)
+ tm.assert_frame_equal(store['df'], df[-4:])
+
+ # nan some entire rows (dropna=False)
+ _maybe_remove(store, 'df2')
+ store.append('df2', df[:10], dropna=False)
+ store.append('df2', df[10:], dropna=False)
+ tm.assert_frame_equal(store['df2'], df)
+
+ # tests the option io.hdf.dropna_table
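+            # (the option provides the default for append's dropna keyword)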
+ pd.set_option('io.hdf.dropna_table', False)
+ _maybe_remove(store, 'df3')
+ store.append('df3', df[:10])
+ store.append('df3', df[10:])
+ tm.assert_frame_equal(store['df3'], df)
+
+ pd.set_option('io.hdf.dropna_table', True)
+ _maybe_remove(store, 'df4')
+ store.append('df4', df[:10])
+ store.append('df4', df[10:])
+ tm.assert_frame_equal(store['df4'], df[-4:])
+
+            # nan some entire rows (strings are still written!)
+ df = DataFrame({'A1': np.random.randn(20),
+ 'A2': np.random.randn(20),
+ 'B': 'foo', 'C': 'bar'},
+ index=np.arange(20))
+
+ df.loc[0:15, :] = np.nan
+
+ _maybe_remove(store, 'df')
+ store.append('df', df[:10], dropna=True)
+ store.append('df', df[10:], dropna=True)
+ tm.assert_frame_equal(store['df'], df)
+
+ _maybe_remove(store, 'df2')
+ store.append('df2', df[:10], dropna=False)
+ store.append('df2', df[10:], dropna=False)
+ tm.assert_frame_equal(store['df2'], df)
+
+ # nan some entire rows (but since we have dates they are still
+ # written!)
+ df = DataFrame({'A1': np.random.randn(20),
+ 'A2': np.random.randn(20),
+ 'B': 'foo', 'C': 'bar',
+ 'D': Timestamp("20010101"),
+ 'E': datetime.datetime(2001, 1, 2, 0, 0)},
+ index=np.arange(20))
+
+ df.loc[0:15, :] = np.nan
+
+ _maybe_remove(store, 'df')
+ store.append('df', df[:10], dropna=True)
+ store.append('df', df[10:], dropna=True)
+ tm.assert_frame_equal(store['df'], df)
+
+ _maybe_remove(store, 'df2')
+ store.append('df2', df[:10], dropna=False)
+ store.append('df2', df[10:], dropna=False)
+ tm.assert_frame_equal(store['df2'], df)
+
+        # Test to make sure the default is to not drop rows.
+        # Corresponding to GH 9382
+ df_with_missing = DataFrame(
+ {'col1': [0, np.nan, 2], 'col2': [1, np.nan, np.nan]})
+
+ with ensure_clean_path(self.path) as path:
+ df_with_missing.to_hdf(path, 'df_with_missing', format='table')
+ reloaded = read_hdf(path, 'df_with_missing')
+ tm.assert_frame_equal(df_with_missing, reloaded)
+
+ matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]],
+ [[np.nan, np.nan, np.nan], [np.nan, 5, 6]],
+ [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]]
+
+ with catch_warnings(record=True):
+ panel_with_missing = Panel(matrix,
+ items=['Item1', 'Item2', 'Item3'],
+ major_axis=[1, 2],
+ minor_axis=['A', 'B', 'C'])
+
+ with ensure_clean_path(self.path) as path:
+ panel_with_missing.to_hdf(
+ path, 'panel_with_missing', format='table')
+ reloaded_panel = read_hdf(path, 'panel_with_missing')
+ tm.assert_panel_equal(panel_with_missing, reloaded_panel)
+
+ def test_append_frame_column_oriented(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ # column oriented
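+            # appending with axes=['columns'] extends the stored frame
+            # column-wise instead of row-wise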
+ df = tm.makeTimeDataFrame()
+ _maybe_remove(store, 'df1')
+ store.append('df1', df.iloc[:, :2], axes=['columns'])
+ store.append('df1', df.iloc[:, 2:])
+ tm.assert_frame_equal(store['df1'], df)
+
+ result = store.select('df1', 'columns=A')
+ expected = df.reindex(columns=['A'])
+ tm.assert_frame_equal(expected, result)
+
+ # selection on the non-indexable
+ result = store.select(
+ 'df1', ('columns=A', 'index=df.index[0:4]'))
+ expected = df.reindex(columns=['A'], index=df.index[0:4])
+ tm.assert_frame_equal(expected, result)
+
+ # this isn't supported
+ with pytest.raises(TypeError):
+ store.select('df1',
+ 'columns=A and index>df.index[4]')
+
+ def test_append_with_different_block_ordering(self):
+
+ # GH 4096; using same frames, but different block orderings
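+        # deleting and re-adding columns reorders the internal blocks
+        # while leaving the frame's contents unchanged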
+ with ensure_clean_store(self.path) as store:
+
+ for i in range(10):
+
+ df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
+ df['index'] = range(10)
+ df['index'] += i * 10
+ df['int64'] = Series([1] * len(df), dtype='int64')
+ df['int16'] = Series([1] * len(df), dtype='int16')
+
+ if i % 2 == 0:
+ del df['int64']
+ df['int64'] = Series([1] * len(df), dtype='int64')
+ if i % 3 == 0:
+ a = df.pop('A')
+ df['A'] = a
+
+ df.set_index('index', inplace=True)
+
+ store.append('df', df)
+
+        # test a different ordering but with more fields (like an invalid
+        # combination)
+ with ensure_clean_store(self.path) as store:
+
+ df = DataFrame(np.random.randn(10, 2),
+ columns=list('AB'), dtype='float64')
+ df['int64'] = Series([1] * len(df), dtype='int64')
+ df['int16'] = Series([1] * len(df), dtype='int16')
+ store.append('df', df)
+
+ # store additional fields in different blocks
+ df['int16_2'] = Series([1] * len(df), dtype='int16')
+ pytest.raises(ValueError, store.append, 'df', df)
+
+            # store multiple additional fields in different blocks
+ df['float_3'] = Series([1.] * len(df), dtype='float64')
+ pytest.raises(ValueError, store.append, 'df', df)
+
+ def test_append_with_strings(self):
+
+ with ensure_clean_store(self.path) as store:
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ wp = tm.makePanel()
+ wp2 = wp.rename(
+ minor_axis={x: "%s_extra" % x for x in wp.minor_axis})
+
+ def check_col(key, name, size):
+ assert getattr(store.get_storer(key)
+ .table.description, name).itemsize == size
+
+ store.append('s1', wp, min_itemsize=20)
+ store.append('s1', wp2)
+ expected = concat([wp, wp2], axis=2)
+ expected = expected.reindex(
+ minor_axis=sorted(expected.minor_axis))
+ assert_panel_equal(store['s1'], expected)
+ check_col('s1', 'minor_axis', 20)
+
+ # test dict format
+ store.append('s2', wp, min_itemsize={'minor_axis': 20})
+ store.append('s2', wp2)
+ expected = concat([wp, wp2], axis=2)
+ expected = expected.reindex(
+ minor_axis=sorted(expected.minor_axis))
+ assert_panel_equal(store['s2'], expected)
+ check_col('s2', 'minor_axis', 20)
+
+                # apply min_itemsize to the wrong field (similar to the
+                # first case above)
+ store.append('s3', wp, min_itemsize={'major_axis': 20})
+ pytest.raises(ValueError, store.append, 's3', wp2)
+
+ # test truncation of bigger strings
+ store.append('s4', wp)
+ pytest.raises(ValueError, store.append, 's4', wp2)
+
+ # avoid truncation on elements
+ df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
+ store.append('df_big', df)
+ tm.assert_frame_equal(store.select('df_big'), df)
+ check_col('df_big', 'values_block_1', 15)
+
+ # appending smaller string ok
+ df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']])
+ store.append('df_big', df2)
+ expected = concat([df, df2])
+ tm.assert_frame_equal(store.select('df_big'), expected)
+ check_col('df_big', 'values_block_1', 15)
+
+ # avoid truncation on elements
+ df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
+ store.append('df_big2', df, min_itemsize={'values': 50})
+ tm.assert_frame_equal(store.select('df_big2'), df)
+ check_col('df_big2', 'values_block_1', 50)
+
+ # bigger string on next append
+ store.append('df_new', df)
+ df_new = DataFrame(
+ [[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']])
+ pytest.raises(ValueError, store.append, 'df_new', df_new)
+
+ # min_itemsize on Series index (GH 11412)
+ df = tm.makeMixedDataFrame().set_index('C')
+ store.append('ss', df['B'], min_itemsize={'index': 4})
+ tm.assert_series_equal(store.select('ss'), df['B'])
+
+ # same as above, with data_columns=True
+ store.append('ss2', df['B'], data_columns=True,
+ min_itemsize={'index': 4})
+ tm.assert_series_equal(store.select('ss2'), df['B'])
+
+ # min_itemsize in index without appending (GH 10381)
+ store.put('ss3', df, format='table',
+ min_itemsize={'index': 6})
+ # just make sure there is a longer string:
+ df2 = df.copy().reset_index().assign(C='longer').set_index('C')
+ store.append('ss3', df2)
+ tm.assert_frame_equal(store.select('ss3'),
+ pd.concat([df, df2]))
+
+ # same as above, with a Series
+ store.put('ss4', df['B'], format='table',
+ min_itemsize={'index': 6})
+ store.append('ss4', df2['B'])
+ tm.assert_series_equal(store.select('ss4'),
+ pd.concat([df['B'], df2['B']]))
+
+ # with nans
+ _maybe_remove(store, 'df')
+ df = tm.makeTimeDataFrame()
+ df['string'] = 'foo'
+ df.loc[1:4, 'string'] = np.nan
+ df['string2'] = 'bar'
+ df.loc[4:8, 'string2'] = np.nan
+ df['string3'] = 'bah'
+ df.loc[1:, 'string3'] = np.nan
+ store.append('df', df)
+ result = store.select('df')
+ tm.assert_frame_equal(result, df)
+
+ with ensure_clean_store(self.path) as store:
+
+ def check_col(key, name, size):
+                assert getattr(store.get_storer(key)
+                               .table.description, name).itemsize == size
+
+ df = DataFrame(dict(A='foo', B='bar'), index=range(10))
+
+ # a min_itemsize that creates a data_column
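+            # (naming a column in min_itemsize implicitly promotes it
+            # to a data_column)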
+ _maybe_remove(store, 'df')
+ store.append('df', df, min_itemsize={'A': 200})
+ check_col('df', 'A', 200)
+ assert store.get_storer('df').data_columns == ['A']
+
+            # a min_itemsize key that implicitly adds a second data_column
+ _maybe_remove(store, 'df')
+ store.append('df', df, data_columns=['B'], min_itemsize={'A': 200})
+ check_col('df', 'A', 200)
+ assert store.get_storer('df').data_columns == ['B', 'A']
+
+            # min_itemsize on the values block, with an explicit data_column
+ _maybe_remove(store, 'df')
+ store.append('df', df, data_columns=[
+ 'B'], min_itemsize={'values': 200})
+ check_col('df', 'B', 200)
+ check_col('df', 'values_block_0', 200)
+ assert store.get_storer('df').data_columns == ['B']
+
+ # infer the .typ on subsequent appends
+ _maybe_remove(store, 'df')
+ store.append('df', df[:5], min_itemsize=200)
+ store.append('df', df[5:], min_itemsize=200)
+ tm.assert_frame_equal(store['df'], df)
+
+ # invalid min_itemsize keys
+ df = DataFrame(['foo', 'foo', 'foo', 'barh',
+ 'barh', 'barh'], columns=['A'])
+ _maybe_remove(store, 'df')
+ pytest.raises(ValueError, store.append, 'df',
+ df, min_itemsize={'foo': 20, 'foobar': 20})
+
+ def test_append_with_empty_string(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ # with all empty strings (GH 12242)
+ df = DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', '']})
+ store.append('df', df[:-1], min_itemsize={'x': 1})
+ store.append('df', df[-1:], min_itemsize={'x': 1})
+ tm.assert_frame_equal(store.select('df'), df)
+
+ def test_to_hdf_with_min_itemsize(self):
+
+ with ensure_clean_path(self.path) as path:
+
+ # min_itemsize in index with to_hdf (GH 10381)
+ df = tm.makeMixedDataFrame().set_index('C')
+ df.to_hdf(path, 'ss3', format='table', min_itemsize={'index': 6})
+ # just make sure there is a longer string:
+ df2 = df.copy().reset_index().assign(C='longer').set_index('C')
+ df2.to_hdf(path, 'ss3', append=True, format='table')
+ tm.assert_frame_equal(pd.read_hdf(path, 'ss3'),
+ pd.concat([df, df2]))
+
+ # same as above, with a Series
+ df['B'].to_hdf(path, 'ss4', format='table',
+ min_itemsize={'index': 6})
+ df2['B'].to_hdf(path, 'ss4', append=True, format='table')
+ tm.assert_series_equal(pd.read_hdf(path, 'ss4'),
+ pd.concat([df['B'], df2['B']]))
+
+ @pytest.mark.parametrize(
+ "format",
+ [pytest.param('fixed', marks=xfail_non_writeable),
+ 'table'])
+ def test_to_hdf_errors(self, format):
+
+ data = ['\ud800foo']
+ ser = pd.Series(data, index=pd.Index(data))
+ with ensure_clean_path(self.path) as path:
+ # GH 20835
+ ser.to_hdf(path, 'table', format=format, errors='surrogatepass')
+
+ result = pd.read_hdf(path, 'table', errors='surrogatepass')
+ tm.assert_series_equal(result, ser)
+
+ def test_append_with_data_columns(self):
+
+ with ensure_clean_store(self.path) as store:
+ df = tm.makeTimeDataFrame()
+ df.iloc[0, df.columns.get_loc('B')] = 1.
+ _maybe_remove(store, 'df')
+ store.append('df', df[:2], data_columns=['B'])
+ store.append('df', df[2:])
+ tm.assert_frame_equal(store['df'], df)
+
+ # check that we have indices created
+ assert(store._handle.root.df.table.cols.index.is_indexed is True)
+ assert(store._handle.root.df.table.cols.B.is_indexed is True)
+
+ # data column searching
+ result = store.select('df', 'B>0')
+ expected = df[df.B > 0]
+ tm.assert_frame_equal(result, expected)
+
+ # data column searching (with an indexable and a data_columns)
+ result = store.select(
+ 'df', 'B>0 and index>df.index[3]')
+ df_new = df.reindex(index=df.index[4:])
+ expected = df_new[df_new.B > 0]
+ tm.assert_frame_equal(result, expected)
+
+ # data column selection with a string data_column
+ df_new = df.copy()
+ df_new['string'] = 'foo'
+ df_new.loc[1:4, 'string'] = np.nan
+ df_new.loc[5:6, 'string'] = 'bar'
+ _maybe_remove(store, 'df')
+ store.append('df', df_new, data_columns=['string'])
+ result = store.select('df', "string='foo'")
+ expected = df_new[df_new.string == 'foo']
+ tm.assert_frame_equal(result, expected)
+
+ # using min_itemsize and a data column
+ def check_col(key, name, size):
+ assert getattr(store.get_storer(key)
+ .table.description, name).itemsize == size
+
+ with ensure_clean_store(self.path) as store:
+ _maybe_remove(store, 'df')
+ store.append('df', df_new, data_columns=['string'],
+ min_itemsize={'string': 30})
+ check_col('df', 'string', 30)
+ _maybe_remove(store, 'df')
+ store.append(
+ 'df', df_new, data_columns=['string'], min_itemsize=30)
+ check_col('df', 'string', 30)
+ _maybe_remove(store, 'df')
+ store.append('df', df_new, data_columns=['string'],
+ min_itemsize={'values': 30})
+ check_col('df', 'string', 30)
+
+ with ensure_clean_store(self.path) as store:
+ df_new['string2'] = 'foobarbah'
+ df_new['string_block1'] = 'foobarbah1'
+ df_new['string_block2'] = 'foobarbah2'
+ _maybe_remove(store, 'df')
+ store.append('df', df_new, data_columns=['string', 'string2'],
+ min_itemsize={'string': 30, 'string2': 40,
+ 'values': 50})
+ check_col('df', 'string', 30)
+ check_col('df', 'string2', 40)
+ check_col('df', 'values_block_1', 50)
+
+ with ensure_clean_store(self.path) as store:
+ # multiple data columns
+ df_new = df.copy()
+ df_new.iloc[0, df_new.columns.get_loc('A')] = 1.
+ df_new.iloc[0, df_new.columns.get_loc('B')] = -1.
+ df_new['string'] = 'foo'
+
+ sl = df_new.columns.get_loc('string')
+ df_new.iloc[1:4, sl] = np.nan
+ df_new.iloc[5:6, sl] = 'bar'
+
+ df_new['string2'] = 'foo'
+ sl = df_new.columns.get_loc('string2')
+ df_new.iloc[2:5, sl] = np.nan
+ df_new.iloc[7:8, sl] = 'bar'
+ _maybe_remove(store, 'df')
+ store.append(
+ 'df', df_new, data_columns=['A', 'B', 'string', 'string2'])
+ result = store.select('df',
+ "string='foo' and string2='foo'"
+ " and A>0 and B<0")
+ expected = df_new[(df_new.string == 'foo') & (
+ df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)]
+ tm.assert_frame_equal(result, expected, check_index_type=False)
+
+ # yield an empty frame
+ result = store.select('df', "string='foo' and string2='cool'")
+ expected = df_new[(df_new.string == 'foo') & (
+ df_new.string2 == 'cool')]
+ tm.assert_frame_equal(result, expected, check_index_type=False)
+
+ with ensure_clean_store(self.path) as store:
+ # doc example
+ df_dc = df.copy()
+ df_dc['string'] = 'foo'
+ df_dc.loc[4:6, 'string'] = np.nan
+ df_dc.loc[7:9, 'string'] = 'bar'
+ df_dc['string2'] = 'cool'
+ df_dc['datetime'] = Timestamp('20010102')
+ df_dc = df_dc._convert(datetime=True)
+ df_dc.loc[3:5, ['A', 'B', 'datetime']] = np.nan
+
+ _maybe_remove(store, 'df_dc')
+ store.append('df_dc', df_dc,
+ data_columns=['B', 'C', 'string',
+ 'string2', 'datetime'])
+ result = store.select('df_dc', 'B>0')
+
+ expected = df_dc[df_dc.B > 0]
+ tm.assert_frame_equal(result, expected, check_index_type=False)
+
+ result = store.select(
+ 'df_dc', ['B > 0', 'C > 0', 'string == foo'])
+ expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (
+ df_dc.string == 'foo')]
+ tm.assert_frame_equal(result, expected, check_index_type=False)
+
+ with ensure_clean_store(self.path) as store:
+ # doc example part 2
+ np.random.seed(1234)
+ index = date_range('1/1/2000', periods=8)
+ df_dc = DataFrame(np.random.randn(8, 3), index=index,
+ columns=['A', 'B', 'C'])
+ df_dc['string'] = 'foo'
+ df_dc.loc[4:6, 'string'] = np.nan
+ df_dc.loc[7:9, 'string'] = 'bar'
+ df_dc.loc[:, ['B', 'C']] = df_dc.loc[:, ['B', 'C']].abs()
+ df_dc['string2'] = 'cool'
+
+ # on-disk operations
+ store.append('df_dc', df_dc, data_columns=[
+ 'B', 'C', 'string', 'string2'])
+
+ result = store.select('df_dc', 'B>0')
+ expected = df_dc[df_dc.B > 0]
+ tm.assert_frame_equal(result, expected)
+
+ result = store.select(
+ 'df_dc', ['B > 0', 'C > 0', 'string == "foo"'])
+ expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) &
+ (df_dc.string == 'foo')]
+ tm.assert_frame_equal(result, expected)
+
+ with ensure_clean_store(self.path) as store:
+ with catch_warnings(record=True):
+ # panel
+ # GH5717 not handling data_columns
+ np.random.seed(1234)
+ p = tm.makePanel()
+
+ store.append('p1', p)
+ tm.assert_panel_equal(store.select('p1'), p)
+
+ store.append('p2', p, data_columns=True)
+ tm.assert_panel_equal(store.select('p2'), p)
+
+ result = store.select('p2', where='ItemA>0')
+ expected = p.to_frame()
+ expected = expected[expected['ItemA'] > 0]
+ tm.assert_frame_equal(result.to_frame(), expected)
+
+ result = store.select(
+ 'p2', where='ItemA>0 & minor_axis=["A","B"]')
+ expected = p.to_frame()
+ expected = expected[expected['ItemA'] > 0]
+ expected = expected[expected.reset_index(
+ level=['major']).index.isin(['A', 'B'])]
+ tm.assert_frame_equal(result.to_frame(), expected)
+
+ def test_create_table_index(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ with catch_warnings(record=True):
+ def col(t, column):
+ return getattr(store.get_storer(t).table.cols, column)
+
+ # index=False
+ wp = tm.makePanel()
+ store.append('p5', wp, index=False)
+ store.create_table_index('p5', columns=['major_axis'])
+ assert(col('p5', 'major_axis').is_indexed is True)
+ assert(col('p5', 'minor_axis').is_indexed is False)
+
+ # index=True
+ store.append('p5i', wp, index=True)
+ assert(col('p5i', 'major_axis').is_indexed is True)
+ assert(col('p5i', 'minor_axis').is_indexed is True)
+
+ # default optlevels
+ store.get_storer('p5').create_index()
+ assert(col('p5', 'major_axis').index.optlevel == 6)
+ assert(col('p5', 'minor_axis').index.kind == 'medium')
+
+ # let's change the indexing scheme
+ store.create_table_index('p5')
+ assert(col('p5', 'major_axis').index.optlevel == 6)
+ assert(col('p5', 'minor_axis').index.kind == 'medium')
+ store.create_table_index('p5', optlevel=9)
+ assert(col('p5', 'major_axis').index.optlevel == 9)
+ assert(col('p5', 'minor_axis').index.kind == 'medium')
+ store.create_table_index('p5', kind='full')
+ assert(col('p5', 'major_axis').index.optlevel == 9)
+ assert(col('p5', 'minor_axis').index.kind == 'full')
+ store.create_table_index('p5', optlevel=1, kind='light')
+ assert(col('p5', 'major_axis').index.optlevel == 1)
+ assert(col('p5', 'minor_axis').index.kind == 'light')
+
+ # data columns
+ df = tm.makeTimeDataFrame()
+ df['string'] = 'foo'
+ df['string2'] = 'bar'
+ store.append('f', df, data_columns=['string', 'string2'])
+ assert(col('f', 'index').is_indexed is True)
+ assert(col('f', 'string').is_indexed is True)
+ assert(col('f', 'string2').is_indexed is True)
+
+ # specify index=columns
+ store.append(
+ 'f2', df, index=['string'],
+ data_columns=['string', 'string2'])
+ assert(col('f2', 'index').is_indexed is False)
+ assert(col('f2', 'string').is_indexed is True)
+ assert(col('f2', 'string2').is_indexed is False)
+
+ # try to index a non-table
+ _maybe_remove(store, 'f2')
+ store.put('f2', df)
+ pytest.raises(TypeError, store.create_table_index, 'f2')
+
+ def test_append_diff_item_order(self):
+
+ with catch_warnings(record=True):
+ wp = tm.makePanel()
+ wp1 = wp.iloc[:, :10, :]
+ wp2 = wp.iloc[wp.items.get_indexer(['ItemC', 'ItemB', 'ItemA']),
+ 10:, :]
+
+ with ensure_clean_store(self.path) as store:
+ store.put('panel', wp1, format='table')
+ pytest.raises(ValueError, store.put, 'panel', wp2,
+ append=True)
+
+ def test_append_hierarchical(self):
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
+ ['one', 'two', 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['foo', 'bar'])
+ df = DataFrame(np.random.randn(10, 3), index=index,
+ columns=['A', 'B', 'C'])
+
+ with ensure_clean_store(self.path) as store:
+ store.append('mi', df)
+ result = store.select('mi')
+ tm.assert_frame_equal(result, df)
+
+ # GH 3748
+ result = store.select('mi', columns=['A', 'B'])
+ expected = df.reindex(columns=['A', 'B'])
+ tm.assert_frame_equal(result, expected)
+
+ with ensure_clean_path('test.hdf') as path:
+ df.to_hdf(path, 'df', format='table')
+ result = read_hdf(path, 'df', columns=['A', 'B'])
+ expected = df.reindex(columns=['A', 'B'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_column_multiindex(self):
+ # GH 4710
+ # recreate multi-indexes properly
+
+ index = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'),
+ ('B', 'a'), ('B', 'b')],
+ names=['first', 'second'])
+ df = DataFrame(np.arange(12).reshape(3, 4), columns=index)
+ expected = df.copy()
+ if isinstance(expected.index, RangeIndex):
+ expected.index = Int64Index(expected.index)
+
+ with ensure_clean_store(self.path) as store:
+
+ store.put('df', df)
+ tm.assert_frame_equal(store['df'], expected,
+ check_index_type=True,
+ check_column_type=True)
+
+ store.put('df1', df, format='table')
+ tm.assert_frame_equal(store['df1'], expected,
+ check_index_type=True,
+ check_column_type=True)
+
+ pytest.raises(ValueError, store.put, 'df2', df,
+ format='table', data_columns=['A'])
+ pytest.raises(ValueError, store.put, 'df3', df,
+ format='table', data_columns=True)
+
+ # appending multi-column on existing table (see GH 6167)
+ with ensure_clean_store(self.path) as store:
+ store.append('df2', df)
+ store.append('df2', df)
+
+ tm.assert_frame_equal(store['df2'], concat((df, df)))
+
+ # non_index_axes name
+ df = DataFrame(np.arange(12).reshape(3, 4),
+ columns=Index(list('ABCD'), name='foo'))
+ expected = df.copy()
+ if isinstance(expected.index, RangeIndex):
+ expected.index = Int64Index(expected.index)
+
+ with ensure_clean_store(self.path) as store:
+
+ store.put('df1', df, format='table')
+ tm.assert_frame_equal(store['df1'], expected,
+ check_index_type=True,
+ check_column_type=True)
+
+ def test_store_multiindex(self):
+
+ # validate multi-index names
+ # GH 5527
+ with ensure_clean_store(self.path) as store:
+
+ def make_index(names=None):
+ return MultiIndex.from_tuples([(datetime.datetime(2013, 12, d),
+ s, t)
+ for d in range(1, 3)
+ for s in range(2)
+ for t in range(3)],
+ names=names)
+
+ # no names
+ _maybe_remove(store, 'df')
+ df = DataFrame(np.zeros((12, 2)), columns=[
+ 'a', 'b'], index=make_index())
+ store.append('df', df)
+ tm.assert_frame_equal(store.select('df'), df)
+
+ # partial names
+ _maybe_remove(store, 'df')
+ df = DataFrame(np.zeros((12, 2)), columns=[
+ 'a', 'b'], index=make_index(['date', None, None]))
+ store.append('df', df)
+ tm.assert_frame_equal(store.select('df'), df)
+
+ # series
+ _maybe_remove(store, 's')
+ s = Series(np.zeros(12), index=make_index(['date', None, None]))
+ store.append('s', s)
+ xp = Series(np.zeros(12), index=make_index(
+ ['date', 'level_1', 'level_2']))
+ tm.assert_series_equal(store.select('s'), xp)
+
+ # dup with column
+ _maybe_remove(store, 'df')
+ df = DataFrame(np.zeros((12, 2)), columns=[
+ 'a', 'b'], index=make_index(['date', 'a', 't']))
+ pytest.raises(ValueError, store.append, 'df', df)
+
+ # dup within level
+ _maybe_remove(store, 'df')
+ df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'],
+ index=make_index(['date', 'date', 'date']))
+ pytest.raises(ValueError, store.append, 'df', df)
+
+ # fully names
+ _maybe_remove(store, 'df')
+ df = DataFrame(np.zeros((12, 2)), columns=[
+ 'a', 'b'], index=make_index(['date', 's', 't']))
+ store.append('df', df)
+ tm.assert_frame_equal(store.select('df'), df)
+
+ def test_select_columns_in_where(self):
+
+ # GH 6169
+ # recreate multi-indexes when columns is passed
+ # in the `where` argument
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
+ ['one', 'two', 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['foo_name', 'bar_name'])
+
+ # With a DataFrame
+ df = DataFrame(np.random.randn(10, 3), index=index,
+ columns=['A', 'B', 'C'])
+
+ with ensure_clean_store(self.path) as store:
+ store.put('df', df, format='table')
+ expected = df[['A']]
+
+ tm.assert_frame_equal(store.select('df', columns=['A']), expected)
+
+ tm.assert_frame_equal(store.select(
+ 'df', where="columns=['A']"), expected)
+
+ # With a Series
+ s = Series(np.random.randn(10), index=index,
+ name='A')
+ with ensure_clean_store(self.path) as store:
+ store.put('s', s, format='table')
+ tm.assert_series_equal(store.select('s', where="columns=['A']"), s)
+
+ def test_mi_data_columns(self):
+ # GH 14435
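+        # MultiIndex levels stored as data_columns are queryable by
+        # name in the where clause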
+ idx = pd.MultiIndex.from_arrays([date_range('2000-01-01', periods=5),
+ range(5)], names=['date', 'id'])
+ df = pd.DataFrame({'a': [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx)
+
+ with ensure_clean_store(self.path) as store:
+ store.append('df', df, data_columns=True)
+
+ actual = store.select('df', where='id == 1')
+ expected = df.iloc[[1], :]
+ tm.assert_frame_equal(actual, expected)
+
+ def test_pass_spec_to_storer(self):
+
+ df = tm.makeDataFrame()
+
+ with ensure_clean_store(self.path) as store:
+ store.put('df', df)
+ pytest.raises(TypeError, store.select, 'df', columns=['A'])
+ pytest.raises(TypeError, store.select,
+ 'df', where=[('columns=A')])
+
+ @xfail_non_writeable
+ def test_append_misc(self):
+
+ with ensure_clean_store(self.path) as store:
+ df = tm.makeDataFrame()
+ store.append('df', df, chunksize=1)
+ result = store.select('df')
+ tm.assert_frame_equal(result, df)
+
+ store.append('df1', df, expectedrows=10)
+ result = store.select('df1')
+ tm.assert_frame_equal(result, df)
+
+        # more chunksize tests for append
+ def check(obj, comparator):
+ for c in [10, 200, 1000]:
+ with ensure_clean_store(self.path, mode='w') as store:
+ store.append('obj', obj, chunksize=c)
+ result = store.select('obj')
+ comparator(result, obj)
+
+ df = tm.makeDataFrame()
+ df['string'] = 'foo'
+ df['float322'] = 1.
+ df['float322'] = df['float322'].astype('float32')
+ df['bool'] = df['float322'] > 0
+ df['time1'] = Timestamp('20130101')
+ df['time2'] = Timestamp('20130102')
+ check(df, tm.assert_frame_equal)
+
+ with catch_warnings(record=True):
+ p = tm.makePanel()
+ check(p, assert_panel_equal)
+
+ # empty frame, GH4273
+ with ensure_clean_store(self.path) as store:
+
+ # 0 len
+ df_empty = DataFrame(columns=list('ABC'))
+ store.append('df', df_empty)
+ pytest.raises(KeyError, store.select, 'df')
+
+ # repeated append of 0/non-zero frames
+ df = DataFrame(np.random.rand(10, 3), columns=list('ABC'))
+ store.append('df', df)
+ assert_frame_equal(store.select('df'), df)
+ store.append('df', df_empty)
+ assert_frame_equal(store.select('df'), df)
+
+ # store
+ df = DataFrame(columns=list('ABC'))
+ store.put('df2', df)
+ assert_frame_equal(store.select('df2'), df)
+
+ with catch_warnings(record=True):
+
+ # 0 len
+ p_empty = Panel(items=list('ABC'))
+ store.append('p', p_empty)
+ pytest.raises(KeyError, store.select, 'p')
+
+ # repeated append of 0/non-zero frames
+ p = Panel(np.random.randn(3, 4, 5), items=list('ABC'))
+ store.append('p', p)
+ assert_panel_equal(store.select('p'), p)
+ store.append('p', p_empty)
+ assert_panel_equal(store.select('p'), p)
+
+ # store
+ store.put('p2', p_empty)
+ assert_panel_equal(store.select('p2'), p_empty)
+
+ def test_append_raise(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ # test append with invalid input to get good error messages
+
+ # list in column
+ df = tm.makeDataFrame()
+ df['invalid'] = [['a']] * len(df)
+ assert df.dtypes['invalid'] == np.object_
+ pytest.raises(TypeError, store.append, 'df', df)
+
+ # multiple invalid columns
+ df['invalid2'] = [['a']] * len(df)
+ df['invalid3'] = [['a']] * len(df)
+ pytest.raises(TypeError, store.append, 'df', df)
+
+ # datetime with embedded nans as object
+ df = tm.makeDataFrame()
+ s = Series(datetime.datetime(2001, 1, 2), index=df.index)
+ s = s.astype(object)
+ s[0:5] = np.nan
+ df['invalid'] = s
+ assert df.dtypes['invalid'] == np.object_
+ pytest.raises(TypeError, store.append, 'df', df)
+
+            # ndarray directly
+ pytest.raises(TypeError, store.append, 'df', np.arange(10))
+
+ # series directly
+ pytest.raises(TypeError, store.append,
+ 'df', Series(np.arange(10)))
+
+ # appending an incompatible table
+ df = tm.makeDataFrame()
+ store.append('df', df)
+
+ df['foo'] = 'foo'
+ pytest.raises(ValueError, store.append, 'df', df)
+
+ def test_table_index_incompatible_dtypes(self):
+ df1 = DataFrame({'a': [1, 2, 3]})
+ df2 = DataFrame({'a': [4, 5, 6]},
+ index=date_range('1/1/2000', periods=3))
+
+ with ensure_clean_store(self.path) as store:
+ store.put('frame', df1, format='table')
+ pytest.raises(TypeError, store.put, 'frame', df2,
+ format='table', append=True)
+
+ def test_table_values_dtypes_roundtrip(self):
+
+ with ensure_clean_store(self.path) as store:
+ df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8')
+ store.append('df_f8', df1)
+ assert_series_equal(df1.dtypes, store['df_f8'].dtypes)
+
+ df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8')
+ store.append('df_i8', df2)
+ assert_series_equal(df2.dtypes, store['df_i8'].dtypes)
+
+ # incompatible dtype
+ pytest.raises(ValueError, store.append, 'df_i8', df1)
+
+            # check creation/storage/retrieval of float32 (a bit hacky to
+            # actually create them, though)
+ df1 = DataFrame(
+ np.array([[1], [2], [3]], dtype='f4'), columns=['A'])
+ store.append('df_f4', df1)
+ assert_series_equal(df1.dtypes, store['df_f4'].dtypes)
+ assert df1.dtypes[0] == 'float32'
+
+ # check with mixed dtypes
+ df1 = DataFrame({c: Series(np.random.randint(5), dtype=c)
+ for c in ['float32', 'float64', 'int32',
+ 'int64', 'int16', 'int8']})
+ df1['string'] = 'foo'
+ df1['float322'] = 1.
+ df1['float322'] = df1['float322'].astype('float32')
+ df1['bool'] = df1['float32'] > 0
+ df1['time1'] = Timestamp('20130101')
+ df1['time2'] = Timestamp('20130102')
+
+ store.append('df_mixed_dtypes1', df1)
+ result = store.select('df_mixed_dtypes1').get_dtype_counts()
+ expected = Series({'float32': 2, 'float64': 1, 'int32': 1,
+ 'bool': 1, 'int16': 1, 'int8': 1,
+ 'int64': 1, 'object': 1, 'datetime64[ns]': 2})
+ result = result.sort_index()
+ expected = expected.sort_index()
+ tm.assert_series_equal(result, expected)
+
+ def test_table_mixed_dtypes(self):
+
+ # frame
+ df = tm.makeDataFrame()
+ df['obj1'] = 'foo'
+ df['obj2'] = 'bar'
+ df['bool1'] = df['A'] > 0
+ df['bool2'] = df['B'] > 0
+ df['bool3'] = True
+ df['int1'] = 1
+ df['int2'] = 2
+ df['timestamp1'] = Timestamp('20010102')
+ df['timestamp2'] = Timestamp('20010103')
+ df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
+ df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
+ df.loc[3:6, ['obj1']] = np.nan
+ df = df._consolidate()._convert(datetime=True)
+
+ with ensure_clean_store(self.path) as store:
+ store.append('df1_mixed', df)
+ tm.assert_frame_equal(store.select('df1_mixed'), df)
+
+ with catch_warnings(record=True):
+
+ # panel
+ wp = tm.makePanel()
+ wp['obj1'] = 'foo'
+ wp['obj2'] = 'bar'
+ wp['bool1'] = wp['ItemA'] > 0
+ wp['bool2'] = wp['ItemB'] > 0
+ wp['int1'] = 1
+ wp['int2'] = 2
+ wp = wp._consolidate()
+
+ with catch_warnings(record=True):
+
+ with ensure_clean_store(self.path) as store:
+ store.append('p1_mixed', wp)
+ assert_panel_equal(store.select('p1_mixed'), wp)
+
+ def test_unimplemented_dtypes_table_columns(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ dtypes = [('date', datetime.date(2001, 1, 2))]
+
+ # py3 ok for unicode
+ if not compat.PY3:
+ dtypes.append(('unicode', u('\\u03c3')))
+
+            # currently not supported dtypes
+ for n, f in dtypes:
+ df = tm.makeDataFrame()
+ df[n] = f
+ pytest.raises(
+ TypeError, store.append, 'df1_%s' % n, df)
+
+ # frame
+ df = tm.makeDataFrame()
+ df['obj1'] = 'foo'
+ df['obj2'] = 'bar'
+ df['datetime1'] = datetime.date(2001, 1, 2)
+ df = df._consolidate()._convert(datetime=True)
+
+ with ensure_clean_store(self.path) as store:
+            # this fails because we have a date in the object block
+ pytest.raises(TypeError, store.append, 'df_unimplemented', df)
+
+ @xfail_non_writeable
+ @pytest.mark.skipif(
+ LooseVersion(np.__version__) == LooseVersion('1.15.0'),
+ reason=("Skipping pytables test when numpy version is "
+ "exactly equal to 1.15.0: gh-22098"))
+ def test_calendar_roundtrip_issue(self):
+
+ # 8591
+ # doc example from tseries holiday section
+ weekmask_egypt = 'Sun Mon Tue Wed Thu'
+ holidays = ['2012-05-01',
+ datetime.datetime(2013, 5, 1), np.datetime64('2014-05-01')]
+ bday_egypt = pd.offsets.CustomBusinessDay(
+ holidays=holidays, weekmask=weekmask_egypt)
+ dt = datetime.datetime(2013, 4, 30)
+ dts = date_range(dt, periods=5, freq=bday_egypt)
+
+ s = (Series(dts.weekday, dts).map(
+ Series('Mon Tue Wed Thu Fri Sat Sun'.split())))
+
+ with ensure_clean_store(self.path) as store:
+
+ store.put('fixed', s)
+ result = store.select('fixed')
+ assert_series_equal(result, s)
+
+ store.append('table', s)
+ result = store.select('table')
+ assert_series_equal(result, s)
+
+ def test_roundtrip_tz_aware_index(self):
+ # GH 17618
+ time = pd.Timestamp('2000-01-01 01:00:00', tz='US/Eastern')
+ df = pd.DataFrame(data=[0], index=[time])
+
+ with ensure_clean_store(self.path) as store:
+ store.put('frame', df, format='fixed')
+ recons = store['frame']
+ tm.assert_frame_equal(recons, df)
+ assert recons.index[0].value == 946706400000000000
+
+ def test_append_with_timedelta(self):
+ # GH 3577
+ # append timedelta
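+        # timedelta64 data columns can be queried with Timedelta-style
+        # strings such as '-3D'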
+
+ df = DataFrame(dict(A=Timestamp('20130101'), B=[Timestamp(
+ '20130101') + timedelta(days=i, seconds=10) for i in range(10)]))
+ df['C'] = df['A'] - df['B']
+ df.loc[3:5, 'C'] = np.nan
+
+ with ensure_clean_store(self.path) as store:
+
+ # table
+ _maybe_remove(store, 'df')
+ store.append('df', df, data_columns=True)
+ result = store.select('df')
+ assert_frame_equal(result, df)
+
+ result = store.select('df', where="C<100000")
+ assert_frame_equal(result, df)
+
+ result = store.select('df', where="C<pd.Timedelta('-3D')")
+ assert_frame_equal(result, df.iloc[3:])
+
+ result = store.select('df', "C<'-3D'")
+ assert_frame_equal(result, df.iloc[3:])
+
+ # a bit hacky here as we don't really deal with the NaT properly
+
+ result = store.select('df', "C<'-500000s'")
+ result = result.dropna(subset=['C'])
+ assert_frame_equal(result, df.iloc[6:])
+
+ result = store.select('df', "C<'-3.5D'")
+ result = result.iloc[1:]
+ assert_frame_equal(result, df.iloc[4:])
+
+ # fixed
+ _maybe_remove(store, 'df2')
+ store.put('df2', df)
+ result = store.select('df2')
+ assert_frame_equal(result, df)
+
+ def test_remove(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ ts = tm.makeTimeSeries()
+ df = tm.makeDataFrame()
+ store['a'] = ts
+ store['b'] = df
+ _maybe_remove(store, 'a')
+ assert len(store) == 1
+ tm.assert_frame_equal(df, store['b'])
+
+ _maybe_remove(store, 'b')
+ assert len(store) == 0
+
+ # nonexistence
+ pytest.raises(KeyError, store.remove, 'a_nonexistent_store')
+
+ # pathing
+ store['a'] = ts
+ store['b/foo'] = df
+ _maybe_remove(store, 'foo')
+ _maybe_remove(store, 'b/foo')
+ assert len(store) == 1
+
+ store['a'] = ts
+ store['b/foo'] = df
+ _maybe_remove(store, 'b')
+ assert len(store) == 1
+
+ # __delitem__
+ store['a'] = ts
+ store['b'] = df
+ del store['a']
+ del store['b']
+ assert len(store) == 0
+
+ def test_remove_where(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ with catch_warnings(record=True):
+
+                # non-existence
+ crit1 = 'index>foo'
+ pytest.raises(KeyError, store.remove, 'a', [crit1])
+
+ # try to remove non-table (with crit)
+ # non-table ok (where = None)
+ wp = tm.makePanel(30)
+ store.put('wp', wp, format='table')
+ store.remove('wp', ["minor_axis=['A', 'D']"])
+ rs = store.select('wp')
+ expected = wp.reindex(minor_axis=['B', 'C'])
+ assert_panel_equal(rs, expected)
+
+ # empty where
+ _maybe_remove(store, 'wp')
+ store.put('wp', wp, format='table')
+
+ # deleted number (entire table)
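+                # (30 major_axis x 4 minor_axis entries = 120 table rows)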
+ n = store.remove('wp', [])
+ assert n == 120
+
+                # non-empty where
+ _maybe_remove(store, 'wp')
+ store.put('wp', wp, format='table')
+ pytest.raises(ValueError, store.remove,
+ 'wp', ['foo'])
+
+ def test_remove_startstop(self):
+ # GH #4835 and #6177
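+        # start/stop refer to physical table rows; each major_axis entry
+        # of the Panel maps to 4 rows (one per minor_axis value), hence
+        # the // 4 arithmetic below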
+
+ with ensure_clean_store(self.path) as store:
+
+ with catch_warnings(record=True):
+ wp = tm.makePanel(30)
+
+ # start
+ _maybe_remove(store, 'wp1')
+ store.put('wp1', wp, format='t')
+ n = store.remove('wp1', start=32)
+ assert n == 120 - 32
+ result = store.select('wp1')
+ expected = wp.reindex(major_axis=wp.major_axis[:32 // 4])
+ assert_panel_equal(result, expected)
+
+ _maybe_remove(store, 'wp2')
+ store.put('wp2', wp, format='t')
+ n = store.remove('wp2', start=-32)
+ assert n == 32
+ result = store.select('wp2')
+ expected = wp.reindex(major_axis=wp.major_axis[:-32 // 4])
+ assert_panel_equal(result, expected)
+
+ # stop
+ _maybe_remove(store, 'wp3')
+ store.put('wp3', wp, format='t')
+ n = store.remove('wp3', stop=32)
+ assert n == 32
+ result = store.select('wp3')
+ expected = wp.reindex(major_axis=wp.major_axis[32 // 4:])
+ assert_panel_equal(result, expected)
+
+ _maybe_remove(store, 'wp4')
+ store.put('wp4', wp, format='t')
+ n = store.remove('wp4', stop=-32)
+ assert n == 120 - 32
+ result = store.select('wp4')
+ expected = wp.reindex(major_axis=wp.major_axis[-32 // 4:])
+ assert_panel_equal(result, expected)
+
+                # start and stop
+ _maybe_remove(store, 'wp5')
+ store.put('wp5', wp, format='t')
+ n = store.remove('wp5', start=16, stop=-16)
+ assert n == 120 - 32
+ result = store.select('wp5')
+ expected = wp.reindex(
+ major_axis=(wp.major_axis[:16 // 4]
+ .union(wp.major_axis[-16 // 4:])))
+ assert_panel_equal(result, expected)
+
+ _maybe_remove(store, 'wp6')
+ store.put('wp6', wp, format='t')
+ n = store.remove('wp6', start=16, stop=16)
+ assert n == 0
+ result = store.select('wp6')
+ expected = wp.reindex(major_axis=wp.major_axis)
+ assert_panel_equal(result, expected)
+
+ # with where
+ _maybe_remove(store, 'wp7')
+
+ # TODO: unused?
+ date = wp.major_axis.take(np.arange(0, 30, 3)) # noqa
+
+ crit = 'major_axis=date'
+ store.put('wp7', wp, format='t')
+ n = store.remove('wp7', where=[crit], stop=80)
+ assert n == 28
+ result = store.select('wp7')
+ expected = wp.reindex(major_axis=wp.major_axis.difference(
+ wp.major_axis[np.arange(0, 20, 3)]))
+ assert_panel_equal(result, expected)
+
+ def test_remove_crit(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ with catch_warnings(record=True):
+ wp = tm.makePanel(30)
+
+ # group row removal
+ _maybe_remove(store, 'wp3')
+ date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10])
+ crit4 = 'major_axis=date4'
+ store.put('wp3', wp, format='t')
+ n = store.remove('wp3', where=[crit4])
+ assert n == 36
+
+ result = store.select('wp3')
+ expected = wp.reindex(
+ major_axis=wp.major_axis.difference(date4))
+ assert_panel_equal(result, expected)
+
+ # upper half
+ _maybe_remove(store, 'wp')
+ store.put('wp', wp, format='table')
+ date = wp.major_axis[len(wp.major_axis) // 2]
+
+ crit1 = 'major_axis>date'
+ crit2 = "minor_axis=['A', 'D']"
+ n = store.remove('wp', where=[crit1])
+ assert n == 56
+
+ n = store.remove('wp', where=[crit2])
+ assert n == 32
+
+ result = store['wp']
+ expected = wp.truncate(after=date).reindex(minor=['B', 'C'])
+ assert_panel_equal(result, expected)
+
+ # individual row elements
+ _maybe_remove(store, 'wp2')
+ store.put('wp2', wp, format='table')
+
+ date1 = wp.major_axis[1:3]
+ crit1 = 'major_axis=date1'
+ store.remove('wp2', where=[crit1])
+ result = store.select('wp2')
+ expected = wp.reindex(
+ major_axis=wp.major_axis.difference(date1))
+ assert_panel_equal(result, expected)
+
+ date2 = wp.major_axis[5]
+ crit2 = 'major_axis=date2'
+ store.remove('wp2', where=[crit2])
+ result = store['wp2']
+ expected = wp.reindex(
+ major_axis=(wp.major_axis
+ .difference(date1)
+ .difference(Index([date2]))
+ ))
+ assert_panel_equal(result, expected)
+
+ date3 = [wp.major_axis[7], wp.major_axis[9]]
+ crit3 = 'major_axis=date3'
+ store.remove('wp2', where=[crit3])
+ result = store['wp2']
+ expected = wp.reindex(major_axis=wp.major_axis
+ .difference(date1)
+ .difference(Index([date2]))
+ .difference(Index(date3)))
+ assert_panel_equal(result, expected)
+
+ # corners
+ _maybe_remove(store, 'wp4')
+ store.put('wp4', wp, format='table')
+ n = store.remove(
+ 'wp4', where="major_axis>wp.major_axis[-1]")
+ result = store.select('wp4')
+ assert_panel_equal(result, wp)
+
+ def test_invalid_terms(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ with catch_warnings(record=True):
+
+ df = tm.makeTimeDataFrame()
+ df['string'] = 'foo'
+ df.loc[0:4, 'string'] = 'bar'
+ wp = tm.makePanel()
+
+ store.put('df', df, format='table')
+ store.put('wp', wp, format='table')
+
+ # some invalid terms
+ pytest.raises(ValueError, store.select,
+ 'wp', "minor=['A', 'B']")
+ pytest.raises(ValueError, store.select,
+ 'wp', ["index=['20121114']"])
+ pytest.raises(ValueError, store.select, 'wp', [
+ "index=['20121114', '20121114']"])
+ pytest.raises(TypeError, Term)
+
+ # more invalid
+ pytest.raises(
+ ValueError, store.select, 'df', 'df.index[3]')
+ pytest.raises(SyntaxError, store.select, 'df', 'index>')
+ pytest.raises(
+ ValueError, store.select, 'wp',
+ "major_axis<'20000108' & minor_axis['A', 'B']")
+
+ # from the docs
+ with ensure_clean_path(self.path) as path:
+ dfq = DataFrame(np.random.randn(10, 4), columns=list(
+ 'ABCD'), index=date_range('20130101', periods=10))
+ dfq.to_hdf(path, 'dfq', format='table', data_columns=True)
+
+ # check ok
+ read_hdf(path, 'dfq',
+ where="index>Timestamp('20130104') & columns=['A', 'B']")
+ read_hdf(path, 'dfq', where="A>0 or C>0")
+
+ # catch the invalid reference
+ with ensure_clean_path(self.path) as path:
+ dfq = DataFrame(np.random.randn(10, 4), columns=list(
+ 'ABCD'), index=date_range('20130101', periods=10))
+ dfq.to_hdf(path, 'dfq', format='table')
+
+ pytest.raises(ValueError, read_hdf, path,
+ 'dfq', where="A>0 or C>0")
+
+ def test_terms(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+
+ wp = tm.makePanel()
+ wpneg = Panel.fromDict({-1: tm.makeDataFrame(),
+ 0: tm.makeDataFrame(),
+ 1: tm.makeDataFrame()})
+
+ store.put('wp', wp, format='table')
+ store.put('wpneg', wpneg, format='table')
+
+ # panel
+ result = store.select(
+ 'wp',
+ "major_axis<'20000108' and minor_axis=['A', 'B']")
+ expected = wp.truncate(
+ after='20000108').reindex(minor=['A', 'B'])
+ assert_panel_equal(result, expected)
+
+ # with deprecation
+ result = store.select(
+ 'wp', where=("major_axis<'20000108' "
+ "and minor_axis=['A', 'B']"))
+ expected = wp.truncate(
+ after='20000108').reindex(minor=['A', 'B'])
+ tm.assert_panel_equal(result, expected)
+
+ with catch_warnings(record=True):
+
+ # valid terms
+ terms = [('major_axis=20121114'),
+ ('major_axis>20121114'),
+ (("major_axis=['20121114', '20121114']"),),
+ ('major_axis=datetime.datetime(2012, 11, 14)'),
+ 'major_axis> 20121114',
+ 'major_axis >20121114',
+ 'major_axis > 20121114',
+ (("minor_axis=['A', 'B']"),),
+ (("minor_axis=['A', 'B']"),),
+ ((("minor_axis==['A', 'B']"),),),
+ (("items=['ItemA', 'ItemB']"),),
+ ('items=ItemA'),
+ ]
+
+ for t in terms:
+ store.select('wp', t)
+
+ with pytest.raises(TypeError,
+ match='Only named functions are supported'):
+ store.select(
+ 'wp',
+ 'major_axis == (lambda x: x)("20130101")')
+
+ with catch_warnings(record=True):
+ # check USub node parsing
+ res = store.select('wpneg', 'items == -1')
+ expected = Panel({-1: wpneg[-1]})
+ tm.assert_panel_equal(res, expected)
+
+ msg = 'Unary addition not supported'
+ with pytest.raises(NotImplementedError, match=msg):
+ store.select('wpneg', 'items == +1')
+
+ def test_term_compat(self):
+ with ensure_clean_store(self.path) as store:
+
+ with catch_warnings(record=True):
+ wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
+ major_axis=date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B', 'C', 'D'])
+ store.append('wp', wp)
+
+ result = store.select(
+ 'wp', where=("major_axis>20000102 "
+ "and minor_axis=['A', 'B']"))
+ expected = wp.loc[:, wp.major_axis >
+ Timestamp('20000102'), ['A', 'B']]
+ assert_panel_equal(result, expected)
+
+ store.remove('wp', 'major_axis>20000103')
+ result = store.select('wp')
+ expected = wp.loc[:, wp.major_axis <= Timestamp('20000103'), :]
+ assert_panel_equal(result, expected)
+
+ with ensure_clean_store(self.path) as store:
+
+ with catch_warnings(record=True):
+ wp = Panel(np.random.randn(2, 5, 4),
+ items=['Item1', 'Item2'],
+ major_axis=date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B', 'C', 'D'])
+ store.append('wp', wp)
+
+ # stringified datetimes
+ result = store.select(
+ 'wp', 'major_axis>datetime.datetime(2000, 1, 2)')
+ expected = wp.loc[:, wp.major_axis > Timestamp('20000102')]
+ assert_panel_equal(result, expected)
+
+ result = store.select(
+ 'wp', 'major_axis>datetime.datetime(2000, 1, 2)')
+ expected = wp.loc[:, wp.major_axis > Timestamp('20000102')]
+ assert_panel_equal(result, expected)
+
+ result = store.select(
+ 'wp',
+ "major_axis=[datetime.datetime(2000, 1, 2, 0, 0), "
+ "datetime.datetime(2000, 1, 3, 0, 0)]")
+ expected = wp.loc[:, [Timestamp('20000102'),
+ Timestamp('20000103')]]
+ assert_panel_equal(result, expected)
+
+ result = store.select(
+ 'wp', "minor_axis=['A', 'B']")
+ expected = wp.loc[:, :, ['A', 'B']]
+ assert_panel_equal(result, expected)
+
+ def test_same_name_scoping(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ import pandas as pd
+ df = DataFrame(np.random.randn(20, 2),
+ index=pd.date_range('20130101', periods=20))
+ store.put('df', df, format='table')
+ expected = df[df.index > pd.Timestamp('20130105')]
+
+ import datetime # noqa
+ result = store.select('df', 'index>datetime.datetime(2013,1,5)')
+ assert_frame_equal(result, expected)
+
+ from datetime import datetime # noqa
+
+            # technically an error (datetime now names the class), but
+            # allow it
+ result = store.select('df', 'index>datetime.datetime(2013,1,5)')
+ assert_frame_equal(result, expected)
+
+ result = store.select('df', 'index>datetime(2013,1,5)')
+ assert_frame_equal(result, expected)
+
+ def test_series(self):
+
+ s = tm.makeStringSeries()
+ self._check_roundtrip(s, tm.assert_series_equal)
+
+ ts = tm.makeTimeSeries()
+ self._check_roundtrip(ts, tm.assert_series_equal)
+
+ ts2 = Series(ts.index, Index(ts.index, dtype=object))
+ self._check_roundtrip(ts2, tm.assert_series_equal)
+
+ ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object),
+ dtype=object))
+ self._check_roundtrip(ts3, tm.assert_series_equal,
+ check_index_type=False)
+
+ def test_sparse_series(self):
+
+ s = tm.makeStringSeries()
+ s.iloc[3:5] = np.nan
+ ss = s.to_sparse()
+ self._check_roundtrip(ss, tm.assert_series_equal,
+ check_series_type=True)
+
+ ss2 = s.to_sparse(kind='integer')
+ self._check_roundtrip(ss2, tm.assert_series_equal,
+ check_series_type=True)
+
+ ss3 = s.to_sparse(fill_value=0)
+ self._check_roundtrip(ss3, tm.assert_series_equal,
+ check_series_type=True)
+
+ def test_sparse_frame(self):
+
+ s = tm.makeDataFrame()
+ s.iloc[3:5, 1:3] = np.nan
+ s.iloc[8:10, -2] = np.nan
+ ss = s.to_sparse()
+
+ self._check_double_roundtrip(ss, tm.assert_frame_equal,
+ check_frame_type=True)
+
+ ss2 = s.to_sparse(kind='integer')
+ self._check_double_roundtrip(ss2, tm.assert_frame_equal,
+ check_frame_type=True)
+
+ ss3 = s.to_sparse(fill_value=0)
+ self._check_double_roundtrip(ss3, tm.assert_frame_equal,
+ check_frame_type=True)
+
+ def test_float_index(self):
+
+ # GH #454
+ index = np.random.randn(10)
+ s = Series(np.random.randn(10), index=index)
+ self._check_roundtrip(s, tm.assert_series_equal)
+
+ @xfail_non_writeable
+ def test_tuple_index(self):
+
+ # GH #492
+ col = np.arange(10)
+ idx = [(0., 1.), (2., 3.), (4., 5.)]
+ data = np.random.randn(30).reshape((3, 10))
+ DF = DataFrame(data, index=idx, columns=col)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", pd.errors.PerformanceWarning)
+ self._check_roundtrip(DF, tm.assert_frame_equal)
+
+ @xfail_non_writeable
+ @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
+ def test_index_types(self):
+
+ with catch_warnings(record=True):
+ values = np.random.randn(2)
+
+ func = lambda l, r: tm.assert_series_equal(l, r,
+ check_dtype=True,
+ check_index_type=True,
+ check_series_type=True)
+
+ with catch_warnings(record=True):
+ ser = Series(values, [0, 'y'])
+ self._check_roundtrip(ser, func)
+
+ with catch_warnings(record=True):
+ ser = Series(values, [datetime.datetime.today(), 0])
+ self._check_roundtrip(ser, func)
+
+ with catch_warnings(record=True):
+ ser = Series(values, ['y', 0])
+ self._check_roundtrip(ser, func)
+
+ with catch_warnings(record=True):
+ ser = Series(values, [datetime.date.today(), 'a'])
+ self._check_roundtrip(ser, func)
+
+ with catch_warnings(record=True):
+
+ ser = Series(values, [0, 'y'])
+ self._check_roundtrip(ser, func)
+
+ ser = Series(values, [datetime.datetime.today(), 0])
+ self._check_roundtrip(ser, func)
+
+ ser = Series(values, ['y', 0])
+ self._check_roundtrip(ser, func)
+
+ ser = Series(values, [datetime.date.today(), 'a'])
+ self._check_roundtrip(ser, func)
+
+ ser = Series(values, [1.23, 'b'])
+ self._check_roundtrip(ser, func)
+
+ ser = Series(values, [1, 1.53])
+ self._check_roundtrip(ser, func)
+
+ ser = Series(values, [1, 5])
+ self._check_roundtrip(ser, func)
+
+ ser = Series(values, [datetime.datetime(
+ 2012, 1, 1), datetime.datetime(2012, 1, 2)])
+ self._check_roundtrip(ser, func)
+
+ def test_timeseries_preepoch(self):
+
+ dr = bdate_range('1/1/1940', '1/1/1960')
+ ts = Series(np.random.randn(len(dr)), index=dr)
+ try:
+ self._check_roundtrip(ts, tm.assert_series_equal)
+ except OverflowError:
+ pytest.skip('known failure on some windows platforms')
+
+ @xfail_non_writeable
+ @pytest.mark.parametrize("compression", [
+ False, pytest.param(True, marks=td.skip_if_windows_python_3)
+ ])
+ def test_frame(self, compression):
+
+ df = tm.makeDataFrame()
+
+ # put in some random NAs
+ df.values[0, 0] = np.nan
+ df.values[5, 3] = np.nan
+
+ self._check_roundtrip_table(df, tm.assert_frame_equal,
+ compression=compression)
+ self._check_roundtrip(df, tm.assert_frame_equal,
+ compression=compression)
+
+ tdf = tm.makeTimeDataFrame()
+ self._check_roundtrip(tdf, tm.assert_frame_equal,
+ compression=compression)
+
+ with ensure_clean_store(self.path) as store:
+ # not consolidated
+ df['foo'] = np.random.randn(len(df))
+ store['df'] = df
+ recons = store['df']
+ assert recons._data.is_consolidated()
+
+ # empty
+ self._check_roundtrip(df[:0], tm.assert_frame_equal)
+
+ @xfail_non_writeable
+ def test_empty_series_frame(self):
+ s0 = Series()
+ s1 = Series(name='myseries')
+ df0 = DataFrame()
+ df1 = DataFrame(index=['a', 'b', 'c'])
+ df2 = DataFrame(columns=['d', 'e', 'f'])
+
+ self._check_roundtrip(s0, tm.assert_series_equal)
+ self._check_roundtrip(s1, tm.assert_series_equal)
+ self._check_roundtrip(df0, tm.assert_frame_equal)
+ self._check_roundtrip(df1, tm.assert_frame_equal)
+ self._check_roundtrip(df2, tm.assert_frame_equal)
+
+ @xfail_non_writeable
+ @pytest.mark.parametrize(
+ 'dtype', [np.int64, np.float64, np.object, 'm8[ns]', 'M8[ns]'])
+ def test_empty_series(self, dtype):
+ s = Series(dtype=dtype)
+ self._check_roundtrip(s, tm.assert_series_equal)
+
+ def test_can_serialize_dates(self):
+
+ rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')]
+ frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
+
+ self._check_roundtrip(frame, tm.assert_frame_equal)
+
+ def test_store_hierarchical(self):
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
+ ['one', 'two', 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['foo', 'bar'])
+ frame = DataFrame(np.random.randn(10, 3), index=index,
+ columns=['A', 'B', 'C'])
+
+ self._check_roundtrip(frame, tm.assert_frame_equal)
+ self._check_roundtrip(frame.T, tm.assert_frame_equal)
+ self._check_roundtrip(frame['A'], tm.assert_series_equal)
+
+ # check that the names are stored
+ with ensure_clean_store(self.path) as store:
+ store['frame'] = frame
+ recons = store['frame']
+ tm.assert_frame_equal(recons, frame)
+
+ def test_store_index_name(self):
+ df = tm.makeDataFrame()
+ df.index.name = 'foo'
+
+ with ensure_clean_store(self.path) as store:
+ store['frame'] = df
+ recons = store['frame']
+ tm.assert_frame_equal(recons, df)
+
+ def test_store_index_name_with_tz(self):
+ # GH 13884
+ df = pd.DataFrame({'A': [1, 2]})
+ df.index = pd.DatetimeIndex([1234567890123456787, 1234567890123456788])
+ df.index = df.index.tz_localize('UTC')
+ df.index.name = 'foo'
+
+ with ensure_clean_store(self.path) as store:
+ store.put('frame', df, format='table')
+ recons = store['frame']
+ tm.assert_frame_equal(recons, df)
+
+ @pytest.mark.parametrize('table_format', ['table', 'fixed'])
+ def test_store_index_name_numpy_str(self, table_format):
+ # GH #13492
+ idx = pd.Index(pd.to_datetime([datetime.date(2000, 1, 1),
+ datetime.date(2000, 1, 2)]),
+ name=u('cols\u05d2'))
+ idx1 = pd.Index(pd.to_datetime([datetime.date(2010, 1, 1),
+ datetime.date(2010, 1, 2)]),
+ name=u('rows\u05d0'))
+ df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1)
+
+ # This used to fail, returning numpy strings instead of python strings.
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', format=table_format)
+ df2 = read_hdf(path, 'df')
+
+ assert_frame_equal(df, df2, check_names=True)
+
+ assert type(df2.index.name) == text_type
+ assert type(df2.columns.name) == text_type
+
+ def test_store_series_name(self):
+ df = tm.makeDataFrame()
+ series = df['A']
+
+ with ensure_clean_store(self.path) as store:
+ store['series'] = series
+ recons = store['series']
+ tm.assert_series_equal(recons, series)
+
+ @xfail_non_writeable
+ @pytest.mark.parametrize("compression", [
+ False, pytest.param(True, marks=td.skip_if_windows_python_3)
+ ])
+ def test_store_mixed(self, compression):
+
+ def _make_one():
+ df = tm.makeDataFrame()
+ df['obj1'] = 'foo'
+ df['obj2'] = 'bar'
+ df['bool1'] = df['A'] > 0
+ df['bool2'] = df['B'] > 0
+ df['int1'] = 1
+ df['int2'] = 2
+ return df._consolidate()
+
+ df1 = _make_one()
+ df2 = _make_one()
+
+ self._check_roundtrip(df1, tm.assert_frame_equal)
+ self._check_roundtrip(df2, tm.assert_frame_equal)
+
+ with ensure_clean_store(self.path) as store:
+ store['obj'] = df1
+ tm.assert_frame_equal(store['obj'], df1)
+ store['obj'] = df2
+ tm.assert_frame_equal(store['obj'], df2)
+
+ # check that can store Series of all of these types
+ self._check_roundtrip(df1['obj1'], tm.assert_series_equal,
+ compression=compression)
+ self._check_roundtrip(df1['bool1'], tm.assert_series_equal,
+ compression=compression)
+ self._check_roundtrip(df1['int1'], tm.assert_series_equal,
+ compression=compression)
+
+ def test_wide(self):
+
+ with catch_warnings(record=True):
+ wp = tm.makePanel()
+ self._check_roundtrip(wp, assert_panel_equal)
+
+ @pytest.mark.filterwarnings(
+ "ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning"
+ )
+ def test_select_with_dups(self):
+
+ # single dtypes
+ df = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B'])
+ df.index = date_range('20130101 9:30', periods=10, freq='T')
+
+ with ensure_clean_store(self.path) as store:
+ store.append('df', df)
+
+ result = store.select('df')
+ expected = df
+ assert_frame_equal(result, expected, by_blocks=True)
+
+ result = store.select('df', columns=df.columns)
+ expected = df
+ assert_frame_equal(result, expected, by_blocks=True)
+
+ result = store.select('df', columns=['A'])
+ expected = df.loc[:, ['A']]
+ assert_frame_equal(result, expected)
+
+ # dups across dtypes
+ df = concat([DataFrame(np.random.randn(10, 4),
+ columns=['A', 'A', 'B', 'B']),
+ DataFrame(np.random.randint(0, 10, size=20)
+ .reshape(10, 2),
+ columns=['A', 'C'])],
+ axis=1)
+ df.index = date_range('20130101 9:30', periods=10, freq='T')
+
+ with ensure_clean_store(self.path) as store:
+ store.append('df', df)
+
+ result = store.select('df')
+ expected = df
+ assert_frame_equal(result, expected, by_blocks=True)
+
+ result = store.select('df', columns=df.columns)
+ expected = df
+ assert_frame_equal(result, expected, by_blocks=True)
+
+ expected = df.loc[:, ['A']]
+ result = store.select('df', columns=['A'])
+ assert_frame_equal(result, expected, by_blocks=True)
+
+ expected = df.loc[:, ['B', 'A']]
+ result = store.select('df', columns=['B', 'A'])
+ assert_frame_equal(result, expected, by_blocks=True)
+
+ # duplicates on both index and columns
+ with ensure_clean_store(self.path) as store:
+ store.append('df', df)
+ store.append('df', df)
+
+ expected = df.loc[:, ['B', 'A']]
+ expected = concat([expected, expected])
+ result = store.select('df', columns=['B', 'A'])
+ assert_frame_equal(result, expected, by_blocks=True)
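+
+ # ``by_blocks=True`` makes assert_frame_equal compare the frames one
+ # dtype block at a time, which avoids the label ambiguity that the
+ # duplicate column names above would otherwise introduce.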
+
+ @pytest.mark.filterwarnings(
+ "ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning"
+ )
+ def test_wide_table_dups(self):
+ with ensure_clean_store(self.path) as store:
+ with catch_warnings(record=True):
+
+ wp = tm.makePanel()
+ store.put('panel', wp, format='table')
+ store.put('panel', wp, format='table', append=True)
+
+ recons = store['panel']
+
+ assert_panel_equal(recons, wp)
+
+ def test_long(self):
+ def _check(left, right):
+ assert_panel_equal(left.to_panel(), right.to_panel())
+
+ with catch_warnings(record=True):
+ wp = tm.makePanel()
+ self._check_roundtrip(wp.to_frame(), _check)
+
+ def test_overwrite_node(self):
+
+ with ensure_clean_store(self.path) as store:
+ store['a'] = tm.makeTimeDataFrame()
+ ts = tm.makeTimeSeries()
+ store['a'] = ts
+
+ tm.assert_series_equal(store['a'], ts)
+
+ def test_sparse_with_compression(self):
+
+ # GH 2931
+
+ # make sparse dataframe
+ arr = np.random.binomial(n=1, p=.01, size=(1000, 10))
+ df = DataFrame(arr).to_sparse(fill_value=0)
+
+ # case 1: store uncompressed
+ self._check_double_roundtrip(df, tm.assert_frame_equal,
+ compression=False,
+ check_frame_type=True)
+
+ # case 2: store compressed (works)
+ self._check_double_roundtrip(df, tm.assert_frame_equal,
+ compression='zlib',
+ check_frame_type=True)
+
+ # set one series to be completely sparse
+ df[0] = np.zeros(1000)
+
+ # case 3: store df with completely sparse series uncompressed
+ self._check_double_roundtrip(df, tm.assert_frame_equal,
+ compression=False,
+ check_frame_type=True)
+
+ # case 4: try storing df with completely sparse series compressed
+ # (previously failed)
+ self._check_double_roundtrip(df, tm.assert_frame_equal,
+ compression='zlib',
+ check_frame_type=True)
+
+ def test_select(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ with catch_warnings(record=True):
+ wp = tm.makePanel()
+
+ # put/select ok
+ _maybe_remove(store, 'wp')
+ store.put('wp', wp, format='table')
+ store.select('wp')
+
+ # non-table ok (where = None)
+ _maybe_remove(store, 'wp2')
+ store.put('wp2', wp)
+ store.select('wp2')
+
+ # selection on the non-indexable with a large number of columns
+ wp = Panel(np.random.randn(100, 100, 100),
+ items=['Item%03d' % i for i in range(100)],
+ major_axis=date_range('1/1/2000', periods=100),
+ minor_axis=['E%03d' % i for i in range(100)])
+
+ _maybe_remove(store, 'wp')
+ store.append('wp', wp)
+ items = ['Item%03d' % i for i in range(80)]
+ result = store.select('wp', 'items=items')
+ expected = wp.reindex(items=items)
+ assert_panel_equal(expected, result)
+
+ # selecting non-table with a where
+ # pytest.raises(ValueError, store.select,
+ # 'wp2', ('column', ['A', 'D']))
+
+ # select with columns=
+ df = tm.makeTimeDataFrame()
+ _maybe_remove(store, 'df')
+ store.append('df', df)
+ result = store.select('df', columns=['A', 'B'])
+ expected = df.reindex(columns=['A', 'B'])
+ tm.assert_frame_equal(expected, result)
+
+ # equivalently
+ result = store.select('df', [("columns=['A', 'B']")])
+ expected = df.reindex(columns=['A', 'B'])
+ tm.assert_frame_equal(expected, result)
+
+ # with a data column
+ _maybe_remove(store, 'df')
+ store.append('df', df, data_columns=['A'])
+ result = store.select('df', ['A > 0'], columns=['A', 'B'])
+ expected = df[df.A > 0].reindex(columns=['A', 'B'])
+ tm.assert_frame_equal(expected, result)
+
+ # all columns as data columns
+ _maybe_remove(store, 'df')
+ store.append('df', df, data_columns=True)
+ result = store.select('df', ['A > 0'], columns=['A', 'B'])
+ expected = df[df.A > 0].reindex(columns=['A', 'B'])
+ tm.assert_frame_equal(expected, result)
+
+ # with a data column, but different columns
+ _maybe_remove(store, 'df')
+ store.append('df', df, data_columns=['A'])
+ result = store.select('df', ['A > 0'], columns=['C', 'D'])
+ expected = df[df.A > 0].reindex(columns=['C', 'D'])
+ tm.assert_frame_equal(expected, result)
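+
+ # Only columns declared via ``data_columns`` (or all of them, with
+ # ``data_columns=True``) are stored individually and can be referenced
+ # in a where clause; the rest live in consolidated value blocks.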
+
+ def test_select_dtypes(self):
+
+ with ensure_clean_store(self.path) as store:
+ # with a Timestamp data column (GH #2637)
+ df = DataFrame(dict(
+ ts=bdate_range('2012-01-01', periods=300),
+ A=np.random.randn(300)))
+ _maybe_remove(store, 'df')
+ store.append('df', df, data_columns=['ts', 'A'])
+
+ result = store.select('df', "ts>=Timestamp('2012-02-01')")
+ expected = df[df.ts >= Timestamp('2012-02-01')]
+ tm.assert_frame_equal(expected, result)
+
+ # bool columns (GH #2849)
+ df = DataFrame(np.random.randn(5, 2), columns=['A', 'B'])
+ df['object'] = 'foo'
+ df.loc[4:5, 'object'] = 'bar'
+ df['boolv'] = df['A'] > 0
+ _maybe_remove(store, 'df')
+ store.append('df', df, data_columns=True)
+
+ expected = (df[df.boolv == True] # noqa
+ .reindex(columns=['A', 'boolv']))
+ for v in [True, 'true', 1]:
+ result = store.select('df', 'boolv == %s' % str(v),
+ columns=['A', 'boolv'])
+ tm.assert_frame_equal(expected, result)
+
+ expected = (df[df.boolv == False] # noqa
+ .reindex(columns=['A', 'boolv']))
+ for v in [False, 'false', 0]:
+ result = store.select(
+ 'df', 'boolv == %s' % str(v), columns=['A', 'boolv'])
+ tm.assert_frame_equal(expected, result)
+
+ # integer index
+ df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
+ _maybe_remove(store, 'df_int')
+ store.append('df_int', df)
+ result = store.select(
+ 'df_int', "index<10 and columns=['A']")
+ expected = df.reindex(index=list(df.index)[0:10], columns=['A'])
+ tm.assert_frame_equal(expected, result)
+
+ # float 'index' column (passed inside the dict, so it becomes a
+ # regular column rather than the frame's index)
+ df = DataFrame(dict(A=np.random.rand(
+ 20), B=np.random.rand(20), index=np.arange(20, dtype='f8')))
+ _maybe_remove(store, 'df_float')
+ store.append('df_float', df)
+ result = store.select(
+ 'df_float', "index<10.0 and columns=['A']")
+ expected = df.reindex(index=list(df.index)[0:10], columns=['A'])
+ tm.assert_frame_equal(expected, result)
+
+ with ensure_clean_store(self.path) as store:
+
+ # floats w/o NaN
+ df = DataFrame(
+ dict(cols=range(11), values=range(11)), dtype='float64')
+ df['cols'] = (df['cols'] + 10).apply(str)
+
+ store.append('df1', df, data_columns=True)
+ result = store.select(
+ 'df1', where='values>2.0')
+ expected = df[df['values'] > 2.0]
+ tm.assert_frame_equal(expected, result)
+
+ # floats with NaN
+ df.iloc[0] = np.nan
+ expected = df[df['values'] > 2.0]
+
+ store.append('df2', df, data_columns=True, index=False)
+ result = store.select(
+ 'df2', where='values>2.0')
+ tm.assert_frame_equal(expected, result)
+
+ # https://github.com/PyTables/PyTables/issues/282
+ # bug in selection when 0th row has a np.nan and an index
+ # store.append('df3',df,data_columns=True)
+ # result = store.select(
+ # 'df3', where='values>2.0')
+ # tm.assert_frame_equal(expected, result)
+
+ # a NaN not in the first position is fine as well
+ df = DataFrame(
+ dict(cols=range(11), values=range(11)), dtype='float64')
+ df['cols'] = (df['cols'] + 10).apply(str)
+
+ df.iloc[1] = np.nan
+ expected = df[df['values'] > 2.0]
+
+ store.append('df4', df, data_columns=True)
+ result = store.select(
+ 'df4', where='values>2.0')
+ tm.assert_frame_equal(expected, result)
+
+ # test selection with comparison against numpy scalar
+ # GH 11283
+ with ensure_clean_store(self.path) as store:
+ df = tm.makeDataFrame()
+
+ expected = df[df['A'] > 0]
+
+ store.append('df', df, data_columns=True)
+ np_zero = np.float64(0) # noqa
+ result = store.select('df', where=["A>np_zero"])
+ tm.assert_frame_equal(expected, result)
+
+ def test_select_with_many_inputs(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300),
+ A=np.random.randn(300),
+ B=range(300),
+ users=['a'] * 50 + ['b'] * 50 + ['c'] * 100 +
+ ['a%03d' % i for i in range(100)]))
+ _maybe_remove(store, 'df')
+ store.append('df', df, data_columns=['ts', 'A', 'B', 'users'])
+
+ # regular select
+ result = store.select('df', "ts>=Timestamp('2012-02-01')")
+ expected = df[df.ts >= Timestamp('2012-02-01')]
+ tm.assert_frame_equal(expected, result)
+
+ # small selector
+ result = store.select(
+ 'df',
+ "ts>=Timestamp('2012-02-01') & users=['a','b','c']")
+ expected = df[(df.ts >= Timestamp('2012-02-01')) &
+ df.users.isin(['a', 'b', 'c'])]
+ tm.assert_frame_equal(expected, result)
+
+ # big selector along the columns
+ selector = ['a', 'b', 'c'] + ['a%03d' % i for i in range(60)]
+ result = store.select(
+ 'df',
+ "ts>=Timestamp('2012-02-01') and users=selector")
+ expected = df[(df.ts >= Timestamp('2012-02-01')) &
+ df.users.isin(selector)]
+ tm.assert_frame_equal(expected, result)
+
+ selector = range(100, 200)
+ result = store.select('df', 'B=selector')
+ expected = df[df.B.isin(selector)]
+ tm.assert_frame_equal(expected, result)
+ assert len(result) == 100
+
+ # big selector along the index
+ selector = Index(df.ts[0:100].values)
+ result = store.select('df', 'ts=selector')
+ expected = df[df.ts.isin(selector.values)]
+ tm.assert_frame_equal(expected, result)
+ assert len(result) == 100
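+
+ # A condition like ``users=selector`` with a list-valued right-hand
+ # side behaves as an ``isin``; very long lists may be applied as a
+ # post-read filter rather than inlined into the numexpr query.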
+
+ def test_select_iterator(self):
+
+ # single table
+ with ensure_clean_store(self.path) as store:
+
+ df = tm.makeTimeDataFrame(500)
+ _maybe_remove(store, 'df')
+ store.append('df', df)
+
+ expected = store.select('df')
+
+ results = [s for s in store.select('df', iterator=True)]
+ result = concat(results)
+ tm.assert_frame_equal(expected, result)
+
+ results = [s for s in store.select('df', chunksize=100)]
+ assert len(results) == 5
+ result = concat(results)
+ tm.assert_frame_equal(expected, result)
+
+ results = [s for s in store.select('df', chunksize=150)]
+ result = concat(results)
+ tm.assert_frame_equal(result, expected)
+
+ with ensure_clean_path(self.path) as path:
+
+ df = tm.makeTimeDataFrame(500)
+ df.to_hdf(path, 'df_non_table')
+ pytest.raises(TypeError, read_hdf, path,
+ 'df_non_table', chunksize=100)
+ pytest.raises(TypeError, read_hdf, path,
+ 'df_non_table', iterator=True)
+
+ with ensure_clean_path(self.path) as path:
+
+ df = tm.makeTimeDataFrame(500)
+ df.to_hdf(path, 'df', format='table')
+
+ results = [s for s in read_hdf(path, 'df', chunksize=100)]
+ result = concat(results)
+
+ assert len(results) == 5
+ tm.assert_frame_equal(result, df)
+ tm.assert_frame_equal(result, read_hdf(path, 'df'))
+
+ # multiple
+
+ with ensure_clean_store(self.path) as store:
+
+ df1 = tm.makeTimeDataFrame(500)
+ store.append('df1', df1, data_columns=True)
+ df2 = tm.makeTimeDataFrame(500).rename(
+ columns=lambda x: "%s_2" % x)
+ df2['foo'] = 'bar'
+ store.append('df2', df2)
+
+ df = concat([df1, df2], axis=1)
+
+ # full selection
+ expected = store.select_as_multiple(
+ ['df1', 'df2'], selector='df1')
+ results = [s for s in store.select_as_multiple(
+ ['df1', 'df2'], selector='df1', chunksize=150)]
+ result = concat(results)
+ tm.assert_frame_equal(expected, result)
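+
+ # Typical streaming pattern exercised above (sketch; ``process`` is a
+ # placeholder for user code):
+ #
+ #     for chunk in store.select('df', chunksize=100):
+ #         process(chunk)  # each chunk is a DataFrame of <= 100 rows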
+
+ def test_select_iterator_complete_8014(self):
+
+ # GH 8014
+ # using iterator and where clause
+ chunksize = 1e4
+
+ # no iterator
+ with ensure_clean_store(self.path) as store:
+
+ expected = tm.makeTimeDataFrame(100064, 'S')
+ _maybe_remove(store, 'df')
+ store.append('df', expected)
+
+ beg_dt = expected.index[0]
+ end_dt = expected.index[-1]
+
+ # select w/o iteration and no where clause works
+ result = store.select('df')
+ tm.assert_frame_equal(expected, result)
+
+ # select w/o iterator and where clause, single term, begin
+ # of range, works
+ where = "index >= '%s'" % beg_dt
+ result = store.select('df', where=where)
+ tm.assert_frame_equal(expected, result)
+
+ # select w/o iterator and where clause, single term, end
+ # of range, works
+ where = "index <= '%s'" % end_dt
+ result = store.select('df', where=where)
+ tm.assert_frame_equal(expected, result)
+
+ # select w/o iterator and where clause, inclusive range,
+ # works
+ where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
+ result = store.select('df', where=where)
+ tm.assert_frame_equal(expected, result)
+
+ # with iterator, full range
+ with ensure_clean_store(self.path) as store:
+
+ expected = tm.makeTimeDataFrame(100064, 'S')
+ _maybe_remove(store, 'df')
+ store.append('df', expected)
+
+ beg_dt = expected.index[0]
+ end_dt = expected.index[-1]
+
+ # select w/iterator and no where clause works
+ results = [s for s in store.select('df', chunksize=chunksize)]
+ result = concat(results)
+ tm.assert_frame_equal(expected, result)
+
+ # select w/iterator and where clause, single term, begin of range
+ where = "index >= '%s'" % beg_dt
+ results = [s for s in store.select(
+ 'df', where=where, chunksize=chunksize)]
+ result = concat(results)
+ tm.assert_frame_equal(expected, result)
+
+ # select w/iterator and where clause, single term, end of range
+ where = "index <= '%s'" % end_dt
+ results = [s for s in store.select(
+ 'df', where=where, chunksize=chunksize)]
+ result = concat(results)
+ tm.assert_frame_equal(expected, result)
+
+ # select w/iterator and where clause, inclusive range
+ where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
+ results = [s for s in store.select(
+ 'df', where=where, chunksize=chunksize)]
+ result = concat(results)
+ tm.assert_frame_equal(expected, result)
+
+ def test_select_iterator_non_complete_8014(self):
+
+ # GH 8014
+ # using iterator and where clause
+ chunksize = 1e4
+
+ # with iterator, non-complete range
+ with ensure_clean_store(self.path) as store:
+
+ expected = tm.makeTimeDataFrame(100064, 'S')
+ _maybe_remove(store, 'df')
+ store.append('df', expected)
+
+ beg_dt = expected.index[1]
+ end_dt = expected.index[-2]
+
+ # select w/iterator and where clause, single term, begin of range
+ where = "index >= '%s'" % beg_dt
+ results = [s for s in store.select(
+ 'df', where=where, chunksize=chunksize)]
+ result = concat(results)
+ rexpected = expected[expected.index >= beg_dt]
+ tm.assert_frame_equal(rexpected, result)
+
+ # select w/iterator and where clause, single term, end of range
+ where = "index <= '%s'" % end_dt
+ results = [s for s in store.select(
+ 'df', where=where, chunksize=chunksize)]
+ result = concat(results)
+ rexpected = expected[expected.index <= end_dt]
+ tm.assert_frame_equal(rexpected, result)
+
+ # select w/iterator and where clause, inclusive range
+ where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
+ results = [s for s in store.select(
+ 'df', where=where, chunksize=chunksize)]
+ result = concat(results)
+ rexpected = expected[(expected.index >= beg_dt) &
+ (expected.index <= end_dt)]
+ tm.assert_frame_equal(rexpected, result)
+
+ # with iterator, empty where
+ with ensure_clean_store(self.path) as store:
+
+ expected = tm.makeTimeDataFrame(100064, 'S')
+ _maybe_remove(store, 'df')
+ store.append('df', expected)
+
+ end_dt = expected.index[-1]
+
+ # select w/iterator and where clause, single term, begin of range
+ where = "index > '%s'" % end_dt
+ results = [s for s in store.select(
+ 'df', where=where, chunksize=chunksize)]
+ assert 0 == len(results)
+
+ def test_select_iterator_many_empty_frames(self):
+
+ # GH 8014
+ # using iterator and where clause can return many empty
+ # frames.
+ chunksize = int(1e4)
+
+ # with iterator, range limited to the first chunk
+ with ensure_clean_store(self.path) as store:
+
+ expected = tm.makeTimeDataFrame(100000, 'S')
+ _maybe_remove(store, 'df')
+ store.append('df', expected)
+
+ beg_dt = expected.index[0]
+ end_dt = expected.index[chunksize - 1]
+
+ # select w/iterator and where clause, single term, begin of range
+ where = "index >= '%s'" % beg_dt
+ results = [s for s in store.select(
+ 'df', where=where, chunksize=chunksize)]
+ result = concat(results)
+ rexpected = expected[expected.index >= beg_dt]
+ tm.assert_frame_equal(rexpected, result)
+
+ # select w/iterator and where clause, single term, end of range
+ where = "index <= '%s'" % end_dt
+ results = [s for s in store.select(
+ 'df', where=where, chunksize=chunksize)]
+
+ assert len(results) == 1
+ result = concat(results)
+ rexpected = expected[expected.index <= end_dt]
+ tm.assert_frame_equal(rexpected, result)
+
+ # select w/iterator and where clause, inclusive range
+ where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
+ results = [s for s in store.select(
+ 'df', where=where, chunksize=chunksize)]
+
+ # should be 1
+ assert len(results) == 1
+ result = concat(results)
+ rexpected = expected[(expected.index >= beg_dt) &
+ (expected.index <= end_dt)]
+ tm.assert_frame_equal(rexpected, result)
+
+ # select w/iterator and where clause which selects
+ # *nothing*.
+ #
+ # To be consistent with Python idiom I suggest this should
+ # return [] e.g. `for e in []: print True` never prints
+ # True.
+
+ where = "index <= '%s' & index >= '%s'" % (beg_dt, end_dt)
+ results = [s for s in store.select(
+ 'df', where=where, chunksize=chunksize)]
+
+ # should be []
+ assert len(results) == 0
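+
+ # Since the GH 8014 fix the iterator walks only the coordinates that
+ # match the where clause, so a clause selecting nothing yields zero
+ # chunks instead of a stream of empty frames.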
+
+ @pytest.mark.filterwarnings(
+ "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning"
+ )
+ def test_retain_index_attributes(self):
+
+ # GH 3499, losing frequency info on index recreation
+ df = DataFrame(dict(
+ A=Series(lrange(3),
+ index=date_range('2000-1-1', periods=3, freq='H'))))
+
+ with ensure_clean_store(self.path) as store:
+ _maybe_remove(store, 'data')
+ store.put('data', df, format='table')
+
+ result = store.get('data')
+ tm.assert_frame_equal(df, result)
+
+ for attr in ['freq', 'tz', 'name']:
+ for idx in ['index', 'columns']:
+ assert (getattr(getattr(df, idx), attr, None) ==
+ getattr(getattr(result, idx), attr, None))
+
+ # try to append a table with a different frequency
+ with catch_warnings(record=True):
+ df2 = DataFrame(dict(
+ A=Series(lrange(3),
+ index=date_range('2002-1-1',
+ periods=3, freq='D'))))
+ store.append('data', df2)
+
+ assert store.get_storer('data').info['index']['freq'] is None
+
+ # this is ok
+ _maybe_remove(store, 'df2')
+ df2 = DataFrame(dict(
+ A=Series(lrange(3),
+ index=[Timestamp('20010101'), Timestamp('20010102'),
+ Timestamp('20020101')])))
+ store.append('df2', df2)
+ df3 = DataFrame(dict(
+ A=Series(lrange(3),
+ index=date_range('2002-1-1', periods=3,
+ freq='D'))))
+ store.append('df2', df3)
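+
+ # Appending data whose index metadata (e.g. ``freq``) conflicts with
+ # what is already stored drops that attribute and emits an
+ # AttributeConflictWarning, which the filterwarnings mark above
+ # suppresses.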
+
+ @pytest.mark.filterwarnings(
+ "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning"
+ )
+ def test_retain_index_attributes2(self):
+ with ensure_clean_path(self.path) as path:
+
+ with catch_warnings(record=True):
+
+ df = DataFrame(dict(
+ A=Series(lrange(3),
+ index=date_range('2000-1-1',
+ periods=3, freq='H'))))
+ df.to_hdf(path, 'data', mode='w', append=True)
+ df2 = DataFrame(dict(
+ A=Series(lrange(3),
+ index=date_range('2002-1-1', periods=3,
+ freq='D'))))
+ df2.to_hdf(path, 'data', append=True)
+
+ idx = date_range('2000-1-1', periods=3, freq='H')
+ idx.name = 'foo'
+ df = DataFrame(dict(A=Series(lrange(3), index=idx)))
+ df.to_hdf(path, 'data', mode='w', append=True)
+
+ assert read_hdf(path, 'data').index.name == 'foo'
+
+ with catch_warnings(record=True):
+
+ idx2 = date_range('2001-1-1', periods=3, freq='H')
+ idx2.name = 'bar'
+ df2 = DataFrame(dict(A=Series(lrange(3), index=idx2)))
+ df2.to_hdf(path, 'data', append=True)
+
+ assert read_hdf(path, 'data').index.name is None
+
+ def test_panel_select(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ with catch_warnings(record=True):
+
+ wp = tm.makePanel()
+
+ store.put('wp', wp, format='table')
+ date = wp.major_axis[len(wp.major_axis) // 2]
+
+ crit1 = ('major_axis>=date')
+ crit2 = ("minor_axis=['A', 'D']")
+
+ result = store.select('wp', [crit1, crit2])
+ expected = wp.truncate(before=date).reindex(minor=['A', 'D'])
+ assert_panel_equal(result, expected)
+
+ result = store.select(
+ 'wp', ['major_axis>="20000124"',
+ ("minor_axis=['A', 'B']")])
+ expected = wp.truncate(
+ before='20000124').reindex(minor=['A', 'B'])
+ assert_panel_equal(result, expected)
+
+ def test_frame_select(self):
+
+ df = tm.makeTimeDataFrame()
+
+ with ensure_clean_store(self.path) as store:
+ store.put('frame', df, format='table')
+ date = df.index[len(df) // 2]
+
+ crit1 = Term('index>=date')
+ assert crit1.env.scope['date'] == date
+
+ crit2 = ("columns=['A', 'D']")
+ crit3 = ('columns=A')
+
+ result = store.select('frame', [crit1, crit2])
+ expected = df.loc[date:, ['A', 'D']]
+ tm.assert_frame_equal(result, expected)
+
+ result = store.select('frame', [crit3])
+ expected = df.loc[:, ['A']]
+ tm.assert_frame_equal(result, expected)
+
+ # invalid terms
+ df = tm.makeTimeDataFrame()
+ store.append('df_time', df)
+ pytest.raises(
+ ValueError, store.select, 'df_time', "index>0")
+
+ # can't select if not written as table
+ # store['frame'] = df
+ # pytest.raises(ValueError, store.select,
+ # 'frame', [crit1, crit2])
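+
+ # ``Term`` captures the calling scope when constructed, which is why
+ # ``crit1.env.scope['date']`` above already holds the local ``date``.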
+
+ def test_frame_select_complex(self):
+ # select via complex criteria
+
+ df = tm.makeTimeDataFrame()
+ df['string'] = 'foo'
+ df.loc[df.index[0:4], 'string'] = 'bar'
+
+ with ensure_clean_store(self.path) as store:
+ store.put('df', df, format='table', data_columns=['string'])
+
+ # empty
+ result = store.select('df', 'index>df.index[3] & string="bar"')
+ expected = df.loc[(df.index > df.index[3]) & (df.string == 'bar')]
+ tm.assert_frame_equal(result, expected)
+
+ result = store.select('df', 'index>df.index[3] & string="foo"')
+ expected = df.loc[(df.index > df.index[3]) & (df.string == 'foo')]
+ tm.assert_frame_equal(result, expected)
+
+ # or
+ result = store.select('df', 'index>df.index[3] | string="bar"')
+ expected = df.loc[(df.index > df.index[3]) | (df.string == 'bar')]
+ tm.assert_frame_equal(result, expected)
+
+ result = store.select('df', '(index>df.index[3] & '
+ 'index<=df.index[6]) | string="bar"')
+ expected = df.loc[((df.index > df.index[3]) & (
+ df.index <= df.index[6])) | (df.string == 'bar')]
+ tm.assert_frame_equal(result, expected)
+
+ # invert
+ result = store.select('df', 'string!="bar"')
+ expected = df.loc[df.string != 'bar']
+ tm.assert_frame_equal(result, expected)
+
+ # invert not implemented in numexpr :(
+ pytest.raises(NotImplementedError,
+ store.select, 'df', '~(string="bar")')
+
+ # invert ok for filters
+ result = store.select('df', "~(columns=['A','B'])")
+ expected = df.loc[:, df.columns.difference(['A', 'B'])]
+ tm.assert_frame_equal(result, expected)
+
+ # in
+ result = store.select(
+ 'df', "index>df.index[3] & columns in ['A','B']")
+ expected = df.loc[df.index > df.index[3]].reindex(columns=[
+ 'A', 'B'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_frame_select_complex2(self):
+
+ with ensure_clean_path(['parms.hdf', 'hist.hdf']) as paths:
+
+ pp, hh = paths
+
+ # use non-trivial selection criteria
+ parms = DataFrame({'A': [1, 1, 2, 2, 3]})
+ parms.to_hdf(pp, 'df', mode='w',
+ format='table', data_columns=['A'])
+
+ selection = read_hdf(pp, 'df', where='A=[2,3]')
+ hist = DataFrame(np.random.randn(25, 1),
+ columns=['data'],
+ index=MultiIndex.from_tuples(
+ [(i, j) for i in range(5)
+ for j in range(5)],
+ names=['l1', 'l2']))
+
+ hist.to_hdf(hh, 'df', mode='w', format='table')
+
+ expected = read_hdf(hh, 'df', where='l1=[2, 3, 4]')
+
+ # scope with list-like
+ l = selection.index.tolist() # noqa
+ store = HDFStore(hh)
+ result = store.select('df', where='l1=l')
+ assert_frame_equal(result, expected)
+ store.close()
+
+ result = read_hdf(hh, 'df', where='l1=l')
+ assert_frame_equal(result, expected)
+
+ # index
+ index = selection.index # noqa
+ result = read_hdf(hh, 'df', where='l1=index')
+ assert_frame_equal(result, expected)
+
+ result = read_hdf(hh, 'df', where='l1=selection.index')
+ assert_frame_equal(result, expected)
+
+ result = read_hdf(hh, 'df', where='l1=selection.index.tolist()')
+ assert_frame_equal(result, expected)
+
+ result = read_hdf(hh, 'df', where='l1=list(selection.index)')
+ assert_frame_equal(result, expected)
+
+ # scope with index
+ store = HDFStore(hh)
+
+ result = store.select('df', where='l1=index')
+ assert_frame_equal(result, expected)
+
+ result = store.select('df', where='l1=selection.index')
+ assert_frame_equal(result, expected)
+
+ result = store.select('df', where='l1=selection.index.tolist()')
+ assert_frame_equal(result, expected)
+
+ result = store.select('df', where='l1=list(selection.index)')
+ assert_frame_equal(result, expected)
+
+ store.close()
+
+ def test_invalid_filtering(self):
+
+ # can't use more than one filter (atm)
+
+ df = tm.makeTimeDataFrame()
+
+ with ensure_clean_store(self.path) as store:
+ store.put('df', df, format='table')
+
+ # not implemented
+ pytest.raises(NotImplementedError, store.select,
+ 'df', "columns=['A'] | columns=['B']")
+
+ # in theory we could deal with this
+ pytest.raises(NotImplementedError, store.select,
+ 'df', "columns=['A','B'] & columns=['C']")
+
+ def test_string_select(self):
+ # GH 2973
+ with ensure_clean_store(self.path) as store:
+
+ df = tm.makeTimeDataFrame()
+
+ # test string ==/!=
+ df['x'] = 'none'
+ df.loc[2:7, 'x'] = ''
+
+ store.append('df', df, data_columns=['x'])
+
+ result = store.select('df', 'x=none')
+ expected = df[df.x == 'none']
+ assert_frame_equal(result, expected)
+
+ try:
+ result = store.select('df', 'x!=none')
+ expected = df[df.x != 'none']
+ assert_frame_equal(result, expected)
+ except Exception as detail:
+ pprint_thing("[{0}]".format(detail))
+ pprint_thing(store)
+ pprint_thing(expected)
+
+ df2 = df.copy()
+ df2.loc[df2.x == '', 'x'] = np.nan
+
+ store.append('df2', df2, data_columns=['x'])
+ result = store.select('df2', 'x!=none')
+ expected = df2[isna(df2.x)]
+ assert_frame_equal(result, expected)
+
+ # int ==/!=
+ df['int'] = 1
+ df.loc[2:7, 'int'] = 2
+
+ store.append('df3', df, data_columns=['int'])
+
+ result = store.select('df3', 'int=2')
+ expected = df[df.int == 2]
+ assert_frame_equal(result, expected)
+
+ result = store.select('df3', 'int!=2')
+ expected = df[df.int != 2]
+ assert_frame_equal(result, expected)
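+
+ # For string data columns NaN is serialized using the default
+ # ``nan_rep`` ('nan'), so ``x!=none`` matches exactly the rows that
+ # were set to NaN above.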
+
+ def test_read_column(self):
+
+ df = tm.makeTimeDataFrame()
+
+ with ensure_clean_store(self.path) as store:
+ _maybe_remove(store, 'df')
+
+ # GH 17912
+ # HDFStore.select_column should raise a KeyError
+ # exception if the key is not a valid store
+ with pytest.raises(KeyError,
+ match='No object named df in the file'):
+ store.select_column('df', 'index')
+
+ store.append('df', df)
+ # error
+ pytest.raises(KeyError, store.select_column, 'df', 'foo')
+
+ def f():
+ store.select_column('df', 'index', where=['index>5'])
+ pytest.raises(Exception, f)
+
+ # valid
+ result = store.select_column('df', 'index')
+ tm.assert_almost_equal(result.values, Series(df.index).values)
+ assert isinstance(result, Series)
+
+ # not a data indexable column
+ pytest.raises(
+ ValueError, store.select_column, 'df', 'values_block_0')
+
+ # a data column
+ df2 = df.copy()
+ df2['string'] = 'foo'
+ store.append('df2', df2, data_columns=['string'])
+ result = store.select_column('df2', 'string')
+ tm.assert_almost_equal(result.values, df2['string'].values)
+
+ # a data column with NaNs, result excludes the NaNs
+ df3 = df.copy()
+ df3['string'] = 'foo'
+ df3.loc[4:6, 'string'] = np.nan
+ store.append('df3', df3, data_columns=['string'])
+ result = store.select_column('df3', 'string')
+ tm.assert_almost_equal(result.values, df3['string'].values)
+
+ # start/stop
+ result = store.select_column('df3', 'string', start=2)
+ tm.assert_almost_equal(result.values, df3['string'].values[2:])
+
+ result = store.select_column('df3', 'string', start=-2)
+ tm.assert_almost_equal(result.values, df3['string'].values[-2:])
+
+ result = store.select_column('df3', 'string', stop=2)
+ tm.assert_almost_equal(result.values, df3['string'].values[:2])
+
+ result = store.select_column('df3', 'string', stop=-2)
+ tm.assert_almost_equal(result.values, df3['string'].values[:-2])
+
+ result = store.select_column('df3', 'string', start=2, stop=-2)
+ tm.assert_almost_equal(result.values, df3['string'].values[2:-2])
+
+ result = store.select_column('df3', 'string', start=-2, stop=2)
+ tm.assert_almost_equal(result.values, df3['string'].values[-2:2])
+
+ # GH 10392 - make sure column name is preserved
+ df4 = DataFrame({'A': np.random.randn(10), 'B': 'foo'})
+ store.append('df4', df4, data_columns=True)
+ expected = df4['B']
+ result = store.select_column('df4', 'B')
+ tm.assert_series_equal(result, expected)
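+
+ # ``select_column`` reads a single indexable or data column without
+ # materializing the rest of the table; ``start``/``stop`` slice the
+ # stored rows positionally, mirroring the ``values[start:stop]``
+ # comparisons above.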
+
+ def test_coordinates(self):
+ df = tm.makeTimeDataFrame()
+
+ with ensure_clean_store(self.path) as store:
+
+ _maybe_remove(store, 'df')
+ store.append('df', df)
+
+ # all
+ c = store.select_as_coordinates('df')
+ assert((c.values == np.arange(len(df.index))).all())
+
+ # get coordinates back & test vs frame
+ _maybe_remove(store, 'df')
+
+ df = DataFrame(dict(A=lrange(5), B=lrange(5)))
+ store.append('df', df)
+ c = store.select_as_coordinates('df', ['index<3'])
+ assert((c.values == np.arange(3)).all())
+ result = store.select('df', where=c)
+ expected = df.loc[0:2, :]
+ tm.assert_frame_equal(result, expected)
+
+ c = store.select_as_coordinates('df', ['index>=3', 'index<=4'])
+ assert((c.values == np.arange(2) + 3).all())
+ result = store.select('df', where=c)
+ expected = df.loc[3:4, :]
+ tm.assert_frame_equal(result, expected)
+ assert isinstance(c, Index)
+
+ # multiple tables
+ _maybe_remove(store, 'df1')
+ _maybe_remove(store, 'df2')
+ df1 = tm.makeTimeDataFrame()
+ df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
+ store.append('df1', df1, data_columns=['A', 'B'])
+ store.append('df2', df2)
+
+ c = store.select_as_coordinates('df1', ['A>0', 'B>0'])
+ df1_result = store.select('df1', c)
+ df2_result = store.select('df2', c)
+ result = concat([df1_result, df2_result], axis=1)
+
+ expected = concat([df1, df2], axis=1)
+ expected = expected[(expected.A > 0) & (expected.B > 0)]
+ tm.assert_frame_equal(result, expected)
+
+ # pass array/mask as the coordinates
+ with ensure_clean_store(self.path) as store:
+
+ df = DataFrame(np.random.randn(1000, 2),
+ index=date_range('20000101', periods=1000))
+ store.append('df', df)
+ c = store.select_column('df', 'index')
+ where = c[DatetimeIndex(c).month == 5].index
+ expected = df.iloc[where]
+
+ # locations
+ result = store.select('df', where=where)
+ tm.assert_frame_equal(result, expected)
+
+ # passing the same coordinates a second time
+ result = store.select('df', where=where)
+ tm.assert_frame_equal(result, expected)
+
+ # invalid
+ pytest.raises(ValueError, store.select, 'df',
+ where=np.arange(len(df), dtype='float64'))
+ pytest.raises(ValueError, store.select, 'df',
+ where=np.arange(len(df) + 1))
+ pytest.raises(ValueError, store.select, 'df',
+ where=np.arange(len(df)), start=5)
+ pytest.raises(ValueError, store.select, 'df',
+ where=np.arange(len(df)), start=5, stop=10)
+
+ # selection with filter
+ selection = date_range('20000101', periods=500)
+ result = store.select('df', where='index in selection')
+ expected = df[df.index.isin(selection)]
+ tm.assert_frame_equal(result, expected)
+
+ # list
+ df = DataFrame(np.random.randn(10, 2))
+ store.append('df2', df)
+ result = store.select('df2', where=[0, 3, 5])
+ expected = df.iloc[[0, 3, 5]]
+ tm.assert_frame_equal(result, expected)
+
+ # boolean
+ where = [True] * 10
+ where[-2] = False
+ result = store.select('df2', where=where)
+ expected = df.loc[where]
+ tm.assert_frame_equal(result, expected)
+
+ # start/stop
+ result = store.select('df2', start=5, stop=10)
+ expected = df[5:10]
+ tm.assert_frame_equal(result, expected)
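+
+ # Coordinates from ``select_as_coordinates`` are plain row numbers
+ # (an Index), so the same coordinates can drive selects against any
+ # table with matching row order, as with df1/df2 above.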
+
+ def test_append_to_multiple(self):
+ df1 = tm.makeTimeDataFrame()
+ df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
+ df2['foo'] = 'bar'
+ df = concat([df1, df2], axis=1)
+
+ with ensure_clean_store(self.path) as store:
+
+ # exceptions
+ pytest.raises(ValueError, store.append_to_multiple,
+ {'df1': ['A', 'B'], 'df2': None}, df,
+ selector='df3')
+ pytest.raises(ValueError, store.append_to_multiple,
+ {'df1': None, 'df2': None}, df, selector='df3')
+ pytest.raises(
+ ValueError, store.append_to_multiple, 'df1', df, 'df1')
+
+ # regular operation
+ store.append_to_multiple(
+ {'df1': ['A', 'B'], 'df2': None}, df, selector='df1')
+ result = store.select_as_multiple(
+ ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1')
+ expected = df[(df.A > 0) & (df.B > 0)]
+ tm.assert_frame_equal(result, expected)
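+
+ # Sketch of the split/recombine round trip (names ``st``/``frame``/
+ # ``t1``/``t2`` are placeholders):
+ #
+ #     st.append_to_multiple({'t1': ['A', 'B'], 't2': None}, frame,
+ #                           selector='t1')
+ #     st.select_as_multiple(['t1', 't2'], where=['A>0'], selector='t1')
+ #
+ # ``None`` means "all remaining columns"; the selector table's where
+ # results pick the rows returned from every table.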
+
+ def test_append_to_multiple_dropna(self):
+ df1 = tm.makeTimeDataFrame()
+ df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
+ df1.iloc[1, df1.columns.get_indexer(['A', 'B'])] = np.nan
+ df = concat([df1, df2], axis=1)
+
+ with ensure_clean_store(self.path) as store:
+
+ # dropna=True should guarantee rows are synchronized
+ store.append_to_multiple(
+ {'df1': ['A', 'B'], 'df2': None}, df, selector='df1',
+ dropna=True)
+ result = store.select_as_multiple(['df1', 'df2'])
+ expected = df.dropna()
+ tm.assert_frame_equal(result, expected)
+ tm.assert_index_equal(store.select('df1').index,
+ store.select('df2').index)
+
+ @pytest.mark.xfail(run=False,
+ reason="append_to_multiple_dropna_false "
+ "does not raise as expected")
+ def test_append_to_multiple_dropna_false(self):
+ df1 = tm.makeTimeDataFrame()
+ df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
+ df1.iloc[1, df1.columns.get_indexer(['A', 'B'])] = np.nan
+ df = concat([df1, df2], axis=1)
+
+ with ensure_clean_store(self.path) as store:
+
+ # dropna=False shouldn't synchronize row indexes
+ store.append_to_multiple(
+ {'df1a': ['A', 'B'], 'df2a': None}, df, selector='df1a',
+ dropna=False)
+
+ with pytest.raises(ValueError):
+ store.select_as_multiple(['df1a', 'df2a'])
+
+ assert not store.select('df1a').index.equals(
+ store.select('df2a').index)
+
+ def test_select_as_multiple(self):
+
+ df1 = tm.makeTimeDataFrame()
+ df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
+ df2['foo'] = 'bar'
+
+ with ensure_clean_store(self.path) as store:
+
+ # no tables stored
+ pytest.raises(Exception, store.select_as_multiple,
+ None, where=['A>0', 'B>0'], selector='df1')
+
+ store.append('df1', df1, data_columns=['A', 'B'])
+ store.append('df2', df2)
+
+ # exceptions
+ pytest.raises(Exception, store.select_as_multiple,
+ None, where=['A>0', 'B>0'], selector='df1')
+ pytest.raises(Exception, store.select_as_multiple,
+ [None], where=['A>0', 'B>0'], selector='df1')
+ pytest.raises(KeyError, store.select_as_multiple,
+ ['df1', 'df3'], where=['A>0', 'B>0'],
+ selector='df1')
+ pytest.raises(KeyError, store.select_as_multiple,
+ ['df3'], where=['A>0', 'B>0'], selector='df1')
+ pytest.raises(KeyError, store.select_as_multiple,
+ ['df1', 'df2'], where=['A>0', 'B>0'],
+ selector='df4')
+
+ # default select
+ result = store.select('df1', ['A>0', 'B>0'])
+ expected = store.select_as_multiple(
+ ['df1'], where=['A>0', 'B>0'], selector='df1')
+ tm.assert_frame_equal(result, expected)
+ expected = store.select_as_multiple(
+ 'df1', where=['A>0', 'B>0'], selector='df1')
+ tm.assert_frame_equal(result, expected)
+
+ # multiple
+ result = store.select_as_multiple(
+ ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1')
+ expected = concat([df1, df2], axis=1)
+ expected = expected[(expected.A > 0) & (expected.B > 0)]
+ tm.assert_frame_equal(result, expected)
+
+ # multiple (diff selector)
+ result = store.select_as_multiple(
+ ['df1', 'df2'], where='index>df2.index[4]', selector='df2')
+ expected = concat([df1, df2], axis=1)
+ expected = expected[5:]
+ tm.assert_frame_equal(result, expected)
+
+ # test exception for differing row counts
+ store.append('df3', tm.makeTimeDataFrame(nper=50))
+ pytest.raises(ValueError, store.select_as_multiple,
+ ['df1', 'df3'], where=['A>0', 'B>0'],
+ selector='df1')
+
+ @pytest.mark.skipif(
+ LooseVersion(tables.__version__) < LooseVersion('3.1.0'),
+ reason=("tables version does not support fix for nan selection "
+ "bug: GH 4858"))
+ def test_nan_selection_bug_4858(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ df = DataFrame(dict(cols=range(6), values=range(6)),
+ dtype='float64')
+ df['cols'] = (df['cols'] + 10).apply(str)
+ df.iloc[0] = np.nan
+
+ expected = DataFrame(dict(cols=['13.0', '14.0', '15.0'], values=[
+ 3., 4., 5.]), index=[3, 4, 5])
+
+ # write w/o the index on that particular column
+ store.append('df', df, data_columns=True, index=['cols'])
+ result = store.select('df', where='values>2.0')
+ assert_frame_equal(result, expected)
+
+ def test_start_stop_table(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ # table
+ df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
+ store.append('df', df)
+
+ result = store.select(
+ 'df', "columns=['A']", start=0, stop=5)
+ expected = df.loc[0:4, ['A']]
+ tm.assert_frame_equal(result, expected)
+
+ # out of range
+ result = store.select(
+ 'df', "columns=['A']", start=30, stop=40)
+ assert len(result) == 0
+ expected = df.loc[30:40, ['A']]
+ tm.assert_frame_equal(result, expected)
+
+ def test_start_stop_multiple(self):
+
+ # GH 16209
+ with ensure_clean_store(self.path) as store:
+
+ df = DataFrame({"foo": [1, 2], "bar": [1, 2]})
+
+ store.append_to_multiple({'selector': ['foo'], 'data': None}, df,
+ selector='selector')
+ result = store.select_as_multiple(['selector', 'data'],
+ selector='selector', start=0,
+ stop=1)
+ expected = df.loc[[0], ['foo', 'bar']]
+ tm.assert_frame_equal(result, expected)
+
+ def test_start_stop_fixed(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ # fixed, GH 8287
+ df = DataFrame(dict(A=np.random.rand(20),
+ B=np.random.rand(20)),
+ index=pd.date_range('20130101', periods=20))
+ store.put('df', df)
+
+ result = store.select(
+ 'df', start=0, stop=5)
+ expected = df.iloc[0:5, :]
+ tm.assert_frame_equal(result, expected)
+
+ result = store.select(
+ 'df', start=5, stop=10)
+ expected = df.iloc[5:10, :]
+ tm.assert_frame_equal(result, expected)
+
+ # out of range
+ result = store.select(
+ 'df', start=30, stop=40)
+ expected = df.iloc[30:40, :]
+ tm.assert_frame_equal(result, expected)
+
+ # series
+ s = df.A
+ store.put('s', s)
+ result = store.select('s', start=0, stop=5)
+ expected = s.iloc[0:5]
+ tm.assert_series_equal(result, expected)
+
+ result = store.select('s', start=5, stop=10)
+ expected = s.iloc[5:10]
+ tm.assert_series_equal(result, expected)
+
+ # sparse; not implemented
+ df = tm.makeDataFrame()
+ df.iloc[3:5, 1:3] = np.nan
+ df.iloc[8:10, -2] = np.nan
+ dfs = df.to_sparse()
+ store.put('dfs', dfs)
+ with pytest.raises(NotImplementedError):
+ store.select('dfs', start=0, stop=5)
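+
+ # For fixed-format storage ``start``/``stop`` slice rows positionally
+ # (GH 8287); table format additionally supports where clauses, while
+ # the sparse formats above do not implement partial reads at all.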
+
+ def test_select_filter_corner(self):
+
+ df = DataFrame(np.random.randn(50, 100))
+ df.index = ['%.3d' % c for c in df.index]
+ df.columns = ['%.3d' % c for c in df.columns]
+
+ with ensure_clean_store(self.path) as store:
+ store.put('frame', df, format='table')
+
+ crit = 'columns=df.columns[:75]'
+ result = store.select('frame', [crit])
+ tm.assert_frame_equal(result, df.loc[:, df.columns[:75]])
+
+ crit = 'columns=df.columns[:75:2]'
+ result = store.select('frame', [crit])
+ tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]])
+
+ def test_path_pathlib(self):
+ df = tm.makeDataFrame()
+
+ result = tm.round_trip_pathlib(
+ lambda p: df.to_hdf(p, 'df'),
+ lambda p: pd.read_hdf(p, 'df'))
+ tm.assert_frame_equal(df, result)
+
+ @pytest.mark.parametrize('start, stop', [(0, 2), (1, 2), (None, None)])
+ def test_contiguous_mixed_data_table(self, start, stop):
+ # GH 17021
+ # ValueError when reading a contiguous mixed-data table with a VLArray
+ df = DataFrame({'a': Series([20111010, 20111011, 20111012]),
+ 'b': Series(['ab', 'cd', 'ab'])})
+
+ with ensure_clean_store(self.path) as store:
+ store.append('test_dataset', df)
+
+ result = store.select('test_dataset', start=start, stop=stop)
+ assert_frame_equal(df[start:stop], result)
+
+ def test_path_pathlib_hdfstore(self):
+ df = tm.makeDataFrame()
+
+ def writer(path):
+ with pd.HDFStore(path) as store:
+ df.to_hdf(store, 'df')
+
+ def reader(path):
+ with pd.HDFStore(path) as store:
+ return pd.read_hdf(store, 'df')
+
+ result = tm.round_trip_pathlib(writer, reader)
+ tm.assert_frame_equal(df, result)
+
+ def test_pickle_path_localpath(self):
+ df = tm.makeDataFrame()
+ result = tm.round_trip_localpath(
+ lambda p: df.to_hdf(p, 'df'),
+ lambda p: pd.read_hdf(p, 'df'))
+ tm.assert_frame_equal(df, result)
+
+ def test_path_localpath_hdfstore(self):
+ df = tm.makeDataFrame()
+
+ def writer(path):
+ with pd.HDFStore(path) as store:
+ df.to_hdf(store, 'df')
+
+ def reader(path):
+ with pd.HDFStore(path) as store:
+ return pd.read_hdf(store, 'df')
+
+ result = tm.round_trip_localpath(writer, reader)
+ tm.assert_frame_equal(df, result)
+
+ def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
+
+ options = {}
+ if compression:
+ options['complib'] = _default_compressor
+
+ with ensure_clean_store(self.path, 'w', **options) as store:
+ store['obj'] = obj
+ retrieved = store['obj']
+ comparator(retrieved, obj, **kwargs)
+
+ def _check_double_roundtrip(self, obj, comparator, compression=False,
+ **kwargs):
+ options = {}
+ if compression:
+ options['complib'] = compression or _default_compressor
+
+ with ensure_clean_store(self.path, 'w', **options) as store:
+ store['obj'] = obj
+ retrieved = store['obj']
+ comparator(retrieved, obj, **kwargs)
+ store['obj'] = retrieved
+ again = store['obj']
+ comparator(again, obj, **kwargs)
+
+ def _check_roundtrip_table(self, obj, comparator, compression=False):
+ options = {}
+ if compression:
+ options['complib'] = _default_compressor
+
+ with ensure_clean_store(self.path, 'w', **options) as store:
+ store.put('obj', obj, format='table')
+ retrieved = store['obj']
+
+ comparator(retrieved, obj)
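+
+ # The helpers above write ``obj`` to a fresh store (optionally with
+ # the default compressor), read it back and hand both objects to
+ # ``comparator``; the double round trip re-writes the retrieved copy
+ # to catch anything lost only on a second pass.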
+
+ def test_multiple_open_close(self):
+ # gh-4409: open & close multiple times
+
+ with ensure_clean_path(self.path) as path:
+
+ df = tm.makeDataFrame()
+ df.to_hdf(path, 'df', mode='w', format='table')
+
+ # single
+ store = HDFStore(path)
+ assert 'CLOSED' not in store.info()
+ assert store.is_open
+
+ store.close()
+ assert 'CLOSED' in store.info()
+ assert not store.is_open
+
+ with ensure_clean_path(self.path) as path:
+
+ if pytables._table_file_open_policy_is_strict:
+
+ # multiples
+ store1 = HDFStore(path)
+
+ def f():
+ HDFStore(path)
+ pytest.raises(ValueError, f)
+ store1.close()
+
+ else:
+
+ # multiples
+ store1 = HDFStore(path)
+ store2 = HDFStore(path)
+
+ assert 'CLOSED' not in store1.info()
+ assert 'CLOSED' not in store2.info()
+ assert store1.is_open
+ assert store2.is_open
+
+ store1.close()
+ assert 'CLOSED' in store1.info()
+ assert not store1.is_open
+ assert 'CLOSED' not in store2.info()
+ assert store2.is_open
+
+ store2.close()
+ assert 'CLOSED' in store1.info()
+ assert 'CLOSED' in store2.info()
+ assert not store1.is_open
+ assert not store2.is_open
+
+ # nested close
+ store = HDFStore(path, mode='w')
+ store.append('df', df)
+
+ store2 = HDFStore(path)
+ store2.append('df2', df)
+ store2.close()
+ assert 'CLOSED' in store2.info()
+ assert not store2.is_open
+
+ store.close()
+ assert 'CLOSED' in store.info()
+ assert not store.is_open
+
+ # double closing
+ store = HDFStore(path, mode='w')
+ store.append('df', df)
+
+ store2 = HDFStore(path)
+ store.close()
+ assert 'CLOSED' in store.info()
+ assert not store.is_open
+
+ store2.close()
+ assert 'CLOSED' in store2.info()
+ assert not store2.is_open
+
+ # ops on a closed store
+ with ensure_clean_path(self.path) as path:
+
+ df = tm.makeDataFrame()
+ df.to_hdf(path, 'df', mode='w', format='table')
+
+ store = HDFStore(path)
+ store.close()
+
+ pytest.raises(ClosedFileError, store.keys)
+ pytest.raises(ClosedFileError, lambda: 'df' in store)
+ pytest.raises(ClosedFileError, lambda: len(store))
+ pytest.raises(ClosedFileError, lambda: store['df'])
+ pytest.raises(AttributeError, lambda: store.df)
+ pytest.raises(ClosedFileError, store.select, 'df')
+ pytest.raises(ClosedFileError, store.get, 'df')
+ pytest.raises(ClosedFileError, store.append, 'df2', df)
+ pytest.raises(ClosedFileError, store.put, 'df3', df)
+ pytest.raises(ClosedFileError, store.get_storer, 'df2')
+ pytest.raises(ClosedFileError, store.remove, 'df2')
+
+ with pytest.raises(ClosedFileError, match='file is not open'):
+ store.select('df')
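+
+ # Whether two handles may open one file concurrently depends on the
+ # HDF5 file-open policy, surfaced by PyTables as
+ # ``_table_file_open_policy_is_strict``; hence the branch above.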
+
+ def test_pytables_native_read(self, datapath):
+ with ensure_clean_store(
+ datapath('io', 'data', 'legacy_hdf/pytables_native.h5'),
+ mode='r') as store:
+ d2 = store['detector/readout']
+ assert isinstance(d2, DataFrame)
+
+ @pytest.mark.skipif(PY35 and is_platform_windows(),
+ reason="native2 read fails oddly on windows / 3.5")
+ def test_pytables_native2_read(self, datapath):
+ with ensure_clean_store(
+ datapath('io', 'data', 'legacy_hdf', 'pytables_native2.h5'),
+ mode='r') as store:
+ str(store)
+ d1 = store['detector']
+ assert isinstance(d1, DataFrame)
+
+ @xfail_non_writeable
+ def test_legacy_table_fixed_format_read_py2(self, datapath):
+ # GH 24510
+ # legacy table with fixed format written in Python 2
+ with ensure_clean_store(
+ datapath('io', 'data', 'legacy_hdf',
+ 'legacy_table_fixed_py2.h5'),
+ mode='r') as store:
+ result = store.select('df')
+ expected = pd.DataFrame([[1, 2, 3, 'D']],
+ columns=['A', 'B', 'C', 'D'],
+ index=pd.Index(['ABC'],
+ name='INDEX_NAME'))
+ assert_frame_equal(expected, result)
+
+ def test_legacy_table_read_py2(self, datapath):
+ # issue: 24925
+ # legacy table written in Python 2
+ with ensure_clean_store(
+ datapath('io', 'data', 'legacy_hdf',
+ 'legacy_table_py2.h5'),
+ mode='r') as store:
+ result = store.select('table')
+
+ expected = pd.DataFrame({
+ "a": ["a", "b"],
+ "b": [2, 3]
+ })
+ assert_frame_equal(expected, result)
+
+ def test_legacy_table_read(self, datapath):
+ # legacy table types
+ with ensure_clean_store(
+ datapath('io', 'data', 'legacy_hdf', 'legacy_table.h5'),
+ mode='r') as store:
+
+ with catch_warnings():
+ simplefilter("ignore", pd.io.pytables.IncompatibilityWarning)
+ store.select('df1')
+ store.select('df2')
+ store.select('wp1')
+
+ # force the frame
+ store.select('df2', typ='legacy_frame')
+
+ # old version warning
+ pytest.raises(
+ Exception, store.select, 'wp1', 'minor_axis=B')
+
+ df2 = store.select('df2')
+ result = store.select('df2', 'index>df2.index[2]')
+ expected = df2[df2.index > df2.index[2]]
+ assert_frame_equal(expected, result)
+
+ def test_copy(self):
+
+ with catch_warnings(record=True):
+
+ def do_copy(f, new_f=None, keys=None,
+ propindexes=True, **kwargs):
+ try:
+ store = HDFStore(f, 'r')
+
+ if new_f is None:
+ import tempfile
+ fd, new_f = tempfile.mkstemp()
+
+ tstore = store.copy(
+ new_f, keys=keys, propindexes=propindexes, **kwargs)
+
+ # check keys
+ if keys is None:
+ keys = store.keys()
+ assert set(keys) == set(tstore.keys())
+
+ # check indices & nrows
+ for k in tstore.keys():
+ if tstore.get_storer(k).is_table:
+ new_t = tstore.get_storer(k)
+ orig_t = store.get_storer(k)
+
+ assert orig_t.nrows == new_t.nrows
+
+ # check propindexes
+ if propindexes:
+ for a in orig_t.axes:
+ if a.is_indexed:
+ assert new_t[a.name].is_indexed
+
+ finally:
+ safe_close(store)
+ safe_close(tstore)
+ try:
+ os.close(fd)
+ except (OSError, ValueError):
+ pass
+ safe_remove(new_f)
+
+ # new table
+ df = tm.makeDataFrame()
+
+ try:
+ path = create_tempfile(self.path)
+ st = HDFStore(path)
+ st.append('df', df, data_columns=['A'])
+ st.close()
+ do_copy(f=path)
+ do_copy(f=path, propindexes=False)
+ finally:
+ safe_remove(path)
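+
+ # ``HDFStore.copy`` rewrites the selected keys into a new file; with
+ # ``propindexes=True`` it also recreates PyTables column indexes,
+ # which ``do_copy`` verifies through ``is_indexed``.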
+
+ def test_store_datetime_fractional_secs(self):
+
+ with ensure_clean_store(self.path) as store:
+ dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456)
+ series = Series([0], [dt])
+ store['a'] = series
+ assert store['a'].index[0] == dt
+
+ def test_tseries_indices_series(self):
+
+ with ensure_clean_store(self.path) as store:
+ idx = tm.makeDateIndex(10)
+ ser = Series(np.random.randn(len(idx)), idx)
+ store['a'] = ser
+ result = store['a']
+
+ tm.assert_series_equal(result, ser)
+ assert result.index.freq == ser.index.freq
+ tm.assert_class_equal(result.index, ser.index, obj="series index")
+
+ idx = tm.makePeriodIndex(10)
+ ser = Series(np.random.randn(len(idx)), idx)
+ store['a'] = ser
+ result = store['a']
+
+ tm.assert_series_equal(result, ser)
+ assert result.index.freq == ser.index.freq
+ tm.assert_class_equal(result.index, ser.index, obj="series index")
+
+ def test_tseries_indices_frame(self):
+
+ with ensure_clean_store(self.path) as store:
+ idx = tm.makeDateIndex(10)
+ df = DataFrame(np.random.randn(len(idx), 3), index=idx)
+ store['a'] = df
+ result = store['a']
+
+ assert_frame_equal(result, df)
+ assert result.index.freq == df.index.freq
+ tm.assert_class_equal(result.index, df.index,
+ obj="dataframe index")
+
+ idx = tm.makePeriodIndex(10)
+ df = DataFrame(np.random.randn(len(idx), 3), idx)
+ store['a'] = df
+ result = store['a']
+
+ assert_frame_equal(result, df)
+ assert result.index.freq == df.index.freq
+ tm.assert_class_equal(result.index, df.index,
+ obj="dataframe index")
+
+ def test_unicode_index(self):
+
+ unicode_values = [u('\u03c3'), u('\u03c3\u03c3')]
+
+ # PerformanceWarning
+ with catch_warnings(record=True):
+ simplefilter("ignore", pd.errors.PerformanceWarning)
+ s = Series(np.random.randn(len(unicode_values)), unicode_values)
+ self._check_roundtrip(s, tm.assert_series_equal)
+
+ def test_unicode_longer_encoded(self):
+ # GH 11234
+ char = '\u0394'
+ df = pd.DataFrame({'A': [char]})
+ with ensure_clean_store(self.path) as store:
+ store.put('df', df, format='table', encoding='utf-8')
+ result = store.get('df')
+ tm.assert_frame_equal(result, df)
+
+ df = pd.DataFrame({'A': ['a', char], 'B': ['b', 'b']})
+ with ensure_clean_store(self.path) as store:
+ store.put('df', df, format='table', encoding='utf-8')
+ result = store.get('df')
+ tm.assert_frame_equal(result, df)
+
+ @xfail_non_writeable
+ def test_store_datetime_mixed(self):
+
+ df = DataFrame(
+ {'a': [1, 2, 3], 'b': [1., 2., 3.], 'c': ['a', 'b', 'c']})
+ ts = tm.makeTimeSeries()
+ df['d'] = ts.index[:3]
+ self._check_roundtrip(df, tm.assert_frame_equal)
+
+ # def test_cant_write_multiindex_table(self):
+ # # for now, #1848
+ # df = DataFrame(np.random.randn(10, 4),
+ # index=[np.arange(5).repeat(2),
+ # np.tile(np.arange(2), 5)])
+
+ # pytest.raises(Exception, store.put, 'foo', df, format='table')
+
+ def test_append_with_diff_col_name_types_raises_value_error(self):
+ df = DataFrame(np.random.randn(10, 1))
+ df2 = DataFrame({'a': np.random.randn(10)})
+ df3 = DataFrame({(1, 2): np.random.randn(10)})
+ df4 = DataFrame({('1', 2): np.random.randn(10)})
+ df5 = DataFrame({('1', 2, object): np.random.randn(10)})
+
+ with ensure_clean_store(self.path) as store:
+ name = 'df_%s' % tm.rands(10)
+ store.append(name, df)
+
+ for d in (df2, df3, df4, df5):
+ with pytest.raises(ValueError):
+ store.append(name, d)
+
+ def test_query_with_nested_special_character(self):
+ df = DataFrame({'a': ['a', 'a', 'c', 'b',
+ 'test & test', 'c', 'b', 'e'],
+ 'b': [1, 2, 3, 4, 5, 6, 7, 8]})
+ expected = df[df.a == 'test & test']
+ with ensure_clean_store(self.path) as store:
+ store.append('test', df, format='table', data_columns=True)
+ result = store.select('test', 'a = "test & test"')
+ tm.assert_frame_equal(expected, result)
+
+ def test_categorical(self):
+
+ with ensure_clean_store(self.path) as store:
+
+ # Basic
+ _maybe_remove(store, 's')
+ s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[
+ 'a', 'b', 'c', 'd'], ordered=False))
+ store.append('s', s, format='table')
+ result = store.select('s')
+ tm.assert_series_equal(s, result)
+
+ _maybe_remove(store, 's_ordered')
+ s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[
+ 'a', 'b', 'c', 'd'], ordered=True))
+ store.append('s_ordered', s, format='table')
+ result = store.select('s_ordered')
+ tm.assert_series_equal(s, result)
+
+ _maybe_remove(store, 'df')
+ df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]})
+ store.append('df', df, format='table')
+ result = store.select('df')
+ tm.assert_frame_equal(result, df)
+
+ # Dtypes
+ _maybe_remove(store, 'si')
+ s = Series([1, 1, 2, 2, 3, 4, 5]).astype('category')
+ store.append('si', s)
+ result = store.select('si')
+ tm.assert_series_equal(result, s)
+
+ _maybe_remove(store, 'si2')
+ s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype('category')
+ store.append('si2', s)
+ result = store.select('si2')
+ tm.assert_series_equal(result, s)
+
+ # Multiple
+ _maybe_remove(store, 'df2')
+ df2 = df.copy()
+ df2['s2'] = Series(list('abcdefg')).astype('category')
+ store.append('df2', df2)
+ result = store.select('df2')
+ tm.assert_frame_equal(result, df2)
+
+ # Make sure the metadata is OK
+ info = store.info()
+ assert '/df2 ' in info
+ # assert '/df2/meta/values_block_0/meta' in info
+ assert '/df2/meta/values_block_1/meta' in info
+
+ # unordered
+ _maybe_remove(store, 's2')
+ s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[
+ 'a', 'b', 'c', 'd'], ordered=False))
+ store.append('s2', s, format='table')
+ result = store.select('s2')
+ tm.assert_series_equal(result, s)
+
+ # Query
+ _maybe_remove(store, 'df3')
+ store.append('df3', df, data_columns=['s'])
+ expected = df[df.s.isin(['b', 'c'])]
+ result = store.select('df3', where=['s in ["b","c"]'])
+ tm.assert_frame_equal(result, expected)
+
+ expected = df[df.s.isin(['b', 'c'])]
+ result = store.select('df3', where=['s = ["b","c"]'])
+ tm.assert_frame_equal(result, expected)
+
+ expected = df[df.s.isin(['d'])]
+ result = store.select('df3', where=['s in ["d"]'])
+ tm.assert_frame_equal(result, expected)
+
+ expected = df[df.s.isin(['f'])]
+ result = store.select('df3', where=['s in ["f"]'])
+ tm.assert_frame_equal(result, expected)
+
+ # Appending with same categories is ok
+ store.append('df3', df)
+
+ df = concat([df, df])
+ expected = df[df.s.isin(['b', 'c'])]
+ result = store.select('df3', where=['s in ["b","c"]'])
+ tm.assert_frame_equal(result, expected)
+
+ # Appending must have the same categories
+ df3 = df.copy()
+ df3['s'].cat.remove_unused_categories(inplace=True)
+
+ with pytest.raises(ValueError):
+ store.append('df3', df3)
+
+            # Remove, and make sure the metadata is removed too (it's a
+            # recursive removal, so it should be).
+ result = store.select('df3/meta/s/meta')
+ assert result is not None
+ store.remove('df3')
+
+ with pytest.raises(KeyError):
+ store.select('df3/meta/s/meta')
+
+ def test_categorical_conversion(self):
+
+ # GH13322
+        # Check that read_hdf with categorical columns doesn't return rows
+        # when the where criteria aren't met.
+ obsids = ['ESP_012345_6789', 'ESP_987654_3210']
+ imgids = ['APF00006np', 'APF0001imm']
+ data = [4.3, 9.8]
+
+ # Test without categories
+ df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data))
+
+ # We are expecting an empty DataFrame matching types of df
+ expected = df.iloc[[], :]
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', format='table', data_columns=True)
+ result = read_hdf(path, 'df', where='obsids=B')
+ tm.assert_frame_equal(result, expected)
+
+ # Test with categories
+ df.obsids = df.obsids.astype('category')
+ df.imgids = df.imgids.astype('category')
+
+ # We are expecting an empty DataFrame matching types of df
+ expected = df.iloc[[], :]
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', format='table', data_columns=True)
+ result = read_hdf(path, 'df', where='obsids=B')
+ tm.assert_frame_equal(result, expected)
+
+ def test_categorical_nan_only_columns(self):
+ # GH18413
+ # Check that read_hdf with categorical columns with NaN-only values can
+ # be read back.
+ df = pd.DataFrame({
+ 'a': ['a', 'b', 'c', np.nan],
+ 'b': [np.nan, np.nan, np.nan, np.nan],
+ 'c': [1, 2, 3, 4],
+ 'd': pd.Series([None] * 4, dtype=object)
+ })
+ df['a'] = df.a.astype('category')
+ df['b'] = df.b.astype('category')
+        df['d'] = df.d.astype('category')
+ expected = df
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', format='table', data_columns=True)
+ result = read_hdf(path, 'df')
+ tm.assert_frame_equal(result, expected)
+
+ def test_duplicate_column_name(self):
+ df = DataFrame(columns=["a", "a"], data=[[0, 0]])
+
+ with ensure_clean_path(self.path) as path:
+ pytest.raises(ValueError, df.to_hdf,
+ path, 'df', format='fixed')
+
+ df.to_hdf(path, 'df', format='table')
+ other = read_hdf(path, 'df')
+
+ tm.assert_frame_equal(df, other)
+ assert df.equals(other)
+ assert other.equals(df)
+
+ def test_round_trip_equals(self):
+ # GH 9330
+ df = DataFrame({"B": [1, 2], "A": ["x", "y"]})
+
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', format='table')
+ other = read_hdf(path, 'df')
+ tm.assert_frame_equal(df, other)
+ assert df.equals(other)
+ assert other.equals(df)
+
+ def test_preserve_timedeltaindex_type(self):
+ # GH9635
+ # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve
+ # the type of the index.
+ df = DataFrame(np.random.normal(size=(10, 5)))
+ df.index = timedelta_range(
+ start='0s', periods=10, freq='1s', name='example')
+
+ with ensure_clean_store(self.path) as store:
+
+ store['df'] = df
+ assert_frame_equal(store['df'], df)
+
+ def test_columns_multiindex_modified(self):
+ # BUG: 7212
+ # read_hdf store.select modified the passed columns parameters
+ # when multi-indexed.
+
+ df = DataFrame(np.random.rand(4, 5),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+ df.index.name = 'letters'
+ df = df.set_index(keys='E', append=True)
+
+ data_columns = df.index.names + df.columns.tolist()
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df',
+ mode='a',
+ append=True,
+ data_columns=data_columns,
+ index=False)
+ cols2load = list('BCD')
+ cols2load_original = list(cols2load)
+ df_loaded = read_hdf(path, 'df', columns=cols2load) # noqa
+ assert cols2load_original == cols2load
+
+ @ignore_natural_naming_warning
+ def test_to_hdf_with_object_column_names(self):
+ # GH9057
+ # Writing HDF5 table format should only work for string-like
+ # column types
+
+ types_should_fail = [tm.makeIntIndex, tm.makeFloatIndex,
+ tm.makeDateIndex, tm.makeTimedeltaIndex,
+ tm.makePeriodIndex]
+ types_should_run = [tm.makeStringIndex, tm.makeCategoricalIndex]
+
+ if compat.PY3:
+ types_should_run.append(tm.makeUnicodeIndex)
+ else:
+ # TODO: Add back to types_should_fail
+ # https://github.com/pandas-dev/pandas/issues/20907
+ pass
+
+ for index in types_should_fail:
+ df = DataFrame(np.random.randn(10, 2), columns=index(2))
+ with ensure_clean_path(self.path) as path:
+ with catch_warnings(record=True):
+ msg = "cannot have non-object label DataIndexableCol"
+ with pytest.raises(ValueError, match=msg):
+ df.to_hdf(path, 'df', format='table',
+ data_columns=True)
+
+ for index in types_should_run:
+ df = DataFrame(np.random.randn(10, 2), columns=index(2))
+ with ensure_clean_path(self.path) as path:
+ with catch_warnings(record=True):
+ df.to_hdf(path, 'df', format='table', data_columns=True)
+ result = pd.read_hdf(
+ path, 'df', where="index = [{0}]".format(df.index[0]))
+                    assert len(result)
+
+ def test_read_hdf_open_store(self):
+ # GH10330
+        # No check for non-string path_or_buf, and no test of open store
+ df = DataFrame(np.random.rand(4, 5),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+ df.index.name = 'letters'
+ df = df.set_index(keys='E', append=True)
+
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', mode='w')
+ direct = read_hdf(path, 'df')
+ store = HDFStore(path, mode='r')
+ indirect = read_hdf(store, 'df')
+ tm.assert_frame_equal(direct, indirect)
+ assert store.is_open
+ store.close()
+
+ def test_read_hdf_iterator(self):
+ df = DataFrame(np.random.rand(4, 5),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+ df.index.name = 'letters'
+ df = df.set_index(keys='E', append=True)
+
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', mode='w', format='t')
+ direct = read_hdf(path, 'df')
+ iterator = read_hdf(path, 'df', iterator=True)
+ assert isinstance(iterator, TableIterator)
+ indirect = next(iterator.__iter__())
+ tm.assert_frame_equal(direct, indirect)
+ iterator.store.close()
+
+ def test_read_hdf_errors(self):
+ df = DataFrame(np.random.rand(4, 5),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+
+ with ensure_clean_path(self.path) as path:
+ pytest.raises(IOError, read_hdf, path, 'key')
+ df.to_hdf(path, 'df')
+ store = HDFStore(path, mode='r')
+ store.close()
+ pytest.raises(IOError, read_hdf, store, 'df')
+
+ def test_read_hdf_generic_buffer_errors(self):
+ pytest.raises(NotImplementedError, read_hdf, BytesIO(b''), 'df')
+
+ def test_invalid_complib(self):
+ df = DataFrame(np.random.rand(4, 5),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+ with ensure_clean_path(self.path) as path:
+ with pytest.raises(ValueError):
+ df.to_hdf(path, 'df', complib='foolib')
+ # GH10443
+
+ def test_read_nokey(self):
+ df = DataFrame(np.random.rand(4, 5),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+
+ # Categorical dtype not supported for "fixed" format. So no need
+ # to test with that dtype in the dataframe here.
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', mode='a')
+ reread = read_hdf(path)
+ assert_frame_equal(df, reread)
+ df.to_hdf(path, 'df2', mode='a')
+ pytest.raises(ValueError, read_hdf, path)
+
+ def test_read_nokey_table(self):
+ # GH13231
+ df = DataFrame({'i': range(5),
+ 'c': Series(list('abacd'), dtype='category')})
+
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', mode='a', format='table')
+ reread = read_hdf(path)
+ assert_frame_equal(df, reread)
+ df.to_hdf(path, 'df2', mode='a', format='table')
+ pytest.raises(ValueError, read_hdf, path)
+
+ def test_read_nokey_empty(self):
+ with ensure_clean_path(self.path) as path:
+ store = HDFStore(path)
+ store.close()
+ pytest.raises(ValueError, read_hdf, path)
+
+ @td.skip_if_no('pathlib')
+ def test_read_from_pathlib_path(self):
+
+ # GH11773
+ from pathlib import Path
+
+ expected = DataFrame(np.random.rand(4, 5),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+ with ensure_clean_path(self.path) as filename:
+ path_obj = Path(filename)
+
+ expected.to_hdf(path_obj, 'df', mode='a')
+ actual = read_hdf(path_obj, 'df')
+
+ tm.assert_frame_equal(expected, actual)
+
+ @td.skip_if_no('py.path')
+ def test_read_from_py_localpath(self):
+
+ # GH11773
+ from py.path import local as LocalPath
+
+ expected = DataFrame(np.random.rand(4, 5),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+ with ensure_clean_path(self.path) as filename:
+ path_obj = LocalPath(filename)
+
+ expected.to_hdf(path_obj, 'df', mode='a')
+ actual = read_hdf(path_obj, 'df')
+
+ tm.assert_frame_equal(expected, actual)
+
+ def test_query_long_float_literal(self):
+ # GH 14241
+ df = pd.DataFrame({'A': [1000000000.0009,
+ 1000000000.0011,
+ 1000000000.0015]})
+
+ with ensure_clean_store(self.path) as store:
+ store.append('test', df, format='table', data_columns=True)
+
+ cutoff = 1000000000.0006
+ result = store.select('test', "A < %.4f" % cutoff)
+ assert result.empty
+
+ cutoff = 1000000000.0010
+ result = store.select('test', "A > %.4f" % cutoff)
+ expected = df.loc[[1, 2], :]
+ tm.assert_frame_equal(expected, result)
+
+ exact = 1000000000.0011
+ result = store.select('test', 'A == %.4f' % exact)
+ expected = df.loc[[1], :]
+ tm.assert_frame_equal(expected, result)
+
+ def test_query_compare_column_type(self):
+ # GH 15492
+ df = pd.DataFrame({'date': ['2014-01-01', '2014-01-02'],
+ 'real_date': date_range('2014-01-01', periods=2),
+ 'float': [1.1, 1.2],
+ 'int': [1, 2]},
+ columns=['date', 'real_date', 'float', 'int'])
+
+ with ensure_clean_store(self.path) as store:
+ store.append('test', df, format='table', data_columns=True)
+
+ ts = pd.Timestamp('2014-01-01') # noqa
+ result = store.select('test', where='real_date > ts')
+ expected = df.loc[[1], :]
+ tm.assert_frame_equal(expected, result)
+
+ for op in ['<', '>', '==']:
+ # non strings to string column always fail
+ for v in [2.1, True, pd.Timestamp('2014-01-01'),
+ pd.Timedelta(1, 's')]:
+ query = 'date {op} v'.format(op=op)
+ with pytest.raises(TypeError):
+ store.select('test', where=query)
+
+ # strings to other columns must be convertible to type
+ v = 'a'
+ for col in ['int', 'float', 'real_date']:
+ query = '{col} {op} v'.format(op=op, col=col)
+ with pytest.raises(ValueError):
+ store.select('test', where=query)
+
+ for v, col in zip(['1', '1.1', '2014-01-01'],
+ ['int', 'float', 'real_date']):
+ query = '{col} {op} v'.format(op=op, col=col)
+ result = store.select('test', where=query)
+
+ if op == '==':
+ expected = df.loc[[0], :]
+ elif op == '>':
+ expected = df.loc[[1], :]
+ else:
+ expected = df.loc[[], :]
+ tm.assert_frame_equal(expected, result)
+
+ @pytest.mark.parametrize('format', ['fixed', 'table'])
+ def test_read_hdf_series_mode_r(self, format):
+ # GH 16583
+ # Tests that reading a Series saved to an HDF file
+ # still works if a mode='r' argument is supplied
+ series = tm.makeFloatSeries()
+ with ensure_clean_path(self.path) as path:
+ series.to_hdf(path, key='data', format=format)
+ result = pd.read_hdf(path, key='data', mode='r')
+ tm.assert_series_equal(result, series)
+
+ @pytest.mark.skipif(not PY36, reason="Need python 3.6")
+ def test_fspath(self):
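+        # os.fspath works here because HDFStore implements the os.PathLike
+        # protocol, returning the path the store was opened with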
+ with tm.ensure_clean('foo.h5') as path:
+ with pd.HDFStore(path) as store:
+ assert os.fspath(store) == str(path)
+
+ def test_read_py2_hdf_file_in_py3(self, datapath):
+ # GH 16781
+
+ # tests reading a PeriodIndex DataFrame written in Python2 in Python3
+
+ # the file was generated in Python 2.7 like so:
+ #
+ # df = pd.DataFrame([1.,2,3], index=pd.PeriodIndex(
+ # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B'))
+ # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p')
+
+ expected = pd.DataFrame([1., 2, 3], index=pd.PeriodIndex(
+ ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B'))
+
+ with ensure_clean_store(
+ datapath('io', 'data', 'legacy_hdf',
+ 'periodindex_0.20.1_x86_64_darwin_2.7.13.h5'),
+ mode='r') as store:
+ result = store['p']
+ assert_frame_equal(result, expected)
+
+
+class TestHDFComplexValues(Base):
+ # GH10447
+
+ def test_complex_fixed(self):
+ df = DataFrame(np.random.rand(4, 5).astype(np.complex64),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df')
+ reread = read_hdf(path, 'df')
+ assert_frame_equal(df, reread)
+
+ df = DataFrame(np.random.rand(4, 5).astype(np.complex128),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df')
+ reread = read_hdf(path, 'df')
+ assert_frame_equal(df, reread)
+
+ def test_complex_table(self):
+ df = DataFrame(np.random.rand(4, 5).astype(np.complex64),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', format='table')
+ reread = read_hdf(path, 'df')
+ assert_frame_equal(df, reread)
+
+ df = DataFrame(np.random.rand(4, 5).astype(np.complex128),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', format='table', mode='w')
+ reread = read_hdf(path, 'df')
+ assert_frame_equal(df, reread)
+
+ @xfail_non_writeable
+ def test_complex_mixed_fixed(self):
+ complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j,
+ 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64)
+ complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j],
+ dtype=np.complex128)
+ df = DataFrame({'A': [1, 2, 3, 4],
+ 'B': ['a', 'b', 'c', 'd'],
+ 'C': complex64,
+ 'D': complex128,
+ 'E': [1.0, 2.0, 3.0, 4.0]},
+ index=list('abcd'))
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df')
+ reread = read_hdf(path, 'df')
+ assert_frame_equal(df, reread)
+
+ def test_complex_mixed_table(self):
+ complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j,
+ 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64)
+ complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j],
+ dtype=np.complex128)
+ df = DataFrame({'A': [1, 2, 3, 4],
+ 'B': ['a', 'b', 'c', 'd'],
+ 'C': complex64,
+ 'D': complex128,
+ 'E': [1.0, 2.0, 3.0, 4.0]},
+ index=list('abcd'))
+
+ with ensure_clean_store(self.path) as store:
+ store.append('df', df, data_columns=['A', 'B'])
+ result = store.select('df', where='A>2')
+ assert_frame_equal(df.loc[df.A > 2], result)
+
+ with ensure_clean_path(self.path) as path:
+ df.to_hdf(path, 'df', format='table')
+ reread = read_hdf(path, 'df')
+ assert_frame_equal(df, reread)
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_complex_across_dimensions_fixed(self):
+ with catch_warnings(record=True):
+ complex128 = np.array(
+ [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
+ s = Series(complex128, index=list('abcd'))
+ df = DataFrame({'A': s, 'B': s})
+ p = Panel({'One': df, 'Two': df})
+
+ objs = [s, df, p]
+ comps = [tm.assert_series_equal, tm.assert_frame_equal,
+ tm.assert_panel_equal]
+ for obj, comp in zip(objs, comps):
+ with ensure_clean_path(self.path) as path:
+ obj.to_hdf(path, 'obj', format='fixed')
+ reread = read_hdf(path, 'obj')
+ comp(obj, reread)
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_complex_across_dimensions(self):
+ complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
+ s = Series(complex128, index=list('abcd'))
+ df = DataFrame({'A': s, 'B': s})
+
+ with catch_warnings(record=True):
+ p = Panel({'One': df, 'Two': df})
+
+ objs = [df, p]
+ comps = [tm.assert_frame_equal, tm.assert_panel_equal]
+ for obj, comp in zip(objs, comps):
+ with ensure_clean_path(self.path) as path:
+ obj.to_hdf(path, 'obj', format='table')
+ reread = read_hdf(path, 'obj')
+ comp(obj, reread)
+
+ def test_complex_indexing_error(self):
+ complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j],
+ dtype=np.complex128)
+ df = DataFrame({'A': [1, 2, 3, 4],
+ 'B': ['a', 'b', 'c', 'd'],
+ 'C': complex128},
+ index=list('abcd'))
+ with ensure_clean_store(self.path) as store:
+ pytest.raises(TypeError, store.append,
+ 'df', df, data_columns=['C'])
+
+ def test_complex_series_error(self):
+ complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
+ s = Series(complex128, index=list('abcd'))
+
+ with ensure_clean_path(self.path) as path:
+ pytest.raises(TypeError, s.to_hdf, path, 'obj', format='t')
+
+ with ensure_clean_path(self.path) as path:
+ s.to_hdf(path, 'obj', format='t', index=False)
+ reread = read_hdf(path, 'obj')
+ tm.assert_series_equal(s, reread)
+
+ def test_complex_append(self):
+ df = DataFrame({'a': np.random.randn(100).astype(np.complex128),
+ 'b': np.random.randn(100)})
+
+ with ensure_clean_store(self.path) as store:
+ store.append('df', df, data_columns=['b'])
+ store.append('df', df)
+ result = store.select('df')
+            assert_frame_equal(pd.concat([df, df], axis=0), result)
+
+
+class TestTimezones(Base):
+
+ def _compare_with_tz(self, a, b):
+ tm.assert_frame_equal(a, b)
+
+ # compare the zones on each element
+ for c in a.columns:
+ for i in a.index:
+ a_e = a.loc[i, c]
+ b_e = b.loc[i, c]
+ if not (a_e == b_e and a_e.tz == b_e.tz):
+ raise AssertionError(
+ "invalid tz comparison [%s] [%s]" % (a_e, b_e))
+
+ def test_append_with_timezones_dateutil(self):
+
+ from datetime import timedelta
+
+        # use maybe_get_tz instead of dateutil.tz.gettz to work around
+        # Windows filename issues.
+ from pandas._libs.tslibs.timezones import maybe_get_tz
+ gettz = lambda x: maybe_get_tz('dateutil/' + x)
+
+ # as columns
+ with ensure_clean_store(self.path) as store:
+
+ _maybe_remove(store, 'df_tz')
+ df = DataFrame(dict(A=[Timestamp('20130102 2:00:00', tz=gettz(
+ 'US/Eastern')) + timedelta(hours=1) * i for i in range(5)]))
+
+ store.append('df_tz', df, data_columns=['A'])
+ result = store['df_tz']
+ self._compare_with_tz(result, df)
+ assert_frame_equal(result, df)
+
+ # select with tz aware
+ expected = df[df.A >= df.A[3]]
+ result = store.select('df_tz', where='A>=df.A[3]')
+ self._compare_with_tz(result, expected)
+
+ # ensure we include dates in DST and STD time here.
+ _maybe_remove(store, 'df_tz')
+ df = DataFrame(dict(A=Timestamp('20130102',
+ tz=gettz('US/Eastern')),
+ B=Timestamp('20130603',
+ tz=gettz('US/Eastern'))),
+ index=range(5))
+ store.append('df_tz', df)
+ result = store['df_tz']
+ self._compare_with_tz(result, df)
+ assert_frame_equal(result, df)
+
+ df = DataFrame(dict(A=Timestamp('20130102',
+ tz=gettz('US/Eastern')),
+ B=Timestamp('20130102', tz=gettz('EET'))),
+ index=range(5))
+ pytest.raises(ValueError, store.append, 'df_tz', df)
+
+ # this is ok
+ _maybe_remove(store, 'df_tz')
+ store.append('df_tz', df, data_columns=['A', 'B'])
+ result = store['df_tz']
+ self._compare_with_tz(result, df)
+ assert_frame_equal(result, df)
+
+ # can't append with diff timezone
+ df = DataFrame(dict(A=Timestamp('20130102',
+ tz=gettz('US/Eastern')),
+ B=Timestamp('20130102', tz=gettz('CET'))),
+ index=range(5))
+ pytest.raises(ValueError, store.append, 'df_tz', df)
+
+ # as index
+ with ensure_clean_store(self.path) as store:
+
+ # GH 4098 example
+ df = DataFrame(dict(A=Series(lrange(3), index=date_range(
+ '2000-1-1', periods=3, freq='H', tz=gettz('US/Eastern')))))
+
+ _maybe_remove(store, 'df')
+ store.put('df', df)
+ result = store.select('df')
+ assert_frame_equal(result, df)
+
+ _maybe_remove(store, 'df')
+ store.append('df', df)
+ result = store.select('df')
+ assert_frame_equal(result, df)
+
+ def test_append_with_timezones_pytz(self):
+
+ from datetime import timedelta
+
+ # as columns
+ with ensure_clean_store(self.path) as store:
+
+ _maybe_remove(store, 'df_tz')
+ df = DataFrame(dict(A=[Timestamp('20130102 2:00:00',
+ tz='US/Eastern') +
+ timedelta(hours=1) * i
+ for i in range(5)]))
+ store.append('df_tz', df, data_columns=['A'])
+ result = store['df_tz']
+ self._compare_with_tz(result, df)
+ assert_frame_equal(result, df)
+
+ # select with tz aware
+ self._compare_with_tz(store.select(
+ 'df_tz', where='A>=df.A[3]'), df[df.A >= df.A[3]])
+
+ _maybe_remove(store, 'df_tz')
+ # ensure we include dates in DST and STD time here.
+ df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
+ B=Timestamp('20130603', tz='US/Eastern')),
+ index=range(5))
+ store.append('df_tz', df)
+ result = store['df_tz']
+ self._compare_with_tz(result, df)
+ assert_frame_equal(result, df)
+
+ df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
+ B=Timestamp('20130102', tz='EET')),
+ index=range(5))
+ pytest.raises(ValueError, store.append, 'df_tz', df)
+
+ # this is ok
+ _maybe_remove(store, 'df_tz')
+ store.append('df_tz', df, data_columns=['A', 'B'])
+ result = store['df_tz']
+ self._compare_with_tz(result, df)
+ assert_frame_equal(result, df)
+
+ # can't append with diff timezone
+ df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
+ B=Timestamp('20130102', tz='CET')),
+ index=range(5))
+ pytest.raises(ValueError, store.append, 'df_tz', df)
+
+ # as index
+ with ensure_clean_store(self.path) as store:
+
+ # GH 4098 example
+ df = DataFrame(dict(A=Series(lrange(3), index=date_range(
+ '2000-1-1', periods=3, freq='H', tz='US/Eastern'))))
+
+ _maybe_remove(store, 'df')
+ store.put('df', df)
+ result = store.select('df')
+ assert_frame_equal(result, df)
+
+ _maybe_remove(store, 'df')
+ store.append('df', df)
+ result = store.select('df')
+ assert_frame_equal(result, df)
+
+ def test_tseries_select_index_column(self):
+ # GH7777
+ # selecting a UTC datetimeindex column did
+ # not preserve UTC tzinfo set before storing
+
+        # check that a tz-naive index still works
+ rng = date_range('1/1/2000', '1/30/2000')
+ frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
+
+ with ensure_clean_store(self.path) as store:
+ store.append('frame', frame)
+ result = store.select_column('frame', 'index')
+ assert rng.tz == DatetimeIndex(result.values).tz
+
+ # check utc
+ rng = date_range('1/1/2000', '1/30/2000', tz='UTC')
+ frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
+
+ with ensure_clean_store(self.path) as store:
+ store.append('frame', frame)
+ result = store.select_column('frame', 'index')
+ assert rng.tz == result.dt.tz
+
+ # double check non-utc
+ rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
+ frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
+
+ with ensure_clean_store(self.path) as store:
+ store.append('frame', frame)
+ result = store.select_column('frame', 'index')
+ assert rng.tz == result.dt.tz
+
+ def test_timezones_fixed(self):
+ with ensure_clean_store(self.path) as store:
+
+ # index
+ rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
+ df = DataFrame(np.random.randn(len(rng), 4), index=rng)
+ store['df'] = df
+ result = store['df']
+ assert_frame_equal(result, df)
+
+ # as data
+ # GH11411
+ _maybe_remove(store, 'df')
+ df = DataFrame({'A': rng,
+ 'B': rng.tz_convert('UTC').tz_localize(None),
+ 'C': rng.tz_convert('CET'),
+ 'D': range(len(rng))}, index=rng)
+ store['df'] = df
+ result = store['df']
+ assert_frame_equal(result, df)
+
+ def test_fixed_offset_tz(self):
+ rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00')
+ frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
+
+ with ensure_clean_store(self.path) as store:
+ store['frame'] = frame
+ recons = store['frame']
+ tm.assert_index_equal(recons.index, rng)
+ assert rng.tz == recons.index.tz
+
+ @td.skip_if_windows
+ def test_store_timezone(self):
+ # GH2852
+ # issue storing datetime.date with a timezone as it resets when read
+ # back in a new timezone
+
+ # original method
+ with ensure_clean_store(self.path) as store:
+
+ today = datetime.date(2013, 9, 10)
+ df = DataFrame([1, 2, 3], index=[today, today, today])
+ store['obj1'] = df
+ result = store['obj1']
+ assert_frame_equal(result, df)
+
+ # with tz setting
+ with ensure_clean_store(self.path) as store:
+
+ with set_timezone('EST5EDT'):
+ today = datetime.date(2013, 9, 10)
+ df = DataFrame([1, 2, 3], index=[today, today, today])
+ store['obj1'] = df
+
+ with set_timezone('CST6CDT'):
+ result = store['obj1']
+
+ assert_frame_equal(result, df)
+
+ def test_legacy_datetimetz_object(self, datapath):
+ # legacy from < 0.17.0
+ # 8260
+ expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
+ B=Timestamp('20130603', tz='CET')),
+ index=range(5))
+ with ensure_clean_store(
+ datapath('io', 'data', 'legacy_hdf', 'datetimetz_object.h5'),
+ mode='r') as store:
+ result = store['df']
+ assert_frame_equal(result, expected)
+
+ def test_dst_transitions(self):
+        # make sure we are not failing on DST transitions
+ with ensure_clean_store(self.path) as store:
+ times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00",
+ tz="Europe/London",
+ freq="H",
+ ambiguous='infer')
+
+ for i in [times, times + pd.Timedelta('10min')]:
+ _maybe_remove(store, 'df')
+ df = DataFrame({'A': range(len(i)), 'B': i}, index=i)
+ store.append('df', df)
+ result = store.select('df')
+ assert_frame_equal(result, df)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_s3.py b/contrib/python/pandas/py2/pandas/tests/io/test_s3.py
new file mode 100644
index 00000000000..32eae8ed328
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_s3.py
@@ -0,0 +1,29 @@
+import pytest
+
+from pandas.compat import BytesIO
+
+from pandas import read_csv
+
+from pandas.io.common import is_s3_url
+
+
+class TestS3URL(object):
+
+ def test_is_s3_url(self):
+ assert is_s3_url("s3://pandas/somethingelse.com")
+ assert not is_s3_url("s4://pandas/somethingelse.com")
+
+
+def test_streaming_s3_objects():
+ # GH17135
+ # botocore gained iteration support in 1.10.47, can now be used in read_*
+ pytest.importorskip('botocore', minversion='1.10.47')
+ from botocore.response import StreamingBody
+
+ data = [
+ b'foo,bar,baz\n1,2,3\n4,5,6\n',
+ b'just,the,header\n',
+ ]
+ for el in data:
+ body = StreamingBody(BytesIO(el), content_length=len(el))
+ read_csv(body)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_sql.py b/contrib/python/pandas/py2/pandas/tests/io/test_sql.py
new file mode 100644
index 00000000000..75a6d8d0090
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_sql.py
@@ -0,0 +1,2708 @@
+"""SQL io tests
+
+The SQL tests are broken down in different classes:
+
+- `PandasSQLTest`: base class with common methods for all test classes
+- Tests for the public API (only tests with sqlite3)
+ - `_TestSQLApi` base class
+ - `TestSQLApi`: test the public API with sqlalchemy engine
+ - `TestSQLiteFallbackApi`: test the public API with a sqlite DBAPI
+ connection
+- Tests for the different SQL flavors (flavor specific type conversions)
+ - Tests for the sqlalchemy mode: `_TestSQLAlchemy` is the base class with
+ common methods, `_TestSQLAlchemyConn` tests the API with a SQLAlchemy
+ Connection object. The different tested flavors (sqlite3, MySQL,
+ PostgreSQL) derive from the base class
+ - Tests for the fallback mode (`TestSQLiteFallback`)
+
+"""
+
+from __future__ import print_function
+
+import csv
+from datetime import date, datetime, time
+import sqlite3
+import warnings
+
+import numpy as np
+import pytest
+
+import pandas.compat as compat
+from pandas.compat import PY36, lrange, range, string_types
+
+from pandas.core.dtypes.common import (
+ is_datetime64_dtype, is_datetime64tz_dtype)
+
+import pandas as pd
+from pandas import (
+ DataFrame, Index, MultiIndex, Series, Timestamp, concat, date_range, isna,
+ to_datetime, to_timedelta)
+import pandas.util.testing as tm
+
+import pandas.io.sql as sql
+from pandas.io.sql import read_sql_query, read_sql_table
+
+try:
+ import sqlalchemy
+ import sqlalchemy.schema
+ import sqlalchemy.sql.sqltypes as sqltypes
+ from sqlalchemy.ext import declarative
+ from sqlalchemy.orm import session as sa_session
+ SQLALCHEMY_INSTALLED = True
+except ImportError:
+ SQLALCHEMY_INSTALLED = False
+
+SQL_STRINGS = {
+ 'create_iris': {
+ 'sqlite': """CREATE TABLE iris (
+ "SepalLength" REAL,
+ "SepalWidth" REAL,
+ "PetalLength" REAL,
+ "PetalWidth" REAL,
+ "Name" TEXT
+ )""",
+ 'mysql': """CREATE TABLE iris (
+ `SepalLength` DOUBLE,
+ `SepalWidth` DOUBLE,
+ `PetalLength` DOUBLE,
+ `PetalWidth` DOUBLE,
+ `Name` VARCHAR(200)
+ )""",
+ 'postgresql': """CREATE TABLE iris (
+ "SepalLength" DOUBLE PRECISION,
+ "SepalWidth" DOUBLE PRECISION,
+ "PetalLength" DOUBLE PRECISION,
+ "PetalWidth" DOUBLE PRECISION,
+ "Name" VARCHAR(200)
+ )"""
+ },
+ 'insert_iris': {
+ 'sqlite': """INSERT INTO iris VALUES(?, ?, ?, ?, ?)""",
+ 'mysql': """INSERT INTO iris VALUES(%s, %s, %s, %s, "%s");""",
+ 'postgresql': """INSERT INTO iris VALUES(%s, %s, %s, %s, %s);"""
+ },
+ 'create_test_types': {
+ 'sqlite': """CREATE TABLE types_test_data (
+ "TextCol" TEXT,
+ "DateCol" TEXT,
+ "IntDateCol" INTEGER,
+ "IntDateOnlyCol" INTEGER,
+ "FloatCol" REAL,
+ "IntCol" INTEGER,
+ "BoolCol" INTEGER,
+ "IntColWithNull" INTEGER,
+ "BoolColWithNull" INTEGER
+ )""",
+ 'mysql': """CREATE TABLE types_test_data (
+ `TextCol` TEXT,
+ `DateCol` DATETIME,
+ `IntDateCol` INTEGER,
+ `IntDateOnlyCol` INTEGER,
+ `FloatCol` DOUBLE,
+ `IntCol` INTEGER,
+ `BoolCol` BOOLEAN,
+ `IntColWithNull` INTEGER,
+ `BoolColWithNull` BOOLEAN
+ )""",
+ 'postgresql': """CREATE TABLE types_test_data (
+ "TextCol" TEXT,
+ "DateCol" TIMESTAMP,
+ "DateColWithTz" TIMESTAMP WITH TIME ZONE,
+ "IntDateCol" INTEGER,
+ "IntDateOnlyCol" INTEGER,
+ "FloatCol" DOUBLE PRECISION,
+ "IntCol" INTEGER,
+ "BoolCol" BOOLEAN,
+ "IntColWithNull" INTEGER,
+ "BoolColWithNull" BOOLEAN
+ )"""
+ },
+ 'insert_test_types': {
+ 'sqlite': {
+ 'query': """
+ INSERT INTO types_test_data
+ VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)
+ """,
+ 'fields': (
+ 'TextCol', 'DateCol', 'IntDateCol', 'IntDateOnlyCol',
+ 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull',
+ 'BoolColWithNull'
+ )
+ },
+ 'mysql': {
+ 'query': """
+ INSERT INTO types_test_data
+ VALUES("%s", %s, %s, %s, %s, %s, %s, %s, %s)
+ """,
+ 'fields': (
+ 'TextCol', 'DateCol', 'IntDateCol', 'IntDateOnlyCol',
+ 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull',
+ 'BoolColWithNull'
+ )
+ },
+ 'postgresql': {
+ 'query': """
+ INSERT INTO types_test_data
+ VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+ """,
+ 'fields': (
+ 'TextCol', 'DateCol', 'DateColWithTz',
+ 'IntDateCol', 'IntDateOnlyCol', 'FloatCol',
+ 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull'
+ )
+ },
+ },
+ 'read_parameters': {
+ 'sqlite': "SELECT * FROM iris WHERE Name=? AND SepalLength=?",
+ 'mysql': 'SELECT * FROM iris WHERE `Name`="%s" AND `SepalLength`=%s',
+ 'postgresql': 'SELECT * FROM iris WHERE "Name"=%s AND "SepalLength"=%s'
+ },
+ 'read_named_parameters': {
+ 'sqlite': """
+ SELECT * FROM iris WHERE Name=:name AND SepalLength=:length
+ """,
+ 'mysql': """
+ SELECT * FROM iris WHERE
+ `Name`="%(name)s" AND `SepalLength`=%(length)s
+ """,
+ 'postgresql': """
+ SELECT * FROM iris WHERE
+ "Name"=%(name)s AND "SepalLength"=%(length)s
+ """
+ },
+ 'create_view': {
+ 'sqlite': """
+ CREATE VIEW iris_view AS
+ SELECT * FROM iris
+ """
+ }
+}
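+
+
+# Illustrative helper (hypothetical, not used by the tests below). The
+# SQL_STRINGS dict is keyed by statement kind and then by flavor, so a
+# flavor-specific statement is a plain two-level lookup:
+def _example_sql_for(kind, flavor='sqlite'):
+    # e.g. _example_sql_for('insert_iris') -> the parametrized sqlite INSERT
+    return SQL_STRINGS[kind][flavor]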
+
+
+class MixInBase(object):
+
+ def teardown_method(self, method):
+ # if setup fails, there may not be a connection to close.
+ if hasattr(self, 'conn'):
+ for tbl in self._get_all_tables():
+ self.drop_table(tbl)
+ self._close_conn()
+
+
+class MySQLMixIn(MixInBase):
+
+ def drop_table(self, table_name):
+ cur = self.conn.cursor()
+ cur.execute("DROP TABLE IF EXISTS %s" %
+ sql._get_valid_mysql_name(table_name))
+ self.conn.commit()
+
+ def _get_all_tables(self):
+ cur = self.conn.cursor()
+ cur.execute('SHOW TABLES')
+ return [table[0] for table in cur.fetchall()]
+
+ def _close_conn(self):
+ from pymysql.err import Error
+ try:
+ self.conn.close()
+ except Error:
+ pass
+
+
+class SQLiteMixIn(MixInBase):
+
+ def drop_table(self, table_name):
+ self.conn.execute("DROP TABLE IF EXISTS %s" %
+ sql._get_valid_sqlite_name(table_name))
+ self.conn.commit()
+
+ def _get_all_tables(self):
+ c = self.conn.execute(
+ "SELECT name FROM sqlite_master WHERE type='table'")
+ return [table[0] for table in c.fetchall()]
+
+ def _close_conn(self):
+ self.conn.close()
+
+
+class SQLAlchemyMixIn(MixInBase):
+
+ def drop_table(self, table_name):
+ sql.SQLDatabase(self.conn).drop_table(table_name)
+
+ def _get_all_tables(self):
+ meta = sqlalchemy.schema.MetaData(bind=self.conn)
+ meta.reflect()
+ table_list = meta.tables.keys()
+ return table_list
+
+ def _close_conn(self):
+ pass
+
+
+class PandasSQLTest(object):
+ """
+ Base class with common private methods for SQLAlchemy and fallback cases.
+
+ """
+
+ def _get_exec(self):
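+        # SQLAlchemy engines/connections expose .execute directly, while a
+        # raw DBAPI connection (sqlite3, pymysql, ...) must go through a
+        # cursor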
+ if hasattr(self.conn, 'execute'):
+ return self.conn
+ else:
+ return self.conn.cursor()
+
+ @pytest.fixture(params=[('io', 'data', 'iris.csv')])
+ def load_iris_data(self, datapath, request):
+ import io
+ iris_csv_file = datapath(*request.param)
+
+ if not hasattr(self, 'conn'):
+ self.setup_connect()
+
+ self.drop_table('iris')
+ self._get_exec().execute(SQL_STRINGS['create_iris'][self.flavor])
+
+ with io.open(iris_csv_file, mode='r', newline=None) as iris_csv:
+ r = csv.reader(iris_csv)
+ next(r) # skip header row
+ ins = SQL_STRINGS['insert_iris'][self.flavor]
+
+ for row in r:
+ self._get_exec().execute(ins, row)
+
+ def _load_iris_view(self):
+ self.drop_table('iris_view')
+ self._get_exec().execute(SQL_STRINGS['create_view'][self.flavor])
+
+ def _check_iris_loaded_frame(self, iris_frame):
+ pytype = iris_frame.dtypes[0].type
+ row = iris_frame.iloc[0]
+
+ assert issubclass(pytype, np.floating)
+ tm.equalContents(row.values, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa'])
+
+ def _load_test1_data(self):
+ columns = ['index', 'A', 'B', 'C', 'D']
+ data = [(
+ '2000-01-03 00:00:00', 0.980268513777, 3.68573087906,
+ -0.364216805298, -1.15973806169),
+            ('2000-01-04 00:00:00', 1.04791624281,
+             -0.0412318367011, -0.16181208307, 0.212549316967),
+ ('2000-01-05 00:00:00', 0.498580885705,
+ 0.731167677815, -0.537677223318, 1.34627041952),
+ ('2000-01-06 00:00:00', 1.12020151869, 1.56762092543,
+ 0.00364077397681, 0.67525259227)]
+
+ self.test_frame1 = DataFrame(data, columns=columns)
+
+ def _load_test2_data(self):
+ df = DataFrame(dict(A=[4, 1, 3, 6],
+ B=['asd', 'gsq', 'ylt', 'jkl'],
+ C=[1.1, 3.1, 6.9, 5.3],
+ D=[False, True, True, False],
+ E=['1990-11-22', '1991-10-26',
+ '1993-11-26', '1995-12-12']))
+ df['E'] = to_datetime(df['E'])
+
+ self.test_frame2 = df
+
+ def _load_test3_data(self):
+ columns = ['index', 'A', 'B']
+ data = [(
+ '2000-01-03 00:00:00', 2 ** 31 - 1, -1.987670),
+ ('2000-01-04 00:00:00', -29, -0.0412318367011),
+ ('2000-01-05 00:00:00', 20000, 0.731167677815),
+ ('2000-01-06 00:00:00', -290867, 1.56762092543)]
+
+ self.test_frame3 = DataFrame(data, columns=columns)
+
+ def _load_raw_sql(self):
+ self.drop_table('types_test_data')
+ self._get_exec().execute(SQL_STRINGS['create_test_types'][self.flavor])
+ ins = SQL_STRINGS['insert_test_types'][self.flavor]
+ data = [
+ {
+ 'TextCol': 'first',
+ 'DateCol': '2000-01-03 00:00:00',
+ 'DateColWithTz': '2000-01-01 00:00:00-08:00',
+ 'IntDateCol': 535852800,
+ 'IntDateOnlyCol': 20101010,
+ 'FloatCol': 10.10,
+ 'IntCol': 1,
+ 'BoolCol': False,
+ 'IntColWithNull': 1,
+ 'BoolColWithNull': False,
+ },
+ {
+ 'TextCol': 'first',
+ 'DateCol': '2000-01-04 00:00:00',
+ 'DateColWithTz': '2000-06-01 00:00:00-07:00',
+ 'IntDateCol': 1356998400,
+ 'IntDateOnlyCol': 20101212,
+ 'FloatCol': 10.10,
+ 'IntCol': 1,
+ 'BoolCol': False,
+ 'IntColWithNull': None,
+ 'BoolColWithNull': None,
+ },
+ ]
+
+ for d in data:
+ self._get_exec().execute(
+ ins['query'],
+ [d[field] for field in ins['fields']]
+ )
+
+ def _count_rows(self, table_name):
+ result = self._get_exec().execute(
+ "SELECT count(*) AS count_1 FROM %s" % table_name).fetchone()
+ return result[0]
+
+ def _read_sql_iris(self):
+ iris_frame = self.pandasSQL.read_query("SELECT * FROM iris")
+ self._check_iris_loaded_frame(iris_frame)
+
+ def _read_sql_iris_parameter(self):
+ query = SQL_STRINGS['read_parameters'][self.flavor]
+ params = ['Iris-setosa', 5.1]
+ iris_frame = self.pandasSQL.read_query(query, params=params)
+ self._check_iris_loaded_frame(iris_frame)
+
+ def _read_sql_iris_named_parameter(self):
+ query = SQL_STRINGS['read_named_parameters'][self.flavor]
+ params = {'name': 'Iris-setosa', 'length': 5.1}
+ iris_frame = self.pandasSQL.read_query(query, params=params)
+ self._check_iris_loaded_frame(iris_frame)
+
+ def _to_sql(self, method=None):
+ self.drop_table('test_frame1')
+
+ self.pandasSQL.to_sql(self.test_frame1, 'test_frame1', method=method)
+ assert self.pandasSQL.has_table('test_frame1')
+
+ num_entries = len(self.test_frame1)
+ num_rows = self._count_rows('test_frame1')
+ assert num_rows == num_entries
+
+ # Nuke table
+ self.drop_table('test_frame1')
+
+ def _to_sql_empty(self):
+ self.drop_table('test_frame1')
+ self.pandasSQL.to_sql(self.test_frame1.iloc[:0], 'test_frame1')
+
+ def _to_sql_fail(self):
+ self.drop_table('test_frame1')
+
+ self.pandasSQL.to_sql(
+ self.test_frame1, 'test_frame1', if_exists='fail')
+ assert self.pandasSQL.has_table('test_frame1')
+
+ pytest.raises(ValueError, self.pandasSQL.to_sql,
+ self.test_frame1, 'test_frame1', if_exists='fail')
+
+ self.drop_table('test_frame1')
+
+ def _to_sql_replace(self):
+ self.drop_table('test_frame1')
+
+ self.pandasSQL.to_sql(
+ self.test_frame1, 'test_frame1', if_exists='fail')
+ # Add to table again
+ self.pandasSQL.to_sql(
+ self.test_frame1, 'test_frame1', if_exists='replace')
+ assert self.pandasSQL.has_table('test_frame1')
+
+ num_entries = len(self.test_frame1)
+ num_rows = self._count_rows('test_frame1')
+
+ assert num_rows == num_entries
+ self.drop_table('test_frame1')
+
+ def _to_sql_append(self):
+ # Nuke table just in case
+ self.drop_table('test_frame1')
+
+ self.pandasSQL.to_sql(
+ self.test_frame1, 'test_frame1', if_exists='fail')
+
+ # Add to table again
+ self.pandasSQL.to_sql(
+ self.test_frame1, 'test_frame1', if_exists='append')
+ assert self.pandasSQL.has_table('test_frame1')
+
+ num_entries = 2 * len(self.test_frame1)
+ num_rows = self._count_rows('test_frame1')
+
+ assert num_rows == num_entries
+ self.drop_table('test_frame1')
+
+ def _to_sql_method_callable(self):
+ check = [] # used to double check function below is really being used
+
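+        # to_sql's `method` callable receives (pd_table, conn, keys,
+        # data_iter); `sample` records that it ran and then does a plain
+        # multi-row insert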
+ def sample(pd_table, conn, keys, data_iter):
+ check.append(1)
+ data = [dict(zip(keys, row)) for row in data_iter]
+            conn.execute(pd_table.table.insert(), data)
+
+        self.drop_table('test_frame1')
+
+ self.pandasSQL.to_sql(self.test_frame1, 'test_frame1', method=sample)
+ assert self.pandasSQL.has_table('test_frame1')
+
+ assert check == [1]
+ num_entries = len(self.test_frame1)
+ num_rows = self._count_rows('test_frame1')
+ assert num_rows == num_entries
+ # Nuke table
+ self.drop_table('test_frame1')
+
+ def _roundtrip(self):
+ self.drop_table('test_frame_roundtrip')
+ self.pandasSQL.to_sql(self.test_frame1, 'test_frame_roundtrip')
+ result = self.pandasSQL.read_query(
+ 'SELECT * FROM test_frame_roundtrip')
+
+ result.set_index('level_0', inplace=True)
+ # result.index.astype(int)
+
+ result.index.name = None
+
+ tm.assert_frame_equal(result, self.test_frame1)
+
+ def _execute_sql(self):
+ # drop_sql = "DROP TABLE IF EXISTS test" # should already be done
+ iris_results = self.pandasSQL.execute("SELECT * FROM iris")
+ row = iris_results.fetchone()
+ tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa'])
+
+ def _to_sql_save_index(self):
+ df = DataFrame.from_records([(1, 2.1, 'line1'), (2, 1.5, 'line2')],
+ columns=['A', 'B', 'C'], index=['A'])
+ self.pandasSQL.to_sql(df, 'test_to_sql_saves_index')
+ ix_cols = self._get_index_columns('test_to_sql_saves_index')
+ assert ix_cols == [['A', ], ]
+
+ def _transaction_test(self):
+ self.pandasSQL.execute("CREATE TABLE test_trans (A INT, B TEXT)")
+
+ ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')"
+
+        # Make sure no rows get inserted when the transaction is rolled back
+ try:
+ with self.pandasSQL.run_transaction() as trans:
+ trans.execute(ins_sql)
+ raise Exception('error')
+ except Exception:
+ # ignore raised exception
+ pass
+ res = self.pandasSQL.read_query('SELECT * FROM test_trans')
+ assert len(res) == 0
+
+        # Make sure rows do get inserted when the transaction is committed
+ with self.pandasSQL.run_transaction() as trans:
+ trans.execute(ins_sql)
+ res2 = self.pandasSQL.read_query('SELECT * FROM test_trans')
+ assert len(res2) == 1
+
+
+# -----------------------------------------------------------------------------
+# -- Testing the public API
+
+class _TestSQLApi(PandasSQLTest):
+
+ """
+ Base class to test the public API.
+
+    From this base class two classes are derived to run these tests in both
+    the sqlalchemy mode (`TestSQLApi`) and the fallback mode
+    (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific
+    tests for the different SQL flavors are included in `_TestSQLAlchemy`.
+
+    Notes:
+    flavor can always be passed even in SQLAlchemy mode; it should be
+    correctly ignored.
+
+    we don't use drop_table because that isn't part of the public API
+
+ """
+ flavor = 'sqlite'
+ mode = None
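+    # `mode` is 'sqlalchemy' or None (the fallback); tests branch on it
+    # where the two modes differ (e.g. test_chunksize_read below)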
+
+ def setup_connect(self):
+ self.conn = self.connect()
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, load_iris_data):
+ self.load_test_data_and_sql()
+
+ def load_test_data_and_sql(self):
+ self._load_iris_view()
+ self._load_test1_data()
+ self._load_test2_data()
+ self._load_test3_data()
+ self._load_raw_sql()
+
+ def test_read_sql_iris(self):
+ iris_frame = sql.read_sql_query(
+ "SELECT * FROM iris", self.conn)
+ self._check_iris_loaded_frame(iris_frame)
+
+ def test_read_sql_view(self):
+ iris_frame = sql.read_sql_query(
+ "SELECT * FROM iris_view", self.conn)
+ self._check_iris_loaded_frame(iris_frame)
+
+ def test_to_sql(self):
+ sql.to_sql(self.test_frame1, 'test_frame1', self.conn)
+ assert sql.has_table('test_frame1', self.conn)
+
+ def test_to_sql_fail(self):
+ sql.to_sql(self.test_frame1, 'test_frame2',
+ self.conn, if_exists='fail')
+ assert sql.has_table('test_frame2', self.conn)
+
+ pytest.raises(ValueError, sql.to_sql, self.test_frame1,
+ 'test_frame2', self.conn, if_exists='fail')
+
+ def test_to_sql_replace(self):
+ sql.to_sql(self.test_frame1, 'test_frame3',
+ self.conn, if_exists='fail')
+ # Add to table again
+ sql.to_sql(self.test_frame1, 'test_frame3',
+ self.conn, if_exists='replace')
+ assert sql.has_table('test_frame3', self.conn)
+
+ num_entries = len(self.test_frame1)
+ num_rows = self._count_rows('test_frame3')
+
+ assert num_rows == num_entries
+
+ def test_to_sql_append(self):
+ sql.to_sql(self.test_frame1, 'test_frame4',
+ self.conn, if_exists='fail')
+
+ # Add to table again
+ sql.to_sql(self.test_frame1, 'test_frame4',
+ self.conn, if_exists='append')
+ assert sql.has_table('test_frame4', self.conn)
+
+ num_entries = 2 * len(self.test_frame1)
+ num_rows = self._count_rows('test_frame4')
+
+ assert num_rows == num_entries
+
+ def test_to_sql_type_mapping(self):
+ sql.to_sql(self.test_frame3, 'test_frame5', self.conn, index=False)
+ result = sql.read_sql("SELECT * FROM test_frame5", self.conn)
+
+ tm.assert_frame_equal(self.test_frame3, result)
+
+ def test_to_sql_series(self):
+ s = Series(np.arange(5, dtype='int64'), name='series')
+ sql.to_sql(s, "test_series", self.conn, index=False)
+ s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn)
+ tm.assert_frame_equal(s.to_frame(), s2)
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_to_sql_panel(self):
+ panel = tm.makePanel()
+ pytest.raises(NotImplementedError, sql.to_sql, panel,
+ 'test_panel', self.conn)
+
+ def test_roundtrip(self):
+ sql.to_sql(self.test_frame1, 'test_frame_roundtrip',
+ con=self.conn)
+ result = sql.read_sql_query(
+ 'SELECT * FROM test_frame_roundtrip',
+ con=self.conn)
+
+ # HACK!
+ result.index = self.test_frame1.index
+ result.set_index('level_0', inplace=True)
+ result.index.astype(int)
+ result.index.name = None
+ tm.assert_frame_equal(result, self.test_frame1)
+
+ def test_roundtrip_chunksize(self):
+ sql.to_sql(self.test_frame1, 'test_frame_roundtrip', con=self.conn,
+ index=False, chunksize=2)
+ result = sql.read_sql_query(
+ 'SELECT * FROM test_frame_roundtrip',
+ con=self.conn)
+ tm.assert_frame_equal(result, self.test_frame1)
+
+ def test_execute_sql(self):
+ # drop_sql = "DROP TABLE IF EXISTS test" # should already be done
+ iris_results = sql.execute("SELECT * FROM iris", con=self.conn)
+ row = iris_results.fetchone()
+ tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa'])
+
+ def test_date_parsing(self):
+ # Test date parsing in read_sql
+ # No Parsing
+ df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn)
+ assert not issubclass(df.DateCol.dtype.type, np.datetime64)
+
+ df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn,
+ parse_dates=['DateCol'])
+ assert issubclass(df.DateCol.dtype.type, np.datetime64)
+ assert df.DateCol.tolist() == [
+ pd.Timestamp(2000, 1, 3, 0, 0, 0),
+ pd.Timestamp(2000, 1, 4, 0, 0, 0)
+ ]
+
+ df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn,
+ parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'})
+ assert issubclass(df.DateCol.dtype.type, np.datetime64)
+ assert df.DateCol.tolist() == [
+ pd.Timestamp(2000, 1, 3, 0, 0, 0),
+ pd.Timestamp(2000, 1, 4, 0, 0, 0)
+ ]
+
+ df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn,
+ parse_dates=['IntDateCol'])
+ assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+ assert df.IntDateCol.tolist() == [
+ pd.Timestamp(1986, 12, 25, 0, 0, 0),
+ pd.Timestamp(2013, 1, 1, 0, 0, 0)
+ ]
+
+ df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn,
+ parse_dates={'IntDateCol': 's'})
+ assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+ assert df.IntDateCol.tolist() == [
+ pd.Timestamp(1986, 12, 25, 0, 0, 0),
+ pd.Timestamp(2013, 1, 1, 0, 0, 0)
+ ]
+
+ df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn,
+ parse_dates={'IntDateOnlyCol': '%Y%m%d'})
+ assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64)
+ assert df.IntDateOnlyCol.tolist() == [
+ pd.Timestamp('2010-10-10'),
+ pd.Timestamp('2010-12-12')
+ ]
+
+ def test_date_and_index(self):
+ # Test case where same column appears in parse_date and index_col
+
+ df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn,
+ index_col='DateCol',
+ parse_dates=['DateCol', 'IntDateCol'])
+
+ assert issubclass(df.index.dtype.type, np.datetime64)
+ assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+
+ def test_timedelta(self):
+
+ # see #6921
+ df = to_timedelta(
+ Series(['00:00:01', '00:00:03'], name='foo')).to_frame()
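+        # timedelta64 has no native SQL type, so to_sql warns and writes the
+        # values as integer nanoseconds - hence the astype('int64') below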
+ with tm.assert_produces_warning(UserWarning):
+ df.to_sql('test_timedelta', self.conn)
+ result = sql.read_sql_query('SELECT * FROM test_timedelta', self.conn)
+ tm.assert_series_equal(result['foo'], df['foo'].astype('int64'))
+
+ def test_complex(self):
+ df = DataFrame({'a': [1 + 1j, 2j]})
+ # Complex data type should raise error
+ pytest.raises(ValueError, df.to_sql, 'test_complex', self.conn)
+
+ def test_to_sql_index_label(self):
+ temp_frame = DataFrame({'col1': range(4)})
+
+ # no index name, defaults to 'index'
+ sql.to_sql(temp_frame, 'test_index_label', self.conn)
+ frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+ assert frame.columns[0] == 'index'
+
+ # specifying index_label
+ sql.to_sql(temp_frame, 'test_index_label', self.conn,
+ if_exists='replace', index_label='other_label')
+ frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+ assert frame.columns[0] == "other_label"
+
+ # using the index name
+ temp_frame.index.name = 'index_name'
+ sql.to_sql(temp_frame, 'test_index_label', self.conn,
+ if_exists='replace')
+ frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+ assert frame.columns[0] == "index_name"
+
+ # has index name, but specifying index_label
+ sql.to_sql(temp_frame, 'test_index_label', self.conn,
+ if_exists='replace', index_label='other_label')
+ frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+ assert frame.columns[0] == "other_label"
+
+ # index name is integer
+ temp_frame.index.name = 0
+ sql.to_sql(temp_frame, 'test_index_label', self.conn,
+ if_exists='replace')
+ frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+ assert frame.columns[0] == "0"
+
+ temp_frame.index.name = None
+ sql.to_sql(temp_frame, 'test_index_label', self.conn,
+ if_exists='replace', index_label=0)
+ frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+ assert frame.columns[0] == "0"
+
+ def test_to_sql_index_label_multiindex(self):
+ temp_frame = DataFrame({'col1': range(4)},
+ index=MultiIndex.from_product(
+ [('A0', 'A1'), ('B0', 'B1')]))
+
+ # no index name, defaults to 'level_0' and 'level_1'
+ sql.to_sql(temp_frame, 'test_index_label', self.conn)
+ frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+ assert frame.columns[0] == 'level_0'
+ assert frame.columns[1] == 'level_1'
+
+ # specifying index_label
+ sql.to_sql(temp_frame, 'test_index_label', self.conn,
+ if_exists='replace', index_label=['A', 'B'])
+ frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+ assert frame.columns[:2].tolist() == ['A', 'B']
+
+ # using the index name
+ temp_frame.index.names = ['A', 'B']
+ sql.to_sql(temp_frame, 'test_index_label', self.conn,
+ if_exists='replace')
+ frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+ assert frame.columns[:2].tolist() == ['A', 'B']
+
+ # has index name, but specifying index_label
+ sql.to_sql(temp_frame, 'test_index_label', self.conn,
+ if_exists='replace', index_label=['C', 'D'])
+ frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn)
+ assert frame.columns[:2].tolist() == ['C', 'D']
+
+ # wrong length of index_label
+ pytest.raises(ValueError, sql.to_sql, temp_frame,
+ 'test_index_label', self.conn, if_exists='replace',
+ index_label='C')
+
+ def test_multiindex_roundtrip(self):
+ df = DataFrame.from_records([(1, 2.1, 'line1'), (2, 1.5, 'line2')],
+ columns=['A', 'B', 'C'], index=['A', 'B'])
+
+ df.to_sql('test_multiindex_roundtrip', self.conn)
+ result = sql.read_sql_query('SELECT * FROM test_multiindex_roundtrip',
+ self.conn, index_col=['A', 'B'])
+ tm.assert_frame_equal(df, result, check_index_type=True)
+
+ def test_integer_col_names(self):
+ df = DataFrame([[1, 2], [3, 4]], columns=[0, 1])
+ sql.to_sql(df, "test_frame_integer_col_names", self.conn,
+ if_exists='replace')
+
+ def test_get_schema(self):
+ create_sql = sql.get_schema(self.test_frame1, 'test', con=self.conn)
+ assert 'CREATE' in create_sql
+
+ def test_get_schema_dtypes(self):
+ float_frame = DataFrame({'a': [1.1, 1.2], 'b': [2.1, 2.2]})
+ dtype = sqlalchemy.Integer if self.mode == 'sqlalchemy' else 'INTEGER'
+ create_sql = sql.get_schema(float_frame, 'test',
+ con=self.conn, dtype={'b': dtype})
+ assert 'CREATE' in create_sql
+ assert 'INTEGER' in create_sql
+
+ def test_get_schema_keys(self):
+ frame = DataFrame({'Col1': [1.1, 1.2], 'Col2': [2.1, 2.2]})
+ create_sql = sql.get_schema(frame, 'test', con=self.conn, keys='Col1')
+ constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")'
+ assert constraint_sentence in create_sql
+
+ # multiple columns as key (GH10385)
+ create_sql = sql.get_schema(self.test_frame1, 'test',
+ con=self.conn, keys=['A', 'B'])
+ constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")'
+ assert constraint_sentence in create_sql
+
+ def test_chunksize_read(self):
+ df = DataFrame(np.random.randn(22, 5), columns=list('abcde'))
+ df.to_sql('test_chunksize', self.conn, index=False)
+
+        # reading the query all at once
+ res1 = sql.read_sql_query("select * from test_chunksize", self.conn)
+
+ # reading the query in chunks with read_sql_query
+ res2 = DataFrame()
+ i = 0
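+        # 22 rows in chunks of 5 -> four full chunks plus a final chunk of 2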
+ sizes = [5, 5, 5, 5, 2]
+
+ for chunk in sql.read_sql_query("select * from test_chunksize",
+ self.conn, chunksize=5):
+ res2 = concat([res2, chunk], ignore_index=True)
+ assert len(chunk) == sizes[i]
+ i += 1
+
+ tm.assert_frame_equal(res1, res2)
+
+        # reading the table in chunks with read_sql_table
+ if self.mode == 'sqlalchemy':
+ res3 = DataFrame()
+ i = 0
+ sizes = [5, 5, 5, 5, 2]
+
+ for chunk in sql.read_sql_table("test_chunksize", self.conn,
+ chunksize=5):
+ res3 = concat([res3, chunk], ignore_index=True)
+ assert len(chunk) == sizes[i]
+ i += 1
+
+ tm.assert_frame_equal(res1, res3)
+
+ def test_categorical(self):
+ # GH8624
+ # test that categorical gets written correctly as dense column
+ df = DataFrame(
+ {'person_id': [1, 2, 3],
+ 'person_name': ['John P. Doe', 'Jane Dove', 'John P. Doe']})
+ df2 = df.copy()
+ df2['person_name'] = df2['person_name'].astype('category')
+
+ df2.to_sql('test_categorical', self.conn, index=False)
+ res = sql.read_sql_query('SELECT * FROM test_categorical', self.conn)
+
+ tm.assert_frame_equal(res, df)
+
+ def test_unicode_column_name(self):
+ # GH 11431
+ df = DataFrame([[1, 2], [3, 4]], columns=[u'\xe9', u'b'])
+ df.to_sql('test_unicode', self.conn, index=False)
+
+ def test_escaped_table_name(self):
+ # GH 13206
+ df = DataFrame({'A': [0, 1, 2], 'B': [0.2, np.nan, 5.6]})
+ df.to_sql('d1187b08-4943-4c8d-a7f6', self.conn, index=False)
+
+ res = sql.read_sql_query('SELECT * FROM `d1187b08-4943-4c8d-a7f6`',
+ self.conn)
+
+ tm.assert_frame_equal(res, df)
+
+
+class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi):
+ """
+ Test the public API as it would be used directly
+
+ Tests for `read_sql_table` are included here, as this is specific
+ to the sqlalchemy mode.
+
+ """
+ flavor = 'sqlite'
+ mode = 'sqlalchemy'
+
+ def connect(self):
+ if SQLALCHEMY_INSTALLED:
+ return sqlalchemy.create_engine('sqlite:///:memory:')
+ else:
+ pytest.skip('SQLAlchemy not installed')
+
+ def test_read_table_columns(self):
+ # test columns argument in read_table
+ sql.to_sql(self.test_frame1, 'test_frame', self.conn)
+
+ cols = ['A', 'B']
+ result = sql.read_sql_table('test_frame', self.conn, columns=cols)
+ assert result.columns.tolist() == cols
+
+ def test_read_table_index_col(self):
+ # test index_col argument in read_table
+ sql.to_sql(self.test_frame1, 'test_frame', self.conn)
+
+ result = sql.read_sql_table('test_frame', self.conn, index_col="index")
+ assert result.index.names == ["index"]
+
+ result = sql.read_sql_table(
+ 'test_frame', self.conn, index_col=["A", "B"])
+ assert result.index.names == ["A", "B"]
+
+ result = sql.read_sql_table('test_frame', self.conn,
+ index_col=["A", "B"],
+ columns=["C", "D"])
+ assert result.index.names == ["A", "B"]
+ assert result.columns.tolist() == ["C", "D"]
+
+ def test_read_sql_delegate(self):
+ iris_frame1 = sql.read_sql_query(
+ "SELECT * FROM iris", self.conn)
+ iris_frame2 = sql.read_sql(
+ "SELECT * FROM iris", self.conn)
+ tm.assert_frame_equal(iris_frame1, iris_frame2)
+
+ iris_frame1 = sql.read_sql_table('iris', self.conn)
+ iris_frame2 = sql.read_sql('iris', self.conn)
+ tm.assert_frame_equal(iris_frame1, iris_frame2)
+
+ def test_not_reflect_all_tables(self):
+ # create invalid table
+ qry = """CREATE TABLE invalid (x INTEGER, y UNKNOWN);"""
+ self.conn.execute(qry)
+ qry = """CREATE TABLE other_table (x INTEGER, y INTEGER);"""
+ self.conn.execute(qry)
+
+ with warnings.catch_warnings(record=True) as w:
+ # Cause all warnings to always be triggered.
+ warnings.simplefilter("always")
+ # Trigger a warning.
+ sql.read_sql_table('other_table', self.conn)
+ sql.read_sql_query('SELECT * FROM other_table', self.conn)
+ # Verify some things
+ assert len(w) == 0
+
+ def test_warning_case_insensitive_table_name(self):
+ # see gh-7815
+ #
+ # We can't test that this warning is triggered, as the database
+ # configuration would have to be altered. But here we test that
+ # the warning is certainly NOT triggered in a normal case.
+ with warnings.catch_warnings(record=True) as w:
+ # Cause all warnings to always be triggered.
+ warnings.simplefilter("always")
+ # This should not trigger a Warning
+ self.test_frame1.to_sql('CaseSensitive', self.conn)
+ # Verify some things
+ assert len(w) == 0
+
+ def _get_index_columns(self, tbl_name):
+ from sqlalchemy.engine import reflection
+ insp = reflection.Inspector.from_engine(self.conn)
+ ixs = insp.get_indexes(tbl_name)
+ ixs = [i['column_names'] for i in ixs]
+ return ixs
+
+ def test_sqlalchemy_type_mapping(self):
+
+ # Test Timestamp objects (no datetime64 because of timezone) (GH9085)
+ df = DataFrame({'time': to_datetime(['201412120154', '201412110254'],
+ utc=True)})
+ db = sql.SQLDatabase(self.conn)
+ table = sql.SQLTable("test_type", db, frame=df)
+ # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones
+ assert isinstance(table.table.c['time'].type, sqltypes.TIMESTAMP)
+
+ def test_database_uri_string(self):
+
+ # Test read_sql and .to_sql method with a database URI (GH10654)
+ test_frame1 = self.test_frame1
+ # db_uri = 'sqlite:///:memory:' # raises
+ # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near
+ # "iris": syntax error [SQL: 'iris']
+ with tm.ensure_clean() as name:
+ db_uri = 'sqlite:///' + name
+ table = 'iris'
+ test_frame1.to_sql(table, db_uri, if_exists='replace', index=False)
+ test_frame2 = sql.read_sql(table, db_uri)
+ test_frame3 = sql.read_sql_table(table, db_uri)
+ query = 'SELECT * FROM iris'
+ test_frame4 = sql.read_sql_query(query, db_uri)
+ tm.assert_frame_equal(test_frame1, test_frame2)
+ tm.assert_frame_equal(test_frame1, test_frame3)
+ tm.assert_frame_equal(test_frame1, test_frame4)
+
+ # using a driver that will not be installed on Travis to trigger an
+ # error in sqlalchemy.create_engine -> test passing of this error
+ # to the user
+ try:
+ # the rest of this test depends on pg8000's being absent
+ import pg8000 # noqa
+ pytest.skip("pg8000 is installed")
+ except ImportError:
+ pass
+
+ db_uri = "postgresql+pg8000://user:pass@host/dbname"
+ with pytest.raises(ImportError, match="pg8000"):
+ sql.read_sql("select * from table", db_uri)
+
+ def _make_iris_table_metadata(self):
+ sa = sqlalchemy
+ metadata = sa.MetaData()
+ iris = sa.Table('iris', metadata,
+ sa.Column('SepalLength', sa.REAL),
+ sa.Column('SepalWidth', sa.REAL),
+ sa.Column('PetalLength', sa.REAL),
+ sa.Column('PetalWidth', sa.REAL),
+ sa.Column('Name', sa.TEXT)
+ )
+
+ return iris
+
+ def test_query_by_text_obj(self):
+ # WIP : GH10846
+ name_text = sqlalchemy.text('select * from iris where name=:name')
+ iris_df = sql.read_sql(name_text, self.conn, params={
+ 'name': 'Iris-versicolor'})
+ all_names = set(iris_df['Name'])
+ assert all_names == {'Iris-versicolor'}
+
+ def test_query_by_select_obj(self):
+ # WIP : GH10846
+ iris = self._make_iris_table_metadata()
+
+ name_select = sqlalchemy.select([iris]).where(
+ iris.c.Name == sqlalchemy.bindparam('name'))
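+ # bindparam creates a named placeholder that read_sql fills in from
+ # the params dict at execution time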
+ iris_df = sql.read_sql(name_select, self.conn,
+ params={'name': 'Iris-setosa'})
+ all_names = set(iris_df['Name'])
+ assert all_names == {'Iris-setosa'}
+
+
+class _EngineToConnMixin(object):
+ """
+ A mixin that causes setup_connect to create a conn rather than an engine.
+ """
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, load_iris_data):
+ super(_EngineToConnMixin, self).load_test_data_and_sql()
+ engine = self.conn
+ conn = engine.connect()
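+ # run each test inside an explicit transaction so the rollback after
+ # the yield restores a clean database state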
+ self.__tx = conn.begin()
+ self.pandasSQL = sql.SQLDatabase(conn)
+ self.__engine = engine
+ self.conn = conn
+
+ yield
+
+ self.__tx.rollback()
+ self.conn.close()
+ self.conn = self.__engine
+ self.pandasSQL = sql.SQLDatabase(self.__engine)
+ # XXX:
+ # super(_EngineToConnMixin, self).teardown_method(method)
+
+
+class TestSQLApiConn(_EngineToConnMixin, TestSQLApi):
+ pass
+
+
+class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi):
+ """
+ Test the public sqlite connection fallback API
+
+ """
+ flavor = 'sqlite'
+ mode = 'fallback'
+
+ def connect(self, database=":memory:"):
+ return sqlite3.connect(database)
+
+ def test_sql_open_close(self):
+ # Test that database IO still works if the connection is closed
+ # between the writing and the reading (as in many real situations).
+
+ with tm.ensure_clean() as name:
+
+ conn = self.connect(name)
+ sql.to_sql(self.test_frame3, "test_frame3_legacy",
+ conn, index=False)
+ conn.close()
+
+ conn = self.connect(name)
+ result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;",
+ conn)
+ conn.close()
+
+ tm.assert_frame_equal(self.test_frame3, result)
+
+ def test_con_string_import_error(self):
+ if not SQLALCHEMY_INSTALLED:
+ conn = 'mysql://root@localhost/pandas_nosetest'
+ pytest.raises(ImportError, sql.read_sql, "SELECT * FROM iris",
+ conn)
+ else:
+ pytest.skip('SQLAlchemy is installed')
+
+ def test_read_sql_delegate(self):
+ iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn)
+ iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn)
+ tm.assert_frame_equal(iris_frame1, iris_frame2)
+
+ pytest.raises(sql.DatabaseError, sql.read_sql, 'iris', self.conn)
+
+ def test_safe_names_warning(self):
+ # GH 6798
+ df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b ']) # has a space
+ # warns on create table with spaces in names
+ with tm.assert_produces_warning():
+ sql.to_sql(df, "test_frame3_legacy", self.conn, index=False)
+
+ def test_get_schema2(self):
+ # without providing a connection object (available for backwards
+ # compatibility)
+ create_sql = sql.get_schema(self.test_frame1, 'test')
+ assert 'CREATE' in create_sql
+
+ def _get_sqlite_column_type(self, schema, column):
+
+ for col in schema.split('\n'):
+ if col.split()[0].strip('"') == column:
+ return col.split()[1]
+ raise ValueError('Column %s not found' % (column))
+
+ def test_sqlite_type_mapping(self):
+
+ # Test Timestamp objects (no datetime64 because of timezone) (GH9085)
+ df = DataFrame({'time': to_datetime(['201412120154', '201412110254'],
+ utc=True)})
+ db = sql.SQLiteDatabase(self.conn)
+ table = sql.SQLiteTable("test_type", db, frame=df)
+ schema = table.sql_schema()
+ assert self._get_sqlite_column_type(schema, 'time') == "TIMESTAMP"
+
+
+# -----------------------------------------------------------------------------
+# -- Database flavor specific tests
+
+
+class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest):
+ """
+ Base class for testing the sqlalchemy backend.
+
+ Subclasses for specific database types are created below. Tests that
+ deviate for each flavor are overridden there.
+
+ """
+ flavor = None
+
+ @pytest.fixture(autouse=True, scope='class')
+ def setup_class(cls):
+ cls.setup_import()
+ cls.setup_driver()
+ conn = cls.connect()
+ conn.connect()
+
+ def load_test_data_and_sql(self):
+ self._load_raw_sql()
+ self._load_test1_data()
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, load_iris_data):
+ self.load_test_data_and_sql()
+
+ @classmethod
+ def setup_import(cls):
+ # Skip this test if SQLAlchemy not available
+ if not SQLALCHEMY_INSTALLED:
+ pytest.skip('SQLAlchemy not installed')
+
+ @classmethod
+ def setup_driver(cls):
+ raise NotImplementedError()
+
+ @classmethod
+ def connect(cls):
+ raise NotImplementedError()
+
+ def setup_connect(self):
+ try:
+ self.conn = self.connect()
+ self.pandasSQL = sql.SQLDatabase(self.conn)
+ # to test if connection can be made:
+ self.conn.connect()
+ except sqlalchemy.exc.OperationalError:
+ pytest.skip(
+ "Can't connect to {0} server".format(self.flavor))
+
+ def test_read_sql(self):
+ self._read_sql_iris()
+
+ def test_read_sql_parameter(self):
+ self._read_sql_iris_parameter()
+
+ def test_read_sql_named_parameter(self):
+ self._read_sql_iris_named_parameter()
+
+ def test_to_sql(self):
+ self._to_sql()
+
+ def test_to_sql_empty(self):
+ self._to_sql_empty()
+
+ def test_to_sql_fail(self):
+ self._to_sql_fail()
+
+ def test_to_sql_replace(self):
+ self._to_sql_replace()
+
+ def test_to_sql_append(self):
+ self._to_sql_append()
+
+ def test_to_sql_method_multi(self):
+ self._to_sql(method='multi')
+
+ def test_to_sql_method_callable(self):
+ self._to_sql_method_callable()
+
+ def test_create_table(self):
+ temp_conn = self.connect()
+ temp_frame = DataFrame(
+ {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]})
+
+ pandasSQL = sql.SQLDatabase(temp_conn)
+ pandasSQL.to_sql(temp_frame, 'temp_frame')
+
+ assert temp_conn.has_table('temp_frame')
+
+ def test_drop_table(self):
+ temp_conn = self.connect()
+
+ temp_frame = DataFrame(
+ {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]})
+
+ pandasSQL = sql.SQLDatabase(temp_conn)
+ pandasSQL.to_sql(temp_frame, 'temp_frame')
+
+ assert temp_conn.has_table('temp_frame')
+
+ pandasSQL.drop_table('temp_frame')
+
+ assert not temp_conn.has_table('temp_frame')
+
+ def test_roundtrip(self):
+ self._roundtrip()
+
+ def test_execute_sql(self):
+ self._execute_sql()
+
+ def test_read_table(self):
+ iris_frame = sql.read_sql_table("iris", con=self.conn)
+ self._check_iris_loaded_frame(iris_frame)
+
+ def test_read_table_columns(self):
+ iris_frame = sql.read_sql_table(
+ "iris", con=self.conn, columns=['SepalLength', 'SepalLength'])
+ tm.equalContents(
+ iris_frame.columns.values, ['SepalLength', 'SepalLength'])
+
+ def test_read_table_absent(self):
+ pytest.raises(
+ ValueError, sql.read_sql_table, "this_doesnt_exist", con=self.conn)
+
+ def test_default_type_conversion(self):
+ df = sql.read_sql_table("types_test_data", self.conn)
+
+ assert issubclass(df.FloatCol.dtype.type, np.floating)
+ assert issubclass(df.IntCol.dtype.type, np.integer)
+ assert issubclass(df.BoolCol.dtype.type, np.bool_)
+
+ # Int column with NA values stays as float
+ assert issubclass(df.IntColWithNull.dtype.type, np.floating)
+ # Bool column with NA values becomes object
+ assert issubclass(df.BoolColWithNull.dtype.type, np.object)
+
+ def test_bigint(self):
+ # int64 should be converted to BigInteger, GH7433
+ df = DataFrame(data={'i64': [2**62]})
+ df.to_sql('test_bigint', self.conn, index=False)
+ result = sql.read_sql_table('test_bigint', self.conn)
+
+ tm.assert_frame_equal(df, result)
+
+ def test_default_date_load(self):
+ df = sql.read_sql_table("types_test_data", self.conn)
+
+ # IMPORTANT - sqlite has no native date type, so it shouldn't parse,
+ # but MySQL SHOULD convert it (the sqlite subclass overrides this test).
+ assert issubclass(df.DateCol.dtype.type, np.datetime64)
+
+ def test_datetime_with_timezone(self):
+ # edge case that converts postgresql datetime with time zone types
+ # to datetime64[ns,psycopg2.tz.FixedOffsetTimezone..], which is ok
+ # but should be more natural, so coerce to datetime64[ns] for now
+
+ def check(col):
+ # check that a column is either datetime64[ns]
+ # or datetime64[ns, UTC]
+ if is_datetime64_dtype(col.dtype):
+
+ # "2000-01-01 00:00:00-08:00" should convert to
+ # "2000-01-01 08:00:00"
+ assert col[0] == Timestamp('2000-01-01 08:00:00')
+
+ # "2000-06-01 00:00:00-07:00" should convert to
+ # "2000-06-01 07:00:00"
+ assert col[1] == Timestamp('2000-06-01 07:00:00')
+
+ elif is_datetime64tz_dtype(col.dtype):
+ assert str(col.dt.tz) == 'UTC'
+
+ # "2000-01-01 00:00:00-08:00" should convert to
+ # "2000-01-01 08:00:00"
+ # "2000-06-01 00:00:00-07:00" should convert to
+ # "2000-06-01 07:00:00"
+ # GH 6415
+ expected_data = [Timestamp('2000-01-01 08:00:00', tz='UTC'),
+ Timestamp('2000-06-01 07:00:00', tz='UTC')]
+ expected = Series(expected_data, name=col.name)
+ tm.assert_series_equal(col, expected)
+
+ else:
+ raise AssertionError("DateCol loaded with incorrect type "
+ "-> {0}".format(col.dtype))
+
+ # GH11216
+ df = pd.read_sql_query("select * from types_test_data", self.conn)
+ if not hasattr(df, 'DateColWithTz'):
+ pytest.skip("no column with datetime with time zone")
+
+ # this is parsed on Travis (linux), but not on macosx for some reason
+ # even with the same versions of psycopg2 & sqlalchemy, possibly a
+ # PostgreSQL server version difference
+ col = df.DateColWithTz
+ assert is_datetime64tz_dtype(col.dtype)
+
+ df = pd.read_sql_query("select * from types_test_data",
+ self.conn, parse_dates=['DateColWithTz'])
+ if not hasattr(df, 'DateColWithTz'):
+ pytest.skip("no column with datetime with time zone")
+ col = df.DateColWithTz
+ assert is_datetime64tz_dtype(col.dtype)
+ assert str(col.dt.tz) == 'UTC'
+ check(df.DateColWithTz)
+
+ df = pd.concat(list(pd.read_sql_query("select * from types_test_data",
+ self.conn, chunksize=1)),
+ ignore_index=True)
+ col = df.DateColWithTz
+ assert is_datetime64tz_dtype(col.dtype)
+ assert str(col.dt.tz) == 'UTC'
+ expected = sql.read_sql_table("types_test_data", self.conn)
+ col = expected.DateColWithTz
+ assert is_datetime64tz_dtype(col.dtype)
+ tm.assert_series_equal(df.DateColWithTz, expected.DateColWithTz)
+
+ # xref #7139
+ # this might or might not be converted depending on the postgres driver
+ df = sql.read_sql_table("types_test_data", self.conn)
+ check(df.DateColWithTz)
+
+ def test_datetime_with_timezone_roundtrip(self):
+ # GH 9086
+ # Write datetimetz data to a db and read it back
+ # For dbs that support timestamps with timezones, should get back UTC
+ # otherwise naive data should be returned
+ expected = DataFrame({'A': date_range(
+ '2013-01-01 09:00:00', periods=3, tz='US/Pacific'
+ )})
+ expected.to_sql('test_datetime_tz', self.conn, index=False)
+
+ if self.flavor == 'postgresql':
+ # SQLAlchemy "timezones" (i.e. offsets) are coerced to UTC
+ expected['A'] = expected['A'].dt.tz_convert('UTC')
+ else:
+ # Otherwise, timestamps are returned as local, naive
+ expected['A'] = expected['A'].dt.tz_localize(None)
+
+ result = sql.read_sql_table('test_datetime_tz', self.conn)
+ tm.assert_frame_equal(result, expected)
+
+ result = sql.read_sql_query(
+ 'SELECT * FROM test_datetime_tz', self.conn
+ )
+ if self.flavor == 'sqlite':
+ # read_sql_query does not return datetime type like read_sql_table
+ assert isinstance(result.loc[0, 'A'], string_types)
+ result['A'] = to_datetime(result['A'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_naive_datetimeindex_roundtrip(self):
+ # GH 23510
+ # Ensure that a naive DatetimeIndex isn't converted to UTC
+ dates = date_range('2018-01-01', periods=5, freq='6H')
+ expected = DataFrame({'nums': range(5)}, index=dates)
+ expected.to_sql('foo_table', self.conn, index_label='info_date')
+ result = sql.read_sql_table('foo_table', self.conn,
+ index_col='info_date')
+ # the result index gains a name from the set_index operation, while
+ # the expected index has none, so compare with check_names=False
+ tm.assert_frame_equal(result, expected, check_names=False)
+
+ def test_date_parsing(self):
+ # No Parsing
+ df = sql.read_sql_table("types_test_data", self.conn)
+ expected_type = object if self.flavor == 'sqlite' else np.datetime64
+ assert issubclass(df.DateCol.dtype.type, expected_type)
+
+ df = sql.read_sql_table("types_test_data", self.conn,
+ parse_dates=['DateCol'])
+ assert issubclass(df.DateCol.dtype.type, np.datetime64)
+
+ df = sql.read_sql_table("types_test_data", self.conn,
+ parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'})
+ assert issubclass(df.DateCol.dtype.type, np.datetime64)
+
+ df = sql.read_sql_table("types_test_data", self.conn, parse_dates={
+ 'DateCol': {'format': '%Y-%m-%d %H:%M:%S'}})
+ assert issubclass(df.DateCol.dtype.type, np.datetime64)
+
+ df = sql.read_sql_table(
+ "types_test_data", self.conn, parse_dates=['IntDateCol'])
+ assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+
+ df = sql.read_sql_table(
+ "types_test_data", self.conn, parse_dates={'IntDateCol': 's'})
+ assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+
+ df = sql.read_sql_table("types_test_data", self.conn,
+ parse_dates={'IntDateCol': {'unit': 's'}})
+ assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+
+ def test_datetime(self):
+ df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3),
+ 'B': np.arange(3.0)})
+ df.to_sql('test_datetime', self.conn)
+
+ # with read_table -> type information from schema used
+ result = sql.read_sql_table('test_datetime', self.conn)
+ result = result.drop('index', axis=1)
+ tm.assert_frame_equal(result, df)
+
+ # with read_sql -> no type information; sqlite has no native
+ # datetime type, so values come back as strings
+ result = sql.read_sql_query('SELECT * FROM test_datetime', self.conn)
+ result = result.drop('index', axis=1)
+ if self.flavor == 'sqlite':
+ assert isinstance(result.loc[0, 'A'], string_types)
+ result['A'] = to_datetime(result['A'])
+ tm.assert_frame_equal(result, df)
+ else:
+ tm.assert_frame_equal(result, df)
+
+ def test_datetime_NaT(self):
+ df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3),
+ 'B': np.arange(3.0)})
+ df.loc[1, 'A'] = np.nan
+ df.to_sql('test_datetime', self.conn, index=False)
+
+ # with read_table -> type information from schema used
+ result = sql.read_sql_table('test_datetime', self.conn)
+ tm.assert_frame_equal(result, df)
+
+ # with read_sql -> no type information; sqlite has no native
+ # datetime type, so values come back as strings
+ result = sql.read_sql_query('SELECT * FROM test_datetime', self.conn)
+ if self.flavor == 'sqlite':
+ assert isinstance(result.loc[0, 'A'], string_types)
+ result['A'] = to_datetime(result['A'], errors='coerce')
+ tm.assert_frame_equal(result, df)
+ else:
+ tm.assert_frame_equal(result, df)
+
+ def test_datetime_date(self):
+ # test support for datetime.date
+ df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"])
+ df.to_sql('test_date', self.conn, index=False)
+ res = read_sql_table('test_date', self.conn)
+ result = res['a']
+ expected = to_datetime(df['a'])
+ # comes back as datetime64
+ tm.assert_series_equal(result, expected)
+
+ def test_datetime_time(self):
+ # test support for datetime.time
+ df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"])
+ df.to_sql('test_time', self.conn, index=False)
+ res = read_sql_table('test_time', self.conn)
+ tm.assert_frame_equal(res, df)
+
+ # GH8341
+ # first, use the fallback to have the sqlite adapter put in place
+ sqlite_conn = TestSQLiteFallback.connect()
+ sql.to_sql(df, "test_time2", sqlite_conn, index=False)
+ res = sql.read_sql_query("SELECT * FROM test_time2", sqlite_conn)
+ ref = df.applymap(lambda _: _.strftime("%H:%M:%S.%f"))
+ tm.assert_frame_equal(ref, res) # check if adapter is in place
+ # then test if sqlalchemy is unaffected by the sqlite adapter
+ sql.to_sql(df, "test_time3", self.conn, index=False)
+ if self.flavor == 'sqlite':
+ res = sql.read_sql_query("SELECT * FROM test_time3", self.conn)
+ ref = df.applymap(lambda _: _.strftime("%H:%M:%S.%f"))
+ tm.assert_frame_equal(ref, res)
+ res = sql.read_sql_table("test_time3", self.conn)
+ tm.assert_frame_equal(df, res)
+
+ def test_mixed_dtype_insert(self):
+ # see GH6509
+ s1 = Series(2**25 + 1, dtype=np.int32)
+ s2 = Series(0.0, dtype=np.float32)
+ df = DataFrame({'s1': s1, 's2': s2})
+
+ # write and read again
+ df.to_sql("test_read_write", self.conn, index=False)
+ df2 = sql.read_sql_table("test_read_write", self.conn)
+
+ tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True)
+
+ def test_nan_numeric(self):
+ # NaNs in numeric float column
+ df = DataFrame({'A': [0, 1, 2], 'B': [0.2, np.nan, 5.6]})
+ df.to_sql('test_nan', self.conn, index=False)
+
+ # with read_table
+ result = sql.read_sql_table('test_nan', self.conn)
+ tm.assert_frame_equal(result, df)
+
+ # with read_sql
+ result = sql.read_sql_query('SELECT * FROM test_nan', self.conn)
+ tm.assert_frame_equal(result, df)
+
+ def test_nan_fullcolumn(self):
+ # full NaN column (numeric float column)
+ df = DataFrame({'A': [0, 1, 2], 'B': [np.nan, np.nan, np.nan]})
+ df.to_sql('test_nan', self.conn, index=False)
+
+ # with read_table
+ result = sql.read_sql_table('test_nan', self.conn)
+ tm.assert_frame_equal(result, df)
+
+ # with read_sql -> no type info from table -> stays None
+ df['B'] = df['B'].astype('object')
+ df['B'] = None
+ result = sql.read_sql_query('SELECT * FROM test_nan', self.conn)
+ tm.assert_frame_equal(result, df)
+
+ def test_nan_string(self):
+ # NaNs in string column
+ df = DataFrame({'A': [0, 1, 2], 'B': ['a', 'b', np.nan]})
+ df.to_sql('test_nan', self.conn, index=False)
+
+ # NaNs are coming back as None
+ df.loc[2, 'B'] = None
+
+ # with read_table
+ result = sql.read_sql_table('test_nan', self.conn)
+ tm.assert_frame_equal(result, df)
+
+ # with read_sql
+ result = sql.read_sql_query('SELECT * FROM test_nan', self.conn)
+ tm.assert_frame_equal(result, df)
+
+ def _get_index_columns(self, tbl_name):
+ from sqlalchemy.engine import reflection
+ insp = reflection.Inspector.from_engine(self.conn)
+ ixs = insp.get_indexes(tbl_name)
+ ixs = [i['column_names'] for i in ixs]
+ return ixs
+
+ def test_to_sql_save_index(self):
+ self._to_sql_save_index()
+
+ def test_transactions(self):
+ self._transaction_test()
+
+ def test_get_schema_create_table(self):
+ # Use a dataframe without a bool column, since MySQL converts bool to
+ # TINYINT (which read_sql_table returns as an int and causes a dtype
+ # mismatch)
+
+ self._load_test3_data()
+ tbl = 'test_get_schema_create_table'
+ create_sql = sql.get_schema(self.test_frame3, tbl, con=self.conn)
+ blank_test_df = self.test_frame3.iloc[:0]
+
+ self.drop_table(tbl)
+ self.conn.execute(create_sql)
+ returned_df = sql.read_sql_table(tbl, self.conn)
+ tm.assert_frame_equal(returned_df, blank_test_df,
+ check_index_type=False)
+ self.drop_table(tbl)
+
+ def test_dtype(self):
+ cols = ['A', 'B']
+ data = [(0.8, True),
+ (0.9, None)]
+ df = DataFrame(data, columns=cols)
+ df.to_sql('dtype_test', self.conn)
+ df.to_sql('dtype_test2', self.conn, dtype={'B': sqlalchemy.TEXT})
+ meta = sqlalchemy.schema.MetaData(bind=self.conn)
+ meta.reflect()
+ sqltype = meta.tables['dtype_test2'].columns['B'].type
+ assert isinstance(sqltype, sqlalchemy.TEXT)
+ pytest.raises(ValueError, df.to_sql,
+ 'error', self.conn, dtype={'B': str})
+
+ # GH9083
+ df.to_sql('dtype_test3', self.conn, dtype={'B': sqlalchemy.String(10)})
+ meta.reflect()
+ sqltype = meta.tables['dtype_test3'].columns['B'].type
+ assert isinstance(sqltype, sqlalchemy.String)
+ assert sqltype.length == 10
+
+ # single dtype
+ df.to_sql('single_dtype_test', self.conn, dtype=sqlalchemy.TEXT)
+ meta = sqlalchemy.schema.MetaData(bind=self.conn)
+ meta.reflect()
+ sqltypea = meta.tables['single_dtype_test'].columns['A'].type
+ sqltypeb = meta.tables['single_dtype_test'].columns['B'].type
+ assert isinstance(sqltypea, sqlalchemy.TEXT)
+ assert isinstance(sqltypeb, sqlalchemy.TEXT)
+
+ def test_notna_dtype(self):
+ cols = {'Bool': Series([True, None]),
+ 'Date': Series([datetime(2012, 5, 1), None]),
+ 'Int': Series([1, None], dtype='object'),
+ 'Float': Series([1.1, None])
+ }
+ df = DataFrame(cols)
+
+ tbl = 'notna_dtype_test'
+ df.to_sql(tbl, self.conn)
+ returned_df = sql.read_sql_table(tbl, self.conn) # noqa
+ meta = sqlalchemy.schema.MetaData(bind=self.conn)
+ meta.reflect()
+ if self.flavor == 'mysql':
+ my_type = sqltypes.Integer
+ else:
+ my_type = sqltypes.Boolean
+
+ col_dict = meta.tables[tbl].columns
+
+ assert isinstance(col_dict['Bool'].type, my_type)
+ assert isinstance(col_dict['Date'].type, sqltypes.DateTime)
+ assert isinstance(col_dict['Int'].type, sqltypes.Integer)
+ assert isinstance(col_dict['Float'].type, sqltypes.Float)
+
+ def test_double_precision(self):
+ V = 1.23456789101112131415
+
+ df = DataFrame({'f32': Series([V, ], dtype='float32'),
+ 'f64': Series([V, ], dtype='float64'),
+ 'f64_as_f32': Series([V, ], dtype='float64'),
+ 'i32': Series([5, ], dtype='int32'),
+ 'i64': Series([5, ], dtype='int64'),
+ })
+
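+ # Float(precision=23) maps to a 4-byte single-precision float on
+ # PostgreSQL and MySQL, so 'f64_as_f32' is stored with reduced precision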
+ df.to_sql('test_dtypes', self.conn, index=False, if_exists='replace',
+ dtype={'f64_as_f32': sqlalchemy.Float(precision=23)})
+ res = sql.read_sql_table('test_dtypes', self.conn)
+
+ # check precision of float64
+ assert (np.round(df['f64'].iloc[0], 14) ==
+ np.round(res['f64'].iloc[0], 14))
+
+ # check sql types
+ meta = sqlalchemy.schema.MetaData(bind=self.conn)
+ meta.reflect()
+ col_dict = meta.tables['test_dtypes'].columns
+ assert str(col_dict['f32'].type) == str(col_dict['f64_as_f32'].type)
+ assert isinstance(col_dict['f32'].type, sqltypes.Float)
+ assert isinstance(col_dict['f64'].type, sqltypes.Float)
+ assert isinstance(col_dict['i32'].type, sqltypes.Integer)
+ assert isinstance(col_dict['i64'].type, sqltypes.BigInteger)
+
+ def test_connectable_issue_example(self):
+ # This tests the example raised in issue
+ # https://github.com/pandas-dev/pandas/issues/10104
+
+ def foo(connection):
+ query = 'SELECT test_foo_data FROM test_foo_data'
+ return sql.read_sql_query(query, con=connection)
+
+ def bar(connection, data):
+ data.to_sql(name='test_foo_data',
+ con=connection, if_exists='append')
+
+ def main(connectable):
+ with connectable.connect() as conn:
+ with conn.begin():
+ foo_data = conn.run_callable(foo)
+ conn.run_callable(bar, foo_data)
+
+ DataFrame({'test_foo_data': [0, 1, 2]}).to_sql(
+ 'test_foo_data', self.conn)
+ main(self.conn)
+
+ def test_temporary_table(self):
+ test_data = u'Hello, World!'
+ expected = DataFrame({'spam': [test_data]})
+ Base = declarative.declarative_base()
+
+ class Temporary(Base):
+ __tablename__ = 'temp_test'
+ __table_args__ = {'prefixes': ['TEMPORARY']}
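+ # the TEMPORARY prefix makes SQLAlchemy emit CREATE TEMPORARY TABLE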
+ id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
+ spam = sqlalchemy.Column(sqlalchemy.Unicode(30), nullable=False)
+
+ Session = sa_session.sessionmaker(bind=self.conn)
+ session = Session()
+ with session.transaction:
+ conn = session.connection()
+ Temporary.__table__.create(conn)
+ session.add(Temporary(spam=test_data))
+ session.flush()
+ df = sql.read_sql_query(
+ sql=sqlalchemy.select([Temporary.spam]),
+ con=conn,
+ )
+
+ tm.assert_frame_equal(df, expected)
+
+
+class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy):
+
+ def test_transactions(self):
+ pytest.skip(
+ "Nested transactions rollbacks don't work with Pandas")
+
+
+class _TestSQLiteAlchemy(object):
+ """
+ Test the sqlalchemy backend against an in-memory sqlite database.
+
+ """
+ flavor = 'sqlite'
+
+ @classmethod
+ def connect(cls):
+ return sqlalchemy.create_engine('sqlite:///:memory:')
+
+ @classmethod
+ def setup_driver(cls):
+ # sqlite3 is built-in
+ cls.driver = None
+
+ def test_default_type_conversion(self):
+ df = sql.read_sql_table("types_test_data", self.conn)
+
+ assert issubclass(df.FloatCol.dtype.type, np.floating)
+ assert issubclass(df.IntCol.dtype.type, np.integer)
+
+ # sqlite has no boolean type, so integer type is returned
+ assert issubclass(df.BoolCol.dtype.type, np.integer)
+
+ # Int column with NA values stays as float
+ assert issubclass(df.IntColWithNull.dtype.type, np.floating)
+
+ # Non-native Bool column with NA values stays as float
+ assert issubclass(df.BoolColWithNull.dtype.type, np.floating)
+
+ def test_default_date_load(self):
+ df = sql.read_sql_table("types_test_data", self.conn)
+
+ # IMPORTANT - sqlite has no native date type, so it shouldn't parse
+ assert not issubclass(df.DateCol.dtype.type, np.datetime64)
+
+ def test_bigint_warning(self):
+ # test that no warning for BIGINT (to support int64) is raised (GH7433)
+ df = DataFrame({'a': [1, 2]}, dtype='int64')
+ df.to_sql('test_bigintwarning', self.conn, index=False)
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+ sql.read_sql_table('test_bigintwarning', self.conn)
+ assert len(w) == 0
+
+
+class _TestMySQLAlchemy(object):
+ """
+ Test the sqlalchemy backend against a MySQL database.
+
+ """
+ flavor = 'mysql'
+
+ @classmethod
+ def connect(cls):
+ url = 'mysql+{driver}://root@localhost/pandas_nosetest'
+ return sqlalchemy.create_engine(url.format(driver=cls.driver),
+ connect_args=cls.connect_args)
+
+ @classmethod
+ def setup_driver(cls):
+ pymysql = pytest.importorskip('pymysql')
+ cls.driver = 'pymysql'
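+ # MULTI_STATEMENTS allows a single execute() call to run several
+ # semicolon-separated statements (used by test_read_procedure below)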
+ cls.connect_args = {
+ 'client_flag': pymysql.constants.CLIENT.MULTI_STATEMENTS}
+
+ def test_default_type_conversion(self):
+ df = sql.read_sql_table("types_test_data", self.conn)
+
+ assert issubclass(df.FloatCol.dtype.type, np.floating)
+ assert issubclass(df.IntCol.dtype.type, np.integer)
+
+ # MySQL has no real BOOL type (it's an alias for TINYINT)
+ assert issubclass(df.BoolCol.dtype.type, np.integer)
+
+ # Int column with NA values stays as float
+ assert issubclass(df.IntColWithNull.dtype.type, np.floating)
+
+ # a Bool column with NA behaves like an int column with NA values,
+ # so it becomes float
+ assert issubclass(df.BoolColWithNull.dtype.type, np.floating)
+
+ def test_read_procedure(self):
+ import pymysql
+ # see GH7324. Although it is more of an API test, it is added to the
+ # mysql tests as sqlite does not have stored procedures
+ df = DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]})
+ df.to_sql('test_procedure', self.conn, index=False)
+
+ proc = """DROP PROCEDURE IF EXISTS get_testdb;
+
+ CREATE PROCEDURE get_testdb ()
+
+ BEGIN
+ SELECT * FROM test_procedure;
+ END"""
+
+ connection = self.conn.connect()
+ trans = connection.begin()
+ try:
+ r1 = connection.execute(proc) # noqa
+ trans.commit()
+ except pymysql.Error:
+ trans.rollback()
+ raise
+
+ res1 = sql.read_sql_query("CALL get_testdb();", self.conn)
+ tm.assert_frame_equal(df, res1)
+
+ # test delegation to read_sql_query
+ res2 = sql.read_sql("CALL get_testdb();", self.conn)
+ tm.assert_frame_equal(df, res2)
+
+
+class _TestPostgreSQLAlchemy(object):
+ """
+ Test the sqlalchemy backend against a PostgreSQL database.
+
+ """
+ flavor = 'postgresql'
+
+ @classmethod
+ def connect(cls):
+ url = 'postgresql+{driver}://postgres@localhost/pandas_nosetest'
+ return sqlalchemy.create_engine(url.format(driver=cls.driver))
+
+ @classmethod
+ def setup_driver(cls):
+ pytest.importorskip('psycopg2')
+ cls.driver = 'psycopg2'
+
+ def test_schema_support(self):
+ # only test this for postgresql (schemas are not supported in
+ # mysql/sqlite)
+ df = DataFrame({'col1': [1, 2], 'col2': [
+ 0.1, 0.2], 'col3': ['a', 'n']})
+
+ # create a schema
+ self.conn.execute("DROP SCHEMA IF EXISTS other CASCADE;")
+ self.conn.execute("CREATE SCHEMA other;")
+
+ # write dataframe to different schemas
+ df.to_sql('test_schema_public', self.conn, index=False)
+ df.to_sql('test_schema_public_explicit', self.conn, index=False,
+ schema='public')
+ df.to_sql('test_schema_other', self.conn, index=False, schema='other')
+
+ # read dataframes back in
+ res1 = sql.read_sql_table('test_schema_public', self.conn)
+ tm.assert_frame_equal(df, res1)
+ res2 = sql.read_sql_table('test_schema_public_explicit', self.conn)
+ tm.assert_frame_equal(df, res2)
+ res3 = sql.read_sql_table('test_schema_public_explicit', self.conn,
+ schema='public')
+ tm.assert_frame_equal(df, res3)
+ res4 = sql.read_sql_table('test_schema_other', self.conn,
+ schema='other')
+ tm.assert_frame_equal(df, res4)
+ pytest.raises(ValueError, sql.read_sql_table, 'test_schema_other',
+ self.conn, schema='public')
+
+ # different if_exists options
+
+ # create a schema
+ self.conn.execute("DROP SCHEMA IF EXISTS other CASCADE;")
+ self.conn.execute("CREATE SCHEMA other;")
+
+ # write dataframe with different if_exists options
+ df.to_sql('test_schema_other', self.conn, schema='other', index=False)
+ df.to_sql('test_schema_other', self.conn, schema='other', index=False,
+ if_exists='replace')
+ df.to_sql('test_schema_other', self.conn, schema='other', index=False,
+ if_exists='append')
+ res = sql.read_sql_table(
+ 'test_schema_other', self.conn, schema='other')
+ tm.assert_frame_equal(concat([df, df], ignore_index=True), res)
+
+ # specifying schema in user-provided meta
+
+ # The schema won't be applied to another Connection
+ # because of transactional schemas
+ if isinstance(self.conn, sqlalchemy.engine.Engine):
+ engine2 = self.connect()
+ meta = sqlalchemy.MetaData(engine2, schema='other')
+ pdsql = sql.SQLDatabase(engine2, meta=meta)
+ pdsql.to_sql(df, 'test_schema_other2', index=False)
+ pdsql.to_sql(df, 'test_schema_other2',
+ index=False, if_exists='replace')
+ pdsql.to_sql(df, 'test_schema_other2',
+ index=False, if_exists='append')
+ res1 = sql.read_sql_table(
+ 'test_schema_other2', self.conn, schema='other')
+ res2 = pdsql.read_table('test_schema_other2')
+ tm.assert_frame_equal(res1, res2)
+
+ def test_copy_from_callable_insertion_method(self):
+ # GH 8953
+ # Example in io.rst found under _io.sql.method
+ # not available in sqlite, mysql
+ def psql_insert_copy(table, conn, keys, data_iter):
+ # gets a DBAPI connection that can provide a cursor
+ dbapi_conn = conn.connection
+ with dbapi_conn.cursor() as cur:
+ s_buf = compat.StringIO()
+ writer = csv.writer(s_buf)
+ writer.writerows(data_iter)
+ s_buf.seek(0)
+
+ columns = ', '.join('"{}"'.format(k) for k in keys)
+ if table.schema:
+ table_name = '{}.{}'.format(table.schema, table.name)
+ else:
+ table_name = table.name
+
+ sql_query = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
+ table_name, columns)
+ cur.copy_expert(sql=sql_query, file=s_buf)
+
+ expected = DataFrame({'col1': [1, 2], 'col2': [0.1, 0.2],
+ 'col3': ['a', 'n']})
+ expected.to_sql('test_copy_insert', self.conn, index=False,
+ method=psql_insert_copy)
+ result = sql.read_sql_table('test_copy_insert', self.conn)
+ tm.assert_frame_equal(result, expected)
+
+
+class TestMySQLAlchemy(_TestMySQLAlchemy, _TestSQLAlchemy):
+ pass
+
+
+class TestMySQLAlchemyConn(_TestMySQLAlchemy, _TestSQLAlchemyConn):
+ pass
+
+
+class TestPostgreSQLAlchemy(_TestPostgreSQLAlchemy, _TestSQLAlchemy):
+ pass
+
+
+class TestPostgreSQLAlchemyConn(_TestPostgreSQLAlchemy, _TestSQLAlchemyConn):
+ pass
+
+
+class TestSQLiteAlchemy(_TestSQLiteAlchemy, _TestSQLAlchemy):
+ pass
+
+
+class TestSQLiteAlchemyConn(_TestSQLiteAlchemy, _TestSQLAlchemyConn):
+ pass
+
+
+# -----------------------------------------------------------------------------
+# -- Test Sqlite / MySQL fallback
+
+class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest):
+ """
+ Test the fallback mode against an in-memory sqlite database.
+
+ """
+ flavor = 'sqlite'
+
+ @classmethod
+ def connect(cls):
+ return sqlite3.connect(':memory:')
+
+ def setup_connect(self):
+ self.conn = self.connect()
+
+ def load_test_data_and_sql(self):
+ self.pandasSQL = sql.SQLiteDatabase(self.conn)
+ self._load_test1_data()
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, load_iris_data):
+ self.load_test_data_and_sql()
+
+ def test_read_sql(self):
+ self._read_sql_iris()
+
+ def test_read_sql_parameter(self):
+ self._read_sql_iris_parameter()
+
+ def test_read_sql_named_parameter(self):
+ self._read_sql_iris_named_parameter()
+
+ def test_to_sql(self):
+ self._to_sql()
+
+ def test_to_sql_empty(self):
+ self._to_sql_empty()
+
+ def test_to_sql_fail(self):
+ self._to_sql_fail()
+
+ def test_to_sql_replace(self):
+ self._to_sql_replace()
+
+ def test_to_sql_append(self):
+ self._to_sql_append()
+
+ def test_create_and_drop_table(self):
+ temp_frame = DataFrame(
+ {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]})
+
+ self.pandasSQL.to_sql(temp_frame, 'drop_test_frame')
+
+ assert self.pandasSQL.has_table('drop_test_frame')
+
+ self.pandasSQL.drop_table('drop_test_frame')
+
+ assert not self.pandasSQL.has_table('drop_test_frame')
+
+ def test_roundtrip(self):
+ self._roundtrip()
+
+ def test_execute_sql(self):
+ self._execute_sql()
+
+ def test_datetime_date(self):
+ # test support for datetime.date
+ df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"])
+ df.to_sql('test_date', self.conn, index=False)
+ res = read_sql_query('SELECT * FROM test_date', self.conn)
+ if self.flavor == 'sqlite':
+ # comes back as strings
+ tm.assert_frame_equal(res, df.astype(str))
+ elif self.flavor == 'mysql':
+ tm.assert_frame_equal(res, df)
+
+ def test_datetime_time(self):
+ # test support for datetime.time, GH #8341
+ df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"])
+ df.to_sql('test_time', self.conn, index=False)
+ res = read_sql_query('SELECT * FROM test_time', self.conn)
+ if self.flavor == 'sqlite':
+ # comes back as strings
+ expected = df.applymap(lambda _: _.strftime("%H:%M:%S.%f"))
+ tm.assert_frame_equal(res, expected)
+
+ def _get_index_columns(self, tbl_name):
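+ # sqlite keeps index metadata in sqlite_master; PRAGMA index_info
+ # then lists the columns making up each index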
+ ixs = sql.read_sql_query(
+ "SELECT * FROM sqlite_master WHERE type = 'index' " +
+ "AND tbl_name = '%s'" % tbl_name, self.conn)
+ ix_cols = []
+ for ix_name in ixs.name:
+ ix_info = sql.read_sql_query(
+ "PRAGMA index_info(%s)" % ix_name, self.conn)
+ ix_cols.append(ix_info.name.tolist())
+ return ix_cols
+
+ def test_to_sql_save_index(self):
+ self._to_sql_save_index()
+
+ def test_transactions(self):
+ if PY36:
+ pytest.skip("not working on python > 3.5")
+ self._transaction_test()
+
+ def _get_sqlite_column_type(self, table, column):
+ recs = self.conn.execute('PRAGMA table_info(%s)' % table)
+ for cid, name, ctype, not_null, default, pk in recs:
+ if name == column:
+ return ctype
+ raise ValueError('Table %s, column %s not found' % (table, column))
+
+ def test_dtype(self):
+ if self.flavor == 'mysql':
+ pytest.skip('Not applicable to MySQL legacy')
+ cols = ['A', 'B']
+ data = [(0.8, True),
+ (0.9, None)]
+ df = DataFrame(data, columns=cols)
+ df.to_sql('dtype_test', self.conn)
+ df.to_sql('dtype_test2', self.conn, dtype={'B': 'STRING'})
+
+ # sqlite stores Boolean values as INTEGER
+ assert self._get_sqlite_column_type(
+ 'dtype_test', 'B') == 'INTEGER'
+
+ assert self._get_sqlite_column_type(
+ 'dtype_test2', 'B') == 'STRING'
+ pytest.raises(ValueError, df.to_sql,
+ 'error', self.conn, dtype={'B': bool})
+
+ # single dtype
+ df.to_sql('single_dtype_test', self.conn, dtype='STRING')
+ assert self._get_sqlite_column_type(
+ 'single_dtype_test', 'A') == 'STRING'
+ assert self._get_sqlite_column_type(
+ 'single_dtype_test', 'B') == 'STRING'
+
+ def test_notna_dtype(self):
+ if self.flavor == 'mysql':
+ pytest.skip('Not applicable to MySQL legacy')
+
+ cols = {'Bool': Series([True, None]),
+ 'Date': Series([datetime(2012, 5, 1), None]),
+ 'Int': Series([1, None], dtype='object'),
+ 'Float': Series([1.1, None])
+ }
+ df = DataFrame(cols)
+
+ tbl = 'notna_dtype_test'
+ df.to_sql(tbl, self.conn)
+
+ assert self._get_sqlite_column_type(tbl, 'Bool') == 'INTEGER'
+ assert self._get_sqlite_column_type(tbl, 'Date') == 'TIMESTAMP'
+ assert self._get_sqlite_column_type(tbl, 'Int') == 'INTEGER'
+ assert self._get_sqlite_column_type(tbl, 'Float') == 'REAL'
+
+ def test_illegal_names(self):
+ # For sqlite, these should work fine
+ df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
+
+ # Raise an error on a blank table name
+ pytest.raises(ValueError, df.to_sql, "", self.conn)
+
+ for ndx, weird_name in enumerate(
+ ['test_weird_name]', 'test_weird_name[',
+ 'test_weird_name`', 'test_weird_name"', 'test_weird_name\'',
+ '_b.test_weird_name_01-30', '"_b.test_weird_name_01-30"',
+ '99beginswithnumber', '12345', u'\xe9']):
+ df.to_sql(weird_name, self.conn)
+ sql.table_exists(weird_name, self.conn)
+
+ df2 = DataFrame([[1, 2], [3, 4]], columns=['a', weird_name])
+ c_tbl = 'test_weird_col_name%d' % ndx
+ df2.to_sql(c_tbl, self.conn)
+ sql.table_exists(c_tbl, self.conn)
+
+
+# -----------------------------------------------------------------------------
+# -- Old tests from 0.13.1 (before refactor using sqlalchemy)
+
+
+def date_format(dt):
+ """Returns date in YYYYMMDD format."""
+ return dt.strftime('%Y%m%d')
+
+
+_formatters = {
+ datetime: lambda dt: "'%s'" % date_format(dt),
+ str: lambda x: "'%s'" % x,
+ np.str_: lambda x: "'%s'" % x,
+ compat.text_type: lambda x: "'%s'" % x,
+ compat.binary_type: lambda x: "'%s'" % x,
+ float: lambda x: "%.8f" % x,
+ int: lambda x: "%s" % x,
+ type(None): lambda x: "NULL",
+ np.float64: lambda x: "%.10f" % x,
+ bool: lambda x: "'%s'" % x,
+}
+
+
+def format_query(sql, *args):
+ """
+
+ """
+ processed_args = []
+ for arg in args:
+ if isinstance(arg, float) and isna(arg):
+ arg = None
+
+ formatter = _formatters[type(arg)]
+ processed_args.append(formatter(arg))
+
+ return sql % tuple(processed_args)
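+ # e.g. (illustrative): format_query("INSERT INTO t VALUES (%s, %s)", 1.0, None)
+ # returns "INSERT INTO t VALUES (1.00000000, NULL)"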
+
+
+def tquery(query, con=None, cur=None):
+ """Replace removed sql.tquery function"""
+ res = sql.execute(query, con=con, cur=cur).fetchall()
+ if res is None:
+ return None
+ else:
+ return list(res)
+
+
+class TestXSQLite(SQLiteMixIn):
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, request, datapath):
+ self.method = request.function
+ self.conn = sqlite3.connect(':memory:')
+
+ # In some test cases we may close the db connection
+ # Re-open conn here so we can perform cleanup in teardown
+ yield
+ self.method = request.function
+ self.conn = sqlite3.connect(':memory:')
+
+ def test_basic(self):
+ frame = tm.makeTimeDataFrame()
+ self._check_roundtrip(frame)
+
+ def test_write_row_by_row(self):
+
+ frame = tm.makeTimeDataFrame()
+ frame.iloc[0, 0] = np.nan
+ create_sql = sql.get_schema(frame, 'test')
+ cur = self.conn.cursor()
+ cur.execute(create_sql)
+
+ cur = self.conn.cursor()
+
+ ins = "INSERT INTO test VALUES (%s, %s, %s, %s)"
+ for idx, row in frame.iterrows():
+ fmt_sql = format_query(ins, *row)
+ tquery(fmt_sql, cur=cur)
+
+ self.conn.commit()
+
+ result = sql.read_sql("select * from test", con=self.conn)
+ result.index = frame.index
+ tm.assert_frame_equal(result, frame, check_less_precise=True)
+
+ def test_execute(self):
+ frame = tm.makeTimeDataFrame()
+ create_sql = sql.get_schema(frame, 'test')
+ cur = self.conn.cursor()
+ cur.execute(create_sql)
+ ins = "INSERT INTO test VALUES (?, ?, ?, ?)"
+
+ row = frame.iloc[0]
+ sql.execute(ins, self.conn, params=tuple(row))
+ self.conn.commit()
+
+ result = sql.read_sql("select * from test", self.conn)
+ result.index = frame.index[:1]
+ tm.assert_frame_equal(result, frame[:1])
+
+ def test_schema(self):
+ frame = tm.makeTimeDataFrame()
+ create_sql = sql.get_schema(frame, 'test')
+ lines = create_sql.splitlines()
+ for l in lines:
+ tokens = l.split(' ')
+ if len(tokens) == 2 and tokens[0] == 'A':
+ assert tokens[1] == 'DATETIME'
+
+ frame = tm.makeTimeDataFrame()
+ create_sql = sql.get_schema(frame, 'test', keys=['A', 'B'])
+ lines = create_sql.splitlines()
+ assert 'PRIMARY KEY ("A", "B")' in create_sql
+ cur = self.conn.cursor()
+ cur.execute(create_sql)
+
+ def test_execute_fail(self):
+ create_sql = """
+ CREATE TABLE test
+ (
+ a TEXT,
+ b TEXT,
+ c REAL,
+ PRIMARY KEY (a, b)
+ );
+ """
+ cur = self.conn.cursor()
+ cur.execute(create_sql)
+
+ sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.conn)
+ sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)', self.conn)
+
+ with pytest.raises(Exception):
+ sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn)
+
+ def test_execute_closed_connection(self):
+ create_sql = """
+ CREATE TABLE test
+ (
+ a TEXT,
+ b TEXT,
+ c REAL,
+ PRIMARY KEY (a, b)
+ );
+ """
+ cur = self.conn.cursor()
+ cur.execute(create_sql)
+
+ sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.conn)
+ self.conn.close()
+
+ with pytest.raises(Exception):
+ tquery("select * from test", con=self.conn)
+
+ def test_na_roundtrip(self):
+ pass
+
+ def _check_roundtrip(self, frame):
+ sql.to_sql(frame, name='test_table', con=self.conn, index=False)
+ result = sql.read_sql("select * from test_table", self.conn)
+
+ # HACK! Change this once indexes are handled properly.
+ result.index = frame.index
+
+ expected = frame
+ tm.assert_frame_equal(result, expected)
+
+ frame['txt'] = ['a'] * len(frame)
+ frame2 = frame.copy()
+ frame2['Idx'] = Index(lrange(len(frame2))) + 10
+ sql.to_sql(frame2, name='test_table2', con=self.conn, index=False)
+ result = sql.read_sql("select * from test_table2", self.conn,
+ index_col='Idx')
+ expected = frame.copy()
+ expected.index = Index(lrange(len(frame2))) + 10
+ expected.index.name = 'Idx'
+ tm.assert_frame_equal(expected, result)
+
+ def test_keyword_as_column_names(self):
+ df = DataFrame({'From': np.ones(5)})
+ sql.to_sql(df, con=self.conn, name='testkeywords', index=False)
+
+ def test_onecolumn_of_integer(self):
+ # GH 3628
+ # a column_of_integers dataframe should transfer well to sql
+
+ mono_df = DataFrame([1, 2], columns=['c0'])
+ sql.to_sql(mono_df, con=self.conn, name='mono_df', index=False)
+ # computing the sum via sql
+ con_x = self.conn
+ the_sum = sum(my_c0[0]
+ for my_c0 in con_x.execute("select * from mono_df"))
+ # it should not fail, and should give 3 (issue #3628)
+ assert the_sum == 3
+
+ result = sql.read_sql("select * from mono_df", con_x)
+ tm.assert_frame_equal(result, mono_df)
+
+ def test_if_exists(self):
+ df_if_exists_1 = DataFrame({'col1': [1, 2], 'col2': ['A', 'B']})
+ df_if_exists_2 = DataFrame(
+ {'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']})
+ table_name = 'table_if_exists'
+ sql_select = "SELECT * FROM %s" % table_name
+
+ def clean_up(test_table_to_drop):
+ """
+ Drops tables created from individual tests
+ so no dependencies arise from sequential tests
+ """
+ self.drop_table(test_table_to_drop)
+
+ # test that an invalid value for if_exists raises the appropriate error
+ pytest.raises(ValueError,
+ sql.to_sql,
+ frame=df_if_exists_1,
+ con=self.conn,
+ name=table_name,
+ if_exists='notvalidvalue')
+ clean_up(table_name)
+
+ # test if_exists='fail'
+ sql.to_sql(frame=df_if_exists_1, con=self.conn,
+ name=table_name, if_exists='fail')
+ pytest.raises(ValueError,
+ sql.to_sql,
+ frame=df_if_exists_1,
+ con=self.conn,
+ name=table_name,
+ if_exists='fail')
+
+ # test if_exists='replace'
+ sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name,
+ if_exists='replace', index=False)
+ assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')]
+ sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name,
+ if_exists='replace', index=False)
+ assert (tquery(sql_select, con=self.conn) ==
+ [(3, 'C'), (4, 'D'), (5, 'E')])
+ clean_up(table_name)
+
+ # test if_exists='append'
+ sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name,
+ if_exists='fail', index=False)
+ assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')]
+ sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name,
+ if_exists='append', index=False)
+ assert (tquery(sql_select, con=self.conn) ==
+ [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')])
+ clean_up(table_name)
+
+
[email protected](reason="gh-13611: there is no support for MySQL "
+ "if SQLAlchemy is not installed")
+class TestXMySQL(MySQLMixIn):
+
+ @pytest.fixture(autouse=True, scope='class')
+ def setup_class(cls):
+ pymysql = pytest.importorskip('pymysql')
+ pymysql.connect(host='localhost', user='root', passwd='',
+ db='pandas_nosetest')
+ try:
+ pymysql.connect(read_default_group='pandas')
+ except pymysql.ProgrammingError:
+ raise RuntimeError(
+ "Create a group of connection parameters under the heading "
+ "[pandas] in your system's mysql default file, "
+ "typically located at ~/.my.cnf or /etc/.my.cnf.")
+ except pymysql.Error:
+ raise RuntimeError(
+ "Cannot connect to database. "
+ "Create a group of connection parameters under the heading "
+ "[pandas] in your system's mysql default file, "
+ "typically located at ~/.my.cnf or /etc/.my.cnf.")
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, request, datapath):
+ pymysql = pytest.importorskip('pymysql')
+ pymysql.connect(host='localhost', user='root', passwd='',
+ db='pandas_nosetest')
+ try:
+ pymysql.connect(read_default_group='pandas')
+ except pymysql.ProgrammingError:
+ raise RuntimeError(
+ "Create a group of connection parameters under the heading "
+ "[pandas] in your system's mysql default file, "
+ "typically located at ~/.my.cnf or /etc/.my.cnf.")
+ except pymysql.Error:
+ raise RuntimeError(
+ "Cannot connect to database. "
+ "Create a group of connection parameters under the heading "
+ "[pandas] in your system's mysql default file, "
+ "typically located at ~/.my.cnf or /etc/.my.cnf.")
+
+ self.method = request.function
+
+ def test_basic(self):
+ frame = tm.makeTimeDataFrame()
+ self._check_roundtrip(frame)
+
+ def test_write_row_by_row(self):
+ frame = tm.makeTimeDataFrame()
+ frame.iloc[0, 0] = np.nan
+ drop_sql = "DROP TABLE IF EXISTS test"
+ create_sql = sql.get_schema(frame, 'test')
+ cur = self.conn.cursor()
+ cur.execute(drop_sql)
+ cur.execute(create_sql)
+ ins = "INSERT INTO test VALUES (%s, %s, %s, %s)"
+ for idx, row in frame.iterrows():
+ fmt_sql = format_query(ins, *row)
+ tquery(fmt_sql, cur=cur)
+
+ self.conn.commit()
+
+ result = sql.read_sql("select * from test", con=self.conn)
+ result.index = frame.index
+ tm.assert_frame_equal(result, frame, check_less_precise=True)
+
+ def test_chunksize_read_type(self):
+ frame = tm.makeTimeDataFrame()
+ frame.index.name = "index"
+ drop_sql = "DROP TABLE IF EXISTS test"
+ cur = self.conn.cursor()
+ cur.execute(drop_sql)
+ sql.to_sql(frame, name='test', con=self.conn)
+ query = "select * from test"
+ chunksize = 5
+ chunk_gen = pd.read_sql_query(sql=query, con=self.conn,
+ chunksize=chunksize, index_col="index")
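+ # with chunksize set, read_sql_query returns an iterator of DataFrames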
+ chunk_df = next(chunk_gen)
+ tm.assert_frame_equal(frame[:chunksize], chunk_df)
+
+ def test_execute(self):
+ frame = tm.makeTimeDataFrame()
+ drop_sql = "DROP TABLE IF EXISTS test"
+ create_sql = sql.get_schema(frame, 'test')
+ cur = self.conn.cursor()
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", "Unknown table.*")
+ cur.execute(drop_sql)
+ cur.execute(create_sql)
+ ins = "INSERT INTO test VALUES (%s, %s, %s, %s)"
+
+ row = frame.iloc[0].values.tolist()
+ sql.execute(ins, self.conn, params=tuple(row))
+ self.conn.commit()
+
+ result = sql.read_sql("select * from test", self.conn)
+ result.index = frame.index[:1]
+ tm.assert_frame_equal(result, frame[:1])
+
+ def test_schema(self):
+ frame = tm.makeTimeDataFrame()
+ create_sql = sql.get_schema(frame, 'test')
+ lines = create_sql.splitlines()
+ for l in lines:
+ tokens = l.split(' ')
+ if len(tokens) == 2 and tokens[0] == 'A':
+ assert tokens[1] == 'DATETIME'
+
+ frame = tm.makeTimeDataFrame()
+ drop_sql = "DROP TABLE IF EXISTS test"
+ create_sql = sql.get_schema(frame, 'test', keys=['A', 'B'])
+ lines = create_sql.splitlines()
+ assert 'PRIMARY KEY (`A`, `B`)' in create_sql
+ cur = self.conn.cursor()
+ cur.execute(drop_sql)
+ cur.execute(create_sql)
+
+ def test_execute_fail(self):
+ drop_sql = "DROP TABLE IF EXISTS test"
+ create_sql = """
+ CREATE TABLE test
+ (
+ a TEXT,
+ b TEXT,
+ c REAL,
+ PRIMARY KEY (a(5), b(5))
+ );
+ """
+ cur = self.conn.cursor()
+ cur.execute(drop_sql)
+ cur.execute(create_sql)
+
+ sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.conn)
+ sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)', self.conn)
+
+ with pytest.raises(Exception):
+ sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn)
+
+ def test_execute_closed_connection(self, request, datapath):
+ drop_sql = "DROP TABLE IF EXISTS test"
+ create_sql = """
+ CREATE TABLE test
+ (
+ a TEXT,
+ b TEXT,
+ c REAL,
+ PRIMARY KEY (a(5), b(5))
+ );
+ """
+ cur = self.conn.cursor()
+ cur.execute(drop_sql)
+ cur.execute(create_sql)
+
+ sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.conn)
+ self.conn.close()
+
+ with pytest.raises(Exception):
+ tquery("select * from test", con=self.conn)
+
+ # Initialize connection again (needed for tearDown)
+ self.setup_method(request, datapath)
+
+ def test_na_roundtrip(self):
+ pass
+
+ def _check_roundtrip(self, frame):
+ drop_sql = "DROP TABLE IF EXISTS test_table"
+ cur = self.conn.cursor()
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", "Unknown table.*")
+ cur.execute(drop_sql)
+ sql.to_sql(frame, name='test_table', con=self.conn, index=False)
+ result = sql.read_sql("select * from test_table", self.conn)
+
+ # HACK! Change this once indexes are handled properly.
+ result.index = frame.index
+ result.index.name = frame.index.name
+
+ expected = frame
+ tm.assert_frame_equal(result, expected)
+
+ frame['txt'] = ['a'] * len(frame)
+ frame2 = frame.copy()
+ index = Index(lrange(len(frame2))) + 10
+ frame2['Idx'] = index
+ drop_sql = "DROP TABLE IF EXISTS test_table2"
+ cur = self.conn.cursor()
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", "Unknown table.*")
+ cur.execute(drop_sql)
+ sql.to_sql(frame2, name='test_table2',
+ con=self.conn, index=False)
+ result = sql.read_sql("select * from test_table2", self.conn,
+ index_col='Idx')
+ expected = frame.copy()
+
+ # HACK! Change this once indexes are handled properly.
+ expected.index = index
+ expected.index.names = result.index.names
+ tm.assert_frame_equal(expected, result)
+
+ def test_keyword_as_column_names(self):
+ df = DataFrame({'From': np.ones(5)})
+ sql.to_sql(df, con=self.conn, name='testkeywords',
+ if_exists='replace', index=False)
+
+ def test_if_exists(self):
+ df_if_exists_1 = DataFrame({'col1': [1, 2], 'col2': ['A', 'B']})
+ df_if_exists_2 = DataFrame(
+ {'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']})
+ table_name = 'table_if_exists'
+ sql_select = "SELECT * FROM %s" % table_name
+
+ def clean_up(test_table_to_drop):
+ """
+ Drops tables created from individual tests
+ so no dependencies arise from sequential tests
+ """
+ self.drop_table(test_table_to_drop)
+
+ # test that an invalid value for if_exists raises the appropriate error
+ pytest.raises(ValueError,
+ sql.to_sql,
+ frame=df_if_exists_1,
+ con=self.conn,
+ name=table_name,
+ if_exists='notvalidvalue')
+ clean_up(table_name)
+
+ # test if_exists='fail'
+ sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name,
+ if_exists='fail', index=False)
+ pytest.raises(ValueError,
+ sql.to_sql,
+ frame=df_if_exists_1,
+ con=self.conn,
+ name=table_name,
+ if_exists='fail')
+
+ # test if_exists='replace'
+ sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name,
+ if_exists='replace', index=False)
+ assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')]
+ sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name,
+ if_exists='replace', index=False)
+ assert (tquery(sql_select, con=self.conn) ==
+ [(3, 'C'), (4, 'D'), (5, 'E')])
+ clean_up(table_name)
+
+ # test if_exists='append'
+ sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name,
+ if_exists='fail', index=False)
+ assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')]
+ sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name,
+ if_exists='append', index=False)
+ assert (tquery(sql_select, con=self.conn) ==
+ [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')])
+ clean_up(table_name)
diff --git a/contrib/python/pandas/py2/pandas/tests/io/test_stata.py b/contrib/python/pandas/py2/pandas/tests/io/test_stata.py
new file mode 100644
index 00000000000..586297d2e38
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/test_stata.py
@@ -0,0 +1,1613 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=E1101
+
+from collections import OrderedDict
+import datetime as dt
+from datetime import datetime
+import gzip
+import io
+import os
+import struct
+import warnings
+
+import numpy as np
+import pytest
+
+import pandas.compat as compat
+from pandas.compat import PY3, ResourceWarning, iterkeys
+
+from pandas.core.dtypes.common import is_categorical_dtype
+
+import pandas as pd
+from pandas.core.frame import DataFrame, Series
+import pandas.util.testing as tm
+
+from pandas.io.parsers import read_csv
+from pandas.io.stata import (
+ InvalidColumnName, PossiblePrecisionLoss, StataMissingValue, StataReader,
+ read_stata)
+
+
+def dirpath(datapath):
+ return datapath("io", "data")
+
+
+def parsed_114(dirpath):
+ dta14_114 = os.path.join(dirpath, 'stata5_114.dta')
+ parsed_114 = read_stata(dta14_114, convert_dates=True)
+ parsed_114.index.name = 'index'
+ return parsed_114
+
+
+class TestStata(object):
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, datapath):
+ self.dirpath = datapath("io", "data")
+ self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta')
+ self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta')
+
+ self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta')
+ self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta')
+ self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta')
+ self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta')
+
+ self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta')
+ self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta')
+ self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta')
+ self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta')
+ self.csv3 = os.path.join(self.dirpath, 'stata3.csv')
+
+ self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta')
+ self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta')
+ self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta')
+ self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta')
+
+ self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
+
+ self.csv14 = os.path.join(self.dirpath, 'stata5.csv')
+ self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
+ self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta')
+ self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta')
+ self.dta14_117 = os.path.join(self.dirpath, 'stata5_117.dta')
+
+ self.csv15 = os.path.join(self.dirpath, 'stata6.csv')
+ self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta')
+ self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta')
+ self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
+ self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta')
+
+ self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta')
+ self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta')
+
+ self.dta17_113 = os.path.join(self.dirpath, 'stata8_113.dta')
+ self.dta17_115 = os.path.join(self.dirpath, 'stata8_115.dta')
+ self.dta17_117 = os.path.join(self.dirpath, 'stata8_117.dta')
+
+ self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta')
+ self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta')
+
+ self.dta19_115 = os.path.join(self.dirpath, 'stata10_115.dta')
+ self.dta19_117 = os.path.join(self.dirpath, 'stata10_117.dta')
+
+ self.dta20_115 = os.path.join(self.dirpath, 'stata11_115.dta')
+ self.dta20_117 = os.path.join(self.dirpath, 'stata11_117.dta')
+
+ self.dta21_117 = os.path.join(self.dirpath, 'stata12_117.dta')
+
+ self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta')
+ self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
+
+ self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
+ self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
+
+ self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
+
+ def read_dta(self, file):
+ # Legacy default reader configuration
+ return read_stata(file, convert_dates=True)
+
+ def read_csv(self, file):
+ return read_csv(file, parse_dates=True)
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_read_empty_dta(self, version):
+ empty_ds = DataFrame(columns=['unit'])
+ # GH 7369, make sure can read a 0-obs dta file
+ with tm.ensure_clean() as path:
+ empty_ds.to_stata(path, write_index=False, version=version)
+ empty_ds2 = read_stata(path)
+ tm.assert_frame_equal(empty_ds, empty_ds2)
+
+ def test_data_method(self):
+ # Minimal testing of legacy data method
+ with StataReader(self.dta1_114) as rdr:
+ with tm.assert_produces_warning(UserWarning):
+ parsed_114_data = rdr.data()
+
+ with StataReader(self.dta1_114) as rdr:
+ parsed_114_read = rdr.read()
+ tm.assert_frame_equal(parsed_114_data, parsed_114_read)
+
+ @pytest.mark.parametrize(
+ 'file', ['dta1_114', 'dta1_117'])
+ def test_read_dta1(self, file):
+
+ file = getattr(self, file)
+ parsed = self.read_dta(file)
+
+ # Pandas uses np.nan as missing value.
+ # Thus, all columns will be of type float, regardless of their name.
+ expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)],
+ columns=['float_miss', 'double_miss', 'byte_miss',
+ 'int_miss', 'long_miss'])
+
+        # This is an oddity: the NaN really ought to be float64, but the
+        # cast to float32 succeeds, so match Stata's column type here.
+ expected['float_miss'] = expected['float_miss'].astype(np.float32)
+
+ tm.assert_frame_equal(parsed, expected)
+
+ def test_read_dta2(self):
+
+ expected = DataFrame.from_records(
+ [
+ (
+ datetime(2006, 11, 19, 23, 13, 20),
+ 1479596223000,
+ datetime(2010, 1, 20),
+ datetime(2010, 1, 8),
+ datetime(2010, 1, 1),
+ datetime(1974, 7, 1),
+ datetime(2010, 1, 1),
+ datetime(2010, 1, 1)
+ ),
+ (
+ datetime(1959, 12, 31, 20, 3, 20),
+ -1479590,
+ datetime(1953, 10, 2),
+ datetime(1948, 6, 10),
+ datetime(1955, 1, 1),
+ datetime(1955, 7, 1),
+ datetime(1955, 1, 1),
+ datetime(2, 1, 1)
+ ),
+ (
+ pd.NaT,
+ pd.NaT,
+ pd.NaT,
+ pd.NaT,
+ pd.NaT,
+ pd.NaT,
+ pd.NaT,
+ pd.NaT,
+ )
+ ],
+ columns=['datetime_c', 'datetime_big_c', 'date', 'weekly_date',
+ 'monthly_date', 'quarterly_date', 'half_yearly_date',
+ 'yearly_date']
+ )
+ expected['yearly_date'] = expected['yearly_date'].astype('O')
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+ parsed_114 = self.read_dta(self.dta2_114)
+ parsed_115 = self.read_dta(self.dta2_115)
+ parsed_117 = self.read_dta(self.dta2_117)
+ # 113 is buggy due to limits of date format support in Stata
+ # parsed_113 = self.read_dta(self.dta2_113)
+
+            # Keep only the UserWarnings, dropping ResourceWarnings etc.
+ w = [x for x in w if x.category is UserWarning]
+
+ # should get warning for each call to read_dta
+ assert len(w) == 3
+
+ # buggy test because of the NaT comparison on certain platforms
+ # Format 113 test fails since it does not support tc and tC formats
+ # tm.assert_frame_equal(parsed_113, expected)
+ tm.assert_frame_equal(parsed_114, expected,
+ check_datetimelike_compat=True)
+ tm.assert_frame_equal(parsed_115, expected,
+ check_datetimelike_compat=True)
+ tm.assert_frame_equal(parsed_117, expected,
+ check_datetimelike_compat=True)
+
+ @pytest.mark.parametrize(
+ 'file', ['dta3_113', 'dta3_114', 'dta3_115', 'dta3_117'])
+ def test_read_dta3(self, file):
+
+ file = getattr(self, file)
+ parsed = self.read_dta(file)
+
+ # match stata here
+ expected = self.read_csv(self.csv3)
+ expected = expected.astype(np.float32)
+ expected['year'] = expected['year'].astype(np.int16)
+ expected['quarter'] = expected['quarter'].astype(np.int8)
+
+ tm.assert_frame_equal(parsed, expected)
+
+ @pytest.mark.parametrize(
+ 'file', ['dta4_113', 'dta4_114', 'dta4_115', 'dta4_117'])
+ def test_read_dta4(self, file):
+
+ file = getattr(self, file)
+ parsed = self.read_dta(file)
+
+ expected = DataFrame.from_records(
+ [
+ ["one", "ten", "one", "one", "one"],
+ ["two", "nine", "two", "two", "two"],
+ ["three", "eight", "three", "three", "three"],
+ ["four", "seven", 4, "four", "four"],
+ ["five", "six", 5, np.nan, "five"],
+ ["six", "five", 6, np.nan, "six"],
+ ["seven", "four", 7, np.nan, "seven"],
+ ["eight", "three", 8, np.nan, "eight"],
+ ["nine", "two", 9, np.nan, "nine"],
+ ["ten", "one", "ten", np.nan, "ten"]
+ ],
+ columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled',
+ 'labeled_with_missings', 'float_labelled'])
+
+ # these are all categoricals
+ expected = pd.concat([expected[col].astype('category')
+ for col in expected], axis=1)
+
+        # Stata doesn't save .categories metadata (ordering or dtype),
+        # so compare values only via check_categorical=False.
+ tm.assert_frame_equal(parsed, expected, check_categorical=False)
+
+ # File containing strls
+ def test_read_dta12(self):
+ parsed_117 = self.read_dta(self.dta21_117)
+ expected = DataFrame.from_records(
+ [
+ [1, "abc", "abcdefghi"],
+ [3, "cba", "qwertywertyqwerty"],
+ [93, "", "strl"],
+ ],
+ columns=['x', 'y', 'z'])
+
+ tm.assert_frame_equal(parsed_117, expected, check_dtype=False)
+
+ def test_read_dta18(self):
+ parsed_118 = self.read_dta(self.dta22_118)
+ parsed_118["Bytes"] = parsed_118["Bytes"].astype('O')
+ expected = DataFrame.from_records(
+ [['Cat', 'Bogota', u'Bogotá', 1, 1.0, u'option b Ünicode', 1.0],
+ ['Dog', 'Boston', u'Uzunköprü', np.nan, np.nan, np.nan, np.nan],
+ ['Plane', 'Rome', u'Tromsø', 0, 0.0, 'option a', 0.0],
+ ['Potato', 'Tokyo', u'Elâzığ', -4, 4.0, 4, 4],
+ ['', '', '', 0, 0.3332999, 'option a', 1 / 3.]
+ ],
+ columns=['Things', 'Cities', 'Unicode_Cities_Strl',
+ 'Ints', 'Floats', 'Bytes', 'Longs'])
+ expected["Floats"] = expected["Floats"].astype(np.float32)
+ for col in parsed_118.columns:
+ tm.assert_almost_equal(parsed_118[col], expected[col])
+
+ with StataReader(self.dta22_118) as rdr:
+ vl = rdr.variable_labels()
+ vl_expected = {u'Unicode_Cities_Strl':
+ u'Here are some strls with Ünicode chars',
+ u'Longs': u'long data',
+ u'Things': u'Here are some things',
+ u'Bytes': u'byte data',
+ u'Ints': u'int data',
+ u'Cities': u'Here are some cities',
+ u'Floats': u'float data'}
+ tm.assert_dict_equal(vl, vl_expected)
+
+ assert rdr.data_label == u'This is a Ünicode data label'
+
+ def test_read_write_dta5(self):
+ original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)],
+ columns=['float_miss', 'double_miss', 'byte_miss',
+ 'int_miss', 'long_miss'])
+ original.index.name = 'index'
+
+ with tm.ensure_clean() as path:
+ original.to_stata(path, None)
+ written_and_read_again = self.read_dta(path)
+ tm.assert_frame_equal(written_and_read_again.set_index('index'),
+ original)
+
+ def test_write_dta6(self):
+ original = self.read_csv(self.csv3)
+ original.index.name = 'index'
+ original.index = original.index.astype(np.int32)
+ original['year'] = original['year'].astype(np.int32)
+ original['quarter'] = original['quarter'].astype(np.int32)
+
+ with tm.ensure_clean() as path:
+ original.to_stata(path, None)
+ written_and_read_again = self.read_dta(path)
+ tm.assert_frame_equal(written_and_read_again.set_index('index'),
+ original, check_index_type=False)
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_read_write_dta10(self, version):
+ original = DataFrame(data=[["string", "object", 1, 1.1,
+ np.datetime64('2003-12-25')]],
+ columns=['string', 'object', 'integer',
+ 'floating', 'datetime'])
+ original["object"] = Series(original["object"], dtype=object)
+ original.index.name = 'index'
+ original.index = original.index.astype(np.int32)
+ original['integer'] = original['integer'].astype(np.int32)
+
+ with tm.ensure_clean() as path:
+ original.to_stata(path, {'datetime': 'tc'}, version=version)
+ written_and_read_again = self.read_dta(path)
+ # original.index is np.int32, read index is np.int64
+ tm.assert_frame_equal(written_and_read_again.set_index('index'),
+ original, check_index_type=False)
+
+ def test_stata_doc_examples(self):
+ with tm.ensure_clean() as path:
+ df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
+ df.to_stata(path)
+
+ def test_write_preserves_original(self):
+ # 9795
+ np.random.seed(423)
+ df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd'))
+ df.loc[2, 'a':'c'] = np.nan
+ df_copy = df.copy()
+ with tm.ensure_clean() as path:
+ df.to_stata(path, write_index=False)
+ tm.assert_frame_equal(df, df_copy)
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_encoding(self, version):
+
+ # GH 4626, proper encoding handling
+ raw = read_stata(self.dta_encoding)
+ with tm.assert_produces_warning(FutureWarning):
+ encoded = read_stata(self.dta_encoding, encoding='latin-1')
+ result = encoded.kreis1849[0]
+
+ expected = raw.kreis1849[0]
+ assert result == expected
+ assert isinstance(result, compat.string_types)
+
+ with tm.ensure_clean() as path:
+ with tm.assert_produces_warning(FutureWarning):
+ encoded.to_stata(path, write_index=False, version=version,
+ encoding='latin-1')
+ reread_encoded = read_stata(path)
+ tm.assert_frame_equal(encoded, reread_encoded)
+
+ def test_read_write_dta11(self):
+ original = DataFrame([(1, 2, 3, 4)],
+ columns=['good', compat.u('b\u00E4d'), '8number',
+ 'astringwithmorethan32characters______'])
+ formatted = DataFrame([(1, 2, 3, 4)],
+ columns=['good', 'b_d', '_8number',
+ 'astringwithmorethan32characters_'])
+ formatted.index.name = 'index'
+ formatted = formatted.astype(np.int32)
+
+ with tm.ensure_clean() as path:
+ with tm.assert_produces_warning(pd.io.stata.InvalidColumnName):
+ original.to_stata(path, None)
+
+ written_and_read_again = self.read_dta(path)
+ tm.assert_frame_equal(
+ written_and_read_again.set_index('index'), formatted)
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_read_write_dta12(self, version):
+ original = DataFrame([(1, 2, 3, 4, 5, 6)],
+ columns=['astringwithmorethan32characters_1',
+ 'astringwithmorethan32characters_2',
+ '+',
+ '-',
+ 'short',
+ 'delete'])
+ formatted = DataFrame([(1, 2, 3, 4, 5, 6)],
+ columns=['astringwithmorethan32characters_',
+ '_0astringwithmorethan32character',
+ '_',
+ '_1_',
+ '_short',
+ '_delete'])
+ formatted.index.name = 'index'
+ formatted = formatted.astype(np.int32)
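+
+        # The expected names above appear to encode the writer's munging
+        # rules: truncate to 32 characters, map characters outside
+        # [A-Za-z0-9_] to '_', disambiguate the resulting collisions with
+        # a numeric prefix, and prefix reserved Stata words such as
+        # 'short' and 'delete' with '_'.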
+
+ with tm.ensure_clean() as path:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always', InvalidColumnName)
+ original.to_stata(path, None, version=version)
+ # should get a warning for that format.
+ assert len(w) == 1
+
+ written_and_read_again = self.read_dta(path)
+ tm.assert_frame_equal(
+ written_and_read_again.set_index('index'), formatted)
+
+ def test_read_write_dta13(self):
+ s1 = Series(2 ** 9, dtype=np.int16)
+ s2 = Series(2 ** 17, dtype=np.int32)
+ s3 = Series(2 ** 33, dtype=np.int64)
+ original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3})
+ original.index.name = 'index'
+
+ formatted = original
+ formatted['int64'] = formatted['int64'].astype(np.float64)
+
+ with tm.ensure_clean() as path:
+ original.to_stata(path)
+ written_and_read_again = self.read_dta(path)
+ tm.assert_frame_equal(written_and_read_again.set_index('index'),
+ formatted)
+
+ @pytest.mark.parametrize('version', [114, 117])
+ @pytest.mark.parametrize(
+ 'file', ['dta14_113', 'dta14_114', 'dta14_115', 'dta14_117'])
+ def test_read_write_reread_dta14(self, file, parsed_114, version):
+ file = getattr(self, file)
+ parsed = self.read_dta(file)
+ parsed.index.name = 'index'
+
+ expected = self.read_csv(self.csv14)
+ cols = ['byte_', 'int_', 'long_', 'float_', 'double_']
+ for col in cols:
+ expected[col] = expected[col]._convert(datetime=True, numeric=True)
+ expected['float_'] = expected['float_'].astype(np.float32)
+ expected['date_td'] = pd.to_datetime(
+ expected['date_td'], errors='coerce')
+
+ tm.assert_frame_equal(parsed_114, parsed)
+
+ with tm.ensure_clean() as path:
+ parsed_114.to_stata(path, {'date_td': 'td'}, version=version)
+ written_and_read_again = self.read_dta(path)
+ tm.assert_frame_equal(
+ written_and_read_again.set_index('index'), parsed_114)
+
+ @pytest.mark.parametrize(
+ 'file', ['dta15_113', 'dta15_114', 'dta15_115', 'dta15_117'])
+ def test_read_write_reread_dta15(self, file):
+
+ expected = self.read_csv(self.csv15)
+ expected['byte_'] = expected['byte_'].astype(np.int8)
+ expected['int_'] = expected['int_'].astype(np.int16)
+ expected['long_'] = expected['long_'].astype(np.int32)
+ expected['float_'] = expected['float_'].astype(np.float32)
+ expected['double_'] = expected['double_'].astype(np.float64)
+ expected['date_td'] = expected['date_td'].apply(
+ datetime.strptime, args=('%Y-%m-%d',))
+
+ file = getattr(self, file)
+ parsed = self.read_dta(file)
+
+ tm.assert_frame_equal(expected, parsed)
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_timestamp_and_label(self, version):
+ original = DataFrame([(1,)], columns=['variable'])
+ time_stamp = datetime(2000, 2, 29, 14, 21)
+ data_label = 'This is a data file.'
+ with tm.ensure_clean() as path:
+ original.to_stata(path, time_stamp=time_stamp,
+ data_label=data_label,
+ version=version)
+
+ with StataReader(path) as reader:
+ assert reader.time_stamp == '29 Feb 2000 14:21'
+ assert reader.data_label == data_label
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_invalid_timestamp(self, version):
+ original = DataFrame([(1,)], columns=['variable'])
+ time_stamp = '01 Jan 2000, 00:00:00'
+ with tm.ensure_clean() as path:
+ msg = "time_stamp should be datetime type"
+ with pytest.raises(ValueError, match=msg):
+ original.to_stata(path, time_stamp=time_stamp,
+ version=version)
+
+ def test_numeric_column_names(self):
+ original = DataFrame(np.reshape(np.arange(25.0), (5, 5)))
+ original.index.name = 'index'
+ with tm.ensure_clean() as path:
+ # should get a warning for that format.
+ with tm.assert_produces_warning(InvalidColumnName):
+ original.to_stata(path)
+
+ written_and_read_again = self.read_dta(path)
+ written_and_read_again = written_and_read_again.set_index('index')
+ columns = list(written_and_read_again.columns)
+        # Written numeric column names come back as '_0' ... '_4';
+        # strip the leading underscore to recover the integer labels.
+        written_and_read_again.columns = [int(col[1:]) for col in columns]
+ tm.assert_frame_equal(original, written_and_read_again)
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_nan_to_missing_value(self, version):
+ s1 = Series(np.arange(4.0), dtype=np.float32)
+ s2 = Series(np.arange(4.0), dtype=np.float64)
+ s1[::2] = np.nan
+ s2[1::2] = np.nan
+ original = DataFrame({'s1': s1, 's2': s2})
+ original.index.name = 'index'
+ with tm.ensure_clean() as path:
+ original.to_stata(path, version=version)
+ written_and_read_again = self.read_dta(path)
+ written_and_read_again = written_and_read_again.set_index('index')
+ tm.assert_frame_equal(written_and_read_again, original)
+
+ def test_no_index(self):
+ columns = ['x', 'y']
+ original = DataFrame(np.reshape(np.arange(10.0), (5, 2)),
+ columns=columns)
+ original.index.name = 'index_not_written'
+ with tm.ensure_clean() as path:
+ original.to_stata(path, write_index=False)
+ written_and_read_again = self.read_dta(path)
+ with pytest.raises(KeyError, match=original.index.name):
+ written_and_read_again['index_not_written']
+
+ def test_string_no_dates(self):
+ s1 = Series(['a', 'A longer string'])
+ s2 = Series([1.0, 2.0], dtype=np.float64)
+ original = DataFrame({'s1': s1, 's2': s2})
+ original.index.name = 'index'
+ with tm.ensure_clean() as path:
+ original.to_stata(path)
+ written_and_read_again = self.read_dta(path)
+ tm.assert_frame_equal(written_and_read_again.set_index('index'),
+ original)
+
+ def test_large_value_conversion(self):
+ s0 = Series([1, 99], dtype=np.int8)
+ s1 = Series([1, 127], dtype=np.int8)
+ s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
+ s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
+ original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
+ original.index.name = 'index'
+ with tm.ensure_clean() as path:
+ with tm.assert_produces_warning(PossiblePrecisionLoss):
+ original.to_stata(path)
+
+ written_and_read_again = self.read_dta(path)
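+        # Stata reserves the top of each integer range for missing-value
+        # codes and has no 64-bit integer type, so values near a dtype's
+        # maximum are upcast on write: int8 -> int16, int16 -> int32 and
+        # int64 -> float64 ('s0' stays int8 because 99 is in range).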
+ modified = original.copy()
+ modified['s1'] = Series(modified['s1'], dtype=np.int16)
+ modified['s2'] = Series(modified['s2'], dtype=np.int32)
+ modified['s3'] = Series(modified['s3'], dtype=np.float64)
+ tm.assert_frame_equal(written_and_read_again.set_index('index'),
+ modified)
+
+ def test_dates_invalid_column(self):
+ original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
+ original.index.name = 'index'
+ with tm.ensure_clean() as path:
+ with tm.assert_produces_warning(InvalidColumnName):
+ original.to_stata(path, {0: 'tc'})
+
+ written_and_read_again = self.read_dta(path)
+ modified = original.copy()
+ modified.columns = ['_0']
+ tm.assert_frame_equal(written_and_read_again.set_index('index'),
+ modified)
+
+ def test_105(self):
+ # Data obtained from:
+ # http://go.worldbank.org/ZXY29PVJ21
+ dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta')
+ df = pd.read_stata(dpath)
+ df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]]
+ df0 = pd.DataFrame(df0)
+ df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"]
+ df0['clustnum'] = df0["clustnum"].astype(np.int16)
+ df0['pri_schl'] = df0["pri_schl"].astype(np.int8)
+ df0['psch_num'] = df0["psch_num"].astype(np.int8)
+ df0['psch_dis'] = df0["psch_dis"].astype(np.float32)
+ tm.assert_frame_equal(df.head(3), df0)
+
+ def test_value_labels_old_format(self):
+ # GH 19417
+ #
+ # Test that value_labels() returns an empty dict if the file format
+ # predates supporting value labels.
+ dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta')
+ reader = StataReader(dpath)
+ assert reader.value_labels() == {}
+ reader.close()
+
+ def test_date_export_formats(self):
+ columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty']
+ conversions = {c: c for c in columns}
+ data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns)
+ original = DataFrame([data], columns=columns)
+ original.index.name = 'index'
+ expected_values = [datetime(2006, 11, 20, 23, 13, 20), # Time
+ datetime(2006, 11, 20), # Day
+ datetime(2006, 11, 19), # Week
+ datetime(2006, 11, 1), # Month
+ datetime(2006, 10, 1), # Quarter year
+ datetime(2006, 7, 1), # Half year
+ datetime(2006, 1, 1)] # Year
+
+ expected = DataFrame([expected_values], columns=columns)
+ expected.index.name = 'index'
+ with tm.ensure_clean() as path:
+ original.to_stata(path, conversions)
+ written_and_read_again = self.read_dta(path)
+ tm.assert_frame_equal(written_and_read_again.set_index('index'),
+ expected)
+
+ def test_write_missing_strings(self):
+ original = DataFrame([["1"], [None]], columns=["foo"])
+ expected = DataFrame([["1"], [""]], columns=["foo"])
+ expected.index.name = 'index'
+ with tm.ensure_clean() as path:
+ original.to_stata(path)
+ written_and_read_again = self.read_dta(path)
+ tm.assert_frame_equal(written_and_read_again.set_index('index'),
+ expected)
+
+ @pytest.mark.parametrize('version', [114, 117])
+ @pytest.mark.parametrize('byteorder', ['>', '<'])
+ def test_bool_uint(self, byteorder, version):
+ s0 = Series([0, 1, True], dtype=np.bool)
+ s1 = Series([0, 1, 100], dtype=np.uint8)
+ s2 = Series([0, 1, 255], dtype=np.uint8)
+ s3 = Series([0, 1, 2 ** 15 - 100], dtype=np.uint16)
+ s4 = Series([0, 1, 2 ** 16 - 1], dtype=np.uint16)
+ s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32)
+ s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32)
+
+ original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3,
+ 's4': s4, 's5': s5, 's6': s6})
+ original.index.name = 'index'
+ expected = original.copy()
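+        # Stata has no unsigned integer types: each column is expected in
+        # the smallest signed type whose valid range holds its values,
+        # falling back to float64 when even int32 cannot (s6).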
+ expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32,
+ np.int32, np.float64)
+ for c, t in zip(expected.columns, expected_types):
+ expected[c] = expected[c].astype(t)
+
+ with tm.ensure_clean() as path:
+ original.to_stata(path, byteorder=byteorder, version=version)
+ written_and_read_again = self.read_dta(path)
+ written_and_read_again = written_and_read_again.set_index('index')
+ tm.assert_frame_equal(written_and_read_again, expected)
+
+ def test_variable_labels(self):
+ with StataReader(self.dta16_115) as rdr:
+ sr_115 = rdr.variable_labels()
+ with StataReader(self.dta16_117) as rdr:
+ sr_117 = rdr.variable_labels()
+ keys = ('var1', 'var2', 'var3')
+ labels = ('label1', 'label2', 'label3')
+ for k, v in compat.iteritems(sr_115):
+ assert k in sr_117
+ assert v == sr_117[k]
+ assert k in keys
+ assert v in labels
+
+ def test_minimal_size_col(self):
+ str_lens = (1, 100, 244)
+ s = {}
+ for str_len in str_lens:
+ s['s' + str(str_len)] = Series(['a' * str_len,
+ 'b' * str_len, 'c' * str_len])
+ original = DataFrame(s)
+ with tm.ensure_clean() as path:
+ original.to_stata(path, write_index=False)
+
+ with StataReader(path) as sr:
+ typlist = sr.typlist
+ variables = sr.varlist
+ formats = sr.fmtlist
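+                # For fixed-width string columns the Stata type code is
+                # the string width and the display format is '%<width>s',
+                # so both should match the width encoded in each name.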
+ for variable, fmt, typ in zip(variables, formats, typlist):
+ assert int(variable[1:]) == int(fmt[1:-1])
+ assert int(variable[1:]) == typ
+
+ def test_excessively_long_string(self):
+ str_lens = (1, 244, 500)
+ s = {}
+ for str_len in str_lens:
+ s['s' + str(str_len)] = Series(['a' * str_len,
+ 'b' * str_len, 'c' * str_len])
+ original = DataFrame(s)
+ msg = (r"Fixed width strings in Stata \.dta files are limited to 244"
+ r" \(or fewer\)\ncharacters\. Column 's500' does not satisfy"
+ r" this restriction\. Use the\n'version=117' parameter to write"
+ r" the newer \(Stata 13 and later\) format\.")
+ with pytest.raises(ValueError, match=msg):
+ with tm.ensure_clean() as path:
+ original.to_stata(path)
+
+ def test_missing_value_generator(self):
+ types = ('b', 'h', 'l')
+ df = DataFrame([[0.0]], columns=['float_'])
+ with tm.ensure_clean() as path:
+ df.to_stata(path)
+ with StataReader(path) as rdr:
+ valid_range = rdr.VALID_RANGE
+ expected_values = ['.' + chr(97 + i) for i in range(26)]
+ expected_values.insert(0, '.')
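+        # For each integer type, the first value above the valid maximum
+        # is the system missing value '.', followed by the 26 extended
+        # missing values '.a' through '.z'.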
+ for t in types:
+ offset = valid_range[t][1]
+ for i in range(0, 27):
+ val = StataMissingValue(offset + 1 + i)
+ assert val.string == expected_values[i]
+
+ # Test extremes for floats
+ val = StataMissingValue(struct.unpack('<f', b'\x00\x00\x00\x7f')[0])
+ assert val.string == '.'
+ val = StataMissingValue(struct.unpack('<f', b'\x00\xd0\x00\x7f')[0])
+ assert val.string == '.z'
+
+ # Test extremes for floats
+ val = StataMissingValue(struct.unpack(
+ '<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
+ assert val.string == '.'
+ val = StataMissingValue(struct.unpack(
+ '<d', b'\x00\x00\x00\x00\x00\x1a\xe0\x7f')[0])
+ assert val.string == '.z'
+
+ @pytest.mark.parametrize(
+ 'file', ['dta17_113', 'dta17_115', 'dta17_117'])
+ def test_missing_value_conversion(self, file):
+ columns = ['int8_', 'int16_', 'int32_', 'float32_', 'float64_']
+ smv = StataMissingValue(101)
+ keys = [key for key in iterkeys(smv.MISSING_VALUES)]
+ keys.sort()
+ data = []
+ for i in range(27):
+ row = [StataMissingValue(keys[i + (j * 27)]) for j in range(5)]
+ data.append(row)
+ expected = DataFrame(data, columns=columns)
+
+ parsed = read_stata(getattr(self, file), convert_missing=True)
+ tm.assert_frame_equal(parsed, expected)
+
+ def test_big_dates(self):
+ yr = [1960, 2000, 9999, 100, 2262, 1677]
+ mo = [1, 1, 12, 1, 4, 9]
+ dd = [1, 1, 31, 1, 22, 23]
+ hr = [0, 0, 23, 0, 0, 0]
+ mm = [0, 0, 59, 0, 0, 0]
+ ss = [0, 0, 59, 0, 0, 0]
+ expected = []
+ for i in range(len(yr)):
+ row = []
+ for j in range(7):
+ if j == 0:
+ row.append(
+ datetime(yr[i], mo[i], dd[i], hr[i], mm[i], ss[i]))
+ elif j == 6:
+ row.append(datetime(yr[i], 1, 1))
+ else:
+ row.append(datetime(yr[i], mo[i], dd[i]))
+ expected.append(row)
+ expected.append([pd.NaT] * 7)
+ columns = ['date_tc', 'date_td', 'date_tw', 'date_tm', 'date_tq',
+ 'date_th', 'date_ty']
+
+        # Adjust expectations for the weekly, monthly, quarterly,
+        # half-yearly and yearly columns: values are normalized to the
+        # start of their period, which shifts a few of the rows above.
+ expected[2][2] = datetime(9999, 12, 24)
+ expected[2][3] = datetime(9999, 12, 1)
+ expected[2][4] = datetime(9999, 10, 1)
+ expected[2][5] = datetime(9999, 7, 1)
+ expected[4][2] = datetime(2262, 4, 16)
+ expected[4][3] = expected[4][4] = datetime(2262, 4, 1)
+ expected[4][5] = expected[4][6] = datetime(2262, 1, 1)
+ expected[5][2] = expected[5][3] = expected[
+ 5][4] = datetime(1677, 10, 1)
+ expected[5][5] = expected[5][6] = datetime(1678, 1, 1)
+
+ expected = DataFrame(expected, columns=columns, dtype=np.object)
+ parsed_115 = read_stata(self.dta18_115)
+ parsed_117 = read_stata(self.dta18_117)
+ tm.assert_frame_equal(expected, parsed_115,
+ check_datetimelike_compat=True)
+ tm.assert_frame_equal(expected, parsed_117,
+ check_datetimelike_compat=True)
+
+ date_conversion = {c: c[-2:] for c in columns}
+ with tm.ensure_clean() as path:
+ expected.index.name = 'index'
+ expected.to_stata(path, date_conversion)
+ written_and_read_again = self.read_dta(path)
+ tm.assert_frame_equal(written_and_read_again.set_index('index'),
+ expected,
+ check_datetimelike_compat=True)
+
+ def test_dtype_conversion(self):
+ expected = self.read_csv(self.csv15)
+ expected['byte_'] = expected['byte_'].astype(np.int8)
+ expected['int_'] = expected['int_'].astype(np.int16)
+ expected['long_'] = expected['long_'].astype(np.int32)
+ expected['float_'] = expected['float_'].astype(np.float32)
+ expected['double_'] = expected['double_'].astype(np.float64)
+ expected['date_td'] = expected['date_td'].apply(datetime.strptime,
+ args=('%Y-%m-%d',))
+
+ no_conversion = read_stata(self.dta15_117,
+ convert_dates=True)
+ tm.assert_frame_equal(expected, no_conversion)
+
+ conversion = read_stata(self.dta15_117,
+ convert_dates=True,
+ preserve_dtypes=False)
+
+ # read_csv types are the same
+ expected = self.read_csv(self.csv15)
+ expected['date_td'] = expected['date_td'].apply(datetime.strptime,
+ args=('%Y-%m-%d',))
+
+ tm.assert_frame_equal(expected, conversion)
+
+ def test_drop_column(self):
+ expected = self.read_csv(self.csv15)
+ expected['byte_'] = expected['byte_'].astype(np.int8)
+ expected['int_'] = expected['int_'].astype(np.int16)
+ expected['long_'] = expected['long_'].astype(np.int32)
+ expected['float_'] = expected['float_'].astype(np.float32)
+ expected['double_'] = expected['double_'].astype(np.float64)
+ expected['date_td'] = expected['date_td'].apply(datetime.strptime,
+ args=('%Y-%m-%d',))
+
+ columns = ['byte_', 'int_', 'long_']
+ expected = expected[columns]
+ dropped = read_stata(self.dta15_117, convert_dates=True,
+ columns=columns)
+
+ tm.assert_frame_equal(expected, dropped)
+
+ # See PR 10757
+ columns = ['int_', 'long_', 'byte_']
+ expected = expected[columns]
+ reordered = read_stata(self.dta15_117, convert_dates=True,
+ columns=columns)
+ tm.assert_frame_equal(expected, reordered)
+
+ msg = "columns contains duplicate entries"
+ with pytest.raises(ValueError, match=msg):
+ columns = ['byte_', 'byte_']
+ read_stata(self.dta15_117, convert_dates=True, columns=columns)
+
+ msg = ("The following columns were not found in the Stata data set:"
+ " not_found")
+ with pytest.raises(ValueError, match=msg):
+ columns = ['byte_', 'int_', 'long_', 'not_found']
+ read_stata(self.dta15_117, convert_dates=True, columns=columns)
+
+ @pytest.mark.parametrize('version', [114, 117])
+ @pytest.mark.filterwarnings(
+ "ignore:\\nStata value:pandas.io.stata.ValueLabelTypeMismatch"
+ )
+ def test_categorical_writing(self, version):
+ original = DataFrame.from_records(
+ [
+ ["one", "ten", "one", "one", "one", 1],
+ ["two", "nine", "two", "two", "two", 2],
+ ["three", "eight", "three", "three", "three", 3],
+ ["four", "seven", 4, "four", "four", 4],
+ ["five", "six", 5, np.nan, "five", 5],
+ ["six", "five", 6, np.nan, "six", 6],
+ ["seven", "four", 7, np.nan, "seven", 7],
+ ["eight", "three", 8, np.nan, "eight", 8],
+ ["nine", "two", 9, np.nan, "nine", 9],
+ ["ten", "one", "ten", np.nan, "ten", 10]
+ ],
+ columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled',
+ 'labeled_with_missings', 'float_labelled', 'unlabeled'])
+ expected = original.copy()
+
+ # these are all categoricals
+ original = pd.concat([original[col].astype('category')
+ for col in original], axis=1)
+
+ expected['incompletely_labeled'] = expected[
+ 'incompletely_labeled'].apply(str)
+ expected['unlabeled'] = expected['unlabeled'].apply(str)
+ expected = pd.concat([expected[col].astype('category')
+ for col in expected], axis=1)
+ expected.index.name = 'index'
+
+ with tm.ensure_clean() as path:
+ original.to_stata(path, version=version)
+ written_and_read_again = self.read_dta(path)
+ res = written_and_read_again.set_index('index')
+ tm.assert_frame_equal(res, expected, check_categorical=False)
+
+ def test_categorical_warnings_and_errors(self):
+ # Warning for non-string labels
+ # Error for labels too long
+ original = pd.DataFrame.from_records(
+ [['a' * 10000],
+ ['b' * 10000],
+ ['c' * 10000],
+ ['d' * 10000]],
+ columns=['Too_long'])
+
+ original = pd.concat([original[col].astype('category')
+ for col in original], axis=1)
+ with tm.ensure_clean() as path:
+ msg = ("Stata value labels for a single variable must have"
+ r" a combined length less than 32,000 characters\.")
+ with pytest.raises(ValueError, match=msg):
+ original.to_stata(path)
+
+ original = pd.DataFrame.from_records(
+ [['a'],
+ ['b'],
+ ['c'],
+ ['d'],
+ [1]],
+ columns=['Too_long'])
+ original = pd.concat([original[col].astype('category')
+ for col in original], axis=1)
+
+ with tm.assert_produces_warning(pd.io.stata.ValueLabelTypeMismatch):
+ original.to_stata(path)
+ # should get a warning for mixed content
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_categorical_with_stata_missing_values(self, version):
+ values = [['a' + str(i)] for i in range(120)]
+ values.append([np.nan])
+ original = pd.DataFrame.from_records(values, columns=['many_labels'])
+ original = pd.concat([original[col].astype('category')
+ for col in original], axis=1)
+ original.index.name = 'index'
+ with tm.ensure_clean() as path:
+ original.to_stata(path, version=version)
+ written_and_read_again = self.read_dta(path)
+ res = written_and_read_again.set_index('index')
+ tm.assert_frame_equal(res, original, check_categorical=False)
+
+ @pytest.mark.parametrize(
+ 'file', ['dta19_115', 'dta19_117'])
+ def test_categorical_order(self, file):
+        # Directly construct the expected frame; each tuple below is
+        # (is_cat, col_name, labels in order, underlying data).
+        expected = [
+            (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
+            (True, 'reverse', ['a', 'b', 'c', 'd', 'e'],
+             np.arange(5)[::-1]),
+            (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
+             np.array([2, 1, 4, 0, 3])),
+            (True, 'floating', ['a', 'b', 'c', 'd', 'e'],
+             np.arange(0, 5)),
+            (True, 'float_missing', ['a', 'd', 'e'],
+             np.array([0, 1, 2, -1, -1])),
+            (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
+            (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))]
+ cols = []
+ for is_cat, col, labels, codes in expected:
+ if is_cat:
+ cols.append((col, pd.Categorical.from_codes(codes, labels)))
+ else:
+ cols.append((col, pd.Series(labels, dtype=np.float32)))
+ expected = DataFrame.from_dict(OrderedDict(cols))
+
+        # Read with and without categoricals; ensure order is identical
+ file = getattr(self, file)
+ parsed = read_stata(file)
+ tm.assert_frame_equal(expected, parsed, check_categorical=False)
+
+ # Check identity of codes
+ for col in expected:
+ if is_categorical_dtype(expected[col]):
+ tm.assert_series_equal(expected[col].cat.codes,
+ parsed[col].cat.codes)
+ tm.assert_index_equal(expected[col].cat.categories,
+ parsed[col].cat.categories)
+
+ @pytest.mark.parametrize(
+ 'file', ['dta20_115', 'dta20_117'])
+ def test_categorical_sorting(self, file):
+ parsed = read_stata(getattr(self, file))
+
+ # Sort based on codes, not strings
+ parsed = parsed.sort_values("srh", na_position='first')
+
+ # Don't sort index
+ parsed.index = np.arange(parsed.shape[0])
+ codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4]
+ categories = ["Poor", "Fair", "Good", "Very good", "Excellent"]
+ cat = pd.Categorical.from_codes(codes=codes, categories=categories)
+ expected = pd.Series(cat, name='srh')
+ tm.assert_series_equal(expected, parsed["srh"],
+ check_categorical=False)
+
+ @pytest.mark.parametrize(
+ 'file', ['dta19_115', 'dta19_117'])
+ def test_categorical_ordering(self, file):
+ file = getattr(self, file)
+ parsed = read_stata(file)
+
+ parsed_unordered = read_stata(file,
+ order_categoricals=False)
+ for col in parsed:
+ if not is_categorical_dtype(parsed[col]):
+ continue
+ assert parsed[col].cat.ordered
+ assert not parsed_unordered[col].cat.ordered
+
+ @pytest.mark.parametrize(
+ 'file', ['dta1_117', 'dta2_117', 'dta3_117',
+ 'dta4_117', 'dta14_117', 'dta15_117',
+ 'dta16_117', 'dta17_117', 'dta18_117',
+ 'dta19_117', 'dta20_117'])
+ @pytest.mark.parametrize(
+ 'chunksize', [1, 2])
+ @pytest.mark.parametrize(
+ 'convert_categoricals', [False, True])
+ @pytest.mark.parametrize(
+ 'convert_dates', [False, True])
+ def test_read_chunks_117(self, file, chunksize,
+ convert_categoricals, convert_dates):
+ fname = getattr(self, file)
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+ parsed = read_stata(
+ fname,
+ convert_categoricals=convert_categoricals,
+ convert_dates=convert_dates)
+ itr = read_stata(
+ fname, iterator=True,
+ convert_categoricals=convert_categoricals,
+ convert_dates=convert_dates)
+
+ pos = 0
+ for j in range(5):
+ with warnings.catch_warnings(record=True) as w: # noqa
+ warnings.simplefilter("always")
+ try:
+ chunk = itr.read(chunksize)
+ except StopIteration:
+ break
+ from_frame = parsed.iloc[pos:pos + chunksize, :]
+ tm.assert_frame_equal(
+ from_frame, chunk, check_dtype=False,
+ check_datetimelike_compat=True,
+ check_categorical=False)
+
+ pos += chunksize
+ itr.close()
+
+ def test_iterator(self):
+
+ fname = self.dta3_117
+
+ parsed = read_stata(fname)
+
+ with read_stata(fname, iterator=True) as itr:
+ chunk = itr.read(5)
+ tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
+
+ with read_stata(fname, chunksize=5) as itr:
+ chunk = list(itr)
+ tm.assert_frame_equal(parsed.iloc[0:5, :], chunk[0])
+
+ with read_stata(fname, iterator=True) as itr:
+ chunk = itr.get_chunk(5)
+ tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
+
+ with read_stata(fname, chunksize=5) as itr:
+ chunk = itr.get_chunk()
+ tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
+
+ # GH12153
+ with read_stata(fname, chunksize=4) as itr:
+ from_chunks = pd.concat(itr)
+ tm.assert_frame_equal(parsed, from_chunks)
+
+ @pytest.mark.parametrize(
+ 'file', ['dta2_115', 'dta3_115', 'dta4_115',
+ 'dta14_115', 'dta15_115', 'dta16_115',
+ 'dta17_115', 'dta18_115', 'dta19_115',
+ 'dta20_115'])
+ @pytest.mark.parametrize(
+ 'chunksize', [1, 2])
+ @pytest.mark.parametrize(
+ 'convert_categoricals', [False, True])
+ @pytest.mark.parametrize(
+ 'convert_dates', [False, True])
+ def test_read_chunks_115(self, file, chunksize,
+ convert_categoricals, convert_dates):
+ fname = getattr(self, file)
+
+ # Read the whole file
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter("always")
+ parsed = read_stata(
+ fname,
+ convert_categoricals=convert_categoricals,
+ convert_dates=convert_dates)
+
+ # Compare to what we get when reading by chunk
+ itr = read_stata(
+ fname, iterator=True,
+ convert_dates=convert_dates,
+ convert_categoricals=convert_categoricals)
+ pos = 0
+ for j in range(5):
+ with warnings.catch_warnings(record=True) as w: # noqa
+ warnings.simplefilter("always")
+ try:
+ chunk = itr.read(chunksize)
+ except StopIteration:
+ break
+ from_frame = parsed.iloc[pos:pos + chunksize, :]
+ tm.assert_frame_equal(
+ from_frame, chunk, check_dtype=False,
+ check_datetimelike_compat=True,
+ check_categorical=False)
+
+ pos += chunksize
+ itr.close()
+
+ def test_read_chunks_columns(self):
+ fname = self.dta3_117
+ columns = ['quarter', 'cpi', 'm1']
+ chunksize = 2
+
+ parsed = read_stata(fname, columns=columns)
+ with read_stata(fname, iterator=True) as itr:
+ pos = 0
+ for j in range(5):
+ chunk = itr.read(chunksize, columns=columns)
+ if chunk is None:
+ break
+ from_frame = parsed.iloc[pos:pos + chunksize, :]
+ tm.assert_frame_equal(from_frame, chunk, check_dtype=False)
+ pos += chunksize
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_write_variable_labels(self, version):
+ # GH 13631, add support for writing variable labels
+ original = pd.DataFrame({'a': [1, 2, 3, 4],
+ 'b': [1.0, 3.0, 27.0, 81.0],
+ 'c': ['Atlanta', 'Birmingham',
+ 'Cincinnati', 'Detroit']})
+ original.index.name = 'index'
+ variable_labels = {'a': 'City Rank', 'b': 'City Exponent', 'c': 'City'}
+ with tm.ensure_clean() as path:
+ original.to_stata(path,
+ variable_labels=variable_labels,
+ version=version)
+ with StataReader(path) as sr:
+ read_labels = sr.variable_labels()
+ expected_labels = {'index': '',
+ 'a': 'City Rank',
+ 'b': 'City Exponent',
+ 'c': 'City'}
+ assert read_labels == expected_labels
+
+ variable_labels['index'] = 'The Index'
+ with tm.ensure_clean() as path:
+ original.to_stata(path,
+ variable_labels=variable_labels,
+ version=version)
+ with StataReader(path) as sr:
+ read_labels = sr.variable_labels()
+ assert read_labels == variable_labels
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_invalid_variable_labels(self, version):
+ original = pd.DataFrame({'a': [1, 2, 3, 4],
+ 'b': [1.0, 3.0, 27.0, 81.0],
+ 'c': ['Atlanta', 'Birmingham',
+ 'Cincinnati', 'Detroit']})
+ original.index.name = 'index'
+ variable_labels = {'a': 'very long' * 10,
+ 'b': 'City Exponent',
+ 'c': 'City'}
+ with tm.ensure_clean() as path:
+ msg = "Variable labels must be 80 characters or fewer"
+ with pytest.raises(ValueError, match=msg):
+ original.to_stata(path,
+ variable_labels=variable_labels,
+ version=version)
+
+ variable_labels['a'] = u'invalid character Œ'
+ with tm.ensure_clean() as path:
+ msg = ("Variable labels must contain only characters that can be"
+ " encoded in Latin-1")
+ with pytest.raises(ValueError, match=msg):
+ original.to_stata(path,
+ variable_labels=variable_labels,
+ version=version)
+
+ def test_write_variable_label_errors(self):
+ original = pd.DataFrame({'a': [1, 2, 3, 4],
+ 'b': [1.0, 3.0, 27.0, 81.0],
+ 'c': ['Atlanta', 'Birmingham',
+ 'Cincinnati', 'Detroit']})
+ values = [u'\u03A1', u'\u0391',
+ u'\u039D', u'\u0394',
+ u'\u0391', u'\u03A3']
+
+ variable_labels_utf8 = {'a': 'City Rank',
+ 'b': 'City Exponent',
+ 'c': u''.join(values)}
+
+ msg = ("Variable labels must contain only characters that can be"
+ " encoded in Latin-1")
+ with pytest.raises(ValueError, match=msg):
+ with tm.ensure_clean() as path:
+ original.to_stata(path, variable_labels=variable_labels_utf8)
+
+ variable_labels_long = {'a': 'City Rank',
+ 'b': 'City Exponent',
+ 'c': 'A very, very, very long variable label '
+ 'that is too long for Stata which means '
+ 'that it has more than 80 characters'}
+
+ msg = "Variable labels must be 80 characters or fewer"
+ with pytest.raises(ValueError, match=msg):
+ with tm.ensure_clean() as path:
+ original.to_stata(path, variable_labels=variable_labels_long)
+
+ def test_default_date_conversion(self):
+ # GH 12259
+ dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
+ dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
+ dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
+ original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
+ 'strs': ['apple', 'banana', 'cherry'],
+ 'dates': dates})
+
+ with tm.ensure_clean() as path:
+ original.to_stata(path, write_index=False)
+ reread = read_stata(path, convert_dates=True)
+ tm.assert_frame_equal(original, reread)
+
+ original.to_stata(path,
+ write_index=False,
+ convert_dates={'dates': 'tc'})
+ direct = read_stata(path, convert_dates=True)
+ tm.assert_frame_equal(reread, direct)
+
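+            # convert_dates keys may be given by integer position as well
+            # as by column name; both spellings should round-trip the same.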
+ dates_idx = original.columns.tolist().index('dates')
+ original.to_stata(path,
+ write_index=False,
+ convert_dates={dates_idx: 'tc'})
+ direct = read_stata(path, convert_dates=True)
+ tm.assert_frame_equal(reread, direct)
+
+ def test_unsupported_type(self):
+ original = pd.DataFrame({'a': [1 + 2j, 2 + 4j]})
+
+ msg = "Data type complex128 not supported"
+ with pytest.raises(NotImplementedError, match=msg):
+ with tm.ensure_clean() as path:
+ original.to_stata(path)
+
+ def test_unsupported_datetype(self):
+ dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
+ dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
+ dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
+ original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
+ 'strs': ['apple', 'banana', 'cherry'],
+ 'dates': dates})
+
+ msg = "Format %tC not implemented"
+ with pytest.raises(NotImplementedError, match=msg):
+ with tm.ensure_clean() as path:
+ original.to_stata(path, convert_dates={'dates': 'tC'})
+
+ dates = pd.date_range('1-1-1990', periods=3, tz='Asia/Hong_Kong')
+ original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
+ 'strs': ['apple', 'banana', 'cherry'],
+ 'dates': dates})
+ with pytest.raises(NotImplementedError):
+ with tm.ensure_clean() as path:
+ original.to_stata(path)
+
+ def test_repeated_column_labels(self):
+ # GH 13923
+ msg = (r"Value labels for column ethnicsn are not unique\. The"
+ r" repeated labels are:\n\n-+wolof")
+ with pytest.raises(ValueError, match=msg):
+ read_stata(self.dta23, convert_categoricals=True)
+
+ def test_stata_111(self):
+ # 111 is an old version but still used by current versions of
+ # SAS when exporting to Stata format. We do not know of any
+ # on-line documentation for this version.
+ df = read_stata(self.dta24_111)
+ original = pd.DataFrame({'y': [1, 1, 1, 1, 1, 0, 0, np.NaN, 0, 0],
+ 'x': [1, 2, 1, 3, np.NaN, 4, 3, 5, 1, 6],
+ 'w': [2, np.NaN, 5, 2, 4, 4, 3, 1, 2, 3],
+ 'z': ['a', 'b', 'c', 'd', 'e', '', 'g', 'h',
+ 'i', 'j']})
+ original = original[['y', 'x', 'w', 'z']]
+ tm.assert_frame_equal(original, df)
+
+ def test_out_of_range_double(self):
+ # GH 14618
+ df = DataFrame({'ColumnOk': [0.0,
+ np.finfo(np.double).eps,
+ 4.49423283715579e+307],
+ 'ColumnTooBig': [0.0,
+ np.finfo(np.double).eps,
+ np.finfo(np.double).max]})
+ msg = (r"Column ColumnTooBig has a maximum value \(.+\)"
+ r" outside the range supported by Stata \(.+\)")
+ with pytest.raises(ValueError, match=msg):
+ with tm.ensure_clean() as path:
+ df.to_stata(path)
+
+ df.loc[2, 'ColumnTooBig'] = np.inf
+ msg = ("Column ColumnTooBig has a maximum value of infinity which"
+ " is outside the range supported by Stata")
+ with pytest.raises(ValueError, match=msg):
+ with tm.ensure_clean() as path:
+ df.to_stata(path)
+
+ def test_out_of_range_float(self):
+ original = DataFrame({'ColumnOk': [0.0,
+ np.finfo(np.float32).eps,
+ np.finfo(np.float32).max / 10.0],
+ 'ColumnTooBig': [0.0,
+ np.finfo(np.float32).eps,
+ np.finfo(np.float32).max]})
+ original.index.name = 'index'
+ for col in original:
+ original[col] = original[col].astype(np.float32)
+
+ with tm.ensure_clean() as path:
+ original.to_stata(path)
+ reread = read_stata(path)
+ original['ColumnTooBig'] = original['ColumnTooBig'].astype(
+ np.float64)
+ tm.assert_frame_equal(original,
+ reread.set_index('index'))
+
+ original.loc[2, 'ColumnTooBig'] = np.inf
+ msg = ("Column ColumnTooBig has a maximum value of infinity which"
+ " is outside the range supported by Stata")
+ with pytest.raises(ValueError, match=msg):
+ with tm.ensure_clean() as path:
+ original.to_stata(path)
+
+ def test_path_pathlib(self):
+ df = tm.makeDataFrame()
+ df.index.name = 'index'
+ reader = lambda x: read_stata(x).set_index('index')
+ result = tm.round_trip_pathlib(df.to_stata, reader)
+ tm.assert_frame_equal(df, result)
+
+ def test_pickle_path_localpath(self):
+ df = tm.makeDataFrame()
+ df.index.name = 'index'
+ reader = lambda x: read_stata(x).set_index('index')
+ result = tm.round_trip_localpath(df.to_stata, reader)
+ tm.assert_frame_equal(df, result)
+
+ @pytest.mark.parametrize(
+ 'write_index', [True, False])
+ def test_value_labels_iterator(self, write_index):
+ # GH 16923
+ d = {'A': ['B', 'E', 'C', 'A', 'E']}
+ df = pd.DataFrame(data=d)
+ df['A'] = df['A'].astype('category')
+ with tm.ensure_clean() as path:
+ df.to_stata(path, write_index=write_index)
+
+ with pd.read_stata(path, iterator=True) as dta_iter:
+ value_labels = dta_iter.value_labels()
+ assert value_labels == {'A': {0: 'A', 1: 'B', 2: 'C', 3: 'E'}}
+
+ def test_set_index(self):
+ # GH 17328
+ df = tm.makeDataFrame()
+ df.index.name = 'index'
+ with tm.ensure_clean() as path:
+ df.to_stata(path)
+ reread = pd.read_stata(path, index_col='index')
+ tm.assert_frame_equal(df, reread)
+
+ @pytest.mark.parametrize(
+ 'column', ['ms', 'day', 'week', 'month', 'qtr', 'half', 'yr'])
+ def test_date_parsing_ignores_format_details(self, column):
+ # GH 17797
+ #
+        # Test that display formats are ignored when determining whether
+        # a numeric column is a date value.
+        #
+        # All date types are stored as numbers, and the format associated
+        # with a column denotes both the date type and the display format.
+        #
+        # Stata supports 9 date types, each with distinct units. We test
+        # 7 of the 9, ignoring %tC and %tb: %tC is a variant of %tc that
+        # accounts for leap seconds, and %tb relies on Stata's business
+        # calendar.
+ df = read_stata(self.stata_dates)
+ unformatted = df.loc[0, column]
+ formatted = df.loc[0, column + "_fmt"]
+ assert unformatted == formatted
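+
+    def _date_type_demo(self):
+        # Illustrative sketch, not a collected test: the date *type*
+        # passed via convert_dates ('td' = daily) governs how the stored
+        # number is read back; any display format is purely cosmetic.
+        df = DataFrame({'day': [datetime(2006, 11, 20)]})
+        with tm.ensure_clean() as path:
+            df.to_stata(path, convert_dates={'day': 'td'},
+                        write_index=False)
+            reread = read_stata(path)
+        assert reread.loc[0, 'day'] == datetime(2006, 11, 20)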
+
+ def test_writer_117(self):
+ original = DataFrame(data=[['string', 'object', 1, 1, 1, 1.1, 1.1,
+ np.datetime64('2003-12-25'),
+ 'a', 'a' * 2045, 'a' * 5000, 'a'],
+ ['string-1', 'object-1', 1, 1, 1, 1.1, 1.1,
+ np.datetime64('2003-12-26'),
+ 'b', 'b' * 2045, '', '']
+ ],
+ columns=['string', 'object', 'int8', 'int16',
+ 'int32', 'float32', 'float64',
+ 'datetime',
+ 's1', 's2045', 'srtl', 'forced_strl'])
+ original['object'] = Series(original['object'], dtype=object)
+ original['int8'] = Series(original['int8'], dtype=np.int8)
+ original['int16'] = Series(original['int16'], dtype=np.int16)
+ original['int32'] = original['int32'].astype(np.int32)
+ original['float32'] = Series(original['float32'], dtype=np.float32)
+ original.index.name = 'index'
+ original.index = original.index.astype(np.int32)
+ copy = original.copy()
+ with tm.ensure_clean() as path:
+ original.to_stata(path,
+ convert_dates={'datetime': 'tc'},
+ convert_strl=['forced_strl'],
+ version=117)
+ written_and_read_again = self.read_dta(path)
+ # original.index is np.int32, read index is np.int64
+ tm.assert_frame_equal(written_and_read_again.set_index('index'),
+ original, check_index_type=False)
+ tm.assert_frame_equal(original, copy)
+
+ def test_convert_strl_name_swap(self):
+ original = DataFrame([['a' * 3000, 'A', 'apple'],
+ ['b' * 1000, 'B', 'banana']],
+ columns=['long1' * 10, 'long', 1])
+ original.index.name = 'index'
+
+ with tm.assert_produces_warning(pd.io.stata.InvalidColumnName):
+ with tm.ensure_clean() as path:
+ original.to_stata(path, convert_strl=['long', 1], version=117)
+ reread = self.read_dta(path)
+ reread = reread.set_index('index')
+ reread.columns = original.columns
+ tm.assert_frame_equal(reread, original,
+ check_index_type=False)
+
+ def test_invalid_date_conversion(self):
+ # GH 12259
+ dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
+ dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
+ dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
+ original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
+ 'strs': ['apple', 'banana', 'cherry'],
+ 'dates': dates})
+
+ with tm.ensure_clean() as path:
+ msg = "convert_dates key must be a column or an integer"
+ with pytest.raises(ValueError, match=msg):
+ original.to_stata(path,
+ convert_dates={'wrong_name': 'tc'})
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_nonfile_writing(self, version):
+ # GH 21041
+ bio = io.BytesIO()
+ df = tm.makeDataFrame()
+ df.index.name = 'index'
+ with tm.ensure_clean() as path:
+ df.to_stata(bio, version=version)
+ bio.seek(0)
+ with open(path, 'wb') as dta:
+ dta.write(bio.read())
+ reread = pd.read_stata(path, index_col='index')
+ tm.assert_frame_equal(df, reread)
+
+ def test_gzip_writing(self):
+ # writing version 117 requires seek and cannot be used with gzip
+ df = tm.makeDataFrame()
+ df.index.name = 'index'
+ with tm.ensure_clean() as path:
+ with gzip.GzipFile(path, 'wb') as gz:
+ df.to_stata(gz, version=114)
+ with gzip.GzipFile(path, 'rb') as gz:
+ reread = pd.read_stata(gz, index_col='index')
+ tm.assert_frame_equal(df, reread)
+
+ def test_unicode_dta_118(self):
+ unicode_df = self.read_dta(self.dta25_118)
+
+ columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
+ values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
+ [u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
+ [u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
+ [' ', ' ', 'd', ' ', 'd'],
+ [' ', '', 'a', ' ', 'a'],
+ ['', '', 's', '', 's'],
+ ['', '', ' ', '', ' ']]
+ expected = pd.DataFrame(values, columns=columns)
+
+ tm.assert_frame_equal(unicode_df, expected)
+
+ def test_mixed_string_strl(self):
+ # GH 23633
+ output = [
+ {'mixed': 'string' * 500,
+ 'number': 0},
+ {'mixed': None,
+ 'number': 1}
+ ]
+ output = pd.DataFrame(output)
+ output.number = output.number.astype('int32')
+
+ with tm.ensure_clean() as path:
+ output.to_stata(path, write_index=False, version=117)
+ reread = read_stata(path)
+ expected = output.fillna('')
+ tm.assert_frame_equal(reread, expected)
+
+ # Check strl supports all None (null)
+ output.loc[:, 'mixed'] = None
+ output.to_stata(path, write_index=False, convert_strl=['mixed'],
+ version=117)
+ reread = read_stata(path)
+ expected = output.fillna('')
+ tm.assert_frame_equal(reread, expected)
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_all_none_exception(self, version):
+ output = [
+ {'none': 'none',
+ 'number': 0},
+ {'none': None,
+ 'number': 1}
+ ]
+ output = pd.DataFrame(output)
+ output.loc[:, 'none'] = None
+ with tm.ensure_clean() as path:
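+            # Note: 'typescannot' (no space) appears to mirror the
+            # concatenated error message produced by the Stata writer.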
+ msg = (r"Column `none` cannot be exported\.\n\n"
+ "Only string-like object arrays containing all strings or a"
+ r" mix of strings and None can be exported\. Object arrays"
+ r" containing only null values are prohibited\. Other"
+ " object typescannot be exported and must first be"
+ r" converted to one of the supported types\.")
+ with pytest.raises(ValueError, match=msg):
+ output.to_stata(path, version=version)
+
+ @pytest.mark.parametrize('version', [114, 117])
+ def test_invalid_file_not_written(self, version):
+ content = 'Here is one __�__ Another one __·__ Another one __½__'
+ df = DataFrame([content], columns=['invalid'])
+ expected_exc = UnicodeEncodeError if PY3 else UnicodeDecodeError
+ with tm.ensure_clean() as path:
+ msg1 = (r"'latin-1' codec can't encode character '\\ufffd'"
+ r" in position 14: ordinal not in range\(256\)")
+ msg2 = ("'ascii' codec can't decode byte 0xef in position 14:"
+ r" ordinal not in range\(128\)")
+ with pytest.raises(expected_exc, match=r'{}|{}'.format(
+ msg1, msg2)):
+ with tm.assert_produces_warning(ResourceWarning):
+ df.to_stata(path)
+
+ def test_strl_latin1(self):
+ # GH 23573, correct GSO data to reflect correct size
+ output = DataFrame([[u'pandas'] * 2, [u'þâÑÐŧ'] * 2],
+ columns=['var_str', 'var_strl'])
+
+ with tm.ensure_clean() as path:
+ output.to_stata(path, version=117, convert_strl=['var_strl'])
+ with open(path, 'rb') as reread:
+ content = reread.read()
+ expected = u'þâÑÐŧ'
+ assert expected.encode('latin-1') in content
+ assert expected.encode('utf-8') in content
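+            # Each GSO record appears to be laid out as 'GSO', the v/o
+            # identifiers, a type byte (0x82 for a null-terminated
+            # string whose stored length counts the trailing null), a
+            # little-endian length, then the payload; hence the
+            # size == len(val) + 1 relationship checked below.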
+ gsos = content.split(b'strls')[1][1:-2]
+ for gso in gsos.split(b'GSO')[1:]:
+ val = gso.split(b'\x00')[-2]
+ size = gso[gso.find(b'\x82') + 1]
+ if not PY3:
+ size = ord(size)
+ assert len(val) == size - 1
diff --git a/contrib/python/pandas/py2/pandas/tests/plotting/__init__.py b/contrib/python/pandas/py2/pandas/tests/plotting/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/plotting/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/plotting/common.py b/contrib/python/pandas/py2/pandas/tests/plotting/common.py
new file mode 100644
index 00000000000..4ca916a0aa4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/plotting/common.py
@@ -0,0 +1,544 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import os
+import warnings
+
+import numpy as np
+from numpy import random
+import pytest
+
+from pandas.compat import iteritems, zip
+from pandas.util._decorators import cache_readonly
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.api import is_list_like
+
+from pandas import DataFrame, Series
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_is_valid_plot_return_object, ensure_clean)
+
+import pandas.plotting as plotting
+from pandas.plotting._tools import _flatten
+
+
+"""
+This is a common base class used for various plotting tests
+"""
+
+
+def _skip_if_no_scipy_gaussian_kde():
+ try:
+ from scipy.stats import gaussian_kde # noqa
+ except ImportError:
+ pytest.skip("scipy version doesn't support gaussian_kde")
+
+
+def _ok_for_gaussian_kde(kind):
+ if kind in ['kde', 'density']:
+ try:
+ from scipy.stats import gaussian_kde # noqa
+ except ImportError:
+ return False
+
+ return True
+
+
+class TestPlotBase(object):
+
+ def setup_method(self, method):
+
+ import matplotlib as mpl
+ mpl.rcdefaults()
+
+ self.mpl_ge_2_0_1 = plotting._compat._mpl_ge_2_0_1()
+ self.mpl_ge_2_1_0 = plotting._compat._mpl_ge_2_1_0()
+ self.mpl_ge_2_2_0 = plotting._compat._mpl_ge_2_2_0()
+ self.mpl_ge_2_2_2 = plotting._compat._mpl_ge_2_2_2()
+ self.mpl_ge_3_0_0 = plotting._compat._mpl_ge_3_0_0()
+
+ self.bp_n_objects = 7
+ self.polycollection_factor = 2
+ self.default_figsize = (6.4, 4.8)
+ self.default_tick_position = 'left'
+
+ n = 100
+ with tm.RNGContext(42):
+ gender = np.random.choice(['Male', 'Female'], size=n)
+ classroom = np.random.choice(['A', 'B', 'C'], size=n)
+
+ self.hist_df = DataFrame({'gender': gender,
+ 'classroom': classroom,
+ 'height': random.normal(66, 4, size=n),
+ 'weight': random.normal(161, 32, size=n),
+ 'category': random.randint(4, size=n)})
+
+ self.tdf = tm.makeTimeDataFrame()
+ self.hexbin_df = DataFrame({"A": np.random.uniform(size=20),
+ "B": np.random.uniform(size=20),
+ "C": np.arange(20) + np.random.uniform(
+ size=20)})
+
+ def teardown_method(self, method):
+ tm.close()
+
+ @cache_readonly
+ def plt(self):
+ import matplotlib.pyplot as plt
+ return plt
+
+ @cache_readonly
+ def colorconverter(self):
+ import matplotlib.colors as colors
+ return colors.colorConverter
+
+ def _check_legend_labels(self, axes, labels=None, visible=True):
+ """
+ Check each axes has expected legend labels
+
+ Parameters
+ ----------
+ axes : matplotlib Axes object, or its list-like
+ labels : list-like
+ expected legend labels
+ visible : bool
+ expected legend visibility. labels are checked only when visible is
+ True
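+
+        Examples
+        --------
+        Illustrative sketch (``df`` stands for any DataFrame plotted
+        with a legend)::
+
+            ax = df.plot(legend=True)
+            self._check_legend_labels(ax, labels=df.columns)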
+ """
+
+ if visible and (labels is None):
+ raise ValueError('labels must be specified when visible is True')
+ axes = self._flatten_visible(axes)
+ for ax in axes:
+ if visible:
+ assert ax.get_legend() is not None
+ self._check_text_labels(ax.get_legend().get_texts(), labels)
+ else:
+ assert ax.get_legend() is None
+
+ def _check_data(self, xp, rs):
+ """
+ Check each axes has identical lines
+
+ Parameters
+ ----------
+ xp : matplotlib Axes object
+ rs : matplotlib Axes object
+ """
+ xp_lines = xp.get_lines()
+ rs_lines = rs.get_lines()
+
+ def check_line(xpl, rsl):
+ xpdata = xpl.get_xydata()
+ rsdata = rsl.get_xydata()
+ tm.assert_almost_equal(xpdata, rsdata)
+
+ assert len(xp_lines) == len(rs_lines)
+ [check_line(xpl, rsl) for xpl, rsl in zip(xp_lines, rs_lines)]
+ tm.close()
+
+ def _check_visible(self, collections, visible=True):
+ """
+ Check each artist is visible or not
+
+ Parameters
+ ----------
+ collections : matplotlib Artist or its list-like
+ target Artist or its list or collection
+ visible : bool
+ expected visibility
+ """
+ from matplotlib.collections import Collection
+ if not isinstance(collections,
+ Collection) and not is_list_like(collections):
+ collections = [collections]
+
+ for patch in collections:
+ assert patch.get_visible() == visible
+
+ def _get_colors_mapped(self, series, colors):
+ unique = series.unique()
+        # the lengths of unique and colors can differ
+        # depending on the slice value
+ mapped = dict(zip(unique, colors))
+ return [mapped[v] for v in series.values]
+
+ def _check_colors(self, collections, linecolors=None, facecolors=None,
+ mapping=None):
+ """
+ Check each artist has expected line colors and face colors
+
+ Parameters
+ ----------
+ collections : list-like
+ list or collection of target artist
+ linecolors : list-like which has the same length as collections
+ list of expected line colors
+ facecolors : list-like which has the same length as collections
+ list of expected face colors
+ mapping : Series
+ Series used for color grouping key
+            used for andrews_curves, parallel_coordinates, radviz tests
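+
+        Examples
+        --------
+        Illustrative sketch (``df`` has three columns; colors are passed
+        explicitly so the expectation is known)::
+
+            ax = df.plot(color=['r', 'g', 'b'])
+            self._check_colors(ax.get_lines(), linecolors=['r', 'g', 'b'])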
+ """
+
+ from matplotlib.lines import Line2D
+ from matplotlib.collections import (
+ Collection, PolyCollection, LineCollection
+ )
+ conv = self.colorconverter
+ if linecolors is not None:
+
+ if mapping is not None:
+ linecolors = self._get_colors_mapped(mapping, linecolors)
+ linecolors = linecolors[:len(collections)]
+
+ assert len(collections) == len(linecolors)
+ for patch, color in zip(collections, linecolors):
+ if isinstance(patch, Line2D):
+ result = patch.get_color()
+                    # Line2D may contain a string color expression
+ result = conv.to_rgba(result)
+ elif isinstance(patch, (PolyCollection, LineCollection)):
+ result = tuple(patch.get_edgecolor()[0])
+ else:
+ result = patch.get_edgecolor()
+
+ expected = conv.to_rgba(color)
+ assert result == expected
+
+ if facecolors is not None:
+
+ if mapping is not None:
+ facecolors = self._get_colors_mapped(mapping, facecolors)
+ facecolors = facecolors[:len(collections)]
+
+ assert len(collections) == len(facecolors)
+ for patch, color in zip(collections, facecolors):
+ if isinstance(patch, Collection):
+ # returned as list of np.array
+ result = patch.get_facecolor()[0]
+ else:
+ result = patch.get_facecolor()
+
+ if isinstance(result, np.ndarray):
+ result = tuple(result)
+
+ expected = conv.to_rgba(color)
+ assert result == expected
+
+ def _check_text_labels(self, texts, expected):
+ """
+ Check each text has expected labels
+
+ Parameters
+ ----------
+ texts : matplotlib Text object, or its list-like
+ target text, or its list
+ expected : str or list-like which has the same length as texts
+ expected text label, or its list
+ """
+ if not is_list_like(texts):
+ assert texts.get_text() == expected
+ else:
+ labels = [t.get_text() for t in texts]
+ assert len(labels) == len(expected)
+ for label, e in zip(labels, expected):
+ assert label == e
+
+ def _check_ticks_props(self, axes, xlabelsize=None, xrot=None,
+ ylabelsize=None, yrot=None):
+ """
+ Check each axes has expected tick properties
+
+ Parameters
+ ----------
+ axes : matplotlib Axes object, or its list-like
+ xlabelsize : number
+ expected xticks font size
+ xrot : number
+ expected xticks rotation
+ ylabelsize : number
+ expected yticks font size
+ yrot : number
+ expected yticks rotation
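+
+        Examples
+        --------
+        Illustrative sketch::
+
+            ax = df.plot(fontsize=12, rot=30)
+            self._check_ticks_props(ax, xlabelsize=12, xrot=30,
+                                    ylabelsize=12)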
+ """
+ from matplotlib.ticker import NullFormatter
+ axes = self._flatten_visible(axes)
+ for ax in axes:
+ if xlabelsize or xrot:
+ if isinstance(ax.xaxis.get_minor_formatter(), NullFormatter):
+                    # If minor ticks have NullFormatter, rot / fontsize
+                    # are not retained
+ labels = ax.get_xticklabels()
+ else:
+ labels = ax.get_xticklabels() + ax.get_xticklabels(
+ minor=True)
+
+ for label in labels:
+ if xlabelsize is not None:
+ tm.assert_almost_equal(label.get_fontsize(),
+ xlabelsize)
+ if xrot is not None:
+ tm.assert_almost_equal(label.get_rotation(), xrot)
+
+ if ylabelsize or yrot:
+ if isinstance(ax.yaxis.get_minor_formatter(), NullFormatter):
+ labels = ax.get_yticklabels()
+ else:
+ labels = ax.get_yticklabels() + ax.get_yticklabels(
+ minor=True)
+
+ for label in labels:
+ if ylabelsize is not None:
+ tm.assert_almost_equal(label.get_fontsize(),
+ ylabelsize)
+ if yrot is not None:
+ tm.assert_almost_equal(label.get_rotation(), yrot)
+
+ def _check_ax_scales(self, axes, xaxis='linear', yaxis='linear'):
+ """
+ Check each axes has expected scales
+
+ Parameters
+ ----------
+ axes : matplotlib Axes object, or its list-like
+ xaxis : {'linear', 'log'}
+ expected xaxis scale
+ yaxis : {'linear', 'log'}
+ expected yaxis scale
+ """
+ axes = self._flatten_visible(axes)
+ for ax in axes:
+ assert ax.xaxis.get_scale() == xaxis
+ assert ax.yaxis.get_scale() == yaxis
+
+ def _check_axes_shape(self, axes, axes_num=None, layout=None,
+ figsize=None):
+ """
+ Check expected number of axes is drawn in expected layout
+
+ Parameters
+ ----------
+ axes : matplotlib Axes object, or its list-like
+ axes_num : number
+ expected number of axes. Unnecessary axes should be set to
+ invisible.
+ layout : tuple
+            expected layout, (expected number of rows, columns)
+ figsize : tuple
+            expected figsize; defaults to the matplotlib default
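+
+        Examples
+        --------
+        Illustrative sketch (``df`` with four numeric columns)::
+
+            axes = df.plot(subplots=True, layout=(2, 2))
+            self._check_axes_shape(axes, axes_num=4, layout=(2, 2))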
+ """
+ if figsize is None:
+ figsize = self.default_figsize
+ visible_axes = self._flatten_visible(axes)
+
+ if axes_num is not None:
+ assert len(visible_axes) == axes_num
+ for ax in visible_axes:
+ # check something drawn on visible axes
+ assert len(ax.get_children()) > 0
+
+ if layout is not None:
+ result = self._get_axes_layout(_flatten(axes))
+ assert result == layout
+
+ tm.assert_numpy_array_equal(
+ visible_axes[0].figure.get_size_inches(),
+ np.array(figsize, dtype=np.float64))
+
+ def _get_axes_layout(self, axes):
+ x_set = set()
+ y_set = set()
+ for ax in axes:
+ # check axes coordinates to estimate layout
+ points = ax.get_position().get_points()
+ x_set.add(points[0][0])
+ y_set.add(points[0][1])
+ return (len(y_set), len(x_set))
+
+ def _flatten_visible(self, axes):
+ """
+        Flatten axes and keep only the visible ones
+
+ Parameters
+ ----------
+ axes : matplotlib Axes object, or its list-like
+
+ """
+ axes = _flatten(axes)
+ axes = [ax for ax in axes if ax.get_visible()]
+ return axes
+
+ def _check_has_errorbars(self, axes, xerr=0, yerr=0):
+ """
+ Check axes has expected number of errorbars
+
+ Parameters
+ ----------
+ axes : matplotlib Axes object, or its list-like
+ xerr : number
+ expected number of x errorbar
+ yerr : number
+ expected number of y errorbar
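+
+        Examples
+        --------
+        Illustrative sketch (``df_err`` is an error frame shaped like
+        ``df``)::
+
+            ax = df.plot(yerr=df_err)
+            self._check_has_errorbars(ax, xerr=0, yerr=df.shape[1])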
+ """
+ axes = self._flatten_visible(axes)
+ for ax in axes:
+ containers = ax.containers
+ xerr_count = 0
+ yerr_count = 0
+ for c in containers:
+ has_xerr = getattr(c, 'has_xerr', False)
+ has_yerr = getattr(c, 'has_yerr', False)
+ if has_xerr:
+ xerr_count += 1
+ if has_yerr:
+ yerr_count += 1
+ assert xerr == xerr_count
+ assert yerr == yerr_count
+
+ def _check_box_return_type(self, returned, return_type, expected_keys=None,
+ check_ax_title=True):
+ """
+ Check box returned type is correct
+
+ Parameters
+ ----------
+ returned : object to be tested, returned from boxplot
+ return_type : str
+ return_type passed to boxplot
+ expected_keys : list-like, optional
+ group labels in subplot case. If not passed,
+ the function checks assuming boxplot uses single ax
+ check_ax_title : bool
+ Whether to check the ax.title is the same as expected_key
+ Intended to be checked by calling from ``boxplot``.
+            Normal ``plot`` doesn't attach ``ax.title``, so it must be
+            disabled there.
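+
+        Examples
+        --------
+        Illustrative sketch::
+
+            returned = df.boxplot(return_type='both')
+            self._check_box_return_type(returned, 'both')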
+ """
+ from matplotlib.axes import Axes
+ types = {'dict': dict, 'axes': Axes, 'both': tuple}
+ if expected_keys is None:
+ # should be fixed when the returning default is changed
+ if return_type is None:
+ return_type = 'dict'
+
+ assert isinstance(returned, types[return_type])
+ if return_type == 'both':
+ assert isinstance(returned.ax, Axes)
+ assert isinstance(returned.lines, dict)
+ else:
+ # should be fixed when the returning default is changed
+ if return_type is None:
+ for r in self._flatten_visible(returned):
+ assert isinstance(r, Axes)
+ return
+
+ assert isinstance(returned, Series)
+
+ assert sorted(returned.keys()) == sorted(expected_keys)
+ for key, value in iteritems(returned):
+ assert isinstance(value, types[return_type])
+ # check returned dict has correct mapping
+ if return_type == 'axes':
+ if check_ax_title:
+ assert value.get_title() == key
+ elif return_type == 'both':
+ if check_ax_title:
+ assert value.ax.get_title() == key
+ assert isinstance(value.ax, Axes)
+ assert isinstance(value.lines, dict)
+ elif return_type == 'dict':
+ line = value['medians'][0]
+ axes = line.axes
+ if check_ax_title:
+ assert axes.get_title() == key
+ else:
+ raise AssertionError
+
+ def _check_grid_settings(self, obj, kinds, kws={}):
+ # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792
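+        # For each kind this draws four subplots that cross the rc default
+        # with an explicit ``grid=`` keyword: the keyword must win, and the
+        # rc setting must be honored when no keyword is given ('pie' never
+        # draws a grid, so its grid-on cases are skipped).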
+
+ import matplotlib as mpl
+
+ def is_grid_on():
+ xoff = all(not g.gridOn
+ for g in self.plt.gca().xaxis.get_major_ticks())
+ yoff = all(not g.gridOn
+ for g in self.plt.gca().yaxis.get_major_ticks())
+ return not (xoff and yoff)
+
+ spndx = 1
+ for kind in kinds:
+ if not _ok_for_gaussian_kde(kind):
+ continue
+
+ self.plt.subplot(1, 4 * len(kinds), spndx)
+ spndx += 1
+ mpl.rc('axes', grid=False)
+ obj.plot(kind=kind, **kws)
+ assert not is_grid_on()
+
+ self.plt.subplot(1, 4 * len(kinds), spndx)
+ spndx += 1
+ mpl.rc('axes', grid=True)
+ obj.plot(kind=kind, grid=False, **kws)
+ assert not is_grid_on()
+
+ if kind != 'pie':
+ self.plt.subplot(1, 4 * len(kinds), spndx)
+ spndx += 1
+ mpl.rc('axes', grid=True)
+ obj.plot(kind=kind, **kws)
+ assert is_grid_on()
+
+ self.plt.subplot(1, 4 * len(kinds), spndx)
+ spndx += 1
+ mpl.rc('axes', grid=False)
+ obj.plot(kind=kind, grid=True, **kws)
+ assert is_grid_on()
+
+ def _unpack_cycler(self, rcParams, field='color'):
+ """
+        Auxiliary function to correctly unpack the cycler on MPL >= 1.5
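+
+        Illustrative sketch::
+
+            colors = self._unpack_cycler(self.plt.rcParams)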
+ """
+ return [v[field] for v in rcParams['axes.prop_cycle']]
+
+
+def _check_plot_works(f, filterwarnings='always', **kwargs):
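+    """
+    Call plotting function ``f`` on a cleared figure (subplot 211 by
+    default), then again on an added subplot (212), validating any
+    successful return value; finally round-trip the figure through
+    ``savefig`` to catch draw-time errors.
+    """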
+ import matplotlib.pyplot as plt
+ ret = None
+ with warnings.catch_warnings():
+ warnings.simplefilter(filterwarnings)
+ try:
+ try:
+ fig = kwargs['figure']
+ except KeyError:
+ fig = plt.gcf()
+
+ plt.clf()
+
+ ax = kwargs.get('ax', fig.add_subplot(211)) # noqa
+ ret = f(**kwargs)
+
+ assert_is_valid_plot_return_object(ret)
+
+ try:
+ kwargs['ax'] = fig.add_subplot(212)
+ ret = f(**kwargs)
+ except Exception:
+ pass
+ else:
+ assert_is_valid_plot_return_object(ret)
+
+ with ensure_clean(return_filelike=True) as path:
+ plt.savefig(path)
+ finally:
+ tm.close(fig)
+
+ return ret
+
+
+def curpath():
+ pth, _ = os.path.split(os.path.abspath(__file__))
+ return pth
diff --git a/contrib/python/pandas/py2/pandas/tests/plotting/test_boxplot_method.py b/contrib/python/pandas/py2/pandas/tests/plotting/test_boxplot_method.py
new file mode 100644
index 00000000000..7d721c7de33
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/plotting/test_boxplot_method.py
@@ -0,0 +1,385 @@
+# coding: utf-8
+
+import itertools
+import string
+
+import numpy as np
+from numpy import random
+import pytest
+
+from pandas.compat import lzip, range
+import pandas.util._test_decorators as td
+
+from pandas import DataFrame, MultiIndex, Series
+from pandas.tests.plotting.common import TestPlotBase, _check_plot_works
+import pandas.util.testing as tm
+
+import pandas.plotting as plotting
+
+""" Test cases for .boxplot method """
+
+
+class TestDataFramePlots(TestPlotBase):
+
+ @pytest.mark.slow
+ def test_boxplot_legacy1(self):
+ df = DataFrame(np.random.randn(6, 4),
+ index=list(string.ascii_letters[:6]),
+ columns=['one', 'two', 'three', 'four'])
+ df['indic'] = ['foo', 'bar'] * 3
+ df['indic2'] = ['foo', 'bar', 'foo'] * 2
+
+ _check_plot_works(df.boxplot, return_type='dict')
+ _check_plot_works(df.boxplot, column=[
+ 'one', 'two'], return_type='dict')
+ # _check_plot_works adds an ax so catch warning. see GH #13188
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(df.boxplot, column=['one', 'two'],
+ by='indic')
+ _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2'])
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(df.boxplot, by='indic')
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(df.boxplot, by=['indic', 'indic2'])
+ _check_plot_works(plotting._core.boxplot, data=df['one'],
+ return_type='dict')
+ _check_plot_works(df.boxplot, notch=1, return_type='dict')
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(df.boxplot, by='indic', notch=1)
+
+ @pytest.mark.slow
+ def test_boxplot_legacy2(self):
+ df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
+ df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
+ df['Y'] = Series(['A'] * 10)
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(df.boxplot, by='X')
+
+ # When ax is supplied and required number of axes is 1,
+ # passed ax should be used:
+ fig, ax = self.plt.subplots()
+ axes = df.boxplot('Col1', by='X', ax=ax)
+ ax_axes = ax.axes
+ assert ax_axes is axes
+
+ fig, ax = self.plt.subplots()
+ axes = df.groupby('Y').boxplot(ax=ax, return_type='axes')
+ ax_axes = ax.axes
+ assert ax_axes is axes['A']
+
+ # Multiple columns with an ax argument should use same figure
+ fig, ax = self.plt.subplots()
+ with tm.assert_produces_warning(UserWarning):
+ axes = df.boxplot(column=['Col1', 'Col2'],
+ by='X', ax=ax, return_type='axes')
+ assert axes['Col1'].get_figure() is fig
+
+ # When by is None, check that all relevant lines are present in the
+ # dict
+ fig, ax = self.plt.subplots()
+ d = df.boxplot(ax=ax, return_type='dict')
+ lines = list(itertools.chain.from_iterable(d.values()))
+ assert len(ax.get_lines()) == len(lines)
+
+ @pytest.mark.slow
+ def test_boxplot_return_type_none(self):
+ # GH 12216; return_type=None & by=None -> axes
+ result = self.hist_df.boxplot()
+ assert isinstance(result, self.plt.Axes)
+
+ @pytest.mark.slow
+ def test_boxplot_return_type_legacy(self):
+ # API change in https://github.com/pandas-dev/pandas/pull/7096
+ import matplotlib as mpl # noqa
+
+ df = DataFrame(np.random.randn(6, 4),
+ index=list(string.ascii_letters[:6]),
+ columns=['one', 'two', 'three', 'four'])
+ with pytest.raises(ValueError):
+ df.boxplot(return_type='NOTATYPE')
+
+ result = df.boxplot()
+ self._check_box_return_type(result, 'axes')
+
+ with tm.assert_produces_warning(False):
+ result = df.boxplot(return_type='dict')
+ self._check_box_return_type(result, 'dict')
+
+ with tm.assert_produces_warning(False):
+ result = df.boxplot(return_type='axes')
+ self._check_box_return_type(result, 'axes')
+
+ with tm.assert_produces_warning(False):
+ result = df.boxplot(return_type='both')
+ self._check_box_return_type(result, 'both')
+
+ @pytest.mark.slow
+ def test_boxplot_axis_limits(self):
+
+ def _check_ax_limits(col, ax):
+ y_min, y_max = ax.get_ylim()
+ assert y_min <= col.min()
+ assert y_max >= col.max()
+
+ df = self.hist_df.copy()
+ df['age'] = np.random.randint(1, 20, df.shape[0])
+ # One full row
+ height_ax, weight_ax = df.boxplot(['height', 'weight'], by='category')
+ _check_ax_limits(df['height'], height_ax)
+ _check_ax_limits(df['weight'], weight_ax)
+ assert weight_ax._sharey == height_ax
+
+ # Two rows, one partial
+ p = df.boxplot(['height', 'weight', 'age'], by='category')
+ height_ax, weight_ax, age_ax = p[0, 0], p[0, 1], p[1, 0]
+ dummy_ax = p[1, 1]
+
+ _check_ax_limits(df['height'], height_ax)
+ _check_ax_limits(df['weight'], weight_ax)
+ _check_ax_limits(df['age'], age_ax)
+ assert weight_ax._sharey == height_ax
+ assert age_ax._sharey == height_ax
+ assert dummy_ax._sharey is None
+
+ @pytest.mark.slow
+ def test_boxplot_empty_column(self):
+ df = DataFrame(np.random.randn(20, 4))
+ df.loc[:, 0] = np.nan
+ _check_plot_works(df.boxplot, return_type='axes')
+
+ @pytest.mark.slow
+ def test_figsize(self):
+ df = DataFrame(np.random.rand(10, 5),
+ columns=['A', 'B', 'C', 'D', 'E'])
+ result = df.boxplot(return_type='axes', figsize=(12, 8))
+ assert result.figure.bbox_inches.width == 12
+ assert result.figure.bbox_inches.height == 8
+
+ def test_fontsize(self):
+ df = DataFrame({"a": [1, 2, 3, 4, 5, 6]})
+ self._check_ticks_props(df.boxplot("a", fontsize=16),
+ xlabelsize=16, ylabelsize=16)
+
+
+class TestDataFrameGroupByPlots(TestPlotBase):
+
+ @pytest.mark.slow
+ def test_boxplot_legacy1(self):
+ grouped = self.hist_df.groupby(by='gender')
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(grouped.boxplot, return_type='axes')
+ self._check_axes_shape(list(axes.values), axes_num=2, layout=(1, 2))
+ axes = _check_plot_works(grouped.boxplot, subplots=False,
+ return_type='axes')
+ self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+
+ @pytest.mark.slow
+ def test_boxplot_legacy2(self):
+ tuples = lzip(string.ascii_letters[:10], range(10))
+ df = DataFrame(np.random.rand(10, 3),
+ index=MultiIndex.from_tuples(tuples))
+ grouped = df.groupby(level=1)
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(grouped.boxplot, return_type='axes')
+ self._check_axes_shape(list(axes.values), axes_num=10, layout=(4, 3))
+
+ axes = _check_plot_works(grouped.boxplot, subplots=False,
+ return_type='axes')
+ self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+
+ @pytest.mark.slow
+ def test_boxplot_legacy3(self):
+ tuples = lzip(string.ascii_letters[:10], range(10))
+ df = DataFrame(np.random.rand(10, 3),
+ index=MultiIndex.from_tuples(tuples))
+ grouped = df.unstack(level=1).groupby(level=0, axis=1)
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(grouped.boxplot, return_type='axes')
+ self._check_axes_shape(list(axes.values), axes_num=3, layout=(2, 2))
+ axes = _check_plot_works(grouped.boxplot, subplots=False,
+ return_type='axes')
+ self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+
+ @pytest.mark.slow
+ def test_grouped_plot_fignums(self):
+ n = 10
+ weight = Series(np.random.normal(166, 20, size=n))
+ height = Series(np.random.normal(60, 10, size=n))
+ with tm.RNGContext(42):
+ gender = np.random.choice(['male', 'female'], size=n)
+ df = DataFrame({'height': height, 'weight': weight, 'gender': gender})
+ gb = df.groupby('gender')
+
+ res = gb.plot()
+ assert len(self.plt.get_fignums()) == 2
+ assert len(res) == 2
+ tm.close()
+
+ res = gb.boxplot(return_type='axes')
+ assert len(self.plt.get_fignums()) == 1
+ assert len(res) == 2
+ tm.close()
+
+ # now works with GH 5610 as gender is excluded
+ res = df.groupby('gender').hist()
+ tm.close()
+
+ @pytest.mark.slow
+ def test_grouped_box_return_type(self):
+ df = self.hist_df
+
+ # old style: return_type=None
+ result = df.boxplot(by='gender')
+ assert isinstance(result, np.ndarray)
+ self._check_box_return_type(
+ result, None,
+ expected_keys=['height', 'weight', 'category'])
+
+ # now for groupby
+ result = df.groupby('gender').boxplot(return_type='dict')
+ self._check_box_return_type(
+ result, 'dict', expected_keys=['Male', 'Female'])
+
+ columns2 = 'X B C D A G Y N Q O'.split()
+ df2 = DataFrame(random.randn(50, 10), columns=columns2)
+ categories2 = 'A B C D E F G H I J'.split()
+ df2['category'] = categories2 * 5
+
+ for t in ['dict', 'axes', 'both']:
+ returned = df.groupby('classroom').boxplot(return_type=t)
+ self._check_box_return_type(
+ returned, t, expected_keys=['A', 'B', 'C'])
+
+ returned = df.boxplot(by='classroom', return_type=t)
+ self._check_box_return_type(
+ returned, t,
+ expected_keys=['height', 'weight', 'category'])
+
+ returned = df2.groupby('category').boxplot(return_type=t)
+ self._check_box_return_type(returned, t, expected_keys=categories2)
+
+ returned = df2.boxplot(by='category', return_type=t)
+ self._check_box_return_type(returned, t, expected_keys=columns2)
+
+ @pytest.mark.slow
+ def test_grouped_box_layout(self):
+ df = self.hist_df
+
+ pytest.raises(ValueError, df.boxplot, column=['weight', 'height'],
+ by=df.gender, layout=(1, 1))
+ pytest.raises(ValueError, df.boxplot,
+ column=['height', 'weight', 'category'],
+ layout=(2, 1), return_type='dict')
+ pytest.raises(ValueError, df.boxplot, column=['weight', 'height'],
+ by=df.gender, layout=(-1, -1))
+
+ # _check_plot_works adds an ax so catch warning. see GH #13188
+ with tm.assert_produces_warning(UserWarning):
+ box = _check_plot_works(df.groupby('gender').boxplot,
+ column='height', return_type='dict')
+ self._check_axes_shape(self.plt.gcf().axes, axes_num=2, layout=(1, 2))
+
+ with tm.assert_produces_warning(UserWarning):
+ box = _check_plot_works(df.groupby('category').boxplot,
+ column='height',
+ return_type='dict')
+ self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2))
+
+ # GH 6769
+ with tm.assert_produces_warning(UserWarning):
+ box = _check_plot_works(df.groupby('classroom').boxplot,
+ column='height', return_type='dict')
+ self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2))
+
+ # GH 5897
+ axes = df.boxplot(column=['height', 'weight', 'category'], by='gender',
+ return_type='axes')
+ self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2))
+ for ax in [axes['height']]:
+ self._check_visible(ax.get_xticklabels(), visible=False)
+ self._check_visible([ax.xaxis.get_label()], visible=False)
+ for ax in [axes['weight'], axes['category']]:
+ self._check_visible(ax.get_xticklabels())
+ self._check_visible([ax.xaxis.get_label()])
+
+ box = df.groupby('classroom').boxplot(
+ column=['height', 'weight', 'category'], return_type='dict')
+ self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2))
+
+ with tm.assert_produces_warning(UserWarning):
+ box = _check_plot_works(df.groupby('category').boxplot,
+ column='height',
+ layout=(3, 2), return_type='dict')
+ self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2))
+ with tm.assert_produces_warning(UserWarning):
+ box = _check_plot_works(df.groupby('category').boxplot,
+ column='height',
+ layout=(3, -1), return_type='dict')
+ self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2))
+
+ box = df.boxplot(column=['height', 'weight', 'category'], by='gender',
+ layout=(4, 1))
+ self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(4, 1))
+
+ box = df.boxplot(column=['height', 'weight', 'category'], by='gender',
+ layout=(-1, 1))
+ self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(3, 1))
+
+ box = df.groupby('classroom').boxplot(
+ column=['height', 'weight', 'category'], layout=(1, 4),
+ return_type='dict')
+ self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 4))
+
+ box = df.groupby('classroom').boxplot( # noqa
+ column=['height', 'weight', 'category'], layout=(1, -1),
+ return_type='dict')
+ self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3))
+
+ @pytest.mark.slow
+ def test_grouped_box_multiple_axes(self):
+ # GH 6970, GH 7069
+ df = self.hist_df
+
+ # check warning to ignore sharex / sharey
+ # this check should be done in the first function which
+ # passes multiple axes to plot, hist or boxplot
+ # location should be changed if other test is added
+ # which has earlier alphabetical order
+ with tm.assert_produces_warning(UserWarning):
+ fig, axes = self.plt.subplots(2, 2)
+ df.groupby('category').boxplot(
+ column='height', return_type='axes', ax=axes)
+ self._check_axes_shape(self.plt.gcf().axes,
+ axes_num=4, layout=(2, 2))
+
+ fig, axes = self.plt.subplots(2, 3)
+ with tm.assert_produces_warning(UserWarning):
+ returned = df.boxplot(column=['height', 'weight', 'category'],
+ by='gender', return_type='axes', ax=axes[0])
+ returned = np.array(list(returned.values))
+ self._check_axes_shape(returned, axes_num=3, layout=(1, 3))
+ tm.assert_numpy_array_equal(returned, axes[0])
+ assert returned[0].figure is fig
+
+ # draw on second row
+ with tm.assert_produces_warning(UserWarning):
+ returned = df.groupby('classroom').boxplot(
+ column=['height', 'weight', 'category'],
+ return_type='axes', ax=axes[1])
+ returned = np.array(list(returned.values))
+ self._check_axes_shape(returned, axes_num=3, layout=(1, 3))
+ tm.assert_numpy_array_equal(returned, axes[1])
+ assert returned[0].figure is fig
+
+ with pytest.raises(ValueError):
+ fig, axes = self.plt.subplots(2, 3)
+ # pass different number of axes from required
+ with tm.assert_produces_warning(UserWarning):
+ axes = df.groupby('classroom').boxplot(ax=axes)
+
+ def test_fontsize(self):
+ df = DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]})
+ self._check_ticks_props(df.boxplot("a", by="b", fontsize=16),
+ xlabelsize=16, ylabelsize=16)
diff --git a/contrib/python/pandas/py2/pandas/tests/plotting/test_converter.py b/contrib/python/pandas/py2/pandas/tests/plotting/test_converter.py
new file mode 100644
index 00000000000..7dfc21562cc
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/plotting/test_converter.py
@@ -0,0 +1,346 @@
+from datetime import date, datetime
+import subprocess
+import sys
+
+import numpy as np
+import pytest
+
+from pandas.compat import u
+from pandas.compat.numpy import np_datetime64_compat
+
+from pandas import Index, Period, Series, Timestamp, date_range
+import pandas.core.config as cf
+import pandas.util.testing as tm
+
+from pandas.tseries.offsets import Day, Micro, Milli, Second
+
+converter = pytest.importorskip('pandas.plotting._converter')
+from pandas.plotting import (deregister_matplotlib_converters, # isort:skip
+ register_matplotlib_converters)
+
+
+def test_timtetonum_accepts_unicode():
+ assert (converter.time2num("00:01") == converter.time2num(u("00:01")))
+
+
+class TestRegistration(object):
+
+ def test_register_by_default(self):
+ # Run in subprocess to ensure a clean state
+ code = ("'import matplotlib.units; "
+ "import pandas as pd; "
+ "units = dict(matplotlib.units.registry); "
+ "assert pd.Timestamp in units)'")
+ call = [sys.executable, '-c', code]
+ assert subprocess.check_call(call) == 0
+
+ def test_warns(self):
+ plt = pytest.importorskip("matplotlib.pyplot")
+ s = Series(range(12), index=date_range('2017', periods=12))
+ _, ax = plt.subplots()
+
+ # Set to the "warning" state, in case this isn't the first test run
+ converter._WARN = True
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False) as w:
+ ax.plot(s.index, s.values)
+ plt.close()
+
+ assert len(w) == 1
+ assert "Using an implicitly registered datetime converter" in str(w[0])
+
+ def test_registering_no_warning(self):
+ plt = pytest.importorskip("matplotlib.pyplot")
+ s = Series(range(12), index=date_range('2017', periods=12))
+ _, ax = plt.subplots()
+
+ # Set to the "warn" state, in case this isn't the first test run
+ converter._WARN = True
+ register_matplotlib_converters()
+ with tm.assert_produces_warning(None) as w:
+ ax.plot(s.index, s.values)
+
+ assert len(w) == 0
+
+ def test_pandas_plots_register(self):
+ pytest.importorskip("matplotlib.pyplot")
+ s = Series(range(12), index=date_range('2017', periods=12))
+ # Set to the "warn" state, in case this isn't the first test run
+ converter._WARN = True
+ with tm.assert_produces_warning(None) as w:
+ s.plot()
+
+ assert len(w) == 0
+
+ def test_matplotlib_formatters(self):
+ units = pytest.importorskip("matplotlib.units")
+ assert Timestamp in units.registry
+
+ ctx = cf.option_context("plotting.matplotlib.register_converters",
+ False)
+ with ctx:
+ assert Timestamp not in units.registry
+
+ assert Timestamp in units.registry
+
+ def test_option_no_warning(self):
+ pytest.importorskip("matplotlib.pyplot")
+ ctx = cf.option_context("plotting.matplotlib.register_converters",
+ False)
+ plt = pytest.importorskip("matplotlib.pyplot")
+ s = Series(range(12), index=date_range('2017', periods=12))
+ _, ax = plt.subplots()
+
+ converter._WARN = True
+ # Test without registering first, no warning
+ with ctx:
+ with tm.assert_produces_warning(None) as w:
+ ax.plot(s.index, s.values)
+
+ assert len(w) == 0
+
+ # Now test with registering
+ converter._WARN = True
+ register_matplotlib_converters()
+ with ctx:
+ with tm.assert_produces_warning(None) as w:
+ ax.plot(s.index, s.values)
+
+ assert len(w) == 0
+
+ def test_registry_resets(self):
+ units = pytest.importorskip("matplotlib.units")
+ dates = pytest.importorskip("matplotlib.dates")
+
+ # make a copy, to reset to
+ original = dict(units.registry)
+
+ try:
+ # get to a known state
+ units.registry.clear()
+ date_converter = dates.DateConverter()
+ units.registry[datetime] = date_converter
+ units.registry[date] = date_converter
+
+ register_matplotlib_converters()
+ assert units.registry[date] is not date_converter
+ deregister_matplotlib_converters()
+ assert units.registry[date] is date_converter
+
+ finally:
+            # restore original state
+ units.registry.clear()
+ for k, v in original.items():
+ units.registry[k] = v
+
+ def test_old_import_warns(self):
+ with tm.assert_produces_warning(FutureWarning) as w:
+ from pandas.tseries import converter
+ converter.register()
+
+ assert len(w)
+ assert ('pandas.plotting.register_matplotlib_converters' in
+ str(w[0].message))
+
+
+class TestDateTimeConverter(object):
+
+ def setup_method(self, method):
+ self.dtc = converter.DatetimeConverter()
+ self.tc = converter.TimeFormatter(None)
+
+ def test_convert_accepts_unicode(self):
+ r1 = self.dtc.convert("12:22", None, None)
+ r2 = self.dtc.convert(u("12:22"), None, None)
+ assert (r1 == r2), "DatetimeConverter.convert should accept unicode"
+
+ def test_conversion(self):
+ rs = self.dtc.convert(['2012-1-1'], None, None)[0]
+ xp = datetime(2012, 1, 1).toordinal()
+ assert rs == xp
+
+ rs = self.dtc.convert('2012-1-1', None, None)
+ assert rs == xp
+
+ rs = self.dtc.convert(date(2012, 1, 1), None, None)
+ assert rs == xp
+
+ rs = self.dtc.convert(datetime(2012, 1, 1).toordinal(), None, None)
+ assert rs == xp
+
+ rs = self.dtc.convert('2012-1-1', None, None)
+ assert rs == xp
+
+ rs = self.dtc.convert(Timestamp('2012-1-1'), None, None)
+ assert rs == xp
+
+ # also testing datetime64 dtype (GH8614)
+ rs = self.dtc.convert(np_datetime64_compat('2012-01-01'), None, None)
+ assert rs == xp
+
+ rs = self.dtc.convert(np_datetime64_compat(
+ '2012-01-01 00:00:00+0000'), None, None)
+ assert rs == xp
+
+ rs = self.dtc.convert(np.array([
+ np_datetime64_compat('2012-01-01 00:00:00+0000'),
+ np_datetime64_compat('2012-01-02 00:00:00+0000')]), None, None)
+ assert rs[0] == xp
+
+ # we have a tz-aware date (constructed to that when we turn to utc it
+ # is the same as our sample)
+ ts = (Timestamp('2012-01-01')
+ .tz_localize('UTC')
+ .tz_convert('US/Eastern')
+ )
+ rs = self.dtc.convert(ts, None, None)
+ assert rs == xp
+
+ rs = self.dtc.convert(ts.to_pydatetime(), None, None)
+ assert rs == xp
+
+ rs = self.dtc.convert(Index([ts - Day(1), ts]), None, None)
+ assert rs[1] == xp
+
+ rs = self.dtc.convert(Index([ts - Day(1), ts]).to_pydatetime(),
+ None, None)
+ assert rs[1] == xp
+
+ def test_conversion_float(self):
+ decimals = 9
+
+ rs = self.dtc.convert(
+ Timestamp('2012-1-1 01:02:03', tz='UTC'), None, None)
+ xp = converter.dates.date2num(Timestamp('2012-1-1 01:02:03', tz='UTC'))
+ tm.assert_almost_equal(rs, xp, decimals)
+
+ rs = self.dtc.convert(
+ Timestamp('2012-1-1 09:02:03', tz='Asia/Hong_Kong'), None, None)
+ tm.assert_almost_equal(rs, xp, decimals)
+
+ rs = self.dtc.convert(datetime(2012, 1, 1, 1, 2, 3), None, None)
+ tm.assert_almost_equal(rs, xp, decimals)
+
+ def test_conversion_outofbounds_datetime(self):
+ # 2579
+ values = [date(1677, 1, 1), date(1677, 1, 2)]
+ rs = self.dtc.convert(values, None, None)
+ xp = converter.dates.date2num(values)
+ tm.assert_numpy_array_equal(rs, xp)
+ rs = self.dtc.convert(values[0], None, None)
+ xp = converter.dates.date2num(values[0])
+ assert rs == xp
+
+ values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)]
+ rs = self.dtc.convert(values, None, None)
+ xp = converter.dates.date2num(values)
+ tm.assert_numpy_array_equal(rs, xp)
+ rs = self.dtc.convert(values[0], None, None)
+ xp = converter.dates.date2num(values[0])
+ assert rs == xp
+
+ @pytest.mark.parametrize('time,format_expected', [
+ (0, '00:00'), # time2num(datetime.time.min)
+ (86399.999999, '23:59:59.999999'), # time2num(datetime.time.max)
+ (90000, '01:00'),
+ (3723, '01:02:03'),
+ (39723.2, '11:02:03.200')
+ ])
+ def test_time_formatter(self, time, format_expected):
+ # issue 18478
+ result = self.tc(time)
+ assert result == format_expected
+
+ def test_dateindex_conversion(self):
+ decimals = 9
+
+ for freq in ('B', 'L', 'S'):
+ dateindex = tm.makeDateIndex(k=10, freq=freq)
+ rs = self.dtc.convert(dateindex, None, None)
+ xp = converter.dates.date2num(dateindex._mpl_repr())
+ tm.assert_almost_equal(rs, xp, decimals)
+
+ def test_resolution(self):
+ def _assert_less(ts1, ts2):
+ val1 = self.dtc.convert(ts1, None, None)
+ val2 = self.dtc.convert(ts2, None, None)
+ if not val1 < val2:
+ raise AssertionError('{0} is not less than {1}.'.format(val1,
+ val2))
+
+ # Matplotlib's time representation using floats cannot distinguish
+ # intervals smaller than ~10 microsecond in the common range of years.
+ ts = Timestamp('2012-1-1')
+ _assert_less(ts, ts + Second())
+ _assert_less(ts, ts + Milli())
+ _assert_less(ts, ts + Micro(50))
+
+ def test_convert_nested(self):
+ inner = [Timestamp('2017-01-01'), Timestamp('2017-01-02')]
+ data = [inner, inner]
+ result = self.dtc.convert(data, None, None)
+ expected = [self.dtc.convert(x, None, None) for x in data]
+ assert (np.array(result) == expected).all()
+
+
+class TestPeriodConverter(object):
+
+ def setup_method(self, method):
+ self.pc = converter.PeriodConverter()
+
+ class Axis(object):
+ pass
+
+ self.axis = Axis()
+ self.axis.freq = 'D'
+
+ def test_convert_accepts_unicode(self):
+ r1 = self.pc.convert("2012-1-1", None, self.axis)
+ r2 = self.pc.convert(u("2012-1-1"), None, self.axis)
+ assert r1 == r2
+
+ def test_conversion(self):
+ rs = self.pc.convert(['2012-1-1'], None, self.axis)[0]
+ xp = Period('2012-1-1').ordinal
+ assert rs == xp
+
+ rs = self.pc.convert('2012-1-1', None, self.axis)
+ assert rs == xp
+
+ rs = self.pc.convert([date(2012, 1, 1)], None, self.axis)[0]
+ assert rs == xp
+
+ rs = self.pc.convert(date(2012, 1, 1), None, self.axis)
+ assert rs == xp
+
+ rs = self.pc.convert([Timestamp('2012-1-1')], None, self.axis)[0]
+ assert rs == xp
+
+ rs = self.pc.convert(Timestamp('2012-1-1'), None, self.axis)
+ assert rs == xp
+
+ rs = self.pc.convert(
+ np_datetime64_compat('2012-01-01'), None, self.axis)
+ assert rs == xp
+
+ rs = self.pc.convert(
+ np_datetime64_compat('2012-01-01 00:00:00+0000'), None, self.axis)
+ assert rs == xp
+
+ rs = self.pc.convert(np.array([
+ np_datetime64_compat('2012-01-01 00:00:00+0000'),
+ np_datetime64_compat('2012-01-02 00:00:00+0000')]),
+ None, self.axis)
+ assert rs[0] == xp
+
+ def test_integer_passthrough(self):
+ # GH9012
+ rs = self.pc.convert([0, 1], None, self.axis)
+ xp = [0, 1]
+ assert rs == xp
+
+ def test_convert_nested(self):
+ data = ['2012-1-1', '2012-1-2']
+ r1 = self.pc.convert([data, data], None, self.axis)
+ r2 = [self.pc.convert(data, None, self.axis) for _ in range(2)]
+ assert r1 == r2
diff --git a/contrib/python/pandas/py2/pandas/tests/plotting/test_datetimelike.py b/contrib/python/pandas/py2/pandas/tests/plotting/test_datetimelike.py
new file mode 100644
index 00000000000..ad79cc97f8b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/plotting/test_datetimelike.py
@@ -0,0 +1,1563 @@
+""" Test cases for time series specific (freq conversion, etc) """
+from datetime import date, datetime, time, timedelta
+import pickle
+import sys
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY3, lrange, zip
+import pandas.util._test_decorators as td
+
+from pandas import DataFrame, Index, NaT, Series, isna
+from pandas.core.indexes.datetimes import bdate_range, date_range
+from pandas.core.indexes.period import Period, PeriodIndex, period_range
+from pandas.core.indexes.timedeltas import timedelta_range
+from pandas.core.resample import DatetimeIndex
+from pandas.tests.plotting.common import (
+ TestPlotBase, _skip_if_no_scipy_gaussian_kde)
+import pandas.util.testing as tm
+from pandas.util.testing import assert_series_equal, ensure_clean
+
+from pandas.tseries.offsets import DateOffset
+
+
+class TestTSPlot(TestPlotBase):
+
+ def setup_method(self, method):
+ TestPlotBase.setup_method(self, method)
+
+ freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q', 'A']
+ idx = [period_range('12/31/1999', freq=x, periods=100) for x in freq]
+ self.period_ser = [Series(np.random.randn(len(x)), x) for x in idx]
+ self.period_df = [DataFrame(np.random.randn(len(x), 3), index=x,
+ columns=['A', 'B', 'C'])
+ for x in idx]
+
+ freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q-DEC', 'A', '1B30Min']
+ idx = [date_range('12/31/1999', freq=x, periods=100) for x in freq]
+ self.datetime_ser = [Series(np.random.randn(len(x)), x) for x in idx]
+ self.datetime_df = [DataFrame(np.random.randn(len(x), 3), index=x,
+ columns=['A', 'B', 'C'])
+ for x in idx]
+
+ def teardown_method(self, method):
+ tm.close()
+
+ @pytest.mark.slow
+ def test_ts_plot_with_tz(self):
+ # GH2877
+ index = date_range('1/1/2011', periods=2, freq='H',
+ tz='Europe/Brussels')
+ ts = Series([188.5, 328.25], index=index)
+ _check_plot_works(ts.plot)
+
+ def test_fontsize_set_correctly(self):
+ # For issue #8765
+ df = DataFrame(np.random.randn(10, 9), index=range(10))
+ fig, ax = self.plt.subplots()
+ df.plot(fontsize=2, ax=ax)
+ for label in (ax.get_xticklabels() + ax.get_yticklabels()):
+ assert label.get_fontsize() == 2
+
+ @pytest.mark.slow
+ def test_frame_inferred(self):
+ # inferred freq
+ idx = date_range('1/1/1987', freq='MS', periods=100)
+ idx = DatetimeIndex(idx.values, freq=None)
+
+ df = DataFrame(np.random.randn(len(idx), 3), index=idx)
+ _check_plot_works(df.plot)
+
+ # axes freq
+ idx = idx[0:40].union(idx[45:99])
+ df2 = DataFrame(np.random.randn(len(idx), 3), index=idx)
+ _check_plot_works(df2.plot)
+
+ # N > 1
+ idx = date_range('2008-1-1 00:15:00', freq='15T', periods=10)
+ idx = DatetimeIndex(idx.values, freq=None)
+ df = DataFrame(np.random.randn(len(idx), 3), index=idx)
+ _check_plot_works(df.plot)
+
+ def test_is_error_nozeroindex(self):
+ # GH11858
+ i = np.array([1, 2, 3])
+ a = DataFrame(i, index=i)
+ _check_plot_works(a.plot, xerr=a)
+ _check_plot_works(a.plot, yerr=a)
+
+ def test_nonnumeric_exclude(self):
+ idx = date_range('1/1/1987', freq='A', periods=3)
+ df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]}, idx)
+
+ fig, ax = self.plt.subplots()
+ df.plot(ax=ax) # it works
+ assert len(ax.get_lines()) == 1 # B was plotted
+ self.plt.close(fig)
+
+ pytest.raises(TypeError, df['A'].plot)
+
+ def test_tsplot_deprecated(self):
+ from pandas.tseries.plotting import tsplot
+
+ _, ax = self.plt.subplots()
+ ts = tm.makeTimeSeries()
+
+ with tm.assert_produces_warning(FutureWarning):
+ tsplot(ts, self.plt.Axes.plot, ax=ax)
+
+ @pytest.mark.slow
+ def test_tsplot(self):
+
+ from pandas.tseries.plotting import tsplot
+
+ _, ax = self.plt.subplots()
+ ts = tm.makeTimeSeries()
+
+ def f(*args, **kwds):
+ with tm.assert_produces_warning(FutureWarning):
+ return tsplot(s, self.plt.Axes.plot, *args, **kwds)
+
+ for s in self.period_ser:
+ _check_plot_works(f, s.index.freq, ax=ax, series=s)
+
+ for s in self.datetime_ser:
+ _check_plot_works(f, s.index.freq.rule_code, ax=ax, series=s)
+
+ for s in self.period_ser:
+ _check_plot_works(s.plot, ax=ax)
+
+ for s in self.datetime_ser:
+ _check_plot_works(s.plot, ax=ax)
+
+ _, ax = self.plt.subplots()
+ ts.plot(style='k', ax=ax)
+ color = (0., 0., 0., 1)
+ assert color == ax.get_lines()[0].get_color()
+
+ def test_both_style_and_color(self):
+
+ ts = tm.makeTimeSeries()
+ pytest.raises(ValueError, ts.plot, style='b-', color='#000099')
+
+ s = ts.reset_index(drop=True)
+ pytest.raises(ValueError, s.plot, style='b-', color='#000099')
+
+ @pytest.mark.slow
+ def test_high_freq(self):
+        freqs = ['ms', 'us']
+        for freq in freqs:
+ _, ax = self.plt.subplots()
+ rng = date_range('1/1/2012', periods=100, freq=freq)
+ ser = Series(np.random.randn(len(rng)), rng)
+ _check_plot_works(ser.plot, ax=ax)
+
+ def test_get_datevalue(self):
+ from pandas.plotting._converter import get_datevalue
+ assert get_datevalue(None, 'D') is None
+ assert get_datevalue(1987, 'A') == 1987
+ assert (get_datevalue(Period(1987, 'A'), 'M') ==
+ Period('1987-12', 'M').ordinal)
+ assert (get_datevalue('1/1/1987', 'D') ==
+ Period('1987-1-1', 'D').ordinal)
+
+ @pytest.mark.slow
+ def test_ts_plot_format_coord(self):
+ def check_format_of_first_point(ax, expected_string):
+ first_line = ax.get_lines()[0]
+ first_x = first_line.get_xdata()[0].ordinal
+ first_y = first_line.get_ydata()[0]
+ try:
+ assert expected_string == ax.format_coord(first_x, first_y)
+            except ValueError:
+ pytest.skip("skipping test because issue forming "
+ "test comparison GH7664")
+
+ annual = Series(1, index=date_range('2014-01-01', periods=3,
+ freq='A-DEC'))
+ _, ax = self.plt.subplots()
+ annual.plot(ax=ax)
+ check_format_of_first_point(ax, 't = 2014 y = 1.000000')
+
+ # note this is added to the annual plot already in existence, and
+ # changes its freq field
+ daily = Series(1, index=date_range('2014-01-01', periods=3, freq='D'))
+ daily.plot(ax=ax)
+ check_format_of_first_point(ax,
+ 't = 2014-01-01 y = 1.000000')
+ tm.close()
+
+ # tsplot
+ from pandas.tseries.plotting import tsplot
+ _, ax = self.plt.subplots()
+ with tm.assert_produces_warning(FutureWarning):
+ tsplot(annual, self.plt.Axes.plot, ax=ax)
+ check_format_of_first_point(ax, 't = 2014 y = 1.000000')
+ with tm.assert_produces_warning(FutureWarning):
+ tsplot(daily, self.plt.Axes.plot, ax=ax)
+ check_format_of_first_point(ax, 't = 2014-01-01 y = 1.000000')
+
+ @pytest.mark.slow
+ def test_line_plot_period_series(self):
+ for s in self.period_ser:
+ _check_plot_works(s.plot, s.index.freq)
+
+ @pytest.mark.slow
+ def test_line_plot_datetime_series(self):
+ for s in self.datetime_ser:
+ _check_plot_works(s.plot, s.index.freq.rule_code)
+
+ @pytest.mark.slow
+ def test_line_plot_period_frame(self):
+ for df in self.period_df:
+ _check_plot_works(df.plot, df.index.freq)
+
+ @pytest.mark.slow
+ def test_line_plot_datetime_frame(self):
+ for df in self.datetime_df:
+ freq = df.index.to_period(df.index.freq.rule_code).freq
+ _check_plot_works(df.plot, freq)
+
+ @pytest.mark.slow
+ def test_line_plot_inferred_freq(self):
+ for ser in self.datetime_ser:
+ ser = Series(ser.values, Index(np.asarray(ser.index)))
+ _check_plot_works(ser.plot, ser.index.inferred_freq)
+
+ ser = ser[[0, 3, 5, 6]]
+ _check_plot_works(ser.plot)
+
+ def test_fake_inferred_business(self):
+ _, ax = self.plt.subplots()
+ rng = date_range('2001-1-1', '2001-1-10')
+ ts = Series(lrange(len(rng)), rng)
+ ts = ts[:3].append(ts[5:])
+ ts.plot(ax=ax)
+ assert not hasattr(ax, 'freq')
+
+ @pytest.mark.slow
+ def test_plot_offset_freq(self):
+ ser = tm.makeTimeSeries()
+ _check_plot_works(ser.plot)
+
+ dr = date_range(ser.index[0], freq='BQS', periods=10)
+ ser = Series(np.random.randn(len(dr)), dr)
+ _check_plot_works(ser.plot)
+
+ @pytest.mark.slow
+ def test_plot_multiple_inferred_freq(self):
+ dr = Index([datetime(2000, 1, 1), datetime(2000, 1, 6), datetime(
+ 2000, 1, 11)])
+ ser = Series(np.random.randn(len(dr)), dr)
+ _check_plot_works(ser.plot)
+
+ @pytest.mark.slow
+ def test_uhf(self):
+ import pandas.plotting._converter as conv
+ idx = date_range('2012-6-22 21:59:51.960928', freq='L', periods=500)
+ df = DataFrame(np.random.randn(len(idx), 2), idx)
+
+ _, ax = self.plt.subplots()
+ df.plot(ax=ax)
+ axis = ax.get_xaxis()
+
+ tlocs = axis.get_ticklocs()
+ tlabels = axis.get_ticklabels()
+ for loc, label in zip(tlocs, tlabels):
+ xp = conv._from_ordinal(loc).strftime('%H:%M:%S.%f')
+ rs = str(label.get_text())
+ if len(rs):
+ assert xp == rs
+
+ @pytest.mark.slow
+ def test_irreg_hf(self):
+ idx = date_range('2012-6-22 21:59:51', freq='S', periods=100)
+ df = DataFrame(np.random.randn(len(idx), 2), idx)
+
+ irreg = df.iloc[[0, 1, 3, 4]]
+ _, ax = self.plt.subplots()
+ irreg.plot(ax=ax)
+ diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff()
+
+ sec = 1. / 24 / 60 / 60
+ assert (np.fabs(diffs[1:] - [sec, sec * 2, sec]) < 1e-8).all()
+
+ _, ax = self.plt.subplots()
+ df2 = df.copy()
+ df2.index = df.index.astype(object)
+ df2.plot(ax=ax)
+ diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff()
+ assert (np.fabs(diffs[1:] - sec) < 1e-8).all()
+
+ def test_irregular_datetime64_repr_bug(self):
+ ser = tm.makeTimeSeries()
+ ser = ser[[0, 1, 2, 7]]
+
+ _, ax = self.plt.subplots()
+
+ ret = ser.plot(ax=ax)
+ assert ret is not None
+
+ for rs, xp in zip(ax.get_lines()[0].get_xdata(), ser.index):
+ assert rs == xp
+
+ def test_business_freq(self):
+ bts = tm.makePeriodSeries()
+ _, ax = self.plt.subplots()
+ bts.plot(ax=ax)
+ assert ax.get_lines()[0].get_xydata()[0, 0] == bts.index[0].ordinal
+ idx = ax.get_lines()[0].get_xdata()
+ assert PeriodIndex(data=idx).freqstr == 'B'
+
+ @pytest.mark.slow
+ def test_business_freq_convert(self):
+ bts = tm.makeTimeSeries(300).asfreq('BM')
+ ts = bts.to_period('M')
+ _, ax = self.plt.subplots()
+ bts.plot(ax=ax)
+ assert ax.get_lines()[0].get_xydata()[0, 0] == ts.index[0].ordinal
+ idx = ax.get_lines()[0].get_xdata()
+ assert PeriodIndex(data=idx).freqstr == 'M'
+
+ def test_nonzero_base(self):
+ # GH2571
+ idx = (date_range('2012-12-20', periods=24, freq='H') + timedelta(
+ minutes=30))
+ df = DataFrame(np.arange(24), index=idx)
+ _, ax = self.plt.subplots()
+ df.plot(ax=ax)
+ rs = ax.get_lines()[0].get_xdata()
+ assert not Index(rs).is_normalized
+
+ def test_dataframe(self):
+ bts = DataFrame({'a': tm.makeTimeSeries()})
+ _, ax = self.plt.subplots()
+ bts.plot(ax=ax)
+ idx = ax.get_lines()[0].get_xdata()
+ tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx))
+
+ @pytest.mark.slow
+ def test_axis_limits(self):
+
+ def _test(ax):
+ xlim = ax.get_xlim()
+ ax.set_xlim(xlim[0] - 5, xlim[1] + 10)
+ ax.get_figure().canvas.draw()
+ result = ax.get_xlim()
+ assert result[0] == xlim[0] - 5
+ assert result[1] == xlim[1] + 10
+
+ # string
+ expected = (Period('1/1/2000', ax.freq),
+ Period('4/1/2000', ax.freq))
+ ax.set_xlim('1/1/2000', '4/1/2000')
+ ax.get_figure().canvas.draw()
+ result = ax.get_xlim()
+ assert int(result[0]) == expected[0].ordinal
+ assert int(result[1]) == expected[1].ordinal
+
+ # datetime
+ expected = (Period('1/1/2000', ax.freq),
+ Period('4/1/2000', ax.freq))
+ ax.set_xlim(datetime(2000, 1, 1), datetime(2000, 4, 1))
+ ax.get_figure().canvas.draw()
+ result = ax.get_xlim()
+ assert int(result[0]) == expected[0].ordinal
+ assert int(result[1]) == expected[1].ordinal
+ fig = ax.get_figure()
+ self.plt.close(fig)
+
+ ser = tm.makeTimeSeries()
+ _, ax = self.plt.subplots()
+ ser.plot(ax=ax)
+ _test(ax)
+
+ _, ax = self.plt.subplots()
+ df = DataFrame({'a': ser, 'b': ser + 1})
+ df.plot(ax=ax)
+ _test(ax)
+
+ df = DataFrame({'a': ser, 'b': ser + 1})
+ axes = df.plot(subplots=True)
+
+ for ax in axes:
+ _test(ax)
+
+ def test_get_finder(self):
+ import pandas.plotting._converter as conv
+
+ assert conv.get_finder('B') == conv._daily_finder
+ assert conv.get_finder('D') == conv._daily_finder
+ assert conv.get_finder('M') == conv._monthly_finder
+ assert conv.get_finder('Q') == conv._quarterly_finder
+ assert conv.get_finder('A') == conv._annual_finder
+ assert conv.get_finder('W') == conv._daily_finder
+
+ @pytest.mark.slow
+ def test_finder_daily(self):
+ day_lst = [10, 40, 252, 400, 950, 2750, 10000]
+
+ if (self.mpl_ge_3_0_0 or not self.mpl_ge_2_0_1
+ or (self.mpl_ge_2_1_0 and not self.mpl_ge_2_2_2)):
+ # 2.0.0, 2.2.0 (exactly) or >= 3.0.0
+ xpl1 = xpl2 = [Period('1999-1-1', freq='B').ordinal] * len(day_lst)
+ else: # 2.0.1, 2.1.0, 2.2.2, 2.2.3
+ xpl1 = [7565, 7564, 7553, 7546, 7518, 7428, 7066]
+ xpl2 = [7566, 7564, 7554, 7546, 7519, 7429, 7066]
+
+ rs1 = []
+ rs2 = []
+ for i, n in enumerate(day_lst):
+ rng = bdate_range('1999-1-1', periods=n)
+ ser = Series(np.random.randn(len(rng)), rng)
+ _, ax = self.plt.subplots()
+ ser.plot(ax=ax)
+ xaxis = ax.get_xaxis()
+ rs1.append(xaxis.get_majorticklocs()[0])
+
+ vmin, vmax = ax.get_xlim()
+ ax.set_xlim(vmin + 0.9, vmax)
+ rs2.append(xaxis.get_majorticklocs()[0])
+ self.plt.close(ax.get_figure())
+
+ assert rs1 == xpl1
+ assert rs2 == xpl2
+
+ @pytest.mark.slow
+ def test_finder_quarterly(self):
+ yrs = [3.5, 11]
+
+ if (self.mpl_ge_3_0_0 or not self.mpl_ge_2_0_1
+ or (self.mpl_ge_2_1_0 and not self.mpl_ge_2_2_2)):
+ # 2.0.0, 2.2.0 (exactly) or >= 3.0.0
+ xpl1 = xpl2 = [Period('1988Q1').ordinal] * len(yrs)
+ else: # 2.0.1, 2.1.0, 2.2.2, 2.2.3
+ xpl1 = [68, 68]
+ xpl2 = [72, 68]
+
+ rs1 = []
+ rs2 = []
+ for i, n in enumerate(yrs):
+ rng = period_range('1987Q2', periods=int(n * 4), freq='Q')
+ ser = Series(np.random.randn(len(rng)), rng)
+ _, ax = self.plt.subplots()
+ ser.plot(ax=ax)
+ xaxis = ax.get_xaxis()
+ rs1.append(xaxis.get_majorticklocs()[0])
+
+ (vmin, vmax) = ax.get_xlim()
+ ax.set_xlim(vmin + 0.9, vmax)
+ rs2.append(xaxis.get_majorticklocs()[0])
+ self.plt.close(ax.get_figure())
+
+ assert rs1 == xpl1
+ assert rs2 == xpl2
+
+ @pytest.mark.slow
+ def test_finder_monthly(self):
+ yrs = [1.15, 2.5, 4, 11]
+
+ if (self.mpl_ge_3_0_0 or not self.mpl_ge_2_0_1
+ or (self.mpl_ge_2_1_0 and not self.mpl_ge_2_2_2)):
+ # 2.0.0, 2.2.0 (exactly) or >= 3.0.0
+ xpl1 = xpl2 = [Period('Jan 1988').ordinal] * len(yrs)
+ else: # 2.0.1, 2.1.0, 2.2.2, 2.2.3
+ xpl1 = [216, 216, 204, 204]
+ xpl2 = [216, 216, 216, 204]
+
+ rs1 = []
+ rs2 = []
+ for i, n in enumerate(yrs):
+ rng = period_range('1987Q2', periods=int(n * 12), freq='M')
+ ser = Series(np.random.randn(len(rng)), rng)
+ _, ax = self.plt.subplots()
+ ser.plot(ax=ax)
+ xaxis = ax.get_xaxis()
+ rs1.append(xaxis.get_majorticklocs()[0])
+
+ vmin, vmax = ax.get_xlim()
+ ax.set_xlim(vmin + 0.9, vmax)
+ rs2.append(xaxis.get_majorticklocs()[0])
+ self.plt.close(ax.get_figure())
+
+ assert rs1 == xpl1
+ assert rs2 == xpl2
+
+ def test_finder_monthly_long(self):
+ rng = period_range('1988Q1', periods=24 * 12, freq='M')
+ ser = Series(np.random.randn(len(rng)), rng)
+ _, ax = self.plt.subplots()
+ ser.plot(ax=ax)
+ xaxis = ax.get_xaxis()
+ rs = xaxis.get_majorticklocs()[0]
+ xp = Period('1989Q1', 'M').ordinal
+ assert rs == xp
+
+ @pytest.mark.slow
+ def test_finder_annual(self):
+ if (self.mpl_ge_3_0_0 or not self.mpl_ge_2_0_1
+ or (self.mpl_ge_2_1_0 and not self.mpl_ge_2_2_2)):
+ # 2.0.0, 2.2.0 (exactly) or >= 3.0.0
+ xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170]
+ else: # 2.0.1, 2.1.0, 2.2.2, 2.2.3
+ xp = [1986, 1986, 1990, 1990, 1995, 2020, 1970, 1970]
+
+ xp = [Period(x, freq='A').ordinal for x in xp]
+ rs = []
+ for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]):
+ rng = period_range('1987', periods=nyears, freq='A')
+ ser = Series(np.random.randn(len(rng)), rng)
+ _, ax = self.plt.subplots()
+ ser.plot(ax=ax)
+ xaxis = ax.get_xaxis()
+ rs.append(xaxis.get_majorticklocs()[0])
+ self.plt.close(ax.get_figure())
+
+ assert rs == xp
+
+ @pytest.mark.slow
+ def test_finder_minutely(self):
+ nminutes = 50 * 24 * 60
+ rng = date_range('1/1/1999', freq='Min', periods=nminutes)
+ ser = Series(np.random.randn(len(rng)), rng)
+ _, ax = self.plt.subplots()
+ ser.plot(ax=ax)
+ xaxis = ax.get_xaxis()
+ rs = xaxis.get_majorticklocs()[0]
+ xp = Period('1/1/1999', freq='Min').ordinal
+
+ assert rs == xp
+
+ def test_finder_hourly(self):
+ nhours = 23
+ rng = date_range('1/1/1999', freq='H', periods=nhours)
+ ser = Series(np.random.randn(len(rng)), rng)
+ _, ax = self.plt.subplots()
+ ser.plot(ax=ax)
+ xaxis = ax.get_xaxis()
+ rs = xaxis.get_majorticklocs()[0]
+ if self.mpl_ge_2_0_1:
+ xp = Period('1/1/1999', freq='H').ordinal
+ else: # 2.0.0
+ xp = Period('1998-12-31 22:00', freq='H').ordinal
+
+ assert rs == xp
+
+ @pytest.mark.slow
+ def test_gaps(self):
+ ts = tm.makeTimeSeries()
+ ts[5:25] = np.nan
+ _, ax = self.plt.subplots()
+ ts.plot(ax=ax)
+ lines = ax.get_lines()
+ assert len(lines) == 1
+ line = lines[0]
+ data = line.get_xydata()
+
+ if (self.mpl_ge_3_0_0 or not self.mpl_ge_2_0_1
+ or (self.mpl_ge_2_1_0 and not self.mpl_ge_2_2_2)):
+ # 2.0.0, 2.2.0 (exactly) or >= 3.0.0
+ data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan)
+
+ assert isinstance(data, np.ma.core.MaskedArray)
+ mask = data.mask
+ assert mask[5:25, 1].all()
+ self.plt.close(ax.get_figure())
+
+ # irregular
+ ts = tm.makeTimeSeries()
+ ts = ts[[0, 1, 2, 5, 7, 9, 12, 15, 20]]
+ ts[2:5] = np.nan
+ _, ax = self.plt.subplots()
+ ax = ts.plot(ax=ax)
+ lines = ax.get_lines()
+ assert len(lines) == 1
+ line = lines[0]
+ data = line.get_xydata()
+
+ if (self.mpl_ge_3_0_0 or not self.mpl_ge_2_0_1
+ or (self.mpl_ge_2_1_0 and not self.mpl_ge_2_2_2)):
+ # 2.0.0, 2.2.0 (exactly) or >= 3.0.0
+ data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan)
+
+ assert isinstance(data, np.ma.core.MaskedArray)
+ mask = data.mask
+ assert mask[2:5, 1].all()
+ self.plt.close(ax.get_figure())
+
+ # non-ts
+ idx = [0, 1, 2, 5, 7, 9, 12, 15, 20]
+ ser = Series(np.random.randn(len(idx)), idx)
+ ser[2:5] = np.nan
+ _, ax = self.plt.subplots()
+ ser.plot(ax=ax)
+ lines = ax.get_lines()
+ assert len(lines) == 1
+ line = lines[0]
+ data = line.get_xydata()
+ if (self.mpl_ge_3_0_0 or not self.mpl_ge_2_0_1
+ or (self.mpl_ge_2_1_0 and not self.mpl_ge_2_2_2)):
+ # 2.0.0, 2.2.0 (exactly) or >= 3.0.0
+ data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan)
+
+ assert isinstance(data, np.ma.core.MaskedArray)
+ mask = data.mask
+ assert mask[2:5, 1].all()
+
+ @pytest.mark.slow
+ def test_gap_upsample(self):
+ low = tm.makeTimeSeries()
+ low[5:25] = np.nan
+ _, ax = self.plt.subplots()
+ low.plot(ax=ax)
+
+ idxh = date_range(low.index[0], low.index[-1], freq='12h')
+ s = Series(np.random.randn(len(idxh)), idxh)
+ s.plot(secondary_y=True)
+ lines = ax.get_lines()
+ assert len(lines) == 1
+ assert len(ax.right_ax.get_lines()) == 1
+
+ line = lines[0]
+ data = line.get_xydata()
+ if (self.mpl_ge_3_0_0 or not self.mpl_ge_2_0_1
+ or (self.mpl_ge_2_1_0 and not self.mpl_ge_2_2_2)):
+ # 2.0.0, 2.2.0 (exactly) or >= 3.0.0
+ data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan)
+
+ assert isinstance(data, np.ma.core.MaskedArray)
+ mask = data.mask
+ assert mask[5:25, 1].all()
+
+ @pytest.mark.slow
+ def test_secondary_y(self):
+ ser = Series(np.random.randn(10))
+ ser2 = Series(np.random.randn(10))
+ fig, _ = self.plt.subplots()
+ ax = ser.plot(secondary_y=True)
+ assert hasattr(ax, 'left_ax')
+ assert not hasattr(ax, 'right_ax')
+ axes = fig.get_axes()
+ line = ax.get_lines()[0]
+ xp = Series(line.get_ydata(), line.get_xdata())
+ assert_series_equal(ser, xp)
+ assert ax.get_yaxis().get_ticks_position() == 'right'
+ assert not axes[0].get_yaxis().get_visible()
+ self.plt.close(fig)
+
+ _, ax2 = self.plt.subplots()
+ ser2.plot(ax=ax2)
+ assert (ax2.get_yaxis().get_ticks_position() ==
+ self.default_tick_position)
+ self.plt.close(ax2.get_figure())
+
+ ax = ser2.plot()
+ ax2 = ser.plot(secondary_y=True)
+ assert ax.get_yaxis().get_visible()
+ assert not hasattr(ax, 'left_ax')
+ assert hasattr(ax, 'right_ax')
+ assert hasattr(ax2, 'left_ax')
+ assert not hasattr(ax2, 'right_ax')
+
+ @pytest.mark.slow
+ def test_secondary_y_ts(self):
+ idx = date_range('1/1/2000', periods=10)
+ ser = Series(np.random.randn(10), idx)
+ ser2 = Series(np.random.randn(10), idx)
+ fig, _ = self.plt.subplots()
+ ax = ser.plot(secondary_y=True)
+ assert hasattr(ax, 'left_ax')
+ assert not hasattr(ax, 'right_ax')
+ axes = fig.get_axes()
+ line = ax.get_lines()[0]
+ xp = Series(line.get_ydata(), line.get_xdata()).to_timestamp()
+ assert_series_equal(ser, xp)
+ assert ax.get_yaxis().get_ticks_position() == 'right'
+ assert not axes[0].get_yaxis().get_visible()
+ self.plt.close(fig)
+
+ _, ax2 = self.plt.subplots()
+ ser2.plot(ax=ax2)
+ assert (ax2.get_yaxis().get_ticks_position() ==
+ self.default_tick_position)
+ self.plt.close(ax2.get_figure())
+
+ ax = ser2.plot()
+ ax2 = ser.plot(secondary_y=True)
+ assert ax.get_yaxis().get_visible()
+
+ @pytest.mark.slow
+ @td.skip_if_no_scipy
+ def test_secondary_kde(self):
+ _skip_if_no_scipy_gaussian_kde()
+
+ ser = Series(np.random.randn(10))
+ fig, ax = self.plt.subplots()
+ ax = ser.plot(secondary_y=True, kind='density', ax=ax)
+ assert hasattr(ax, 'left_ax')
+ assert not hasattr(ax, 'right_ax')
+ axes = fig.get_axes()
+ assert axes[1].get_yaxis().get_ticks_position() == 'right'
+
+ @pytest.mark.slow
+ def test_secondary_bar(self):
+ ser = Series(np.random.randn(10))
+ fig, ax = self.plt.subplots()
+ ser.plot(secondary_y=True, kind='bar', ax=ax)
+ axes = fig.get_axes()
+ assert axes[1].get_yaxis().get_ticks_position() == 'right'
+
+ @pytest.mark.slow
+ def test_secondary_frame(self):
+ df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c'])
+ axes = df.plot(secondary_y=['a', 'c'], subplots=True)
+ assert axes[0].get_yaxis().get_ticks_position() == 'right'
+ assert (axes[1].get_yaxis().get_ticks_position() ==
+ self.default_tick_position)
+ assert axes[2].get_yaxis().get_ticks_position() == 'right'
+
+ @pytest.mark.slow
+ def test_secondary_bar_frame(self):
+ df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c'])
+ axes = df.plot(kind='bar', secondary_y=['a', 'c'], subplots=True)
+ assert axes[0].get_yaxis().get_ticks_position() == 'right'
+ assert (axes[1].get_yaxis().get_ticks_position() ==
+ self.default_tick_position)
+ assert axes[2].get_yaxis().get_ticks_position() == 'right'
+
+ def test_mixed_freq_regular_first(self):
+ # TODO
+ s1 = tm.makeTimeSeries()
+ s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]]
+
+ # it works!
+ _, ax = self.plt.subplots()
+ s1.plot(ax=ax)
+
+ ax2 = s2.plot(style='g', ax=ax)
+ lines = ax2.get_lines()
+ idx1 = PeriodIndex(lines[0].get_xdata())
+ idx2 = PeriodIndex(lines[1].get_xdata())
+
+ tm.assert_index_equal(idx1, s1.index.to_period('B'))
+ tm.assert_index_equal(idx2, s2.index.to_period('B'))
+
+ left, right = ax2.get_xlim()
+ pidx = s1.index.to_period()
+ assert left <= pidx[0].ordinal
+ assert right >= pidx[-1].ordinal
+
+ @pytest.mark.slow
+ def test_mixed_freq_irregular_first(self):
+ s1 = tm.makeTimeSeries()
+ s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]]
+ _, ax = self.plt.subplots()
+ s2.plot(style='g', ax=ax)
+ s1.plot(ax=ax)
+ assert not hasattr(ax, 'freq')
+ lines = ax.get_lines()
+ x1 = lines[0].get_xdata()
+ tm.assert_numpy_array_equal(x1, s2.index.astype(object).values)
+ x2 = lines[1].get_xdata()
+ tm.assert_numpy_array_equal(x2, s1.index.astype(object).values)
+
+ def test_mixed_freq_regular_first_df(self):
+ # GH 9852
+ s1 = tm.makeTimeSeries().to_frame()
+ s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :]
+ _, ax = self.plt.subplots()
+ s1.plot(ax=ax)
+ ax2 = s2.plot(style='g', ax=ax)
+ lines = ax2.get_lines()
+ idx1 = PeriodIndex(lines[0].get_xdata())
+ idx2 = PeriodIndex(lines[1].get_xdata())
+ assert idx1.equals(s1.index.to_period('B'))
+ assert idx2.equals(s2.index.to_period('B'))
+ left, right = ax2.get_xlim()
+ pidx = s1.index.to_period()
+ assert left <= pidx[0].ordinal
+ assert right >= pidx[-1].ordinal
+
+ @pytest.mark.slow
+ def test_mixed_freq_irregular_first_df(self):
+ # GH 9852
+ s1 = tm.makeTimeSeries().to_frame()
+ s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :]
+ _, ax = self.plt.subplots()
+ s2.plot(style='g', ax=ax)
+ s1.plot(ax=ax)
+ assert not hasattr(ax, 'freq')
+ lines = ax.get_lines()
+ x1 = lines[0].get_xdata()
+ tm.assert_numpy_array_equal(x1, s2.index.astype(object).values)
+ x2 = lines[1].get_xdata()
+ tm.assert_numpy_array_equal(x2, s1.index.astype(object).values)
+
+ def test_mixed_freq_hf_first(self):
+ idxh = date_range('1/1/1999', periods=365, freq='D')
+ idxl = date_range('1/1/1999', periods=12, freq='M')
+ high = Series(np.random.randn(len(idxh)), idxh)
+ low = Series(np.random.randn(len(idxl)), idxl)
+ _, ax = self.plt.subplots()
+ high.plot(ax=ax)
+ low.plot(ax=ax)
+ for l in ax.get_lines():
+ assert PeriodIndex(data=l.get_xdata()).freq == 'D'
+
+ @pytest.mark.slow
+ def test_mixed_freq_alignment(self):
+ ts_ind = date_range('2012-01-01 13:00', '2012-01-02', freq='H')
+ ts_data = np.random.randn(12)
+
+ ts = Series(ts_data, index=ts_ind)
+ ts2 = ts.asfreq('T').interpolate()
+
+ _, ax = self.plt.subplots()
+ ax = ts.plot(ax=ax)
+ ts2.plot(style='r', ax=ax)
+
+ assert ax.lines[0].get_xdata()[0] == ax.lines[1].get_xdata()[0]
+
+ @pytest.mark.slow
+ def test_mixed_freq_lf_first(self):
+
+ idxh = date_range('1/1/1999', periods=365, freq='D')
+ idxl = date_range('1/1/1999', periods=12, freq='M')
+ high = Series(np.random.randn(len(idxh)), idxh)
+ low = Series(np.random.randn(len(idxl)), idxl)
+ _, ax = self.plt.subplots()
+ low.plot(legend=True, ax=ax)
+ high.plot(legend=True, ax=ax)
+ for l in ax.get_lines():
+ assert PeriodIndex(data=l.get_xdata()).freq == 'D'
+ leg = ax.get_legend()
+ assert len(leg.texts) == 2
+ self.plt.close(ax.get_figure())
+
+ idxh = date_range('1/1/1999', periods=240, freq='T')
+ idxl = date_range('1/1/1999', periods=4, freq='H')
+ high = Series(np.random.randn(len(idxh)), idxh)
+ low = Series(np.random.randn(len(idxl)), idxl)
+ _, ax = self.plt.subplots()
+ low.plot(ax=ax)
+ high.plot(ax=ax)
+ for l in ax.get_lines():
+ assert PeriodIndex(data=l.get_xdata()).freq == 'T'
+
+ def test_mixed_freq_irreg_period(self):
+ ts = tm.makeTimeSeries()
+ irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]]
+ rng = period_range('1/3/2000', periods=30, freq='B')
+ ps = Series(np.random.randn(len(rng)), rng)
+ _, ax = self.plt.subplots()
+ irreg.plot(ax=ax)
+ ps.plot(ax=ax)
+
+ def test_mixed_freq_shared_ax(self):
+
+ # GH13341, using sharex=True
+ idx1 = date_range('2015-01-01', periods=3, freq='M')
+ idx2 = idx1[:1].union(idx1[2:])
+ s1 = Series(range(len(idx1)), idx1)
+ s2 = Series(range(len(idx2)), idx2)
+
+ fig, (ax1, ax2) = self.plt.subplots(nrows=2, sharex=True)
+ s1.plot(ax=ax1)
+ s2.plot(ax=ax2)
+
+ assert ax1.freq == 'M'
+ assert ax2.freq == 'M'
+ assert (ax1.lines[0].get_xydata()[0, 0] ==
+ ax2.lines[0].get_xydata()[0, 0])
+
+ # using twinx
+ fig, ax1 = self.plt.subplots()
+ ax2 = ax1.twinx()
+ s1.plot(ax=ax1)
+ s2.plot(ax=ax2)
+
+ assert (ax1.lines[0].get_xydata()[0, 0] ==
+ ax2.lines[0].get_xydata()[0, 0])
+
+ # TODO (GH14330, GH14322)
+ # plotting the irregular first does not yet work
+ # fig, ax1 = plt.subplots()
+ # ax2 = ax1.twinx()
+ # s2.plot(ax=ax1)
+ # s1.plot(ax=ax2)
+ # assert (ax1.lines[0].get_xydata()[0, 0] ==
+ # ax2.lines[0].get_xydata()[0, 0])
+
+ def test_nat_handling(self):
+
+ _, ax = self.plt.subplots()
+
+ dti = DatetimeIndex(['2015-01-01', NaT, '2015-01-03'])
+ s = Series(range(len(dti)), dti)
+ s.plot(ax=ax)
+ xdata = ax.get_lines()[0].get_xdata()
+ # plot x data is bounded by index values
+ assert s.index.min() <= Series(xdata).min()
+ assert Series(xdata).max() <= s.index.max()
+
+ @pytest.mark.slow
+ def test_to_weekly_resampling(self):
+ idxh = date_range('1/1/1999', periods=52, freq='W')
+ idxl = date_range('1/1/1999', periods=12, freq='M')
+ high = Series(np.random.randn(len(idxh)), idxh)
+ low = Series(np.random.randn(len(idxl)), idxl)
+ _, ax = self.plt.subplots()
+ high.plot(ax=ax)
+ low.plot(ax=ax)
+ for l in ax.get_lines():
+ assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq
+
+ _, ax = self.plt.subplots()
+ from pandas.tseries.plotting import tsplot
+ with tm.assert_produces_warning(FutureWarning):
+ tsplot(high, self.plt.Axes.plot, ax=ax)
+ with tm.assert_produces_warning(FutureWarning):
+ lines = tsplot(low, self.plt.Axes.plot, ax=ax)
+ for l in lines:
+ assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq
+
+ @pytest.mark.slow
+ def test_from_weekly_resampling(self):
+ idxh = date_range('1/1/1999', periods=52, freq='W')
+ idxl = date_range('1/1/1999', periods=12, freq='M')
+ high = Series(np.random.randn(len(idxh)), idxh)
+ low = Series(np.random.randn(len(idxl)), idxl)
+ _, ax = self.plt.subplots()
+ low.plot(ax=ax)
+ high.plot(ax=ax)
+
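+ # expected x-positions: weekly period ordinals for the high series,
+ # and hard-coded ordinals of where the monthly points land on the
+ # weekly axis for the low series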
+ expected_h = idxh.to_period().asi8.astype(np.float64)
+ expected_l = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544,
+ 1549, 1553, 1558, 1562], dtype=np.float64)
+ for l in ax.get_lines():
+ assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq
+ xdata = l.get_xdata(orig=False)
+ if len(xdata) == 12: # idxl lines
+ tm.assert_numpy_array_equal(xdata, expected_l)
+ else:
+ tm.assert_numpy_array_equal(xdata, expected_h)
+ tm.close()
+
+ _, ax = self.plt.subplots()
+ from pandas.tseries.plotting import tsplot
+ with tm.assert_produces_warning(FutureWarning):
+ tsplot(low, self.plt.Axes.plot, ax=ax)
+ with tm.assert_produces_warning(FutureWarning):
+ lines = tsplot(high, self.plt.Axes.plot, ax=ax)
+ for l in lines:
+ assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq
+ xdata = l.get_xdata(orig=False)
+ if len(xdata) == 12: # idxl lines
+ tm.assert_numpy_array_equal(xdata, expected_l)
+ else:
+ tm.assert_numpy_array_equal(xdata, expected_h)
+
+ @pytest.mark.slow
+ def test_from_resampling_area_line_mixed(self):
+ idxh = date_range('1/1/1999', periods=52, freq='W')
+ idxl = date_range('1/1/1999', periods=12, freq='M')
+ high = DataFrame(np.random.rand(len(idxh), 3),
+ index=idxh, columns=[0, 1, 2])
+ low = DataFrame(np.random.rand(len(idxl), 3),
+ index=idxl, columns=[0, 1, 2])
+
+ # low to high
+ for kind1, kind2 in [('line', 'area'), ('area', 'line')]:
+ _, ax = self.plt.subplots()
+ low.plot(kind=kind1, stacked=True, ax=ax)
+ high.plot(kind=kind2, stacked=True, ax=ax)
+
+ # check low dataframe result
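+ # (same hard-coded monthly-on-weekly ordinals as in
+ # test_from_weekly_resampling above)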
+ expected_x = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540,
+ 1544, 1549, 1553, 1558, 1562],
+ dtype=np.float64)
+ expected_y = np.zeros(len(expected_x), dtype=np.float64)
+ for i in range(3):
+ line = ax.lines[i]
+ assert PeriodIndex(line.get_xdata()).freq == idxh.freq
+ tm.assert_numpy_array_equal(line.get_xdata(orig=False),
+ expected_x)
+ # check stacked values are correct
+ expected_y += low[i].values
+ tm.assert_numpy_array_equal(line.get_ydata(orig=False),
+ expected_y)
+
+ # check high dataframe result
+ expected_x = idxh.to_period().asi8.astype(np.float64)
+ expected_y = np.zeros(len(expected_x), dtype=np.float64)
+ for i in range(3):
+ line = ax.lines[3 + i]
+ assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq
+ tm.assert_numpy_array_equal(line.get_xdata(orig=False),
+ expected_x)
+ expected_y += high[i].values
+ tm.assert_numpy_array_equal(line.get_ydata(orig=False),
+ expected_y)
+
+ # high to low
+ for kind1, kind2 in [('line', 'area'), ('area', 'line')]:
+ _, ax = self.plt.subplots()
+ high.plot(kind=kind1, stacked=True, ax=ax)
+ low.plot(kind=kind2, stacked=True, ax=ax)
+
+ # check high dataframe result
+ expected_x = idxh.to_period().asi8.astype(np.float64)
+ expected_y = np.zeros(len(expected_x), dtype=np.float64)
+ for i in range(3):
+ line = ax.lines[i]
+ assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq
+ tm.assert_numpy_array_equal(line.get_xdata(orig=False),
+ expected_x)
+ expected_y += high[i].values
+ tm.assert_numpy_array_equal(line.get_ydata(orig=False),
+ expected_y)
+
+ # check low dataframe result
+ expected_x = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540,
+ 1544, 1549, 1553, 1558, 1562],
+ dtype=np.float64)
+ expected_y = np.zeros(len(expected_x), dtype=np.float64)
+ for i in range(3):
+ line = ax.lines[3 + i]
+ assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq
+ tm.assert_numpy_array_equal(line.get_xdata(orig=False),
+ expected_x)
+ expected_y += low[i].values
+ tm.assert_numpy_array_equal(line.get_ydata(orig=False),
+ expected_y)
+ expected_y)
+
+ @pytest.mark.slow
+ def test_mixed_freq_second_millisecond(self):
+ # GH 7772, GH 7760
+ idxh = date_range('2014-07-01 09:00', freq='S', periods=50)
+ idxl = date_range('2014-07-01 09:00', freq='100L', periods=500)
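+ # 'S' is second frequency and '100L' is 100 milliseconds; mixing the
+ # two should put both lines on the millisecond ('L') axis, as
+ # asserted below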
+ high = Series(np.random.randn(len(idxh)), idxh)
+ low = Series(np.random.randn(len(idxl)), idxl)
+ # high to low
+ _, ax = self.plt.subplots()
+ high.plot(ax=ax)
+ low.plot(ax=ax)
+ assert len(ax.get_lines()) == 2
+ for l in ax.get_lines():
+ assert PeriodIndex(data=l.get_xdata()).freq == 'L'
+ tm.close()
+
+ # low to high
+ _, ax = self.plt.subplots()
+ low.plot(ax=ax)
+ high.plot(ax=ax)
+ assert len(ax.get_lines()) == 2
+ for l in ax.get_lines():
+ assert PeriodIndex(data=l.get_xdata()).freq == 'L'
+
+ @pytest.mark.slow
+ def test_irreg_dtypes(self):
+ # date
+ idx = [date(2000, 1, 1), date(2000, 1, 5), date(2000, 1, 20)]
+ df = DataFrame(np.random.randn(len(idx), 3), Index(idx, dtype=object))
+ _check_plot_works(df.plot)
+
+ # np.datetime64
+ idx = date_range('1/1/2000', periods=10)
+ idx = idx[[0, 2, 5, 9]].astype(object)
+ df = DataFrame(np.random.randn(len(idx), 3), idx)
+ _, ax = self.plt.subplots()
+ _check_plot_works(df.plot, ax=ax)
+
+ @pytest.mark.xfail(reason="fails with py2.7.15", strict=False)
+ @pytest.mark.slow
+ def test_time(self):
+ t = datetime(1, 1, 1, 3, 30, 0)
+ deltas = np.random.randint(1, 20, 3).cumsum()
+ ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas])
+ df = DataFrame({'a': np.random.randn(len(ts)),
+ 'b': np.random.randn(len(ts))},
+ index=ts)
+ fig, ax = self.plt.subplots()
+ df.plot(ax=ax)
+
+ # verify tick labels
+ fig.canvas.draw()
+ ticks = ax.get_xticks()
+ labels = ax.get_xticklabels()
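+ # tick values are seconds since midnight; rebuild the expected
+ # '%H:%M[:%S]' label for each tick and compare with the rendered text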
+ for t, l in zip(ticks, labels):
+ m, s = divmod(int(t), 60)
+ h, m = divmod(m, 60)
+ rs = l.get_text()
+ if len(rs) > 0:
+ if s != 0:
+ xp = time(h, m, s).strftime('%H:%M:%S')
+ else:
+ xp = time(h, m, s).strftime('%H:%M')
+ assert xp == rs
+
+ # change xlim
+ ax.set_xlim('1:30', '5:00')
+
+ # check tick labels again
+ fig.canvas.draw()
+ ticks = ax.get_xticks()
+ labels = ax.get_xticklabels()
+ for t, l in zip(ticks, labels):
+ m, s = divmod(int(t), 60)
+ h, m = divmod(m, 60)
+ rs = l.get_text()
+ if len(rs) > 0:
+ if s != 0:
+ xp = time(h, m, s).strftime('%H:%M:%S')
+ else:
+ xp = time(h, m, s).strftime('%H:%M')
+ assert xp == rs
+
+ @pytest.mark.slow
+ def test_time_musec(self):
+ t = datetime(1, 1, 1, 3, 30, 0)
+ deltas = np.random.randint(1, 20, 3).cumsum()
+ ts = np.array([(t + timedelta(microseconds=int(x))).time()
+ for x in deltas])
+ df = DataFrame({'a': np.random.randn(len(ts)),
+ 'b': np.random.randn(len(ts))},
+ index=ts)
+ fig, ax = self.plt.subplots()
+ ax = df.plot(ax=ax)
+
+ # verify tick labels
+ fig.canvas.draw()
+ ticks = ax.get_xticks()
+ labels = ax.get_xticklabels()
+ for t, l in zip(ticks, labels):
+ m, s = divmod(int(t), 60)
+
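+ # the fractional part of the tick value encodes the microseconds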
+ us = int(round((t - int(t)) * 1e6))
+
+ h, m = divmod(m, 60)
+ rs = l.get_text()
+ if len(rs) > 0:
+ if (us % 1000) != 0:
+ xp = time(h, m, s, us).strftime('%H:%M:%S.%f')
+ elif (us // 1000) != 0:
+ xp = time(h, m, s, us).strftime('%H:%M:%S.%f')[:-3]
+ elif s != 0:
+ xp = time(h, m, s, us).strftime('%H:%M:%S')
+ else:
+ xp = time(h, m, s, us).strftime('%H:%M')
+ assert xp == rs
+
+ @pytest.mark.slow
+ def test_secondary_upsample(self):
+ idxh = date_range('1/1/1999', periods=365, freq='D')
+ idxl = date_range('1/1/1999', periods=12, freq='M')
+ high = Series(np.random.randn(len(idxh)), idxh)
+ low = Series(np.random.randn(len(idxl)), idxl)
+ _, ax = self.plt.subplots()
+ low.plot(ax=ax)
+ ax = high.plot(secondary_y=True, ax=ax)
+ for l in ax.get_lines():
+ assert PeriodIndex(l.get_xdata()).freq == 'D'
+ assert hasattr(ax, 'left_ax')
+ assert not hasattr(ax, 'right_ax')
+ for l in ax.left_ax.get_lines():
+ assert PeriodIndex(l.get_xdata()).freq == 'D'
+
+ @pytest.mark.slow
+ def test_secondary_legend(self):
+ fig = self.plt.figure()
+ ax = fig.add_subplot(211)
+
+ # ts
+ df = tm.makeTimeDataFrame()
+ df.plot(secondary_y=['A', 'B'], ax=ax)
+ leg = ax.get_legend()
+ assert len(leg.get_lines()) == 4
+ assert leg.get_texts()[0].get_text() == 'A (right)'
+ assert leg.get_texts()[1].get_text() == 'B (right)'
+ assert leg.get_texts()[2].get_text() == 'C'
+ assert leg.get_texts()[3].get_text() == 'D'
+ assert ax.right_ax.get_legend() is None
+ colors = set()
+ for line in leg.get_lines():
+ colors.add(line.get_color())
+
+ # TODO: color cycle problems
+ assert len(colors) == 4
+ self.plt.close(fig)
+
+ fig = self.plt.figure()
+ ax = fig.add_subplot(211)
+ df.plot(secondary_y=['A', 'C'], mark_right=False, ax=ax)
+ leg = ax.get_legend()
+ assert len(leg.get_lines()) == 4
+ assert leg.get_texts()[0].get_text() == 'A'
+ assert leg.get_texts()[1].get_text() == 'B'
+ assert leg.get_texts()[2].get_text() == 'C'
+ assert leg.get_texts()[3].get_text() == 'D'
+ self.plt.close(fig)
+
+ fig, ax = self.plt.subplots()
+ df.plot(kind='bar', secondary_y=['A'], ax=ax)
+ leg = ax.get_legend()
+ assert leg.get_texts()[0].get_text() == 'A (right)'
+ assert leg.get_texts()[1].get_text() == 'B'
+ self.plt.close(fig)
+
+ fig, ax = self.plt.subplots()
+ df.plot(kind='bar', secondary_y=['A'], mark_right=False, ax=ax)
+ leg = ax.get_legend()
+ assert leg.get_texts()[0].get_text() == 'A'
+ assert leg.get_texts()[1].get_text() == 'B'
+ self.plt.close(fig)
+
+ fig = self.plt.figure()
+ ax = fig.add_subplot(211)
+ df = tm.makeTimeDataFrame()
+ ax = df.plot(secondary_y=['C', 'D'], ax=ax)
+ leg = ax.get_legend()
+ assert len(leg.get_lines()) == 4
+ assert ax.right_ax.get_legend() is None
+ colors = set()
+ for line in leg.get_lines():
+ colors.add(line.get_color())
+
+ # TODO: color cycle problems
+ assert len(colors) == 4
+ self.plt.close(fig)
+
+ # non-ts
+ df = tm.makeDataFrame()
+ fig = self.plt.figure()
+ ax = fig.add_subplot(211)
+ ax = df.plot(secondary_y=['A', 'B'], ax=ax)
+ leg = ax.get_legend()
+ assert len(leg.get_lines()) == 4
+ assert ax.right_ax.get_legend() is None
+ colors = set()
+ for line in leg.get_lines():
+ colors.add(line.get_color())
+
+ # TODO: color cycle problems
+ assert len(colors) == 4
+ self.plt.close(fig)
+
+ fig = self.plt.figure()
+ ax = fig.add_subplot(211)
+ ax = df.plot(secondary_y=['C', 'D'], ax=ax)
+ leg = ax.get_legend()
+ assert len(leg.get_lines()) == 4
+ assert ax.right_ax.get_legend() is None
+ colors = set()
+ for line in leg.get_lines():
+ colors.add(line.get_color())
+
+ # TODO: color cycle problems
+ assert len(colors) == 4
+
+ def test_format_date_axis(self):
+ rng = date_range('1/1/2012', periods=12, freq='M')
+ df = DataFrame(np.random.randn(len(rng), 3), rng)
+ _, ax = self.plt.subplots()
+ ax = df.plot(ax=ax)
+ xaxis = ax.get_xaxis()
+ for l in xaxis.get_ticklabels():
+ if len(l.get_text()) > 0:
+ assert l.get_rotation() == 30
+
+ @pytest.mark.slow
+ def test_ax_plot(self):
+ x = date_range(start='2012-01-02', periods=10, freq='D')
+ y = lrange(len(x))
+ _, ax = self.plt.subplots()
+ lines = ax.plot(x, y, label='Y')
+ tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x)
+
+ @pytest.mark.slow
+ def test_mpl_nopandas(self):
+ dates = [date(2008, 12, 31), date(2009, 1, 31)]
+ values1 = np.arange(10.0, 11.0, 0.5)
+ values2 = np.arange(11.0, 12.0, 0.5)
+
+ kw = dict(fmt='-', lw=4)
+
+ _, ax = self.plt.subplots()
+ ax.plot_date([x.toordinal() for x in dates], values1, **kw)
+ ax.plot_date([x.toordinal() for x in dates], values2, **kw)
+
+ line1, line2 = ax.get_lines()
+
+ exp = np.array([x.toordinal() for x in dates], dtype=np.float64)
+ tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp)
+ exp = np.array([x.toordinal() for x in dates], dtype=np.float64)
+ tm.assert_numpy_array_equal(line2.get_xydata()[:, 0], exp)
+
+ @pytest.mark.slow
+ def test_irregular_ts_shared_ax_xlim(self):
+ # GH 2960
+ ts = tm.makeTimeSeries()[:20]
+ ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]]
+
+ # plot the left section of the irregular series, then the right section
+ _, ax = self.plt.subplots()
+ ts_irregular[:5].plot(ax=ax)
+ ts_irregular[5:].plot(ax=ax)
+
+ # check that axis limits are correct
+ left, right = ax.get_xlim()
+ assert left <= ts_irregular.index.min().toordinal()
+ assert right >= ts_irregular.index.max().toordinal()
+
+ @pytest.mark.slow
+ def test_secondary_y_non_ts_xlim(self):
+ # GH 3490 - non-timeseries with secondary y
+ index_1 = [1, 2, 3, 4]
+ index_2 = [5, 6, 7, 8]
+ s1 = Series(1, index=index_1)
+ s2 = Series(2, index=index_2)
+
+ _, ax = self.plt.subplots()
+ s1.plot(ax=ax)
+ left_before, right_before = ax.get_xlim()
+ s2.plot(secondary_y=True, ax=ax)
+ left_after, right_after = ax.get_xlim()
+
+ assert left_before >= left_after
+ assert right_before < right_after
+
+ @pytest.mark.slow
+ def test_secondary_y_regular_ts_xlim(self):
+ # GH 3490 - regular-timeseries with secondary y
+ index_1 = date_range(start='2000-01-01', periods=4, freq='D')
+ index_2 = date_range(start='2000-01-05', periods=4, freq='D')
+ s1 = Series(1, index=index_1)
+ s2 = Series(2, index=index_2)
+
+ _, ax = self.plt.subplots()
+ s1.plot(ax=ax)
+ left_before, right_before = ax.get_xlim()
+ s2.plot(secondary_y=True, ax=ax)
+ left_after, right_after = ax.get_xlim()
+
+ assert left_before >= left_after
+ assert right_before < right_after
+
+ @pytest.mark.slow
+ def test_secondary_y_mixed_freq_ts_xlim(self):
+ # GH 3490 - mixed frequency timeseries with secondary y
+ rng = date_range('2000-01-01', periods=10000, freq='min')
+ ts = Series(1, index=rng)
+
+ _, ax = self.plt.subplots()
+ ts.plot(ax=ax)
+ left_before, right_before = ax.get_xlim()
+ ts.resample('D').mean().plot(secondary_y=True, ax=ax)
+ left_after, right_after = ax.get_xlim()
+
+ # a downsample should not have changed either limit
+ assert left_before == left_after
+ assert right_before == right_after
+
+ @pytest.mark.slow
+ def test_secondary_y_irregular_ts_xlim(self):
+ # GH 3490 - irregular-timeseries with secondary y
+ ts = tm.makeTimeSeries()[:20]
+ ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]]
+
+ _, ax = self.plt.subplots()
+ ts_irregular[:5].plot(ax=ax)
+ # plot higher-x values on secondary axis
+ ts_irregular[5:].plot(secondary_y=True, ax=ax)
+ # ensure secondary limits aren't overwritten by plot on primary
+ ts_irregular[:5].plot(ax=ax)
+
+ left, right = ax.get_xlim()
+ assert left <= ts_irregular.index.min().toordinal()
+ assert right >= ts_irregular.index.max().toordinal()
+
+ def test_plot_outofbounds_datetime(self):
+ # GH 2579 - check that this does not raise
+ values = [date(1677, 1, 1), date(1677, 1, 2)]
+ _, ax = self.plt.subplots()
+ ax.plot(values)
+
+ values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)]
+ ax.plot(values)
+
+ def test_format_timedelta_ticks_narrow(self):
+
+ if self.mpl_ge_2_0_1:
+ expected_labels = (['00:00:00.0000000{:0>2d}'.format(i)
+ for i in range(10)])
+ else: # 2.0.0
+ expected_labels = [''] + [
+ '00:00:00.00000000{:d}'.format(2 * i)
+ for i in range(5)] + ['']
+
+ rng = timedelta_range('0', periods=10, freq='ns')
+ df = DataFrame(np.random.randn(len(rng), 3), rng)
+ fig, ax = self.plt.subplots()
+ df.plot(fontsize=2, ax=ax)
+ fig.canvas.draw()
+ labels = ax.get_xticklabels()
+
+ result_labels = [x.get_text() for x in labels]
+ assert len(result_labels) == len(expected_labels)
+ assert result_labels == expected_labels
+
+ def test_format_timedelta_ticks_wide(self):
+ expected_labels = [
+ '',
+ '00:00:00',
+ '1 days 03:46:40',
+ '2 days 07:33:20',
+ '3 days 11:20:00',
+ '4 days 15:06:40',
+ '5 days 18:53:20',
+ '6 days 22:40:00',
+ '8 days 02:26:40',
+ '9 days 06:13:20',
+ ''
+ ]
+ if self.mpl_ge_2_2_0:
+ expected_labels = expected_labels[1:-1]
+ elif self.mpl_ge_2_0_1:
+ expected_labels = expected_labels[1:-1]
+ expected_labels[-1] = ''
+
+ rng = timedelta_range('0', periods=10, freq='1 d')
+ df = DataFrame(np.random.randn(len(rng), 3), rng)
+ fig, ax = self.plt.subplots()
+ ax = df.plot(fontsize=2, ax=ax)
+ fig.canvas.draw()
+ labels = ax.get_xticklabels()
+
+ result_labels = [x.get_text() for x in labels]
+ assert len(result_labels) == len(expected_labels)
+ assert result_labels == expected_labels
+
+ def test_timedelta_plot(self):
+ # test issue #8711
+ s = Series(range(5), timedelta_range('1day', periods=5))
+ _, ax = self.plt.subplots()
+ _check_plot_works(s.plot, ax=ax)
+
+ # test long period
+ index = timedelta_range('1 day 2 hr 30 min 10 s',
+ periods=10, freq='1 d')
+ s = Series(np.random.randn(len(index)), index)
+ _, ax = self.plt.subplots()
+ _check_plot_works(s.plot, ax=ax)
+
+ # test short period
+ index = timedelta_range('1 day 2 hr 30 min 10 s',
+ periods=10, freq='1 ns')
+ s = Series(np.random.randn(len(index)), index)
+ _, ax = self.plt.subplots()
+ _check_plot_works(s.plot, ax=ax)
+
+ def test_hist(self):
+ # https://github.com/matplotlib/matplotlib/issues/8459
+ rng = date_range('1/1/2011', periods=10, freq='H')
+ x = rng
+ w1 = np.arange(0, 1, .1)
+ w2 = np.arange(0, 1, .1)[::-1]
+ _, ax = self.plt.subplots()
+ ax.hist([x, x], weights=[w1, w2])
+
+ @pytest.mark.slow
+ def test_overlapping_datetime(self):
+ # GH 6608
+ s1 = Series([1, 2, 3], index=[datetime(1995, 12, 31),
+ datetime(2000, 12, 31),
+ datetime(2005, 12, 31)])
+ s2 = Series([1, 2, 3], index=[datetime(1997, 12, 31),
+ datetime(2003, 12, 31),
+ datetime(2008, 12, 31)])
+
+ # plot first series, then add the second series to those axes,
+ # then try adding the first series again
+ _, ax = self.plt.subplots()
+ s1.plot(ax=ax)
+ s2.plot(ax=ax)
+ s1.plot(ax=ax)
+
+ @pytest.mark.xfail(reason="GH9053 matplotlib does not use"
+ " ax.xaxis.converter")
+ def test_add_matplotlib_datetime64(self):
+ # GH9053 - ensure that a plot with PeriodConverter still understands
+ # datetime64 data. This still fails because matplotlib overrides the
+ # ax.xaxis.converter with a DatetimeConverter
+ s = Series(np.random.randn(10),
+ index=date_range('1970-01-02', periods=10))
+ ax = s.plot()
+ ax.plot(s.index, s.values, color='g')
+ l1, l2 = ax.lines
+ tm.assert_numpy_array_equal(l1.get_xydata(), l2.get_xydata())
+
+ def test_matplotlib_scatter_datetime64(self):
+ # https://github.com/matplotlib/matplotlib/issues/11391
+ df = DataFrame(np.random.RandomState(0).rand(10, 2),
+ columns=["x", "y"])
+ df["time"] = date_range("2018-01-01", periods=10, freq="D")
+ fig, ax = self.plt.subplots()
+ ax.scatter(x="time", y="y", data=df)
+ fig.canvas.draw()
+ label = ax.get_xticklabels()[0]
+ if self.mpl_ge_3_0_0:
+ expected = "2017-12-08"
+ else:
+ expected = "2017-12-12"
+ assert label.get_text() == expected
+
+
+def _check_plot_works(f, freq=None, series=None, *args, **kwargs):
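+ """
+ Plot ``f(*args, **kwargs)`` on a fresh figure and run basic sanity
+ checks: the axis frequency when ``freq`` or ``series`` is given, a
+ second draw on a new subplot, a savefig round-trip and, on Python 3
+ (when statsmodels is not loaded), a pickle round-trip of the figure.
+ """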
+ import matplotlib.pyplot as plt
+
+ fig = plt.gcf()
+
+ try:
+ plt.clf()
+ ax = fig.add_subplot(211)
+ orig_ax = kwargs.pop('ax', plt.gca())
+ orig_axfreq = getattr(orig_ax, 'freq', None)
+
+ ret = f(*args, **kwargs)
+ assert ret is not None # TODO: assert something more specific
+
+ ax = kwargs.pop('ax', plt.gca())
+ if series is not None:
+ dfreq = series.index.freq
+ if isinstance(dfreq, DateOffset):
+ dfreq = dfreq.rule_code
+ if orig_axfreq is None:
+ assert ax.freq == dfreq
+
+ if freq is not None and orig_axfreq is None:
+ assert ax.freq == freq
+
+ ax = fig.add_subplot(212)
+ try:
+ kwargs['ax'] = ax
+ ret = f(*args, **kwargs)
+ assert ret is not None # TODO: assert something more specific
+ except Exception:
+ pass
+
+ with ensure_clean(return_filelike=True) as path:
+ plt.savefig(path)
+
+ # GH18439
+ # this is supported only in Python 3 pickle since
+ # pickle in Python2 doesn't support instancemethod pickling
+ # TODO(statsmodels 0.10.0): Remove the statsmodels check
+ # https://github.com/pandas-dev/pandas/issues/24088
+ # https://github.com/statsmodels/statsmodels/issues/4772
+ if PY3 and 'statsmodels' not in sys.modules:
+ with ensure_clean(return_filelike=True) as path:
+ pickle.dump(fig, path)
+ finally:
+ plt.close(fig)
diff --git a/contrib/python/pandas/py2/pandas/tests/plotting/test_frame.py b/contrib/python/pandas/py2/pandas/tests/plotting/test_frame.py
new file mode 100644
index 00000000000..98b241f5c82
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/plotting/test_frame.py
@@ -0,0 +1,3003 @@
+# coding: utf-8
+
+""" Test cases for DataFrame.plot """
+
+from datetime import date, datetime
+import string
+import warnings
+
+import numpy as np
+from numpy.random import rand, randn
+import pytest
+
+from pandas.compat import PY3, lmap, lrange, lzip, range, u, zip
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.api import is_list_like
+
+import pandas as pd
+from pandas import (
+ DataFrame, MultiIndex, PeriodIndex, Series, bdate_range, date_range)
+from pandas.tests.plotting.common import (
+ TestPlotBase, _check_plot_works, _ok_for_gaussian_kde,
+ _skip_if_no_scipy_gaussian_kde)
+import pandas.util.testing as tm
+
+from pandas.io.formats.printing import pprint_thing
+import pandas.plotting as plotting
+
+
+class TestDataFramePlots(TestPlotBase):
+
+ def setup_method(self, method):
+ TestPlotBase.setup_method(self, method)
+ import matplotlib as mpl
+ mpl.rcdefaults()
+
+ self.tdf = tm.makeTimeDataFrame()
+ self.hexbin_df = DataFrame({"A": np.random.uniform(size=20),
+ "B": np.random.uniform(size=20),
+ "C": np.arange(20) + np.random.uniform(
+ size=20)})
+
+ def _assert_ytickslabels_visibility(self, axes, expected):
+ for ax, exp in zip(axes, expected):
+ self._check_visible(ax.get_yticklabels(), visible=exp)
+
+ def _assert_xtickslabels_visibility(self, axes, expected):
+ for ax, exp in zip(axes, expected):
+ self._check_visible(ax.get_xticklabels(), visible=exp)
+
+ @pytest.mark.slow
+ def test_plot(self):
+ df = self.tdf
+ _check_plot_works(df.plot, grid=False)
+ # _check_plot_works adds an ax, so catch the warning; see GH #13188
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.plot,
+ subplots=True)
+ self._check_axes_shape(axes, axes_num=4, layout=(4, 1))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.plot,
+ subplots=True, layout=(-1, 2))
+ self._check_axes_shape(axes, axes_num=4, layout=(2, 2))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.plot,
+ subplots=True, use_index=False)
+ self._check_axes_shape(axes, axes_num=4, layout=(4, 1))
+
+ df = DataFrame({'x': [1, 2], 'y': [3, 4]})
+ with pytest.raises(AttributeError, match='Unknown property blarg'):
+ df.plot.line(blarg=True)
+
+ df = DataFrame(np.random.rand(10, 3),
+ index=list(string.ascii_letters[:10]))
+
+ _check_plot_works(df.plot, use_index=True)
+ _check_plot_works(df.plot, sort_columns=False)
+ _check_plot_works(df.plot, yticks=[1, 5, 10])
+ _check_plot_works(df.plot, xticks=[1, 5, 10])
+ _check_plot_works(df.plot, ylim=(-100, 100), xlim=(-100, 100))
+
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(df.plot, subplots=True, title='blah')
+
+ # We have to redo it here because _check_plot_works does two plots,
+ # once without an ax kwarg and once with one; the new sharex
+ # behaviour does not remove the visibility of the latter axis (as ax
+ # is present). See https://github.com/pandas-dev/pandas/issues/9737
+
+ axes = df.plot(subplots=True, title='blah')
+ self._check_axes_shape(axes, axes_num=3, layout=(3, 1))
+ for ax in axes[:2]:
+ self._check_visible(ax.xaxis) # xaxis must be visible for grid
+ self._check_visible(ax.get_xticklabels(), visible=False)
+ self._check_visible(ax.get_xticklabels(minor=True), visible=False)
+ self._check_visible([ax.xaxis.get_label()], visible=False)
+ for ax in [axes[2]]:
+ self._check_visible(ax.xaxis)
+ self._check_visible(ax.get_xticklabels())
+ self._check_visible([ax.xaxis.get_label()])
+ self._check_ticks_props(ax, xrot=0)
+
+ _check_plot_works(df.plot, title='blah')
+
+ tuples = lzip(string.ascii_letters[:10], range(10))
+ df = DataFrame(np.random.rand(10, 3),
+ index=MultiIndex.from_tuples(tuples))
+ _check_plot_works(df.plot, use_index=True)
+
+ # unicode
+ index = MultiIndex.from_tuples([(u('\u03b1'), 0),
+ (u('\u03b1'), 1),
+ (u('\u03b2'), 2),
+ (u('\u03b2'), 3),
+ (u('\u03b3'), 4),
+ (u('\u03b3'), 5),
+ (u('\u03b4'), 6),
+ (u('\u03b4'), 7)], names=['i0', 'i1'])
+ columns = MultiIndex.from_tuples([('bar', u('\u0394')),
+ ('bar', u('\u0395'))], names=['c0',
+ 'c1'])
+ df = DataFrame(np.random.randint(0, 10, (8, 2)),
+ columns=columns,
+ index=index)
+ _check_plot_works(df.plot, title=u('\u03A3'))
+
+ # GH 6951
+ # Test with single column
+ df = DataFrame({'x': np.random.rand(10)})
+ axes = _check_plot_works(df.plot.bar, subplots=True)
+ self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+
+ axes = _check_plot_works(df.plot.bar, subplots=True, layout=(-1, 1))
+ self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+ # When ax is supplied and the required number of axes is 1,
+ # the passed ax should be used:
+ fig, ax = self.plt.subplots()
+ axes = df.plot.bar(subplots=True, ax=ax)
+ assert len(axes) == 1
+ result = ax.axes
+ assert result is axes[0]
+
+ # GH 15516
+ def test_mpl2_color_cycle_str(self):
+ colors = ['C' + str(x) for x in range(10)]
+ df = DataFrame(randn(10, 3), columns=['a', 'b', 'c'])
+ for c in colors:
+ _check_plot_works(df.plot, color=c)
+
+ def test_color_single_series_list(self):
+ # GH 3486
+ df = DataFrame({"A": [1, 2, 3]})
+ _check_plot_works(df.plot, color=['red'])
+
+ def test_rgb_tuple_color(self):
+ # GH 16695
+ df = DataFrame({'x': [1, 2], 'y': [3, 4]})
+ _check_plot_works(df.plot, x='x', y='y', color=(1, 0, 0))
+ _check_plot_works(df.plot, x='x', y='y', color=(1, 0, 0, 0.5))
+
+ def test_color_empty_string(self):
+ df = DataFrame(randn(10, 2))
+ with pytest.raises(ValueError):
+ df.plot(color='')
+
+ def test_color_and_style_arguments(self):
+ df = DataFrame({'x': [1, 2], 'y': [3, 4]})
+ # passing both 'color' and 'style' arguments should be allowed
+ # if there is no color symbol in the style strings:
+ ax = df.plot(color=['red', 'black'], style=['-', '--'])
+ # check that the linestyles are correctly set:
+ linestyle = [line.get_linestyle() for line in ax.lines]
+ assert linestyle == ['-', '--']
+ # check that the colors are correctly set:
+ color = [line.get_color() for line in ax.lines]
+ assert color == ['red', 'black']
+ # passing both 'color' and 'style' arguments should not be allowed
+ # if there is a color symbol in the style strings:
+ with pytest.raises(ValueError):
+ df.plot(color=['red', 'black'], style=['k-', 'r--'])
+
+ def test_nonnumeric_exclude(self):
+ df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]})
+ ax = df.plot()
+ assert len(ax.get_lines()) == 1 # B was plotted
+
+ @pytest.mark.slow
+ def test_implicit_label(self):
+ df = DataFrame(randn(10, 3), columns=['a', 'b', 'c'])
+ ax = df.plot(x='a', y='b')
+ self._check_text_labels(ax.xaxis.get_label(), 'a')
+
+ @pytest.mark.slow
+ def test_donot_overwrite_index_name(self):
+ # GH 8494
+ df = DataFrame(randn(2, 2), columns=['a', 'b'])
+ df.index.name = 'NAME'
+ df.plot(y='b', label='LABEL')
+ assert df.index.name == 'NAME'
+
+ @pytest.mark.slow
+ def test_plot_xy(self):
+ # columns.inferred_type == 'string'
+ df = self.tdf
+ self._check_data(df.plot(x=0, y=1), df.set_index('A')['B'].plot())
+ self._check_data(df.plot(x=0), df.set_index('A').plot())
+ self._check_data(df.plot(y=0), df.B.plot())
+ self._check_data(df.plot(x='A', y='B'), df.set_index('A').B.plot())
+ self._check_data(df.plot(x='A'), df.set_index('A').plot())
+ self._check_data(df.plot(y='B'), df.B.plot())
+
+ # columns.inferred_type == 'integer'
+ df.columns = lrange(1, len(df.columns) + 1)
+ self._check_data(df.plot(x=1, y=2), df.set_index(1)[2].plot())
+ self._check_data(df.plot(x=1), df.set_index(1).plot())
+ self._check_data(df.plot(y=1), df[1].plot())
+
+ # figsize and title
+ ax = df.plot(x=1, y=2, title='Test', figsize=(16, 8))
+ self._check_text_labels(ax.title, 'Test')
+ self._check_axes_shape(ax, axes_num=1, layout=(1, 1),
+ figsize=(16., 8.))
+
+ # columns.inferred_type == 'mixed'
+ # TODO add MultiIndex test
+
+ @pytest.mark.slow
+ def test_logscales(self):
+ df = DataFrame({'a': np.arange(100)}, index=np.arange(100))
+ ax = df.plot(logy=True)
+ self._check_ax_scales(ax, yaxis='log')
+
+ ax = df.plot(logx=True)
+ self._check_ax_scales(ax, xaxis='log')
+
+ ax = df.plot(loglog=True)
+ self._check_ax_scales(ax, xaxis='log', yaxis='log')
+
+ @pytest.mark.slow
+ def test_xcompat(self):
+ import pandas as pd
+
+ df = self.tdf
+ ax = df.plot(x_compat=True)
+ lines = ax.get_lines()
+ assert not isinstance(lines[0].get_xdata(), PeriodIndex)
+
+ tm.close()
+ pd.plotting.plot_params['xaxis.compat'] = True
+ ax = df.plot()
+ lines = ax.get_lines()
+ assert not isinstance(lines[0].get_xdata(), PeriodIndex)
+
+ tm.close()
+ pd.plotting.plot_params['x_compat'] = False
+
+ ax = df.plot()
+ lines = ax.get_lines()
+ assert not isinstance(lines[0].get_xdata(), PeriodIndex)
+ assert isinstance(PeriodIndex(lines[0].get_xdata()), PeriodIndex)
+
+ tm.close()
+ # useful if you're plotting a bunch together
+ with pd.plotting.plot_params.use('x_compat', True):
+ ax = df.plot()
+ lines = ax.get_lines()
+ assert not isinstance(lines[0].get_xdata(), PeriodIndex)
+
+ tm.close()
+ ax = df.plot()
+ lines = ax.get_lines()
+ assert not isinstance(lines[0].get_xdata(), PeriodIndex)
+ assert isinstance(PeriodIndex(lines[0].get_xdata()), PeriodIndex)
+
+ def test_period_compat(self):
+ # GH 9012
+ # period-array conversions
+ df = DataFrame(
+ np.random.rand(21, 2),
+ index=bdate_range(datetime(2000, 1, 1), datetime(2000, 1, 31)),
+ columns=['a', 'b'])
+
+ df.plot()
+ self.plt.axhline(y=0)
+ tm.close()
+
+ def test_unsorted_index(self):
+ df = DataFrame({'y': np.arange(100)}, index=np.arange(99, -1, -1),
+ dtype=np.int64)
+ ax = df.plot()
+ lines = ax.get_lines()[0]
+ rs = lines.get_xydata()
+ rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name='y')
+ tm.assert_series_equal(rs, df.y, check_index_type=False)
+ tm.close()
+
+ df.index = pd.Index(np.arange(99, -1, -1), dtype=np.float64)
+ ax = df.plot()
+ lines = ax.get_lines()[0]
+ rs = lines.get_xydata()
+ rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name='y')
+ tm.assert_series_equal(rs, df.y)
+
+ def test_unsorted_index_lims(self):
+ df = DataFrame({'y': [0., 1., 2., 3.]}, index=[1., 0., 3., 2.])
+ ax = df.plot()
+ xmin, xmax = ax.get_xlim()
+ lines = ax.get_lines()
+ assert xmin <= np.nanmin(lines[0].get_data()[0])
+ assert xmax >= np.nanmax(lines[0].get_data()[0])
+
+ df = DataFrame({'y': [0., 1., np.nan, 3., 4., 5., 6.]},
+ index=[1., 0., 3., 2., np.nan, 3., 2.])
+ ax = df.plot()
+ xmin, xmax = ax.get_xlim()
+ lines = ax.get_lines()
+ assert xmin <= np.nanmin(lines[0].get_data()[0])
+ assert xmax >= np.nanmax(lines[0].get_data()[0])
+
+ df = DataFrame({'y': [0., 1., 2., 3.], 'z': [91., 90., 93., 92.]})
+ ax = df.plot(x='z', y='y')
+ xmin, xmax = ax.get_xlim()
+ lines = ax.get_lines()
+ assert xmin <= np.nanmin(lines[0].get_data()[0])
+ assert xmax >= np.nanmax(lines[0].get_data()[0])
+
+ @pytest.mark.slow
+ def test_subplots(self):
+ df = DataFrame(np.random.rand(10, 3),
+ index=list(string.ascii_letters[:10]))
+
+ for kind in ['bar', 'barh', 'line', 'area']:
+ axes = df.plot(kind=kind, subplots=True, sharex=True, legend=True)
+ self._check_axes_shape(axes, axes_num=3, layout=(3, 1))
+ assert axes.shape == (3, )
+
+ for ax, column in zip(axes, df.columns):
+ self._check_legend_labels(ax,
+ labels=[pprint_thing(column)])
+
+ for ax in axes[:-2]:
+ self._check_visible(ax.xaxis) # xaxis must be visible for grid
+ self._check_visible(ax.get_xticklabels(), visible=False)
+ self._check_visible(
+ ax.get_xticklabels(minor=True), visible=False)
+ self._check_visible(ax.xaxis.get_label(), visible=False)
+ self._check_visible(ax.get_yticklabels())
+
+ self._check_visible(axes[-1].xaxis)
+ self._check_visible(axes[-1].get_xticklabels())
+ self._check_visible(axes[-1].get_xticklabels(minor=True))
+ self._check_visible(axes[-1].xaxis.get_label())
+ self._check_visible(axes[-1].get_yticklabels())
+
+ axes = df.plot(kind=kind, subplots=True, sharex=False)
+ for ax in axes:
+ self._check_visible(ax.xaxis)
+ self._check_visible(ax.get_xticklabels())
+ self._check_visible(ax.get_xticklabels(minor=True))
+ self._check_visible(ax.xaxis.get_label())
+ self._check_visible(ax.get_yticklabels())
+
+ axes = df.plot(kind=kind, subplots=True, legend=False)
+ for ax in axes:
+ assert ax.get_legend() is None
+
+ def test_groupby_boxplot_sharey(self):
+ # https://github.com/pandas-dev/pandas/issues/20968
+ # sharey can now be switched; check whether the right
+ # pair of axes is turned on or off
+
+ df = DataFrame({'a': [-1.43, -0.15, -3.70, -1.43, -0.14],
+ 'b': [0.56, 0.84, 0.29, 0.56, 0.85],
+ 'c': [0, 1, 2, 3, 1]},
+ index=[0, 1, 2, 3, 4])
+
+ # behavior without keyword
+ axes = df.groupby('c').boxplot()
+ expected = [True, False, True, False]
+ self._assert_ytickslabels_visibility(axes, expected)
+
+ # set sharey=True should be identical
+ axes = df.groupby('c').boxplot(sharey=True)
+ expected = [True, False, True, False]
+ self._assert_ytickslabels_visibility(axes, expected)
+
+ # sharey=False, all yticklabels should be visible
+ axes = df.groupby('c').boxplot(sharey=False)
+ expected = [True, True, True, True]
+ self._assert_ytickslabels_visibility(axes, expected)
+
+ def test_groupby_boxplot_sharex(self):
+ # https://github.com/pandas-dev/pandas/issues/20968
+ # sharex can now be switched; check whether the right
+ # pair of axes is turned on or off
+
+ df = DataFrame({'a': [-1.43, -0.15, -3.70, -1.43, -0.14],
+ 'b': [0.56, 0.84, 0.29, 0.56, 0.85],
+ 'c': [0, 1, 2, 3, 1]},
+ index=[0, 1, 2, 3, 4])
+
+ # behavior without keyword
+ axes = df.groupby('c').boxplot()
+ expected = [True, True, True, True]
+ self._assert_xtickslabels_visibility(axes, expected)
+
+ # set sharex=False should be identical
+ axes = df.groupby('c').boxplot(sharex=False)
+ expected = [True, True, True, True]
+ self._assert_xtickslabels_visibility(axes, expected)
+
+ # sharex=True, xticklabels should be visible
+ # only for bottom plots
+ axes = df.groupby('c').boxplot(sharex=True)
+ expected = [False, False, True, True]
+ self._assert_xtickslabels_visibility(axes, expected)
+
+ @pytest.mark.slow
+ def test_subplots_timeseries(self):
+ idx = date_range(start='2014-07-01', freq='M', periods=10)
+ df = DataFrame(np.random.rand(10, 3), index=idx)
+
+ for kind in ['line', 'area']:
+ axes = df.plot(kind=kind, subplots=True, sharex=True)
+ self._check_axes_shape(axes, axes_num=3, layout=(3, 1))
+
+ for ax in axes[:-2]:
+ # GH 7801
+ self._check_visible(ax.xaxis) # xaxis must be visible for grid
+ self._check_visible(ax.get_xticklabels(), visible=False)
+ self._check_visible(
+ ax.get_xticklabels(minor=True), visible=False)
+ self._check_visible(ax.xaxis.get_label(), visible=False)
+ self._check_visible(ax.get_yticklabels())
+
+ self._check_visible(axes[-1].xaxis)
+ self._check_visible(axes[-1].get_xticklabels())
+ self._check_visible(axes[-1].get_xticklabels(minor=True))
+ self._check_visible(axes[-1].xaxis.get_label())
+ self._check_visible(axes[-1].get_yticklabels())
+ self._check_ticks_props(axes, xrot=0)
+
+ axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45,
+ fontsize=7)
+ for ax in axes:
+ self._check_visible(ax.xaxis)
+ self._check_visible(ax.get_xticklabels())
+ self._check_visible(ax.get_xticklabels(minor=True))
+ self._check_visible(ax.xaxis.get_label())
+ self._check_visible(ax.get_yticklabels())
+ self._check_ticks_props(ax, xlabelsize=7, xrot=45,
+ ylabelsize=7)
+
+ def test_subplots_timeseries_y_axis(self):
+ # GH16953
+ data = {"numeric": np.array([1, 2, 5]),
+ "timedelta": [pd.Timedelta(-10, unit="s"),
+ pd.Timedelta(10, unit="m"),
+ pd.Timedelta(10, unit="h")],
+ "datetime_no_tz": [pd.to_datetime("2017-08-01 00:00:00"),
+ pd.to_datetime("2017-08-01 02:00:00"),
+ pd.to_datetime("2017-08-02 00:00:00")],
+ "datetime_all_tz": [pd.to_datetime("2017-08-01 00:00:00",
+ utc=True),
+ pd.to_datetime("2017-08-01 02:00:00",
+ utc=True),
+ pd.to_datetime("2017-08-02 00:00:00",
+ utc=True)],
+ "text": ["This", "should", "fail"]}
+ testdata = DataFrame(data)
+
+ ax_numeric = testdata.plot(y="numeric")
+ assert (ax_numeric.get_lines()[0].get_data()[1] ==
+ testdata["numeric"].values).all()
+ ax_timedelta = testdata.plot(y="timedelta")
+ assert (ax_timedelta.get_lines()[0].get_data()[1] ==
+ testdata["timedelta"].values).all()
+ ax_datetime_no_tz = testdata.plot(y="datetime_no_tz")
+ assert (ax_datetime_no_tz.get_lines()[0].get_data()[1] ==
+ testdata["datetime_no_tz"].values).all()
+ ax_datetime_all_tz = testdata.plot(y="datetime_all_tz")
+ assert (ax_datetime_all_tz.get_lines()[0].get_data()[1] ==
+ testdata["datetime_all_tz"].values).all()
+ with pytest.raises(TypeError):
+ testdata.plot(y="text")
+
+ @pytest.mark.xfail(reason='not supported for period, categorical, '
+ 'datetime_mixed_tz')
+ def test_subplots_timeseries_y_axis_not_supported(self):
+ """
+ This test will fail for:
+ period:
+ since period isn't yet implemented in ``select_dtypes``
+ and because it will need a custom value converter +
+ tick formatter (as was done for x-axis plots)
+
+ categorical:
+ because it will need a custom value converter +
+ tick formatter (also doesn't work for the x-axis, as of now)
+
+ datetime_mixed_tz:
+ because of the way pandas handles ``Series`` of
+ ``datetime`` objects with different timezones;
+ generally, converting ``datetime`` objects to a tz-aware
+ form could help with this problem
+ """
+ data = {"numeric": np.array([1, 2, 5]),
+ "period": [pd.Period('2017-08-01 00:00:00', freq='H'),
+ pd.Period('2017-08-01 02:00', freq='H'),
+ pd.Period('2017-08-02 00:00:00', freq='H')],
+ "categorical": pd.Categorical(["c", "b", "a"],
+ categories=["a", "b", "c"],
+ ordered=False),
+ "datetime_mixed_tz": [pd.to_datetime("2017-08-01 00:00:00",
+ utc=True),
+ pd.to_datetime("2017-08-01 02:00:00"),
+ pd.to_datetime("2017-08-02 00:00:00")]}
+ testdata = pd.DataFrame(data)
+ ax_period = testdata.plot(x="numeric", y="period")
+ assert (ax_period.get_lines()[0].get_data()[1] ==
+ testdata["period"].values).all()
+ ax_categorical = testdata.plot(x="numeric", y="categorical")
+ assert (ax_categorical.get_lines()[0].get_data()[1] ==
+ testdata["categorical"].values).all()
+ ax_datetime_mixed_tz = testdata.plot(x="numeric",
+ y="datetime_mixed_tz")
+ assert (ax_datetime_mixed_tz.get_lines()[0].get_data()[1] ==
+ testdata["datetime_mixed_tz"].values).all()
+
+ @pytest.mark.slow
+ def test_subplots_layout(self):
+ # GH 6667
+ df = DataFrame(np.random.rand(10, 3),
+ index=list(string.ascii_letters[:10]))
+
+ axes = df.plot(subplots=True, layout=(2, 2))
+ self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
+ assert axes.shape == (2, 2)
+
+ axes = df.plot(subplots=True, layout=(-1, 2))
+ self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
+ assert axes.shape == (2, 2)
+
+ axes = df.plot(subplots=True, layout=(2, -1))
+ self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
+ assert axes.shape == (2, 2)
+
+ axes = df.plot(subplots=True, layout=(1, 4))
+ self._check_axes_shape(axes, axes_num=3, layout=(1, 4))
+ assert axes.shape == (1, 4)
+
+ axes = df.plot(subplots=True, layout=(-1, 4))
+ self._check_axes_shape(axes, axes_num=3, layout=(1, 4))
+ assert axes.shape == (1, 4)
+
+ axes = df.plot(subplots=True, layout=(4, -1))
+ self._check_axes_shape(axes, axes_num=3, layout=(4, 1))
+ assert axes.shape == (4, 1)
+
+ with pytest.raises(ValueError):
+ df.plot(subplots=True, layout=(1, 1))
+ with pytest.raises(ValueError):
+ df.plot(subplots=True, layout=(-1, -1))
+
+ # single column
+ df = DataFrame(np.random.rand(10, 1),
+ index=list(string.ascii_letters[:10]))
+ axes = df.plot(subplots=True)
+ self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+ assert axes.shape == (1, )
+
+ axes = df.plot(subplots=True, layout=(3, 3))
+ self._check_axes_shape(axes, axes_num=1, layout=(3, 3))
+ assert axes.shape == (3, 3)
+
+ @pytest.mark.slow
+ def test_subplots_warnings(self):
+ # GH 9464
+ with tm.assert_produces_warning(None):
+ df = DataFrame(np.random.randn(100, 4))
+ df.plot(subplots=True, layout=(3, 2))
+
+ df = DataFrame(np.random.randn(100, 4),
+ index=date_range('1/1/2000', periods=100))
+ df.plot(subplots=True, layout=(3, 2))
+
+ @pytest.mark.slow
+ def test_subplots_multiple_axes(self):
+ # GH 5353, GH 6970, GH 7069
+ fig, axes = self.plt.subplots(2, 3)
+ df = DataFrame(np.random.rand(10, 3),
+ index=list(string.ascii_letters[:10]))
+
+ returned = df.plot(subplots=True, ax=axes[0], sharex=False,
+ sharey=False)
+ self._check_axes_shape(returned, axes_num=3, layout=(1, 3))
+ assert returned.shape == (3, )
+ assert returned[0].figure is fig
+ # draw on second row
+ returned = df.plot(subplots=True, ax=axes[1], sharex=False,
+ sharey=False)
+ self._check_axes_shape(returned, axes_num=3, layout=(1, 3))
+ assert returned.shape == (3, )
+ assert returned[0].figure is fig
+ self._check_axes_shape(axes, axes_num=6, layout=(2, 3))
+ tm.close()
+
+ with pytest.raises(ValueError):
+ fig, axes = self.plt.subplots(2, 3)
+ # pass a different number of axes than required
+ df.plot(subplots=True, ax=axes)
+
+ # pass 2-dim axes and invalid layout
+ # an invalid layout should not affect the input and return value
+ # (the warning it raises is tested in
+ # TestDataFrameGroupByPlots.test_grouped_box_multiple_axes)
+ fig, axes = self.plt.subplots(2, 2)
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore", UserWarning)
+ df = DataFrame(np.random.rand(10, 4),
+ index=list(string.ascii_letters[:10]))
+
+ returned = df.plot(subplots=True, ax=axes, layout=(2, 1),
+ sharex=False, sharey=False)
+ self._check_axes_shape(returned, axes_num=4, layout=(2, 2))
+ assert returned.shape == (4, )
+
+ returned = df.plot(subplots=True, ax=axes, layout=(2, -1),
+ sharex=False, sharey=False)
+ self._check_axes_shape(returned, axes_num=4, layout=(2, 2))
+ assert returned.shape == (4, )
+
+ returned = df.plot(subplots=True, ax=axes, layout=(-1, 2),
+ sharex=False, sharey=False)
+ self._check_axes_shape(returned, axes_num=4, layout=(2, 2))
+ assert returned.shape == (4, )
+
+ # single column
+ fig, axes = self.plt.subplots(1, 1)
+ df = DataFrame(np.random.rand(10, 1),
+ index=list(string.ascii_letters[:10]))
+
+ axes = df.plot(subplots=True, ax=[axes], sharex=False, sharey=False)
+ self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+ assert axes.shape == (1, )
+
+ def test_subplots_ts_share_axes(self):
+ # GH 3964
+ fig, axes = self.plt.subplots(3, 3, sharex=True, sharey=True)
+ self.plt.subplots_adjust(left=0.05, right=0.95, hspace=0.3, wspace=0.3)
+ df = DataFrame(
+ np.random.randn(10, 9),
+ index=date_range(start='2014-07-01', freq='M', periods=10))
+ for i, ax in enumerate(axes.ravel()):
+ df[i].plot(ax=ax, fontsize=5)
+
+ # Rows other than bottom should not be visible
+ for ax in axes[0:-1].ravel():
+ self._check_visible(ax.get_xticklabels(), visible=False)
+
+ # Bottom row should be visible
+ for ax in axes[-1].ravel():
+ self._check_visible(ax.get_xticklabels(), visible=True)
+
+ # First column should be visible
+ for ax in axes[[0, 1, 2], [0]].ravel():
+ self._check_visible(ax.get_yticklabels(), visible=True)
+
+ # Other columns should not be visible
+ for ax in axes[[0, 1, 2], [1]].ravel():
+ self._check_visible(ax.get_yticklabels(), visible=False)
+ for ax in axes[[0, 1, 2], [2]].ravel():
+ self._check_visible(ax.get_yticklabels(), visible=False)
+
+ def test_subplots_sharex_axes_existing_axes(self):
+ # GH 9158
+ d = {'A': [1., 2., 3., 4.], 'B': [4., 3., 2., 1.], 'C': [5, 1, 3, 4]}
+ df = DataFrame(d, index=date_range('2014 10 11', '2014 10 14'))
+
+ axes = df[['A', 'B']].plot(subplots=True)
+ df['C'].plot(ax=axes[0], secondary_y=True)
+
+ self._check_visible(axes[0].get_xticklabels(), visible=False)
+ self._check_visible(axes[1].get_xticklabels(), visible=True)
+ for ax in axes.ravel():
+ self._check_visible(ax.get_yticklabels(), visible=True)
+
+ @pytest.mark.slow
+ def test_subplots_dup_columns(self):
+ # GH 10962
+ df = DataFrame(np.random.rand(5, 5), columns=list('aaaaa'))
+ axes = df.plot(subplots=True)
+ for ax in axes:
+ self._check_legend_labels(ax, labels=['a'])
+ assert len(ax.lines) == 1
+ tm.close()
+
+ axes = df.plot(subplots=True, secondary_y='a')
+ for ax in axes:
+ # (right) is only attached when subplots=False
+ self._check_legend_labels(ax, labels=['a'])
+ assert len(ax.lines) == 1
+ tm.close()
+
+ ax = df.plot(secondary_y='a')
+ self._check_legend_labels(ax, labels=['a (right)'] * 5)
+ assert len(ax.lines) == 0
+ assert len(ax.right_ax.lines) == 5
+
+ def test_negative_log(self):
+ df = - DataFrame(rand(6, 4),
+ index=list(string.ascii_letters[:6]),
+ columns=['x', 'y', 'z', 'four'])
+
+ with pytest.raises(ValueError):
+ df.plot.area(logy=True)
+ with pytest.raises(ValueError):
+ df.plot.area(loglog=True)
+
+ def _compare_stacked_y_cood(self, normal_lines, stacked_lines):
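+ # the running sum of the unstacked lines' y-data should equal each
+ # stacked line's y-data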
+ base = np.zeros(len(normal_lines[0].get_data()[1]))
+ for nl, sl in zip(normal_lines, stacked_lines):
+ base += nl.get_data()[1] # get y coordinates
+ sy = sl.get_data()[1]
+ tm.assert_numpy_array_equal(base, sy)
+
+ def test_line_area_stacked(self):
+ with tm.RNGContext(42):
+ df = DataFrame(rand(6, 4), columns=['w', 'x', 'y', 'z'])
+ neg_df = -df
+ # each column has either all-positive or all-negative values
+ sep_df = DataFrame({'w': rand(6),
+ 'x': rand(6),
+ 'y': -rand(6),
+ 'z': -rand(6)})
+ # each column has mixed positive and negative values
+ mixed_df = DataFrame(randn(6, 4),
+ index=list(string.ascii_letters[:6]),
+ columns=['w', 'x', 'y', 'z'])
+
+ for kind in ['line', 'area']:
+ ax1 = _check_plot_works(df.plot, kind=kind, stacked=False)
+ ax2 = _check_plot_works(df.plot, kind=kind, stacked=True)
+ self._compare_stacked_y_cood(ax1.lines, ax2.lines)
+
+ ax1 = _check_plot_works(neg_df.plot, kind=kind, stacked=False)
+ ax2 = _check_plot_works(neg_df.plot, kind=kind, stacked=True)
+ self._compare_stacked_y_cood(ax1.lines, ax2.lines)
+
+ ax1 = _check_plot_works(sep_df.plot, kind=kind, stacked=False)
+ ax2 = _check_plot_works(sep_df.plot, kind=kind, stacked=True)
+ self._compare_stacked_y_cood(ax1.lines[:2], ax2.lines[:2])
+ self._compare_stacked_y_cood(ax1.lines[2:], ax2.lines[2:])
+
+ _check_plot_works(mixed_df.plot, stacked=False)
+ with pytest.raises(ValueError):
+ mixed_df.plot(stacked=True)
+
+ _check_plot_works(df.plot, kind=kind, logx=True, stacked=True)
+
+ def test_line_area_nan_df(self):
+ values1 = [1, 2, np.nan, 3]
+ values2 = [3, np.nan, 2, 1]
+ df = DataFrame({'a': values1, 'b': values2})
+ tdf = DataFrame({'a': values1,
+ 'b': values2}, index=tm.makeDateIndex(k=4))
+
+ for d in [df, tdf]:
+ ax = _check_plot_works(d.plot)
+ masked1 = ax.lines[0].get_ydata()
+ masked2 = ax.lines[1].get_ydata()
+ # remove nan for comparison purposes
+
+ exp = np.array([1, 2, 3], dtype=np.float64)
+ tm.assert_numpy_array_equal(np.delete(masked1.data, 2), exp)
+
+ exp = np.array([3, 2, 1], dtype=np.float64)
+ tm.assert_numpy_array_equal(np.delete(masked2.data, 1), exp)
+ tm.assert_numpy_array_equal(
+ masked1.mask, np.array([False, False, True, False]))
+ tm.assert_numpy_array_equal(
+ masked2.mask, np.array([False, True, False, False]))
+
+ expected1 = np.array([1, 2, 0, 3], dtype=np.float64)
+ expected2 = np.array([3, 0, 2, 1], dtype=np.float64)
+
+ ax = _check_plot_works(d.plot, stacked=True)
+ tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
+ tm.assert_numpy_array_equal(ax.lines[1].get_ydata(),
+ expected1 + expected2)
+
+ ax = _check_plot_works(d.plot.area)
+ tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
+ tm.assert_numpy_array_equal(ax.lines[1].get_ydata(),
+ expected1 + expected2)
+
+ ax = _check_plot_works(d.plot.area, stacked=False)
+ tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
+ tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected2)
+
+ def test_line_lim(self):
+ df = DataFrame(rand(6, 3), columns=['x', 'y', 'z'])
+ ax = df.plot()
+ xmin, xmax = ax.get_xlim()
+ lines = ax.get_lines()
+ assert xmin <= lines[0].get_data()[0][0]
+ assert xmax >= lines[0].get_data()[0][-1]
+
+ ax = df.plot(secondary_y=True)
+ xmin, xmax = ax.get_xlim()
+ lines = ax.get_lines()
+ assert xmin <= lines[0].get_data()[0][0]
+ assert xmax >= lines[0].get_data()[0][-1]
+
+ axes = df.plot(secondary_y=True, subplots=True)
+ self._check_axes_shape(axes, axes_num=3, layout=(3, 1))
+ for ax in axes:
+ assert hasattr(ax, 'left_ax')
+ assert not hasattr(ax, 'right_ax')
+ xmin, xmax = ax.get_xlim()
+ lines = ax.get_lines()
+ assert xmin <= lines[0].get_data()[0][0]
+ assert xmax >= lines[0].get_data()[0][-1]
+
+ def test_area_lim(self):
+ df = DataFrame(rand(6, 4), columns=['x', 'y', 'z', 'four'])
+
+ neg_df = -df
+ for stacked in [True, False]:
+ ax = _check_plot_works(df.plot.area, stacked=stacked)
+ xmin, xmax = ax.get_xlim()
+ ymin, ymax = ax.get_ylim()
+ lines = ax.get_lines()
+ assert xmin <= lines[0].get_data()[0][0]
+ assert xmax >= lines[0].get_data()[0][-1]
+ assert ymin == 0
+
+ ax = _check_plot_works(neg_df.plot.area, stacked=stacked)
+ ymin, ymax = ax.get_ylim()
+ assert ymax == 0
+
+ @pytest.mark.slow
+ def test_bar_colors(self):
+ import matplotlib.pyplot as plt
+ default_colors = self._unpack_cycler(plt.rcParams)
+
+ df = DataFrame(randn(5, 5))
+ ax = df.plot.bar()
+ self._check_colors(ax.patches[::5], facecolors=default_colors[:5])
+ tm.close()
+
+ custom_colors = 'rgcby'
+ ax = df.plot.bar(color=custom_colors)
+ self._check_colors(ax.patches[::5], facecolors=custom_colors)
+ tm.close()
+
+ from matplotlib import cm
+ # Test str -> colormap functionality
+ ax = df.plot.bar(colormap='jet')
+ rgba_colors = lmap(cm.jet, np.linspace(0, 1, 5))
+ self._check_colors(ax.patches[::5], facecolors=rgba_colors)
+ tm.close()
+
+ # Test colormap functionality
+ ax = df.plot.bar(colormap=cm.jet)
+ rgba_colors = lmap(cm.jet, np.linspace(0, 1, 5))
+ self._check_colors(ax.patches[::5], facecolors=rgba_colors)
+ tm.close()
+
+ ax = df.loc[:, [0]].plot.bar(color='DodgerBlue')
+ self._check_colors([ax.patches[0]], facecolors=['DodgerBlue'])
+ tm.close()
+
+ ax = df.plot(kind='bar', color='green')
+ self._check_colors(ax.patches[::5], facecolors=['green'] * 5)
+ tm.close()
+
+ def test_bar_user_colors(self):
+ df = pd.DataFrame({"A": range(4),
+ "B": range(1, 5),
+ "color": ['red', 'blue', 'blue', 'red']})
+ # This should *only* work when `y` is specified, else
+ # we use one color per column
+ ax = df.plot.bar(y='A', color=df['color'])
+ result = [p.get_facecolor() for p in ax.patches]
+ expected = [(1., 0., 0., 1.),
+ (0., 0., 1., 1.),
+ (0., 0., 1., 1.),
+ (1., 0., 0., 1.)]
+ assert result == expected
+
+ @pytest.mark.slow
+ def test_bar_linewidth(self):
+ df = DataFrame(randn(5, 5))
+
+ # regular
+ ax = df.plot.bar(linewidth=2)
+ for r in ax.patches:
+ assert r.get_linewidth() == 2
+
+ # stacked
+ ax = df.plot.bar(stacked=True, linewidth=2)
+ for r in ax.patches:
+ assert r.get_linewidth() == 2
+
+ # subplots
+ axes = df.plot.bar(linewidth=2, subplots=True)
+ self._check_axes_shape(axes, axes_num=5, layout=(5, 1))
+ for ax in axes:
+ for r in ax.patches:
+ assert r.get_linewidth() == 2
+
+ @pytest.mark.slow
+ def test_bar_barwidth(self):
+ df = DataFrame(randn(5, 5))
+
+ width = 0.9
+
+ # regular
+ ax = df.plot.bar(width=width)
+ for r in ax.patches:
+ assert r.get_width() == width / len(df.columns)
+
+ # stacked
+ ax = df.plot.bar(stacked=True, width=width)
+ for r in ax.patches:
+ assert r.get_width() == width
+
+ # horizontal regular
+ ax = df.plot.barh(width=width)
+ for r in ax.patches:
+ assert r.get_height() == width / len(df.columns)
+
+ # horizontal stacked
+ ax = df.plot.barh(stacked=True, width=width)
+ for r in ax.patches:
+ assert r.get_height() == width
+
+ # subplots
+ axes = df.plot.bar(width=width, subplots=True)
+ for ax in axes:
+ for r in ax.patches:
+ assert r.get_width() == width
+
+ # horizontal subplots
+ axes = df.plot.barh(width=width, subplots=True)
+ for ax in axes:
+ for r in ax.patches:
+ assert r.get_height() == width
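+
+ # Illustrative note: a grouped bar plot splits `width` evenly across
+ # the columns (0.9 / 5 == 0.18 per bar here), while stacked and
+ # subplot bars each keep the full width because there is one bar per
+ # tick.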
+
+ @pytest.mark.slow
+ def test_bar_barwidth_position(self):
+ df = DataFrame(randn(5, 5))
+ self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9,
+ position=0.2)
+ self._check_bar_alignment(df, kind='bar', stacked=True, width=0.9,
+ position=0.2)
+ self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9,
+ position=0.2)
+ self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9,
+ position=0.2)
+ self._check_bar_alignment(df, kind='bar', subplots=True, width=0.9,
+ position=0.2)
+ self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9,
+ position=0.2)
+
+ @pytest.mark.slow
+ def test_bar_barwidth_position_int(self):
+ # GH 12979
+ df = DataFrame(randn(5, 5))
+
+ for w in [1, 1.]:
+ ax = df.plot.bar(stacked=True, width=w)
+ ticks = ax.xaxis.get_ticklocs()
+ tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4]))
+ assert ax.get_xlim() == (-0.75, 4.75)
+ # check left-edge of bars
+ assert ax.patches[0].get_x() == -0.5
+ assert ax.patches[-1].get_x() == 3.5
+
+ self._check_bar_alignment(df, kind='bar', stacked=True, width=1)
+ self._check_bar_alignment(df, kind='barh', stacked=False, width=1)
+ self._check_bar_alignment(df, kind='barh', stacked=True, width=1)
+ self._check_bar_alignment(df, kind='bar', subplots=True, width=1)
+ self._check_bar_alignment(df, kind='barh', subplots=True, width=1)
+
+ @pytest.mark.slow
+ def test_bar_bottom_left(self):
+ df = DataFrame(rand(5, 5))
+ ax = df.plot.bar(stacked=False, bottom=1)
+ result = [p.get_y() for p in ax.patches]
+ assert result == [1] * 25
+
+ ax = df.plot.bar(stacked=True, bottom=[-1, -2, -3, -4, -5])
+ result = [p.get_y() for p in ax.patches[:5]]
+ assert result == [-1, -2, -3, -4, -5]
+
+ ax = df.plot.barh(stacked=False, left=np.array([1, 1, 1, 1, 1]))
+ result = [p.get_x() for p in ax.patches]
+ assert result == [1] * 25
+
+ ax = df.plot.barh(stacked=True, left=[1, 2, 3, 4, 5])
+ result = [p.get_x() for p in ax.patches[:5]]
+ assert result == [1, 2, 3, 4, 5]
+
+ axes = df.plot.bar(subplots=True, bottom=-1)
+ for ax in axes:
+ result = [p.get_y() for p in ax.patches]
+ assert result == [-1] * 5
+
+ axes = df.plot.barh(subplots=True, left=np.array([1, 1, 1, 1, 1]))
+ for ax in axes:
+ result = [p.get_x() for p in ax.patches]
+ assert result == [1] * 5
+
+ @pytest.mark.slow
+ def test_bar_nan(self):
+ df = DataFrame({'A': [10, np.nan, 20],
+ 'B': [5, 10, 20],
+ 'C': [1, 2, 3]})
+ ax = df.plot.bar()
+ expected = [10, 0, 20, 5, 10, 20, 1, 2, 3]
+ result = [p.get_height() for p in ax.patches]
+ assert result == expected
+
+ ax = df.plot.bar(stacked=True)
+ result = [p.get_height() for p in ax.patches]
+ assert result == expected
+
+ result = [p.get_y() for p in ax.patches]
+ expected = [0.0, 0.0, 0.0, 10.0, 0.0, 20.0, 15.0, 10.0, 40.0]
+ assert result == expected
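+
+ # Illustrative note: NaN bars are drawn with height 0, and in the
+ # stacked case they contribute 0 to the bars above them, e.g. column
+ # B's bottoms are [10, 0, 20] because A's NaN became a 0-height bar.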
+
+ @pytest.mark.slow
+ def test_bar_categorical(self):
+ # GH 13019
+ df1 = pd.DataFrame(np.random.randn(6, 5),
+ index=pd.Index(list('ABCDEF')),
+ columns=pd.Index(list('abcde')))
+ # categorical index must behave the same
+ df2 = pd.DataFrame(np.random.randn(6, 5),
+ index=pd.CategoricalIndex(list('ABCDEF')),
+ columns=pd.CategoricalIndex(list('abcde')))
+
+ for df in [df1, df2]:
+ ax = df.plot.bar()
+ ticks = ax.xaxis.get_ticklocs()
+ tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4, 5]))
+ assert ax.get_xlim() == (-0.5, 5.5)
+ # check left-edge of bars
+ assert ax.patches[0].get_x() == -0.25
+ assert ax.patches[-1].get_x() == 5.15
+
+ ax = df.plot.bar(stacked=True)
+ tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4, 5]))
+ assert ax.get_xlim() == (-0.5, 5.5)
+ assert ax.patches[0].get_x() == -0.25
+ assert ax.patches[-1].get_x() == 4.75
+
+ @pytest.mark.slow
+ def test_plot_scatter(self):
+ df = DataFrame(randn(6, 4),
+ index=list(string.ascii_letters[:6]),
+ columns=['x', 'y', 'z', 'four'])
+
+ _check_plot_works(df.plot.scatter, x='x', y='y')
+ _check_plot_works(df.plot.scatter, x=1, y=2)
+
+ with pytest.raises(TypeError):
+ df.plot.scatter(x='x')
+ with pytest.raises(TypeError):
+ df.plot.scatter(y='y')
+
+ # GH 6951
+ axes = df.plot(x='x', y='y', kind='scatter', subplots=True)
+ self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+
+ @pytest.mark.slow
+ def test_if_scatterplot_colorbar_affects_xaxis_visibility(self):
+ # addressing issue #10611, to ensure the colorbar does not
+ # interfere with the x-axis label and ticklabels with the
+ # ipython inline backend.
+ random_array = np.random.random((1000, 3))
+ df = pd.DataFrame(random_array,
+ columns=['A label', 'B label', 'C label'])
+
+ ax1 = df.plot.scatter(x='A label', y='B label')
+ ax2 = df.plot.scatter(x='A label', y='B label', c='C label')
+
+ vis1 = [vis.get_visible() for vis in
+ ax1.xaxis.get_minorticklabels()]
+ vis2 = [vis.get_visible() for vis in
+ ax2.xaxis.get_minorticklabels()]
+ assert vis1 == vis2
+
+ vis1 = [vis.get_visible() for vis in
+ ax1.xaxis.get_majorticklabels()]
+ vis2 = [vis.get_visible() for vis in
+ ax2.xaxis.get_majorticklabels()]
+ assert vis1 == vis2
+
+ assert (ax1.xaxis.get_label().get_visible() ==
+ ax2.xaxis.get_label().get_visible())
+
+ @pytest.mark.slow
+ def test_if_hexbin_xaxis_label_is_visible(self):
+ # addressing issue #10678, to ensure the colorbar does not
+ # interfere with the x-axis label and ticklabels with the
+ # ipython inline backend.
+ random_array = np.random.random((1000, 3))
+ df = pd.DataFrame(random_array,
+ columns=['A label', 'B label', 'C label'])
+
+ ax = df.plot.hexbin('A label', 'B label', gridsize=12)
+ assert all(vis.get_visible() for vis in
+ ax.xaxis.get_minorticklabels())
+ assert all(vis.get_visible() for vis in
+ ax.xaxis.get_majorticklabels())
+ assert ax.xaxis.get_label().get_visible()
+
+ @pytest.mark.slow
+ def test_if_scatterplot_colorbars_are_next_to_parent_axes(self):
+ import matplotlib.pyplot as plt
+ random_array = np.random.random((1000, 3))
+ df = pd.DataFrame(random_array,
+ columns=['A label', 'B label', 'C label'])
+
+ fig, axes = plt.subplots(1, 2)
+ df.plot.scatter('A label', 'B label', c='C label', ax=axes[0])
+ df.plot.scatter('A label', 'B label', c='C label', ax=axes[1])
+ plt.tight_layout()
+
+ points = np.array([ax.get_position().get_points()
+ for ax in fig.axes])
+ axes_x_coords = points[:, :, 0]
+ parent_distance = axes_x_coords[1, :] - axes_x_coords[0, :]
+ colorbar_distance = axes_x_coords[3, :] - axes_x_coords[2, :]
+ assert np.isclose(parent_distance,
+ colorbar_distance, atol=1e-7).all()
+
+ @pytest.mark.slow
+ def test_plot_scatter_with_categorical_data(self):
+ # GH 16199
+ df = pd.DataFrame({'x': [1, 2, 3, 4],
+ 'y': pd.Categorical(['a', 'b', 'a', 'c'])})
+
+ with pytest.raises(ValueError) as ve:
+ df.plot(x='x', y='y', kind='scatter')
+ ve.match('requires y column to be numeric')
+
+ with pytest.raises(ValueError) as ve:
+ df.plot(x='y', y='x', kind='scatter')
+ ve.match('requires x column to be numeric')
+
+ with pytest.raises(ValueError) as ve:
+ df.plot(x='y', y='y', kind='scatter')
+ ve.match('requires x column to be numeric')
+
+ @pytest.mark.slow
+ def test_plot_scatter_with_c(self):
+ df = DataFrame(randn(6, 4),
+ index=list(string.ascii_letters[:6]),
+ columns=['x', 'y', 'z', 'four'])
+
+ axes = [df.plot.scatter(x='x', y='y', c='z'),
+ df.plot.scatter(x=0, y=1, c=2)]
+ for ax in axes:
+ # default to Greys
+ assert ax.collections[0].cmap.name == 'Greys'
+
+ # n.b. there appears to be no public method
+ # to get the colorbar label
+ assert ax.collections[0].colorbar._label == 'z'
+
+ cm = 'cubehelix'
+ ax = df.plot.scatter(x='x', y='y', c='z', colormap=cm)
+ assert ax.collections[0].cmap.name == cm
+
+ # verify turning off colorbar works
+ ax = df.plot.scatter(x='x', y='y', c='z', colorbar=False)
+ assert ax.collections[0].colorbar is None
+
+ # verify that we can still plot a solid color
+ ax = df.plot.scatter(x=0, y=1, c='red')
+ assert ax.collections[0].colorbar is None
+ self._check_colors(ax.collections, facecolors=['r'])
+
+ # Ensure that we can pass an np.array straight through to matplotlib,
+ # this functionality was accidentally removed previously.
+ # See https://github.com/pandas-dev/pandas/issues/8852 for bug report
+ #
+ # Exercise colormap path and non-colormap path as they are independent
+ #
+ df = DataFrame({'A': [1, 2], 'B': [3, 4]})
+ red_rgba = [1.0, 0.0, 0.0, 1.0]
+ green_rgba = [0.0, 1.0, 0.0, 1.0]
+ rgba_array = np.array([red_rgba, green_rgba])
+ ax = df.plot.scatter(x='A', y='B', c=rgba_array)
+ # expect the face colors of the points in the non-colormap path to be
+ # identical to the values we supplied; normally we'd be on shaky ground
+ # comparing floats for equality, but here we expect them to be
+ # identical.
+ tm.assert_numpy_array_equal(ax.collections[0]
+ .get_facecolor(), rgba_array)
+ # we don't test the colors of the faces in this next plot because they
+ # are dependent on the spring colormap, which may change its colors
+ # later.
+ float_array = np.array([0.0, 1.0])
+ df.plot.scatter(x='A', y='B', c=float_array, cmap='spring')
+
+ def test_scatter_colors(self):
+ df = DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3], 'c': [1, 2, 3]})
+ with pytest.raises(TypeError):
+ df.plot.scatter(x='a', y='b', c='c', color='green')
+
+ default_colors = self._unpack_cycler(self.plt.rcParams)
+
+ ax = df.plot.scatter(x='a', y='b', c='c')
+ tm.assert_numpy_array_equal(
+ ax.collections[0].get_facecolor()[0],
+ np.array(self.colorconverter.to_rgba(default_colors[0])))
+
+ ax = df.plot.scatter(x='a', y='b', color='white')
+ tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0],
+ np.array([1, 1, 1, 1], dtype=np.float64))
+
+ @pytest.mark.slow
+ def test_plot_bar(self):
+ df = DataFrame(randn(6, 4),
+ index=list(string.ascii_letters[:6]),
+ columns=['one', 'two', 'three', 'four'])
+
+ _check_plot_works(df.plot.bar)
+ _check_plot_works(df.plot.bar, legend=False)
+ # _check_plot_works adds an ax, so catch the warning; see GH #13188
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(df.plot.bar, subplots=True)
+ _check_plot_works(df.plot.bar, stacked=True)
+
+ df = DataFrame(randn(10, 15),
+ index=list(string.ascii_letters[:10]),
+ columns=lrange(15))
+ _check_plot_works(df.plot.bar)
+
+ df = DataFrame({'a': [0, 1], 'b': [1, 0]})
+ ax = _check_plot_works(df.plot.bar)
+ self._check_ticks_props(ax, xrot=90)
+
+ ax = df.plot.bar(rot=35, fontsize=10)
+ self._check_ticks_props(ax, xrot=35, xlabelsize=10, ylabelsize=10)
+
+ ax = _check_plot_works(df.plot.barh)
+ self._check_ticks_props(ax, yrot=0)
+
+ ax = df.plot.barh(rot=55, fontsize=11)
+ self._check_ticks_props(ax, yrot=55, ylabelsize=11, xlabelsize=11)
+
+ def _check_bar_alignment(self, df, kind='bar', stacked=False,
+ subplots=False, align='center', width=0.5,
+ position=0.5):
+
+ axes = df.plot(kind=kind, stacked=stacked, subplots=subplots,
+ align=align, width=width, position=position, grid=True)
+
+ axes = self._flatten_visible(axes)
+
+ for ax in axes:
+ if kind == 'bar':
+ axis = ax.xaxis
+ ax_min, ax_max = ax.get_xlim()
+ min_edge = min(p.get_x() for p in ax.patches)
+ max_edge = max(p.get_x() + p.get_width() for p in ax.patches)
+ elif kind == 'barh':
+ axis = ax.yaxis
+ ax_min, ax_max = ax.get_ylim()
+ min_edge = min(p.get_y() for p in ax.patches)
+ max_edge = max(p.get_y() + p.get_height() for p in ax.patches)
+ else:
+ raise ValueError
+
+ # GH 7498
+ # compare margins between lim and bar edges
+ tm.assert_almost_equal(ax_min, min_edge - 0.25)
+ tm.assert_almost_equal(ax_max, max_edge + 0.25)
+
+ p = ax.patches[0]
+ if kind == 'bar' and (stacked is True or subplots is True):
+ edge = p.get_x()
+ center = edge + p.get_width() * position
+ elif kind == 'bar' and stacked is False:
+ center = p.get_x() + p.get_width() * len(df.columns) * position
+ edge = p.get_x()
+ elif kind == 'barh' and (stacked is True or subplots is True):
+ center = p.get_y() + p.get_height() * position
+ edge = p.get_y()
+ elif kind == 'barh' and stacked is False:
+ center = p.get_y() + p.get_height() * len(
+ df.columns) * position
+ edge = p.get_y()
+ else:
+ raise ValueError
+
+ # Check that the ticks sit at integer locations
+ assert (axis.get_ticklocs() == np.arange(len(df))).all()
+
+ if align == 'center':
+ # Check whether the bar is centered on the tick
+ tm.assert_almost_equal(axis.get_ticklocs()[0], center)
+ elif align == 'edge':
+ # Check whether the bar's edge starts from the tick
+ tm.assert_almost_equal(axis.get_ticklocs()[0], edge)
+ else:
+ raise ValueError
+
+ return axes
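+
+ # Illustrative note: for unstacked bars the group of len(df.columns)
+ # bars spans width * len(df.columns) starting at the first bar's edge,
+ # so the tick sits at edge + group_width * position (position=0.5
+ # means the group is centered on the tick).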
+
+ @pytest.mark.slow
+ def test_bar_stacked_center(self):
+ # GH2157
+ df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5))
+ self._check_bar_alignment(df, kind='bar', stacked=True)
+ self._check_bar_alignment(df, kind='bar', stacked=True, width=0.9)
+ self._check_bar_alignment(df, kind='barh', stacked=True)
+ self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9)
+
+ @pytest.mark.slow
+ def test_bar_center(self):
+ df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5))
+ self._check_bar_alignment(df, kind='bar', stacked=False)
+ self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9)
+ self._check_bar_alignment(df, kind='barh', stacked=False)
+ self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9)
+
+ @pytest.mark.slow
+ def test_bar_subplots_center(self):
+ df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5))
+ self._check_bar_alignment(df, kind='bar', subplots=True)
+ self._check_bar_alignment(df, kind='bar', subplots=True, width=0.9)
+ self._check_bar_alignment(df, kind='barh', subplots=True)
+ self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9)
+
+ @pytest.mark.slow
+ def test_bar_align_single_column(self):
+ df = DataFrame(randn(5))
+ self._check_bar_alignment(df, kind='bar', stacked=False)
+ self._check_bar_alignment(df, kind='bar', stacked=True)
+ self._check_bar_alignment(df, kind='barh', stacked=False)
+ self._check_bar_alignment(df, kind='barh', stacked=True)
+ self._check_bar_alignment(df, kind='bar', subplots=True)
+ self._check_bar_alignment(df, kind='barh', subplots=True)
+
+ @pytest.mark.slow
+ def test_bar_edge(self):
+ df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5))
+
+ self._check_bar_alignment(df, kind='bar', stacked=True, align='edge')
+ self._check_bar_alignment(df, kind='bar', stacked=True, width=0.9,
+ align='edge')
+ self._check_bar_alignment(df, kind='barh', stacked=True, align='edge')
+ self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9,
+ align='edge')
+
+ self._check_bar_alignment(df, kind='bar', stacked=False, align='edge')
+ self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9,
+ align='edge')
+ self._check_bar_alignment(df, kind='barh', stacked=False, align='edge')
+ self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9,
+ align='edge')
+
+ self._check_bar_alignment(df, kind='bar', subplots=True, align='edge')
+ self._check_bar_alignment(df, kind='bar', subplots=True, width=0.9,
+ align='edge')
+ self._check_bar_alignment(df, kind='barh', subplots=True, align='edge')
+ self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9,
+ align='edge')
+
+ @pytest.mark.slow
+ def test_bar_log_no_subplots(self):
+ # GH3254, GH3298 matplotlib/matplotlib#1882, #1892
+ # regressions in 1.2.1
+ expected = np.array([.1, 1., 10., 100])
+
+ # no subplots
+ df = DataFrame({'A': [3] * 5, 'B': lrange(1, 6)}, index=lrange(5))
+ ax = df.plot.bar(grid=True, log=True)
+ tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected)
+
+ @pytest.mark.slow
+ def test_bar_log_subplots(self):
+ expected = np.array([.1, 1., 10., 100., 1000., 1e4])
+
+ ax = DataFrame([Series([200, 300]), Series([300, 500])]).plot.bar(
+ log=True, subplots=True)
+
+ tm.assert_numpy_array_equal(ax[0].yaxis.get_ticklocs(), expected)
+ tm.assert_numpy_array_equal(ax[1].yaxis.get_ticklocs(), expected)
+
+ @pytest.mark.slow
+ def test_boxplot(self):
+ df = self.hist_df
+ series = df['height']
+ numeric_cols = df._get_numeric_data().columns
+ labels = [pprint_thing(c) for c in numeric_cols]
+
+ ax = _check_plot_works(df.plot.box)
+ self._check_text_labels(ax.get_xticklabels(), labels)
+ tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(),
+ np.arange(1, len(numeric_cols) + 1))
+ assert len(ax.lines) == self.bp_n_objects * len(numeric_cols)
+
+ # different warning on py3
+ if not PY3:
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.plot.box, subplots=True, logy=True)
+
+ self._check_axes_shape(axes, axes_num=3, layout=(1, 3))
+ self._check_ax_scales(axes, yaxis='log')
+ for ax, label in zip(axes, labels):
+ self._check_text_labels(ax.get_xticklabels(), [label])
+ assert len(ax.lines) == self.bp_n_objects
+
+ axes = series.plot.box(rot=40)
+ self._check_ticks_props(axes, xrot=40, yrot=0)
+ tm.close()
+
+ ax = _check_plot_works(series.plot.box)
+
+ positions = np.array([1, 6, 7])
+ ax = df.plot.box(positions=positions)
+ numeric_cols = df._get_numeric_data().columns
+ labels = [pprint_thing(c) for c in numeric_cols]
+ self._check_text_labels(ax.get_xticklabels(), labels)
+ tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions)
+ assert len(ax.lines) == self.bp_n_objects * len(numeric_cols)
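+
+ # Illustrative note: self.bp_n_objects is the number of Line2D artists
+ # matplotlib draws per box (two whiskers, two caps, the box, the
+ # median and the fliers), so N numeric columns yield
+ # bp_n_objects * N lines.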
+
+ @pytest.mark.slow
+ def test_boxplot_vertical(self):
+ df = self.hist_df
+ numeric_cols = df._get_numeric_data().columns
+ labels = [pprint_thing(c) for c in numeric_cols]
+
+ # if horizontal, yticklabels are rotated
+ ax = df.plot.box(rot=50, fontsize=8, vert=False)
+ self._check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8)
+ self._check_text_labels(ax.get_yticklabels(), labels)
+ assert len(ax.lines) == self.bp_n_objects * len(numeric_cols)
+
+ # _check_plot_works adds an ax, so catch the warning; see GH #13188
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.plot.box,
+ subplots=True, vert=False, logx=True)
+ self._check_axes_shape(axes, axes_num=3, layout=(1, 3))
+ self._check_ax_scales(axes, xaxis='log')
+ for ax, label in zip(axes, labels):
+ self._check_text_labels(ax.get_yticklabels(), [label])
+ assert len(ax.lines) == self.bp_n_objects
+
+ positions = np.array([3, 2, 8])
+ ax = df.plot.box(positions=positions, vert=False)
+ self._check_text_labels(ax.get_yticklabels(), labels)
+ tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions)
+ assert len(ax.lines) == self.bp_n_objects * len(numeric_cols)
+
+ @pytest.mark.slow
+ def test_boxplot_return_type(self):
+ df = DataFrame(randn(6, 4),
+ index=list(string.ascii_letters[:6]),
+ columns=['one', 'two', 'three', 'four'])
+ with pytest.raises(ValueError):
+ df.plot.box(return_type='NOTATYPE')
+
+ result = df.plot.box(return_type='dict')
+ self._check_box_return_type(result, 'dict')
+
+ result = df.plot.box(return_type='axes')
+ self._check_box_return_type(result, 'axes')
+
+ result = df.plot.box() # default axes
+ self._check_box_return_type(result, 'axes')
+
+ result = df.plot.box(return_type='both')
+ self._check_box_return_type(result, 'both')
+
+ @pytest.mark.slow
+ def test_boxplot_subplots_return_type(self):
+ df = self.hist_df
+
+ # normal style: return_type=None
+ result = df.plot.box(subplots=True)
+ assert isinstance(result, Series)
+ self._check_box_return_type(result, None, expected_keys=[
+ 'height', 'weight', 'category'])
+
+ for t in ['dict', 'axes', 'both']:
+ returned = df.plot.box(return_type=t, subplots=True)
+ self._check_box_return_type(
+ returned, t,
+ expected_keys=['height', 'weight', 'category'],
+ check_ax_title=False)
+
+ @pytest.mark.slow
+ @td.skip_if_no_scipy
+ def test_kde_df(self):
+ _skip_if_no_scipy_gaussian_kde()
+
+ df = DataFrame(randn(100, 4))
+ ax = _check_plot_works(df.plot, kind='kde')
+ expected = [pprint_thing(c) for c in df.columns]
+ self._check_legend_labels(ax, labels=expected)
+ self._check_ticks_props(ax, xrot=0)
+
+ ax = df.plot(kind='kde', rot=20, fontsize=5)
+ self._check_ticks_props(ax, xrot=20, xlabelsize=5, ylabelsize=5)
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.plot, kind='kde',
+ subplots=True)
+ self._check_axes_shape(axes, axes_num=4, layout=(4, 1))
+
+ axes = df.plot(kind='kde', logy=True, subplots=True)
+ self._check_ax_scales(axes, yaxis='log')
+
+ @pytest.mark.slow
+ @td.skip_if_no_scipy
+ def test_kde_missing_vals(self):
+ _skip_if_no_scipy_gaussian_kde()
+
+ df = DataFrame(np.random.uniform(size=(100, 4)))
+ df.loc[0, 0] = np.nan
+ _check_plot_works(df.plot, kind='kde')
+
+ @pytest.mark.slow
+ def test_hist_df(self):
+ from matplotlib.patches import Rectangle
+
+ df = DataFrame(randn(100, 4))
+ series = df[0]
+
+ ax = _check_plot_works(df.plot.hist)
+ expected = [pprint_thing(c) for c in df.columns]
+ self._check_legend_labels(ax, labels=expected)
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.plot.hist,
+ subplots=True, logy=True)
+ self._check_axes_shape(axes, axes_num=4, layout=(4, 1))
+ self._check_ax_scales(axes, yaxis='log')
+
+ axes = series.plot.hist(rot=40)
+ self._check_ticks_props(axes, xrot=40, yrot=0)
+ tm.close()
+
+ if plotting._compat._mpl_ge_2_2_0():
+ kwargs = {"density": True}
+ else:
+ kwargs = {"normed": True}
+ ax = series.plot.hist(cumulative=True, bins=4, **kwargs)
+ # the height of the last bin must be 1.0
+ rects = [x for x in ax.get_children() if isinstance(x, Rectangle)]
+ tm.assert_almost_equal(rects[-1].get_height(), 1.0)
+ tm.close()
+
+ ax = series.plot.hist(cumulative=True, bins=4)
+ rects = [x for x in ax.get_children() if isinstance(x, Rectangle)]
+
+ tm.assert_almost_equal(rects[-2].get_height(), 100.0)
+ tm.close()
+
+ # if horizontal, yticklabels are rotated
+ axes = df.plot.hist(rot=50, fontsize=8, orientation='horizontal')
+ self._check_ticks_props(axes, xrot=0, yrot=50, ylabelsize=8)
+
+ def _check_box_coord(self, patches, expected_y=None, expected_h=None,
+ expected_x=None, expected_w=None):
+ result_y = np.array([p.get_y() for p in patches])
+ result_height = np.array([p.get_height() for p in patches])
+ result_x = np.array([p.get_x() for p in patches])
+ result_width = np.array([p.get_width() for p in patches])
+ # dtype depends on the values above, so there is no need to check it
+
+ if expected_y is not None:
+ tm.assert_numpy_array_equal(result_y, expected_y,
+ check_dtype=False)
+ if expected_h is not None:
+ tm.assert_numpy_array_equal(result_height, expected_h,
+ check_dtype=False)
+ if expected_x is not None:
+ tm.assert_numpy_array_equal(result_x, expected_x,
+ check_dtype=False)
+ if expected_w is not None:
+ tm.assert_numpy_array_equal(result_width, expected_w,
+ check_dtype=False)
+
+ @pytest.mark.slow
+ def test_hist_df_coord(self):
+ normal_df = DataFrame({'A': np.repeat(np.array([1, 2, 3, 4, 5]),
+ np.array([10, 9, 8, 7, 6])),
+ 'B': np.repeat(np.array([1, 2, 3, 4, 5]),
+ np.array([8, 8, 8, 8, 8])),
+ 'C': np.repeat(np.array([1, 2, 3, 4, 5]),
+ np.array([6, 7, 8, 9, 10]))},
+ columns=['A', 'B', 'C'])
+
+ nan_df = DataFrame({'A': np.repeat(np.array([np.nan, 1, 2, 3, 4, 5]),
+ np.array([3, 10, 9, 8, 7, 6])),
+ 'B': np.repeat(np.array([1, np.nan, 2, 3, 4, 5]),
+ np.array([8, 3, 8, 8, 8, 8])),
+ 'C': np.repeat(np.array([1, 2, 3, np.nan, 4, 5]),
+ np.array([6, 7, 8, 3, 9, 10]))},
+ columns=['A', 'B', 'C'])
+
+ for df in [normal_df, nan_df]:
+ ax = df.plot.hist(bins=5)
+ self._check_box_coord(ax.patches[:5],
+ expected_y=np.array([0, 0, 0, 0, 0]),
+ expected_h=np.array([10, 9, 8, 7, 6]))
+ self._check_box_coord(ax.patches[5:10],
+ expected_y=np.array([0, 0, 0, 0, 0]),
+ expected_h=np.array([8, 8, 8, 8, 8]))
+ self._check_box_coord(ax.patches[10:],
+ expected_y=np.array([0, 0, 0, 0, 0]),
+ expected_h=np.array([6, 7, 8, 9, 10]))
+
+ ax = df.plot.hist(bins=5, stacked=True)
+ self._check_box_coord(ax.patches[:5],
+ expected_y=np.array([0, 0, 0, 0, 0]),
+ expected_h=np.array([10, 9, 8, 7, 6]))
+ self._check_box_coord(ax.patches[5:10],
+ expected_y=np.array([10, 9, 8, 7, 6]),
+ expected_h=np.array([8, 8, 8, 8, 8]))
+ self._check_box_coord(ax.patches[10:],
+ expected_y=np.array([18, 17, 16, 15, 14]),
+ expected_h=np.array([6, 7, 8, 9, 10]))
+
+ axes = df.plot.hist(bins=5, stacked=True, subplots=True)
+ self._check_box_coord(axes[0].patches,
+ expected_y=np.array([0, 0, 0, 0, 0]),
+ expected_h=np.array([10, 9, 8, 7, 6]))
+ self._check_box_coord(axes[1].patches,
+ expected_y=np.array([0, 0, 0, 0, 0]),
+ expected_h=np.array([8, 8, 8, 8, 8]))
+ self._check_box_coord(axes[2].patches,
+ expected_y=np.array([0, 0, 0, 0, 0]),
+ expected_h=np.array([6, 7, 8, 9, 10]))
+
+ # horizontal
+ ax = df.plot.hist(bins=5, orientation='horizontal')
+ self._check_box_coord(ax.patches[:5],
+ expected_x=np.array([0, 0, 0, 0, 0]),
+ expected_w=np.array([10, 9, 8, 7, 6]))
+ self._check_box_coord(ax.patches[5:10],
+ expected_x=np.array([0, 0, 0, 0, 0]),
+ expected_w=np.array([8, 8, 8, 8, 8]))
+ self._check_box_coord(ax.patches[10:],
+ expected_x=np.array([0, 0, 0, 0, 0]),
+ expected_w=np.array([6, 7, 8, 9, 10]))
+
+ ax = df.plot.hist(bins=5, stacked=True,
+ orientation='horizontal')
+ self._check_box_coord(ax.patches[:5],
+ expected_x=np.array([0, 0, 0, 0, 0]),
+ expected_w=np.array([10, 9, 8, 7, 6]))
+ self._check_box_coord(ax.patches[5:10],
+ expected_x=np.array([10, 9, 8, 7, 6]),
+ expected_w=np.array([8, 8, 8, 8, 8]))
+ self._check_box_coord(
+ ax.patches[10:],
+ expected_x=np.array([18, 17, 16, 15, 14]),
+ expected_w=np.array([6, 7, 8, 9, 10]))
+
+ axes = df.plot.hist(bins=5, stacked=True, subplots=True,
+ orientation='horizontal')
+ self._check_box_coord(axes[0].patches,
+ expected_x=np.array([0, 0, 0, 0, 0]),
+ expected_w=np.array([10, 9, 8, 7, 6]))
+ self._check_box_coord(axes[1].patches,
+ expected_x=np.array([0, 0, 0, 0, 0]),
+ expected_w=np.array([8, 8, 8, 8, 8]))
+ self._check_box_coord(axes[2].patches,
+ expected_x=np.array([0, 0, 0, 0, 0]),
+ expected_w=np.array([6, 7, 8, 9, 10]))
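+
+ # Illustrative note: in the stacked case each column's bars start where
+ # the previous column's ended, so B's expected_y equals A's heights
+ # ([10, 9, 8, 7, 6]) and C's equals A + B ([18, 17, 16, 15, 14]).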
+
+ @pytest.mark.slow
+ def test_plot_int_columns(self):
+ df = DataFrame(randn(100, 4)).cumsum()
+ _check_plot_works(df.plot, legend=True)
+
+ @pytest.mark.slow
+ def test_df_legend_labels(self):
+ kinds = ['line', 'bar', 'barh', 'kde', 'area', 'hist']
+ df = DataFrame(rand(3, 3), columns=['a', 'b', 'c'])
+ df2 = DataFrame(rand(3, 3), columns=['d', 'e', 'f'])
+ df3 = DataFrame(rand(3, 3), columns=['g', 'h', 'i'])
+ df4 = DataFrame(rand(3, 3), columns=['j', 'k', 'l'])
+
+ for kind in kinds:
+ if not _ok_for_gaussian_kde(kind):
+ continue
+
+ ax = df.plot(kind=kind, legend=True)
+ self._check_legend_labels(ax, labels=df.columns)
+
+ ax = df2.plot(kind=kind, legend=False, ax=ax)
+ self._check_legend_labels(ax, labels=df.columns)
+
+ ax = df3.plot(kind=kind, legend=True, ax=ax)
+ self._check_legend_labels(ax, labels=df.columns.union(df3.columns))
+
+ ax = df4.plot(kind=kind, legend='reverse', ax=ax)
+ expected = list(df.columns.union(df3.columns)) + list(reversed(
+ df4.columns))
+ self._check_legend_labels(ax, labels=expected)
+
+ # Secondary Y
+ ax = df.plot(legend=True, secondary_y='b')
+ self._check_legend_labels(ax, labels=['a', 'b (right)', 'c'])
+ ax = df2.plot(legend=False, ax=ax)
+ self._check_legend_labels(ax, labels=['a', 'b (right)', 'c'])
+ ax = df3.plot(kind='bar', legend=True, secondary_y='h', ax=ax)
+ self._check_legend_labels(
+ ax, labels=['a', 'b (right)', 'c', 'g', 'h (right)', 'i'])
+
+ # Time Series
+ ind = date_range('1/1/2014', periods=3)
+ df = DataFrame(randn(3, 3), columns=['a', 'b', 'c'], index=ind)
+ df2 = DataFrame(randn(3, 3), columns=['d', 'e', 'f'], index=ind)
+ df3 = DataFrame(randn(3, 3), columns=['g', 'h', 'i'], index=ind)
+ ax = df.plot(legend=True, secondary_y='b')
+ self._check_legend_labels(ax, labels=['a', 'b (right)', 'c'])
+ ax = df2.plot(legend=False, ax=ax)
+ self._check_legend_labels(ax, labels=['a', 'b (right)', 'c'])
+ ax = df3.plot(legend=True, ax=ax)
+ self._check_legend_labels(
+ ax, labels=['a', 'b (right)', 'c', 'g', 'h', 'i'])
+
+ # scatter
+ ax = df.plot.scatter(x='a', y='b', label='data1')
+ self._check_legend_labels(ax, labels=['data1'])
+ ax = df2.plot.scatter(x='d', y='e', legend=False, label='data2', ax=ax)
+ self._check_legend_labels(ax, labels=['data1'])
+ ax = df3.plot.scatter(x='g', y='h', label='data3', ax=ax)
+ self._check_legend_labels(ax, labels=['data1', 'data3'])
+
+ # ensure label args pass through and that
+ # the index name and column names do not mutate
+ df5 = df.set_index('a')
+ ax = df5.plot(y='b')
+ self._check_legend_labels(ax, labels=['b'])
+ ax = df5.plot(y='b', label='LABEL_b')
+ self._check_legend_labels(ax, labels=['LABEL_b'])
+ self._check_text_labels(ax.xaxis.get_label(), 'a')
+ ax = df5.plot(y='c', label='LABEL_c', ax=ax)
+ self._check_legend_labels(ax, labels=['LABEL_b', 'LABEL_c'])
+ assert df5.columns.tolist() == ['b', 'c']
+
+ def test_legend_name(self):
+ multi = DataFrame(randn(4, 4),
+ columns=[np.array(['a', 'a', 'b', 'b']),
+ np.array(['x', 'y', 'x', 'y'])])
+ multi.columns.names = ['group', 'individual']
+
+ ax = multi.plot()
+ leg_title = ax.legend_.get_title()
+ self._check_text_labels(leg_title, 'group,individual')
+
+ df = DataFrame(randn(5, 5))
+ ax = df.plot(legend=True, ax=ax)
+ leg_title = ax.legend_.get_title()
+ self._check_text_labels(leg_title, 'group,individual')
+
+ df.columns.name = 'new'
+ ax = df.plot(legend=False, ax=ax)
+ leg_title = ax.legend_.get_title()
+ self._check_text_labels(leg_title, 'group,individual')
+
+ ax = df.plot(legend=True, ax=ax)
+ leg_title = ax.legend_.get_title()
+ self._check_text_labels(leg_title, 'new')
+
+ @pytest.mark.slow
+ def test_no_legend(self):
+ kinds = ['line', 'bar', 'barh', 'kde', 'area', 'hist']
+ df = DataFrame(rand(3, 3), columns=['a', 'b', 'c'])
+
+ for kind in kinds:
+ if not _ok_for_gaussian_kde(kind):
+ continue
+
+ ax = df.plot(kind=kind, legend=False)
+ self._check_legend_labels(ax, visible=False)
+
+ @pytest.mark.slow
+ def test_style_by_column(self):
+ import matplotlib.pyplot as plt
+ fig = plt.gcf()
+
+ df = DataFrame(randn(100, 3))
+ for markers in [{0: '^',
+ 1: '+',
+ 2: 'o'}, {0: '^',
+ 1: '+'}, ['^', '+', 'o'], ['^', '+']]:
+ fig.clf()
+ fig.add_subplot(111)
+ ax = df.plot(style=markers)
+ for i, l in enumerate(ax.get_lines()[:len(markers)]):
+ assert l.get_marker() == markers[i]
+
+ @pytest.mark.slow
+ def test_line_label_none(self):
+ s = Series([1, 2])
+ ax = s.plot()
+ assert ax.get_legend() is None
+
+ ax = s.plot(legend=True)
+ assert ax.get_legend().get_texts()[0].get_text() == 'None'
+
+ @pytest.mark.slow
+ def test_line_colors(self):
+ from matplotlib import cm
+
+ custom_colors = 'rgcby'
+ df = DataFrame(randn(5, 5))
+
+ ax = df.plot(color=custom_colors)
+ self._check_colors(ax.get_lines(), linecolors=custom_colors)
+
+ tm.close()
+
+ ax2 = df.plot(color=custom_colors)
+ lines2 = ax2.get_lines()
+
+ for l1, l2 in zip(ax.get_lines(), lines2):
+ assert l1.get_color() == l2.get_color()
+
+ tm.close()
+
+ ax = df.plot(colormap='jet')
+ rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df)))
+ self._check_colors(ax.get_lines(), linecolors=rgba_colors)
+ tm.close()
+
+ ax = df.plot(colormap=cm.jet)
+ rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df)))
+ self._check_colors(ax.get_lines(), linecolors=rgba_colors)
+ tm.close()
+
+ # make color a list if plotting a one-column frame;
+ # handles cases like df.plot(color='DodgerBlue')
+ ax = df.loc[:, [0]].plot(color='DodgerBlue')
+ self._check_colors(ax.lines, linecolors=['DodgerBlue'])
+
+ ax = df.plot(color='red')
+ self._check_colors(ax.get_lines(), linecolors=['red'] * 5)
+ tm.close()
+
+ # GH 10299
+ custom_colors = ['#FF0000', '#0000FF', '#FFFF00', '#000000', '#FFFFFF']
+ ax = df.plot(color=custom_colors)
+ self._check_colors(ax.get_lines(), linecolors=custom_colors)
+ tm.close()
+
+ with pytest.raises(ValueError):
+ # a color list containing shorthand hex values results in a ValueError
+ custom_colors = ['#F00', '#00F', '#FF0', '#000', '#FFF']
+ # force the plot to be drawn
+ _check_plot_works(df.plot, color=custom_colors)
+
+ @pytest.mark.slow
+ def test_dont_modify_colors(self):
+ colors = ['r', 'g', 'b']
+ pd.DataFrame(np.random.rand(10, 2)).plot(color=colors)
+ assert len(colors) == 3
+
+ @pytest.mark.slow
+ def test_line_colors_and_styles_subplots(self):
+ # GH 9894
+ from matplotlib import cm
+ default_colors = self._unpack_cycler(self.plt.rcParams)
+
+ df = DataFrame(randn(5, 5))
+
+ axes = df.plot(subplots=True)
+ for ax, c in zip(axes, list(default_colors)):
+ c = [c]
+ self._check_colors(ax.get_lines(), linecolors=c)
+ tm.close()
+
+ # single color char
+ axes = df.plot(subplots=True, color='k')
+ for ax in axes:
+ self._check_colors(ax.get_lines(), linecolors=['k'])
+ tm.close()
+
+ # single color str
+ axes = df.plot(subplots=True, color='green')
+ for ax in axes:
+ self._check_colors(ax.get_lines(), linecolors=['green'])
+ tm.close()
+
+ custom_colors = 'rgcby'
+ axes = df.plot(color=custom_colors, subplots=True)
+ for ax, c in zip(axes, list(custom_colors)):
+ self._check_colors(ax.get_lines(), linecolors=[c])
+ tm.close()
+
+ axes = df.plot(color=list(custom_colors), subplots=True)
+ for ax, c in zip(axes, list(custom_colors)):
+ self._check_colors(ax.get_lines(), linecolors=[c])
+ tm.close()
+
+ # GH 10299
+ custom_colors = ['#FF0000', '#0000FF', '#FFFF00', '#000000', '#FFFFFF']
+ axes = df.plot(color=custom_colors, subplots=True)
+ for ax, c in zip(axes, list(custom_colors)):
+ self._check_colors(ax.get_lines(), linecolors=[c])
+ tm.close()
+
+ with pytest.raises(ValueError):
+ # a color list containing shorthand hex values results in a ValueError
+ custom_colors = ['#F00', '#00F', '#FF0', '#000', '#FFF']
+ # force the plot to be drawn
+ # _check_plot_works adds an ax, so catch the warning; see GH #13188
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(df.plot, color=custom_colors, subplots=True)
+
+ rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df)))
+ for cmap in ['jet', cm.jet]:
+ axes = df.plot(colormap=cmap, subplots=True)
+ for ax, c in zip(axes, rgba_colors):
+ self._check_colors(ax.get_lines(), linecolors=[c])
+ tm.close()
+
+ # make color a list if plotting a one-column frame;
+ # handles cases like df.plot(color='DodgerBlue')
+ axes = df.loc[:, [0]].plot(color='DodgerBlue', subplots=True)
+ self._check_colors(axes[0].lines, linecolors=['DodgerBlue'])
+
+ # single character style
+ axes = df.plot(style='r', subplots=True)
+ for ax in axes:
+ self._check_colors(ax.get_lines(), linecolors=['r'])
+ tm.close()
+
+ # list of styles
+ styles = list('rgcby')
+ axes = df.plot(style=styles, subplots=True)
+ for ax, c in zip(axes, styles):
+ self._check_colors(ax.get_lines(), linecolors=[c])
+ tm.close()
+
+ @pytest.mark.slow
+ def test_area_colors(self):
+ from matplotlib import cm
+ from matplotlib.collections import PolyCollection
+
+ custom_colors = 'rgcby'
+ df = DataFrame(rand(5, 5))
+
+ ax = df.plot.area(color=custom_colors)
+ self._check_colors(ax.get_lines(), linecolors=custom_colors)
+ poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)]
+ self._check_colors(poly, facecolors=custom_colors)
+
+ handles, labels = ax.get_legend_handles_labels()
+ self._check_colors(handles, facecolors=custom_colors)
+
+ for h in handles:
+ assert h.get_alpha() is None
+ tm.close()
+
+ ax = df.plot.area(colormap='jet')
+ jet_colors = lmap(cm.jet, np.linspace(0, 1, len(df)))
+ self._check_colors(ax.get_lines(), linecolors=jet_colors)
+ poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)]
+ self._check_colors(poly, facecolors=jet_colors)
+
+ handles, labels = ax.get_legend_handles_labels()
+ self._check_colors(handles, facecolors=jet_colors)
+ for h in handles:
+ assert h.get_alpha() is None
+ tm.close()
+
+ # When stacked=False, alpha is set to 0.5
+ ax = df.plot.area(colormap=cm.jet, stacked=False)
+ self._check_colors(ax.get_lines(), linecolors=jet_colors)
+ poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)]
+ jet_with_alpha = [(c[0], c[1], c[2], 0.5) for c in jet_colors]
+ self._check_colors(poly, facecolors=jet_with_alpha)
+
+ handles, labels = ax.get_legend_handles_labels()
+ linecolors = jet_with_alpha
+ self._check_colors(handles[:len(jet_colors)], linecolors=linecolors)
+ for h in handles:
+ assert h.get_alpha() == 0.5
+
+ @pytest.mark.slow
+ def test_hist_colors(self):
+ default_colors = self._unpack_cycler(self.plt.rcParams)
+
+ df = DataFrame(randn(5, 5))
+ ax = df.plot.hist()
+ self._check_colors(ax.patches[::10], facecolors=default_colors[:5])
+ tm.close()
+
+ custom_colors = 'rgcby'
+ ax = df.plot.hist(color=custom_colors)
+ self._check_colors(ax.patches[::10], facecolors=custom_colors)
+ tm.close()
+
+ from matplotlib import cm
+ # Test str -> colormap functionality
+ ax = df.plot.hist(colormap='jet')
+ rgba_colors = lmap(cm.jet, np.linspace(0, 1, 5))
+ self._check_colors(ax.patches[::10], facecolors=rgba_colors)
+ tm.close()
+
+ # Test colormap functionality
+ ax = df.plot.hist(colormap=cm.jet)
+ rgba_colors = lmap(cm.jet, np.linspace(0, 1, 5))
+ self._check_colors(ax.patches[::10], facecolors=rgba_colors)
+ tm.close()
+
+ ax = df.loc[:, [0]].plot.hist(color='DodgerBlue')
+ self._check_colors([ax.patches[0]], facecolors=['DodgerBlue'])
+
+ ax = df.plot(kind='hist', color='green')
+ self._check_colors(ax.patches[::10], facecolors=['green'] * 5)
+ tm.close()
+
+ @pytest.mark.slow
+ @td.skip_if_no_scipy
+ def test_kde_colors(self):
+ _skip_if_no_scipy_gaussian_kde()
+
+ from matplotlib import cm
+
+ custom_colors = 'rgcby'
+ df = DataFrame(rand(5, 5))
+
+ ax = df.plot.kde(color=custom_colors)
+ self._check_colors(ax.get_lines(), linecolors=custom_colors)
+ tm.close()
+
+ ax = df.plot.kde(colormap='jet')
+ rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df)))
+ self._check_colors(ax.get_lines(), linecolors=rgba_colors)
+ tm.close()
+
+ ax = df.plot.kde(colormap=cm.jet)
+ rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df)))
+ self._check_colors(ax.get_lines(), linecolors=rgba_colors)
+
+ @pytest.mark.slow
+ @td.skip_if_no_scipy
+ def test_kde_colors_and_styles_subplots(self):
+ _skip_if_no_scipy_gaussian_kde()
+
+ from matplotlib import cm
+ default_colors = self._unpack_cycler(self.plt.rcParams)
+
+ df = DataFrame(randn(5, 5))
+
+ axes = df.plot(kind='kde', subplots=True)
+ for ax, c in zip(axes, list(default_colors)):
+ self._check_colors(ax.get_lines(), linecolors=[c])
+ tm.close()
+
+ # single color char
+ axes = df.plot(kind='kde', color='k', subplots=True)
+ for ax in axes:
+ self._check_colors(ax.get_lines(), linecolors=['k'])
+ tm.close()
+
+ # single color str
+ axes = df.plot(kind='kde', color='red', subplots=True)
+ for ax in axes:
+ self._check_colors(ax.get_lines(), linecolors=['red'])
+ tm.close()
+
+ custom_colors = 'rgcby'
+ axes = df.plot(kind='kde', color=custom_colors, subplots=True)
+ for ax, c in zip(axes, list(custom_colors)):
+ self._check_colors(ax.get_lines(), linecolors=[c])
+ tm.close()
+
+ rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df)))
+ for cmap in ['jet', cm.jet]:
+ axes = df.plot(kind='kde', colormap=cmap, subplots=True)
+ for ax, c in zip(axes, rgba_colors):
+ self._check_colors(ax.get_lines(), linecolors=[c])
+ tm.close()
+
+ # make color a list if plotting a one-column frame;
+ # handles cases like df.plot(color='DodgerBlue')
+ axes = df.loc[:, [0]].plot(kind='kde', color='DodgerBlue',
+ subplots=True)
+ self._check_colors(axes[0].lines, linecolors=['DodgerBlue'])
+
+ # single character style
+ axes = df.plot(kind='kde', style='r', subplots=True)
+ for ax in axes:
+ self._check_colors(ax.get_lines(), linecolors=['r'])
+ tm.close()
+
+ # list of styles
+ styles = list('rgcby')
+ axes = df.plot(kind='kde', style=styles, subplots=True)
+ for ax, c in zip(axes, styles):
+ self._check_colors(ax.get_lines(), linecolors=[c])
+ tm.close()
+
+ @pytest.mark.slow
+ def test_boxplot_colors(self):
+ def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c='k',
+ fliers_c=None):
+ # TODO: move this helper outside this function?
+ if fliers_c is None:
+ fliers_c = 'k'
+ self._check_colors(bp['boxes'],
+ linecolors=[box_c] * len(bp['boxes']))
+ self._check_colors(bp['whiskers'],
+ linecolors=[whiskers_c] * len(bp['whiskers']))
+ self._check_colors(bp['medians'],
+ linecolors=[medians_c] * len(bp['medians']))
+ self._check_colors(bp['fliers'],
+ linecolors=[fliers_c] * len(bp['fliers']))
+ self._check_colors(bp['caps'],
+ linecolors=[caps_c] * len(bp['caps']))
+
+ default_colors = self._unpack_cycler(self.plt.rcParams)
+
+ df = DataFrame(randn(5, 5))
+ bp = df.plot.box(return_type='dict')
+ _check_colors(bp, default_colors[0], default_colors[0],
+ default_colors[2])
+ tm.close()
+
+ dict_colors = dict(boxes='#572923', whiskers='#982042',
+ medians='#804823', caps='#123456')
+ bp = df.plot.box(color=dict_colors, sym='r+', return_type='dict')
+ _check_colors(bp, dict_colors['boxes'], dict_colors['whiskers'],
+ dict_colors['medians'], dict_colors['caps'], 'r')
+ tm.close()
+
+ # partial colors
+ dict_colors = dict(whiskers='c', medians='m')
+ bp = df.plot.box(color=dict_colors, return_type='dict')
+ _check_colors(bp, default_colors[0], 'c', 'm')
+ tm.close()
+
+ from matplotlib import cm
+ # Test str -> colormap functionality
+ bp = df.plot.box(colormap='jet', return_type='dict')
+ jet_colors = lmap(cm.jet, np.linspace(0, 1, 3))
+ _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2])
+ tm.close()
+
+ # Test colormap functionality
+ bp = df.plot.box(colormap=cm.jet, return_type='dict')
+ _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2])
+ tm.close()
+
+ # string color is applied to all artists except fliers
+ bp = df.plot.box(color='DodgerBlue', return_type='dict')
+ _check_colors(bp, 'DodgerBlue', 'DodgerBlue', 'DodgerBlue',
+ 'DodgerBlue')
+
+ # tuple is also applied to all artists except fliers
+ bp = df.plot.box(color=(0, 1, 0), sym='#123456', return_type='dict')
+ _check_colors(bp, (0, 1, 0), (0, 1, 0), (0, 1, 0),
+ (0, 1, 0), '#123456')
+
+ with pytest.raises(ValueError):
+ # a color dict containing an invalid key results in a ValueError
+ df.plot.box(color=dict(boxes='red', xxxx='blue'))
+
+ def test_default_color_cycle(self):
+ import matplotlib.pyplot as plt
+ import cycler
+ colors = list('rgbk')
+ plt.rcParams['axes.prop_cycle'] = cycler.cycler('color', colors)
+
+ df = DataFrame(randn(5, 3))
+ ax = df.plot()
+
+ expected = self._unpack_cycler(plt.rcParams)[:3]
+ self._check_colors(ax.get_lines(), linecolors=expected)
+
+ def test_unordered_ts(self):
+ df = DataFrame(np.array([3.0, 2.0, 1.0]),
+ index=[date(2012, 10, 1),
+ date(2012, 9, 1),
+ date(2012, 8, 1)],
+ columns=['test'])
+ ax = df.plot()
+ xticks = ax.lines[0].get_xdata()
+ assert xticks[0] < xticks[1]
+ ydata = ax.lines[0].get_ydata()
+ tm.assert_numpy_array_equal(ydata, np.array([1.0, 2.0, 3.0]))
+
+ def test_kind_both_ways(self):
+ df = DataFrame({'x': [1, 2, 3]})
+ for kind in plotting._core._common_kinds:
+ if not _ok_for_gaussian_kde(kind):
+ continue
+ df.plot(kind=kind)
+ getattr(df.plot, kind)()
+ for kind in ['scatter', 'hexbin']:
+ df.plot('x', 'x', kind=kind)
+ getattr(df.plot, kind)('x', 'x')
+
+ def test_all_invalid_plot_data(self):
+ df = DataFrame(list('abcd'))
+ for kind in plotting._core._common_kinds:
+ if not _ok_for_gaussian_kde(kind):
+ continue
+ with pytest.raises(TypeError):
+ df.plot(kind=kind)
+
+ @pytest.mark.slow
+ def test_partially_invalid_plot_data(self):
+ with tm.RNGContext(42):
+ df = DataFrame(randn(10, 2), dtype=object)
+ df[np.random.rand(df.shape[0]) > 0.5] = 'a'
+ for kind in plotting._core._common_kinds:
+ if not _ok_for_gaussian_kde(kind):
+ continue
+ with pytest.raises(TypeError):
+ df.plot(kind=kind)
+
+ with tm.RNGContext(42):
+ # area plot doesn't support positive/negative mixed data
+ kinds = ['area']
+ df = DataFrame(rand(10, 2), dtype=object)
+ df[np.random.rand(df.shape[0]) > 0.5] = 'a'
+ for kind in kinds:
+ with pytest.raises(TypeError):
+ df.plot(kind=kind)
+
+ def test_invalid_kind(self):
+ df = DataFrame(randn(10, 2))
+ with pytest.raises(ValueError):
+ df.plot(kind='aasdf')
+
+ @pytest.mark.parametrize("x,y,lbl", [
+ (['B', 'C'], 'A', 'a'),
+ (['A'], ['B', 'C'], ['b', 'c']),
+ ('A', ['B', 'C'], 'badlabel')
+ ])
+ def test_invalid_xy_args(self, x, y, lbl):
+ # GH 18671, 19699 allows y to be list-like but not x
+ df = DataFrame({"A": [1, 2], 'B': [3, 4], 'C': [5, 6]})
+ with pytest.raises(ValueError):
+ df.plot(x=x, y=y, label=lbl)
+
+ @pytest.mark.parametrize("x,y", [
+ ('A', 'B'),
+ (['A'], 'B')
+ ])
+ def test_invalid_xy_args_dup_cols(self, x, y):
+ # GH 18671, 19699 allows y to be list-like but not x
+ df = DataFrame([[1, 3, 5], [2, 4, 6]], columns=list('AAB'))
+ with pytest.raises(ValueError):
+ df.plot(x=x, y=y)
+
+ @pytest.mark.parametrize("x,y,lbl,colors", [
+ ('A', ['B'], ['b'], ['red']),
+ ('A', ['B', 'C'], ['b', 'c'], ['red', 'blue']),
+ (0, [1, 2], ['bokeh', 'cython'], ['green', 'yellow'])
+ ])
+ def test_y_listlike(self, x, y, lbl, colors):
+ # GH 19699: tests list-like y and verifies lbls & colors
+ df = DataFrame({"A": [1, 2], 'B': [3, 4], 'C': [5, 6]})
+ _check_plot_works(df.plot, x='A', y=y, label=lbl)
+
+ ax = df.plot(x=x, y=y, label=lbl, color=colors)
+ assert len(ax.lines) == len(y)
+ self._check_colors(ax.get_lines(), linecolors=colors)
+
+ @pytest.mark.parametrize("x,y,colnames", [
+ (0, 1, ['A', 'B']),
+ (1, 0, [0, 1])
+ ])
+ def test_xy_args_integer(self, x, y, colnames):
+ # GH 20056: tests integer args for xy and checks col names
+ df = DataFrame({"A": [1, 2], 'B': [3, 4]})
+ df.columns = colnames
+ _check_plot_works(df.plot, x=x, y=y)
+
+ @pytest.mark.slow
+ def test_hexbin_basic(self):
+ df = self.hexbin_df
+
+ ax = df.plot.hexbin(x='A', y='B', gridsize=10)
+ # TODO: need a better way to test; this only checks existence.
+ assert len(ax.collections) == 1
+
+ # GH 6951
+ axes = df.plot.hexbin(x='A', y='B', subplots=True)
+ # hexbin should have 2 axes in the figure: 1 for plotting and
+ # another for the colorbar
+ assert len(axes[0].figure.axes) == 2
+ # return value is single axes
+ self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+
+ @pytest.mark.slow
+ def test_hexbin_with_c(self):
+ df = self.hexbin_df
+
+ ax = df.plot.hexbin(x='A', y='B', C='C')
+ assert len(ax.collections) == 1
+
+ ax = df.plot.hexbin(x='A', y='B', C='C', reduce_C_function=np.std)
+ assert len(ax.collections) == 1
+
+ @pytest.mark.slow
+ def test_hexbin_cmap(self):
+ df = self.hexbin_df
+
+ # Default to BuGn
+ ax = df.plot.hexbin(x='A', y='B')
+ assert ax.collections[0].cmap.name == 'BuGn'
+
+ cm = 'cubehelix'
+ ax = df.plot.hexbin(x='A', y='B', colormap=cm)
+ assert ax.collections[0].cmap.name == cm
+
+ @pytest.mark.slow
+ def test_no_color_bar(self):
+ df = self.hexbin_df
+
+ ax = df.plot.hexbin(x='A', y='B', colorbar=None)
+ assert ax.collections[0].colorbar is None
+
+ @pytest.mark.slow
+ def test_allow_cmap(self):
+ df = self.hexbin_df
+
+ ax = df.plot.hexbin(x='A', y='B', cmap='YlGn')
+ assert ax.collections[0].cmap.name == 'YlGn'
+
+ with pytest.raises(TypeError):
+ df.plot.hexbin(x='A', y='B', cmap='YlGn', colormap='BuGn')
+
+ @pytest.mark.slow
+ def test_pie_df(self):
+ df = DataFrame(np.random.rand(5, 3), columns=['X', 'Y', 'Z'],
+ index=['a', 'b', 'c', 'd', 'e'])
+ with pytest.raises(ValueError):
+ df.plot.pie()
+
+ ax = _check_plot_works(df.plot.pie, y='Y')
+ self._check_text_labels(ax.texts, df.index)
+
+ ax = _check_plot_works(df.plot.pie, y=2)
+ self._check_text_labels(ax.texts, df.index)
+
+ # _check_plot_works adds an ax, so catch the warning; see GH #13188
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.plot.pie,
+ subplots=True)
+ assert len(axes) == len(df.columns)
+ for ax in axes:
+ self._check_text_labels(ax.texts, df.index)
+ for ax, ylabel in zip(axes, df.columns):
+ assert ax.get_ylabel() == ylabel
+
+ labels = ['A', 'B', 'C', 'D', 'E']
+ color_args = ['r', 'g', 'b', 'c', 'm']
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.plot.pie,
+ subplots=True, labels=labels,
+ colors=color_args)
+ assert len(axes) == len(df.columns)
+
+ for ax in axes:
+ self._check_text_labels(ax.texts, labels)
+ self._check_colors(ax.patches, facecolors=color_args)
+
+ def test_pie_df_nan(self):
+ df = DataFrame(np.random.rand(4, 4))
+ for i in range(4):
+ df.iloc[i, i] = np.nan
+ fig, axes = self.plt.subplots(ncols=4)
+ df.plot.pie(subplots=True, ax=axes, legend=True)
+
+ base_expected = ['0', '1', '2', '3']
+ for i, ax in enumerate(axes):
+ expected = list(base_expected) # force copy
+ expected[i] = ''
+ result = [x.get_text() for x in ax.texts]
+ assert result == expected
+ # legend labels
+ # NaN's not included in legend with subplots
+ # see https://github.com/pandas-dev/pandas/issues/8390
+ assert ([x.get_text() for x in ax.get_legend().get_texts()] ==
+ base_expected[:i] + base_expected[i + 1:])
+
+ @pytest.mark.slow
+ def test_errorbar_plot(self):
+ with warnings.catch_warnings():
+ d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)}
+ df = DataFrame(d)
+ d_err = {'x': np.ones(12) * 0.2, 'y': np.ones(12) * 0.4}
+ df_err = DataFrame(d_err)
+
+ # check line plots
+ ax = _check_plot_works(df.plot, yerr=df_err, logy=True)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+ ax = _check_plot_works(df.plot, yerr=df_err, logx=True, logy=True)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+ ax = _check_plot_works(df.plot, yerr=df_err, loglog=True)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+
+ kinds = ['line', 'bar', 'barh']
+ for kind in kinds:
+ ax = _check_plot_works(df.plot, yerr=df_err['x'], kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+ ax = _check_plot_works(df.plot, yerr=d_err, kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+ ax = _check_plot_works(df.plot, yerr=df_err, xerr=df_err,
+ kind=kind)
+ self._check_has_errorbars(ax, xerr=2, yerr=2)
+ ax = _check_plot_works(df.plot, yerr=df_err['x'],
+ xerr=df_err['x'],
+ kind=kind)
+ self._check_has_errorbars(ax, xerr=2, yerr=2)
+ ax = _check_plot_works(df.plot, xerr=0.2, yerr=0.2, kind=kind)
+ self._check_has_errorbars(ax, xerr=2, yerr=2)
+
+ # _check_plot_works adds an ax, so catch the warning; see GH #13188
+ axes = _check_plot_works(df.plot,
+ yerr=df_err, xerr=df_err,
+ subplots=True,
+ kind=kind)
+ self._check_has_errorbars(axes, xerr=1, yerr=1)
+
+ ax = _check_plot_works((df + 1).plot, yerr=df_err,
+ xerr=df_err, kind='bar', log=True)
+ self._check_has_errorbars(ax, xerr=2, yerr=2)
+
+ # yerr is raw error values
+ ax = _check_plot_works(df['y'].plot, yerr=np.ones(12) * 0.4)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+ ax = _check_plot_works(df.plot, yerr=np.ones((2, 12)) * 0.4)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+
+ # yerr is iterator
+ import itertools
+ ax = _check_plot_works(df.plot,
+ yerr=itertools.repeat(0.1, len(df)))
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+
+ # yerr is column name
+ for yerr in ['yerr', u('誤差')]:
+ s_df = df.copy()
+ s_df[yerr] = np.ones(12) * 0.2
+ ax = _check_plot_works(s_df.plot, yerr=yerr)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+ ax = _check_plot_works(s_df.plot, y='y', x='x', yerr=yerr)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+
+ with pytest.raises(ValueError):
+ df.plot(yerr=np.random.randn(11))
+
+ df_err = DataFrame({'x': ['zzz'] * 12, 'y': ['zzz'] * 12})
+ with pytest.raises((ValueError, TypeError)):
+ df.plot(yerr=df_err)
+
+ @pytest.mark.slow
+ def test_errorbar_with_integer_column_names(self):
+ # test with integer column names
+ df = DataFrame(np.random.randn(10, 2))
+ df_err = DataFrame(np.random.randn(10, 2))
+ ax = _check_plot_works(df.plot, yerr=df_err)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+ ax = _check_plot_works(df.plot, y=0, yerr=1)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+
+ @pytest.mark.slow
+ def test_errorbar_with_partial_columns(self):
+ df = DataFrame(np.random.randn(10, 3))
+ df_err = DataFrame(np.random.randn(10, 2), columns=[0, 2])
+ kinds = ['line', 'bar']
+ for kind in kinds:
+ ax = _check_plot_works(df.plot, yerr=df_err, kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+
+ ix = date_range('1/1/2000', periods=10, freq='M')
+ df.set_index(ix, inplace=True)
+ df_err.set_index(ix, inplace=True)
+ ax = _check_plot_works(df.plot, yerr=df_err, kind='line')
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+
+ d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)}
+ df = DataFrame(d)
+ d_err = {'x': np.ones(12) * 0.2, 'z': np.ones(12) * 0.4}
+ df_err = DataFrame(d_err)
+ for err in [d_err, df_err]:
+ ax = _check_plot_works(df.plot, yerr=err)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+
+ @pytest.mark.slow
+ def test_errorbar_timeseries(self):
+
+ with warnings.catch_warnings():
+ d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)}
+ d_err = {'x': np.ones(12) * 0.2, 'y': np.ones(12) * 0.4}
+
+ # check time-series plots
+ ix = date_range('1/1/2000', '1/1/2001', freq='M')
+ tdf = DataFrame(d, index=ix)
+ tdf_err = DataFrame(d_err, index=ix)
+
+ kinds = ['line', 'bar', 'barh']
+ for kind in kinds:
+ ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+ ax = _check_plot_works(tdf.plot, yerr=d_err, kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+ ax = _check_plot_works(tdf.plot, y='y', yerr=tdf_err['x'],
+ kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+ ax = _check_plot_works(tdf.plot, y='y', yerr='x', kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+ ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=2)
+
+ # _check_plot_works adds an ax so catch warning. see GH #13188
+ axes = _check_plot_works(tdf.plot,
+ kind=kind, yerr=tdf_err,
+ subplots=True)
+ self._check_has_errorbars(axes, xerr=0, yerr=1)
+
+ def test_errorbar_asymmetrical(self):
+
+ np.random.seed(0)
+ err = np.random.rand(3, 2, 5)
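+ # asymmetrical errors: shape (ncols, 2, nrows), i.e. one
+ # (lower, upper) pair of error series per plotted column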
+
+ # each column is [0, 1, 2, 3, 4], [3, 4, 5, 6, 7]...
+ df = DataFrame(np.arange(15).reshape(3, 5)).T
+
+ ax = df.plot(yerr=err, xerr=err / 2)
+
+ yerr_0_0 = ax.collections[1].get_paths()[0].vertices[:, 1]
+ expected_0_0 = err[0, :, 0] * np.array([-1, 1])
+ tm.assert_almost_equal(yerr_0_0, expected_0_0)
+
+ with pytest.raises(ValueError):
+ df.plot(yerr=err.T)
+
+ tm.close()
+
+ # This XPASSES when tested with mpl == 3.0.1
+ @td.xfail_if_mpl_2_2
+ def test_table(self):
+ df = DataFrame(np.random.rand(10, 3),
+ index=list(string.ascii_letters[:10]))
+ _check_plot_works(df.plot, table=True)
+ _check_plot_works(df.plot, table=df)
+
+ ax = df.plot()
+ assert len(ax.tables) == 0
+ plotting.table(ax, df.T)
+ assert len(ax.tables) == 1
+
+ def test_errorbar_scatter(self):
+ df = DataFrame(
+ np.random.randn(5, 2), index=range(5), columns=['x', 'y'])
+ df_err = DataFrame(np.random.randn(5, 2) / 5,
+ index=range(5), columns=['x', 'y'])
+
+ ax = _check_plot_works(df.plot.scatter, x='x', y='y')
+ self._check_has_errorbars(ax, xerr=0, yerr=0)
+ ax = _check_plot_works(df.plot.scatter, x='x', y='y', xerr=df_err)
+ self._check_has_errorbars(ax, xerr=1, yerr=0)
+
+ ax = _check_plot_works(df.plot.scatter, x='x', y='y', yerr=df_err)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+ ax = _check_plot_works(df.plot.scatter, x='x', y='y', xerr=df_err,
+ yerr=df_err)
+ self._check_has_errorbars(ax, xerr=1, yerr=1)
+
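+ # note: the helper below closes over ``ax`` from the enclosing test
+ # scope, so it must be called right after the plot that produced it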
+ def _check_errorbar_color(containers, expected, has_err='has_xerr'):
+ lines = []
+ errs = [c.lines
+ for c in ax.containers if getattr(c, has_err, False)][0]
+ for el in errs:
+ if is_list_like(el):
+ lines.extend(el)
+ else:
+ lines.append(el)
+ err_lines = [x for x in lines if x in ax.collections]
+ self._check_colors(
+ err_lines, linecolors=np.array([expected] * len(err_lines)))
+
+ # GH 8081
+ df = DataFrame(
+ np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
+ ax = df.plot.scatter(x='a', y='b', xerr='d', yerr='e', c='red')
+ self._check_has_errorbars(ax, xerr=1, yerr=1)
+ _check_errorbar_color(ax.containers, 'red', has_err='has_xerr')
+ _check_errorbar_color(ax.containers, 'red', has_err='has_yerr')
+
+ ax = df.plot.scatter(x='a', y='b', yerr='e', color='green')
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+ _check_errorbar_color(ax.containers, 'green', has_err='has_yerr')
+
+ @pytest.mark.slow
+ def test_sharex_and_ax(self):
+ # https://github.com/pandas-dev/pandas/issues/9737: with gridspec,
+ # the axes in fig.get_axes() are sorted differently than pandas
+ # expects, so make sure that only the right ones are removed
+ import matplotlib.pyplot as plt
+ plt.close('all')
+ gs, axes = _generate_4_axes_via_gridspec()
+
+ df = DataFrame({"a": [1, 2, 3, 4, 5, 6],
+ "b": [1, 2, 3, 4, 5, 6],
+ "c": [1, 2, 3, 4, 5, 6],
+ "d": [1, 2, 3, 4, 5, 6]})
+
+ def _check(axes):
+ for ax in axes:
+ assert len(ax.lines) == 1
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ for ax in [axes[0], axes[2]]:
+ self._check_visible(ax.get_xticklabels(), visible=False)
+ self._check_visible(
+ ax.get_xticklabels(minor=True), visible=False)
+ for ax in [axes[1], axes[3]]:
+ self._check_visible(ax.get_xticklabels(), visible=True)
+ self._check_visible(
+ ax.get_xticklabels(minor=True), visible=True)
+
+ for ax in axes:
+ df.plot(x="a", y="b", title="title", ax=ax, sharex=True)
+ gs.tight_layout(plt.gcf())
+ _check(axes)
+ tm.close()
+
+ gs, axes = _generate_4_axes_via_gridspec()
+ with tm.assert_produces_warning(UserWarning):
+ axes = df.plot(subplots=True, ax=axes, sharex=True)
+ _check(axes)
+ tm.close()
+
+ gs, axes = _generate_4_axes_via_gridspec()
+ # without sharex, no labels should be touched!
+ for ax in axes:
+ df.plot(x="a", y="b", title="title", ax=ax)
+
+ gs.tight_layout(plt.gcf())
+ for ax in axes:
+ assert len(ax.lines) == 1
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(minor=True), visible=True)
+ tm.close()
+
+ @pytest.mark.slow
+ def test_sharey_and_ax(self):
+ # https://github.com/pandas-dev/pandas/issues/9737: with gridspec,
+ # the axes in fig.get_axes() are sorted differently than pandas
+ # expects, so make sure that only the right ones are removed
+ import matplotlib.pyplot as plt
+
+ gs, axes = _generate_4_axes_via_gridspec()
+
+ df = DataFrame({"a": [1, 2, 3, 4, 5, 6],
+ "b": [1, 2, 3, 4, 5, 6],
+ "c": [1, 2, 3, 4, 5, 6],
+ "d": [1, 2, 3, 4, 5, 6]})
+
+ def _check(axes):
+ for ax in axes:
+ assert len(ax.lines) == 1
+ self._check_visible(ax.get_xticklabels(), visible=True)
+ self._check_visible(
+ ax.get_xticklabels(minor=True), visible=True)
+ for ax in [axes[0], axes[1]]:
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ for ax in [axes[2], axes[3]]:
+ self._check_visible(ax.get_yticklabels(), visible=False)
+
+ for ax in axes:
+ df.plot(x="a", y="b", title="title", ax=ax, sharey=True)
+ gs.tight_layout(plt.gcf())
+ _check(axes)
+ tm.close()
+
+ gs, axes = _generate_4_axes_via_gridspec()
+ with tm.assert_produces_warning(UserWarning):
+ axes = df.plot(subplots=True, ax=axes, sharey=True)
+
+ gs.tight_layout(plt.gcf())
+ _check(axes)
+ tm.close()
+
+ gs, axes = _generate_4_axes_via_gridspec()
+ # without sharey, no labels should be touched!
+ for ax in axes:
+ df.plot(x="a", y="b", title="title", ax=ax)
+
+ gs.tight_layout(plt.gcf())
+ for ax in axes:
+ assert len(ax.lines) == 1
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(minor=True), visible=True)
+
+ def test_memory_leak(self):
+ """ Check that every plot type gets properly collected. """
+ import weakref
+ import gc
+
+ results = {}
+ for kind in plotting._core._plot_klass.keys():
+ if not _ok_for_gaussian_kde(kind):
+ continue
+ args = {}
+ if kind in ['hexbin', 'scatter', 'pie']:
+ df = self.hexbin_df
+ args = {'x': 'A', 'y': 'B'}
+ elif kind == 'area':
+ df = self.tdf.abs()
+ else:
+ df = self.tdf
+
+ # Use a weakref so we can see if the object gets collected without
+ # also preventing it from being collected
+ results[kind] = weakref.proxy(df.plot(kind=kind, **args))
+
+ # have matplotlib delete all the figures
+ tm.close()
+ # force a garbage collection
+ gc.collect()
+ for key in results:
+ # check that every plot was collected
+ with pytest.raises(ReferenceError):
+ # need to actually access something to get an error
+ results[key].lines
+
+ @pytest.mark.slow
+ def test_df_subplots_patterns_minorticks(self):
+ # GH 10657
+ import matplotlib.pyplot as plt
+
+ df = DataFrame(np.random.randn(10, 2),
+ index=date_range('1/1/2000', periods=10),
+ columns=list('AB'))
+
+ # shared subplots
+ fig, axes = plt.subplots(2, 1, sharex=True)
+ axes = df.plot(subplots=True, ax=axes)
+ for ax in axes:
+ assert len(ax.lines) == 1
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ # xaxis of 1st ax must be hidden
+ self._check_visible(axes[0].get_xticklabels(), visible=False)
+ self._check_visible(axes[0].get_xticklabels(minor=True), visible=False)
+ self._check_visible(axes[1].get_xticklabels(), visible=True)
+ self._check_visible(axes[1].get_xticklabels(minor=True), visible=True)
+ tm.close()
+
+ fig, axes = plt.subplots(2, 1)
+ with tm.assert_produces_warning(UserWarning):
+ axes = df.plot(subplots=True, ax=axes, sharex=True)
+ for ax in axes:
+ assert len(ax.lines) == 1
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ # xaxis of 1st ax must be hidden
+ self._check_visible(axes[0].get_xticklabels(), visible=False)
+ self._check_visible(axes[0].get_xticklabels(minor=True), visible=False)
+ self._check_visible(axes[1].get_xticklabels(), visible=True)
+ self._check_visible(axes[1].get_xticklabels(minor=True), visible=True)
+ tm.close()
+
+ # not shared
+ fig, axes = plt.subplots(2, 1)
+ axes = df.plot(subplots=True, ax=axes)
+ for ax in axes:
+ assert len(ax.lines) == 1
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(minor=True), visible=True)
+ tm.close()
+
+ @pytest.mark.slow
+ def test_df_gridspec_patterns(self):
+ # GH 10819
+ import matplotlib.pyplot as plt
+ import matplotlib.gridspec as gridspec
+
+ ts = Series(np.random.randn(10),
+ index=date_range('1/1/2000', periods=10))
+
+ df = DataFrame(np.random.randn(10, 2), index=ts.index,
+ columns=list('AB'))
+
+ def _get_vertical_grid():
+ gs = gridspec.GridSpec(3, 1)
+ fig = plt.figure()
+ ax1 = fig.add_subplot(gs[:2, :])
+ ax2 = fig.add_subplot(gs[2, :])
+ return ax1, ax2
+
+ def _get_horizontal_grid():
+ gs = gridspec.GridSpec(1, 3)
+ fig = plt.figure()
+ ax1 = fig.add_subplot(gs[:, :2])
+ ax2 = fig.add_subplot(gs[:, 2])
+ return ax1, ax2
+
+ for ax1, ax2 in [_get_vertical_grid(), _get_horizontal_grid()]:
+ ax1 = ts.plot(ax=ax1)
+ assert len(ax1.lines) == 1
+ ax2 = df.plot(ax=ax2)
+ assert len(ax2.lines) == 2
+ for ax in [ax1, ax2]:
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(), visible=True)
+ self._check_visible(
+ ax.get_xticklabels(minor=True), visible=True)
+ tm.close()
+
+ # subplots=True
+ for ax1, ax2 in [_get_vertical_grid(), _get_horizontal_grid()]:
+ axes = df.plot(subplots=True, ax=[ax1, ax2])
+ assert len(ax1.lines) == 1
+ assert len(ax2.lines) == 1
+ for ax in axes:
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(), visible=True)
+ self._check_visible(
+ ax.get_xticklabels(minor=True), visible=True)
+ tm.close()
+
+ # vertical / subplots / sharex=True / sharey=True
+ ax1, ax2 = _get_vertical_grid()
+ with tm.assert_produces_warning(UserWarning):
+ axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True,
+ sharey=True)
+ assert len(axes[0].lines) == 1
+ assert len(axes[1].lines) == 1
+ for ax in [ax1, ax2]:
+ # y-axes are visible because there is only one column
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ # xaxis of axes[0] (top) is hidden
+ self._check_visible(axes[0].get_xticklabels(), visible=False)
+ self._check_visible(axes[0].get_xticklabels(minor=True), visible=False)
+ self._check_visible(axes[1].get_xticklabels(), visible=True)
+ self._check_visible(axes[1].get_xticklabels(minor=True), visible=True)
+ tm.close()
+
+ # horizontal / subplots / sharex=True / sharey=True
+ ax1, ax2 = _get_horizontal_grid()
+ with tm.assert_produces_warning(UserWarning):
+ axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True,
+ sharey=True)
+ assert len(axes[0].lines) == 1
+ assert len(axes[1].lines) == 1
+ self._check_visible(axes[0].get_yticklabels(), visible=True)
+ # yaxis of axes[1] (right) is hidden
+ self._check_visible(axes[1].get_yticklabels(), visible=False)
+ for ax in [ax1, ax2]:
+ # x-axes are visible because there is only one row
+ self._check_visible(ax.get_xticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(minor=True), visible=True)
+ tm.close()
+
+ # boxed
+ def _get_boxed_grid():
+ gs = gridspec.GridSpec(3, 3)
+ fig = plt.figure()
+ ax1 = fig.add_subplot(gs[:2, :2])
+ ax2 = fig.add_subplot(gs[:2, 2])
+ ax3 = fig.add_subplot(gs[2, :2])
+ ax4 = fig.add_subplot(gs[2, 2])
+ return ax1, ax2, ax3, ax4
+
+ axes = _get_boxed_grid()
+ df = DataFrame(np.random.randn(10, 4),
+ index=ts.index, columns=list('ABCD'))
+ axes = df.plot(subplots=True, ax=axes)
+ for ax in axes:
+ assert len(ax.lines) == 1
+ # axes are visible because they are not shared
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(minor=True), visible=True)
+ tm.close()
+
+ # subplots / sharex=True / sharey=True
+ axes = _get_boxed_grid()
+ with tm.assert_produces_warning(UserWarning):
+ axes = df.plot(subplots=True, ax=axes, sharex=True, sharey=True)
+ for ax in axes:
+ assert len(ax.lines) == 1
+ for ax in [axes[0], axes[2]]: # left column
+ self._check_visible(ax.get_yticklabels(), visible=True)
+ for ax in [axes[1], axes[3]]: # right column
+ self._check_visible(ax.get_yticklabels(), visible=False)
+ for ax in [axes[0], axes[1]]: # top row
+ self._check_visible(ax.get_xticklabels(), visible=False)
+ self._check_visible(ax.get_xticklabels(minor=True), visible=False)
+ for ax in [axes[2], axes[3]]: # bottom row
+ self._check_visible(ax.get_xticklabels(), visible=True)
+ self._check_visible(ax.get_xticklabels(minor=True), visible=True)
+ tm.close()
+
+ @pytest.mark.slow
+ def test_df_grid_settings(self):
+ # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792
+ self._check_grid_settings(
+ DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4]}),
+ plotting._core._dataframe_kinds, kws={'x': 'a', 'y': 'b'})
+
+ def test_invalid_colormap(self):
+ df = DataFrame(randn(3, 2), columns=['A', 'B'])
+
+ with pytest.raises(ValueError):
+ df.plot(colormap='invalid_colormap')
+
+ def test_plain_axes(self):
+
+ # supplied ax itself is a SubplotAxes, but the figure also contains
+ # a plain Axes object (GH11556)
+ fig, ax = self.plt.subplots()
+ fig.add_axes([0.2, 0.2, 0.2, 0.2])
+ Series(rand(10)).plot(ax=ax)
+
+ # supplied ax itself is a plain Axes, but because of the cmap keyword
+ # a new ax is created for the colorbar -> also multiple axes (GH11520)
+ df = DataFrame({'a': randn(8), 'b': randn(8)})
+ fig = self.plt.figure()
+ ax = fig.add_axes((0, 0, 1, 1))
+ df.plot(kind='scatter', ax=ax, x='a', y='b', c='a', cmap='hsv')
+
+ # other examples
+ fig, ax = self.plt.subplots()
+ from mpl_toolkits.axes_grid1 import make_axes_locatable
+ divider = make_axes_locatable(ax)
+ cax = divider.append_axes("right", size="5%", pad=0.05)
+ Series(rand(10)).plot(ax=ax)
+ Series(rand(10)).plot(ax=cax)
+
+ fig, ax = self.plt.subplots()
+ from mpl_toolkits.axes_grid1.inset_locator import inset_axes
+ iax = inset_axes(ax, width="30%", height=1., loc=3)
+ Series(rand(10)).plot(ax=ax)
+ Series(rand(10)).plot(ax=iax)
+
+ def test_passed_bar_colors(self):
+ import matplotlib as mpl
+ color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)]
+ colormap = mpl.colors.ListedColormap(color_tuples)
+ barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap)
+ assert color_tuples == [c.get_facecolor() for c in barplot.patches]
+
+ def test_rcParams_bar_colors(self):
+ import matplotlib as mpl
+ color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)]
+ with mpl.rc_context(
+ rc={'axes.prop_cycle': mpl.cycler("color", color_tuples)}):
+ barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar")
+ assert color_tuples == [c.get_facecolor() for c in barplot.patches]
+
+ @pytest.mark.parametrize('method', ['line', 'barh', 'bar'])
+ def test_secondary_axis_font_size(self, method):
+ # GH: 12565
+ df = (pd.DataFrame(np.random.randn(15, 2),
+ columns=list('AB'))
+ .assign(C=lambda df: df.B.cumsum())
+ .assign(D=lambda df: df.C * 1.1))
+
+ fontsize = 20
+ sy = ['C', 'D']
+
+ kwargs = dict(secondary_y=sy, fontsize=fontsize,
+ mark_right=True)
+ ax = getattr(df.plot, method)(**kwargs)
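+ # plotting with secondary_y exposes the twin axis as ``ax.right_ax``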
+ self._check_ticks_props(axes=ax.right_ax,
+ ylabelsize=fontsize)
+
+
+def _generate_4_axes_via_gridspec():
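+ # returns the GridSpec and the four axes in the order
+ # [top-left, lower-left, top-right, lower-right]; the sharex/sharey
+ # tests above index into the returned list assuming this ordering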
+ import matplotlib.pyplot as plt
+ import matplotlib as mpl
+ import matplotlib.gridspec # noqa
+
+ gs = mpl.gridspec.GridSpec(2, 2)
+ ax_tl = plt.subplot(gs[0, 0])
+ ax_ll = plt.subplot(gs[1, 0])
+ ax_tr = plt.subplot(gs[0, 1])
+ ax_lr = plt.subplot(gs[1, 1])
+
+ return gs, [ax_tl, ax_ll, ax_tr, ax_lr]
diff --git a/contrib/python/pandas/py2/pandas/tests/plotting/test_groupby.py b/contrib/python/pandas/py2/pandas/tests/plotting/test_groupby.py
new file mode 100644
index 00000000000..5a5ee75928c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/plotting/test_groupby.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+
+""" Test cases for GroupBy.plot """
+
+
+import numpy as np
+
+import pandas.util._test_decorators as td
+
+from pandas import DataFrame, Series
+from pandas.tests.plotting.common import TestPlotBase
+import pandas.util.testing as tm
+
+
+class TestDataFrameGroupByPlots(TestPlotBase):
+
+ def test_series_groupby_plotting_nominally_works(self):
+ n = 10
+ weight = Series(np.random.normal(166, 20, size=n))
+ height = Series(np.random.normal(60, 10, size=n))
+ with tm.RNGContext(42):
+ gender = np.random.choice(['male', 'female'], size=n)
+
+ weight.groupby(gender).plot()
+ tm.close()
+ height.groupby(gender).hist()
+ tm.close()
+ # Regression test for GH8733
+ height.groupby(gender).plot(alpha=0.5)
+ tm.close()
+
+ def test_plotting_with_float_index_works(self):
+ # GH 7025
+ df = DataFrame({'def': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+ 'val': np.random.randn(9)},
+ index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0])
+
+ df.groupby('def')['val'].plot()
+ tm.close()
+ df.groupby('def')['val'].apply(lambda x: x.plot())
+ tm.close()
+
+ def test_hist_single_row(self):
+ # GH10214
+ bins = np.arange(80, 100 + 2, 1)
+ df = DataFrame({"Name": ["AAA", "BBB"],
+ "ByCol": [1, 2],
+ "Mark": [85, 89]})
+ df["Mark"].hist(by=df["ByCol"], bins=bins)
+ df = DataFrame({"Name": ["AAA"], "ByCol": [1], "Mark": [85]})
+ df["Mark"].hist(by=df["ByCol"], bins=bins)
+
+ def test_plot_submethod_works(self):
+ df = DataFrame({'x': [1, 2, 3, 4, 5],
+ 'y': [1, 2, 3, 2, 1],
+ 'z': list('ababa')})
+ df.groupby('z').plot.scatter('x', 'y')
+ tm.close()
+ df.groupby('z')['x'].plot.line()
+ tm.close()
+
+ def test_plot_kwargs(self):
+
+ df = DataFrame({'x': [1, 2, 3, 4, 5],
+ 'y': [1, 2, 3, 2, 1],
+ 'z': list('ababa')})
+
+ res = df.groupby('z').plot(kind='scatter', x='x', y='y')
+ # check that a scatter plot is effectively plotted: the axes should
+ # contain a PathCollection from the scatter plot (GH11805)
+ assert len(res['a'].collections) == 1
+
+ res = df.groupby('z').plot.scatter(x='x', y='y')
+ assert len(res['a'].collections) == 1
diff --git a/contrib/python/pandas/py2/pandas/tests/plotting/test_hist_method.py b/contrib/python/pandas/py2/pandas/tests/plotting/test_hist_method.py
new file mode 100644
index 00000000000..7bdbdac54f7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/plotting/test_hist_method.py
@@ -0,0 +1,439 @@
+# coding: utf-8
+
+""" Test cases for .hist method """
+
+import numpy as np
+from numpy.random import randn
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import DataFrame, Series
+from pandas.tests.plotting.common import TestPlotBase, _check_plot_works
+import pandas.util.testing as tm
+
+from pandas.plotting._compat import _mpl_ge_2_2_0
+from pandas.plotting._core import grouped_hist
+
+
+class TestSeriesPlots(TestPlotBase):
+
+ def setup_method(self, method):
+ TestPlotBase.setup_method(self, method)
+ import matplotlib as mpl
+ mpl.rcdefaults()
+
+ self.ts = tm.makeTimeSeries()
+ self.ts.name = 'ts'
+
+ @pytest.mark.slow
+ def test_hist_legacy(self):
+ _check_plot_works(self.ts.hist)
+ _check_plot_works(self.ts.hist, grid=False)
+ _check_plot_works(self.ts.hist, figsize=(8, 10))
+ # _check_plot_works adds an ax so catch warning. see GH #13188
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(self.ts.hist, by=self.ts.index.month)
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5)
+
+ fig, ax = self.plt.subplots(1, 1)
+ _check_plot_works(self.ts.hist, ax=ax)
+ _check_plot_works(self.ts.hist, ax=ax, figure=fig)
+ _check_plot_works(self.ts.hist, figure=fig)
+ tm.close()
+
+ fig, (ax1, ax2) = self.plt.subplots(1, 2)
+ _check_plot_works(self.ts.hist, figure=fig, ax=ax1)
+ _check_plot_works(self.ts.hist, figure=fig, ax=ax2)
+
+ with pytest.raises(ValueError):
+ self.ts.hist(by=self.ts.index, figure=fig)
+
+ @pytest.mark.slow
+ def test_hist_bins_legacy(self):
+ df = DataFrame(np.random.randn(10, 2))
+ ax = df.hist(bins=2)[0][0]
+ assert len(ax.patches) == 2
+
+ @pytest.mark.slow
+ def test_hist_layout(self):
+ df = self.hist_df
+ with pytest.raises(ValueError):
+ df.height.hist(layout=(1, 1))
+
+ with pytest.raises(ValueError):
+ df.height.hist(layout=[1, 1])
+
+ @pytest.mark.slow
+ def test_hist_layout_with_by(self):
+ df = self.hist_df
+
+ # _check_plot_works adds an `ax` kwarg to the method call
+ # so we get a warning about an axis being cleared, even
+ # though we don't explicitly pass one, see GH #13188
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.height.hist, by=df.gender,
+ layout=(2, 1))
+ self._check_axes_shape(axes, axes_num=2, layout=(2, 1))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.height.hist, by=df.gender,
+ layout=(3, -1))
+ self._check_axes_shape(axes, axes_num=2, layout=(3, 1))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.height.hist, by=df.category,
+ layout=(4, 1))
+ self._check_axes_shape(axes, axes_num=4, layout=(4, 1))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(
+ df.height.hist, by=df.category, layout=(2, -1))
+ self._check_axes_shape(axes, axes_num=4, layout=(2, 2))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(
+ df.height.hist, by=df.category, layout=(3, -1))
+ self._check_axes_shape(axes, axes_num=4, layout=(3, 2))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(
+ df.height.hist, by=df.category, layout=(-1, 4))
+ self._check_axes_shape(axes, axes_num=4, layout=(1, 4))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(
+ df.height.hist, by=df.classroom, layout=(2, 2))
+ self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
+
+ axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7))
+ self._check_axes_shape(
+ axes, axes_num=4, layout=(4, 2), figsize=(12, 7))
+
+ @pytest.mark.slow
+ def test_hist_no_overlap(self):
+ from matplotlib.pyplot import subplot, gcf
+ x = Series(randn(2))
+ y = Series(randn(2))
+ subplot(121)
+ x.hist()
+ subplot(122)
+ y.hist()
+ fig = gcf()
+ axes = fig.axes
+ assert len(axes) == 2
+
+ @pytest.mark.slow
+ def test_hist_by_no_extra_plots(self):
+ df = self.hist_df
+ axes = df.height.hist(by=df.gender) # noqa
+ assert len(self.plt.get_fignums()) == 1
+
+ @pytest.mark.slow
+ def test_plot_fails_when_ax_differs_from_figure(self):
+ from pylab import figure
+ fig1 = figure()
+ fig2 = figure()
+ ax1 = fig1.add_subplot(111)
+ with pytest.raises(AssertionError):
+ self.ts.hist(ax=ax1, figure=fig2)
+
+
+class TestDataFramePlots(TestPlotBase):
+
+ @pytest.mark.slow
+ def test_hist_df_legacy(self):
+ from matplotlib.patches import Rectangle
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(self.hist_df.hist)
+
+ # make sure layout is handled
+ df = DataFrame(randn(100, 3))
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.hist, grid=False)
+ self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
+ assert not axes[1, 1].get_visible()
+
+ df = DataFrame(randn(100, 1))
+ _check_plot_works(df.hist)
+
+ # make sure layout is handled
+ df = DataFrame(randn(100, 6))
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.hist, layout=(4, 2))
+ self._check_axes_shape(axes, axes_num=6, layout=(4, 2))
+
+ # make sure sharex, sharey is handled
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(df.hist, sharex=True, sharey=True)
+
+ # handle figsize arg
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(df.hist, figsize=(8, 10))
+
+ # check bins argument
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(df.hist, bins=5)
+
+ # make sure xlabelsize and xrot are handled
+ ser = df[0]
+ xf, yf = 20, 18
+ xrot, yrot = 30, 40
+ axes = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
+ self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot,
+ ylabelsize=yf, yrot=yrot)
+
+ xf, yf = 20, 18
+ xrot, yrot = 30, 40
+ axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
+ self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot,
+ ylabelsize=yf, yrot=yrot)
+
+ tm.close()
+ # make sure kwargs to hist are handled
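+ # newer matplotlib spells the normalization kwarg ``density``;
+ # older releases used ``normed``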
+ if _mpl_ge_2_2_0():
+ kwargs = {"density": True}
+ else:
+ kwargs = {"normed": True}
+ ax = ser.hist(cumulative=True, bins=4, **kwargs)
+ # height of the last bin must be 1.0
+ rects = [x for x in ax.get_children() if isinstance(x, Rectangle)]
+ tm.assert_almost_equal(rects[-1].get_height(), 1.0)
+
+ tm.close()
+ ax = ser.hist(log=True)
+ # scale of y must be 'log'
+ self._check_ax_scales(ax, yaxis='log')
+
+ tm.close()
+
+ # propagate attr exception from matplotlib.Axes.hist
+ with pytest.raises(AttributeError):
+ ser.hist(foo='bar')
+
+ @pytest.mark.slow
+ def test_hist_layout(self):
+ df = DataFrame(randn(100, 3))
+
+ layout_to_expected_size = (
+ {'layout': None, 'expected_size': (2, 2)}, # default is 2x2
+ {'layout': (2, 2), 'expected_size': (2, 2)},
+ {'layout': (4, 1), 'expected_size': (4, 1)},
+ {'layout': (1, 4), 'expected_size': (1, 4)},
+ {'layout': (3, 3), 'expected_size': (3, 3)},
+ {'layout': (-1, 4), 'expected_size': (1, 4)},
+ {'layout': (4, -1), 'expected_size': (4, 1)},
+ {'layout': (-1, 2), 'expected_size': (2, 2)},
+ {'layout': (2, -1), 'expected_size': (2, 2)}
+ )
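+ # a -1 entry asks pandas to infer that dimension from the number
+ # of plots, hence the expected sizes above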
+
+ for layout_test in layout_to_expected_size:
+ axes = df.hist(layout=layout_test['layout'])
+ expected = layout_test['expected_size']
+ self._check_axes_shape(axes, axes_num=3, layout=expected)
+
+ # layout too small for all 4 plots
+ with pytest.raises(ValueError):
+ df.hist(layout=(1, 1))
+
+ # invalid format for layout
+ with pytest.raises(ValueError):
+ df.hist(layout=(1,))
+ with pytest.raises(ValueError):
+ df.hist(layout=(-1, -1))
+
+ @pytest.mark.slow
+ # GH 9351
+ def test_tight_layout(self):
+ if self.mpl_ge_2_0_1:
+ df = DataFrame(randn(100, 3))
+ _check_plot_works(df.hist)
+ self.plt.tight_layout()
+
+ tm.close()
+
+
+class TestDataFrameGroupByPlots(TestPlotBase):
+
+ @pytest.mark.slow
+ def test_grouped_hist_legacy(self):
+ from matplotlib.patches import Rectangle
+
+ df = DataFrame(randn(500, 2), columns=['A', 'B'])
+ df['C'] = np.random.randint(0, 4, 500)
+ df['D'] = ['X'] * 500
+
+ axes = grouped_hist(df.A, by=df.C)
+ self._check_axes_shape(axes, axes_num=4, layout=(2, 2))
+
+ tm.close()
+ axes = df.hist(by=df.C)
+ self._check_axes_shape(axes, axes_num=4, layout=(2, 2))
+
+ tm.close()
+ # group by a key with a single value
+ axes = df.hist(by='D', rot=30)
+ self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+ self._check_ticks_props(axes, xrot=30)
+
+ tm.close()
+ # make sure kwargs to hist are handled
+ xf, yf = 20, 18
+ xrot, yrot = 30, 40
+
+ if _mpl_ge_2_2_0():
+ kwargs = {"density": True}
+ else:
+ kwargs = {"normed": True}
+
+ axes = grouped_hist(df.A, by=df.C, cumulative=True,
+ bins=4, xlabelsize=xf, xrot=xrot,
+ ylabelsize=yf, yrot=yrot, **kwargs)
+ # height of the last bin must be 1.0
+ for ax in axes.ravel():
+ rects = [x for x in ax.get_children() if isinstance(x, Rectangle)]
+ height = rects[-1].get_height()
+ tm.assert_almost_equal(height, 1.0)
+ self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot,
+ ylabelsize=yf, yrot=yrot)
+
+ tm.close()
+ axes = grouped_hist(df.A, by=df.C, log=True)
+ # scale of y must be 'log'
+ self._check_ax_scales(axes, yaxis='log')
+
+ tm.close()
+ # propagate attr exception from matplotlib.Axes.hist
+ with pytest.raises(AttributeError):
+ grouped_hist(df.A, by=df.C, foo='bar')
+
+ with tm.assert_produces_warning(FutureWarning):
+ df.hist(by='C', figsize='default')
+
+ @pytest.mark.slow
+ def test_grouped_hist_legacy2(self):
+ n = 10
+ weight = Series(np.random.normal(166, 20, size=n))
+ height = Series(np.random.normal(60, 10, size=n))
+ with tm.RNGContext(42):
+ gender_int = np.random.choice([0, 1], size=n)
+ df_int = DataFrame({'height': height, 'weight': weight,
+ 'gender': gender_int})
+ gb = df_int.groupby('gender')
+ axes = gb.hist()
+ assert len(axes) == 2
+ assert len(self.plt.get_fignums()) == 2
+ tm.close()
+
+ @pytest.mark.slow
+ def test_grouped_hist_layout(self):
+ df = self.hist_df
+ pytest.raises(ValueError, df.hist, column='weight', by=df.gender,
+ layout=(1, 1))
+ pytest.raises(ValueError, df.hist, column='height', by=df.category,
+ layout=(1, 3))
+ pytest.raises(ValueError, df.hist, column='height', by=df.category,
+ layout=(-1, -1))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.hist, column='height', by=df.gender,
+ layout=(2, 1))
+ self._check_axes_shape(axes, axes_num=2, layout=(2, 1))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.hist, column='height', by=df.gender,
+ layout=(2, -1))
+ self._check_axes_shape(axes, axes_num=2, layout=(2, 1))
+
+ axes = df.hist(column='height', by=df.category, layout=(4, 1))
+ self._check_axes_shape(axes, axes_num=4, layout=(4, 1))
+
+ axes = df.hist(column='height', by=df.category, layout=(-1, 1))
+ self._check_axes_shape(axes, axes_num=4, layout=(4, 1))
+
+ axes = df.hist(column='height', by=df.category,
+ layout=(4, 2), figsize=(12, 8))
+ self._check_axes_shape(
+ axes, axes_num=4, layout=(4, 2), figsize=(12, 8))
+ tm.close()
+
+ # GH 6769
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(
+ df.hist, column='height', by='classroom', layout=(2, 2))
+ self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
+
+ # without column
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.hist, by='classroom')
+ self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
+
+ axes = df.hist(by='gender', layout=(3, 5))
+ self._check_axes_shape(axes, axes_num=2, layout=(3, 5))
+
+ axes = df.hist(column=['height', 'weight', 'category'])
+ self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
+
+ @pytest.mark.slow
+ def test_grouped_hist_multiple_axes(self):
+ # GH 6970, GH 7069
+ df = self.hist_df
+
+ fig, axes = self.plt.subplots(2, 3)
+ returned = df.hist(column=['height', 'weight', 'category'], ax=axes[0])
+ self._check_axes_shape(returned, axes_num=3, layout=(1, 3))
+ tm.assert_numpy_array_equal(returned, axes[0])
+ assert returned[0].figure is fig
+ returned = df.hist(by='classroom', ax=axes[1])
+ self._check_axes_shape(returned, axes_num=3, layout=(1, 3))
+ tm.assert_numpy_array_equal(returned, axes[1])
+ assert returned[0].figure is fig
+
+ with pytest.raises(ValueError):
+ fig, axes = self.plt.subplots(2, 3)
+ # pass different number of axes from required
+ axes = df.hist(column='height', ax=axes)
+
+ @pytest.mark.slow
+ def test_axis_share_x(self):
+ df = self.hist_df
+ # GH4089
+ ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True)
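+ # matplotlib records axis sharing in the private ``_shared_x_axes``
+ # / ``_shared_y_axes`` Grouper objects queried below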
+
+ # share x
+ assert ax1._shared_x_axes.joined(ax1, ax2)
+ assert ax2._shared_x_axes.joined(ax1, ax2)
+
+ # don't share y
+ assert not ax1._shared_y_axes.joined(ax1, ax2)
+ assert not ax2._shared_y_axes.joined(ax1, ax2)
+
+ @pytest.mark.slow
+ def test_axis_share_y(self):
+ df = self.hist_df
+ ax1, ax2 = df.hist(column='height', by=df.gender, sharey=True)
+
+ # share y
+ assert ax1._shared_y_axes.joined(ax1, ax2)
+ assert ax2._shared_y_axes.joined(ax1, ax2)
+
+ # don't share x
+ assert not ax1._shared_x_axes.joined(ax1, ax2)
+ assert not ax2._shared_x_axes.joined(ax1, ax2)
+
+ @pytest.mark.slow
+ def test_axis_share_xy(self):
+ df = self.hist_df
+ ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True,
+ sharey=True)
+
+ # share both x and y
+ assert ax1._shared_x_axes.joined(ax1, ax2)
+ assert ax2._shared_x_axes.joined(ax1, ax2)
+
+ assert ax1._shared_y_axes.joined(ax1, ax2)
+ assert ax2._shared_y_axes.joined(ax1, ax2)
diff --git a/contrib/python/pandas/py2/pandas/tests/plotting/test_misc.py b/contrib/python/pandas/py2/pandas/tests/plotting/test_misc.py
new file mode 100644
index 00000000000..44b95f7d1b0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/plotting/test_misc.py
@@ -0,0 +1,356 @@
+# coding: utf-8
+
+""" Test cases for misc plot functions """
+
+import numpy as np
+from numpy import random
+from numpy.random import randn
+import pytest
+
+from pandas.compat import lmap
+import pandas.util._test_decorators as td
+
+from pandas import DataFrame
+from pandas.tests.plotting.common import TestPlotBase, _check_plot_works
+import pandas.util.testing as tm
+
+import pandas.plotting as plotting
+
+
+def test_import_error_message():
+ # GH-19810
+ df = DataFrame({"A": [1, 2]})
+
+ with pytest.raises(ImportError, match='matplotlib is required'):
+ df.plot()
+
+
+class TestSeriesPlots(TestPlotBase):
+
+ def setup_method(self, method):
+ TestPlotBase.setup_method(self, method)
+ import matplotlib as mpl
+ mpl.rcdefaults()
+
+ self.ts = tm.makeTimeSeries()
+ self.ts.name = 'ts'
+
+ @pytest.mark.slow
+ def test_autocorrelation_plot(self):
+ from pandas.plotting import autocorrelation_plot
+ _check_plot_works(autocorrelation_plot, series=self.ts)
+ _check_plot_works(autocorrelation_plot, series=self.ts.values)
+
+ ax = autocorrelation_plot(self.ts, label='Test')
+ self._check_legend_labels(ax, labels=['Test'])
+
+ @pytest.mark.slow
+ def test_lag_plot(self):
+ from pandas.plotting import lag_plot
+ _check_plot_works(lag_plot, series=self.ts)
+ _check_plot_works(lag_plot, series=self.ts, lag=5)
+
+ @pytest.mark.slow
+ def test_bootstrap_plot(self):
+ from pandas.plotting import bootstrap_plot
+ _check_plot_works(bootstrap_plot, series=self.ts, size=10)
+
+
+class TestDataFramePlots(TestPlotBase):
+
+ # This XPASSES when tested with mpl == 3.0.1
+ @td.xfail_if_mpl_2_2
+ @td.skip_if_no_scipy
+ def test_scatter_matrix_axis(self):
+ scatter_matrix = plotting.scatter_matrix
+
+ with tm.RNGContext(42):
+ df = DataFrame(randn(100, 3))
+
+ # we are plotting multiples on a sub-plot
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(scatter_matrix, filterwarnings='always',
+ frame=df, range_padding=.1)
+ axes0_labels = axes[0][0].yaxis.get_majorticklabels()
+
+ # GH 5662
+ expected = ['-2', '0', '2']
+ self._check_text_labels(axes0_labels, expected)
+ self._check_ticks_props(
+ axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
+
+ df[0] = ((df[0] - 2) / 3)
+
+ # we are plotting multiples on a sub-plot
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(scatter_matrix, filterwarnings='always',
+ frame=df, range_padding=.1)
+ axes0_labels = axes[0][0].yaxis.get_majorticklabels()
+ expected = ['-1.0', '-0.5', '0.0']
+ self._check_text_labels(axes0_labels, expected)
+ self._check_ticks_props(
+ axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
+
+ @pytest.mark.slow
+ def test_andrews_curves(self, iris):
+ from pandas.plotting import andrews_curves
+ from matplotlib import cm
+
+ df = iris
+
+ _check_plot_works(andrews_curves, frame=df, class_column='Name')
+
+ rgba = ('#556270', '#4ECDC4', '#C7F464')
+ ax = _check_plot_works(andrews_curves, frame=df,
+ class_column='Name', color=rgba)
+ self._check_colors(
+ ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10])
+
+ cnames = ['dodgerblue', 'aquamarine', 'seagreen']
+ ax = _check_plot_works(andrews_curves, frame=df,
+ class_column='Name', color=cnames)
+ self._check_colors(
+ ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10])
+
+ ax = _check_plot_works(andrews_curves, frame=df,
+ class_column='Name', colormap=cm.jet)
+ cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
+ self._check_colors(
+ ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10])
+
+ length = 10
+ df = DataFrame({"A": random.rand(length),
+ "B": random.rand(length),
+ "C": random.rand(length),
+ "Name": ["A"] * length})
+
+ _check_plot_works(andrews_curves, frame=df, class_column='Name')
+
+ rgba = ('#556270', '#4ECDC4', '#C7F464')
+ ax = _check_plot_works(andrews_curves, frame=df,
+ class_column='Name', color=rgba)
+ self._check_colors(
+ ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10])
+
+ cnames = ['dodgerblue', 'aquamarine', 'seagreen']
+ ax = _check_plot_works(andrews_curves, frame=df,
+ class_column='Name', color=cnames)
+ self._check_colors(
+ ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10])
+
+ ax = _check_plot_works(andrews_curves, frame=df,
+ class_column='Name', colormap=cm.jet)
+ cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
+ self._check_colors(
+ ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10])
+
+ colors = ['b', 'g', 'r']
+ df = DataFrame({"A": [1, 2, 3],
+ "B": [1, 2, 3],
+ "C": [1, 2, 3],
+ "Name": colors})
+ ax = andrews_curves(df, 'Name', color=colors)
+ handles, labels = ax.get_legend_handles_labels()
+ self._check_colors(handles, linecolors=colors)
+
+ with tm.assert_produces_warning(FutureWarning):
+ andrews_curves(data=df, class_column='Name')
+
+ @pytest.mark.slow
+ def test_parallel_coordinates(self, iris):
+ from pandas.plotting import parallel_coordinates
+ from matplotlib import cm
+
+ df = iris
+
+ ax = _check_plot_works(parallel_coordinates,
+ frame=df, class_column='Name')
+ nlines = len(ax.get_lines())
+ nxticks = len(ax.xaxis.get_ticklabels())
+
+ rgba = ('#556270', '#4ECDC4', '#C7F464')
+ ax = _check_plot_works(parallel_coordinates,
+ frame=df, class_column='Name', color=rgba)
+ self._check_colors(
+ ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10])
+
+ cnames = ['dodgerblue', 'aquamarine', 'seagreen']
+ ax = _check_plot_works(parallel_coordinates,
+ frame=df, class_column='Name', color=cnames)
+ self._check_colors(
+ ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10])
+
+ ax = _check_plot_works(parallel_coordinates,
+ frame=df, class_column='Name', colormap=cm.jet)
+ cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
+ self._check_colors(
+ ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10])
+
+ ax = _check_plot_works(parallel_coordinates,
+ frame=df, class_column='Name', axvlines=False)
+ assert len(ax.get_lines()) == (nlines - nxticks)
+
+ colors = ['b', 'g', 'r']
+ df = DataFrame({"A": [1, 2, 3],
+ "B": [1, 2, 3],
+ "C": [1, 2, 3],
+ "Name": colors})
+ ax = parallel_coordinates(df, 'Name', color=colors)
+ handles, labels = ax.get_legend_handles_labels()
+ self._check_colors(handles, linecolors=colors)
+
+ with tm.assert_produces_warning(FutureWarning):
+ parallel_coordinates(data=df, class_column='Name')
+ with tm.assert_produces_warning(FutureWarning):
+ parallel_coordinates(df, 'Name', colors=colors)
+
+ # not sure if this is indicative of a problem
+ @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning")
+ def test_parallel_coordinates_with_sorted_labels(self):
+ """ For #15908 """
+ from pandas.plotting import parallel_coordinates
+
+ df = DataFrame({"feat": [i for i in range(30)],
+ "class": [2 for _ in range(10)] +
+ [3 for _ in range(10)] +
+ [1 for _ in range(10)]})
+ ax = parallel_coordinates(df, 'class', sort_labels=True)
+ polylines, labels = ax.get_legend_handles_labels()
+ color_label_tuples = \
+ zip([polyline.get_color() for polyline in polylines], labels)
+ ordered_color_label_tuples = sorted(color_label_tuples,
+ key=lambda x: x[1])
+ prev_next_tuples = zip(ordered_color_label_tuples[:-1],
+ ordered_color_label_tuples[1:])
+ for prev, nxt in prev_next_tuples:
+ # labels and colors are ordered strictly increasing
+ assert prev[1] < nxt[1] and prev[0] < nxt[0]
+
+ @pytest.mark.slow
+ def test_radviz(self, iris):
+ from pandas.plotting import radviz
+ from matplotlib import cm
+
+ df = iris
+ _check_plot_works(radviz, frame=df, class_column='Name')
+
+ rgba = ('#556270', '#4ECDC4', '#C7F464')
+ ax = _check_plot_works(
+ radviz, frame=df, class_column='Name', color=rgba)
+ # skip Circle drawn as ticks
+ patches = [p for p in ax.patches[:20] if p.get_label() != '']
+ self._check_colors(
+ patches[:10], facecolors=rgba, mapping=df['Name'][:10])
+
+ cnames = ['dodgerblue', 'aquamarine', 'seagreen']
+ _check_plot_works(radviz, frame=df, class_column='Name', color=cnames)
+ patches = [p for p in ax.patches[:20] if p.get_label() != '']
+ self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10])
+
+ _check_plot_works(radviz, frame=df,
+ class_column='Name', colormap=cm.jet)
+ cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
+ patches = [p for p in ax.patches[:20] if p.get_label() != '']
+ self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10])
+
+ colors = [[0., 0., 1., 1.],
+ [0., 0.5, 1., 1.],
+ [1., 0., 0., 1.]]
+ df = DataFrame({"A": [1, 2, 3],
+ "B": [2, 1, 3],
+ "C": [3, 2, 1],
+ "Name": ['b', 'g', 'r']})
+ ax = radviz(df, 'Name', color=colors)
+ handles, labels = ax.get_legend_handles_labels()
+ self._check_colors(handles, facecolors=colors)
+
+ @pytest.mark.slow
+ def test_subplot_titles(self, iris):
+ df = iris.drop('Name', axis=1).head()
+ # Use the column names as the subplot titles
+ title = list(df.columns)
+
+ # Case len(title) == len(df)
+ plot = df.plot(subplots=True, title=title)
+ assert [p.get_title() for p in plot] == title
+
+ # Case len(title) > len(df)
+ pytest.raises(ValueError, df.plot, subplots=True,
+ title=title + ["kittens > puppies"])
+
+ # Case len(title) < len(df)
+ pytest.raises(ValueError, df.plot, subplots=True, title=title[:2])
+
+ # Case subplots=False and title is of type list
+ pytest.raises(ValueError, df.plot, subplots=False, title=title)
+
+ # Case df with 3 numeric columns but layout of (2,2)
+ plot = df.drop('SepalWidth', axis=1).plot(subplots=True, layout=(2, 2),
+ title=title[:-1])
+ title_list = [ax.get_title() for sublist in plot for ax in sublist]
+ assert title_list == title[:3] + ['']
+
+ def test_get_standard_colors_random_seed(self):
+ # GH17525
+ df = DataFrame(np.zeros((10, 10)))
+
+ # Make sure that the random seed isn't reset by _get_standard_colors
+ plotting.parallel_coordinates(df, 0)
+ rand1 = random.random()
+ plotting.parallel_coordinates(df, 0)
+ rand2 = random.random()
+ assert rand1 != rand2
+
+ # Make sure it produces the same colors every time it's called
+ from pandas.plotting._style import _get_standard_colors
+ color1 = _get_standard_colors(1, color_type='random')
+ color2 = _get_standard_colors(1, color_type='random')
+ assert color1 == color2
+
+ def test_get_standard_colors_default_num_colors(self):
+ from pandas.plotting._style import _get_standard_colors
+
+ # Make sure the default color_types returns the specified amount
+ color1 = _get_standard_colors(1, color_type='default')
+ color2 = _get_standard_colors(9, color_type='default')
+ color3 = _get_standard_colors(20, color_type='default')
+ assert len(color1) == 1
+ assert len(color2) == 9
+ assert len(color3) == 20
+
+ def test_plot_single_color(self):
+ # Example from #20585. All 3 bars should have the same color
+ df = DataFrame({'account-start': ['2017-02-03', '2017-03-03',
+ '2017-01-01'],
+ 'client': ['Alice Anders', 'Bob Baker',
+ 'Charlie Chaplin'],
+ 'balance': [-1432.32, 10.43, 30000.00],
+ 'db-id': [1234, 2424, 251],
+ 'proxy-id': [525, 1525, 2542],
+ 'rank': [52, 525, 32],
+ })
+ ax = df.client.value_counts().plot.bar()
+ colors = lmap(lambda rect: rect.get_facecolor(),
+ ax.get_children()[0:3])
+ assert all(color == colors[0] for color in colors)
+
+ def test_get_standard_colors_no_appending(self):
+ # GH20726
+
+ # Make sure not to add more colors so that matplotlib can cycle
+ # correctly.
+ from matplotlib import cm
+ color_before = cm.gnuplot(range(5))
+ color_after = plotting._style._get_standard_colors(
+ 1, color=color_before)
+ assert len(color_after) == len(color_before)
+
+ df = DataFrame(np.random.randn(48, 4), columns=list("ABCD"))
+
+ color_list = cm.gnuplot(np.linspace(0, 1, 16))
+ p = df.A.plot.bar(figsize=(16, 7), color=color_list)
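+ # 16 colors cycling over 48 bars: bars 16 positions apart (e.g.
+ # patches 1 and 17) must end up with the same face color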
+ assert (p.patches[1].get_facecolor()
+ == p.patches[17].get_facecolor())
diff --git a/contrib/python/pandas/py2/pandas/tests/plotting/test_series.py b/contrib/python/pandas/py2/pandas/tests/plotting/test_series.py
new file mode 100644
index 00000000000..a234ea8f941
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/plotting/test_series.py
@@ -0,0 +1,903 @@
+# coding: utf-8
+
+""" Test cases for Series.plot """
+
+
+from datetime import datetime
+from itertools import chain
+
+import numpy as np
+from numpy.random import randn
+import pytest
+
+from pandas.compat import lrange, range
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import DataFrame, Series, date_range
+from pandas.tests.plotting.common import (
+ TestPlotBase, _check_plot_works, _ok_for_gaussian_kde,
+ _skip_if_no_scipy_gaussian_kde)
+import pandas.util.testing as tm
+
+import pandas.plotting as plotting
+
+
+class TestSeriesPlots(TestPlotBase):
+
+ def setup_method(self, method):
+ TestPlotBase.setup_method(self, method)
+ import matplotlib as mpl
+ mpl.rcdefaults()
+
+ self.ts = tm.makeTimeSeries()
+ self.ts.name = 'ts'
+
+ self.series = tm.makeStringSeries()
+ self.series.name = 'series'
+
+ self.iseries = tm.makePeriodSeries()
+ self.iseries.name = 'iseries'
+
+ @pytest.mark.slow
+ def test_plot(self):
+ _check_plot_works(self.ts.plot, label='foo')
+ _check_plot_works(self.ts.plot, use_index=False)
+ axes = _check_plot_works(self.ts.plot, rot=0)
+ self._check_ticks_props(axes, xrot=0)
+
+ ax = _check_plot_works(self.ts.plot, style='.', logy=True)
+ self._check_ax_scales(ax, yaxis='log')
+
+ ax = _check_plot_works(self.ts.plot, style='.', logx=True)
+ self._check_ax_scales(ax, xaxis='log')
+
+ ax = _check_plot_works(self.ts.plot, style='.', loglog=True)
+ self._check_ax_scales(ax, xaxis='log', yaxis='log')
+
+ _check_plot_works(self.ts[:10].plot.bar)
+ _check_plot_works(self.ts.plot.area, stacked=False)
+ _check_plot_works(self.iseries.plot)
+
+ for kind in ['line', 'bar', 'barh', 'kde', 'hist', 'box']:
+ if not _ok_for_gaussian_kde(kind):
+ continue
+ _check_plot_works(self.series[:5].plot, kind=kind)
+
+ _check_plot_works(self.series[:10].plot.barh)
+ ax = _check_plot_works(Series(randn(10)).plot.bar, color='black')
+ self._check_colors([ax.patches[0]], facecolors=['black'])
+
+ # GH 6951
+ ax = _check_plot_works(self.ts.plot, subplots=True)
+ self._check_axes_shape(ax, axes_num=1, layout=(1, 1))
+
+ ax = _check_plot_works(self.ts.plot, subplots=True, layout=(-1, 1))
+ self._check_axes_shape(ax, axes_num=1, layout=(1, 1))
+ ax = _check_plot_works(self.ts.plot, subplots=True, layout=(1, -1))
+ self._check_axes_shape(ax, axes_num=1, layout=(1, 1))
+
+ @pytest.mark.slow
+ def test_plot_figsize_and_title(self):
+ # figsize and title
+ _, ax = self.plt.subplots()
+ ax = self.series.plot(title='Test', figsize=(16, 8), ax=ax)
+ self._check_text_labels(ax.title, 'Test')
+ self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8))
+
+ def test_dont_modify_rcParams(self):
+ # GH 8242
+ key = 'axes.prop_cycle'
+ colors = self.plt.rcParams[key]
+ _, ax = self.plt.subplots()
+ Series([1, 2, 3]).plot(ax=ax)
+ assert colors == self.plt.rcParams[key]
+
+ def test_ts_line_lim(self):
+ fig, ax = self.plt.subplots()
+ ax = self.ts.plot(ax=ax)
+ xmin, xmax = ax.get_xlim()
+ lines = ax.get_lines()
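+ # get_data(orig=False) returns the unit-converted values actually
+ # plotted, which is what the axis limits are computed from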
+ assert xmin <= lines[0].get_data(orig=False)[0][0]
+ assert xmax >= lines[0].get_data(orig=False)[0][-1]
+ tm.close()
+
+ ax = self.ts.plot(secondary_y=True, ax=ax)
+ xmin, xmax = ax.get_xlim()
+ lines = ax.get_lines()
+ assert xmin <= lines[0].get_data(orig=False)[0][0]
+ assert xmax >= lines[0].get_data(orig=False)[0][-1]
+
+ def test_ts_area_lim(self):
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.area(stacked=False, ax=ax)
+ xmin, xmax = ax.get_xlim()
+ line = ax.get_lines()[0].get_data(orig=False)[0]
+ assert xmin <= line[0]
+ assert xmax >= line[-1]
+ tm.close()
+
+ # GH 7471
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.area(stacked=False, x_compat=True, ax=ax)
+ xmin, xmax = ax.get_xlim()
+ line = ax.get_lines()[0].get_data(orig=False)[0]
+ assert xmin <= line[0]
+ assert xmax >= line[-1]
+ tm.close()
+
+ tz_ts = self.ts.copy()
+ tz_ts.index = tz_ts.tz_localize('GMT').tz_convert('CET')
+ _, ax = self.plt.subplots()
+ ax = tz_ts.plot.area(stacked=False, x_compat=True, ax=ax)
+ xmin, xmax = ax.get_xlim()
+ line = ax.get_lines()[0].get_data(orig=False)[0]
+ assert xmin <= line[0]
+ assert xmax >= line[-1]
+ tm.close()
+
+ _, ax = self.plt.subplots()
+ ax = tz_ts.plot.area(stacked=False, secondary_y=True, ax=ax)
+ xmin, xmax = ax.get_xlim()
+ line = ax.get_lines()[0].get_data(orig=False)[0]
+ assert xmin <= line[0]
+ assert xmax >= line[-1]
+
+ def test_label(self):
+ s = Series([1, 2])
+ _, ax = self.plt.subplots()
+ ax = s.plot(label='LABEL', legend=True, ax=ax)
+ self._check_legend_labels(ax, labels=['LABEL'])
+ self.plt.close()
+ _, ax = self.plt.subplots()
+ ax = s.plot(legend=True, ax=ax)
+ self._check_legend_labels(ax, labels=['None'])
+ self.plt.close()
+ # the legend label falls back to the series name
+ s.name = 'NAME'
+ _, ax = self.plt.subplots()
+ ax = s.plot(legend=True, ax=ax)
+ self._check_legend_labels(ax, labels=['NAME'])
+ self.plt.close()
+ # override the default
+ _, ax = self.plt.subplots()
+ ax = s.plot(legend=True, label='LABEL', ax=ax)
+ self._check_legend_labels(ax, labels=['LABEL'])
+ self.plt.close()
+ # Add label info, but don't draw
+ _, ax = self.plt.subplots()
+ ax = s.plot(legend=False, label='LABEL', ax=ax)
+ assert ax.get_legend() is None # Hasn't been drawn
+ ax.legend() # draw it
+ self._check_legend_labels(ax, labels=['LABEL'])
+
+ def test_line_area_nan_series(self):
+ values = [1, 2, np.nan, 3]
+ s = Series(values)
+ ts = Series(values, index=tm.makeDateIndex(k=4))
+
+ for d in [s, ts]:
+ ax = _check_plot_works(d.plot)
+ masked = ax.lines[0].get_ydata()
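+ # line plots return a masked array: the NaN entry stays masked so
+ # the drawn line breaks instead of interpolating through it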
+ # remove nan for comparison purposes
+ exp = np.array([1, 2, 3], dtype=np.float64)
+ tm.assert_numpy_array_equal(np.delete(masked.data, 2), exp)
+ tm.assert_numpy_array_equal(
+ masked.mask, np.array([False, False, True, False]))
+
+ expected = np.array([1, 2, 0, 3], dtype=np.float64)
+ ax = _check_plot_works(d.plot, stacked=True)
+ tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected)
+ ax = _check_plot_works(d.plot.area)
+ tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected)
+ ax = _check_plot_works(d.plot.area, stacked=False)
+ tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected)
+
+ def test_line_use_index_false(self):
+ s = Series([1, 2, 3], index=['a', 'b', 'c'])
+ s.index.name = 'The Index'
+ _, ax = self.plt.subplots()
+ ax = s.plot(use_index=False, ax=ax)
+ label = ax.get_xlabel()
+ assert label == ''
+ _, ax = self.plt.subplots()
+ ax2 = s.plot.bar(use_index=False, ax=ax)
+ label2 = ax2.get_xlabel()
+ assert label2 == ''
+
+ @pytest.mark.slow
+ def test_bar_log(self):
+ expected = np.array([1e-1, 1e0, 1e1, 1e2, 1e3, 1e4])
+
+ _, ax = self.plt.subplots()
+ ax = Series([200, 500]).plot.bar(log=True, ax=ax)
+ tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected)
+ tm.close()
+
+ _, ax = self.plt.subplots()
+ ax = Series([200, 500]).plot.barh(log=True, ax=ax)
+ tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected)
+ tm.close()
+
+ # GH 9905
+ expected = np.array([1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1])
+
+ _, ax = self.plt.subplots()
+ ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar', ax=ax)
+ ymin = 0.0007943282347242822
+ ymax = 0.12589254117941673
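+ # i.e. 10 ** -3.1 and 10 ** -0.9: the data range [1e-3, 1e-1]
+ # padded on the log scale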
+ res = ax.get_ylim()
+ tm.assert_almost_equal(res[0], ymin)
+ tm.assert_almost_equal(res[1], ymax)
+ tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected)
+ tm.close()
+
+ _, ax = self.plt.subplots()
+ ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh', ax=ax)
+ res = ax.get_xlim()
+ tm.assert_almost_equal(res[0], ymin)
+ tm.assert_almost_equal(res[1], ymax)
+ tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected)
+
+ @pytest.mark.slow
+ def test_bar_ignore_index(self):
+ df = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
+ _, ax = self.plt.subplots()
+ ax = df.plot.bar(use_index=False, ax=ax)
+ self._check_text_labels(ax.get_xticklabels(), ['0', '1', '2', '3'])
+
+ def test_bar_user_colors(self):
+ s = Series([1, 2, 3, 4])
+ ax = s.plot.bar(color=['red', 'blue', 'blue', 'red'])
+ result = [p.get_facecolor() for p in ax.patches]
+ expected = [(1., 0., 0., 1.),
+ (0., 0., 1., 1.),
+ (0., 0., 1., 1.),
+ (1., 0., 0., 1.)]
+ assert result == expected
+
+ def test_rotation(self):
+ df = DataFrame(randn(5, 5))
+ # Default rot 0
+ _, ax = self.plt.subplots()
+ axes = df.plot(ax=ax)
+ self._check_ticks_props(axes, xrot=0)
+
+ _, ax = self.plt.subplots()
+ axes = df.plot(rot=30, ax=ax)
+ self._check_ticks_props(axes, xrot=30)
+
+ def test_irregular_datetime(self):
+ rng = date_range('1/1/2000', '3/1/2000')
+ rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]]
+ ser = Series(randn(len(rng)), rng)
+ _, ax = self.plt.subplots()
+ ax = ser.plot(ax=ax)
+ xp = datetime(1999, 1, 1).toordinal()
+ ax.set_xlim('1/1/1999', '1/1/2001')
+ assert xp == ax.get_xlim()[0]
+
+ def test_unsorted_index_xlim(self):
+ ser = Series([0., 1., np.nan, 3., 4., 5., 6.],
+ index=[1., 0., 3., 2., np.nan, 3., 2.])
+ _, ax = self.plt.subplots()
+ ax = ser.plot(ax=ax)
+ xmin, xmax = ax.get_xlim()
+ lines = ax.get_lines()
+ assert xmin <= np.nanmin(lines[0].get_data(orig=False)[0])
+ assert xmax >= np.nanmax(lines[0].get_data(orig=False)[0])
+
+ @pytest.mark.slow
+ def test_pie_series(self):
+ # if the sum of values is less than 1.0, pie handles them as rates
+ # and draws a semicircle.
+ series = Series(np.random.randint(1, 5),
+ index=['a', 'b', 'c', 'd', 'e'], name='YLABEL')
+ ax = _check_plot_works(series.plot.pie)
+ self._check_text_labels(ax.texts, series.index)
+ assert ax.get_ylabel() == 'YLABEL'
+
+ # without wedge labels
+ ax = _check_plot_works(series.plot.pie, labels=None)
+ self._check_text_labels(ax.texts, [''] * 5)
+
+ # with less colors than elements
+ color_args = ['r', 'g', 'b']
+ ax = _check_plot_works(series.plot.pie, colors=color_args)
+
+ color_expected = ['r', 'g', 'b', 'r', 'g']
+ self._check_colors(ax.patches, facecolors=color_expected)
+
+ # with labels and colors
+ labels = ['A', 'B', 'C', 'D', 'E']
+ color_args = ['r', 'g', 'b', 'c', 'm']
+ ax = _check_plot_works(series.plot.pie, labels=labels,
+ colors=color_args)
+ self._check_text_labels(ax.texts, labels)
+ self._check_colors(ax.patches, facecolors=color_args)
+
+ # with autopct and fontsize
+ ax = _check_plot_works(series.plot.pie, colors=color_args,
+ autopct='%.2f', fontsize=7)
+ pcts = ['{0:.2f}'.format(s * 100)
+ for s in series.values / float(series.sum())]
+ expected_texts = list(chain.from_iterable(zip(series.index, pcts)))
+ self._check_text_labels(ax.texts, expected_texts)
+ for t in ax.texts:
+ assert t.get_fontsize() == 7
+
+ # includes negative value
+ with pytest.raises(ValueError):
+ series = Series([1, 2, 0, 4, -1], index=['a', 'b', 'c', 'd', 'e'])
+ series.plot.pie()
+
+ # includes nan
+ series = Series([1, 2, np.nan, 4], index=['a', 'b', 'c', 'd'],
+ name='YLABEL')
+ ax = _check_plot_works(series.plot.pie)
+ self._check_text_labels(ax.texts, ['a', 'b', '', 'd'])
+
+ def test_pie_nan(self):
+ s = Series([1, np.nan, 1, 1])
+ _, ax = self.plt.subplots()
+ ax = s.plot.pie(legend=True, ax=ax)
+ expected = ['0', '', '2', '3']
+ result = [x.get_text() for x in ax.texts]
+ assert result == expected
+
+ @pytest.mark.slow
+ def test_hist_df_kwargs(self):
+ df = DataFrame(np.random.randn(10, 2))
+ _, ax = self.plt.subplots()
+ ax = df.plot.hist(bins=5, ax=ax)
+ assert len(ax.patches) == 10
+
+ @pytest.mark.slow
+ def test_hist_df_with_nonnumerics(self):
+ # GH 9853
+ with tm.RNGContext(1):
+ df = DataFrame(
+ np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
+ df['E'] = ['x', 'y'] * 5
+ _, ax = self.plt.subplots()
+ ax = df.plot.hist(bins=5, ax=ax)
+ assert len(ax.patches) == 20
+
+ _, ax = self.plt.subplots()
+ ax = df.plot.hist(ax=ax) # bins=10
+ assert len(ax.patches) == 40
+
+ @pytest.mark.slow
+ def test_hist_legacy(self):
+ _check_plot_works(self.ts.hist)
+ _check_plot_works(self.ts.hist, grid=False)
+ _check_plot_works(self.ts.hist, figsize=(8, 10))
+ # _check_plot_works adds an ax so catch warning. see GH #13188
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(self.ts.hist,
+ by=self.ts.index.month)
+ with tm.assert_produces_warning(UserWarning):
+ _check_plot_works(self.ts.hist,
+ by=self.ts.index.month, bins=5)
+
+ fig, ax = self.plt.subplots(1, 1)
+ _check_plot_works(self.ts.hist, ax=ax)
+ _check_plot_works(self.ts.hist, ax=ax, figure=fig)
+ _check_plot_works(self.ts.hist, figure=fig)
+ tm.close()
+
+ fig, (ax1, ax2) = self.plt.subplots(1, 2)
+ _check_plot_works(self.ts.hist, figure=fig, ax=ax1)
+ _check_plot_works(self.ts.hist, figure=fig, ax=ax2)
+
+ with pytest.raises(ValueError):
+ self.ts.hist(by=self.ts.index, figure=fig)
+
+ @pytest.mark.slow
+ def test_hist_bins_legacy(self):
+ df = DataFrame(np.random.randn(10, 2))
+ ax = df.hist(bins=2)[0][0]
+ assert len(ax.patches) == 2
+
+ @pytest.mark.slow
+ def test_hist_layout(self):
+ df = self.hist_df
+ with pytest.raises(ValueError):
+ df.height.hist(layout=(1, 1))
+
+ with pytest.raises(ValueError):
+ df.height.hist(layout=[1, 1])
+
+ @pytest.mark.slow
+ def test_hist_layout_with_by(self):
+ df = self.hist_df
+
+ # _check_plot_works adds an ax so catch warning. see GH #13188
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.height.hist,
+ by=df.gender, layout=(2, 1))
+ self._check_axes_shape(axes, axes_num=2, layout=(2, 1))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.height.hist,
+ by=df.gender, layout=(3, -1))
+ self._check_axes_shape(axes, axes_num=2, layout=(3, 1))
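+        # a -1 entry in layout is a free dimension that pandas infers from
+        # the number of plots: here 2 plots with layout=(3, -1) resolve to
+        # (3, 1), as asserted above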
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.height.hist,
+ by=df.category, layout=(4, 1))
+ self._check_axes_shape(axes, axes_num=4, layout=(4, 1))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.height.hist,
+ by=df.category, layout=(2, -1))
+ self._check_axes_shape(axes, axes_num=4, layout=(2, 2))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.height.hist,
+ by=df.category, layout=(3, -1))
+ self._check_axes_shape(axes, axes_num=4, layout=(3, 2))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.height.hist,
+ by=df.category, layout=(-1, 4))
+ self._check_axes_shape(axes, axes_num=4, layout=(1, 4))
+
+ with tm.assert_produces_warning(UserWarning):
+ axes = _check_plot_works(df.height.hist,
+ by=df.classroom, layout=(2, 2))
+ self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
+
+ axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7))
+ self._check_axes_shape(axes, axes_num=4, layout=(4, 2),
+ figsize=(12, 7))
+
+ @pytest.mark.slow
+ def test_hist_no_overlap(self):
+ from matplotlib.pyplot import subplot, gcf
+ x = Series(randn(2))
+ y = Series(randn(2))
+ subplot(121)
+ x.hist()
+ subplot(122)
+ y.hist()
+ fig = gcf()
+ axes = fig.axes
+ assert len(axes) == 2
+
+ @pytest.mark.slow
+ def test_hist_secondary_legend(self):
+ # GH 9610
+ df = DataFrame(np.random.randn(30, 4), columns=list('abcd'))
+
+ # primary -> secondary
+ _, ax = self.plt.subplots()
+ ax = df['a'].plot.hist(legend=True, ax=ax)
+ df['b'].plot.hist(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on the left ax
+        # left and right axes must be visible
+ self._check_legend_labels(ax, labels=['a', 'b (right)'])
+ assert ax.get_yaxis().get_visible()
+ assert ax.right_ax.get_yaxis().get_visible()
+ tm.close()
+
+ # secondary -> secondary
+ _, ax = self.plt.subplots()
+ ax = df['a'].plot.hist(legend=True, secondary_y=True, ax=ax)
+ df['b'].plot.hist(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on the left ax
+ # left axis must be invisible, right axis must be visible
+ self._check_legend_labels(ax.left_ax,
+ labels=['a (right)', 'b (right)'])
+ assert not ax.left_ax.get_yaxis().get_visible()
+ assert ax.get_yaxis().get_visible()
+ tm.close()
+
+ # secondary -> primary
+ _, ax = self.plt.subplots()
+ ax = df['a'].plot.hist(legend=True, secondary_y=True, ax=ax)
+ # right axes is returned
+ df['b'].plot.hist(ax=ax, legend=True)
+        # both legends are drawn on the left ax
+        # left and right axes must be visible
+ self._check_legend_labels(ax.left_ax, labels=['a (right)', 'b'])
+ assert ax.left_ax.get_yaxis().get_visible()
+ assert ax.get_yaxis().get_visible()
+ tm.close()
+
+ @pytest.mark.slow
+ def test_df_series_secondary_legend(self):
+ # GH 9779
+ df = DataFrame(np.random.randn(30, 3), columns=list('abc'))
+ s = Series(np.random.randn(30), name='x')
+
+ # primary -> secondary (without passing ax)
+ _, ax = self.plt.subplots()
+ ax = df.plot(ax=ax)
+ s.plot(legend=True, secondary_y=True, ax=ax)
+        # both legends are drawn on the left ax
+        # left and right axes must be visible
+ self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)'])
+ assert ax.get_yaxis().get_visible()
+ assert ax.right_ax.get_yaxis().get_visible()
+ tm.close()
+
+ # primary -> secondary (with passing ax)
+ _, ax = self.plt.subplots()
+ ax = df.plot(ax=ax)
+ s.plot(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on the left ax
+        # left and right axes must be visible
+ self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)'])
+ assert ax.get_yaxis().get_visible()
+ assert ax.right_ax.get_yaxis().get_visible()
+ tm.close()
+
+        # secondary -> secondary (without passing ax)
+ _, ax = self.plt.subplots()
+ ax = df.plot(secondary_y=True, ax=ax)
+ s.plot(legend=True, secondary_y=True, ax=ax)
+        # both legends are drawn on the left ax
+ # left axis must be invisible and right axis must be visible
+ expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)']
+ self._check_legend_labels(ax.left_ax, labels=expected)
+ assert not ax.left_ax.get_yaxis().get_visible()
+ assert ax.get_yaxis().get_visible()
+ tm.close()
+
+ # secondary -> secondary (with passing ax)
+ _, ax = self.plt.subplots()
+ ax = df.plot(secondary_y=True, ax=ax)
+ s.plot(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on the left ax
+ # left axis must be invisible and right axis must be visible
+ expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)']
+ self._check_legend_labels(ax.left_ax, expected)
+ assert not ax.left_ax.get_yaxis().get_visible()
+ assert ax.get_yaxis().get_visible()
+ tm.close()
+
+        # secondary -> secondary (with mark_right=False)
+ _, ax = self.plt.subplots()
+ ax = df.plot(secondary_y=True, mark_right=False, ax=ax)
+ s.plot(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on the left ax
+ # left axis must be invisible and right axis must be visible
+ expected = ['a', 'b', 'c', 'x (right)']
+ self._check_legend_labels(ax.left_ax, expected)
+ assert not ax.left_ax.get_yaxis().get_visible()
+ assert ax.get_yaxis().get_visible()
+ tm.close()
+
+ @pytest.mark.slow
+ def test_secondary_logy(self):
+ # GH 25545
+ s1 = Series(np.random.randn(30))
+ s2 = Series(np.random.randn(30))
+
+ ax1 = s1.plot(logy=True)
+ ax2 = s2.plot(secondary_y=True, logy=True)
+
+ assert ax1.get_yscale() == 'log'
+ assert ax2.get_yscale() == 'log'
+
+ @pytest.mark.slow
+ def test_plot_fails_with_dupe_color_and_style(self):
+ x = Series(randn(2))
+ with pytest.raises(ValueError):
+ _, ax = self.plt.subplots()
+ x.plot(style='k--', color='k', ax=ax)
+
+ @pytest.mark.slow
+ @td.skip_if_no_scipy
+ def test_hist_kde(self):
+
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.hist(logy=True, ax=ax)
+ self._check_ax_scales(ax, yaxis='log')
+ xlabels = ax.get_xticklabels()
+ # ticks are values, thus ticklabels are blank
+ self._check_text_labels(xlabels, [''] * len(xlabels))
+ ylabels = ax.get_yticklabels()
+ self._check_text_labels(ylabels, [''] * len(ylabels))
+
+ _skip_if_no_scipy_gaussian_kde()
+ _check_plot_works(self.ts.plot.kde)
+ _check_plot_works(self.ts.plot.density)
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.kde(logy=True, ax=ax)
+ self._check_ax_scales(ax, yaxis='log')
+ xlabels = ax.get_xticklabels()
+ self._check_text_labels(xlabels, [''] * len(xlabels))
+ ylabels = ax.get_yticklabels()
+ self._check_text_labels(ylabels, [''] * len(ylabels))
+
+ @pytest.mark.slow
+ @td.skip_if_no_scipy
+ def test_kde_kwargs(self):
+ _skip_if_no_scipy_gaussian_kde()
+
+ sample_points = np.linspace(-100, 100, 20)
+ _check_plot_works(self.ts.plot.kde, bw_method='scott', ind=20)
+ _check_plot_works(self.ts.plot.kde, bw_method=None, ind=20)
+ _check_plot_works(self.ts.plot.kde, bw_method=None, ind=np.int(20))
+ _check_plot_works(self.ts.plot.kde, bw_method=.5, ind=sample_points)
+ _check_plot_works(self.ts.plot.density, bw_method=.5,
+ ind=sample_points)
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.kde(logy=True, bw_method=.5, ind=sample_points,
+ ax=ax)
+ self._check_ax_scales(ax, yaxis='log')
+ self._check_text_labels(ax.yaxis.get_label(), 'Density')
+
+ @pytest.mark.slow
+ @td.skip_if_no_scipy
+ def test_kde_missing_vals(self):
+ _skip_if_no_scipy_gaussian_kde()
+
+ s = Series(np.random.uniform(size=50))
+ s[0] = np.nan
+ axes = _check_plot_works(s.plot.kde)
+
+        # gh-14821: check that the plotted values are not all missing
+ assert any(~np.isnan(axes.lines[0].get_xdata()))
+
+ @pytest.mark.slow
+ def test_hist_kwargs(self):
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.hist(bins=5, ax=ax)
+ assert len(ax.patches) == 5
+ self._check_text_labels(ax.yaxis.get_label(), 'Frequency')
+ tm.close()
+
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.hist(orientation='horizontal', ax=ax)
+ self._check_text_labels(ax.xaxis.get_label(), 'Frequency')
+ tm.close()
+
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.hist(align='left', stacked=True, ax=ax)
+ tm.close()
+
+ @pytest.mark.slow
+ @td.skip_if_no_scipy
+ def test_hist_kde_color(self):
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.hist(logy=True, bins=10, color='b', ax=ax)
+ self._check_ax_scales(ax, yaxis='log')
+ assert len(ax.patches) == 10
+ self._check_colors(ax.patches, facecolors=['b'] * 10)
+
+ _skip_if_no_scipy_gaussian_kde()
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.kde(logy=True, color='r', ax=ax)
+ self._check_ax_scales(ax, yaxis='log')
+ lines = ax.get_lines()
+ assert len(lines) == 1
+ self._check_colors(lines, ['r'])
+
+ @pytest.mark.slow
+ def test_boxplot_series(self):
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.box(logy=True, ax=ax)
+ self._check_ax_scales(ax, yaxis='log')
+ xlabels = ax.get_xticklabels()
+ self._check_text_labels(xlabels, [self.ts.name])
+ ylabels = ax.get_yticklabels()
+ self._check_text_labels(ylabels, [''] * len(ylabels))
+
+ @pytest.mark.slow
+ def test_kind_both_ways(self):
+ s = Series(range(3))
+ kinds = (plotting._core._common_kinds +
+ plotting._core._series_kinds)
+ _, ax = self.plt.subplots()
+ for kind in kinds:
+ if not _ok_for_gaussian_kde(kind):
+ continue
+ s.plot(kind=kind, ax=ax)
+ getattr(s.plot, kind)()
+
+ @pytest.mark.slow
+ def test_invalid_plot_data(self):
+ s = Series(list('abcd'))
+ _, ax = self.plt.subplots()
+ for kind in plotting._core._common_kinds:
+ if not _ok_for_gaussian_kde(kind):
+ continue
+ with pytest.raises(TypeError):
+ s.plot(kind=kind, ax=ax)
+
+ @pytest.mark.slow
+ def test_valid_object_plot(self):
+ s = Series(lrange(10), dtype=object)
+ for kind in plotting._core._common_kinds:
+ if not _ok_for_gaussian_kde(kind):
+ continue
+ _check_plot_works(s.plot, kind=kind)
+
+ def test_partially_invalid_plot_data(self):
+ s = Series(['a', 'b', 1.0, 2])
+ _, ax = self.plt.subplots()
+ for kind in plotting._core._common_kinds:
+ if not _ok_for_gaussian_kde(kind):
+ continue
+ with pytest.raises(TypeError):
+ s.plot(kind=kind, ax=ax)
+
+ def test_invalid_kind(self):
+ s = Series([1, 2])
+ with pytest.raises(ValueError):
+ s.plot(kind='aasdf')
+
+ @pytest.mark.slow
+ def test_dup_datetime_index_plot(self):
+ dr1 = date_range('1/1/2009', periods=4)
+ dr2 = date_range('1/2/2009', periods=4)
+ index = dr1.append(dr2)
+ values = randn(index.size)
+ s = Series(values, index=index)
+ _check_plot_works(s.plot)
+
+ @pytest.mark.slow
+ def test_errorbar_plot(self):
+
+ s = Series(np.arange(10), name='x')
+ s_err = np.random.randn(10)
+ d_err = DataFrame(randn(10, 2), index=s.index, columns=['x', 'y'])
+ # test line and bar plots
+ kinds = ['line', 'bar']
+ for kind in kinds:
+ ax = _check_plot_works(s.plot, yerr=Series(s_err), kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+ ax = _check_plot_works(s.plot, yerr=s_err, kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+ ax = _check_plot_works(s.plot, yerr=s_err.tolist(), kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+ ax = _check_plot_works(s.plot, yerr=d_err, kind=kind)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+ ax = _check_plot_works(s.plot, xerr=0.2, yerr=0.2, kind=kind)
+ self._check_has_errorbars(ax, xerr=1, yerr=1)
+
+ ax = _check_plot_works(s.plot, xerr=s_err)
+ self._check_has_errorbars(ax, xerr=1, yerr=0)
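+        # yerr/xerr accept a Series, ndarray, list, or DataFrame; a scalar
+        # such as the 0.2 above is broadcast to every point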
+
+ # test time series plotting
+ ix = date_range('1/1/2000', '1/1/2001', freq='M')
+ ts = Series(np.arange(12), index=ix, name='x')
+ ts_err = Series(np.random.randn(12), index=ix)
+ td_err = DataFrame(randn(12, 2), index=ix, columns=['x', 'y'])
+
+ ax = _check_plot_works(ts.plot, yerr=ts_err)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+ ax = _check_plot_works(ts.plot, yerr=td_err)
+ self._check_has_errorbars(ax, xerr=0, yerr=1)
+
+ # check incorrect lengths and types
+ with pytest.raises(ValueError):
+ s.plot(yerr=np.arange(11))
+
+ s_err = ['zzz'] * 10
+ # MPL > 2.0.0 will most likely use TypeError here
+ with pytest.raises((TypeError, ValueError)):
+ s.plot(yerr=s_err)
+
+ # This XPASSES when tested with mpl == 3.0.1
+ @td.xfail_if_mpl_2_2
+ def test_table(self):
+ _check_plot_works(self.series.plot, table=True)
+ _check_plot_works(self.series.plot, table=self.series)
+
+ @pytest.mark.slow
+ def test_series_grid_settings(self):
+ # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792
+ self._check_grid_settings(Series([1, 2, 3]),
+ plotting._core._series_kinds +
+ plotting._core._common_kinds)
+
+ @pytest.mark.slow
+ def test_standard_colors(self):
+ from pandas.plotting._style import _get_standard_colors
+
+ for c in ['r', 'red', 'green', '#FF0000']:
+ result = _get_standard_colors(1, color=c)
+ assert result == [c]
+
+ result = _get_standard_colors(1, color=[c])
+ assert result == [c]
+
+ result = _get_standard_colors(3, color=c)
+ assert result == [c] * 3
+
+ result = _get_standard_colors(3, color=[c])
+ assert result == [c] * 3
+
+ @pytest.mark.slow
+ def test_standard_colors_all(self):
+ import matplotlib.colors as colors
+ from pandas.plotting._style import _get_standard_colors
+
+        # full color names like mediumaquamarine
+ for c in colors.cnames:
+ result = _get_standard_colors(num_colors=1, color=c)
+ assert result == [c]
+
+ result = _get_standard_colors(num_colors=1, color=[c])
+ assert result == [c]
+
+ result = _get_standard_colors(num_colors=3, color=c)
+ assert result == [c] * 3
+
+ result = _get_standard_colors(num_colors=3, color=[c])
+ assert result == [c] * 3
+
+ # single letter colors like k
+ for c in colors.ColorConverter.colors:
+ result = _get_standard_colors(num_colors=1, color=c)
+ assert result == [c]
+
+ result = _get_standard_colors(num_colors=1, color=[c])
+ assert result == [c]
+
+ result = _get_standard_colors(num_colors=3, color=c)
+ assert result == [c] * 3
+
+ result = _get_standard_colors(num_colors=3, color=[c])
+ assert result == [c] * 3
+
+ def test_series_plot_color_kwargs(self):
+ # GH1890
+ _, ax = self.plt.subplots()
+ ax = Series(np.arange(12) + 1).plot(color='green', ax=ax)
+ self._check_colors(ax.get_lines(), linecolors=['green'])
+
+ def test_time_series_plot_color_kwargs(self):
+ # #1890
+ _, ax = self.plt.subplots()
+ ax = Series(np.arange(12) + 1, index=date_range(
+ '1/1/2000', periods=12)).plot(color='green', ax=ax)
+ self._check_colors(ax.get_lines(), linecolors=['green'])
+
+ def test_time_series_plot_color_with_empty_kwargs(self):
+ import matplotlib as mpl
+
+ def_colors = self._unpack_cycler(mpl.rcParams)
+ index = date_range('1/1/2000', periods=12)
+ s = Series(np.arange(1, 13), index=index)
+
+ ncolors = 3
+
+ _, ax = self.plt.subplots()
+ for i in range(ncolors):
+ ax = s.plot(ax=ax)
+ self._check_colors(ax.get_lines(), linecolors=def_colors[:ncolors])
+
+ def test_xticklabels(self):
+ # GH11529
+ s = Series(np.arange(10), index=['P%02d' % i for i in range(10)])
+ _, ax = self.plt.subplots()
+ ax = s.plot(xticks=[0, 3, 5, 9], ax=ax)
+ exp = ['P%02d' % i for i in [0, 3, 5, 9]]
+ self._check_text_labels(ax.get_xticklabels(), exp)
+
+ def test_custom_business_day_freq(self):
+ # GH7222
+ from pandas.tseries.offsets import CustomBusinessDay
+ s = Series(range(100, 121), index=pd.bdate_range(
+ start='2014-05-01', end='2014-06-01',
+ freq=CustomBusinessDay(holidays=['2014-05-26'])))
+
+ _check_plot_works(s.plot)
+
+ @pytest.mark.xfail
+ def test_plot_accessor_updates_on_inplace(self):
+ s = Series([1, 2, 3, 4])
+ _, ax = self.plt.subplots()
+ ax = s.plot(ax=ax)
+ before = ax.xaxis.get_ticklocs()
+
+ s.drop([0, 1], inplace=True)
+ _, ax = self.plt.subplots()
+ after = ax.xaxis.get_ticklocs()
+ tm.assert_numpy_array_equal(before, after)
diff --git a/contrib/python/pandas/py2/pandas/tests/reductions/__init__.py b/contrib/python/pandas/py2/pandas/tests/reductions/__init__.py
new file mode 100644
index 00000000000..e3851753b67
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reductions/__init__.py
@@ -0,0 +1,4 @@
+"""
+Tests for reductions where we want to test for matching behavior across
+Array, Index, Series, and DataFrame methods.
+"""
diff --git a/contrib/python/pandas/py2/pandas/tests/reductions/test_reductions.py b/contrib/python/pandas/py2/pandas/tests/reductions/test_reductions.py
new file mode 100644
index 00000000000..8520855d149
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reductions/test_reductions.py
@@ -0,0 +1,1159 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, DatetimeIndex, Index, NaT, Period, PeriodIndex,
+ RangeIndex, Series, Timedelta, TimedeltaIndex, Timestamp, compat, isna,
+ timedelta_range, to_timedelta)
+from pandas.core import nanops
+import pandas.util.testing as tm
+
+
+def get_objs():
+ indexes = [
+ tm.makeBoolIndex(10, name='a'),
+ tm.makeIntIndex(10, name='a'),
+ tm.makeFloatIndex(10, name='a'),
+ tm.makeDateIndex(10, name='a'),
+ tm.makeDateIndex(10, name='a').tz_localize(tz='US/Eastern'),
+ tm.makePeriodIndex(10, name='a'),
+ tm.makeStringIndex(10, name='a'),
+ tm.makeUnicodeIndex(10, name='a')
+ ]
+
+ arr = np.random.randn(10)
+ series = [Series(arr, index=idx, name='a') for idx in indexes]
+
+ objs = indexes + series
+ return objs
+
+
+objs = get_objs()
+
+
+class TestReductions(object):
+
+ @pytest.mark.parametrize('opname', ['max', 'min'])
+ @pytest.mark.parametrize('obj', objs)
+ def test_ops(self, opname, obj):
+ result = getattr(obj, opname)()
+ if not isinstance(obj, PeriodIndex):
+ expected = getattr(obj.values, opname)()
+ else:
+ expected = pd.Period(
+ ordinal=getattr(obj._ndarray_values, opname)(),
+ freq=obj.freq)
+ try:
+ assert result == expected
+ except TypeError:
+ # comparing tz-aware series with np.array results in
+ # TypeError
+ expected = expected.astype('M8[ns]').astype('int64')
+ assert result.value == expected
+
+ def test_nanops(self):
+ # GH#7261
+ for opname in ['max', 'min']:
+ for klass in [Index, Series]:
+ arg_op = 'arg' + opname if klass is Index else 'idx' + opname
+
+ obj = klass([np.nan, 2.0])
+ assert getattr(obj, opname)() == 2.0
+
+ obj = klass([np.nan])
+ assert pd.isna(getattr(obj, opname)())
+ assert pd.isna(getattr(obj, opname)(skipna=False))
+
+ obj = klass([])
+ assert pd.isna(getattr(obj, opname)())
+ assert pd.isna(getattr(obj, opname)(skipna=False))
+
+ obj = klass([pd.NaT, datetime(2011, 11, 1)])
+ # check DatetimeIndex monotonic path
+ assert getattr(obj, opname)() == datetime(2011, 11, 1)
+ assert getattr(obj, opname)(skipna=False) is pd.NaT
+
+ assert getattr(obj, arg_op)() == 1
+ result = getattr(obj, arg_op)(skipna=False)
+ if klass is Series:
+ assert np.isnan(result)
+ else:
+ assert result == -1
+
+ obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT])
+ # check DatetimeIndex non-monotonic path
+                assert getattr(obj, opname)() == datetime(2011, 11, 1)
+ assert getattr(obj, opname)(skipna=False) is pd.NaT
+
+ assert getattr(obj, arg_op)() == 1
+ result = getattr(obj, arg_op)(skipna=False)
+ if klass is Series:
+ assert np.isnan(result)
+ else:
+ assert result == -1
+
+ for dtype in ["M8[ns]", "datetime64[ns, UTC]"]:
+ # cases with empty Series/DatetimeIndex
+ obj = klass([], dtype=dtype)
+
+ assert getattr(obj, opname)() is pd.NaT
+ assert getattr(obj, opname)(skipna=False) is pd.NaT
+
+ with pytest.raises(ValueError, match="empty sequence"):
+ getattr(obj, arg_op)()
+ with pytest.raises(ValueError, match="empty sequence"):
+ getattr(obj, arg_op)(skipna=False)
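+                    # i.e. min()/max() degrade gracefully to NaT on empty
+                    # datetime-like containers, while argmin()/argmax()
+                    # raise ValueError, e.g. DatetimeIndex([]).argmin()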
+
+ # argmin/max
+ obj = Index(np.arange(5, dtype='int64'))
+ assert obj.argmin() == 0
+ assert obj.argmax() == 4
+
+ obj = Index([np.nan, 1, np.nan, 2])
+ assert obj.argmin() == 1
+ assert obj.argmax() == 3
+ assert obj.argmin(skipna=False) == -1
+ assert obj.argmax(skipna=False) == -1
+
+ obj = Index([np.nan])
+ assert obj.argmin() == -1
+ assert obj.argmax() == -1
+ assert obj.argmin(skipna=False) == -1
+ assert obj.argmax(skipna=False) == -1
+
+ obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2),
+ pd.NaT])
+ assert obj.argmin() == 1
+ assert obj.argmax() == 2
+ assert obj.argmin(skipna=False) == -1
+ assert obj.argmax(skipna=False) == -1
+
+ obj = Index([pd.NaT])
+ assert obj.argmin() == -1
+ assert obj.argmax() == -1
+ assert obj.argmin(skipna=False) == -1
+ assert obj.argmax(skipna=False) == -1
+
+ @pytest.mark.parametrize('op, expected_col', [
+ ['max', 'a'], ['min', 'b']
+ ])
+ def test_same_tz_min_max_axis_1(self, op, expected_col):
+ # GH 10390
+ df = DataFrame(pd.date_range('2016-01-01 00:00:00', periods=3,
+ tz='UTC'),
+ columns=['a'])
+ df['b'] = df.a.subtract(pd.Timedelta(seconds=3600))
+ result = getattr(df, op)(axis=1)
+ expected = df[expected_col]
+ tm.assert_series_equal(result, expected)
+
+
+class TestIndexReductions(object):
+ # Note: the name TestIndexReductions indicates these tests
+    # were moved from an Index-specific test file, _not_ that these tests are
+ # intended long-term to be Index-specific
+
+ @pytest.mark.parametrize('start,stop,step',
+ [(0, 400, 3), (500, 0, -6), (-10**6, 10**6, 4),
+ (10**6, -10**6, -4), (0, 10, 20)])
+ def test_max_min_range(self, start, stop, step):
+ # GH#17607
+ idx = RangeIndex(start, stop, step)
+ expected = idx._int64index.max()
+ result = idx.max()
+ assert result == expected
+
+ # skipna should be irrelevant since RangeIndex should never have NAs
+ result2 = idx.max(skipna=False)
+ assert result2 == expected
+
+ expected = idx._int64index.min()
+ result = idx.min()
+ assert result == expected
+
+ # skipna should be irrelevant since RangeIndex should never have NAs
+ result2 = idx.min(skipna=False)
+ assert result2 == expected
+
+ # empty
+ idx = RangeIndex(start, stop, -step)
+ assert isna(idx.max())
+ assert isna(idx.min())
+
+ def test_minmax_timedelta64(self):
+
+ # monotonic
+ idx1 = TimedeltaIndex(['1 days', '2 days', '3 days'])
+ assert idx1.is_monotonic
+
+ # non-monotonic
+ idx2 = TimedeltaIndex(['1 days', np.nan, '3 days', 'NaT'])
+ assert not idx2.is_monotonic
+
+ for idx in [idx1, idx2]:
+ assert idx.min() == Timedelta('1 days')
+ assert idx.max() == Timedelta('3 days')
+ assert idx.argmin() == 0
+ assert idx.argmax() == 2
+
+ for op in ['min', 'max']:
+ # Return NaT
+ obj = TimedeltaIndex([])
+ assert pd.isna(getattr(obj, op)())
+
+ obj = TimedeltaIndex([pd.NaT])
+ assert pd.isna(getattr(obj, op)())
+
+ obj = TimedeltaIndex([pd.NaT, pd.NaT, pd.NaT])
+ assert pd.isna(getattr(obj, op)())
+
+ def test_numpy_minmax_timedelta64(self):
+ td = timedelta_range('16815 days', '16820 days', freq='D')
+
+ assert np.min(td) == Timedelta('16815 days')
+ assert np.max(td) == Timedelta('16820 days')
+
+ errmsg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=errmsg):
+ np.min(td, out=0)
+ with pytest.raises(ValueError, match=errmsg):
+ np.max(td, out=0)
+
+ assert np.argmin(td) == 0
+ assert np.argmax(td) == 5
+
+ errmsg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=errmsg):
+ np.argmin(td, out=0)
+ with pytest.raises(ValueError, match=errmsg):
+ np.argmax(td, out=0)
+
+ def test_timedelta_ops(self):
+ # GH#4984
+ # make sure ops return Timedelta
+ s = Series([Timestamp('20130101') + timedelta(seconds=i * i)
+ for i in range(10)])
+ td = s.diff()
+
+ result = td.mean()
+ expected = to_timedelta(timedelta(seconds=9))
+ assert result == expected
+
+ result = td.to_frame().mean()
+ assert result[0] == expected
+
+ result = td.quantile(.1)
+ expected = Timedelta(np.timedelta64(2600, 'ms'))
+ assert result == expected
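+        # (the nine diffs are 1, 3, ..., 17 seconds, so the 10th percentile
+        # interpolates to 1 + 0.8 * (3 - 1) = 2.6 seconds)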
+
+ result = td.median()
+ expected = to_timedelta('00:00:09')
+ assert result == expected
+
+ result = td.to_frame().median()
+ assert result[0] == expected
+
+ # GH#6462
+ # consistency in returned values for sum
+ result = td.sum()
+ expected = to_timedelta('00:01:21')
+ assert result == expected
+
+ result = td.to_frame().sum()
+ assert result[0] == expected
+
+ # std
+ result = td.std()
+ expected = to_timedelta(Series(td.dropna().values).std())
+ assert result == expected
+
+ result = td.to_frame().std()
+ assert result[0] == expected
+
+ # invalid ops
+ for op in ['skew', 'kurt', 'sem', 'prod']:
+ pytest.raises(TypeError, getattr(td, op))
+
+ # GH#10040
+ # make sure NaT is properly handled by median()
+ s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07')])
+ assert s.diff().median() == timedelta(days=4)
+
+ s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'),
+ Timestamp('2015-02-15')])
+ assert s.diff().median() == timedelta(days=6)
+
+ def test_minmax_tz(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ # monotonic
+ idx1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02',
+ '2011-01-03'], tz=tz)
+ assert idx1.is_monotonic
+
+ # non-monotonic
+ idx2 = pd.DatetimeIndex(['2011-01-01', pd.NaT, '2011-01-03',
+ '2011-01-02', pd.NaT], tz=tz)
+ assert not idx2.is_monotonic
+
+ for idx in [idx1, idx2]:
+ assert idx.min() == Timestamp('2011-01-01', tz=tz)
+ assert idx.max() == Timestamp('2011-01-03', tz=tz)
+ assert idx.argmin() == 0
+ assert idx.argmax() == 2
+
+ @pytest.mark.parametrize('op', ['min', 'max'])
+ def test_minmax_nat_datetime64(self, op):
+ # Return NaT
+ obj = DatetimeIndex([])
+ assert pd.isna(getattr(obj, op)())
+
+ obj = DatetimeIndex([pd.NaT])
+ assert pd.isna(getattr(obj, op)())
+
+ obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT])
+ assert pd.isna(getattr(obj, op)())
+
+ def test_numpy_minmax_datetime64(self):
+ dr = pd.date_range(start='2016-01-15', end='2016-01-20')
+
+ assert np.min(dr) == Timestamp('2016-01-15 00:00:00', freq='D')
+ assert np.max(dr) == Timestamp('2016-01-20 00:00:00', freq='D')
+
+ errmsg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=errmsg):
+ np.min(dr, out=0)
+
+ with pytest.raises(ValueError, match=errmsg):
+ np.max(dr, out=0)
+
+ assert np.argmin(dr) == 0
+ assert np.argmax(dr) == 5
+
+ errmsg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=errmsg):
+ np.argmin(dr, out=0)
+
+ with pytest.raises(ValueError, match=errmsg):
+ np.argmax(dr, out=0)
+
+ def test_minmax_period(self):
+
+ # monotonic
+ idx1 = pd.PeriodIndex([NaT, '2011-01-01', '2011-01-02',
+ '2011-01-03'], freq='D')
+ assert idx1.is_monotonic
+
+ # non-monotonic
+ idx2 = pd.PeriodIndex(['2011-01-01', NaT, '2011-01-03',
+ '2011-01-02', NaT], freq='D')
+ assert not idx2.is_monotonic
+
+ for idx in [idx1, idx2]:
+ assert idx.min() == pd.Period('2011-01-01', freq='D')
+ assert idx.max() == pd.Period('2011-01-03', freq='D')
+ assert idx1.argmin() == 1
+ assert idx2.argmin() == 0
+ assert idx1.argmax() == 3
+ assert idx2.argmax() == 2
+
+ for op in ['min', 'max']:
+ # Return NaT
+ obj = PeriodIndex([], freq='M')
+ result = getattr(obj, op)()
+ assert result is NaT
+
+ obj = PeriodIndex([NaT], freq='M')
+ result = getattr(obj, op)()
+ assert result is NaT
+
+ obj = PeriodIndex([NaT, NaT, NaT], freq='M')
+ result = getattr(obj, op)()
+ assert result is NaT
+
+ def test_numpy_minmax_period(self):
+ pr = pd.period_range(start='2016-01-15', end='2016-01-20')
+
+ assert np.min(pr) == Period('2016-01-15', freq='D')
+ assert np.max(pr) == Period('2016-01-20', freq='D')
+
+ errmsg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=errmsg):
+ np.min(pr, out=0)
+ with pytest.raises(ValueError, match=errmsg):
+ np.max(pr, out=0)
+
+ assert np.argmin(pr) == 0
+ assert np.argmax(pr) == 5
+
+ errmsg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=errmsg):
+ np.argmin(pr, out=0)
+ with pytest.raises(ValueError, match=errmsg):
+ np.argmax(pr, out=0)
+
+ def test_min_max_categorical(self):
+
+ ci = pd.CategoricalIndex(list('aabbca'),
+ categories=list('cab'),
+ ordered=False)
+ with pytest.raises(TypeError):
+ ci.min()
+ with pytest.raises(TypeError):
+ ci.max()
+
+ ci = pd.CategoricalIndex(list('aabbca'),
+ categories=list('cab'),
+ ordered=True)
+ assert ci.min() == 'c'
+ assert ci.max() == 'b'
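+        # with ordered=True, comparisons follow the categories order
+        # ('c' < 'a' < 'b'), not lexicographic order, hence min 'c', max 'b'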
+
+
+class TestSeriesReductions(object):
+ # Note: the name TestSeriesReductions indicates these tests
+ # were moved from a series-specific test file, _not_ that these tests are
+ # intended long-term to be series-specific
+
+ def test_sum_inf(self):
+ s = Series(np.random.randn(10))
+ s2 = s.copy()
+
+ s[5:8] = np.inf
+ s2[5:8] = np.nan
+
+ assert np.isinf(s.sum())
+
+ arr = np.random.randn(100, 100).astype('f4')
+ arr[:, 2] = np.inf
+
+ with pd.option_context("mode.use_inf_as_na", True):
+ tm.assert_almost_equal(s.sum(), s2.sum())
+
+ res = nanops.nansum(arr, axis=1)
+ assert np.isinf(res).all()
+
+ @pytest.mark.parametrize("use_bottleneck", [True, False])
+ @pytest.mark.parametrize("method, unit", [
+ ("sum", 0.0),
+ ("prod", 1.0)
+ ])
+ def test_empty(self, method, unit, use_bottleneck):
+ with pd.option_context("use_bottleneck", use_bottleneck):
+ # GH#9422 / GH#18921
+ # Entirely empty
+ s = Series([])
+            # unit by default (GH#9422: empty sum/prod return the unit)
+ result = getattr(s, method)()
+ assert result == unit
+
+ # Explicit
+ result = getattr(s, method)(min_count=0)
+ assert result == unit
+
+ result = getattr(s, method)(min_count=1)
+ assert pd.isna(result)
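+            # min_count is the number of valid values required before a
+            # result is returned; for example (illustrative):
+            #   Series([]).sum(min_count=0) -> 0.0
+            #   Series([]).sum(min_count=1) -> nan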
+
+ # Skipna, default
+ result = getattr(s, method)(skipna=True)
+            assert result == unit
+
+ # Skipna, explicit
+ result = getattr(s, method)(skipna=True, min_count=0)
+ assert result == unit
+
+ result = getattr(s, method)(skipna=True, min_count=1)
+ assert pd.isna(result)
+
+ # All-NA
+ s = Series([np.nan])
+            # unit by default
+ result = getattr(s, method)()
+ assert result == unit
+
+ # Explicit
+ result = getattr(s, method)(min_count=0)
+ assert result == unit
+
+ result = getattr(s, method)(min_count=1)
+ assert pd.isna(result)
+
+ # Skipna, default
+ result = getattr(s, method)(skipna=True)
+            assert result == unit
+
+ # skipna, explicit
+ result = getattr(s, method)(skipna=True, min_count=0)
+ assert result == unit
+
+ result = getattr(s, method)(skipna=True, min_count=1)
+ assert pd.isna(result)
+
+ # Mix of valid, empty
+ s = Series([np.nan, 1])
+ # Default
+ result = getattr(s, method)()
+ assert result == 1.0
+
+ # Explicit
+ result = getattr(s, method)(min_count=0)
+ assert result == 1.0
+
+ result = getattr(s, method)(min_count=1)
+ assert result == 1.0
+
+ # Skipna
+ result = getattr(s, method)(skipna=True)
+ assert result == 1.0
+
+ result = getattr(s, method)(skipna=True, min_count=0)
+ assert result == 1.0
+
+ result = getattr(s, method)(skipna=True, min_count=1)
+ assert result == 1.0
+
+ # GH#844 (changed in GH#9422)
+ df = DataFrame(np.empty((10, 0)))
+ assert (getattr(df, method)(1) == unit).all()
+
+ s = pd.Series([1])
+ result = getattr(s, method)(min_count=2)
+ assert pd.isna(result)
+
+ s = pd.Series([np.nan])
+ result = getattr(s, method)(min_count=2)
+ assert pd.isna(result)
+
+ s = pd.Series([np.nan, 1])
+ result = getattr(s, method)(min_count=2)
+ assert pd.isna(result)
+
+ @pytest.mark.parametrize('method, unit', [
+ ('sum', 0.0),
+ ('prod', 1.0),
+ ])
+ def test_empty_multi(self, method, unit):
+ s = pd.Series([1, np.nan, np.nan, np.nan],
+ index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)]))
+ # 1 / 0 by default
+ result = getattr(s, method)(level=0)
+ expected = pd.Series([1, unit], index=['a', 'b'])
+ tm.assert_series_equal(result, expected)
+
+ # min_count=0
+ result = getattr(s, method)(level=0, min_count=0)
+ expected = pd.Series([1, unit], index=['a', 'b'])
+ tm.assert_series_equal(result, expected)
+
+ # min_count=1
+ result = getattr(s, method)(level=0, min_count=1)
+ expected = pd.Series([1, np.nan], index=['a', 'b'])
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "method", ['mean', 'median', 'std', 'var'])
+ def test_ops_consistency_on_empty(self, method):
+
+ # GH#7869
+ # consistency on empty
+
+ # float
+ result = getattr(Series(dtype=float), method)()
+ assert pd.isna(result)
+
+ # timedelta64[ns]
+ result = getattr(Series(dtype='m8[ns]'), method)()
+ assert result is pd.NaT
+
+ def test_nansum_buglet(self):
+ ser = Series([1.0, np.nan], index=[0, 1])
+ result = np.nansum(ser)
+ tm.assert_almost_equal(result, 1)
+
+ @pytest.mark.parametrize("use_bottleneck", [True, False])
+ def test_sum_overflow(self, use_bottleneck):
+
+ with pd.option_context('use_bottleneck', use_bottleneck):
+ # GH#6915
+ # overflowing on the smaller int dtypes
+ for dtype in ['int32', 'int64']:
+ v = np.arange(5000000, dtype=dtype)
+ s = Series(v)
+
+ result = s.sum(skipna=False)
+ assert int(result) == v.sum(dtype='int64')
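+                # the accumulation is done in int64, so even the int32
+                # input sums to 5000000 * 4999999 // 2 = 12499997500000
+                # without wrapping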
+ result = s.min(skipna=False)
+ assert int(result) == 0
+ result = s.max(skipna=False)
+ assert int(result) == v[-1]
+
+ for dtype in ['float32', 'float64']:
+ v = np.arange(5000000, dtype=dtype)
+ s = Series(v)
+
+ result = s.sum(skipna=False)
+ assert result == v.sum(dtype=dtype)
+ result = s.min(skipna=False)
+ assert np.allclose(float(result), 0.0)
+ result = s.max(skipna=False)
+ assert np.allclose(float(result), v[-1])
+
+ def test_empty_timeseries_reductions_return_nat(self):
+ # covers GH#11245
+        for dtype in ('m8[ns]', 'M8[ns]', 'M8[ns, UTC]'):
+ assert Series([], dtype=dtype).min() is pd.NaT
+ assert Series([], dtype=dtype).max() is pd.NaT
+ assert Series([], dtype=dtype).min(skipna=False) is pd.NaT
+ assert Series([], dtype=dtype).max(skipna=False) is pd.NaT
+
+ def test_numpy_argmin_deprecated(self):
+ # See GH#16830
+ data = np.arange(1, 11)
+
+ s = Series(data, index=data)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # The deprecation of Series.argmin also causes a deprecation
+ # warning when calling np.argmin. This behavior is temporary
+ # until the implementation of Series.argmin is corrected.
+ result = np.argmin(s)
+
+ assert result == 1
+
+ with tm.assert_produces_warning(FutureWarning):
+ # argmin is aliased to idxmin
+ result = s.argmin()
+
+ assert result == 1
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.argmin(s, out=data)
+
+ def test_numpy_argmax_deprecated(self):
+ # See GH#16830
+ data = np.arange(1, 11)
+
+ s = Series(data, index=data)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # The deprecation of Series.argmax also causes a deprecation
+ # warning when calling np.argmax. This behavior is temporary
+ # until the implementation of Series.argmax is corrected.
+ result = np.argmax(s)
+ assert result == 10
+
+ with tm.assert_produces_warning(FutureWarning):
+ # argmax is aliased to idxmax
+ result = s.argmax()
+
+ assert result == 10
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.argmax(s, out=data)
+
+ def test_idxmin(self):
+ # test idxmin
+        # the _check_stat_op approach cannot be used here because of the
+        # isna check.
+ string_series = tm.makeStringSeries().rename('series')
+
+ # add some NaNs
+ string_series[5:15] = np.NaN
+
+ # skipna or no
+ assert string_series[string_series.idxmin()] == string_series.min()
+ assert pd.isna(string_series.idxmin(skipna=False))
+
+ # no NaNs
+ nona = string_series.dropna()
+ assert nona[nona.idxmin()] == nona.min()
+ assert (nona.index.values.tolist().index(nona.idxmin()) ==
+ nona.values.argmin())
+
+ # all NaNs
+ allna = string_series * np.nan
+ assert pd.isna(allna.idxmin())
+
+ # datetime64[ns]
+ s = Series(pd.date_range('20130102', periods=6))
+ result = s.idxmin()
+ assert result == 0
+
+ s[0] = np.nan
+ result = s.idxmin()
+ assert result == 1
+
+ def test_idxmax(self):
+ # test idxmax
+        # the _check_stat_op approach cannot be used here because of the
+        # isna check.
+ string_series = tm.makeStringSeries().rename('series')
+
+ # add some NaNs
+ string_series[5:15] = np.NaN
+
+ # skipna or no
+ assert string_series[string_series.idxmax()] == string_series.max()
+ assert pd.isna(string_series.idxmax(skipna=False))
+
+ # no NaNs
+ nona = string_series.dropna()
+ assert nona[nona.idxmax()] == nona.max()
+ assert (nona.index.values.tolist().index(nona.idxmax()) ==
+ nona.values.argmax())
+
+ # all NaNs
+ allna = string_series * np.nan
+ assert pd.isna(allna.idxmax())
+
+ from pandas import date_range
+ s = Series(date_range('20130102', periods=6))
+ result = s.idxmax()
+ assert result == 5
+
+ s[5] = np.nan
+ result = s.idxmax()
+ assert result == 4
+
+ # Float64Index
+ # GH#5914
+ s = pd.Series([1, 2, 3], [1.1, 2.1, 3.1])
+ result = s.idxmax()
+ assert result == 3.1
+ result = s.idxmin()
+ assert result == 1.1
+
+ s = pd.Series(s.index, s.index)
+ result = s.idxmax()
+ assert result == 3.1
+ result = s.idxmin()
+ assert result == 1.1
+
+ def test_all_any(self):
+ ts = tm.makeTimeSeries()
+ bool_series = ts > 0
+ assert not bool_series.all()
+ assert bool_series.any()
+
+ # Alternative types, with implicit 'object' dtype.
+ s = Series(['abc', True])
+ assert 'abc' == s.any() # 'abc' || True => 'abc'
+
+ def test_all_any_params(self):
+ # Check skipna, with implicit 'object' dtype.
+ s1 = Series([np.nan, True])
+ s2 = Series([np.nan, False])
+ assert s1.all(skipna=False) # nan && True => True
+ assert s1.all(skipna=True)
+ assert np.isnan(s2.any(skipna=False)) # nan || False => nan
+ assert not s2.any(skipna=True)
+
+ # Check level.
+ s = pd.Series([False, False, True, True, False, True],
+ index=[0, 0, 1, 1, 2, 2])
+ tm.assert_series_equal(s.all(level=0), Series([False, True, False]))
+ tm.assert_series_equal(s.any(level=0), Series([False, True, True]))
+
+ # bool_only is not implemented with level option.
+ with pytest.raises(NotImplementedError):
+ s.any(bool_only=True, level=0)
+ with pytest.raises(NotImplementedError):
+ s.all(bool_only=True, level=0)
+
+ # bool_only is not implemented alone.
+ with pytest.raises(NotImplementedError):
+ s.any(bool_only=True,)
+ with pytest.raises(NotImplementedError):
+ s.all(bool_only=True)
+
+ def test_timedelta64_analytics(self):
+
+ # index min/max
+ dti = pd.date_range('2012-1-1', periods=3, freq='D')
+ td = Series(dti) - pd.Timestamp('20120101')
+
+ result = td.idxmin()
+ assert result == 0
+
+ result = td.idxmax()
+ assert result == 2
+
+ # GH#2982
+ # with NaT
+ td[0] = np.nan
+
+ result = td.idxmin()
+ assert result == 1
+
+ result = td.idxmax()
+ assert result == 2
+
+ # abs
+ s1 = Series(pd.date_range('20120101', periods=3))
+ s2 = Series(pd.date_range('20120102', periods=3))
+ expected = Series(s2 - s1)
+
+ # FIXME: don't leave commented-out code
+ # this fails as numpy returns timedelta64[us]
+ # result = np.abs(s1-s2)
+ # assert_frame_equal(result,expected)
+
+ result = (s1 - s2).abs()
+ tm.assert_series_equal(result, expected)
+
+ # max/min
+ result = td.max()
+ expected = pd.Timedelta('2 days')
+ assert result == expected
+
+ result = td.min()
+ expected = pd.Timedelta('1 days')
+ assert result == expected
+
+ @pytest.mark.parametrize(
+ "test_input,error_type",
+ [
+ (pd.Series([]), ValueError),
+
+ # For strings, or any Series with dtype 'O'
+ (pd.Series(['foo', 'bar', 'baz']), TypeError),
+ (pd.Series([(1,), (2,)]), TypeError),
+
+ # For mixed data types
+ (
+ pd.Series(['foo', 'foo', 'bar', 'bar', None, np.nan, 'baz']),
+ TypeError
+ ),
+ ]
+ )
+ def test_assert_idxminmax_raises(self, test_input, error_type):
+ """
+ Cases where ``Series.argmax`` and related should raise an exception
+ """
+ with pytest.raises(error_type):
+ test_input.idxmin()
+ with pytest.raises(error_type):
+ test_input.idxmin(skipna=False)
+ with pytest.raises(error_type):
+ test_input.idxmax()
+ with pytest.raises(error_type):
+ test_input.idxmax(skipna=False)
+
+ def test_idxminmax_with_inf(self):
+ # For numeric data with NA and Inf (GH #13595)
+ s = pd.Series([0, -np.inf, np.inf, np.nan])
+
+ assert s.idxmin() == 1
+ assert np.isnan(s.idxmin(skipna=False))
+
+ assert s.idxmax() == 2
+ assert np.isnan(s.idxmax(skipna=False))
+
+ # Using old-style behavior that treats floating point nan, -inf, and
+ # +inf as missing
+ with pd.option_context('mode.use_inf_as_na', True):
+ assert s.idxmin() == 0
+ assert np.isnan(s.idxmin(skipna=False))
+ assert s.idxmax() == 0
+            assert np.isnan(s.idxmax(skipna=False))
+
+
+class TestDatetime64SeriesReductions(object):
+ # Note: the name TestDatetime64SeriesReductions indicates these tests
+ # were moved from a series-specific test file, _not_ that these tests are
+ # intended long-term to be series-specific
+
+ @pytest.mark.parametrize('nat_ser', [
+ Series([pd.NaT, pd.NaT]),
+ Series([pd.NaT, pd.Timedelta('nat')]),
+ Series([pd.Timedelta('nat'), pd.Timedelta('nat')])])
+ def test_minmax_nat_series(self, nat_ser):
+ # GH#23282
+ assert nat_ser.min() is pd.NaT
+ assert nat_ser.max() is pd.NaT
+ assert nat_ser.min(skipna=False) is pd.NaT
+ assert nat_ser.max(skipna=False) is pd.NaT
+
+ @pytest.mark.parametrize('nat_df', [
+ pd.DataFrame([pd.NaT, pd.NaT]),
+ pd.DataFrame([pd.NaT, pd.Timedelta('nat')]),
+ pd.DataFrame([pd.Timedelta('nat'), pd.Timedelta('nat')])])
+ def test_minmax_nat_dataframe(self, nat_df):
+ # GH#23282
+ assert nat_df.min()[0] is pd.NaT
+ assert nat_df.max()[0] is pd.NaT
+ assert nat_df.min(skipna=False)[0] is pd.NaT
+ assert nat_df.max(skipna=False)[0] is pd.NaT
+
+ def test_min_max(self):
+ rng = pd.date_range('1/1/2000', '12/31/2000')
+ rng2 = rng.take(np.random.permutation(len(rng)))
+
+ the_min = rng2.min()
+ the_max = rng2.max()
+ assert isinstance(the_min, pd.Timestamp)
+ assert isinstance(the_max, pd.Timestamp)
+ assert the_min == rng[0]
+ assert the_max == rng[-1]
+
+ assert rng.min() == rng[0]
+ assert rng.max() == rng[-1]
+
+ def test_min_max_series(self):
+ rng = pd.date_range('1/1/2000', periods=10, freq='4h')
+ lvls = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C']
+ df = DataFrame({'TS': rng, 'V': np.random.randn(len(rng)), 'L': lvls})
+
+ result = df.TS.max()
+ exp = pd.Timestamp(df.TS.iat[-1])
+ assert isinstance(result, pd.Timestamp)
+ assert result == exp
+
+ result = df.TS.min()
+ exp = pd.Timestamp(df.TS.iat[0])
+ assert isinstance(result, pd.Timestamp)
+ assert result == exp
+
+
+class TestCategoricalSeriesReductions(object):
+ # Note: the name TestCategoricalSeriesReductions indicates these tests
+ # were moved from a series-specific test file, _not_ that these tests are
+ # intended long-term to be series-specific
+
+ def test_min_max(self):
+ # unordered cats have no min/max
+ cat = Series(Categorical(["a", "b", "c", "d"], ordered=False))
+ with pytest.raises(TypeError):
+ cat.min()
+ with pytest.raises(TypeError):
+ cat.max()
+
+ cat = Series(Categorical(["a", "b", "c", "d"], ordered=True))
+ _min = cat.min()
+ _max = cat.max()
+ assert _min == "a"
+ assert _max == "d"
+
+ cat = Series(Categorical(["a", "b", "c", "d"], categories=[
+ 'd', 'c', 'b', 'a'], ordered=True))
+ _min = cat.min()
+ _max = cat.max()
+ assert _min == "d"
+ assert _max == "a"
+
+ cat = Series(Categorical(
+ [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a'
+ ], ordered=True))
+ _min = cat.min()
+ _max = cat.max()
+ assert np.isnan(_min)
+ assert _max == "b"
+
+ cat = Series(Categorical(
+ [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True))
+ _min = cat.min()
+ _max = cat.max()
+ assert np.isnan(_min)
+ assert _max == 1
+
+ def test_min_max_numeric_only(self):
+ # TODO deprecate numeric_only argument for Categorical and use
+ # skipna as well, see GH25303
+ cat = Series(Categorical(
+ ["a", "b", np.nan, "a"], categories=['b', 'a'], ordered=True))
+
+ _min = cat.min()
+ _max = cat.max()
+ assert np.isnan(_min)
+ assert _max == "a"
+
+ _min = cat.min(numeric_only=True)
+ _max = cat.max(numeric_only=True)
+ assert _min == "b"
+ assert _max == "a"
+
+ _min = cat.min(numeric_only=False)
+ _max = cat.max(numeric_only=False)
+ assert np.isnan(_min)
+ assert _max == "a"
+
+
+class TestSeriesMode(object):
+ # Note: the name TestSeriesMode indicates these tests
+ # were moved from a series-specific test file, _not_ that these tests are
+ # intended long-term to be series-specific
+
+ @pytest.mark.parametrize('dropna, expected', [
+ (True, Series([], dtype=np.float64)),
+ (False, Series([], dtype=np.float64))
+ ])
+ def test_mode_empty(self, dropna, expected):
+ s = Series([], dtype=np.float64)
+ result = s.mode(dropna)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('dropna, data, expected', [
+ (True, [1, 1, 1, 2], [1]),
+ (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]),
+ (False, [1, 1, 1, 2], [1]),
+ (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]),
+ ])
+ @pytest.mark.parametrize(
+ 'dt',
+ list(np.typecodes['AllInteger'] + np.typecodes['Float'])
+ )
+ def test_mode_numerical(self, dropna, data, expected, dt):
+ s = Series(data, dtype=dt)
+ result = s.mode(dropna)
+ expected = Series(expected, dtype=dt)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('dropna, expected', [
+ (True, [1.0]),
+ (False, [1, np.nan]),
+ ])
+ def test_mode_numerical_nan(self, dropna, expected):
+ s = Series([1, 1, 2, np.nan, np.nan])
+ result = s.mode(dropna)
+ expected = Series(expected)
+ tm.assert_series_equal(result, expected)
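+        # with dropna=False the NaNs themselves count toward the mode: NaN
+        # appears twice above, tying with 1, hence the expected [1, nan]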
+
+ @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [
+ (True, ['b'], ['bar'], ['nan']),
+ (False, ['b'], [np.nan], ['nan'])
+ ])
+ def test_mode_str_obj(self, dropna, expected1, expected2, expected3):
+ # Test string and object types.
+ data = ['a'] * 2 + ['b'] * 3
+
+ s = Series(data, dtype='c')
+ result = s.mode(dropna)
+ expected1 = Series(expected1, dtype='c')
+ tm.assert_series_equal(result, expected1)
+
+ data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]
+
+ s = Series(data, dtype=object)
+ result = s.mode(dropna)
+ expected2 = Series(expected2, dtype=object)
+ tm.assert_series_equal(result, expected2)
+
+ data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]
+
+ s = Series(data, dtype=object).astype(str)
+ result = s.mode(dropna)
+ expected3 = Series(expected3, dtype=str)
+ tm.assert_series_equal(result, expected3)
+
+ @pytest.mark.parametrize('dropna, expected1, expected2', [
+ (True, ['foo'], ['foo']),
+ (False, ['foo'], [np.nan])
+ ])
+ def test_mode_mixeddtype(self, dropna, expected1, expected2):
+ s = Series([1, 'foo', 'foo'])
+ result = s.mode(dropna)
+ expected = Series(expected1)
+ tm.assert_series_equal(result, expected)
+
+ s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan])
+ result = s.mode(dropna)
+ expected = Series(expected2, dtype=object)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('dropna, expected1, expected2', [
+ (True, ['1900-05-03', '2011-01-03', '2013-01-02'],
+ ['2011-01-03', '2013-01-02']),
+ (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']),
+ ])
+ def test_mode_datetime(self, dropna, expected1, expected2):
+ s = Series(['2011-01-03', '2013-01-02',
+ '1900-05-03', 'nan', 'nan'], dtype='M8[ns]')
+ result = s.mode(dropna)
+ expected1 = Series(expected1, dtype='M8[ns]')
+ tm.assert_series_equal(result, expected1)
+
+ s = Series(['2011-01-03', '2013-01-02', '1900-05-03',
+ '2011-01-03', '2013-01-02', 'nan', 'nan'],
+ dtype='M8[ns]')
+ result = s.mode(dropna)
+ expected2 = Series(expected2, dtype='M8[ns]')
+ tm.assert_series_equal(result, expected2)
+
+ @pytest.mark.parametrize('dropna, expected1, expected2', [
+ (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']),
+ (False, [np.nan], [np.nan, '2 min', '1 day']),
+ ])
+ def test_mode_timedelta(self, dropna, expected1, expected2):
+ # gh-5986: Test timedelta types.
+
+ s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'],
+ dtype='timedelta64[ns]')
+ result = s.mode(dropna)
+ expected1 = Series(expected1, dtype='timedelta64[ns]')
+ tm.assert_series_equal(result, expected1)
+
+ s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min',
+ '2 min', '2 min', 'nan', 'nan'],
+ dtype='timedelta64[ns]')
+ result = s.mode(dropna)
+ expected2 = Series(expected2, dtype='timedelta64[ns]')
+ tm.assert_series_equal(result, expected2)
+
+ @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [
+ (True, Categorical([1, 2], categories=[1, 2]),
+ Categorical(['a'], categories=[1, 'a']),
+ Categorical([3, 1], categories=[3, 2, 1], ordered=True)),
+ (False, Categorical([np.nan], categories=[1, 2]),
+ Categorical([np.nan, 'a'], categories=[1, 'a']),
+ Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)),
+ ])
+ def test_mode_category(self, dropna, expected1, expected2, expected3):
+ s = Series(Categorical([1, 2, np.nan, np.nan]))
+ result = s.mode(dropna)
+ expected1 = Series(expected1, dtype='category')
+ tm.assert_series_equal(result, expected1)
+
+ s = Series(Categorical([1, 'a', 'a', np.nan, np.nan]))
+ result = s.mode(dropna)
+ expected2 = Series(expected2, dtype='category')
+ tm.assert_series_equal(result, expected2)
+
+ s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan],
+ categories=[3, 2, 1], ordered=True))
+ result = s.mode(dropna)
+ expected3 = Series(expected3, dtype='category')
+ tm.assert_series_equal(result, expected3)
+
+ @pytest.mark.parametrize('dropna, expected1, expected2', [
+ (True, [2**63], [1, 2**63]),
+ (False, [2**63], [1, 2**63])
+ ])
+ def test_mode_intoverflow(self, dropna, expected1, expected2):
+ # Test for uint64 overflow.
+ s = Series([1, 2**63, 2**63], dtype=np.uint64)
+ result = s.mode(dropna)
+ expected1 = Series(expected1, dtype=np.uint64)
+ tm.assert_series_equal(result, expected1)
+
+ s = Series([1, 2**63], dtype=np.uint64)
+ result = s.mode(dropna)
+ expected2 = Series(expected2, dtype=np.uint64)
+ tm.assert_series_equal(result, expected2)
+
+ @pytest.mark.skipif(not compat.PY3, reason="only PY3")
+ def test_mode_sortwarning(self):
+ # Check for the warning that is raised when the mode
+ # results cannot be sorted
+
+ expected = Series(['foo', np.nan])
+ s = Series([1, 'foo', 'foo', np.nan, np.nan])
+
+ with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+ result = s.mode(dropna=False)
+ result = result.sort_values().reset_index(drop=True)
+
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/reductions/test_stat_reductions.py b/contrib/python/pandas/py2/pandas/tests/reductions/test_stat_reductions.py
new file mode 100644
index 00000000000..11ecd03f6c7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reductions/test_stat_reductions.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ...
+"""
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import DataFrame, Series, compat
+import pandas.util.testing as tm
+
+
+class TestSeriesStatReductions(object):
+ # Note: the name TestSeriesStatReductions indicates these tests
+ # were moved from a series-specific test file, _not_ that these tests are
+ # intended long-term to be series-specific
+
+ def _check_stat_op(self, name, alternate, string_series_,
+ check_objects=False, check_allna=False):
+
+ with pd.option_context('use_bottleneck', False):
+ f = getattr(Series, name)
+
+ # add some NaNs
+ string_series_[5:15] = np.NaN
+
+ # mean, idxmax, idxmin, min, and max are valid for dates
+ if name not in ['max', 'min', 'mean']:
+ ds = Series(pd.date_range('1/1/2001', periods=10))
+ with pytest.raises(TypeError):
+ f(ds)
+
+ # skipna or no
+ assert pd.notna(f(string_series_))
+ assert pd.isna(f(string_series_, skipna=False))
+
+ # check the result is correct
+ nona = string_series_.dropna()
+ tm.assert_almost_equal(f(nona), alternate(nona.values))
+ tm.assert_almost_equal(f(string_series_), alternate(nona.values))
+
+ allna = string_series_ * np.nan
+
+ if check_allna:
+ assert np.isnan(f(allna))
+
+ # dtype=object with None, it works!
+ s = Series([1, 2, 3, None, 5])
+ f(s)
+
+ # GH#2888
+ items = [0]
+ items.extend(lrange(2 ** 40, 2 ** 40 + 1000))
+ s = Series(items, dtype='int64')
+ tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))
+
+ # check date range
+ if check_objects:
+ s = Series(pd.bdate_range('1/1/2000', periods=10))
+ res = f(s)
+ exp = alternate(s)
+ assert res == exp
+
+ # check on string data
+ if name not in ['sum', 'min', 'max']:
+ with pytest.raises(TypeError):
+ f(Series(list('abc')))
+
+ # Invalid axis.
+ with pytest.raises(ValueError):
+ f(string_series_, axis=1)
+
+ # Unimplemented numeric_only parameter.
+ if 'numeric_only' in compat.signature(f).args:
+ with pytest.raises(NotImplementedError, match=name):
+ f(string_series_, numeric_only=True)
+
+ def test_sum(self):
+ string_series = tm.makeStringSeries().rename('series')
+ self._check_stat_op('sum', np.sum, string_series, check_allna=False)
+
+ def test_mean(self):
+ string_series = tm.makeStringSeries().rename('series')
+ self._check_stat_op('mean', np.mean, string_series)
+
+ def test_median(self):
+ string_series = tm.makeStringSeries().rename('series')
+ self._check_stat_op('median', np.median, string_series)
+
+        # test with integers; np.median and Series.median must agree
+ int_ts = Series(np.ones(10, dtype=int), index=lrange(10))
+ tm.assert_almost_equal(np.median(int_ts), int_ts.median())
+
+ def test_prod(self):
+ string_series = tm.makeStringSeries().rename('series')
+ self._check_stat_op('prod', np.prod, string_series)
+
+ def test_min(self):
+ string_series = tm.makeStringSeries().rename('series')
+ self._check_stat_op('min', np.min, string_series, check_objects=True)
+
+ def test_max(self):
+ string_series = tm.makeStringSeries().rename('series')
+ self._check_stat_op('max', np.max, string_series, check_objects=True)
+
+ def test_var_std(self):
+ string_series = tm.makeStringSeries().rename('series')
+ datetime_series = tm.makeTimeSeries().rename('ts')
+
+ alt = lambda x: np.std(x, ddof=1)
+ self._check_stat_op('std', alt, string_series)
+
+ alt = lambda x: np.var(x, ddof=1)
+ self._check_stat_op('var', alt, string_series)
+
+ result = datetime_series.std(ddof=4)
+ expected = np.std(datetime_series.values, ddof=4)
+ tm.assert_almost_equal(result, expected)
+
+ result = datetime_series.var(ddof=4)
+ expected = np.var(datetime_series.values, ddof=4)
+ tm.assert_almost_equal(result, expected)
+
+        # 1-element series with ddof=1
+ s = datetime_series.iloc[[0]]
+ result = s.var(ddof=1)
+ assert pd.isna(result)
+
+ result = s.std(ddof=1)
+ assert pd.isna(result)
+
+ def test_sem(self):
+ string_series = tm.makeStringSeries().rename('series')
+ datetime_series = tm.makeTimeSeries().rename('ts')
+
+ alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
+ self._check_stat_op('sem', alt, string_series)
+
+ result = datetime_series.sem(ddof=4)
+ expected = np.std(datetime_series.values,
+ ddof=4) / np.sqrt(len(datetime_series.values))
+ tm.assert_almost_equal(result, expected)
+
+        # 1-element series with ddof=1
+ s = datetime_series.iloc[[0]]
+ result = s.sem(ddof=1)
+ assert pd.isna(result)
+
+ @td.skip_if_no_scipy
+ def test_skew(self):
+ from scipy.stats import skew
+
+ string_series = tm.makeStringSeries().rename('series')
+
+ alt = lambda x: skew(x, bias=False)
+ self._check_stat_op('skew', alt, string_series)
+
+ # test corner cases, skew() returns NaN unless there's at least 3
+ # values
+ min_N = 3
+ for i in range(1, min_N + 1):
+ s = Series(np.ones(i))
+ df = DataFrame(np.ones((i, i)))
+ if i < min_N:
+ assert np.isnan(s.skew())
+ assert np.isnan(df.skew()).all()
+ else:
+ assert 0 == s.skew()
+ assert (df.skew() == 0).all()
+
+ @td.skip_if_no_scipy
+ def test_kurt(self):
+ from scipy.stats import kurtosis
+
+ string_series = tm.makeStringSeries().rename('series')
+
+ alt = lambda x: kurtosis(x, bias=False)
+ self._check_stat_op('kurt', alt, string_series)
+
+ index = pd.MultiIndex(
+ levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
+ codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]
+ )
+ s = Series(np.random.randn(6), index=index)
+ tm.assert_almost_equal(s.kurt(), s.kurt(level=0)['bar'])
+
+ # test corner cases, kurt() returns NaN unless there's at least 4
+ # values
+ min_N = 4
+ for i in range(1, min_N + 1):
+ s = Series(np.ones(i))
+ df = DataFrame(np.ones((i, i)))
+ if i < min_N:
+ assert np.isnan(s.kurt())
+ assert np.isnan(df.kurt()).all()
+ else:
+ assert 0 == s.kurt()
+ assert (df.kurt() == 0).all()
diff --git a/contrib/python/pandas/py2/pandas/tests/resample/__init__.py b/contrib/python/pandas/py2/pandas/tests/resample/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/resample/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/resample/conftest.py b/contrib/python/pandas/py2/pandas/tests/resample/conftest.py
new file mode 100644
index 00000000000..d0f78f6d5b4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/resample/conftest.py
@@ -0,0 +1,142 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Series
+from pandas.core.indexes.datetimes import date_range
+from pandas.core.indexes.period import period_range
+
+# The various methods we support
+downsample_methods = ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem',
+ 'median', 'prod', 'var', 'std', 'ohlc', 'quantile']
+upsample_methods = ['count', 'size']
+series_methods = ['nunique']
+resample_methods = downsample_methods + upsample_methods + series_methods
+
+
+@pytest.fixture(params=downsample_methods)
+def downsample_method(request):
+ """Fixture for parametrization of Grouper downsample methods."""
+ return request.param
+
+
+@pytest.fixture(params=upsample_methods)
+def upsample_method(request):
+ """Fixture for parametrization of Grouper upsample methods."""
+ return request.param
+
+
+@pytest.fixture(params=resample_methods)
+def resample_method(request):
+ """Fixture for parametrization of Grouper resample methods."""
+ return request.param
+
+
+@pytest.fixture
+def simple_date_range_series():
+ """
+ Series with date range index and random data for test purposes.
+ """
+ def _simple_date_range_series(start, end, freq='D'):
+ rng = date_range(start, end, freq=freq)
+ return Series(np.random.randn(len(rng)), index=rng)
+ return _simple_date_range_series
+
+
+@pytest.fixture
+def simple_period_range_series():
+ """
+ Series with period range index and random data for test purposes.
+ """
+ def _simple_period_range_series(start, end, freq='D'):
+ rng = period_range(start, end, freq=freq)
+ return Series(np.random.randn(len(rng)), index=rng)
+ return _simple_period_range_series
+
+
+@pytest.fixture
+def _index_start():
+ """Fixture for parametrization of index, series and frame."""
+ return datetime(2005, 1, 1)
+
+
+@pytest.fixture
+def _index_end():
+ """Fixture for parametrization of index, series and frame."""
+ return datetime(2005, 1, 10)
+
+
+@pytest.fixture
+def _index_freq():
+ """Fixture for parametrization of index, series and frame."""
+ return 'D'
+
+
+@pytest.fixture
+def _index_name():
+ """Fixture for parametrization of index, series and frame."""
+ return None
+
+
+@pytest.fixture
+def index(_index_factory, _index_start, _index_end, _index_freq, _index_name):
+ """Fixture for parametrization of date_range, period_range and
+ timedelta_range indexes"""
+ return _index_factory(
+ _index_start, _index_end, freq=_index_freq, name=_index_name)
+
+
+@pytest.fixture
+def _static_values(index):
+ """Fixture for parametrization of values used in parametrization of
+ Series and DataFrames with date_range, period_range and
+ timedelta_range indexes"""
+ return np.arange(len(index))
+
+
+@pytest.fixture
+def _series_name():
+ """Fixture for parametrization of Series name for Series used with
+ date_range, period_range and timedelta_range indexes"""
+ return None
+
+
+@pytest.fixture
+def series(index, _series_name, _static_values):
+ """Fixture for parametrization of Series with date_range, period_range and
+ timedelta_range indexes"""
+ return Series(_static_values, index=index, name=_series_name)
+
+
+@pytest.fixture
+def empty_series(series):
+ """Fixture for parametrization of empty Series with date_range,
+ period_range and timedelta_range indexes"""
+ return series[:0]
+
+
+@pytest.fixture
+def frame(index, _series_name, _static_values):
+ """Fixture for parametrization of DataFrame with date_range, period_range
+ and timedelta_range indexes"""
+ # _series_name is intentionally unused
+ return DataFrame({'value': _static_values}, index=index)
+
+
+@pytest.fixture
+def empty_frame(series):
+ """Fixture for parametrization of empty DataFrame with date_range,
+ period_range and timedelta_range indexes"""
+ index = series.index[:0]
+ return DataFrame(index=index)
+
+
+@pytest.fixture(params=[Series, DataFrame])
+def series_and_frame(request, series, frame):
+ """Fixture for parametrization of Series and DataFrame with date_range,
+ period_range and timedelta_range indexes"""
+ if request.param == Series:
+ return series
+ if request.param == DataFrame:
+ return frame
diff --git a/contrib/python/pandas/py2/pandas/tests/resample/test_base.py b/contrib/python/pandas/py2/pandas/tests/resample/test_base.py
new file mode 100644
index 00000000000..911cd990ab8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/resample/test_base.py
@@ -0,0 +1,228 @@
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+
+from pandas.compat import range, zip
+
+import pandas as pd
+from pandas import DataFrame, Series
+from pandas.core.groupby.groupby import DataError
+from pandas.core.indexes.datetimes import date_range
+from pandas.core.indexes.period import PeriodIndex, period_range
+from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range
+from pandas.core.resample import TimeGrouper
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_index_equal,
+ assert_series_equal)
+
+# a fixture value can be overridden by the test parameter value. Note that the
+# value of the fixture can be overridden this way even if the test doesn't use
+# it directly (doesn't mention it in the function prototype).
+# see https://docs.pytest.org/en/latest/fixture.html#override-a-fixture-with-direct-test-parametrization # noqa
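+#
+# a minimal, hypothetical sketch of that mechanism (illustrative only, not
+# part of this suite): parametrizing an argument named like a fixture
+# replaces that fixture's value for the test:
+#
+#     @pytest.mark.parametrize('_index_start', [datetime(2012, 1, 1)])
+#     def test_example(series):
+#         # `series` is now built from an index that starts 2012-01-01
+#         ...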
+# in this module we override the fixture values defined in conftest.py
+# tuples of '_index_factory,_series_name,_index_start,_index_end'
+DATE_RANGE = (date_range, 'dti', datetime(2005, 1, 1), datetime(2005, 1, 10))
+PERIOD_RANGE = (
+ period_range, 'pi', datetime(2005, 1, 1), datetime(2005, 1, 10))
+TIMEDELTA_RANGE = (timedelta_range, 'tdi', '1 day', '10 day')
+
+ALL_TIMESERIES_INDEXES = [DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE]
+
+
+def pytest_generate_tests(metafunc):
+ # called once per each test function
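+    # for tests whose names end in '_all_ts', this overrides the
+    # index-related fixtures with each tuple in ALL_TIMESERIES_INDEXES,
+    # so the test body runs once per index flavor (see the pytest docs
+    # linked above)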
+ if metafunc.function.__name__.endswith('_all_ts'):
+ metafunc.parametrize(
+ '_index_factory,_series_name,_index_start,_index_end',
+ ALL_TIMESERIES_INDEXES)
+
+
+@pytest.fixture
+def create_index(_index_factory):
+ def _create_index(*args, **kwargs):
+ """ return the _index_factory created using the args, kwargs """
+ return _index_factory(*args, **kwargs)
+ return _create_index
+
+
+@pytest.mark.parametrize('freq', ['2D', '1H'])
+@pytest.mark.parametrize(
+ '_index_factory,_series_name,_index_start,_index_end',
+ [DATE_RANGE, TIMEDELTA_RANGE]
+)
+def test_asfreq(series_and_frame, freq, create_index):
+ obj = series_and_frame
+
+ result = obj.resample(freq).asfreq()
+ new_index = create_index(obj.index[0], obj.index[-1], freq=freq)
+ expected = obj.reindex(new_index)
+ assert_almost_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    '_index_factory,_series_name,_index_start,_index_end',
+ [DATE_RANGE, TIMEDELTA_RANGE]
+)
+def test_asfreq_fill_value(series, create_index):
+ # test for fill value during resampling, issue 3715
+
+ s = series
+
+ result = s.resample('1H').asfreq()
+ new_index = create_index(s.index[0], s.index[-1], freq='1H')
+ expected = s.reindex(new_index)
+ assert_series_equal(result, expected)
+
+ frame = s.to_frame('value')
+ frame.iloc[1] = None
+ result = frame.resample('1H').asfreq(fill_value=4.0)
+ new_index = create_index(frame.index[0],
+ frame.index[-1], freq='1H')
+ expected = frame.reindex(new_index, fill_value=4.0)
+ assert_frame_equal(result, expected)
+
+
+def test_resample_interpolate_all_ts(frame):
+    # GH 12925
+ df = frame
+ assert_frame_equal(
+ df.resample('1T').asfreq().interpolate(),
+ df.resample('1T').interpolate())
+
+
+def test_raises_on_non_datetimelike_index():
+ # this is a non datetimelike index
+ xp = DataFrame()
+ pytest.raises(TypeError, lambda: xp.resample('A').mean())
+
+
+@pytest.mark.parametrize('freq', ['M', 'D', 'H'])
+def test_resample_empty_series_all_ts(freq, empty_series, resample_method):
+ # GH12771 & GH12868
+
+ if resample_method == 'ohlc':
+ pytest.skip('need to test for ohlc from GH13083')
+
+ s = empty_series
+ result = getattr(s.resample(freq), resample_method)()
+
+ expected = s.copy()
+ if isinstance(s.index, PeriodIndex):
+ expected.index = s.index.asfreq(freq=freq)
+ else:
+ expected.index = s.index._shallow_copy(freq=freq)
+ assert_index_equal(result.index, expected.index)
+ assert result.index.freq == expected.index.freq
+ assert_series_equal(result, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize('freq', ['M', 'D', 'H'])
+def test_resample_empty_dataframe_all_ts(empty_frame, freq, resample_method):
+ # GH13212
+ df = empty_frame
+ # count retains dimensions too
+ result = getattr(df.resample(freq), resample_method)()
+ if resample_method != 'size':
+ expected = df.copy()
+ else:
+ # GH14962
+ expected = Series([])
+
+ if isinstance(df.index, PeriodIndex):
+ expected.index = df.index.asfreq(freq=freq)
+ else:
+ expected.index = df.index._shallow_copy(freq=freq)
+ assert_index_equal(result.index, expected.index)
+ assert result.index.freq == expected.index.freq
+ assert_almost_equal(result, expected, check_dtype=False)
+
+ # test size for GH13212 (currently stays as df)
+
+
[email protected]("index", tm.all_timeseries_index_generator(0))
+ "dtype",
+ [np.float, np.int, np.object, 'datetime64[ns]'])
+def test_resample_empty_dtypes(index, dtype, resample_method):
+
+ # Empty series were sometimes causing a segfault (for the functions
+ # with Cython bounds-checking disabled) or an IndexError. We just run
+ # them to ensure they no longer do. (GH #10228)
+ empty_series = Series([], index, dtype)
+ try:
+ getattr(empty_series.resample('d'), resample_method)()
+ except DataError:
+ # Ignore these since some combinations are invalid
+ # (ex: doing mean with dtype of np.object)
+ pass
+
+
+def test_resample_loffset_arg_type_all_ts(frame, create_index):
+ # GH 13218, 15002
+ df = frame
+ expected_means = [df.values[i:i + 2].mean()
+ for i in range(0, len(df.values), 2)]
+ expected_index = create_index(df.index[0],
+ periods=len(df.index) / 2,
+ freq='2D')
+
+ # loffset coerces PeriodIndex to DateTimeIndex
+ if isinstance(expected_index, PeriodIndex):
+ expected_index = expected_index.to_timestamp()
+
+ expected_index += timedelta(hours=2)
+ expected = DataFrame({'value': expected_means}, index=expected_index)
+
+ for arg in ['mean', {'value': 'mean'}, ['mean']]:
+
+ result_agg = df.resample('2D', loffset='2H').agg(arg)
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result_how = df.resample('2D', how=arg, loffset='2H')
+
+ if isinstance(arg, list):
+ expected.columns = pd.MultiIndex.from_tuples([('value',
+ 'mean')])
+
+ # GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex
+ if isinstance(expected.index, TimedeltaIndex):
+ with pytest.raises(AssertionError):
+ assert_frame_equal(result_agg, expected)
+ assert_frame_equal(result_how, expected)
+ else:
+ assert_frame_equal(result_agg, expected)
+ assert_frame_equal(result_how, expected)
+
+
+def test_apply_to_empty_series_all_ts(empty_series):
+ # GH 14313
+ s = empty_series
+ for freq in ['M', 'D', 'H']:
+ result = s.resample(freq).apply(lambda x: 1)
+ expected = s.resample(freq).apply(np.sum)
+
+ assert_series_equal(result, expected, check_dtype=False)
+
+
+def test_resampler_is_iterable_all_ts(series):
+ # GH 15314
+ freq = 'H'
+ tg = TimeGrouper(freq, convention='start')
+ grouped = series.groupby(tg)
+ resampled = series.resample(freq)
+ for (rk, rv), (gk, gv) in zip(resampled, grouped):
+ assert rk == gk
+ assert_series_equal(rv, gv)
+
+
+def test_resample_quantile_all_ts(series):
+ # GH 15023
+ s = series
+ q = 0.75
+ freq = 'H'
+ result = s.resample(freq).quantile(q)
+ expected = s.resample(freq).agg(lambda x: x.quantile(q)).rename(s.name)
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/resample/test_datetime_index.py b/contrib/python/pandas/py2/pandas/tests/resample/test_datetime_index.py
new file mode 100644
index 00000000000..5a8a6b465c5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/resample/test_datetime_index.py
@@ -0,0 +1,1530 @@
+from datetime import datetime, timedelta
+from functools import partial
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+import pytest
+import pytz
+
+from pandas.compat import StringIO, range
+from pandas.errors import UnsupportedFunctionCall
+
+import pandas as pd
+from pandas import DataFrame, Panel, Series, Timedelta, Timestamp, isna, notna
+from pandas.core.indexes.datetimes import date_range
+from pandas.core.indexes.period import Period, period_range
+from pandas.core.resample import (
+ DatetimeIndex, TimeGrouper, _get_timestamp_range_edges)
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal)
+
+import pandas.tseries.offsets as offsets
+from pandas.tseries.offsets import BDay, Minute
+
+
+@pytest.fixture()
+def _index_factory():
+ return date_range
+
+
+@pytest.fixture
+def _index_freq():
+ return 'Min'
+
+
+@pytest.fixture
+def _static_values(index):
+ return np.random.rand(len(index))
+
+
+def test_custom_grouper(index):
+
+ dti = index
+ s = Series(np.array([1] * len(dti)), index=dti, dtype='int64')
+
+ b = TimeGrouper(Minute(5))
+ g = s.groupby(b)
+
+ # check all cython functions work
+ funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
+ for f in funcs:
+ g._cython_agg_general(f)
+
+ b = TimeGrouper(Minute(5), closed='right', label='right')
+ g = s.groupby(b)
+ # check all cython functions work
+ funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
+ for f in funcs:
+ g._cython_agg_general(f)
+
+ assert g.ngroups == 2593
+ assert notna(g.mean()).all()
+
+ # construct expected val
+ arr = [1] + [5] * 2592
+ idx = dti[0:-1:5]
+ idx = idx.append(dti[-1:])
+ expect = Series(arr, index=idx)
+
+    # GH2763 - return input dtype if we can
+ result = g.agg(np.sum)
+ assert_series_equal(result, expect)
+
+ df = DataFrame(np.random.rand(len(dti), 10),
+ index=dti, dtype='float64')
+ r = df.groupby(b).agg(np.sum)
+
+ assert len(r.columns) == 10
+ assert len(r.index) == 2593
+
+
+@pytest.mark.parametrize(
+    '_index_start,_index_end,_index_name',
+ [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')])
+@pytest.mark.parametrize('closed, expected', [
+ ('right',
+ lambda s: Series(
+ [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
+ index=date_range(
+ '1/1/2000', periods=4, freq='5min', name='index'))),
+ ('left',
+ lambda s: Series(
+ [s[:5].mean(), s[5:10].mean(), s[10:].mean()],
+ index=date_range(
+ '1/1/2000 00:05', periods=3, freq='5min', name='index'))
+ )
+])
+def test_resample_basic(series, closed, expected):
+ s = series
+ expected = expected(s)
+ result = s.resample('5min', closed=closed, label='right').mean()
+ assert_series_equal(result, expected)
+
+
+def test_resample_integerarray():
+ # GH 25580, resample on IntegerArray
+ ts = pd.Series(range(9),
+ index=pd.date_range('1/1/2000', periods=9, freq='T'),
+ dtype='Int64')
+ result = ts.resample('3T').sum()
+ expected = Series([3, 12, 21],
+ index=pd.date_range('1/1/2000', periods=3, freq='3T'),
+ dtype="Int64")
+ assert_series_equal(result, expected)
+
+
+def test_resample_basic_grouper(series):
+ s = series
+ result = s.resample('5Min').last()
+ grouper = TimeGrouper(Minute(5), closed='left', label='left')
+ expected = s.groupby(grouper).agg(lambda x: x[-1])
+ assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    '_index_start,_index_end,_index_name',
+ [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')])
+@pytest.mark.parametrize('kwargs', [
+ dict(label='righttt'),
+ dict(closed='righttt'),
+ dict(convention='starttt')
+])
+def test_resample_string_kwargs(series, kwargs):
+ # see gh-19303
+ # Check that wrong keyword argument strings raise an error
+ with pytest.raises(ValueError, match='Unsupported value'):
+ series.resample('5min', **kwargs)
+
+
+@pytest.mark.parametrize(
+    '_index_start,_index_end,_index_name',
+ [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')])
+def test_resample_how(series, downsample_method):
+ if downsample_method == 'ohlc':
+ pytest.skip('covered by test_resample_how_ohlc')
+
+ s = series
+ grouplist = np.ones_like(s)
+ grouplist[0] = 0
+ grouplist[1:6] = 1
+ grouplist[6:11] = 2
+ grouplist[11:] = 3
+ expected = s.groupby(grouplist).agg(downsample_method)
+ expected.index = date_range(
+ '1/1/2000', periods=4, freq='5min', name='index')
+
+ result = getattr(s.resample(
+ '5min', closed='right', label='right'), downsample_method)()
+ assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    '_index_start,_index_end,_index_name',
+ [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')])
+def test_resample_how_ohlc(series):
+ s = series
+ grouplist = np.ones_like(s)
+ grouplist[0] = 0
+ grouplist[1:6] = 1
+ grouplist[6:11] = 2
+ grouplist[11:] = 3
+
+ def _ohlc(group):
+ if isna(group).all():
+ return np.repeat(np.nan, 4)
+ return [group[0], group.max(), group.min(), group[-1]]
+
+ expected = DataFrame(
+ s.groupby(grouplist).agg(_ohlc).values.tolist(),
+ index=date_range('1/1/2000', periods=4, freq='5min', name='index'),
+ columns=['open', 'high', 'low', 'close'])
+
+ result = s.resample('5min', closed='right', label='right').ohlc()
+ assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    'func', ['min', 'max', 'sum', 'prod', 'mean', 'var', 'std'])
+def test_numpy_compat(func):
+ # see gh-12811
+ s = Series([1, 2, 3, 4, 5], index=date_range(
+ '20130101', periods=5, freq='s'))
+ r = s.resample('2s')
+
+ msg = "numpy operations are not valid with resample"
+
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(r, func)(func, 1, 2, 3)
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(r, func)(axis=1)
+
+
+def test_resample_how_callables():
+ # GH#7929
+ data = np.arange(5, dtype=np.int64)
+ ind = date_range(start='2014-01-01', periods=len(data), freq='d')
+ df = DataFrame({"A": data, "B": data}, index=ind)
+
+ def fn(x, a=1):
+ return str(type(x))
+
+ class FnClass(object):
+
+ def __call__(self, x):
+ return str(type(x))
+
+ df_standard = df.resample("M").apply(fn)
+ df_lambda = df.resample("M").apply(lambda x: str(type(x)))
+ df_partial = df.resample("M").apply(partial(fn))
+ df_partial2 = df.resample("M").apply(partial(fn, a=2))
+ df_class = df.resample("M").apply(FnClass())
+
+ assert_frame_equal(df_standard, df_lambda)
+ assert_frame_equal(df_standard, df_partial)
+ assert_frame_equal(df_standard, df_partial2)
+ assert_frame_equal(df_standard, df_class)
+
+
+def test_resample_rounding():
+ # GH 8371
+ # odd results when rounding is needed
+
+ data = """date,time,value
+11-08-2014,00:00:01.093,1
+11-08-2014,00:00:02.159,1
+11-08-2014,00:00:02.667,1
+11-08-2014,00:00:03.175,1
+11-08-2014,00:00:07.058,1
+11-08-2014,00:00:07.362,1
+11-08-2014,00:00:08.324,1
+11-08-2014,00:00:08.830,1
+11-08-2014,00:00:08.982,1
+11-08-2014,00:00:09.815,1
+11-08-2014,00:00:10.540,1
+11-08-2014,00:00:11.061,1
+11-08-2014,00:00:11.617,1
+11-08-2014,00:00:13.607,1
+11-08-2014,00:00:14.535,1
+11-08-2014,00:00:15.525,1
+11-08-2014,00:00:17.960,1
+11-08-2014,00:00:20.674,1
+11-08-2014,00:00:21.191,1"""
+
+ df = pd.read_csv(StringIO(data), parse_dates={'timestamp': [
+ 'date', 'time']}, index_col='timestamp')
+ df.index.name = None
+ result = df.resample('6s').sum()
+ expected = DataFrame({'value': [
+ 4, 9, 4, 2
+ ]}, index=date_range('2014-11-08', freq='6s', periods=4))
+ assert_frame_equal(result, expected)
+
+ result = df.resample('7s').sum()
+ expected = DataFrame({'value': [
+ 4, 10, 4, 1
+ ]}, index=date_range('2014-11-08', freq='7s', periods=4))
+ assert_frame_equal(result, expected)
+
+ result = df.resample('11s').sum()
+ expected = DataFrame({'value': [
+ 11, 8
+ ]}, index=date_range('2014-11-08', freq='11s', periods=2))
+ assert_frame_equal(result, expected)
+
+ result = df.resample('13s').sum()
+ expected = DataFrame({'value': [
+ 13, 6
+ ]}, index=date_range('2014-11-08', freq='13s', periods=2))
+ assert_frame_equal(result, expected)
+
+ result = df.resample('17s').sum()
+ expected = DataFrame({'value': [
+ 16, 3
+ ]}, index=date_range('2014-11-08', freq='17s', periods=2))
+ assert_frame_equal(result, expected)
+
+
+def test_resample_basic_from_daily():
+ # from daily
+ dti = date_range(start=datetime(2005, 1, 1),
+ end=datetime(2005, 1, 10), freq='D', name='index')
+
+ s = Series(np.random.rand(len(dti)), dti)
+
+ # to weekly
+ result = s.resample('w-sun').last()
+
+ assert len(result) == 3
+ assert (result.index.dayofweek == [6, 6, 6]).all()
+ assert result.iloc[0] == s['1/2/2005']
+ assert result.iloc[1] == s['1/9/2005']
+ assert result.iloc[2] == s.iloc[-1]
+
+ result = s.resample('W-MON').last()
+ assert len(result) == 2
+ assert (result.index.dayofweek == [0, 0]).all()
+ assert result.iloc[0] == s['1/3/2005']
+ assert result.iloc[1] == s['1/10/2005']
+
+ result = s.resample('W-TUE').last()
+ assert len(result) == 2
+ assert (result.index.dayofweek == [1, 1]).all()
+ assert result.iloc[0] == s['1/4/2005']
+ assert result.iloc[1] == s['1/10/2005']
+
+ result = s.resample('W-WED').last()
+ assert len(result) == 2
+ assert (result.index.dayofweek == [2, 2]).all()
+ assert result.iloc[0] == s['1/5/2005']
+ assert result.iloc[1] == s['1/10/2005']
+
+ result = s.resample('W-THU').last()
+ assert len(result) == 2
+ assert (result.index.dayofweek == [3, 3]).all()
+ assert result.iloc[0] == s['1/6/2005']
+ assert result.iloc[1] == s['1/10/2005']
+
+ result = s.resample('W-FRI').last()
+ assert len(result) == 2
+ assert (result.index.dayofweek == [4, 4]).all()
+ assert result.iloc[0] == s['1/7/2005']
+ assert result.iloc[1] == s['1/10/2005']
+
+ # to biz day
+ result = s.resample('B').last()
+ assert len(result) == 7
+ assert (result.index.dayofweek == [4, 0, 1, 2, 3, 4, 0]).all()
+
+ assert result.iloc[0] == s['1/2/2005']
+ assert result.iloc[1] == s['1/3/2005']
+ assert result.iloc[5] == s['1/9/2005']
+ assert result.index.name == 'index'
+
+
+def test_resample_upsampling_picked_but_not_correct():
+
+ # Test for issue #3020
+ dates = date_range('01-Jan-2014', '05-Jan-2014', freq='D')
+ series = Series(1, index=dates)
+
+ result = series.resample('D').mean()
+ assert result.index[0] == dates[0]
+
+ # GH 5955
+ # incorrect deciding to upsample when the axis frequency matches the
+ # resample frequency
+
+ s = Series(np.arange(1., 6), index=[datetime(
+ 1975, 1, i, 12, 0) for i in range(1, 6)])
+ expected = Series(np.arange(1., 6), index=date_range(
+ '19750101', periods=5, freq='D'))
+
+ result = s.resample('D').count()
+ assert_series_equal(result, Series(1, index=expected.index))
+
+ result1 = s.resample('D').sum()
+ result2 = s.resample('D').mean()
+ assert_series_equal(result1, expected)
+ assert_series_equal(result2, expected)
+
+
+def test_resample_frame_basic():
+ df = tm.makeTimeDataFrame()
+
+ b = TimeGrouper('M')
+ g = df.groupby(b)
+
+ # check all cython functions work
+ funcs = ['add', 'mean', 'prod', 'min', 'max', 'var']
+ for f in funcs:
+ g._cython_agg_general(f)
+
+ result = df.resample('A').mean()
+ assert_series_equal(result['A'], df['A'].resample('A').mean())
+
+ result = df.resample('M').mean()
+ assert_series_equal(result['A'], df['A'].resample('M').mean())
+
+ df.resample('M', kind='period').mean()
+ df.resample('W-WED', kind='period').mean()
+
+
+@pytest.mark.parametrize('loffset', [timedelta(minutes=1),
+ '1min', Minute(1),
+ np.timedelta64(1, 'm')])
+def test_resample_loffset(loffset):
+ # GH 7687
+ rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min')
+ s = Series(np.random.randn(14), index=rng)
+
+ result = s.resample('5min', closed='right', label='right',
+ loffset=loffset).mean()
+ idx = date_range('1/1/2000', periods=4, freq='5min')
+ expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
+ index=idx + timedelta(minutes=1))
+ assert_series_equal(result, expected)
+ assert result.index.freq == Minute(5)
+
+ # from daily
+ dti = date_range(start=datetime(2005, 1, 1),
+ end=datetime(2005, 1, 10), freq='D')
+ ser = Series(np.random.rand(len(dti)), dti)
+
+ # to weekly
+ result = ser.resample('w-sun').last()
+ business_day_offset = BDay()
+ expected = ser.resample('w-sun', loffset=-business_day_offset).last()
+ assert result.index[0] - business_day_offset == expected.index[0]
+
+
+def test_resample_loffset_upsample():
+ # GH 20744
+ rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min')
+ s = Series(np.random.randn(14), index=rng)
+
+ result = s.resample('5min', closed='right', label='right',
+ loffset=timedelta(minutes=1)).ffill()
+ idx = date_range('1/1/2000', periods=4, freq='5min')
+ expected = Series([s[0], s[5], s[10], s[-1]],
+ index=idx + timedelta(minutes=1))
+
+ assert_series_equal(result, expected)
+
+
+def test_resample_loffset_count():
+ # GH 12725
+ start_time = '1/1/2000 00:00:00'
+ rng = date_range(start_time, periods=100, freq='S')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ result = ts.resample('10S', loffset='1s').count()
+
+ expected_index = (
+ date_range(start_time, periods=10, freq='10S') +
+ timedelta(seconds=1)
+ )
+ expected = Series(10, index=expected_index)
+
+ assert_series_equal(result, expected)
+
+ # Same issue should apply to .size() since it goes through
+ # same code path
+ result = ts.resample('10S', loffset='1s').size()
+
+ assert_series_equal(result, expected)
+
+
+def test_resample_upsample():
+ # from daily
+ dti = date_range(start=datetime(2005, 1, 1),
+ end=datetime(2005, 1, 10), freq='D', name='index')
+
+ s = Series(np.random.rand(len(dti)), dti)
+
+ # to minutely, by padding
+ result = s.resample('Min').pad()
+ assert len(result) == 12961
+ assert result[0] == s[0]
+ assert result[-1] == s[-1]
+
+ assert result.index.name == 'index'
+
+
+def test_resample_how_method():
+ # GH9915
+ s = Series([11, 22],
+ index=[Timestamp('2015-03-31 21:48:52.672000'),
+ Timestamp('2015-03-31 21:49:52.739000')])
+ expected = Series([11, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 22],
+ index=[Timestamp('2015-03-31 21:48:50'),
+ Timestamp('2015-03-31 21:49:00'),
+ Timestamp('2015-03-31 21:49:10'),
+ Timestamp('2015-03-31 21:49:20'),
+ Timestamp('2015-03-31 21:49:30'),
+ Timestamp('2015-03-31 21:49:40'),
+ Timestamp('2015-03-31 21:49:50')])
+ assert_series_equal(s.resample("10S").mean(), expected)
+
+
+def test_resample_extra_index_point():
+ # GH#9756
+ index = date_range(start='20150101', end='20150331', freq='BM')
+ expected = DataFrame({'A': Series([21, 41, 63], index=index)})
+
+ index = date_range(start='20150101', end='20150331', freq='B')
+ df = DataFrame(
+ {'A': Series(range(len(index)), index=index)}, dtype='int64')
+ result = df.resample('BM').last()
+ assert_frame_equal(result, expected)
+
+
+def test_upsample_with_limit():
+ rng = date_range('1/1/2000', periods=3, freq='5t')
+ ts = Series(np.random.randn(len(rng)), rng)
+
+ result = ts.resample('t').ffill(limit=2)
+ expected = ts.reindex(result.index, method='ffill', limit=2)
+ assert_series_equal(result, expected)
+
+
+def test_nearest_upsample_with_limit():
+ rng = date_range('1/1/2000', periods=3, freq='5t')
+ ts = Series(np.random.randn(len(rng)), rng)
+
+ result = ts.resample('t').nearest(limit=2)
+ expected = ts.reindex(result.index, method='nearest', limit=2)
+ assert_series_equal(result, expected)
+
+
+def test_resample_ohlc(series):
+ s = series
+
+ grouper = TimeGrouper(Minute(5))
+ expect = s.groupby(grouper).agg(lambda x: x[-1])
+ result = s.resample('5Min').ohlc()
+
+ assert len(result) == len(expect)
+ assert len(result.columns) == 4
+
+ xs = result.iloc[-2]
+ assert xs['open'] == s[-6]
+ assert xs['high'] == s[-6:-1].max()
+ assert xs['low'] == s[-6:-1].min()
+ assert xs['close'] == s[-2]
+
+ xs = result.iloc[0]
+ assert xs['open'] == s[0]
+ assert xs['high'] == s[:5].max()
+ assert xs['low'] == s[:5].min()
+ assert xs['close'] == s[4]
+
+
+def test_resample_ohlc_result():
+
+ # GH 12332
+ index = pd.date_range('1-1-2000', '2-15-2000', freq='h')
+ index = index.union(pd.date_range('4-15-2000', '5-15-2000', freq='h'))
+ s = Series(range(len(index)), index=index)
+
+ a = s.loc[:'4-15-2000'].resample('30T').ohlc()
+ assert isinstance(a, DataFrame)
+
+ b = s.loc[:'4-14-2000'].resample('30T').ohlc()
+ assert isinstance(b, DataFrame)
+
+ # GH12348
+ # raising on odd period
+ rng = date_range('2013-12-30', '2014-01-07')
+ index = rng.drop([Timestamp('2014-01-01'),
+ Timestamp('2013-12-31'),
+ Timestamp('2014-01-04'),
+ Timestamp('2014-01-05')])
+ df = DataFrame(data=np.arange(len(index)), index=index)
+ result = df.resample('B').mean()
+ expected = df.reindex(index=date_range(rng[0], rng[-1], freq='B'))
+ assert_frame_equal(result, expected)
+
+
+def test_resample_ohlc_dataframe():
+ df = (
+ DataFrame({
+ 'PRICE': {
+ Timestamp('2011-01-06 10:59:05', tz=None): 24990,
+ Timestamp('2011-01-06 12:43:33', tz=None): 25499,
+ Timestamp('2011-01-06 12:54:09', tz=None): 25499},
+ 'VOLUME': {
+ Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
+ Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
+ Timestamp('2011-01-06 12:54:09', tz=None): 100000000}})
+ ).reindex(['VOLUME', 'PRICE'], axis=1)
+ res = df.resample('H').ohlc()
+ exp = pd.concat([df['VOLUME'].resample('H').ohlc(),
+ df['PRICE'].resample('H').ohlc()],
+ axis=1,
+ keys=['VOLUME', 'PRICE'])
+ assert_frame_equal(exp, res)
+
+ df.columns = [['a', 'b'], ['c', 'd']]
+ res = df.resample('H').ohlc()
+ exp.columns = pd.MultiIndex.from_tuples([
+ ('a', 'c', 'open'), ('a', 'c', 'high'), ('a', 'c', 'low'),
+ ('a', 'c', 'close'), ('b', 'd', 'open'), ('b', 'd', 'high'),
+ ('b', 'd', 'low'), ('b', 'd', 'close')])
+ assert_frame_equal(exp, res)
+
+ # dupe columns fail atm
+ # df.columns = ['PRICE', 'PRICE']
+
+
+def test_resample_dup_index():
+
+ # GH 4812
+ # dup columns with resample raising
+ df = DataFrame(np.random.randn(4, 12), index=[2000, 2000, 2000, 2000],
+ columns=[Period(year=2000, month=i + 1, freq='M')
+ for i in range(12)])
+ df.iloc[3, :] = np.nan
+ result = df.resample('Q', axis=1).mean()
+ expected = df.groupby(lambda x: int((x.month - 1) / 3), axis=1).mean()
+ expected.columns = [
+ Period(year=2000, quarter=i + 1, freq='Q') for i in range(4)]
+ assert_frame_equal(result, expected)
+
+
+def test_resample_reresample():
+ dti = date_range(start=datetime(2005, 1, 1),
+ end=datetime(2005, 1, 10), freq='D')
+ s = Series(np.random.rand(len(dti)), dti)
+ bs = s.resample('B', closed='right', label='right').mean()
+ result = bs.resample('8H').mean()
+ assert len(result) == 22
+ assert isinstance(result.index.freq, offsets.DateOffset)
+ assert result.index.freq == offsets.Hour(8)
+
+
+def test_resample_timestamp_to_period(simple_date_range_series):
+ ts = simple_date_range_series('1/1/1990', '1/1/2000')
+
+ result = ts.resample('A-DEC', kind='period').mean()
+ expected = ts.resample('A-DEC').mean()
+ expected.index = period_range('1990', '2000', freq='a-dec')
+ assert_series_equal(result, expected)
+
+ result = ts.resample('A-JUN', kind='period').mean()
+ expected = ts.resample('A-JUN').mean()
+ expected.index = period_range('1990', '2000', freq='a-jun')
+ assert_series_equal(result, expected)
+
+ result = ts.resample('M', kind='period').mean()
+ expected = ts.resample('M').mean()
+ expected.index = period_range('1990-01', '2000-01', freq='M')
+ assert_series_equal(result, expected)
+
+ result = ts.resample('M', kind='period').mean()
+ expected = ts.resample('M').mean()
+ expected.index = period_range('1990-01', '2000-01', freq='M')
+ assert_series_equal(result, expected)
+
+
+def test_ohlc_5min():
+ def _ohlc(group):
+ if isna(group).all():
+ return np.repeat(np.nan, 4)
+ return [group[0], group.max(), group.min(), group[-1]]
+
+ rng = date_range('1/1/2000 00:00:00', '1/1/2000 5:59:50', freq='10s')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ resampled = ts.resample('5min', closed='right',
+ label='right').ohlc()
+
+ assert (resampled.loc['1/1/2000 00:00'] == ts[0]).all()
+
+ exp = _ohlc(ts[1:31])
+ assert (resampled.loc['1/1/2000 00:05'] == exp).all()
+
+ exp = _ohlc(ts['1/1/2000 5:55:01':])
+ assert (resampled.loc['1/1/2000 6:00:00'] == exp).all()
+
+
+def test_downsample_non_unique():
+ rng = date_range('1/1/2000', '2/29/2000')
+ rng2 = rng.repeat(5).values
+ ts = Series(np.random.randn(len(rng2)), index=rng2)
+
+ result = ts.resample('M').mean()
+
+ expected = ts.groupby(lambda x: x.month).mean()
+ assert len(result) == 2
+ assert_almost_equal(result[0], expected[1])
+ assert_almost_equal(result[1], expected[2])
+
+
+def test_asfreq_non_unique():
+ # GH #1077
+ rng = date_range('1/1/2000', '2/29/2000')
+ rng2 = rng.repeat(2).values
+ ts = Series(np.random.randn(len(rng2)), index=rng2)
+
+ msg = 'cannot reindex from a duplicate axis'
+ with pytest.raises(Exception, match=msg):
+ ts.asfreq('B')
+
+
+def test_resample_axis1():
+ rng = date_range('1/1/2000', '2/29/2000')
+ df = DataFrame(np.random.randn(3, len(rng)), columns=rng,
+ index=['a', 'b', 'c'])
+
+ result = df.resample('M', axis=1).mean()
+ expected = df.T.resample('M').mean().T
+ tm.assert_frame_equal(result, expected)
+
+
+def test_resample_panel():
+ rng = date_range('1/1/2000', '6/30/2000')
+ n = len(rng)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ panel = Panel(np.random.randn(3, n, 5),
+ items=['one', 'two', 'three'],
+ major_axis=rng,
+ minor_axis=['a', 'b', 'c', 'd', 'e'])
+
+ result = panel.resample('M', axis=1).mean()
+
+ def p_apply(panel, f):
+ result = {}
+ for item in panel.items:
+ result[item] = f(panel[item])
+ return Panel(result, items=panel.items)
+
+ expected = p_apply(panel, lambda x: x.resample('M').mean())
+ tm.assert_panel_equal(result, expected)
+
+ panel2 = panel.swapaxes(1, 2)
+ result = panel2.resample('M', axis=2).mean()
+ expected = p_apply(panel2,
+ lambda x: x.resample('M', axis=1).mean())
+ tm.assert_panel_equal(result, expected)
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+def test_resample_panel_numpy():
+ rng = date_range('1/1/2000', '6/30/2000')
+ n = len(rng)
+
+ with catch_warnings(record=True):
+ panel = Panel(np.random.randn(3, n, 5),
+ items=['one', 'two', 'three'],
+ major_axis=rng,
+ minor_axis=['a', 'b', 'c', 'd', 'e'])
+
+ result = panel.resample('M', axis=1).apply(lambda x: x.mean(1))
+ expected = panel.resample('M', axis=1).mean()
+ tm.assert_panel_equal(result, expected)
+
+ panel = panel.swapaxes(1, 2)
+ result = panel.resample('M', axis=2).apply(lambda x: x.mean(2))
+ expected = panel.resample('M', axis=2).mean()
+ tm.assert_panel_equal(result, expected)
+
+
+def test_resample_anchored_ticks():
+ # If a fixed delta (5 minute, 4 hour) evenly divides a day, we should
+ # "anchor" the origin at midnight so we get regular intervals rather
+ # than starting from the first timestamp which might start in the
+ # middle of a desired interval
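+    #
+    # e.g. resampling to '5t' should yield bin edges at 04:00, 04:05, ...
+    # (anchored to midnight), not edges anchored at 04:00:02 where the
+    # trimmed series ts[2:] below begins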
+
+ rng = date_range('1/1/2000 04:00:00', periods=86400, freq='s')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+ ts[:2] = np.nan # so results are the same
+
+ freqs = ['t', '5t', '15t', '30t', '4h', '12h']
+ for freq in freqs:
+ result = ts[2:].resample(freq, closed='left', label='left').mean()
+ expected = ts.resample(freq, closed='left', label='left').mean()
+ assert_series_equal(result, expected)
+
+
+def test_resample_single_group():
+ mysum = lambda x: x.sum()
+
+ rng = date_range('2000-1-1', '2000-2-10', freq='D')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+ assert_series_equal(ts.resample('M').sum(),
+ ts.resample('M').apply(mysum))
+
+ rng = date_range('2000-1-1', '2000-1-10', freq='D')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+ assert_series_equal(ts.resample('M').sum(),
+ ts.resample('M').apply(mysum))
+
+ # GH 3849
+ s = Series([30.1, 31.6], index=[Timestamp('20070915 15:30:00'),
+ Timestamp('20070915 15:40:00')])
+ expected = Series([0.75], index=[Timestamp('20070915')])
+ result = s.resample('D').apply(lambda x: np.std(x))
+ assert_series_equal(result, expected)
+
+
+def test_resample_base():
+ rng = date_range('1/1/2000 00:00:00', '1/1/2000 02:00', freq='s')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ resampled = ts.resample('5min', base=2).mean()
+ exp_rng = date_range('12/31/1999 23:57:00', '1/1/2000 01:57',
+ freq='5min')
+ tm.assert_index_equal(resampled.index, exp_rng)
+
+
+def test_resample_daily_anchored():
+ rng = date_range('1/1/2000 0:00:00', periods=10000, freq='T')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+ ts[:2] = np.nan # so results are the same
+
+ result = ts[2:].resample('D', closed='left', label='left').mean()
+ expected = ts.resample('D', closed='left', label='left').mean()
+ assert_series_equal(result, expected)
+
+
+def test_resample_to_period_monthly_buglet():
+ # GH #1259
+
+ rng = date_range('1/1/2000', '12/31/2000')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ result = ts.resample('M', kind='period').mean()
+ exp_index = period_range('Jan-2000', 'Dec-2000', freq='M')
+ tm.assert_index_equal(result.index, exp_index)
+
+
+def test_period_with_agg():
+
+ # aggregate a period resampler with a lambda
+ s2 = Series(np.random.randint(0, 5, 50),
+ index=pd.period_range('2012-01-01', freq='H', periods=50),
+ dtype='float64')
+
+ expected = s2.to_timestamp().resample('D').mean().to_period()
+ result = s2.resample('D').agg(lambda x: x.mean())
+ assert_series_equal(result, expected)
+
+
+def test_resample_segfault():
+ # GH 8573
+ # segfaulting in older versions
+ all_wins_and_wagers = [
+ (1, datetime(2013, 10, 1, 16, 20), 1, 0),
+ (2, datetime(2013, 10, 1, 16, 10), 1, 0),
+ (2, datetime(2013, 10, 1, 18, 15), 1, 0),
+ (2, datetime(2013, 10, 1, 16, 10, 31), 1, 0)]
+
+ df = DataFrame.from_records(all_wins_and_wagers,
+ columns=("ID", "timestamp", "A", "B")
+ ).set_index("timestamp")
+ result = df.groupby("ID").resample("5min").sum()
+ expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum())
+ assert_frame_equal(result, expected)
+
+
+def test_resample_dtype_preservation():
+
+ # GH 12202
+ # validation tests for dtype preservation
+
+ df = DataFrame({'date': pd.date_range(start='2016-01-01',
+ periods=4, freq='W'),
+ 'group': [1, 1, 2, 2],
+ 'val': Series([5, 6, 7, 8],
+ dtype='int32')}
+ ).set_index('date')
+
+ result = df.resample('1D').ffill()
+ assert result.val.dtype == np.int32
+
+ result = df.groupby('group').resample('1D').ffill()
+ assert result.val.dtype == np.int32
+
+
+def test_resample_dtype_coerceion():
+
+ pytest.importorskip('scipy.interpolate')
+
+ # GH 16361
+ df = {"a": [1, 3, 1, 4]}
+ df = DataFrame(df, index=pd.date_range("2017-01-01", "2017-01-04"))
+
+ expected = (df.astype("float64")
+ .resample("H")
+ .mean()
+ ["a"]
+ .interpolate("cubic")
+ )
+
+ result = df.resample("H")["a"].mean().interpolate("cubic")
+ tm.assert_series_equal(result, expected)
+
+ result = df.resample("H").mean()["a"].interpolate("cubic")
+ tm.assert_series_equal(result, expected)
+
+
+def test_weekly_resample_buglet():
+ # #1327
+ rng = date_range('1/1/2000', freq='B', periods=20)
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ resampled = ts.resample('W').mean()
+ expected = ts.resample('W-SUN').mean()
+ assert_series_equal(resampled, expected)
+
+
+def test_monthly_resample_error():
+ # #1451
+ dates = date_range('4/16/2012 20:00', periods=5000, freq='h')
+ ts = Series(np.random.randn(len(dates)), index=dates)
+ # it works!
+ ts.resample('M')
+
+
+def test_nanosecond_resample_error():
+    # GH 12307 - values fall after the last bin when
+    # resampling using pd.tseries.offsets.Nano as the period
+ start = 1443707890427
+ exp_start = 1443707890400
+ indx = pd.date_range(
+ start=pd.to_datetime(start),
+ periods=10,
+ freq='100n'
+ )
+ ts = Series(range(len(indx)), index=indx)
+ r = ts.resample(pd.tseries.offsets.Nano(100))
+ result = r.agg('mean')
+
+ exp_indx = pd.date_range(
+ start=pd.to_datetime(exp_start),
+ periods=10,
+ freq='100n'
+ )
+ exp = Series(range(len(exp_indx)), index=exp_indx)
+
+ assert_series_equal(result, exp)
+
+
+def test_resample_anchored_intraday(simple_date_range_series):
+ # #1471, #1458
+
+ rng = date_range('1/1/2012', '4/1/2012', freq='100min')
+ df = DataFrame(rng.month, index=rng)
+
+ result = df.resample('M').mean()
+ expected = df.resample(
+ 'M', kind='period').mean().to_timestamp(how='end')
+ expected.index += Timedelta(1, 'ns') - Timedelta(1, 'D')
+ tm.assert_frame_equal(result, expected)
+
+ result = df.resample('M', closed='left').mean()
+ exp = df.tshift(1, freq='D').resample('M', kind='period').mean()
+ exp = exp.to_timestamp(how='end')
+
+ exp.index = exp.index + Timedelta(1, 'ns') - Timedelta(1, 'D')
+ tm.assert_frame_equal(result, exp)
+
+ rng = date_range('1/1/2012', '4/1/2012', freq='100min')
+ df = DataFrame(rng.month, index=rng)
+
+ result = df.resample('Q').mean()
+ expected = df.resample(
+ 'Q', kind='period').mean().to_timestamp(how='end')
+ expected.index += Timedelta(1, 'ns') - Timedelta(1, 'D')
+ tm.assert_frame_equal(result, expected)
+
+ result = df.resample('Q', closed='left').mean()
+ expected = df.tshift(1, freq='D').resample('Q', kind='period',
+ closed='left').mean()
+ expected = expected.to_timestamp(how='end')
+ expected.index += Timedelta(1, 'ns') - Timedelta(1, 'D')
+ tm.assert_frame_equal(result, expected)
+
+ ts = simple_date_range_series('2012-04-29 23:00', '2012-04-30 5:00',
+ freq='h')
+ resampled = ts.resample('M').mean()
+ assert len(resampled) == 1
+
+
+def test_resample_anchored_monthstart(simple_date_range_series):
+ ts = simple_date_range_series('1/1/2000', '12/31/2002')
+
+ freqs = ['MS', 'BMS', 'QS-MAR', 'AS-DEC', 'AS-JUN']
+
+ for freq in freqs:
+ ts.resample(freq).mean()
+
+
+def test_resample_anchored_multiday():
+ # When resampling a range spanning multiple days, ensure that the
+ # start date gets used to determine the offset. Fixes issue where
+ # a one day period is not a multiple of the frequency.
+ #
+ # See: https://github.com/pandas-dev/pandas/issues/8683
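+    #
+    # concretely: 2.2 seconds does not evenly divide one day, so the bins
+    # should anchor on the first timestamp of the data rather than on
+    # midnight; the expected end labels asserted below follow from that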
+
+ index = pd.date_range(
+ '2014-10-14 23:06:23.206', periods=3, freq='400L'
+ ) | pd.date_range(
+ '2014-10-15 23:00:00', periods=2, freq='2200L')
+
+ s = Series(np.random.randn(5), index=index)
+
+ # Ensure left closing works
+ result = s.resample('2200L').mean()
+ assert result.index[-1] == Timestamp('2014-10-15 23:00:02.000')
+
+ # Ensure right closing works
+ result = s.resample('2200L', label='right').mean()
+ assert result.index[-1] == Timestamp('2014-10-15 23:00:04.200')
+
+
+def test_corner_cases(simple_period_range_series,
+ simple_date_range_series):
+ # miscellaneous test coverage
+
+ rng = date_range('1/1/2000', periods=12, freq='t')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ result = ts.resample('5t', closed='right', label='left').mean()
+ ex_index = date_range('1999-12-31 23:55', periods=4, freq='5t')
+ tm.assert_index_equal(result.index, ex_index)
+
+ len0pts = simple_period_range_series(
+ '2007-01', '2010-05', freq='M')[:0]
+ # it works
+ result = len0pts.resample('A-DEC').mean()
+ assert len(result) == 0
+
+ # resample to periods
+ ts = simple_date_range_series(
+ '2000-04-28', '2000-04-30 11:00', freq='h')
+ result = ts.resample('M', kind='period').mean()
+ assert len(result) == 1
+ assert result.index[0] == Period('2000-04', freq='M')
+
+
+def test_anchored_lowercase_buglet():
+ dates = date_range('4/16/2012 20:00', periods=50000, freq='s')
+ ts = Series(np.random.randn(len(dates)), index=dates)
+ # it works!
+ ts.resample('d').mean()
+
+
+def test_upsample_apply_functions():
+ # #1596
+ rng = pd.date_range('2012-06-12', periods=4, freq='h')
+
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ result = ts.resample('20min').aggregate(['mean', 'sum'])
+ assert isinstance(result, DataFrame)
+
+
+def test_resample_not_monotonic():
+ rng = pd.date_range('2012-06-12', periods=200, freq='h')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ ts = ts.take(np.random.permutation(len(ts)))
+
+ result = ts.resample('D').sum()
+ exp = ts.sort_index().resample('D').sum()
+ assert_series_equal(result, exp)
+
+
+def test_resample_median_bug_1688():
+
+ for dtype in ['int64', 'int32', 'float64', 'float32']:
+ df = DataFrame([1, 2], index=[datetime(2012, 1, 1, 0, 0, 0),
+ datetime(2012, 1, 1, 0, 5, 0)],
+ dtype=dtype)
+
+ result = df.resample("T").apply(lambda x: x.mean())
+ exp = df.asfreq('T')
+ tm.assert_frame_equal(result, exp)
+
+ result = df.resample("T").median()
+ exp = df.asfreq('T')
+ tm.assert_frame_equal(result, exp)
+
+
+def test_how_lambda_functions(simple_date_range_series):
+
+ ts = simple_date_range_series('1/1/2000', '4/1/2000')
+
+ result = ts.resample('M').apply(lambda x: x.mean())
+ exp = ts.resample('M').mean()
+ tm.assert_series_equal(result, exp)
+
+ foo_exp = ts.resample('M').mean()
+ foo_exp.name = 'foo'
+ bar_exp = ts.resample('M').std()
+ bar_exp.name = 'bar'
+
+ result = ts.resample('M').apply(
+ [lambda x: x.mean(), lambda x: x.std(ddof=1)])
+ result.columns = ['foo', 'bar']
+ tm.assert_series_equal(result['foo'], foo_exp)
+ tm.assert_series_equal(result['bar'], bar_exp)
+
+ # this is a MI Series, so comparing the names of the results
+ # doesn't make sense
+ result = ts.resample('M').aggregate({'foo': lambda x: x.mean(),
+ 'bar': lambda x: x.std(ddof=1)})
+ tm.assert_series_equal(result['foo'], foo_exp, check_names=False)
+ tm.assert_series_equal(result['bar'], bar_exp, check_names=False)
+
+
+def test_resample_unequal_times():
+ # #1772
+ start = datetime(1999, 3, 1, 5)
+ # end hour is less than start
+ end = datetime(2012, 7, 31, 4)
+ bad_ind = date_range(start, end, freq="30min")
+ df = DataFrame({'close': 1}, index=bad_ind)
+
+ # it works!
+ df.resample('AS').sum()
+
+
+def test_resample_consistency():
+
+ # GH 6418
+ # resample with bfill / limit / reindex consistency
+
+ i30 = pd.date_range('2002-02-02', periods=4, freq='30T')
+ s = Series(np.arange(4.), index=i30)
+ s[2] = np.NaN
+
+ # Upsample by factor 3 with reindex() and resample() methods:
+ i10 = pd.date_range(i30[0], i30[-1], freq='10T')
+
+ s10 = s.reindex(index=i10, method='bfill')
+ s10_2 = s.reindex(index=i10, method='bfill', limit=2)
+ rl = s.reindex_like(s10, method='bfill', limit=2)
+ r10_2 = s.resample('10Min').bfill(limit=2)
+ r10 = s.resample('10Min').bfill()
+
+ # s10_2, r10, r10_2, rl should all be equal
+ assert_series_equal(s10_2, r10)
+ assert_series_equal(s10_2, r10_2)
+ assert_series_equal(s10_2, rl)
+
+
+def test_resample_timegrouper():
+ # GH 7227
+ dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3),
+ datetime(2014, 11, 5), datetime(2014, 9, 5),
+ datetime(2014, 10, 8), datetime(2014, 7, 15)]
+
+ dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:]
+ dates3 = [pd.NaT] + dates1 + [pd.NaT]
+
+ for dates in [dates1, dates2, dates3]:
+ df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
+ result = df.set_index('A').resample('M').count()
+ exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31',
+ '2014-09-30',
+ '2014-10-31', '2014-11-30'],
+ freq='M', name='A')
+ expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
+ assert_frame_equal(result, expected)
+
+ result = df.groupby(pd.Grouper(freq='M', key='A')).count()
+ assert_frame_equal(result, expected)
+
+ df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(
+ len(dates))))
+ result = df.set_index('A').resample('M').count()
+ expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
+ index=exp_idx, columns=['B', 'C'])
+ assert_frame_equal(result, expected)
+
+ result = df.groupby(pd.Grouper(freq='M', key='A')).count()
+ assert_frame_equal(result, expected)
+
+
+def test_resample_nunique():
+
+ # GH 12352
+ df = DataFrame({
+ 'ID': {Timestamp('2015-06-05 00:00:00'): '0010100903',
+ Timestamp('2015-06-08 00:00:00'): '0010150847'},
+ 'DATE': {Timestamp('2015-06-05 00:00:00'): '2015-06-05',
+ Timestamp('2015-06-08 00:00:00'): '2015-06-08'}})
+ r = df.resample('D')
+ g = df.groupby(pd.Grouper(freq='D'))
+ expected = df.groupby(pd.Grouper(freq='D')).ID.apply(lambda x:
+ x.nunique())
+ assert expected.name == 'ID'
+
+ for t in [r, g]:
+        result = t.ID.nunique()
+ assert_series_equal(result, expected)
+
+ result = df.ID.resample('D').nunique()
+ assert_series_equal(result, expected)
+
+ result = df.ID.groupby(pd.Grouper(freq='D')).nunique()
+ assert_series_equal(result, expected)
+
+
+def test_resample_nunique_with_date_gap():
+ # GH 13453
+ index = pd.date_range('1-1-2000', '2-15-2000', freq='h')
+ index2 = pd.date_range('4-15-2000', '5-15-2000', freq='h')
+ index3 = index.append(index2)
+ s = Series(range(len(index3)), index=index3, dtype='int64')
+ r = s.resample('M')
+
+ # Since all elements are unique, these should all be the same
+ results = [
+ r.count(),
+ r.nunique(),
+ r.agg(Series.nunique),
+ r.agg('nunique')
+ ]
+
+ assert_series_equal(results[0], results[1])
+ assert_series_equal(results[0], results[2])
+ assert_series_equal(results[0], results[3])
+
+
+@pytest.mark.parametrize('n', [10000, 100000])
+@pytest.mark.parametrize('k', [10, 100, 1000])
+def test_resample_group_info(n, k):
+ # GH10914
+
+ # use a fixed seed to always have the same uniques
+ prng = np.random.RandomState(1234)
+
+ dr = date_range(start='2015-08-27', periods=n // 10, freq='T')
+ ts = Series(prng.randint(0, n // k, n).astype('int64'),
+ index=prng.choice(dr, n))
+
+ left = ts.resample('30T').nunique()
+ ix = date_range(start=ts.index.min(), end=ts.index.max(),
+ freq='30T')
+
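+    # recompute the per-bin nunique by hand: map each timestamp to its 30T
+    # bin, sort by (bin, value), flag the first occurrence of each distinct
+    # value within a bin, then count the flags per bin with bincount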
+ vals = ts.values
+ bins = np.searchsorted(ix.values, ts.index, side='right')
+
+ sorter = np.lexsort((vals, bins))
+ vals, bins = vals[sorter], bins[sorter]
+
+ mask = np.r_[True, vals[1:] != vals[:-1]]
+ mask |= np.r_[True, bins[1:] != bins[:-1]]
+
+ arr = np.bincount(bins[mask] - 1,
+ minlength=len(ix)).astype('int64', copy=False)
+ right = Series(arr, index=ix)
+
+ assert_series_equal(left, right)
+
+
+def test_resample_size():
+ n = 10000
+ dr = date_range('2015-09-19', periods=n, freq='T')
+ ts = Series(np.random.randn(n), index=np.random.choice(dr, n))
+
+ left = ts.resample('7T').size()
+ ix = date_range(start=left.index.min(), end=ts.index.max(), freq='7T')
+
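+    # recompute the bin sizes by hand: assign each timestamp to a 7T bin
+    # with searchsorted, then count bin membership with bincount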
+ bins = np.searchsorted(ix.values, ts.index.values, side='right')
+ val = np.bincount(bins, minlength=len(ix) + 1)[1:].astype('int64',
+ copy=False)
+
+ right = Series(val, index=ix)
+ assert_series_equal(left, right)
+
+
+def test_resample_across_dst():
+ # The test resamples a DatetimeIndex with values before and after a
+ # DST change
+ # Issue: 14682
+
+ # The DatetimeIndex we will start with
+ # (note that DST happens at 03:00+02:00 -> 02:00+01:00)
+ # 2016-10-30 02:23:00+02:00, 2016-10-30 02:23:00+01:00
+ df1 = DataFrame([1477786980, 1477790580], columns=['ts'])
+ dti1 = DatetimeIndex(pd.to_datetime(df1.ts, unit='s')
+ .dt.tz_localize('UTC')
+ .dt.tz_convert('Europe/Madrid'))
+
+ # The expected DatetimeIndex after resampling.
+ # 2016-10-30 02:00:00+02:00, 2016-10-30 02:00:00+01:00
+ df2 = DataFrame([1477785600, 1477789200], columns=['ts'])
+ dti2 = DatetimeIndex(pd.to_datetime(df2.ts, unit='s')
+ .dt.tz_localize('UTC')
+ .dt.tz_convert('Europe/Madrid'))
+ df = DataFrame([5, 5], index=dti1)
+
+ result = df.resample(rule='H').sum()
+ expected = DataFrame([5, 5], index=dti2)
+
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_with_dst_time_change():
+ # GH 24972
+ index = pd.DatetimeIndex([1478064900001000000, 1480037118776792000],
+ tz='UTC').tz_convert('America/Chicago')
+
+ df = pd.DataFrame([1, 2], index=index)
+ result = df.groupby(pd.Grouper(freq='1d')).last()
+ expected_index_values = pd.date_range('2016-11-02', '2016-11-24',
+ freq='d', tz='America/Chicago')
+
+ index = pd.DatetimeIndex(expected_index_values)
+ expected = pd.DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index)
+ assert_frame_equal(result, expected)
+
+
+def test_resample_dst_anchor():
+ # 5172
+ dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern')
+ df = DataFrame([5], index=dti)
+ assert_frame_equal(df.resample(rule='D').sum(),
+ DataFrame([5], index=df.index.normalize()))
+ df.resample(rule='MS').sum()
+ assert_frame_equal(
+ df.resample(rule='MS').sum(),
+ DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)],
+ tz='US/Eastern')))
+
+ dti = date_range('2013-09-30', '2013-11-02', freq='30Min',
+ tz='Europe/Paris')
+ values = range(dti.size)
+ df = DataFrame({"a": values,
+ "b": values,
+ "c": values}, index=dti, dtype='int64')
+ how = {"a": "min", "b": "max", "c": "count"}
+
+ assert_frame_equal(
+ df.resample("W-MON").agg(how)[["a", "b", "c"]],
+ DataFrame({"a": [0, 48, 384, 720, 1056, 1394],
+ "b": [47, 383, 719, 1055, 1393, 1586],
+ "c": [48, 336, 336, 336, 338, 193]},
+ index=date_range('9/30/2013', '11/4/2013',
+ freq='W-MON', tz='Europe/Paris')),
+ 'W-MON Frequency')
+
+ assert_frame_equal(
+ df.resample("2W-MON").agg(how)[["a", "b", "c"]],
+ DataFrame({"a": [0, 48, 720, 1394],
+ "b": [47, 719, 1393, 1586],
+ "c": [48, 672, 674, 193]},
+ index=date_range('9/30/2013', '11/11/2013',
+ freq='2W-MON', tz='Europe/Paris')),
+ '2W-MON Frequency')
+
+ assert_frame_equal(
+ df.resample("MS").agg(how)[["a", "b", "c"]],
+ DataFrame({"a": [0, 48, 1538],
+ "b": [47, 1537, 1586],
+ "c": [48, 1490, 49]},
+ index=date_range('9/1/2013', '11/1/2013',
+ freq='MS', tz='Europe/Paris')),
+ 'MS Frequency')
+
+ assert_frame_equal(
+ df.resample("2MS").agg(how)[["a", "b", "c"]],
+ DataFrame({"a": [0, 1538],
+ "b": [1537, 1586],
+ "c": [1538, 49]},
+ index=date_range('9/1/2013', '11/1/2013',
+ freq='2MS', tz='Europe/Paris')),
+ '2MS Frequency')
+
+ df_daily = df['10/26/2013':'10/29/2013']
+ assert_frame_equal(
+ df_daily.resample("D").agg({"a": "min", "b": "max", "c": "count"})
+ [["a", "b", "c"]],
+ DataFrame({"a": [1248, 1296, 1346, 1394],
+ "b": [1295, 1345, 1393, 1441],
+ "c": [48, 50, 48, 48]},
+ index=date_range('10/26/2013', '10/29/2013',
+ freq='D', tz='Europe/Paris')),
+ 'D Frequency')
+
+
+def test_downsample_across_dst():
+ # GH 8531
+ tz = pytz.timezone('Europe/Berlin')
+ dt = datetime(2014, 10, 26)
+ dates = date_range(tz.localize(dt), periods=4, freq='2H')
+ result = Series(5, index=dates).resample('H').mean()
+ expected = Series([5., np.nan] * 3 + [5.],
+ index=date_range(tz.localize(dt), periods=7,
+ freq='H'))
+ tm.assert_series_equal(result, expected)
+
+
+def test_downsample_across_dst_weekly():
+ # GH 9119, GH 21459
+ df = DataFrame(index=DatetimeIndex([
+ '2017-03-25', '2017-03-26', '2017-03-27',
+ '2017-03-28', '2017-03-29'
+ ], tz='Europe/Amsterdam'),
+ data=[11, 12, 13, 14, 15])
+ result = df.resample('1W').sum()
+ expected = DataFrame([23, 42], index=pd.DatetimeIndex([
+ '2017-03-26', '2017-04-02'
+ ], tz='Europe/Amsterdam'))
+ tm.assert_frame_equal(result, expected)
+
+ idx = pd.date_range("2013-04-01", "2013-05-01", tz='Europe/London',
+ freq='H')
+ s = Series(index=idx)
+ result = s.resample('W').mean()
+ expected = Series(index=pd.date_range(
+ '2013-04-07', freq='W', periods=5, tz='Europe/London'
+ ))
+ tm.assert_series_equal(result, expected)
+
+
+def test_resample_with_nat():
+ # GH 13020
+ index = DatetimeIndex([pd.NaT,
+ '1970-01-01 00:00:00',
+ pd.NaT,
+ '1970-01-01 00:00:01',
+ '1970-01-01 00:00:02'])
+ frame = DataFrame([2, 3, 5, 7, 11], index=index)
+
+ index_1s = DatetimeIndex(['1970-01-01 00:00:00',
+ '1970-01-01 00:00:01',
+ '1970-01-01 00:00:02'])
+ frame_1s = DataFrame([3, 7, 11], index=index_1s)
+ assert_frame_equal(frame.resample('1s').mean(), frame_1s)
+
+ index_2s = DatetimeIndex(['1970-01-01 00:00:00',
+ '1970-01-01 00:00:02'])
+ frame_2s = DataFrame([5, 11], index=index_2s)
+ assert_frame_equal(frame.resample('2s').mean(), frame_2s)
+
+ index_3s = DatetimeIndex(['1970-01-01 00:00:00'])
+ frame_3s = DataFrame([7], index=index_3s)
+ assert_frame_equal(frame.resample('3s').mean(), frame_3s)
+
+ assert_frame_equal(frame.resample('60s').mean(), frame_3s)
+
+
+def test_resample_datetime_values():
+ # GH 13119
+ # check that datetime dtype is preserved when NaT values are
+ # introduced by the resampling
+
+ dates = [datetime(2016, 1, 15), datetime(2016, 1, 19)]
+ df = DataFrame({'timestamp': dates}, index=dates)
+
+ exp = Series([datetime(2016, 1, 15), pd.NaT, datetime(2016, 1, 19)],
+ index=date_range('2016-01-15', periods=3, freq='2D'),
+ name='timestamp')
+
+ res = df.resample('2D').first()['timestamp']
+ tm.assert_series_equal(res, exp)
+ res = df['timestamp'].resample('2D').first()
+ tm.assert_series_equal(res, exp)
+
+
+def test_resample_apply_with_additional_args(series):
+ # GH 14615
+ def f(data, add_arg):
+ return np.mean(data) * add_arg
+
+ multiplier = 10
+ result = series.resample('D').apply(f, multiplier)
+ expected = series.resample('D').mean().multiply(multiplier)
+ tm.assert_series_equal(result, expected)
+
+ # Testing as kwarg
+ result = series.resample('D').apply(f, add_arg=multiplier)
+ expected = series.resample('D').mean().multiply(multiplier)
+ tm.assert_series_equal(result, expected)
+
+ # Testing dataframe
+ df = pd.DataFrame({"A": 1, "B": 2},
+ index=pd.date_range('2017', periods=10))
+ result = df.groupby("A").resample("D").agg(f, multiplier)
+ expected = df.groupby("A").resample('D').mean().multiply(multiplier)
+ assert_frame_equal(result, expected)
+
+
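+# Hedged sketch (ours): extra positional and keyword arguments are passed
+# straight through to the applied function, so f above behaves like a
+# partial application of np.mean. The helper name is hypothetical.
+def _sketch_apply_forwarding():
+ s = Series(1., index=date_range('2017', periods=3))
+ doubled = s.resample('D').apply(lambda x, k: x.mean() * k, k=2)
+ tm.assert_series_equal(doubled, s.resample('D').mean() * 2)
+
+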
[email protected]('k', [1, 2, 3])
[email protected]('n1, freq1, n2, freq2', [
+ (30, 'S', 0.5, 'Min'),
+ (60, 'S', 1, 'Min'),
+ (3600, 'S', 1, 'H'),
+ (60, 'Min', 1, 'H'),
+ (21600, 'S', 0.25, 'D'),
+ (86400, 'S', 1, 'D'),
+ (43200, 'S', 0.5, 'D'),
+ (1440, 'Min', 1, 'D'),
+ (12, 'H', 0.5, 'D'),
+ (24, 'H', 1, 'D'),
+])
+def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k):
+ # GH 24127
+ n1_ = n1 * k
+ n2_ = n2 * k
+ s = pd.Series(0, index=pd.date_range('19910905 13:00',
+ '19911005 07:00',
+ freq=freq1))
+ s = s + range(len(s))
+
+ result1 = s.resample(str(n1_) + freq1).mean()
+ result2 = s.resample(str(n2_) + freq2).mean()
+ assert_series_equal(result1, result2)
+
+
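+# Hedged sketch (ours) of the invariant parametrized below: two spellings
+# of the same span, e.g. '120S' and '2Min', produce identical bins.
+def _sketch_equivalent_offsets():
+ s = pd.Series(range(10), index=pd.date_range('2019', periods=10,
+ freq='30S'))
+ assert_series_equal(s.resample('120S').mean(), s.resample('2Min').mean())
+
+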
[email protected]('first,last,offset,exp_first,exp_last', [
+ ('19910905', '19920406', 'D', '19910905', '19920407'),
+ ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920407'),
+ ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00',
+ '19920406 07:00'),
+ ('19910906', '19920406', 'M', '19910831', '19920430'),
+ ('19910831', '19920430', 'M', '19910831', '19920531'),
+ ('1991-08', '1992-04', 'M', '19910831', '19920531'),
+])
+def test_get_timestamp_range_edges(first, last, offset,
+ exp_first, exp_last):
+ first = pd.Period(first)
+ first = first.to_timestamp(first.freq)
+ last = pd.Period(last)
+ last = last.to_timestamp(last.freq)
+
+ exp_first = pd.Timestamp(exp_first, freq=offset)
+ exp_last = pd.Timestamp(exp_last, freq=offset)
+
+ offset = pd.tseries.frequencies.to_offset(offset)
+ result = _get_timestamp_range_edges(first, last, offset)
+ expected = (exp_first, exp_last)
+ assert result == expected
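+
+# Hedged reading of the cases above (ours): for a Tick offset such as 'D'
+# or 'H' the left edge keeps the first timestamp and the right edge is
+# pushed one full period past the last one, while an anchored offset such
+# as 'M' snaps the edges outward to the surrounding month ends (a right
+# edge already on the anchor still moves one month further).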
diff --git a/contrib/python/pandas/py2/pandas/tests/resample/test_period_index.py b/contrib/python/pandas/py2/pandas/tests/resample/test_period_index.py
new file mode 100644
index 00000000000..c2fbb5bbb08
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/resample/test_period_index.py
@@ -0,0 +1,759 @@
+from datetime import datetime, timedelta
+
+import dateutil
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs.tslibs.ccalendar import DAYS, MONTHS
+from pandas._libs.tslibs.period import IncompatibleFrequency
+from pandas.compat import lrange, range, zip
+
+import pandas as pd
+from pandas import DataFrame, Series, Timestamp
+from pandas.core.indexes.datetimes import date_range
+from pandas.core.indexes.period import Period, PeriodIndex, period_range
+from pandas.core.resample import _get_period_range_edges
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal)
+
+import pandas.tseries.offsets as offsets
+
+
+def _index_factory():
+ return period_range
+
+
+def _series_name():
+ return 'pi'
+
+
+class TestPeriodIndex(object):
+
+ @pytest.mark.parametrize('freq', ['2D', '1H', '2H'])
+ @pytest.mark.parametrize('kind', ['period', None, 'timestamp'])
+ def test_asfreq(self, series_and_frame, freq, kind):
+ # GH 12884, 15944
+ # make sure .asfreq() returns PeriodIndex (except kind='timestamp')
+
+ obj = series_and_frame
+ if kind == 'timestamp':
+ expected = obj.to_timestamp().resample(freq).asfreq()
+ else:
+ start = obj.index[0].to_timestamp(how='start')
+ end = (obj.index[-1] + obj.index.freq).to_timestamp(how='start')
+ new_index = date_range(start=start, end=end, freq=freq,
+ closed='left')
+ expected = obj.to_timestamp().reindex(new_index).to_period(freq)
+ result = obj.resample(freq, kind=kind).asfreq()
+ assert_almost_equal(result, expected)
+
+ def test_asfreq_fill_value(self, series):
+ # test for fill value during resampling, issue 3715
+
+ s = series
+ new_index = date_range(s.index[0].to_timestamp(how='start'),
+ (s.index[-1]).to_timestamp(how='start'),
+ freq='1H')
+ expected = s.to_timestamp().reindex(new_index, fill_value=4.0)
+ result = s.resample('1H', kind='timestamp').asfreq(fill_value=4.0)
+ assert_series_equal(result, expected)
+
+ frame = s.to_frame('value')
+ new_index = date_range(frame.index[0].to_timestamp(how='start'),
+ (frame.index[-1]).to_timestamp(how='start'),
+ freq='1H')
+ expected = frame.to_timestamp().reindex(new_index, fill_value=3.0)
+ result = frame.resample('1H', kind='timestamp').asfreq(fill_value=3.0)
+ assert_frame_equal(result, expected)
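+ # Hedged note (ours): fill_value only fills the positions created by
+ # the upsampling; pre-existing observations pass through unchanged.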
+
+ @pytest.mark.parametrize('freq', ['H', '12H', '2D', 'W'])
+ @pytest.mark.parametrize('kind', [None, 'period', 'timestamp'])
+ def test_selection(self, index, freq, kind):
+ # This is a bug; these should be implemented
+ # GH 14008
+ rng = np.arange(len(index), dtype=np.int64)
+ df = DataFrame({'date': index, 'a': rng},
+ index=pd.MultiIndex.from_arrays([rng, index],
+ names=['v', 'd']))
+ with pytest.raises(NotImplementedError):
+ df.resample(freq, on='date', kind=kind)
+ with pytest.raises(NotImplementedError):
+ df.resample(freq, level='d', kind=kind)
+
+ @pytest.mark.parametrize('month', MONTHS)
+ @pytest.mark.parametrize('meth', ['ffill', 'bfill'])
+ @pytest.mark.parametrize('conv', ['start', 'end'])
+ @pytest.mark.parametrize('targ', ['D', 'B', 'M'])
+ def test_annual_upsample_cases(self, targ, conv, meth, month,
+ simple_period_range_series):
+ ts = simple_period_range_series(
+ '1/1/1990', '12/31/1991', freq='A-%s' % month)
+
+ result = getattr(ts.resample(targ, convention=conv), meth)()
+ expected = result.to_timestamp(targ, how=conv)
+ expected = expected.asfreq(targ, meth).to_period()
+ assert_series_equal(result, expected)
+
+ def test_basic_downsample(self, simple_period_range_series):
+ ts = simple_period_range_series('1/1/1990', '6/30/1995', freq='M')
+ result = ts.resample('a-dec').mean()
+
+ expected = ts.groupby(ts.index.year).mean()
+ expected.index = period_range('1/1/1990', '6/30/1995', freq='a-dec')
+ assert_series_equal(result, expected)
+
+ # this is ok
+ assert_series_equal(ts.resample('a-dec').mean(), result)
+ assert_series_equal(ts.resample('a').mean(), result)
+
+ def test_not_subperiod(self, simple_period_range_series):
+ # These are incompatible period rules for resampling
+ ts = simple_period_range_series('1/1/1990', '6/30/1995', freq='w-wed')
+ pytest.raises(ValueError, lambda: ts.resample('a-dec').mean())
+ pytest.raises(ValueError, lambda: ts.resample('q-mar').mean())
+ pytest.raises(ValueError, lambda: ts.resample('M').mean())
+ pytest.raises(ValueError, lambda: ts.resample('w-thu').mean())
+
+ @pytest.mark.parametrize('freq', ['D', '2D'])
+ def test_basic_upsample(self, freq, simple_period_range_series):
+ ts = simple_period_range_series('1/1/1990', '6/30/1995', freq='M')
+ result = ts.resample('a-dec').mean()
+
+ resampled = result.resample(freq, convention='end').ffill()
+ expected = result.to_timestamp(freq, how='end')
+ expected = expected.asfreq(freq, 'ffill').to_period(freq)
+ assert_series_equal(resampled, expected)
+
+ def test_upsample_with_limit(self):
+ rng = period_range('1/1/2000', periods=5, freq='A')
+ ts = Series(np.random.randn(len(rng)), rng)
+
+ result = ts.resample('M', convention='end').ffill(limit=2)
+ expected = ts.asfreq('M').reindex(result.index, method='ffill',
+ limit=2)
+ assert_series_equal(result, expected)
+
+ def test_annual_upsample(self, simple_period_range_series):
+ ts = simple_period_range_series('1/1/1990', '12/31/1995', freq='A-DEC')
+ df = DataFrame({'a': ts})
+ rdf = df.resample('D').ffill()
+ exp = df['a'].resample('D').ffill()
+ assert_series_equal(rdf['a'], exp)
+
+ rng = period_range('2000', '2003', freq='A-DEC')
+ ts = Series([1, 2, 3, 4], index=rng)
+
+ result = ts.resample('M').ffill()
+ ex_index = period_range('2000-01', '2003-12', freq='M')
+
+ expected = ts.asfreq('M', how='start').reindex(ex_index,
+ method='ffill')
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('month', MONTHS)
+ @pytest.mark.parametrize('target', ['D', 'B', 'M'])
+ @pytest.mark.parametrize('convention', ['start', 'end'])
+ def test_quarterly_upsample(self, month, target, convention,
+ simple_period_range_series):
+ freq = 'Q-{month}'.format(month=month)
+ ts = simple_period_range_series('1/1/1990', '12/31/1995', freq=freq)
+ result = ts.resample(target, convention=convention).ffill()
+ expected = result.to_timestamp(target, how=convention)
+ expected = expected.asfreq(target, 'ffill').to_period()
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('target', ['D', 'B'])
+ @pytest.mark.parametrize('convention', ['start', 'end'])
+ def test_monthly_upsample(self, target, convention,
+ simple_period_range_series):
+ ts = simple_period_range_series('1/1/1990', '12/31/1995', freq='M')
+ result = ts.resample(target, convention=convention).ffill()
+ expected = result.to_timestamp(target, how=convention)
+ expected = expected.asfreq(target, 'ffill').to_period()
+ assert_series_equal(result, expected)
+
+ def test_resample_basic(self):
+ # GH3609
+ s = Series(range(100), index=date_range(
+ '20130101', freq='s', periods=100, name='idx'), dtype='float')
+ s[10:30] = np.nan
+ index = PeriodIndex([
+ Period('2013-01-01 00:00', 'T'),
+ Period('2013-01-01 00:01', 'T')], name='idx')
+ expected = Series([34.5, 79.5], index=index)
+ result = s.to_period().resample('T', kind='period').mean()
+ assert_series_equal(result, expected)
+ result2 = s.resample('T', kind='period').mean()
+ assert_series_equal(result2, expected)
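+ # Hedged arithmetic (ours): with s[10:30] set to NaN, the first minute
+ # averages 0..9 and 30..59 -> 1380 / 40 == 34.5, and the second minute
+ # averages 60..99 -> 79.5.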
+
+ @pytest.mark.parametrize('freq,expected_vals', [('M', [31, 29, 31, 9]),
+ ('2M', [31 + 29, 31 + 9])])
+ def test_resample_count(self, freq, expected_vals):
+ # GH12774
+ series = Series(1, index=pd.period_range(start='2000', periods=100))
+ result = series.resample(freq).count()
+ expected_index = pd.period_range(start='2000', freq=freq,
+ periods=len(expected_vals))
+ expected = Series(expected_vals, index=expected_index)
+ assert_series_equal(result, expected)
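+ # Hedged note (ours): 2000 is a leap year (hence the 29 for February),
+ # and the 100-day range ends 2000-04-09, leaving a partial 9-day bin.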
+
+ def test_resample_same_freq(self, resample_method):
+
+ # GH12770
+ series = Series(range(3), index=pd.period_range(
+ start='2000', periods=3, freq='M'))
+ expected = series
+
+ result = getattr(series.resample('M'), resample_method)()
+ assert_series_equal(result, expected)
+
+ def test_resample_incompat_freq(self):
+
+ with pytest.raises(IncompatibleFrequency):
+ Series(range(3), index=pd.period_range(
+ start='2000', periods=3, freq='M')).resample('W').mean()
+
+ def test_with_local_timezone_pytz(self):
+ # see gh-5430
+ local_timezone = pytz.timezone('America/Los_Angeles')
+
+ start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
+ tzinfo=pytz.utc)
+ # 1 day later
+ end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
+ tzinfo=pytz.utc)
+
+ index = pd.date_range(start, end, freq='H')
+
+ series = Series(1, index=index)
+ series = series.tz_convert(local_timezone)
+ result = series.resample('D', kind='period').mean()
+
+ # Create the expected series
+ # Index is moved back a day with the timezone conversion from UTC to
+ # Pacific
+ expected_index = (pd.period_range(start=start, end=end, freq='D') -
+ offsets.Day())
+ expected = Series(1, index=expected_index)
+ assert_series_equal(result, expected)
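+ # Hedged explanation (ours): midnight UTC falls in the late afternoon
+ # of the previous day in Los Angeles, so every daily period shifts back
+ # one day, which is what the `- offsets.Day()` above encodes.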
+
+ def test_resample_with_pytz(self):
+ # GH 13238
+ s = Series(2, index=pd.date_range('2017-01-01', periods=48, freq="H",
+ tz="US/Eastern"))
+ result = s.resample("D").mean()
+ expected = Series(2, index=pd.DatetimeIndex(['2017-01-01',
+ '2017-01-02'],
+ tz="US/Eastern"))
+ assert_series_equal(result, expected)
+ # Especially assert that the timezone is LMT for pytz
+ assert result.index.tz == pytz.timezone('US/Eastern')
+
+ def test_with_local_timezone_dateutil(self):
+ # see gh-5430
+ local_timezone = 'dateutil/America/Los_Angeles'
+
+ start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
+ tzinfo=dateutil.tz.tzutc())
+ # 1 day later
+ end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
+ tzinfo=dateutil.tz.tzutc())
+
+ index = pd.date_range(start, end, freq='H', name='idx')
+
+ series = Series(1, index=index)
+ series = series.tz_convert(local_timezone)
+ result = series.resample('D', kind='period').mean()
+
+ # Create the expected series
+ # Index is moved back a day with the timezone conversion from UTC to
+ # Pacific
+ expected_index = (pd.period_range(start=start, end=end, freq='D',
+ name='idx') - offsets.Day())
+ expected = Series(1, index=expected_index)
+ assert_series_equal(result, expected)
+
+ def test_resample_nonexistent_time_bin_edge(self):
+ # GH 19375
+ index = date_range('2017-03-12', '2017-03-12 1:45:00', freq='15T')
+ s = Series(np.zeros(len(index)), index=index)
+ expected = s.tz_localize('US/Pacific')
+ result = expected.resample('900S').mean()
+ tm.assert_series_equal(result, expected)
+
+ # GH 23742
+ index = date_range(start='2017-10-10', end='2017-10-20', freq='1H')
+ index = index.tz_localize('UTC').tz_convert('America/Sao_Paulo')
+ df = DataFrame(data=list(range(len(index))), index=index)
+ result = df.groupby(pd.Grouper(freq='1D')).count()
+ expected = date_range(start='2017-10-09', end='2017-10-20', freq='D',
+ tz="America/Sao_Paulo",
+ nonexistent='shift_forward', closed='left')
+ tm.assert_index_equal(result.index, expected)
+
+ def test_resample_ambiguous_time_bin_edge(self):
+ # GH 10117
+ idx = pd.date_range("2014-10-25 22:00:00", "2014-10-26 00:30:00",
+ freq="30T", tz="Europe/London")
+ expected = Series(np.zeros(len(idx)), index=idx)
+ result = expected.resample('30T').mean()
+ tm.assert_series_equal(result, expected)
+
+ def test_fill_method_and_how_upsample(self):
+ # GH2073
+ s = Series(np.arange(9, dtype='int64'),
+ index=date_range('2010-01-01', periods=9, freq='Q'))
+ last = s.resample('M').ffill()
+ both = s.resample('M').ffill().resample('M').last().astype('int64')
+ assert_series_equal(last, both)
+
+ @pytest.mark.parametrize('day', DAYS)
+ @pytest.mark.parametrize('target', ['D', 'B'])
+ @pytest.mark.parametrize('convention', ['start', 'end'])
+ def test_weekly_upsample(self, day, target, convention,
+ simple_period_range_series):
+ freq = 'W-{day}'.format(day=day)
+ ts = simple_period_range_series('1/1/1990', '12/31/1995', freq=freq)
+ result = ts.resample(target, convention=convention).ffill()
+ expected = result.to_timestamp(target, how=convention)
+ expected = expected.asfreq(target, 'ffill').to_period()
+ assert_series_equal(result, expected)
+
+ def test_resample_to_timestamps(self, simple_period_range_series):
+ ts = simple_period_range_series('1/1/1990', '12/31/1995', freq='M')
+
+ result = ts.resample('A-DEC', kind='timestamp').mean()
+ expected = ts.to_timestamp(how='start').resample('A-DEC').mean()
+ assert_series_equal(result, expected)
+
+ def test_resample_to_quarterly(self, simple_period_range_series):
+ for month in MONTHS:
+ ts = simple_period_range_series(
+ '1990', '1992', freq='A-%s' % month)
+ quar_ts = ts.resample('Q-%s' % month).ffill()
+
+ stamps = ts.to_timestamp('D', how='start')
+ qdates = period_range(ts.index[0].asfreq('D', 'start'),
+ ts.index[-1].asfreq('D', 'end'),
+ freq='Q-%s' % month)
+
+ expected = stamps.reindex(qdates.to_timestamp('D', 's'),
+ method='ffill')
+ expected.index = qdates
+
+ assert_series_equal(quar_ts, expected)
+
+ # conforms, but different month
+ ts = simple_period_range_series('1990', '1992', freq='A-JUN')
+
+ for how in ['start', 'end']:
+ result = ts.resample('Q-MAR', convention=how).ffill()
+ expected = ts.asfreq('Q-MAR', how=how)
+ expected = expected.reindex(result.index, method='ffill')
+
+ # .to_timestamp('D')
+ # expected = expected.resample('Q-MAR').ffill()
+
+ assert_series_equal(result, expected)
+
+ def test_resample_fill_missing(self):
+ rng = PeriodIndex([2000, 2005, 2007, 2009], freq='A')
+
+ s = Series(np.random.randn(4), index=rng)
+
+ stamps = s.to_timestamp()
+ filled = s.resample('A').ffill()
+ expected = stamps.resample('A').ffill().to_period('A')
+ assert_series_equal(filled, expected)
+
+ def test_cant_fill_missing_dups(self):
+ rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq='A')
+ s = Series(np.random.randn(5), index=rng)
+ pytest.raises(Exception, lambda: s.resample('A').ffill())
+
+ @pytest.mark.parametrize('freq', ['5min'])
+ @pytest.mark.parametrize('kind', ['period', None, 'timestamp'])
+ def test_resample_5minute(self, freq, kind):
+ rng = period_range('1/1/2000', '1/5/2000', freq='T')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+ expected = ts.to_timestamp().resample(freq).mean()
+ if kind != 'timestamp':
+ expected = expected.to_period(freq)
+ result = ts.resample(freq, kind=kind).mean()
+ assert_series_equal(result, expected)
+
+ def test_upsample_daily_business_daily(self, simple_period_range_series):
+ ts = simple_period_range_series('1/1/2000', '2/1/2000', freq='B')
+
+ result = ts.resample('D').asfreq()
+ expected = ts.asfreq('D').reindex(period_range('1/3/2000', '2/1/2000'))
+ assert_series_equal(result, expected)
+
+ ts = simple_period_range_series('1/1/2000', '2/1/2000')
+ result = ts.resample('H', convention='s').asfreq()
+ exp_rng = period_range('1/1/2000', '2/1/2000 23:00', freq='H')
+ expected = ts.asfreq('H', how='s').reindex(exp_rng)
+ assert_series_equal(result, expected)
+
+ def test_resample_irregular_sparse(self):
+ dr = date_range(start='1/1/2012', freq='5min', periods=1000)
+ s = Series(np.array(100), index=dr)
+ # subset the data.
+ subset = s[:'2012-01-04 06:55']
+
+ result = subset.resample('10min').apply(len)
+ expected = s.resample('10min').apply(len).loc[result.index]
+ assert_series_equal(result, expected)
+
+ def test_resample_weekly_all_na(self):
+ rng = date_range('1/1/2000', periods=10, freq='W-WED')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ result = ts.resample('W-THU').asfreq()
+
+ assert result.isna().all()
+
+ result = ts.resample('W-THU').asfreq().ffill()[:-1]
+ expected = ts.asfreq('W-THU').ffill()
+ assert_series_equal(result, expected)
+
+ def test_resample_tz_localized(self):
+ dr = date_range(start='2012-4-13', end='2012-5-1')
+ ts = Series(lrange(len(dr)), dr)
+
+ ts_utc = ts.tz_localize('UTC')
+ ts_local = ts_utc.tz_convert('America/Los_Angeles')
+
+ result = ts_local.resample('W').mean()
+
+ ts_local_naive = ts_local.copy()
+ ts_local_naive.index = [x.replace(tzinfo=None)
+ for x in ts_local_naive.index.to_pydatetime()]
+
+ exp = ts_local_naive.resample(
+ 'W').mean().tz_localize('America/Los_Angeles')
+
+ assert_series_equal(result, exp)
+
+ # it works
+ result = ts_local.resample('D').mean()
+
+ # #2245
+ idx = date_range('2001-09-20 15:59', '2001-09-20 16:00', freq='T',
+ tz='Australia/Sydney')
+ s = Series([1, 2], index=idx)
+
+ result = s.resample('D', closed='right', label='right').mean()
+ ex_index = date_range('2001-09-21', periods=1, freq='D',
+ tz='Australia/Sydney')
+ expected = Series([1.5], index=ex_index)
+
+ assert_series_equal(result, expected)
+
+ # for good measure
+ result = s.resample('D', kind='period').mean()
+ ex_index = period_range('2001-09-20', periods=1, freq='D')
+ expected = Series([1.5], index=ex_index)
+ assert_series_equal(result, expected)
+
+ # GH 6397
+ # comparing an offset that doesn't propagate tz's
+ rng = date_range('1/1/2011', periods=20000, freq='H')
+ rng = rng.tz_localize('EST')
+ ts = DataFrame(index=rng)
+ ts['first'] = np.random.randn(len(rng))
+ ts['second'] = np.cumsum(np.random.randn(len(rng)))
+ expected = DataFrame(
+ {
+ 'first': ts.resample('A').sum()['first'],
+ 'second': ts.resample('A').mean()['second']},
+ columns=['first', 'second'])
+ result = ts.resample(
+ 'A').agg({'first': np.sum,
+ 'second': np.mean}).reindex(columns=['first', 'second'])
+ assert_frame_equal(result, expected)
+
+ def test_closed_left_corner(self):
+ # #1465
+ s = Series(np.random.randn(21),
+ index=date_range(start='1/1/2012 9:30',
+ freq='1min', periods=21))
+ s[0] = np.nan
+
+ result = s.resample('10min', closed='left', label='right').mean()
+ exp = s[1:].resample('10min', closed='left', label='right').mean()
+ assert_series_equal(result, exp)
+
+ result = s.resample('10min', closed='left', label='left').mean()
+ exp = s[1:].resample('10min', closed='left', label='left').mean()
+
+ ex_index = date_range(start='1/1/2012 9:30', freq='10min', periods=3)
+
+ tm.assert_index_equal(result.index, ex_index)
+ assert_series_equal(result, exp)
+
+ def test_quarterly_resampling(self):
+ rng = period_range('2000Q1', periods=10, freq='Q-DEC')
+ ts = Series(np.arange(10), index=rng)
+
+ result = ts.resample('A').mean()
+ exp = ts.to_timestamp().resample('A').mean().to_period()
+ assert_series_equal(result, exp)
+
+ def test_resample_weekly_bug_1726(self):
+ # 8/6/12 is a Monday
+ ind = date_range(start="8/6/2012", end="8/26/2012", freq="D")
+ n = len(ind)
+ data = [[x] * 5 for x in range(n)]
+ df = DataFrame(data, columns=['open', 'high', 'low', 'close', 'vol'],
+ index=ind)
+
+ # it works!
+ df.resample('W-MON', closed='left', label='left').first()
+
+ def test_resample_with_dst_time_change(self):
+ # GH 15549
+ index = (
+ pd.DatetimeIndex([1457537600000000000, 1458059600000000000])
+ .tz_localize("UTC").tz_convert('America/Chicago')
+ )
+ df = pd.DataFrame([1, 2], index=index)
+ result = df.resample('12h', closed='right',
+ label='right').last().ffill()
+
+ expected_index_values = ['2016-03-09 12:00:00-06:00',
+ '2016-03-10 00:00:00-06:00',
+ '2016-03-10 12:00:00-06:00',
+ '2016-03-11 00:00:00-06:00',
+ '2016-03-11 12:00:00-06:00',
+ '2016-03-12 00:00:00-06:00',
+ '2016-03-12 12:00:00-06:00',
+ '2016-03-13 00:00:00-06:00',
+ '2016-03-13 13:00:00-05:00',
+ '2016-03-14 01:00:00-05:00',
+ '2016-03-14 13:00:00-05:00',
+ '2016-03-15 01:00:00-05:00',
+ '2016-03-15 13:00:00-05:00']
+ index = pd.to_datetime(expected_index_values, utc=True).tz_convert(
+ 'America/Chicago')
+ expected = pd.DataFrame([1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 2.0], index=index)
+ assert_frame_equal(result, expected)
+
+ def test_resample_bms_2752(self):
+ # GH2753
+ foo = Series(index=pd.bdate_range('20000101', '20000201'))
+ res1 = foo.resample("BMS").mean()
+ res2 = foo.resample("BMS").mean().resample("B").mean()
+ assert res1.index[0] == Timestamp('20000103')
+ assert res1.index[0] == res2.index[0]
+
+ # def test_monthly_convention_span(self):
+ # rng = period_range('2000-01', periods=3, freq='M')
+ # ts = Series(np.arange(3), index=rng)
+
+ # # hacky way to get same thing
+ # exp_index = period_range('2000-01-01', '2000-03-31', freq='D')
+ # expected = ts.asfreq('D', how='end').reindex(exp_index)
+ # expected = expected.fillna(method='bfill')
+
+ # result = ts.resample('D', convention='span').mean()
+
+ # assert_series_equal(result, expected)
+
+ def test_default_right_closed_label(self):
+ from_freqs = ['D', 'Q', 'M', 'D']
+ to_freqs = ['M', 'A', 'Q', 'W']
+
+ for from_freq, to_freq in zip(from_freqs, to_freqs):
+ idx = date_range(start='8/15/2012', periods=100, freq=from_freq)
+ df = DataFrame(np.random.randn(len(idx), 2), idx)
+
+ resampled = df.resample(to_freq).mean()
+ assert_frame_equal(resampled, df.resample(to_freq, closed='right',
+ label='right').mean())
+
+ def test_default_left_closed_label(self):
+ from_freqs = ['D', 'Q', 'M', 'H', 'T']
+ to_freqs = ['MS', 'AS', 'QS', 'D', 'H']
+
+ for from_freq, to_freq in zip(from_freqs, to_freqs):
+ idx = date_range(start='8/15/2012', periods=100, freq=from_freq)
+ df = DataFrame(np.random.randn(len(idx), 2), idx)
+
+ resampled = df.resample(to_freq).mean()
+ assert_frame_equal(resampled, df.resample(to_freq, closed='left',
+ label='left').mean())
+
+ def test_all_values_single_bin(self):
+ # 2070
+ index = period_range(start="2012-01-01", end="2012-12-31", freq="M")
+ s = Series(np.random.randn(len(index)), index=index)
+
+ result = s.resample("A").mean()
+ tm.assert_almost_equal(result[0], s.mean())
+
+ def test_evenly_divisible_with_no_extra_bins(self):
+ # 4076
+ # when the frequency is evenly divisible, extra bins should not be created
+
+ df = DataFrame(np.random.randn(9, 3),
+ index=date_range('2000-1-1', periods=9))
+ result = df.resample('5D').mean()
+ expected = pd.concat(
+ [df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T
+ expected.index = [Timestamp('2000-1-1'), Timestamp('2000-1-6')]
+ assert_frame_equal(result, expected)
+
+ index = date_range(start='2001-5-4', periods=28)
+ df = DataFrame(
+ [{'REST_KEY': 1, 'DLY_TRN_QT': 80, 'DLY_SLS_AMT': 90,
+ 'COOP_DLY_TRN_QT': 30, 'COOP_DLY_SLS_AMT': 20}] * 28 +
+ [{'REST_KEY': 2, 'DLY_TRN_QT': 70, 'DLY_SLS_AMT': 10,
+ 'COOP_DLY_TRN_QT': 50, 'COOP_DLY_SLS_AMT': 20}] * 28,
+ index=index.append(index)).sort_index()
+
+ index = date_range('2001-5-4', periods=4, freq='7D')
+ expected = DataFrame(
+ [{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14,
+ 'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4,
+ index=index)
+ result = df.resample('7D').count()
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame(
+ [{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700,
+ 'COOP_DLY_TRN_QT': 560, 'COOP_DLY_SLS_AMT': 280}] * 4,
+ index=index)
+ result = df.resample('7D').sum()
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('kind', ['period', None, 'timestamp'])
+ @pytest.mark.parametrize('agg_arg', ['mean', {'value': 'mean'}, ['mean']])
+ def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg):
+ # make sure passing loffset returns DatetimeIndex in all cases
+ # basic method taken from Base.test_resample_loffset_arg_type()
+ df = frame
+ expected_means = [df.values[i:i + 2].mean()
+ for i in range(0, len(df.values), 2)]
+ expected_index = period_range(
+ df.index[0], periods=len(df.index) // 2, freq='2D')
+
+ # loffset coerces PeriodIndex to DateTimeIndex
+ expected_index = expected_index.to_timestamp()
+ expected_index += timedelta(hours=2)
+ expected = DataFrame({'value': expected_means}, index=expected_index)
+
+ result_agg = df.resample('2D', loffset='2H', kind=kind).agg(agg_arg)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result_how = df.resample('2D', how=agg_arg, loffset='2H',
+ kind=kind)
+ if isinstance(agg_arg, list):
+ expected.columns = pd.MultiIndex.from_tuples([('value', 'mean')])
+ assert_frame_equal(result_agg, expected)
+ assert_frame_equal(result_how, expected)
+
+ @pytest.mark.parametrize('freq, period_mult', [('H', 24), ('12H', 2)])
+ @pytest.mark.parametrize('kind', [None, 'period'])
+ def test_upsampling_ohlc(self, freq, period_mult, kind):
+ # GH 13083
+ pi = period_range(start='2000', freq='D', periods=10)
+ s = Series(range(len(pi)), index=pi)
+ expected = s.to_timestamp().resample(freq).ohlc().to_period(freq)
+
+ # timestamp-based resampling doesn't include all sub-periods
+ # of the last original period, so extend accordingly:
+ new_index = period_range(start='2000', freq=freq,
+ periods=period_mult * len(pi))
+ expected = expected.reindex(new_index)
+ result = s.resample(freq, kind=kind).ohlc()
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('periods, values',
+ [([pd.NaT, '1970-01-01 00:00:00', pd.NaT,
+ '1970-01-01 00:00:02', '1970-01-01 00:00:03'],
+ [2, 3, 5, 7, 11]),
+ ([pd.NaT, pd.NaT, '1970-01-01 00:00:00', pd.NaT,
+ pd.NaT, pd.NaT, '1970-01-01 00:00:02',
+ '1970-01-01 00:00:03', pd.NaT, pd.NaT],
+ [1, 2, 3, 5, 6, 8, 7, 11, 12, 13])])
+ @pytest.mark.parametrize('freq, expected_values',
+ [('1s', [3, np.NaN, 7, 11]),
+ ('2s', [3, int((7 + 11) / 2)]),
+ ('3s', [int((3 + 7) / 2), 11])])
+ def test_resample_with_nat(self, periods, values, freq, expected_values):
+ # GH 13224
+ index = PeriodIndex(periods, freq='S')
+ frame = DataFrame(values, index=index)
+
+ expected_index = period_range('1970-01-01 00:00:00',
+ periods=len(expected_values), freq=freq)
+ expected = DataFrame(expected_values, index=expected_index)
+ result = frame.resample(freq).mean()
+ assert_frame_equal(result, expected)
+
+ def test_resample_with_only_nat(self):
+ # GH 13224
+ pi = PeriodIndex([pd.NaT] * 3, freq='S')
+ frame = DataFrame([2, 3, 5], index=pi)
+ expected_index = PeriodIndex(data=[], freq=pi.freq)
+ expected = DataFrame([], index=expected_index)
+ result = frame.resample('1s').mean()
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('start,end,start_freq,end_freq,base', [
+ ('19910905', '19910909 03:00', 'H', '24H', 10),
+ ('19910905', '19910909 12:00', 'H', '24H', 10),
+ ('19910905', '19910909 23:00', 'H', '24H', 10),
+ ('19910905 10:00', '19910909', 'H', '24H', 10),
+ ('19910905 10:00', '19910909 10:00', 'H', '24H', 10),
+ ('19910905', '19910909 10:00', 'H', '24H', 10),
+ ('19910905 12:00', '19910909', 'H', '24H', 10),
+ ('19910905 12:00', '19910909 03:00', 'H', '24H', 10),
+ ('19910905 12:00', '19910909 12:00', 'H', '24H', 10),
+ ('19910905 12:00', '19910909 12:00', 'H', '24H', 34),
+ ('19910905 12:00', '19910909 12:00', 'H', '17H', 10),
+ ('19910905 12:00', '19910909 12:00', 'H', '17H', 3),
+ ('19910905 12:00', '19910909 1:00', 'H', 'M', 3),
+ ('19910905', '19910913 06:00', '2H', '24H', 10),
+ ('19910905', '19910905 01:39', 'Min', '5Min', 3),
+ ('19910905', '19910905 03:18', '2Min', '5Min', 3),
+ ])
+ def test_resample_with_non_zero_base(self, start, end, start_freq,
+ end_freq, base):
+ # GH 23882
+ s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq))
+ s = s + np.arange(len(s))
+ result = s.resample(end_freq, base=base).mean()
+ result = result.to_timestamp(end_freq)
+ # to_timestamp casts 24H -> D
+ result = result.asfreq(end_freq) if end_freq == '24H' else result
+ expected = s.to_timestamp().resample(end_freq, base=base).mean()
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [
+ ('19910905', '19920406', 'D', '19910905', '19920406'),
+ ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920406'),
+ ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00',
+ '19920406 06:00'),
+ ('19910906', '19920406', 'M', '1991-09', '1992-04'),
+ ('19910831', '19920430', 'M', '1991-08', '1992-04'),
+ ('1991-08', '1992-04', 'M', '1991-08', '1992-04'),
+ ])
+ def test_get_period_range_edges(self, first, last, offset,
+ exp_first, exp_last):
+ first = pd.Period(first)
+ last = pd.Period(last)
+
+ exp_first = pd.Period(exp_first, freq=offset)
+ exp_last = pd.Period(exp_last, freq=offset)
+
+ offset = pd.tseries.frequencies.to_offset(offset)
+ result = _get_period_range_edges(first, last, offset)
+ expected = (exp_first, exp_last)
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/resample/test_resample_api.py b/contrib/python/pandas/py2/pandas/tests/resample/test_resample_api.py
new file mode 100644
index 00000000000..69684daf05f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/resample/test_resample_api.py
@@ -0,0 +1,544 @@
+# pylint: disable=E1101
+
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.compat import OrderedDict, range
+
+import pandas as pd
+from pandas import DataFrame, Series
+from pandas.core.indexes.datetimes import date_range
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+dti = date_range(start=datetime(2005, 1, 1),
+ end=datetime(2005, 1, 10), freq='Min')
+
+test_series = Series(np.random.rand(len(dti)), dti)
+test_frame = DataFrame(
+ {'A': test_series, 'B': test_series, 'C': np.arange(len(dti))})
+
+
+def test_str():
+
+ r = test_series.resample('H')
+ assert ('DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, '
+ 'label=left, convention=start, base=0]' in str(r))
+
+
+def test_api():
+
+ r = test_series.resample('H')
+ result = r.mean()
+ assert isinstance(result, Series)
+ assert len(result) == 217
+
+ r = test_series.to_frame().resample('H')
+ result = r.mean()
+ assert isinstance(result, DataFrame)
+ assert len(result) == 217
+
+
+def test_groupby_resample_api():
+
+ # GH 12448
+ # .groupby(...).resample(...) hitting warnings
+ # when appropriate
+ df = DataFrame({'date': pd.date_range(start='2016-01-01',
+ periods=4,
+ freq='W'),
+ 'group': [1, 1, 2, 2],
+ 'val': [5, 6, 7, 8]}).set_index('date')
+
+ # replication step
+ i = pd.date_range('2016-01-03', periods=8).tolist() + \
+ pd.date_range('2016-01-17', periods=8).tolist()
+ index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i],
+ names=['group', 'date'])
+ expected = DataFrame({'val': [5] * 7 + [6] + [7] * 7 + [8]},
+ index=index)
+ result = df.groupby('group').apply(
+ lambda x: x.resample('1D').ffill())[['val']]
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_resample_on_api():
+
+ # GH 15021
+ # .groupby(...).resample(on=...) results in an unexpected
+ # keyword warning.
+ df = DataFrame({'key': ['A', 'B'] * 5,
+ 'dates': pd.date_range('2016-01-01', periods=10),
+ 'values': np.random.randn(10)})
+
+ expected = df.set_index('dates').groupby('key').resample('D').mean()
+
+ result = df.groupby('key').resample('D', on='dates').mean()
+ assert_frame_equal(result, expected)
+
+
+def test_pipe():
+ # GH17905
+
+ # series
+ r = test_series.resample('H')
+ expected = r.max() - r.mean()
+ result = r.pipe(lambda x: x.max() - x.mean())
+ tm.assert_series_equal(result, expected)
+
+ # dataframe
+ r = test_frame.resample('H')
+ expected = r.max() - r.mean()
+ result = r.pipe(lambda x: x.max() - x.mean())
+ tm.assert_frame_equal(result, expected)
+
+
+def test_getitem():
+
+ r = test_frame.resample('H')
+ tm.assert_index_equal(r._selected_obj.columns, test_frame.columns)
+
+ r = test_frame.resample('H')['B']
+ assert r._selected_obj.name == test_frame.columns[1]
+
+ # technically this is allowed
+ r = test_frame.resample('H')['A', 'B']
+ tm.assert_index_equal(r._selected_obj.columns,
+ test_frame.columns[[0, 1]])
+
+ r = test_frame.resample('H')[['A', 'B']]
+ tm.assert_index_equal(r._selected_obj.columns,
+ test_frame.columns[[0, 1]])
+
+
+def test_select_bad_cols():
+
+ g = test_frame.resample('H')
+ pytest.raises(KeyError, g.__getitem__, ['D'])
+
+ pytest.raises(KeyError, g.__getitem__, ['A', 'D'])
+ with pytest.raises(KeyError, match='^[^A]+$'):
+ # A should not be referenced as a bad column...
+ # will have to rethink regex if you change message!
+ g[['A', 'D']]
+
+
+def test_attribute_access():
+
+ r = test_frame.resample('H')
+ tm.assert_series_equal(r.A.sum(), r['A'].sum())
+
+
+def test_api_compat_before_use():
+
+ # make sure that we are setting the binner
+ # on these attributes
+ for attr in ['groups', 'ngroups', 'indices']:
+ rng = pd.date_range('1/1/2012', periods=100, freq='S')
+ ts = Series(np.arange(len(rng)), index=rng)
+ rs = ts.resample('30s')
+
+ # before use
+ getattr(rs, attr)
+
+ # after grouper is initialized is ok
+ rs.mean()
+ getattr(rs, attr)
+
+
+def test_skip_nuisance():
+
+ df = test_frame
+ df['D'] = 'foo'
+ r = df.resample('H')
+ result = r[['A', 'B']].sum()
+ expected = pd.concat([r.A.sum(), r.B.sum()], axis=1)
+ assert_frame_equal(result, expected)
+
+ expected = r[['A', 'B', 'C']].sum()
+ result = r.sum()
+ assert_frame_equal(result, expected)
+
+
+def test_downsample_but_actually_upsampling():
+
+ # this is reindex / asfreq
+ rng = pd.date_range('1/1/2012', periods=100, freq='S')
+ ts = Series(np.arange(len(rng), dtype='int64'), index=rng)
+ result = ts.resample('20s').asfreq()
+ expected = Series([0, 20, 40, 60, 80],
+ index=pd.date_range('2012-01-01 00:00:00',
+ freq='20s',
+ periods=5))
+ assert_series_equal(result, expected)
+
+
+def test_combined_up_downsampling_of_irregular():
+
+ # since we are really doing an operation like this
+ # ts2.resample('2s').mean().ffill()
+ # preserve these semantics
+
+ rng = pd.date_range('1/1/2012', periods=100, freq='S')
+ ts = Series(np.arange(len(rng)), index=rng)
+ ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]]
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = ts2.resample('2s', how='mean', fill_method='ffill')
+ expected = ts2.resample('2s').mean().ffill()
+ assert_series_equal(result, expected)
+
+
+def test_transform():
+
+ r = test_series.resample('20min')
+ expected = test_series.groupby(
+ pd.Grouper(freq='20min')).transform('mean')
+ result = r.transform('mean')
+ assert_series_equal(result, expected)
+
+
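+# Hedged sketch (ours): unlike mean(), transform('mean') keeps the
+# original index and broadcasts each bin's statistic onto its member
+# rows. The helper name is hypothetical.
+def _sketch_transform_broadcast():
+ s = Series([1., 3., 5., 7.],
+ index=pd.date_range('2000', periods=4, freq='12H'))
+ out = s.resample('D').transform('mean')
+ assert list(out.values) == [2., 2., 6., 6.]
+
+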
+def test_fillna():
+
+ # need to upsample here
+ rng = pd.date_range('1/1/2012', periods=10, freq='2S')
+ ts = Series(np.arange(len(rng), dtype='int64'), index=rng)
+ r = ts.resample('s')
+
+ expected = r.ffill()
+ result = r.fillna(method='ffill')
+ assert_series_equal(result, expected)
+
+ expected = r.bfill()
+ result = r.fillna(method='bfill')
+ assert_series_equal(result, expected)
+
+ with pytest.raises(ValueError):
+ r.fillna(0)
+
+
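+# Hedged sketch (ours): on an upsample, fillna(method=...) is equivalent
+# to calling the named fill method directly, and a limit is forwarded the
+# same way. The helper name is hypothetical.
+def _sketch_fillna_limit():
+ rng = pd.date_range('1/1/2012', periods=3, freq='2S')
+ ts = Series(np.arange(3, dtype='int64'), index=rng)
+ assert_series_equal(ts.resample('s').fillna(method='ffill', limit=1),
+ ts.resample('s').ffill(limit=1))
+
+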
+def test_apply_without_aggregation():
+
+ # both resample and groupby should work w/o aggregation
+ r = test_series.resample('20min')
+ g = test_series.groupby(pd.Grouper(freq='20min'))
+
+ for t in [g, r]:
+ result = t.apply(lambda x: x)
+ assert_series_equal(result, test_series)
+
+
+def test_agg_consistency():
+
+ # make sure that we are consistent across
+ # similar aggregations with and w/o selection list
+ df = DataFrame(np.random.randn(1000, 3),
+ index=pd.date_range('1/1/2012', freq='S', periods=1000),
+ columns=['A', 'B', 'C'])
+
+ r = df.resample('3T')
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'})
+ result = r.agg({'r1': 'mean', 'r2': 'sum'})
+ assert_frame_equal(result, expected)
+
+# TODO: once GH 14008 is fixed, move these tests into
+# `Base` test class
+
+
+def test_agg():
+ # test with all three Resampler apis and TimeGrouper
+
+ np.random.seed(1234)
+ index = date_range(datetime(2005, 1, 1),
+ datetime(2005, 1, 10), freq='D')
+ index.name = 'date'
+ df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
+ df_col = df.reset_index()
+ df_mult = df_col.copy()
+ df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
+ names=['index', 'date'])
+ r = df.resample('2D')
+ cases = [
+ r,
+ df_col.resample('2D', on='date'),
+ df_mult.resample('2D', level='date'),
+ df.groupby(pd.Grouper(freq='2D'))
+ ]
+
+ a_mean = r['A'].mean()
+ a_std = r['A'].std()
+ a_sum = r['A'].sum()
+ b_mean = r['B'].mean()
+ b_std = r['B'].std()
+ b_sum = r['B'].sum()
+
+ expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
+ expected.columns = pd.MultiIndex.from_product([['A', 'B'],
+ ['mean', 'std']])
+ for t in cases:
+ result = t.aggregate([np.mean, np.std])
+ assert_frame_equal(result, expected)
+
+ expected = pd.concat([a_mean, b_std], axis=1)
+ for t in cases:
+ result = t.aggregate({'A': np.mean,
+ 'B': np.std})
+ assert_frame_equal(result, expected, check_like=True)
+
+ expected = pd.concat([a_mean, a_std], axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
+ ('A', 'std')])
+ for t in cases:
+ result = t.aggregate({'A': ['mean', 'std']})
+ assert_frame_equal(result, expected)
+
+ expected = pd.concat([a_mean, a_sum], axis=1)
+ expected.columns = ['mean', 'sum']
+ for t in cases:
+ result = t['A'].aggregate(['mean', 'sum'])
+ assert_frame_equal(result, expected)
+
+ expected = pd.concat([a_mean, a_sum], axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
+ ('A', 'sum')])
+ for t in cases:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}})
+ assert_frame_equal(result, expected, check_like=True)
+
+ expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
+ ('A', 'sum'),
+ ('B', 'mean2'),
+ ('B', 'sum2')])
+ for t in cases:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'},
+ 'B': {'mean2': 'mean', 'sum2': 'sum'}})
+ assert_frame_equal(result, expected, check_like=True)
+
+ expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
+ ('A', 'std'),
+ ('B', 'mean'),
+ ('B', 'std')])
+ for t in cases:
+ result = t.aggregate({'A': ['mean', 'std'],
+ 'B': ['mean', 'std']})
+ assert_frame_equal(result, expected, check_like=True)
+
+ expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('r1', 'A', 'mean'),
+ ('r1', 'A', 'sum'),
+ ('r2', 'B', 'mean'),
+ ('r2', 'B', 'sum')])
+
+
+def test_agg_misc():
+ # test with all three Resampler apis and TimeGrouper
+
+ np.random.seed(1234)
+ index = date_range(datetime(2005, 1, 1),
+ datetime(2005, 1, 10), freq='D')
+ index.name = 'date'
+ df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
+ df_col = df.reset_index()
+ df_mult = df_col.copy()
+ df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
+ names=['index', 'date'])
+
+ r = df.resample('2D')
+ cases = [
+ r,
+ df_col.resample('2D', on='date'),
+ df_mult.resample('2D', level='date'),
+ df.groupby(pd.Grouper(freq='2D'))
+ ]
+
+ # passed lambda
+ for t in cases:
+ result = t.agg({'A': np.sum,
+ 'B': lambda x: np.std(x, ddof=1)})
+ rcustom = t['B'].apply(lambda x: np.std(x, ddof=1))
+ expected = pd.concat([r['A'].sum(), rcustom], axis=1)
+ assert_frame_equal(result, expected, check_like=True)
+
+ # agg with renamers
+ expected = pd.concat([t['A'].sum(),
+ t['B'].sum(),
+ t['A'].mean(),
+ t['B'].mean()],
+ axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('result1', 'A'),
+ ('result1', 'B'),
+ ('result2', 'A'),
+ ('result2', 'B')])
+
+ for t in cases:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum),
+ ('result2', np.mean)]))
+ assert_frame_equal(result, expected, check_like=True)
+
+ # agg with different hows
+ expected = pd.concat([t['A'].sum(),
+ t['A'].std(),
+ t['B'].mean(),
+ t['B'].std()],
+ axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
+ ('A', 'std'),
+ ('B', 'mean'),
+ ('B', 'std')])
+ for t in cases:
+ result = t.agg(OrderedDict([('A', ['sum', 'std']),
+ ('B', ['mean', 'std'])]))
+ assert_frame_equal(result, expected, check_like=True)
+
+ # equivalent of using a selection list / or not
+ for t in cases:
+ result = t[['A', 'B']].agg({'A': ['sum', 'std'],
+ 'B': ['mean', 'std']})
+ assert_frame_equal(result, expected, check_like=True)
+
+ # series like aggs
+ for t in cases:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = t['A'].agg({'A': ['sum', 'std']})
+ expected = pd.concat([t['A'].sum(),
+ t['A'].std()],
+ axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
+ ('A', 'std')])
+ assert_frame_equal(result, expected, check_like=True)
+
+ expected = pd.concat([t['A'].agg(['sum', 'std']),
+ t['A'].agg(['mean', 'std'])],
+ axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
+ ('A', 'std'),
+ ('B', 'mean'),
+ ('B', 'std')])
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = t['A'].agg({'A': ['sum', 'std'],
+ 'B': ['mean', 'std']})
+ assert_frame_equal(result, expected, check_like=True)
+
+ # errors
+ # invalid names in the agg specification
+ for t in cases:
+ with pytest.raises(KeyError):
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ t[['A']].agg({'A': ['sum', 'std'],
+ 'B': ['mean', 'std']})
+
+
+def test_agg_nested_dicts():
+
+ np.random.seed(1234)
+ index = date_range(datetime(2005, 1, 1),
+ datetime(2005, 1, 10), freq='D')
+ index.name = 'date'
+ df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
+ df_col = df.reset_index()
+ df_mult = df_col.copy()
+ df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
+ names=['index', 'date'])
+ r = df.resample('2D')
+ cases = [
+ r,
+ df_col.resample('2D', on='date'),
+ df_mult.resample('2D', level='date'),
+ df.groupby(pd.Grouper(freq='2D'))
+ ]
+
+ for t in cases:
+ def f():
+ t.aggregate({'r1': {'A': ['mean', 'sum']},
+ 'r2': {'B': ['mean', 'sum']}})
+ pytest.raises(ValueError, f)
+
+ for t in cases:
+ expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(),
+ t['B'].std()], axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
+ 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = t[['A', 'B']].agg({'A': {'ra': ['mean', 'std']},
+ 'B': {'rb': ['mean', 'std']}})
+ assert_frame_equal(result, expected, check_like=True)
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = t.agg({'A': {'ra': ['mean', 'std']},
+ 'B': {'rb': ['mean', 'std']}})
+ assert_frame_equal(result, expected, check_like=True)
+
+
+def test_try_aggregate_non_existing_column():
+ # GH 16766
+ data = [
+ {'dt': datetime(2017, 6, 1, 0), 'x': 1.0, 'y': 2.0},
+ {'dt': datetime(2017, 6, 1, 1), 'x': 2.0, 'y': 2.0},
+ {'dt': datetime(2017, 6, 1, 2), 'x': 3.0, 'y': 1.5}
+ ]
+ df = DataFrame(data).set_index('dt')
+
+ # Error as we don't have 'z' column
+ with pytest.raises(KeyError):
+ df.resample('30T').agg({'x': ['mean'],
+ 'y': ['median'],
+ 'z': ['sum']})
+
+
+def test_selection_api_validation():
+ # GH 13500
+ index = date_range(datetime(2005, 1, 1),
+ datetime(2005, 1, 10), freq='D')
+
+ rng = np.arange(len(index), dtype=np.int64)
+ df = DataFrame({'date': index, 'a': rng},
+ index=pd.MultiIndex.from_arrays([rng, index],
+ names=['v', 'd']))
+ df_exp = DataFrame({'a': rng}, index=index)
+
+ # non DatetimeIndex
+ with pytest.raises(TypeError):
+ df.resample('2D', level='v')
+
+ with pytest.raises(ValueError):
+ df.resample('2D', on='date', level='d')
+
+ with pytest.raises(TypeError):
+ df.resample('2D', on=['a', 'date'])
+
+ with pytest.raises(KeyError):
+ df.resample('2D', level=['a', 'date'])
+
+ # upsampling not allowed
+ with pytest.raises(ValueError):
+ df.resample('2D', level='d').asfreq()
+
+ with pytest.raises(ValueError):
+ df.resample('2D', on='date').asfreq()
+
+ exp = df_exp.resample('2D').sum()
+ exp.index.name = 'date'
+ assert_frame_equal(exp, df.resample('2D', on='date').sum())
+
+ exp.index.name = 'd'
+ assert_frame_equal(exp, df.resample('2D', level='d').sum())
diff --git a/contrib/python/pandas/py2/pandas/tests/resample/test_resampler_grouper.py b/contrib/python/pandas/py2/pandas/tests/resample/test_resampler_grouper.py
new file mode 100644
index 00000000000..b61acfc3d2c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/resample/test_resampler_grouper.py
@@ -0,0 +1,260 @@
+# pylint: disable=E1101
+
+from textwrap import dedent
+
+import numpy as np
+
+from pandas.compat import range
+
+import pandas as pd
+from pandas import DataFrame, Series, Timestamp
+from pandas.core.indexes.datetimes import date_range
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+test_frame = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8,
+ 'B': np.arange(40)},
+ index=date_range('1/1/2000',
+ freq='s',
+ periods=40))
+
+
+def test_tab_complete_ipython6_warning(ip):
+ from IPython.core.completer import provisionalcompleter
+ code = dedent("""\
+ import pandas.util.testing as tm
+ s = tm.makeTimeSeries()
+ rs = s.resample("D")
+ """)
+ ip.run_code(code)
+
+ with tm.assert_produces_warning(None):
+ with provisionalcompleter('ignore'):
+ list(ip.Completer.completions('rs.', 1))
+
+
+def test_deferred_with_groupby():
+
+ # GH 12486
+ # support deferred resample ops with groupby
+ data = [['2010-01-01', 'A', 2], ['2010-01-02', 'A', 3],
+ ['2010-01-05', 'A', 8], ['2010-01-10', 'A', 7],
+ ['2010-01-13', 'A', 3], ['2010-01-01', 'B', 5],
+ ['2010-01-03', 'B', 2], ['2010-01-04', 'B', 1],
+ ['2010-01-11', 'B', 7], ['2010-01-14', 'B', 3]]
+
+ df = DataFrame(data, columns=['date', 'id', 'score'])
+ df.date = pd.to_datetime(df.date)
+
+ def f(x):
+ return x.set_index('date').resample('D').asfreq()
+ expected = df.groupby('id').apply(f)
+ result = df.set_index('date').groupby('id').resample('D').asfreq()
+ assert_frame_equal(result, expected)
+
+ df = DataFrame({'date': pd.date_range(start='2016-01-01',
+ periods=4,
+ freq='W'),
+ 'group': [1, 1, 2, 2],
+ 'val': [5, 6, 7, 8]}).set_index('date')
+
+ def f(x):
+ return x.resample('1D').ffill()
+ expected = df.groupby('group').apply(f)
+ result = df.groupby('group').resample('1D').ffill()
+ assert_frame_equal(result, expected)
+
+
+def test_getitem():
+ g = test_frame.groupby('A')
+
+ expected = g.B.apply(lambda x: x.resample('2s').mean())
+
+ result = g.resample('2s').B.mean()
+ assert_series_equal(result, expected)
+
+ result = g.B.resample('2s').mean()
+ assert_series_equal(result, expected)
+
+ result = g.resample('2s').mean().B
+ assert_series_equal(result, expected)
+
+
+def test_getitem_multiple():
+
+ # GH 13174
+ # multiple calls after selection were causing an aliasing issue
+ data = [{'id': 1, 'buyer': 'A'}, {'id': 2, 'buyer': 'B'}]
+ df = DataFrame(data, index=pd.date_range('2016-01-01', periods=2))
+ r = df.groupby('id').resample('1D')
+ result = r['buyer'].count()
+ expected = Series([1, 1],
+ index=pd.MultiIndex.from_tuples(
+ [(1, Timestamp('2016-01-01')),
+ (2, Timestamp('2016-01-02'))],
+ names=['id', None]),
+ name='buyer')
+ assert_series_equal(result, expected)
+
+ result = r['buyer'].count()
+ assert_series_equal(result, expected)
+
+
+def test_groupby_resample_on_api_with_getitem():
+ # GH 17813
+ df = pd.DataFrame({'id': list('aabbb'),
+ 'date': pd.date_range('1-1-2016', periods=5),
+ 'data': 1})
+ exp = df.set_index('date').groupby('id').resample('2D')['data'].sum()
+ result = df.groupby('id').resample('2D', on='date')['data'].sum()
+ assert_series_equal(result, exp)
+
+
+def test_nearest():
+
+ # GH 17496
+ # Resample nearest
+ index = pd.date_range('1/1/2000', periods=3, freq='T')
+ result = Series(range(3), index=index).resample('20s').nearest()
+
+ expected = Series(
+ [0, 0, 1, 1, 1, 2, 2],
+ index=pd.DatetimeIndex(
+ ['2000-01-01 00:00:00', '2000-01-01 00:00:20',
+ '2000-01-01 00:00:40', '2000-01-01 00:01:00',
+ '2000-01-01 00:01:20', '2000-01-01 00:01:40',
+ '2000-01-01 00:02:00'],
+ dtype='datetime64[ns]',
+ freq='20S'))
+ assert_series_equal(result, expected)
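+ # Hedged note (ours): each 20s stamp takes the value of the nearest
+ # original minute mark, e.g. 00:00:20 -> 0 but 00:00:40 -> 1.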
+
+
+def test_methods():
+ g = test_frame.groupby('A')
+ r = g.resample('2s')
+
+ for f in ['first', 'last', 'median', 'sem', 'sum', 'mean',
+ 'min', 'max']:
+ result = getattr(r, f)()
+ expected = g.apply(lambda x: getattr(x.resample('2s'), f)())
+ assert_frame_equal(result, expected)
+
+ for f in ['size']:
+ result = getattr(r, f)()
+ expected = g.apply(lambda x: getattr(x.resample('2s'), f)())
+ assert_series_equal(result, expected)
+
+ for f in ['count']:
+ result = getattr(r, f)()
+ expected = g.apply(lambda x: getattr(x.resample('2s'), f)())
+ assert_frame_equal(result, expected)
+
+ # series only
+ for f in ['nunique']:
+ result = getattr(r.B, f)()
+ expected = g.B.apply(lambda x: getattr(x.resample('2s'), f)())
+ assert_series_equal(result, expected)
+
+ for f in ['nearest', 'backfill', 'ffill', 'asfreq']:
+ result = getattr(r, f)()
+ expected = g.apply(lambda x: getattr(x.resample('2s'), f)())
+ assert_frame_equal(result, expected)
+
+ result = r.ohlc()
+ expected = g.apply(lambda x: x.resample('2s').ohlc())
+ assert_frame_equal(result, expected)
+
+ for f in ['std', 'var']:
+ result = getattr(r, f)(ddof=1)
+ expected = g.apply(lambda x: getattr(x.resample('2s'), f)(ddof=1))
+ assert_frame_equal(result, expected)
+
+
+def test_apply():
+
+ g = test_frame.groupby('A')
+ r = g.resample('2s')
+
+ # reduction
+ expected = g.resample('2s').sum()
+
+ def f(x):
+ return x.resample('2s').sum()
+
+ result = r.apply(f)
+ assert_frame_equal(result, expected)
+
+ def f(x):
+ return x.resample('2s').apply(lambda y: y.sum())
+
+ result = g.apply(f)
+ assert_frame_equal(result, expected)
+
+
+def test_apply_with_mutated_index():
+ # GH 15169
+ index = pd.date_range('1-1-2015', '12-31-15', freq='D')
+ df = DataFrame(data={'col1': np.random.rand(len(index))}, index=index)
+
+ def f(x):
+ s = Series([1, 2], index=['a', 'b'])
+ return s
+
+ expected = df.groupby(pd.Grouper(freq='M')).apply(f)
+
+ result = df.resample('M').apply(f)
+ assert_frame_equal(result, expected)
+
+ # A case for series
+ expected = df['col1'].groupby(pd.Grouper(freq='M')).apply(f)
+ result = df['col1'].resample('M').apply(f)
+ assert_series_equal(result, expected)
+
+
+def test_resample_groupby_with_label():
+ # GH 13235
+ index = date_range('2000-01-01', freq='2D', periods=5)
+ df = DataFrame(index=index,
+ data={'col0': [0, 0, 1, 1, 2], 'col1': [1, 1, 1, 1, 1]}
+ )
+ result = df.groupby('col0').resample('1W', label='left').sum()
+
+ mi = [np.array([0, 0, 1, 2]),
+ pd.to_datetime(np.array(['1999-12-26', '2000-01-02',
+ '2000-01-02', '2000-01-02'])
+ )
+ ]
+ mindex = pd.MultiIndex.from_arrays(mi, names=['col0', None])
+ expected = DataFrame(data={'col0': [0, 0, 2, 2], 'col1': [1, 1, 2, 1]},
+ index=mindex
+ )
+
+ assert_frame_equal(result, expected)
+
+
+def test_consistency_with_window():
+
+ # consistent return values with window
+ df = test_frame
+ expected = pd.Int64Index([1, 2, 3], name='A')
+ result = df.groupby('A').resample('2s').mean()
+ assert result.index.nlevels == 2
+ tm.assert_index_equal(result.index.levels[0], expected)
+
+ result = df.groupby('A').rolling(20).mean()
+ assert result.index.nlevels == 2
+ tm.assert_index_equal(result.index.levels[0], expected)
+
+
+def test_median_duplicate_columns():
+ # GH 14233
+
+ df = DataFrame(np.random.randn(20, 3),
+ columns=list('aaa'),
+ index=pd.date_range('2012-01-01', periods=20, freq='s'))
+ df2 = df.copy()
+ df2.columns = ['a', 'b', 'c']
+ expected = df2.resample('5s').median()
+ result = df.resample('5s').median()
+ expected.columns = result.columns
+ assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/resample/test_time_grouper.py b/contrib/python/pandas/py2/pandas/tests/resample/test_time_grouper.py
new file mode 100644
index 00000000000..ec29b55ac9d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/resample/test_time_grouper.py
@@ -0,0 +1,287 @@
+from datetime import datetime
+from operator import methodcaller
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Panel, Series
+from pandas.core.indexes.datetimes import date_range
+from pandas.core.resample import TimeGrouper
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+test_series = Series(np.random.randn(1000),
+ index=date_range('1/1/2000', periods=1000))
+
+
+def test_apply():
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ grouper = pd.TimeGrouper(freq='A', label='right', closed='right')
+
+ grouped = test_series.groupby(grouper)
+
+ def f(x):
+ return x.sort_values()[-3:]
+
+ applied = grouped.apply(f)
+ expected = test_series.groupby(lambda x: x.year).apply(f)
+
+ applied.index = applied.index.droplevel(0)
+ expected.index = expected.index.droplevel(0)
+ assert_series_equal(applied, expected)
+
+
+def test_count():
+ test_series[::3] = np.nan
+
+ expected = test_series.groupby(lambda x: x.year).count()
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ grouper = pd.TimeGrouper(freq='A', label='right', closed='right')
+ result = test_series.groupby(grouper).count()
+ expected.index = result.index
+ assert_series_equal(result, expected)
+
+ result = test_series.resample('A').count()
+ expected.index = result.index
+ assert_series_equal(result, expected)
+
+
+def test_numpy_reduction():
+ result = test_series.resample('A', closed='right').prod()
+
+ expected = test_series.groupby(lambda x: x.year).agg(np.prod)
+ expected.index = result.index
+
+ assert_series_equal(result, expected)
+
+
+def test_apply_iteration():
+ # #2300
+ N = 1000
+ ind = pd.date_range(start="2000-01-01", freq="D", periods=N)
+ df = DataFrame({'open': 1, 'close': 2}, index=ind)
+ tg = TimeGrouper('M')
+
+ _, grouper, _ = tg._get_grouper(df)
+
+ # Errors
+ grouped = df.groupby(grouper, group_keys=False)
+
+ def f(df):
+ return df['close'] / df['open']
+
+ # it works!
+ result = grouped.apply(f)
+ tm.assert_index_equal(result.index, df.index)
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+def test_panel_aggregation():
+ ind = pd.date_range('1/1/2000', periods=100)
+ data = np.random.randn(2, len(ind), 4)
+
+ wp = Panel(data, items=['Item1', 'Item2'], major_axis=ind,
+ minor_axis=['A', 'B', 'C', 'D'])
+
+ tg = TimeGrouper('M', axis=1)
+ _, grouper, _ = tg._get_grouper(wp)
+ bingrouped = wp.groupby(grouper)
+ binagg = bingrouped.mean()
+
+ def f(x):
+ assert (isinstance(x, Panel))
+ return x.mean(1)
+
+ result = bingrouped.agg(f)
+ tm.assert_panel_equal(result, binagg)
+
+
[email protected]('name, func', [
+ ('Int64Index', tm.makeIntIndex),
+ ('Index', tm.makeUnicodeIndex),
+ ('Float64Index', tm.makeFloatIndex),
+ ('MultiIndex', lambda m: tm.makeCustomIndex(m, 2))
+])
+def test_fails_on_no_datetime_index(name, func):
+ n = 2
+ index = func(n)
+ df = DataFrame({'a': np.random.randn(n)}, index=index)
+
+ msg = ("Only valid with DatetimeIndex, TimedeltaIndex "
+ "or PeriodIndex, but got an instance of %r" % name)
+ with pytest.raises(TypeError, match=msg):
+ df.groupby(TimeGrouper('D'))
+
+
+def test_aaa_group_order():
+ # GH 12840
+ # check that TimeGrouper performs stable sorts
+ n = 20
+ data = np.random.randn(n, 4)
+ df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+ df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2),
+ datetime(2013, 1, 3), datetime(2013, 1, 4),
+ datetime(2013, 1, 5)] * 4
+ grouped = df.groupby(TimeGrouper(key='key', freq='D'))
+
+ tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)),
+ df[::5])
+ tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 2)),
+ df[1::5])
+ tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 3)),
+ df[2::5])
+ tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 4)),
+ df[3::5])
+ tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)),
+ df[4::5])
+
+
+def test_aggregate_normal(resample_method):
+ """Check TimeGrouper's aggregation is identical as normal groupby."""
+
+ if resample_method == 'ohlc':
+ pytest.xfail(reason='DataError: No numeric types to aggregate')
+
+ data = np.random.randn(20, 4)
+ normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+ normal_df['key'] = [1, 2, 3, 4, 5] * 4
+
+ dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+ dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2),
+ datetime(2013, 1, 3), datetime(2013, 1, 4),
+ datetime(2013, 1, 5)] * 4
+
+ normal_grouped = normal_df.groupby('key')
+ dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
+
+ expected = getattr(normal_grouped, resample_method)()
+ dt_result = getattr(dt_grouped, resample_method)()
+ expected.index = date_range(start='2013-01-01', freq='D',
+ periods=5, name='key')
+ tm.assert_equal(expected, dt_result)
+
+ # if TimeGrouper is used, 'nth' doesn't work yet
+
+ """
+ for func in ['nth']:
+ expected = getattr(normal_grouped, func)(3)
+ expected.index = date_range(start='2013-01-01',
+ freq='D', periods=5, name='key')
+ dt_result = getattr(dt_grouped, func)(3)
+ assert_frame_equal(expected, dt_result)
+ """
+
+
[email protected]('method, method_args, unit', [
+ ('sum', dict(), 0),
+ ('sum', dict(min_count=0), 0),
+ ('sum', dict(min_count=1), np.nan),
+ ('prod', dict(), 1),
+ ('prod', dict(min_count=0), 1),
+ ('prod', dict(min_count=1), np.nan)
+])
+def test_resample_entirely_nat_window(method, method_args, unit):
+ s = pd.Series([0] * 2 + [np.nan] * 2,
+ index=pd.date_range('2017', periods=4))
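+ # operator.methodcaller builds a callable that invokes the named method
+ # with the given kwargs, so ('sum', dict(min_count=1)) turns into
+ # s.resample("2d").sum(min_count=1)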
+ result = methodcaller(method, **method_args)(s.resample("2d"))
+ expected = pd.Series([0.0, unit],
+ index=pd.to_datetime(['2017-01-01',
+ '2017-01-03']))
+ tm.assert_series_equal(result, expected)
+
+
[email protected]('func, fill_value', [
+ ('min', np.nan),
+ ('max', np.nan),
+ ('sum', 0),
+ ('prod', 1),
+ ('count', 0),
+])
+def test_aggregate_with_nat(func, fill_value):
+ # check that TimeGrouper aggregation matches a normal groupby;
+ # if NaT is included, 'var', 'std', 'mean', 'first', 'last'
+ # and 'nth' don't work yet
+
+ n = 20
+ data = np.random.randn(n, 4).astype('int64')
+ normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+ normal_df['key'] = [1, 2, np.nan, 4, 5] * 4
+
+ dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+ dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
+ datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4
+
+ normal_grouped = normal_df.groupby('key')
+ dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
+
+ normal_result = getattr(normal_grouped, func)()
+ dt_result = getattr(dt_grouped, func)()
+
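+ # groupby drops the NaN/NaT keys, but the daily TimeGrouper still
+ # emits an (empty) bin for Jan 3; pad the normal result to match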
+ pad = DataFrame([[fill_value] * 4], index=[3],
+ columns=['A', 'B', 'C', 'D'])
+ expected = normal_result.append(pad)
+ expected = expected.sort_index()
+ expected.index = date_range(start='2013-01-01', freq='D',
+ periods=5, name='key')
+ assert_frame_equal(expected, dt_result)
+ assert dt_result.index.name == 'key'
+
+
+def test_aggregate_with_nat_size():
+ # GH 9925
+ n = 20
+ data = np.random.randn(n, 4).astype('int64')
+ normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+ normal_df['key'] = [1, 2, np.nan, 4, 5] * 4
+
+ dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+ dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
+ datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4
+
+ normal_grouped = normal_df.groupby('key')
+ dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
+
+ normal_result = normal_grouped.size()
+ dt_result = dt_grouped.size()
+
+ pad = Series([0], index=[3])
+ expected = normal_result.append(pad)
+ expected = expected.sort_index()
+ expected.index = date_range(start='2013-01-01', freq='D',
+ periods=5, name='key')
+ assert_series_equal(expected, dt_result)
+ assert dt_result.index.name == 'key'
+
+
+def test_repr():
+ # GH18203
+ result = repr(TimeGrouper(key='A', freq='H'))
+ expected = ("TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
+ "closed='left', label='left', how='mean', "
+ "convention='e', base=0)")
+ assert result == expected
+
+
[email protected]('method, method_args, expected_values', [
+ ('sum', dict(), [1, 0, 1]),
+ ('sum', dict(min_count=0), [1, 0, 1]),
+ ('sum', dict(min_count=1), [1, np.nan, 1]),
+ ('sum', dict(min_count=2), [np.nan, np.nan, np.nan]),
+ ('prod', dict(), [1, 1, 1]),
+ ('prod', dict(min_count=0), [1, 1, 1]),
+ ('prod', dict(min_count=1), [1, np.nan, 1]),
+ ('prod', dict(min_count=2), [np.nan, np.nan, np.nan]),
+])
+def test_upsample_sum(method, method_args, expected_values):
+ s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H"))
+ resampled = s.resample("30T")
+ index = pd.to_datetime(['2017-01-01T00:00:00',
+ '2017-01-01T00:30:00',
+ '2017-01-01T01:00:00'])
+ result = methodcaller(method, **method_args)(resampled)
+ expected = pd.Series(expected_values, index=index)
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/resample/test_timedelta.py b/contrib/python/pandas/py2/pandas/tests/resample/test_timedelta.py
new file mode 100644
index 00000000000..3498d30d116
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/resample/test_timedelta.py
@@ -0,0 +1,128 @@
+from datetime import timedelta
+
+import numpy as np
+
+import pandas as pd
+from pandas import DataFrame, Series
+from pandas.core.indexes.timedeltas import timedelta_range
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+def test_asfreq_bug():
+ df = DataFrame(data=[1, 3],
+ index=[timedelta(), timedelta(minutes=3)])
+ result = df.resample('1T').asfreq()
+ expected = DataFrame(data=[1, np.nan, np.nan, 3],
+ index=timedelta_range('0 day',
+ periods=4,
+ freq='1T'))
+ assert_frame_equal(result, expected)
+
+
+def test_resample_with_nat():
+ # GH 13223
+ index = pd.to_timedelta(['0s', pd.NaT, '2s'])
+ result = DataFrame({'value': [2, 3, 5]}, index).resample('1s').mean()
+ expected = DataFrame({'value': [2.5, np.nan, 5.0]},
+ index=timedelta_range('0 day',
+ periods=3,
+ freq='1S'))
+ assert_frame_equal(result, expected)
+
+
+def test_resample_as_freq_with_subperiod():
+ # GH 13022
+ index = timedelta_range('00:00:00', '00:10:00', freq='5T')
+ df = DataFrame(data={'value': [1, 5, 10]}, index=index)
+ result = df.resample('2T').asfreq()
+ expected_data = {'value': [1, np.nan, np.nan, np.nan, np.nan, 10]}
+ expected = DataFrame(data=expected_data,
+ index=timedelta_range('00:00:00',
+ '00:10:00', freq='2T'))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_resample_with_timedeltas():
+
+ expected = DataFrame({'A': np.arange(1480)})
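+ # integer-divide the default RangeIndex by 30 to bucket the 1480
+ # one-minute points into the same 30-minute bins as resample('30T')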
+ expected = expected.groupby(expected.index // 30).sum()
+ expected.index = pd.timedelta_range('0 days', freq='30T', periods=50)
+
+ df = DataFrame({'A': np.arange(1480)}, index=pd.to_timedelta(
+ np.arange(1480), unit='T'))
+ result = df.resample('30T').sum()
+
+ assert_frame_equal(result, expected)
+
+ s = df['A']
+ result = s.resample('30T').sum()
+ assert_series_equal(result, expected['A'])
+
+
+def test_resample_single_period_timedelta():
+
+ s = Series(list(range(5)), index=pd.timedelta_range(
+ '1 day', freq='s', periods=5))
+ result = s.resample('2s').sum()
+ expected = Series([1, 5, 4], index=pd.timedelta_range(
+ '1 day', freq='2s', periods=3))
+ assert_series_equal(result, expected)
+
+
+def test_resample_timedelta_idempotency():
+
+ # GH 12072
+ index = pd.timedelta_range('0', periods=9, freq='10L')
+ series = Series(range(9), index=index)
+ result = series.resample('10L').mean()
+ expected = series
+ assert_series_equal(result, expected)
+
+
+def test_resample_base_with_timedeltaindex():
+
+ # GH 10530
+ rng = timedelta_range(start='0s', periods=25, freq='s')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ with_base = ts.resample('2s', base=5).mean()
+ without_base = ts.resample('2s').mean()
+
+ exp_without_base = timedelta_range(start='0s', end='25s', freq='2s')
+ exp_with_base = timedelta_range(start='5s', end='29s', freq='2s')
+
+ tm.assert_index_equal(without_base.index, exp_without_base)
+ tm.assert_index_equal(with_base.index, exp_with_base)
+
+
+def test_resample_categorical_data_with_timedeltaindex():
+ # GH #12169
+ df = DataFrame({'Group_obj': 'A'},
+ index=pd.to_timedelta(list(range(20)), unit='s'))
+ df['Group'] = df['Group_obj'].astype('category')
+ result = df.resample('10s').agg(lambda x: (x.value_counts().index[0]))
+ expected = DataFrame({'Group_obj': ['A', 'A'],
+ 'Group': ['A', 'A']},
+ index=pd.to_timedelta([0, 10], unit='s'))
+ expected = expected.reindex(['Group_obj', 'Group'], axis=1)
+ expected['Group'] = expected['Group_obj'].astype('category')
+ tm.assert_frame_equal(result, expected)
+
+
+def test_resample_timedelta_values():
+ # GH 13119
+ # check that timedelta dtype is preserved when NaT values are
+ # introduced by the resampling
+
+ times = timedelta_range('1 day', '4 day', freq='4D')
+ df = DataFrame({'time': times}, index=times)
+
+ times2 = timedelta_range('1 day', '4 day', freq='2D')
+ exp = Series(times2, index=times2, name='time')
+ exp.iloc[1] = pd.NaT
+
+ res = df.resample('2D').first()['time']
+ tm.assert_series_equal(res, exp)
+ res = df['time'].resample('2D').first()
+ tm.assert_series_equal(res, exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/__init__.py b/contrib/python/pandas/py2/pandas/tests/reshape/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/merge/__init__.py b/contrib/python/pandas/py2/pandas/tests/reshape/merge/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/merge/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_join.py b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_join.py
new file mode 100644
index 00000000000..c2a214446bb
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_join.py
@@ -0,0 +1,880 @@
+# pylint: disable=E1103
+
+from warnings import catch_warnings
+
+import numpy as np
+from numpy.random import randn
+import pytest
+
+from pandas._libs import join as libjoin
+import pandas.compat as compat
+from pandas.compat import lrange
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series, concat, merge
+from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal
+
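+# short alias to keep the expected-indexer array literals compact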
+a_ = np.array
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class TestJoin(object):
+
+ def setup_method(self, method):
+ # aggregate multiple columns
+ self.df = DataFrame({'key1': get_test_data(),
+ 'key2': get_test_data(),
+ 'data1': np.random.randn(N),
+ 'data2': np.random.randn(N)})
+
+ # exclude a couple keys for fun
+ self.df = self.df[self.df['key2'] > 1]
+
+ self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
+ 'key2': get_test_data(ngroups=NGROUPS // 2,
+ n=N // 5),
+ 'value': np.random.randn(N // 5)})
+
+ index, data = tm.getMixedTypeDict()
+ self.target = DataFrame(data, index=index)
+
+ # Join on string value
+ self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']},
+ index=data['C'])
+
+ def test_cython_left_outer_join(self):
+ left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
+ right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
+ max_group = 5
+
+ ls, rs = libjoin.left_outer_join(left, right, max_group)
+
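+ # mergesort is stable, so argsort reproduces the deterministic
+ # ordering the cython join routines are expected to emit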
+ exp_ls = left.argsort(kind='mergesort')
+ exp_rs = right.argsort(kind='mergesort')
+
+ exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
+ 6, 6, 7, 7, 8, 8, 9, 10])
+ exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
+ 4, 5, 4, 5, 4, 5, -1, -1])
+
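+ # -1 in the indexers marks unmatched rows; take() would wrap around
+ # on -1, so those positions are re-masked to -1 afterwards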
+ exp_ls = exp_ls.take(exp_li)
+ exp_ls[exp_li == -1] = -1
+
+ exp_rs = exp_rs.take(exp_ri)
+ exp_rs[exp_ri == -1] = -1
+
+ tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
+ tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
+
+ def test_cython_right_outer_join(self):
+ left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
+ right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
+ max_group = 5
+
+ rs, ls = libjoin.left_outer_join(right, left, max_group)
+
+ exp_ls = left.argsort(kind='mergesort')
+ exp_rs = right.argsort(kind='mergesort')
+
+ # 0 1 1 1
+ exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
+ # 2 2 4
+ 6, 7, 8, 6, 7, 8, -1])
+ exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
+ 4, 4, 4, 5, 5, 5, 6])
+
+ exp_ls = exp_ls.take(exp_li)
+ exp_ls[exp_li == -1] = -1
+
+ exp_rs = exp_rs.take(exp_ri)
+ exp_rs[exp_ri == -1] = -1
+
+ tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
+ tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
+
+ def test_cython_inner_join(self):
+ left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
+ right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
+ max_group = 5
+
+ ls, rs = libjoin.inner_join(left, right, max_group)
+
+ exp_ls = left.argsort(kind='mergesort')
+ exp_rs = right.argsort(kind='mergesort')
+
+ exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
+ 6, 6, 7, 7, 8, 8])
+ exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
+ 4, 5, 4, 5, 4, 5])
+
+ exp_ls = exp_ls.take(exp_li)
+ exp_ls[exp_li == -1] = -1
+
+ exp_rs = exp_rs.take(exp_ri)
+ exp_rs[exp_ri == -1] = -1
+
+ tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
+ tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
+
+ def test_left_outer_join(self):
+ joined_key2 = merge(self.df, self.df2, on='key2')
+ _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')
+
+ joined_both = merge(self.df, self.df2)
+ _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
+ how='left')
+
+ def test_right_outer_join(self):
+ joined_key2 = merge(self.df, self.df2, on='key2', how='right')
+ _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')
+
+ joined_both = merge(self.df, self.df2, how='right')
+ _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
+ how='right')
+
+ def test_full_outer_join(self):
+ joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
+ _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')
+
+ joined_both = merge(self.df, self.df2, how='outer')
+ _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
+ how='outer')
+
+ def test_inner_join(self):
+ joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
+ _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')
+
+ joined_both = merge(self.df, self.df2, how='inner')
+ _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
+ how='inner')
+
+ def test_handle_overlap(self):
+ joined = merge(self.df, self.df2, on='key2',
+ suffixes=['.foo', '.bar'])
+
+ assert 'key1.foo' in joined
+ assert 'key1.bar' in joined
+
+ def test_handle_overlap_arbitrary_key(self):
+ joined = merge(self.df, self.df2,
+ left_on='key2', right_on='key1',
+ suffixes=['.foo', '.bar'])
+ assert 'key1.foo' in joined
+ assert 'key2.bar' in joined
+
+ def test_join_on(self):
+ target = self.target
+ source = self.source
+
+ merged = target.join(source, on='C')
+ tm.assert_series_equal(merged['MergedA'], target['A'],
+ check_names=False)
+ tm.assert_series_equal(merged['MergedD'], target['D'],
+ check_names=False)
+
+ # join with duplicates (fix regression from DataFrame/Matrix merge)
+ df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
+ df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
+ joined = df.join(df2, on='key')
+ expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
+ 'value': [0, 0, 1, 1, 2]})
+ assert_frame_equal(joined, expected)
+
+ # Test when some are missing
+ df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
+ columns=['one'])
+ df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
+ columns=['two'])
+ df_c = DataFrame([[1], [2]], index=[1, 2],
+ columns=['three'])
+ joined = df_a.join(df_b, on='one')
+ joined = joined.join(df_c, on='one')
+ assert np.isnan(joined['two']['c'])
+ assert np.isnan(joined['three']['c'])
+
+ # merge column not present
+ with pytest.raises(KeyError, match="^'E'$"):
+ target.join(source, on='E')
+
+ # overlap
+ source_copy = source.copy()
+ source_copy['A'] = 0
+ msg = ("You are trying to merge on float64 and object columns. If"
+ " you wish to proceed you should use pd.concat")
+ with pytest.raises(ValueError, match=msg):
+ target.join(source_copy, on='A')
+
+ def test_join_on_fails_with_different_right_index(self):
+ df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
+ 'b': np.random.randn(3)})
+ df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
+ 'b': np.random.randn(10)},
+ index=tm.makeCustomIndex(10, 2))
+ msg = (r'len\(left_on\) must equal the number of levels in the index'
+ ' of "right"')
+ with pytest.raises(ValueError, match=msg):
+ merge(df, df2, left_on='a', right_index=True)
+
+ def test_join_on_fails_with_different_left_index(self):
+ df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
+ 'b': np.random.randn(3)},
+ index=tm.makeCustomIndex(3, 2))
+ df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
+ 'b': np.random.randn(10)})
+ msg = (r'len\(right_on\) must equal the number of levels in the index'
+ ' of "left"')
+ with pytest.raises(ValueError, match=msg):
+ merge(df, df2, right_on='b', left_index=True)
+
+ def test_join_on_fails_with_different_column_counts(self):
+ df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
+ 'b': np.random.randn(3)})
+ df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
+ 'b': np.random.randn(10)},
+ index=tm.makeCustomIndex(10, 2))
+ msg = r"len\(right_on\) must equal len\(left_on\)"
+ with pytest.raises(ValueError, match=msg):
+ merge(df, df2, right_on='a', left_on=['a', 'b'])
+
+ @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])])
+ def test_join_on_fails_with_wrong_object_type(self, wrong_type):
+ # GH12081 - original issue
+
+ # GH21220 - merging of Series and DataFrame is now allowed
+ # Edited test to remove the Series object from test parameters
+
+ df = DataFrame({'a': [1, 1]})
+ msg = ("Can only merge Series or DataFrame objects, a {} was passed"
+ .format(str(type(wrong_type))))
+ with pytest.raises(TypeError, match=msg):
+ merge(wrong_type, df, left_on='a', right_on='a')
+ with pytest.raises(TypeError, match=msg):
+ merge(df, wrong_type, left_on='a', right_on='a')
+
+ def test_join_on_pass_vector(self):
+ expected = self.target.join(self.source, on='C')
+ del expected['C']
+
+ join_col = self.target.pop('C')
+ result = self.target.join(self.source, on=join_col)
+ assert_frame_equal(result, expected)
+
+ def test_join_with_len0(self):
+ # nothing to merge
+ merged = self.target.join(self.source.reindex([]), on='C')
+ for col in self.source:
+ assert col in merged
+ assert merged[col].isna().all()
+
+ merged2 = self.target.join(self.source.reindex([]), on='C',
+ how='inner')
+ tm.assert_index_equal(merged2.columns, merged.columns)
+ assert len(merged2) == 0
+
+ def test_join_on_inner(self):
+ df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
+ df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])
+
+ joined = df.join(df2, on='key', how='inner')
+
+ expected = df.join(df2, on='key')
+ expected = expected[expected['value'].notna()]
+ tm.assert_series_equal(joined['key'], expected['key'],
+ check_dtype=False)
+ tm.assert_series_equal(joined['value'], expected['value'],
+ check_dtype=False)
+ tm.assert_index_equal(joined.index, expected.index)
+
+ def test_join_on_singlekey_list(self):
+ df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
+ df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
+
+ # corner cases
+ joined = df.join(df2, on=['key'])
+ expected = df.join(df2, on='key')
+
+ assert_frame_equal(joined, expected)
+
+ def test_join_on_series(self):
+ result = self.target.join(self.source['MergedA'], on='C')
+ expected = self.target.join(self.source[['MergedA']], on='C')
+ assert_frame_equal(result, expected)
+
+ def test_join_on_series_buglet(self):
+ # GH #638
+ df = DataFrame({'a': [1, 1]})
+ ds = Series([2], index=[1], name='b')
+ result = df.join(ds, on='a')
+ expected = DataFrame({'a': [1, 1],
+ 'b': [2, 2]}, index=df.index)
+ tm.assert_frame_equal(result, expected)
+
+ def test_join_index_mixed(self, join_type):
+ # no overlapping blocks
+ df1 = DataFrame(index=np.arange(10))
+ df1['bool'] = True
+ df1['string'] = 'foo'
+
+ df2 = DataFrame(index=np.arange(5, 15))
+ df2['int'] = 1
+ df2['float'] = 1.
+
+ joined = df1.join(df2, how=join_type)
+ expected = _join_by_hand(df1, df2, how=join_type)
+ assert_frame_equal(joined, expected)
+
+ joined = df2.join(df1, how=join_type)
+ expected = _join_by_hand(df2, df1, how=join_type)
+ assert_frame_equal(joined, expected)
+
+ def test_join_index_mixed_overlap(self):
+ df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
+ index=np.arange(10),
+ columns=['A', 'B', 'C', 'D'])
+ assert df1['B'].dtype == np.int64
+ assert df1['D'].dtype == np.bool_
+
+ df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
+ index=np.arange(0, 10, 2),
+ columns=['A', 'B', 'C', 'D'])
+
+ # overlap
+ joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
+ expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
+ 'A_two', 'B_two', 'C_two', 'D_two']
+ df1.columns = expected_columns[:4]
+ df2.columns = expected_columns[4:]
+ expected = _join_by_hand(df1, df2)
+ assert_frame_equal(joined, expected)
+
+ def test_join_empty_bug(self):
+ # generated an exception in 0.4.3
+ x = DataFrame()
+ x.join(DataFrame([3], index=[0], columns=['A']), how='outer')
+
+ def test_join_unconsolidated(self):
+ # GH #331
+ a = DataFrame(randn(30, 2), columns=['a', 'b'])
+ c = Series(randn(30))
+ a['c'] = c
+ d = DataFrame(randn(30, 1), columns=['q'])
+
+ # it works!
+ a.join(d)
+ d.join(a)
+
+ def test_join_multiindex(self):
+ index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
+ [1, 2, 3, 1, 2, 3]],
+ names=['first', 'second'])
+
+ index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
+ [1, 2, 3, 1, 2, 3]],
+ names=['first', 'second'])
+
+ df1 = DataFrame(data=np.random.randn(6), index=index1,
+ columns=['var X'])
+ df2 = DataFrame(data=np.random.randn(6), index=index2,
+ columns=['var Y'])
+
+ df1 = df1.sort_index(level=0)
+ df2 = df2.sort_index(level=0)
+
+ joined = df1.join(df2, how='outer')
+ ex_index = Index(index1.values).union(Index(index2.values))
+ expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
+ expected.index.names = index1.names
+ assert_frame_equal(joined, expected)
+ assert joined.index.names == index1.names
+
+ df1 = df1.sort_index(level=1)
+ df2 = df2.sort_index(level=1)
+
+ joined = df1.join(df2, how='outer').sort_index(level=0)
+ ex_index = Index(index1.values).union(Index(index2.values))
+ expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
+ expected.index.names = index1.names
+
+ assert_frame_equal(joined, expected)
+ assert joined.index.names == index1.names
+
+ def test_join_inner_multiindex(self):
+ key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
+ 'qux', 'snap']
+ key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
+ 'three', 'one']
+
+ data = np.random.randn(len(key1))
+ data = DataFrame({'key1': key1, 'key2': key2,
+ 'data': data})
+
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
+ ['one', 'two', 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ to_join = DataFrame(np.random.randn(10, 3), index=index,
+ columns=['j_one', 'j_two', 'j_three'])
+
+ joined = data.join(to_join, on=['key1', 'key2'], how='inner')
+ expected = merge(data, to_join.reset_index(),
+ left_on=['key1', 'key2'],
+ right_on=['first', 'second'], how='inner',
+ sort=False)
+
+ expected2 = merge(to_join, data,
+ right_on=['key1', 'key2'], left_index=True,
+ how='inner', sort=False)
+ assert_frame_equal(joined, expected2.reindex_like(joined))
+
+ expected2 = merge(to_join, data, right_on=['key1', 'key2'],
+ left_index=True, how='inner', sort=False)
+
+ expected = expected.drop(['first', 'second'], axis=1)
+ expected.index = joined.index
+
+ assert joined.index.is_monotonic
+ assert_frame_equal(joined, expected)
+
+ # _assert_same_contents(expected, expected2.loc[:, expected.columns])
+
+ def test_join_hierarchical_mixed(self):
+ # GH 2024
+ df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
+ new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
+ other_df = DataFrame(
+ [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
+ other_df.set_index('a', inplace=True)
+ # GH 9455, 12219
+ with tm.assert_produces_warning(UserWarning):
+ result = merge(new_df, other_df, left_index=True, right_index=True)
+ assert ('b', 'mean') in result
+ assert 'b' in result
+
+ def test_join_float64_float32(self):
+
+ a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
+ b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
+ joined = a.join(b)
+ assert joined.dtypes['a'] == 'float64'
+ assert joined.dtypes['b'] == 'float64'
+ assert joined.dtypes['c'] == 'float32'
+
+ a = np.random.randint(0, 5, 100).astype('int64')
+ b = np.random.random(100).astype('float64')
+ c = np.random.random(100).astype('float32')
+ df = DataFrame({'a': a, 'b': b, 'c': c})
+ xpdf = DataFrame({'a': a, 'b': b, 'c': c})
+ s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
+ rs = df.merge(s, left_on='a', right_index=True)
+ assert rs.dtypes['a'] == 'int64'
+ assert rs.dtypes['b'] == 'float64'
+ assert rs.dtypes['c'] == 'float32'
+ assert rs.dtypes['md'] == 'float32'
+
+ xp = xpdf.merge(s, left_on='a', right_index=True)
+ assert_frame_equal(rs, xp)
+
+ def test_join_many_non_unique_index(self):
+ df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
+ df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
+ df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
+ idf1 = df1.set_index(["a", "b"])
+ idf2 = df2.set_index(["a", "b"])
+ idf3 = df3.set_index(["a", "b"])
+
+ result = idf1.join([idf2, idf3], how='outer')
+
+ df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
+ expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')
+
+ result = result.reset_index()
+ expected = expected[result.columns]
+ expected['a'] = expected.a.astype('int64')
+ expected['b'] = expected.b.astype('int64')
+ assert_frame_equal(result, expected)
+
+ df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
+ df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
+ df3 = DataFrame(
+ {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
+ idf1 = df1.set_index(["a", "b"])
+ idf2 = df2.set_index(["a", "b"])
+ idf3 = df3.set_index(["a", "b"])
+ result = idf1.join([idf2, idf3], how='inner')
+
+ df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
+ expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')
+
+ result = result.reset_index()
+
+ assert_frame_equal(result, expected.loc[:, result.columns])
+
+ # GH 11519
+ df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'three',
+ 'two', 'two', 'one', 'three'],
+ 'C': np.random.randn(8),
+ 'D': np.random.randn(8)})
+ s = Series(np.repeat(np.arange(8), 2),
+ index=np.repeat(np.arange(8), 2), name='TEST')
+ inner = df.join(s, how='inner')
+ outer = df.join(s, how='outer')
+ left = df.join(s, how='left')
+ right = df.join(s, how='right')
+ assert_frame_equal(inner, outer)
+ assert_frame_equal(inner, left)
+ assert_frame_equal(inner, right)
+
+ def test_join_sort(self):
+ left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
+ 'value': [1, 2, 3, 4]})
+ right = DataFrame({'value2': ['a', 'b', 'c']},
+ index=['bar', 'baz', 'foo'])
+
+ joined = left.join(right, on='key', sort=True)
+ expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
+ 'value': [2, 3, 1, 4],
+ 'value2': ['a', 'b', 'c', 'c']},
+ index=[1, 2, 0, 3])
+ assert_frame_equal(joined, expected)
+
+ # smoke test
+ joined = left.join(right, on='key', sort=False)
+ tm.assert_index_equal(joined.index, pd.Index(lrange(4)))
+
+ def test_join_mixed_non_unique_index(self):
+ # GH 12814, unorderable types in py3 with a non-unique index
+ df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
+ df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
+ result = df1.join(df2)
+ expected = DataFrame({'a': [1, 2, 3, 3, 4],
+ 'b': [5, np.nan, 6, 7, np.nan]},
+ index=[1, 2, 3, 3, 'a'])
+ tm.assert_frame_equal(result, expected)
+
+ df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
+ df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
+ result = df3.join(df4)
+ expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]},
+ index=[1, 2, 2, 'a'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_join_non_unique_period_index(self):
+ # GH #16871
+ index = pd.period_range('2016-01-01', periods=16, freq='M')
+ df = DataFrame(list(range(len(index))),
+ index=index, columns=['pnum'])
+ df2 = concat([df, df])
+ result = df.join(df2, how='inner', rsuffix='_df2')
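+ # each row of df matches two rows of the doubled df2, so every value
+ # appears twice; tile(...).repeat(2) rebuilds that layout column-wise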
+ expected = DataFrame(
+ np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
+ columns=['pnum', 'pnum_df2'], index=df2.sort_index().index)
+ tm.assert_frame_equal(result, expected)
+
+ def test_mixed_type_join_with_suffix(self):
+ # GH #916
+ df = DataFrame(np.random.randn(20, 6),
+ columns=['a', 'b', 'c', 'd', 'e', 'f'])
+ df.insert(0, 'id', 0)
+ df.insert(5, 'dt', 'foo')
+
+ grouped = df.groupby('id')
+ mn = grouped.mean()
+ cn = grouped.count()
+
+ # it works!
+ mn.join(cn, rsuffix='_right')
+
+ def test_join_many(self):
+ df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
+ df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]
+
+ joined = df_list[0].join(df_list[1:])
+ tm.assert_frame_equal(joined, df)
+
+ df_list = [df[['a', 'b']][:-2],
+ df[['c', 'd']][2:], df[['e', 'f']][1:9]]
+
+ def _check_diff_index(df_list, result, exp_index):
+ reindexed = [x.reindex(exp_index) for x in df_list]
+ expected = reindexed[0].join(reindexed[1:])
+ tm.assert_frame_equal(result, expected)
+
+ # different join types
+ joined = df_list[0].join(df_list[1:], how='outer')
+ _check_diff_index(df_list, joined, df.index)
+
+ joined = df_list[0].join(df_list[1:])
+ _check_diff_index(df_list, joined, df_list[0].index)
+
+ joined = df_list[0].join(df_list[1:], how='inner')
+ _check_diff_index(df_list, joined, df.index[2:8])
+
+ msg = "Joining multiple DataFrames only supported for joining on index"
+ with pytest.raises(ValueError, match=msg):
+ df_list[0].join(df_list[1:], on='a')
+
+ def test_join_many_mixed(self):
+ df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
+ df['key'] = ['foo', 'bar'] * 4
+ df1 = df.loc[:, ['A', 'B']]
+ df2 = df.loc[:, ['C', 'D']]
+ df3 = df.loc[:, ['key']]
+
+ result = df1.join([df2, df3])
+ assert_frame_equal(result, df)
+
+ def test_join_dups(self):
+
+ # joining dups
+ df = concat([DataFrame(np.random.randn(10, 4),
+ columns=['A', 'A', 'B', 'B']),
+ DataFrame(np.random.randint(0, 10, size=20)
+ .reshape(10, 2),
+ columns=['A', 'C'])],
+ axis=1)
+
+ expected = concat([df, df], axis=1)
+ result = df.join(df, rsuffix='_2')
+ result.columns = expected.columns
+ assert_frame_equal(result, expected)
+
+ # GH 4975, invalid join on dups
+ w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
+ x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
+ y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
+ z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
+
+ dta = x.merge(y, left_index=True, right_index=True).merge(
+ z, left_index=True, right_index=True, how="outer")
+ dta = dta.merge(w, left_index=True, right_index=True)
+ expected = concat([x, y, z, w], axis=1)
+ expected.columns = ['x_x', 'y_x', 'x_y',
+ 'y_y', 'x_x', 'y_x', 'x_y', 'y_y']
+ assert_frame_equal(dta, expected)
+
+ def test_panel_join(self):
+ with catch_warnings(record=True):
+ panel = tm.makePanel()
+ tm.add_nans(panel)
+
+ p1 = panel.iloc[:2, :10, :3]
+ p2 = panel.iloc[2:, 5:, 2:]
+
+ # left join
+ result = p1.join(p2)
+ expected = p1.copy()
+ expected['ItemC'] = p2['ItemC']
+ tm.assert_panel_equal(result, expected)
+
+ # right join
+ result = p1.join(p2, how='right')
+ expected = p2.copy()
+ expected['ItemA'] = p1['ItemA']
+ expected['ItemB'] = p1['ItemB']
+ expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
+ tm.assert_panel_equal(result, expected)
+
+ # inner join
+ result = p1.join(p2, how='inner')
+ expected = panel.iloc[:, 5:10, 2:3]
+ tm.assert_panel_equal(result, expected)
+
+ # outer join
+ result = p1.join(p2, how='outer')
+ expected = p1.reindex(major=panel.major_axis,
+ minor=panel.minor_axis)
+ expected = expected.join(p2.reindex(major=panel.major_axis,
+ minor=panel.minor_axis))
+ tm.assert_panel_equal(result, expected)
+
+ def test_panel_join_overlap(self):
+ with catch_warnings(record=True):
+ panel = tm.makePanel()
+ tm.add_nans(panel)
+
+ p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
+ p2 = panel.loc[['ItemB', 'ItemC']]
+
+ # Expected index is
+ #
+ # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
+ joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
+ p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1')
+ p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2')
+ no_overlap = panel.loc[['ItemA']]
+ expected = no_overlap.join(p1_suf.join(p2_suf))
+ tm.assert_panel_equal(joined, expected)
+
+ def test_panel_join_many(self):
+ with catch_warnings(record=True):
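+ # tm.K sets how many items makePanel generates; raise it for this
+ # test, then restore the default of 4 right after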
+ tm.K = 10
+ panel = tm.makePanel()
+ tm.K = 4
+
+ panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]
+
+ joined = panels[0].join(panels[1:])
+ tm.assert_panel_equal(joined, panel)
+
+ panels = [panel.iloc[:2, :-5],
+ panel.iloc[2:6, 2:],
+ panel.iloc[6:, 5:-7]]
+
+ data_dict = {}
+ for p in panels:
+ data_dict.update(p.iteritems())
+
+ joined = panels[0].join(panels[1:], how='inner')
+ expected = pd.Panel.from_dict(data_dict, intersect=True)
+ tm.assert_panel_equal(joined, expected)
+
+ joined = panels[0].join(panels[1:], how='outer')
+ expected = pd.Panel.from_dict(data_dict, intersect=False)
+ tm.assert_panel_equal(joined, expected)
+
+ # edge cases
+ msg = "Suffixes not supported when passing multiple panels"
+ with pytest.raises(ValueError, match=msg):
+ panels[0].join(panels[1:], how='outer', lsuffix='foo',
+ rsuffix='bar')
+ msg = "Right join not supported with multiple panels"
+ with pytest.raises(ValueError, match=msg):
+ panels[0].join(panels[1:], how='right')
+
+ def test_join_multi_to_multi(self, join_type):
+ # GH 20475
+ leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]],
+ names=['abc', 'xy', 'num'])
+ left = DataFrame({'v1': range(12)}, index=leftindex)
+
+ rightindex = MultiIndex.from_product([list('abc'), list('xy')],
+ names=['abc', 'xy'])
+ right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
+ index=rightindex)
+
+ result = left.join(right, on=['abc', 'xy'], how=join_type)
+ expected = (left.reset_index()
+ .merge(right.reset_index(),
+ on=['abc', 'xy'], how=join_type)
+ .set_index(['abc', 'xy', 'num'])
+ )
+ assert_frame_equal(expected, result)
+
+ msg = (r'len\(left_on\) must equal the number of levels in the index'
+ ' of "right"')
+ with pytest.raises(ValueError, match=msg):
+ left.join(right, on='xy', how=join_type)
+
+ with pytest.raises(ValueError, match=msg):
+ right.join(left, on=['abc', 'xy'], how=join_type)
+
+ def test_join_on_tz_aware_datetimeindex(self):
+ # GH 23931
+ df1 = pd.DataFrame(
+ {
+ 'date': pd.date_range(start='2018-01-01', periods=5,
+ tz='America/Chicago'),
+ 'vals': list('abcde')
+ }
+ )
+
+ df2 = pd.DataFrame(
+ {
+ 'date': pd.date_range(start='2018-01-03', periods=5,
+ tz='America/Chicago'),
+ 'vals_2': list('tuvwx')
+ }
+ )
+ result = df1.join(df2.set_index('date'), on='date')
+ expected = df1.copy()
+ expected['vals_2'] = pd.Series([np.nan] * len(expected), dtype=object)
+ assert_frame_equal(result, expected)
+
+
+def _check_join(left, right, result, join_col, how='left',
+ lsuffix='_x', rsuffix='_y'):
+
+ # some smoke tests
+ for c in join_col:
+ assert(result[c].notna().all())
+
+ left_grouped = left.groupby(join_col)
+ right_grouped = right.groupby(join_col)
+
+ for group_key, group in result.groupby(join_col):
+ l_joined = _restrict_to_columns(group, left.columns, lsuffix)
+ r_joined = _restrict_to_columns(group, right.columns, rsuffix)
+
+ try:
+ lgroup = left_grouped.get_group(group_key)
+ except KeyError:
+ if how in ('left', 'inner'):
+ raise AssertionError('key %s should not have been in the join'
+ % str(group_key))
+
+ _assert_all_na(l_joined, left.columns, join_col)
+ else:
+ _assert_same_contents(l_joined, lgroup)
+
+ try:
+ rgroup = right_grouped.get_group(group_key)
+ except KeyError:
+ if how in ('right', 'inner'):
+ raise AssertionError('key %s should not have been in the join'
+ % str(group_key))
+
+ _assert_all_na(r_joined, right.columns, join_col)
+ else:
+ _assert_same_contents(r_joined, rgroup)
+
+
+def _restrict_to_columns(group, columns, suffix):
+ found = [c for c in group.columns
+ if c in columns or c.replace(suffix, '') in columns]
+
+ # filter
+ group = group.loc[:, found]
+
+ # get rid of suffixes, if any
+ group = group.rename(columns=lambda x: x.replace(suffix, ''))
+
+ # put in the right order...
+ group = group.loc[:, columns]
+
+ return group
+
+
+def _assert_same_contents(join_chunk, source):
+ NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
+
+ jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
+ svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
+
+ rows = {tuple(row) for row in jvalues}
+ assert len(rows) == len(source)
+ assert all(tuple(row) in rows for row in svalues)
+
+
+def _assert_all_na(join_chunk, source_columns, join_col):
+ for c in source_columns:
+ if c in join_col:
+ continue
+ assert join_chunk[c].isna().all()
+
+
+def _join_by_hand(a, b, how='left'):
+ join_index = a.index.join(b.index, how=how)
+
+ a_re = a.reindex(join_index)
+ b_re = b.reindex(join_index)
+
+ result_columns = a.columns.append(b.columns)
+
+ for col, s in compat.iteritems(b_re):
+ a_re[col] = s
+ return a_re.reindex(columns=result_columns)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge.py b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge.py
new file mode 100644
index 00000000000..9fe4049dd69
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge.py
@@ -0,0 +1,1609 @@
+# pylint: disable=E1103
+
+from collections import OrderedDict
+from datetime import date, datetime
+import random
+import re
+
+import numpy as np
+from numpy import nan
+import pytest
+
+from pandas.compat import lrange
+
+from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import (
+ Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
+ Int64Index, MultiIndex, RangeIndex, Series, UInt64Index)
+from pandas.api.types import CategoricalDtype as CDT
+from pandas.core.reshape.concat import concat
+from pandas.core.reshape.merge import MergeError, merge
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+N = 50
+NGROUPS = 8
+
+
+def get_test_data(ngroups=NGROUPS, n=N):
+ unique_groups = lrange(ngroups)
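+ # tile the group labels out to length n, topping up with a partial
+ # cycle when ngroups does not divide n evenly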
+ arr = np.asarray(np.tile(unique_groups, n // ngroups))
+
+ if len(arr) < n:
+ arr = np.asarray(list(arr) + unique_groups[:n - len(arr)])
+
+ random.shuffle(arr)
+ return arr
+
+
+def get_series():
+ return [
+ pd.Series([1], dtype='int64'),
+ pd.Series([1], dtype='Int64'),
+ pd.Series([1.23]),
+ pd.Series(['foo']),
+ pd.Series([True]),
+ pd.Series([pd.Timestamp('2018-01-01')]),
+ pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]),
+ ]
+
+
+def get_series_na():
+ return [
+ pd.Series([np.nan], dtype='Int64'),
+ pd.Series([np.nan], dtype='float'),
+ pd.Series([np.nan], dtype='object'),
+ pd.Series([pd.NaT]),
+ ]
+
+
[email protected](params=get_series(), ids=lambda x: x.dtype.name)
+def series_of_dtype(request):
+ """
+ A parametrized fixture returning a variety of Series of different
+ dtypes
+ """
+ return request.param
+
+
[email protected](params=get_series(), ids=lambda x: x.dtype.name)
+def series_of_dtype2(request):
+ """
+ A duplicate of the series_of_dtype fixture, so that it can be used
+ twice by a single function
+ """
+ return request.param
+
+
[email protected](params=get_series_na(), ids=lambda x: x.dtype.name)
+def series_of_dtype_all_na(request):
+ """
+ A parametrized fixture returning a variety of Series with all NA
+ values
+ """
+ return request.param
+
+
+class TestMerge(object):
+
+ def setup_method(self, method):
+ # aggregate multiple columns
+ self.df = DataFrame({'key1': get_test_data(),
+ 'key2': get_test_data(),
+ 'data1': np.random.randn(N),
+ 'data2': np.random.randn(N)})
+
+ # exclude a couple keys for fun
+ self.df = self.df[self.df['key2'] > 1]
+
+ self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
+ 'key2': get_test_data(ngroups=NGROUPS // 2,
+ n=N // 5),
+ 'value': np.random.randn(N // 5)})
+
+ self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
+ 'v1': np.random.randn(7)})
+ self.right = DataFrame({'v2': np.random.randn(4)},
+ index=['d', 'b', 'c', 'a'])
+
+ def test_merge_inner_join_empty(self):
+ # GH 15328
+ df_empty = pd.DataFrame()
+ df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64')
+ result = pd.merge(df_empty, df_a, left_index=True, right_index=True)
+ expected = pd.DataFrame({'a': []}, index=[], dtype='int64')
+ assert_frame_equal(result, expected)
+
+ def test_merge_common(self):
+ joined = merge(self.df, self.df2)
+ exp = merge(self.df, self.df2, on=['key1', 'key2'])
+ tm.assert_frame_equal(joined, exp)
+
+ def test_merge_index_as_on_arg(self):
+ # GH14355
+
+ left = self.df.set_index('key1')
+ right = self.df2.set_index('key1')
+ result = merge(left, right, on='key1')
+ expected = merge(self.df, self.df2, on='key1').set_index('key1')
+ assert_frame_equal(result, expected)
+
+ def test_merge_index_singlekey_right_vs_left(self):
+ left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
+ 'v1': np.random.randn(7)})
+ right = DataFrame({'v2': np.random.randn(4)},
+ index=['d', 'b', 'c', 'a'])
+
+ merged1 = merge(left, right, left_on='key',
+ right_index=True, how='left', sort=False)
+ merged2 = merge(right, left, right_on='key',
+ left_index=True, how='right', sort=False)
+ assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
+
+ merged1 = merge(left, right, left_on='key',
+ right_index=True, how='left', sort=True)
+ merged2 = merge(right, left, right_on='key',
+ left_index=True, how='right', sort=True)
+ assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
+
+ def test_merge_index_singlekey_inner(self):
+ left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
+ 'v1': np.random.randn(7)})
+ right = DataFrame({'v2': np.random.randn(4)},
+ index=['d', 'b', 'c', 'a'])
+
+ # inner join
+ result = merge(left, right, left_on='key', right_index=True,
+ how='inner')
+ expected = left.join(right, on='key').loc[result.index]
+ assert_frame_equal(result, expected)
+
+ result = merge(right, left, right_on='key', left_index=True,
+ how='inner')
+ expected = left.join(right, on='key').loc[result.index]
+ assert_frame_equal(result, expected.loc[:, result.columns])
+
+ def test_merge_misspecified(self):
+ msg = "Must pass right_on or right_index=True"
+ with pytest.raises(pd.errors.MergeError, match=msg):
+ merge(self.left, self.right, left_index=True)
+ msg = "Must pass left_on or left_index=True"
+ with pytest.raises(pd.errors.MergeError, match=msg):
+ merge(self.left, self.right, right_index=True)
+
+ msg = ('Can only pass argument "on" OR "left_on" and "right_on", not'
+ ' a combination of both')
+ with pytest.raises(pd.errors.MergeError, match=msg):
+ merge(self.left, self.left, left_on='key', on='key')
+
+ msg = r"len\(right_on\) must equal len\(left_on\)"
+ with pytest.raises(ValueError, match=msg):
+ merge(self.df, self.df2, left_on=['key1'],
+ right_on=['key1', 'key2'])
+
+ def test_index_and_on_parameters_confusion(self):
+ msg = ("right_index parameter must be of type bool, not"
+ r" <(class|type) 'list'>")
+ with pytest.raises(ValueError, match=msg):
+ merge(self.df, self.df2, how='left',
+ left_index=False, right_index=['key1', 'key2'])
+ msg = ("left_index parameter must be of type bool, not "
+ r"<(class|type) 'list'>")
+ with pytest.raises(ValueError, match=msg):
+ merge(self.df, self.df2, how='left',
+ left_index=['key1', 'key2'], right_index=False)
+ with pytest.raises(ValueError, match=msg):
+ merge(self.df, self.df2, how='left',
+ left_index=['key1', 'key2'], right_index=['key1', 'key2'])
+
+ def test_merge_overlap(self):
+ merged = merge(self.left, self.left, on='key')
+ exp_len = (self.left['key'].value_counts() ** 2).sum()
+ assert len(merged) == exp_len
+ assert 'v1_x' in merged
+ assert 'v1_y' in merged
+
+ def test_merge_different_column_key_names(self):
+ left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
+ 'value': [1, 2, 3, 4]})
+ right = DataFrame({'rkey': ['foo', 'bar', 'qux', 'foo'],
+ 'value': [5, 6, 7, 8]})
+
+ merged = left.merge(right, left_on='lkey', right_on='rkey',
+ how='outer', sort=True)
+
+ exp = pd.Series(['bar', 'baz', 'foo', 'foo', 'foo', 'foo', np.nan],
+ name='lkey')
+ tm.assert_series_equal(merged['lkey'], exp)
+
+ exp = pd.Series(['bar', np.nan, 'foo', 'foo', 'foo', 'foo', 'qux'],
+ name='rkey')
+ tm.assert_series_equal(merged['rkey'], exp)
+
+ exp = pd.Series([2, 3, 1, 1, 4, 4, np.nan], name='value_x')
+ tm.assert_series_equal(merged['value_x'], exp)
+
+ exp = pd.Series([6, np.nan, 5, 8, 5, 8, 7], name='value_y')
+ tm.assert_series_equal(merged['value_y'], exp)
+
+ def test_merge_copy(self):
+ left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
+ right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))
+
+ merged = merge(left, right, left_index=True,
+ right_index=True, copy=True)
+
+ merged['a'] = 6
+ assert (left['a'] == 0).all()
+
+ merged['d'] = 'peekaboo'
+ assert (right['d'] == 'bar').all()
+
+ def test_merge_nocopy(self):
+ left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
+ right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))
+
+ merged = merge(left, right, left_index=True,
+ right_index=True, copy=False)
+
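+ # with copy=False the merged frame shares its blocks with the inputs,
+ # so assigning through `merged` mutates `left` and `right` in place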
+ merged['a'] = 6
+ assert (left['a'] == 6).all()
+
+ merged['d'] = 'peekaboo'
+ assert (right['d'] == 'peekaboo').all()
+
+ def test_intelligently_handle_join_key(self):
+ # GH #733, be smarter about not returning an unconsolidated DataFrame
+
+ left = DataFrame({'key': [1, 1, 2, 2, 3],
+ 'value': lrange(5)}, columns=['value', 'key'])
+ right = DataFrame({'key': [1, 1, 2, 3, 4, 5],
+ 'rvalue': lrange(6)})
+
+ joined = merge(left, right, on='key', how='outer')
+ expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5],
+ 'value': np.array([0, 0, 1, 1, 2, 3, 4,
+ np.nan, np.nan]),
+ 'rvalue': [0, 1, 0, 1, 2, 2, 3, 4, 5]},
+ columns=['value', 'key', 'rvalue'])
+ assert_frame_equal(joined, expected)
+
+ def test_merge_join_key_dtype_cast(self):
+ # #8596
+
+ df1 = DataFrame({'key': [1], 'v1': [10]})
+ df2 = DataFrame({'key': [2], 'v1': [20]})
+ df = merge(df1, df2, how='outer')
+ assert df['key'].dtype == 'int64'
+
+ df1 = DataFrame({'key': [True], 'v1': [1]})
+ df2 = DataFrame({'key': [False], 'v1': [0]})
+ df = merge(df1, df2, how='outer')
+
+ # GH13169
+ # this really should be bool
+ assert df['key'].dtype == 'object'
+
+ df1 = DataFrame({'val': [1]})
+ df2 = DataFrame({'val': [2]})
+ lkey = np.array([1])
+ rkey = np.array([2])
+ df = merge(df1, df2, left_on=lkey, right_on=rkey, how='outer')
+ assert df['key_0'].dtype == 'int64'
+
+ def test_handle_join_key_pass_array(self):
+ left = DataFrame({'key': [1, 1, 2, 2, 3],
+ 'value': lrange(5)}, columns=['value', 'key'])
+ right = DataFrame({'rvalue': lrange(6)})
+ key = np.array([1, 1, 2, 3, 4, 5])
+
+ merged = merge(left, right, left_on='key', right_on=key, how='outer')
+ merged2 = merge(right, left, left_on=key, right_on='key', how='outer')
+
+ assert_series_equal(merged['key'], merged2['key'])
+ assert merged['key'].notna().all()
+ assert merged2['key'].notna().all()
+
+ left = DataFrame({'value': lrange(5)}, columns=['value'])
+ right = DataFrame({'rvalue': lrange(6)})
+ lkey = np.array([1, 1, 2, 2, 3])
+ rkey = np.array([1, 1, 2, 3, 4, 5])
+
+ merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer')
+ tm.assert_series_equal(merged['key_0'], Series([1, 1, 1, 1, 2,
+ 2, 3, 4, 5],
+ name='key_0'))
+
+ left = DataFrame({'value': lrange(3)})
+ right = DataFrame({'rvalue': lrange(6)})
+
+ key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64)
+ merged = merge(left, right, left_index=True, right_on=key, how='outer')
+ tm.assert_series_equal(merged['key_0'], Series(key, name='key_0'))
+
+ def test_no_overlap_more_informative_error(self):
+ dt = datetime.now()
+ df1 = DataFrame({'x': ['a']}, index=[dt])
+
+ df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
+
+ msg = ('No common columns to perform merge on. '
+ 'Merge options: left_on={lon}, right_on={ron}, '
+ 'left_index={lidx}, right_index={ridx}'
+ .format(lon=None, ron=None, lidx=False, ridx=False))
+
+ with pytest.raises(MergeError, match=msg):
+ merge(df1, df2)
+
+ def test_merge_non_unique_indexes(self):
+
+ dt = datetime(2012, 5, 1)
+ dt2 = datetime(2012, 5, 2)
+ dt3 = datetime(2012, 5, 3)
+ dt4 = datetime(2012, 5, 4)
+
+ df1 = DataFrame({'x': ['a']}, index=[dt])
+ df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
+ _check_merge(df1, df2)
+
+ # Not monotonic
+ df1 = DataFrame({'x': ['a', 'b', 'q']}, index=[dt2, dt, dt4])
+ df2 = DataFrame({'y': ['c', 'd', 'e', 'f', 'g', 'h']},
+ index=[dt3, dt3, dt2, dt2, dt, dt])
+ _check_merge(df1, df2)
+
+ df1 = DataFrame({'x': ['a', 'b']}, index=[dt, dt])
+ df2 = DataFrame({'y': ['c', 'd']}, index=[dt, dt])
+ _check_merge(df1, df2)
+
+ def test_merge_non_unique_index_many_to_many(self):
+ dt = datetime(2012, 5, 1)
+ dt2 = datetime(2012, 5, 2)
+ dt3 = datetime(2012, 5, 3)
+ df1 = DataFrame({'x': ['a', 'b', 'c', 'd']},
+ index=[dt2, dt2, dt, dt])
+ df2 = DataFrame({'y': ['e', 'f', 'g', ' h', 'i']},
+ index=[dt2, dt2, dt3, dt, dt])
+ _check_merge(df1, df2)
+
+ def test_left_merge_empty_dataframe(self):
+ left = DataFrame({'key': [1], 'value': [2]})
+ right = DataFrame({'key': []})
+
+ result = merge(left, right, on='key', how='left')
+ assert_frame_equal(result, left)
+
+ result = merge(right, left, on='key', how='right')
+ assert_frame_equal(result, left)
+
+ @pytest.mark.parametrize('kwarg',
+ [dict(left_index=True, right_index=True),
+ dict(left_index=True, right_on='x'),
+ dict(left_on='a', right_index=True),
+ dict(left_on='a', right_on='x')])
+ def test_merge_left_empty_right_empty(self, join_type, kwarg):
+ # GH 10824
+ left = pd.DataFrame([], columns=['a', 'b', 'c'])
+ right = pd.DataFrame([], columns=['x', 'y', 'z'])
+
+ exp_in = pd.DataFrame([], columns=['a', 'b', 'c', 'x', 'y', 'z'],
+ index=pd.Index([], dtype=object),
+ dtype=object)
+
+ result = pd.merge(left, right, how=join_type, **kwarg)
+ tm.assert_frame_equal(result, exp_in)
+
+ def test_merge_left_empty_right_notempty(self):
+ # GH 10824
+ left = pd.DataFrame([], columns=['a', 'b', 'c'])
+ right = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ columns=['x', 'y', 'z'])
+
+ exp_out = pd.DataFrame({'a': np.array([np.nan] * 3, dtype=object),
+ 'b': np.array([np.nan] * 3, dtype=object),
+ 'c': np.array([np.nan] * 3, dtype=object),
+ 'x': [1, 4, 7],
+ 'y': [2, 5, 8],
+ 'z': [3, 6, 9]},
+ columns=['a', 'b', 'c', 'x', 'y', 'z'])
+ exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
+ # result will have object dtype
+ exp_in.index = exp_in.index.astype(object)
+
+ def check1(exp, kwarg):
+ result = pd.merge(left, right, how='inner', **kwarg)
+ tm.assert_frame_equal(result, exp)
+ result = pd.merge(left, right, how='left', **kwarg)
+ tm.assert_frame_equal(result, exp)
+
+ def check2(exp, kwarg):
+ result = pd.merge(left, right, how='right', **kwarg)
+ tm.assert_frame_equal(result, exp)
+ result = pd.merge(left, right, how='outer', **kwarg)
+ tm.assert_frame_equal(result, exp)
+
+ for kwarg in [dict(left_index=True, right_index=True),
+ dict(left_index=True, right_on='x')]:
+ check1(exp_in, kwarg)
+ check2(exp_out, kwarg)
+
+ kwarg = dict(left_on='a', right_index=True)
+ check1(exp_in, kwarg)
+ exp_out['a'] = [0, 1, 2]
+ check2(exp_out, kwarg)
+
+ kwarg = dict(left_on='a', right_on='x')
+ check1(exp_in, kwarg)
+ exp_out['a'] = np.array([np.nan] * 3, dtype=object)
+ check2(exp_out, kwarg)
+
+ def test_merge_left_notempty_right_empty(self):
+ # GH 10824
+ left = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ columns=['a', 'b', 'c'])
+ right = pd.DataFrame([], columns=['x', 'y', 'z'])
+
+ exp_out = pd.DataFrame({'a': [1, 4, 7],
+ 'b': [2, 5, 8],
+ 'c': [3, 6, 9],
+ 'x': np.array([np.nan] * 3, dtype=object),
+ 'y': np.array([np.nan] * 3, dtype=object),
+ 'z': np.array([np.nan] * 3, dtype=object)},
+ columns=['a', 'b', 'c', 'x', 'y', 'z'])
+ exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
+ # result will have object dtype
+ exp_in.index = exp_in.index.astype(object)
+
+ def check1(exp, kwarg):
+ result = pd.merge(left, right, how='inner', **kwarg)
+ tm.assert_frame_equal(result, exp)
+ result = pd.merge(left, right, how='right', **kwarg)
+ tm.assert_frame_equal(result, exp)
+
+ def check2(exp, kwarg):
+ result = pd.merge(left, right, how='left', **kwarg)
+ tm.assert_frame_equal(result, exp)
+ result = pd.merge(left, right, how='outer', **kwarg)
+ tm.assert_frame_equal(result, exp)
+
+ for kwarg in [dict(left_index=True, right_index=True),
+ dict(left_index=True, right_on='x'),
+ dict(left_on='a', right_index=True),
+ dict(left_on='a', right_on='x')]:
+ check1(exp_in, kwarg)
+ check2(exp_out, kwarg)
+
+ def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2):
+ # GH 25183
+ df = pd.DataFrame({'key': series_of_dtype, 'value': series_of_dtype2},
+ columns=['key', 'value'])
+ df_empty = df[:0]
+ expected = pd.DataFrame({
+ 'value_x': pd.Series(dtype=df.dtypes['value']),
+ 'key': pd.Series(dtype=df.dtypes['key']),
+ 'value_y': pd.Series(dtype=df.dtypes['value']),
+ }, columns=['value_x', 'key', 'value_y'])
+ actual = df_empty.merge(df, on='key')
+ assert_frame_equal(actual, expected)
+
+ def test_merge_all_na_column(self, series_of_dtype,
+ series_of_dtype_all_na):
+ # GH 25183
+ df_left = pd.DataFrame(
+ {'key': series_of_dtype, 'value': series_of_dtype_all_na},
+ columns=['key', 'value'])
+ df_right = pd.DataFrame(
+ {'key': series_of_dtype, 'value': series_of_dtype_all_na},
+ columns=['key', 'value'])
+ expected = pd.DataFrame({
+ 'key': series_of_dtype,
+ 'value_x': series_of_dtype_all_na,
+ 'value_y': series_of_dtype_all_na,
+ }, columns=['key', 'value_x', 'value_y'])
+ actual = df_left.merge(df_right, on='key')
+ assert_frame_equal(actual, expected)
+
+ def test_merge_nosort(self):
+ # GH #2098; `datetime` is already imported at module level
+
+ d = {"var1": np.random.randint(0, 10, size=10),
+ "var2": np.random.randint(0, 10, size=10),
+ "var3": [datetime(2012, 1, 12),
+ datetime(2011, 2, 4),
+ datetime(2010, 2, 3),
+ datetime(2012, 1, 12),
+ datetime(2011, 2, 4),
+ datetime(2012, 4, 3),
+ datetime(2012, 3, 4),
+ datetime(2008, 5, 1),
+ datetime(2010, 2, 3),
+ datetime(2012, 2, 3)]}
+ df = DataFrame.from_dict(d)
+ var3 = df.var3.unique()
+ var3.sort()
+ new = DataFrame.from_dict({"var3": var3,
+ "var8": np.random.random(7)})
+
+ result = df.merge(new, on="var3", sort=False)
+ exp = merge(df, new, on='var3', sort=False)
+ assert_frame_equal(result, exp)
+
+ assert (df.var3.unique() == result.var3.unique()).all()
+
+ def test_merge_nan_right(self):
+ df1 = DataFrame({"i1": [0, 1], "i2": [0, 1]})
+ df2 = DataFrame({"i1": [0], "i3": [0]})
+ result = df1.join(df2, on="i1", rsuffix="_")
+ expected = (DataFrame({'i1': {0: 0.0, 1: 1}, 'i2': {0: 0, 1: 1},
+ 'i1_': {0: 0, 1: np.nan},
+ 'i3': {0: 0.0, 1: np.nan},
+ None: {0: 0, 1: 0}})
+ .set_index(None)
+ .reset_index()[['i1', 'i2', 'i1_', 'i3']])
+ assert_frame_equal(result, expected, check_dtype=False)
+
+ df1 = DataFrame({"i1": [0, 1], "i2": [0.5, 1.5]})
+ df2 = DataFrame({"i1": [0], "i3": [0.7]})
+ result = df1.join(df2, rsuffix="_", on='i1')
+ expected = (DataFrame({'i1': {0: 0, 1: 1}, 'i1_': {0: 0.0, 1: nan},
+ 'i2': {0: 0.5, 1: 1.5},
+ 'i3': {0: 0.69999999999999996,
+ 1: nan}})
+ [['i1', 'i2', 'i1_', 'i3']])
+ assert_frame_equal(result, expected)
+
+ def test_merge_type(self):
+ class NotADataFrame(DataFrame):
+
+ @property
+ def _constructor(self):
+ return NotADataFrame
+
+ nad = NotADataFrame(self.df)
+ result = nad.merge(self.df2, on='key1')
+
+ assert isinstance(result, NotADataFrame)
+
+ def test_join_append_timedeltas(self):
+
+ import datetime as dt
+ from pandas import NaT
+
+ # timedelta64 issues with join/merge
+ # GH 5695
+
+ d = {'d': dt.datetime(2013, 11, 5, 5, 56), 't': dt.timedelta(0, 22500)}
+ df = DataFrame(columns=list('dt'))
+ df = df.append(d, ignore_index=True)
+ result = df.append(d, ignore_index=True)
+ expected = DataFrame({'d': [dt.datetime(2013, 11, 5, 5, 56),
+ dt.datetime(2013, 11, 5, 5, 56)],
+ 't': [dt.timedelta(0, 22500),
+ dt.timedelta(0, 22500)]})
+ assert_frame_equal(result, expected)
+
+ td = np.timedelta64(300000000)
+ lhs = DataFrame(Series([td, td], index=["A", "B"]))
+ rhs = DataFrame(Series([td], index=["A"]))
+
+ result = lhs.join(rhs, rsuffix='r', how="left")
+ expected = DataFrame({'0': Series([td, td], index=list('AB')),
+ '0r': Series([td, NaT], index=list('AB'))})
+ assert_frame_equal(result, expected)
+
+ def test_other_datetime_unit(self):
+ # GH 13389
+ df1 = pd.DataFrame({'entity_id': [101, 102]})
+ s = pd.Series([None, None], index=[101, 102], name='days')
+
+ for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]',
+ 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]',
+ 'datetime64[ns]']:
+
+ df2 = s.astype(dtype).to_frame('days')
+ # coerces to datetime64[ns], thus should not be affected
+ assert df2['days'].dtype == 'datetime64[ns]'
+
+ result = df1.merge(df2, left_on='entity_id', right_index=True)
+
+ exp = pd.DataFrame({'entity_id': [101, 102],
+ 'days': np.array(['nat', 'nat'],
+ dtype='datetime64[ns]')},
+ columns=['entity_id', 'days'])
+ tm.assert_frame_equal(result, exp)
+
+ @pytest.mark.parametrize("unit", ['D', 'h', 'm', 's', 'ms', 'us', 'ns'])
+ def test_other_timedelta_unit(self, unit):
+ # GH 13389
+ df1 = pd.DataFrame({'entity_id': [101, 102]})
+ s = pd.Series([None, None], index=[101, 102], name='days')
+
+ dtype = "m8[{}]".format(unit)
+ df2 = s.astype(dtype).to_frame('days')
+ assert df2['days'].dtype == 'm8[ns]'
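+ # like datetime64 above, timedelta64 data is coerced to nanosecond
+ # resolution when stored in a DataFrame in this pandas version, so the
+ # parametrized unit survives only in the raw expected array below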
+
+ result = df1.merge(df2, left_on='entity_id', right_index=True)
+
+ exp = pd.DataFrame({'entity_id': [101, 102],
+ 'days': np.array(['nat', 'nat'],
+ dtype=dtype)},
+ columns=['entity_id', 'days'])
+ tm.assert_frame_equal(result, exp)
+
+ def test_overlapping_columns_error_message(self):
+ df = DataFrame({'key': [1, 2, 3],
+ 'v1': [4, 5, 6],
+ 'v2': [7, 8, 9]})
+ df2 = DataFrame({'key': [1, 2, 3],
+ 'v1': [4, 5, 6],
+ 'v2': [7, 8, 9]})
+
+ df.columns = ['key', 'foo', 'foo']
+ df2.columns = ['key', 'bar', 'bar']
+ expected = DataFrame({'key': [1, 2, 3],
+ 'v1': [4, 5, 6],
+ 'v2': [7, 8, 9],
+ 'v3': [4, 5, 6],
+ 'v4': [7, 8, 9]})
+ expected.columns = ['key', 'foo', 'foo', 'bar', 'bar']
+ assert_frame_equal(merge(df, df2), expected)
+
+ # #2649, #10639
+ df2.columns = ['key1', 'foo', 'foo']
+ msg = (r"Data columns not unique: Index\(\[u?'foo', u?'foo'\],"
+ r" dtype='object'\)")
+ with pytest.raises(MergeError, match=msg):
+ merge(df, df2)
+
+ def test_merge_on_datetime64tz(self):
+
+ # GH11405
+ left = pd.DataFrame({'key': pd.date_range('20151010', periods=2,
+ tz='US/Eastern'),
+ 'value': [1, 2]})
+ right = pd.DataFrame({'key': pd.date_range('20151011', periods=3,
+ tz='US/Eastern'),
+ 'value': [1, 2, 3]})
+
+ expected = DataFrame({'key': pd.date_range('20151010', periods=4,
+ tz='US/Eastern'),
+ 'value_x': [1, 2, np.nan, np.nan],
+ 'value_y': [np.nan, 1, 2, 3]})
+ result = pd.merge(left, right, on='key', how='outer')
+ assert_frame_equal(result, expected)
+
+ left = pd.DataFrame({'key': [1, 2],
+ 'value': pd.date_range('20151010', periods=2,
+ tz='US/Eastern')})
+ right = pd.DataFrame({'key': [2, 3],
+ 'value': pd.date_range('20151011', periods=2,
+ tz='US/Eastern')})
+ expected = DataFrame({
+ 'key': [1, 2, 3],
+ 'value_x': list(pd.date_range('20151010', periods=2,
+ tz='US/Eastern')) + [pd.NaT],
+ 'value_y': [pd.NaT] + list(pd.date_range('20151011', periods=2,
+ tz='US/Eastern'))})
+ result = pd.merge(left, right, on='key', how='outer')
+ assert_frame_equal(result, expected)
+ assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]'
+ assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]'
+
+ def test_merge_on_datetime64tz_empty(self):
+ # https://github.com/pandas-dev/pandas/issues/25014
+ dtz = pd.DatetimeTZDtype(tz='UTC')
+ right = pd.DataFrame({'date': [pd.Timestamp('2018', tz=dtz.tz)],
+ 'value': [4.0],
+ 'date2': [pd.Timestamp('2019', tz=dtz.tz)]},
+ columns=['date', 'value', 'date2'])
+ left = right[:0]
+ result = left.merge(right, on='date')
+ expected = pd.DataFrame({
+ 'value_x': pd.Series(dtype=float),
+ 'date2_x': pd.Series(dtype=dtz),
+ 'date': pd.Series(dtype=dtz),
+ 'value_y': pd.Series(dtype=float),
+ 'date2_y': pd.Series(dtype=dtz),
+ }, columns=['value_x', 'date2_x', 'date', 'value_y', 'date2_y'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_merge_datetime64tz_with_dst_transition(self):
+ # GH 18885
+ df1 = pd.DataFrame(pd.date_range(
+ '2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'),
+ columns=['date'])
+ df1['value'] = 1
+ df2 = pd.DataFrame({
+ 'date': pd.to_datetime([
+ '2017-10-29 03:00:00', '2017-10-29 04:00:00',
+ '2017-10-29 05:00:00'
+ ]),
+ 'value': 2
+ })
+ df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert(
+ 'Europe/Madrid')
+ result = pd.merge(df1, df2, how='outer', on='date')
+ expected = pd.DataFrame({
+ 'date': pd.date_range(
+ '2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'),
+ 'value_x': [1] * 4 + [np.nan] * 3,
+ 'value_y': [np.nan] * 4 + [2] * 3
+ })
+ assert_frame_equal(result, expected)
+
+ def test_merge_non_unique_period_index(self):
+ # GH #16871
+ index = pd.period_range('2016-01-01', periods=16, freq='M')
+ df = DataFrame([i for i in range(len(index))],
+ index=index, columns=['pnum'])
+ df2 = concat([df, df])
+ result = df.merge(df2, left_index=True, right_index=True, how='inner')
+ expected = DataFrame(
+ np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
+ columns=['pnum_x', 'pnum_y'], index=df2.sort_index().index)
+ tm.assert_frame_equal(result, expected)
+
+ def test_merge_on_periods(self):
+ left = pd.DataFrame({'key': pd.period_range('20151010', periods=2,
+ freq='D'),
+ 'value': [1, 2]})
+ right = pd.DataFrame({'key': pd.period_range('20151011', periods=3,
+ freq='D'),
+ 'value': [1, 2, 3]})
+
+ expected = DataFrame({'key': pd.period_range('20151010', periods=4,
+ freq='D'),
+ 'value_x': [1, 2, np.nan, np.nan],
+ 'value_y': [np.nan, 1, 2, 3]})
+ result = pd.merge(left, right, on='key', how='outer')
+ assert_frame_equal(result, expected)
+
+ left = pd.DataFrame({'key': [1, 2],
+ 'value': pd.period_range('20151010', periods=2,
+ freq='D')})
+ right = pd.DataFrame({'key': [2, 3],
+ 'value': pd.period_range('20151011', periods=2,
+ freq='D')})
+
+ exp_x = pd.period_range('20151010', periods=2, freq='D')
+ exp_y = pd.period_range('20151011', periods=2, freq='D')
+ expected = DataFrame({'key': [1, 2, 3],
+ 'value_x': list(exp_x) + [pd.NaT],
+ 'value_y': [pd.NaT] + list(exp_y)})
+ result = pd.merge(left, right, on='key', how='outer')
+ assert_frame_equal(result, expected)
+ assert result['value_x'].dtype == 'Period[D]'
+ assert result['value_y'].dtype == 'Period[D]'
+
+ def test_indicator(self):
+ # PR #10054. xref #7412 and closes #8790.
+ df1 = DataFrame({'col1': [0, 1], 'col_conflict': [1, 2],
+ 'col_left': ['a', 'b']})
+ df1_copy = df1.copy()
+
+ df2 = DataFrame({'col1': [1, 2, 3, 4, 5],
+ 'col_conflict': [1, 2, 3, 4, 5],
+ 'col_right': [2, 2, 2, 2, 2]})
+ df2_copy = df2.copy()
+
+ df_result = DataFrame({
+ 'col1': [0, 1, 2, 3, 4, 5],
+ 'col_conflict_x': [1, 2, np.nan, np.nan, np.nan, np.nan],
+ 'col_left': ['a', 'b', np.nan, np.nan, np.nan, np.nan],
+ 'col_conflict_y': [np.nan, 1, 2, 3, 4, 5],
+ 'col_right': [np.nan, 2, 2, 2, 2, 2]})
+ df_result['_merge'] = Categorical(
+ ['left_only', 'both', 'right_only',
+ 'right_only', 'right_only', 'right_only'],
+ categories=['left_only', 'right_only', 'both'])
+
+ df_result = df_result[['col1', 'col_conflict_x', 'col_left',
+ 'col_conflict_y', 'col_right', '_merge']]
+
+ test = merge(df1, df2, on='col1', how='outer', indicator=True)
+ assert_frame_equal(test, df_result)
+ test = df1.merge(df2, on='col1', how='outer', indicator=True)
+ assert_frame_equal(test, df_result)
+
+ # No side effects
+ assert_frame_equal(df1, df1_copy)
+ assert_frame_equal(df2, df2_copy)
+
+ # Check with custom name
+ df_result_custom_name = df_result
+ df_result_custom_name = df_result_custom_name.rename(
+ columns={'_merge': 'custom_name'})
+
+ test_custom_name = merge(
+ df1, df2, on='col1', how='outer', indicator='custom_name')
+ assert_frame_equal(test_custom_name, df_result_custom_name)
+ test_custom_name = df1.merge(
+ df2, on='col1', how='outer', indicator='custom_name')
+ assert_frame_equal(test_custom_name, df_result_custom_name)
+
+ # Check only accepts strings and booleans
+ msg = "indicator option can only accept boolean or string arguments"
+ with pytest.raises(ValueError, match=msg):
+ merge(df1, df2, on='col1', how='outer', indicator=5)
+ with pytest.raises(ValueError, match=msg):
+ df1.merge(df2, on='col1', how='outer', indicator=5)
+
+ # Check result integrity
+
+ test2 = merge(df1, df2, on='col1', how='left', indicator=True)
+ assert (test2._merge != 'right_only').all()
+ test2 = df1.merge(df2, on='col1', how='left', indicator=True)
+ assert (test2._merge != 'right_only').all()
+
+ test3 = merge(df1, df2, on='col1', how='right', indicator=True)
+ assert (test3._merge != 'left_only').all()
+ test3 = df1.merge(df2, on='col1', how='right', indicator=True)
+ assert (test3._merge != 'left_only').all()
+
+ test4 = merge(df1, df2, on='col1', how='inner', indicator=True)
+ assert (test4._merge == 'both').all()
+ test4 = df1.merge(df2, on='col1', how='inner', indicator=True)
+ assert (test4._merge == 'both').all()
+
+ # Check if working name in df
+ for i in ['_right_indicator', '_left_indicator', '_merge']:
+ df_badcolumn = DataFrame({'col1': [1, 2], i: [2, 2]})
+
+ msg = ("Cannot use `indicator=True` option when data contains a"
+ " column named {}|"
+ "Cannot use name of an existing column for indicator"
+ " column").format(i)
+ with pytest.raises(ValueError, match=msg):
+ merge(df1, df_badcolumn, on='col1',
+ how='outer', indicator=True)
+ with pytest.raises(ValueError, match=msg):
+ df1.merge(df_badcolumn, on='col1', how='outer', indicator=True)
+
+ # Check for name conflict with custom name
+ df_badcolumn = DataFrame(
+ {'col1': [1, 2], 'custom_column_name': [2, 2]})
+
+ msg = "Cannot use name of an existing column for indicator column"
+ with pytest.raises(ValueError, match=msg):
+ merge(df1, df_badcolumn, on='col1', how='outer',
+ indicator='custom_column_name')
+ with pytest.raises(ValueError, match=msg):
+ df1.merge(df_badcolumn, on='col1', how='outer',
+ indicator='custom_column_name')
+
+ # Merge on multiple columns
+ df3 = DataFrame({'col1': [0, 1], 'col2': ['a', 'b']})
+
+ df4 = DataFrame({'col1': [1, 1, 3], 'col2': ['b', 'x', 'y']})
+
+ hand_coded_result = DataFrame({'col1': [0, 1, 1, 3],
+ 'col2': ['a', 'b', 'x', 'y']})
+ hand_coded_result['_merge'] = Categorical(
+ ['left_only', 'both', 'right_only', 'right_only'],
+ categories=['left_only', 'right_only', 'both'])
+
+ test5 = merge(df3, df4, on=['col1', 'col2'],
+ how='outer', indicator=True)
+ assert_frame_equal(test5, hand_coded_result)
+ test5 = df3.merge(df4, on=['col1', 'col2'],
+ how='outer', indicator=True)
+ assert_frame_equal(test5, hand_coded_result)
+
+ def test_validation(self):
+ left = DataFrame({'a': ['a', 'b', 'c', 'd'],
+ 'b': ['cat', 'dog', 'weasel', 'horse']},
+ index=range(4))
+
+ right = DataFrame({'a': ['a', 'b', 'c', 'd', 'e'],
+ 'c': ['meow', 'bark', 'um... weasel noise?',
+ 'nay', 'chirp']},
+ index=range(5))
+
+ # Make sure no side effects.
+ left_copy = left.copy()
+ right_copy = right.copy()
+
+ result = merge(left, right, left_index=True, right_index=True,
+ validate='1:1')
+ assert_frame_equal(left, left_copy)
+ assert_frame_equal(right, right_copy)
+
+ # make sure merge still correct
+ expected = DataFrame({'a_x': ['a', 'b', 'c', 'd'],
+ 'b': ['cat', 'dog', 'weasel', 'horse'],
+ 'a_y': ['a', 'b', 'c', 'd'],
+ 'c': ['meow', 'bark', 'um... weasel noise?',
+ 'nay']},
+ index=range(4),
+ columns=['a_x', 'b', 'a_y', 'c'])
+
+ result = merge(left, right, left_index=True, right_index=True,
+ validate='one_to_one')
+ assert_frame_equal(result, expected)
+
+ expected_2 = DataFrame({'a': ['a', 'b', 'c', 'd'],
+ 'b': ['cat', 'dog', 'weasel', 'horse'],
+ 'c': ['meow', 'bark', 'um... weasel noise?',
+ 'nay']},
+ index=range(4))
+
+ result = merge(left, right, on='a', validate='1:1')
+ assert_frame_equal(left, left_copy)
+ assert_frame_equal(right, right_copy)
+ assert_frame_equal(result, expected_2)
+
+ result = merge(left, right, on='a', validate='one_to_one')
+ assert_frame_equal(result, expected_2)
+
+ # One index, one column
+ expected_3 = DataFrame({'b': ['cat', 'dog', 'weasel', 'horse'],
+ 'a': ['a', 'b', 'c', 'd'],
+ 'c': ['meow', 'bark', 'um... weasel noise?',
+ 'nay']},
+ columns=['b', 'a', 'c'],
+ index=range(4))
+
+ left_index_reset = left.set_index('a')
+ result = merge(left_index_reset, right, left_index=True,
+ right_on='a', validate='one_to_one')
+ assert_frame_equal(result, expected_3)
+
+ # Dups on right
+ right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']},
+ index=[4]))
+ merge(left, right_w_dups, left_index=True, right_index=True,
+ validate='one_to_many')
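+ # 'one_to_many' only requires the left keys to be unique, so the
+ # duplicated right key is accepted here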
+
+ msg = ("Merge keys are not unique in right dataset; not a one-to-one"
+ " merge")
+ with pytest.raises(MergeError, match=msg):
+ merge(left, right_w_dups, left_index=True, right_index=True,
+ validate='one_to_one')
+
+ with pytest.raises(MergeError, match=msg):
+ merge(left, right_w_dups, on='a', validate='one_to_one')
+
+ # Dups on left
+ left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']},
+ index=[3]), sort=True)
+ merge(left_w_dups, right, left_index=True, right_index=True,
+ validate='many_to_one')
+
+ msg = ("Merge keys are not unique in left dataset; not a one-to-one"
+ " merge")
+ with pytest.raises(MergeError, match=msg):
+ merge(left_w_dups, right, left_index=True, right_index=True,
+ validate='one_to_one')
+
+ with pytest.raises(MergeError, match=msg):
+ merge(left_w_dups, right, on='a', validate='one_to_one')
+
+ # Dups on both
+ merge(left_w_dups, right_w_dups, on='a', validate='many_to_many')
+
+ msg = ("Merge keys are not unique in right dataset; not a many-to-one"
+ " merge")
+ with pytest.raises(MergeError, match=msg):
+ merge(left_w_dups, right_w_dups, left_index=True,
+ right_index=True, validate='many_to_one')
+
+ msg = ("Merge keys are not unique in left dataset; not a one-to-many"
+ " merge")
+ with pytest.raises(MergeError, match=msg):
+ merge(left_w_dups, right_w_dups, on='a',
+ validate='one_to_many')
+
+ # Check invalid arguments
+ msg = "Not a valid argument for validate"
+ with pytest.raises(ValueError, match=msg):
+ merge(left, right, on='a', validate='jibberish')
+
+ # Two column merge, dups in both, but jointly no dups.
+ left = DataFrame({'a': ['a', 'a', 'b', 'b'],
+ 'b': [0, 1, 0, 1],
+ 'c': ['cat', 'dog', 'weasel', 'horse']},
+ index=range(4))
+
+ right = DataFrame({'a': ['a', 'a', 'b'],
+ 'b': [0, 1, 0],
+ 'd': ['meow', 'bark', 'um... weasel noise?']},
+ index=range(3))
+
+ expected_multi = DataFrame({'a': ['a', 'a', 'b'],
+ 'b': [0, 1, 0],
+ 'c': ['cat', 'dog', 'weasel'],
+ 'd': ['meow', 'bark',
+ 'um... weasel noise?']},
+ index=range(3))
+
+ msg = ("Merge keys are not unique in either left or right dataset;"
+ " not a one-to-one merge")
+ with pytest.raises(MergeError, match=msg):
+ merge(left, right, on='a', validate='1:1')
+
+ result = merge(left, right, on=['a', 'b'], validate='1:1')
+ assert_frame_equal(result, expected_multi)
+
+ def test_merge_two_empty_df_no_division_error(self):
+ # GH17776, PR #17846
+ a = pd.DataFrame({'a': [], 'b': [], 'c': []})
+ with np.errstate(divide='raise'):
+ merge(a, a, on=('a', 'b'))
+
+ @pytest.mark.parametrize('how', ['left', 'outer'])
+ @pytest.mark.xfail(reason="GH-24897")
+ def test_merge_on_index_with_more_values(self, how):
+ # GH 24212
+ # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is
+ # interpreted as a missing value instead of the last element
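+ # (illustrative: numpy fancy indexing wraps negative indices, e.g.
+ # np.array([30, 40]).take(-1) returns 40, so a -1 in the indexer must
+ # be treated as "no match" and filled with NaN instead)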
+ df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]],
+ columns=['a', 'b'])
+ df2 = pd.DataFrame([[3, 30], [4, 40]],
+ columns=['a', 'c'])
+ df1.set_index('a', drop=False, inplace=True)
+ df2.set_index('a', inplace=True)
+ result = pd.merge(df1, df2, left_index=True, right_on='a', how=how)
+ expected = pd.DataFrame([[1, 2, np.nan],
+ [2, 4, np.nan],
+ [3, 6, 30.0],
+ [4, 8, 40.0]],
+ columns=['a', 'b', 'c'])
+ expected.set_index('a', drop=False, inplace=True)
+ assert_frame_equal(result, expected)
+
+ def test_merge_right_index_right(self):
+ # Note: the expected output here is probably incorrect.
+ # See https://github.com/pandas-dev/pandas/issues/17257 for more.
+ # We include this as a regression test for GH-24897.
+ left = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 1, 1]})
+ right = pd.DataFrame({'b': [1, 2, 3]})
+
+ expected = pd.DataFrame({'a': [1, 2, 3, None],
+ 'key': [0, 1, 1, 2],
+ 'b': [1, 2, 2, 3]},
+ columns=['a', 'key', 'b'],
+ index=[0, 1, 2, 2])
+ result = left.merge(right, left_on='key', right_index=True,
+ how='right')
+ tm.assert_frame_equal(result, expected)
+
+
+def _check_merge(x, y):
+ for how in ['inner', 'left', 'outer']:
+ result = x.join(y, how=how)
+
+ expected = merge(x.reset_index(), y.reset_index(), how=how,
+ sort=True)
+ expected = expected.set_index('index')
+
+ # TODO check_names on merge?
+ assert_frame_equal(result, expected, check_names=False)
+
+
+class TestMergeDtypes(object):
+
+ @pytest.mark.parametrize('right_vals', [
+ ['foo', 'bar'],
+ Series(['foo', 'bar']).astype('category'),
+ ])
+ def test_different(self, right_vals):
+
+ left = DataFrame({'A': ['foo', 'bar'],
+ 'B': Series(['foo', 'bar']).astype('category'),
+ 'C': [1, 2],
+ 'D': [1.0, 2.0],
+ 'E': Series([1, 2], dtype='uint64'),
+ 'F': Series([1, 2], dtype='int32')})
+ right = DataFrame({'A': right_vals})
+
+ # GH 9780
+ # We allow merging on object and categorical cols and cast
+ # categorical cols to object
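+ # e.g. object ['foo', 'bar'] merged with the same values stored as a
+ # categorical yields an object-dtype merge key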
+ result = pd.merge(left, right, on='A')
+ assert is_object_dtype(result.A.dtype)
+
+ @pytest.mark.parametrize('d1', [np.int64, np.int32,
+ np.int16, np.int8, np.uint8])
+ @pytest.mark.parametrize('d2', [np.int64, np.float64,
+ np.float32, np.float16])
+ def test_join_multi_dtypes(self, d1, d2):
+
+ dtype1 = np.dtype(d1)
+ dtype2 = np.dtype(d2)
+
+ left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1),
+ 'k2': ['foo', 'bar'] * 12,
+ 'v': np.array(np.arange(24), dtype=np.int64)})
+
+ index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
+ right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index)
+
+ result = left.join(right, on=['k1', 'k2'])
+
+ expected = left.copy()
+
+ if dtype2.kind == 'i':
+ dtype2 = np.dtype('float64')
+ expected['v2'] = np.array(np.nan, dtype=dtype2)
+ expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
+ expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
+
+ tm.assert_frame_equal(result, expected)
+
+ result = left.join(right, on=['k1', 'k2'], sort=True)
+ expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('int_vals, float_vals, exp_vals', [
+ ([1, 2, 3], [1.0, 2.0, 3.0], {'X': [1, 2, 3], 'Y': [1.0, 2.0, 3.0]}),
+ ([1, 2, 3], [1.0, 3.0], {'X': [1, 3], 'Y': [1.0, 3.0]}),
+ ([1, 2], [1.0, 2.0, 3.0], {'X': [1, 2], 'Y': [1.0, 2.0]}),
+ ])
+ def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals):
+ # GH 16572
+ # Check that float column is not cast to object if
+ # merging on float and int columns
+ A = DataFrame({'X': int_vals})
+ B = DataFrame({'Y': float_vals})
+ expected = DataFrame(exp_vals)
+
+ result = A.merge(B, left_on='X', right_on='Y')
+ assert_frame_equal(result, expected)
+
+ result = B.merge(A, left_on='Y', right_on='X')
+ assert_frame_equal(result, expected[['Y', 'X']])
+
+ def test_merge_on_ints_floats_warning(self):
+ # GH 16572
+ # merge will produce a warning when merging on int and
+ # float columns where the float values are not exactly
+ # equal to their int representation
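+ # here 1.1 and 2.5 have no exact integer counterpart, so only the
+ # X == 3 / Y == 3.0 pair survives the merge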
+ A = DataFrame({'X': [1, 2, 3]})
+ B = DataFrame({'Y': [1.1, 2.5, 3.0]})
+ expected = DataFrame({'X': [3], 'Y': [3.0]})
+
+ with tm.assert_produces_warning(UserWarning):
+ result = A.merge(B, left_on='X', right_on='Y')
+ assert_frame_equal(result, expected)
+
+ with tm.assert_produces_warning(UserWarning):
+ result = B.merge(A, left_on='Y', right_on='X')
+ assert_frame_equal(result, expected[['Y', 'X']])
+
+ # test no warning if float has NaNs
+ B = DataFrame({'Y': [np.nan, np.nan, 3.0]})
+
+ with tm.assert_produces_warning(None):
+ result = B.merge(A, left_on='Y', right_on='X')
+ assert_frame_equal(result, expected[['Y', 'X']])
+
+ def test_merge_incompat_infer_boolean_object(self):
+ # GH21119: bool + object bool merge OK
+ df1 = DataFrame({'key': Series([True, False], dtype=object)})
+ df2 = DataFrame({'key': [True, False]})
+
+ expected = DataFrame({'key': [True, False]}, dtype=object)
+ result = pd.merge(df1, df2, on='key')
+ assert_frame_equal(result, expected)
+ result = pd.merge(df2, df1, on='key')
+ assert_frame_equal(result, expected)
+
+ # with missing value
+ df1 = DataFrame({'key': Series([True, False, np.nan], dtype=object)})
+ df2 = DataFrame({'key': [True, False]})
+
+ expected = DataFrame({'key': [True, False]}, dtype=object)
+ result = pd.merge(df1, df2, on='key')
+ assert_frame_equal(result, expected)
+ result = pd.merge(df2, df1, on='key')
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('df1_vals, df2_vals', [
+
+ # merge on category coerces to object
+ ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
+ ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
+
+ # do not infer
+ ([0, 1], pd.Series([False, True], dtype=object)),
+ ([0, 1], pd.Series([False, True], dtype=bool)),
+ ])
+ def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
+ # these are explicitly allowed incompatible merges that pass through;
+ # the result type depends on whether the values on the rhs can be
+ # inferred, otherwise they are coerced to object
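+ # e.g. merging int keys [0, 1] with object keys [False, True] does not
+ # re-infer the booleans as numeric, so the merged key ends up object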
+
+ df1 = DataFrame({'A': df1_vals})
+ df2 = DataFrame({'A': df2_vals})
+
+ result = pd.merge(df1, df2, on=['A'])
+ assert is_object_dtype(result.A.dtype)
+ result = pd.merge(df2, df1, on=['A'])
+ assert is_object_dtype(result.A.dtype)
+
+ @pytest.mark.parametrize('df1_vals, df2_vals', [
+ # do not infer to numeric
+
+ (Series([1, 2], dtype='uint64'), ["a", "b", "c"]),
+ (Series([1, 2], dtype='int32'), ["a", "b", "c"]),
+ ([0, 1, 2], ["0", "1", "2"]),
+ ([0.0, 1.0, 2.0], ["0", "1", "2"]),
+ ([0, 1, 2], [u"0", u"1", u"2"]),
+ (pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01',
+ '2011-01-02']),
+ (pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]),
+ (pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
+ (pd.date_range('20130101', periods=3),
+ pd.date_range('20130101', periods=3, tz='US/Eastern')),
+ ])
+ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
+ # GH 9780, GH 15800
+ # Raise a ValueError when a user tries to merge on
+ # dtypes that are incompatible (e.g., obj and int/float)
+
+ df1 = DataFrame({'A': df1_vals})
+ df2 = DataFrame({'A': df2_vals})
+
+ msg = ("You are trying to merge on {lk_dtype} and "
+ "{rk_dtype} columns. If you wish to proceed "
+ "you should use pd.concat".format(lk_dtype=df1['A'].dtype,
+ rk_dtype=df2['A'].dtype))
+ msg = re.escape(msg)
+ with pytest.raises(ValueError, match=msg):
+ pd.merge(df1, df2, on=['A'])
+
+ # Check that error still raised when swapping order of dataframes
+ msg = ("You are trying to merge on {lk_dtype} and "
+ "{rk_dtype} columns. If you wish to proceed "
+ "you should use pd.concat".format(lk_dtype=df2['A'].dtype,
+ rk_dtype=df1['A'].dtype))
+ msg = re.escape(msg)
+ with pytest.raises(ValueError, match=msg):
+ pd.merge(df2, df1, on=['A'])
+
+
[email protected]
+def left():
+ np.random.seed(1234)
+ return DataFrame(
+ {'X': Series(np.random.choice(
+ ['foo', 'bar'],
+ size=(10,))).astype(CDT(['foo', 'bar'])),
+ 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
+
+
[email protected]
+def right():
+ np.random.seed(1234)
+ return DataFrame(
+ {'X': Series(['foo', 'bar']).astype(CDT(['foo', 'bar'])),
+ 'Z': [1, 2]})
+
+
+class TestMergeCategorical(object):
+
+ def test_identical(self, left):
+ # merging a frame with itself should preserve dtypes
+ merged = pd.merge(left, left, on='X')
+ result = merged.dtypes.sort_index()
+ expected = Series([CategoricalDtype(),
+ np.dtype('O'),
+ np.dtype('O')],
+ index=['X', 'Y_x', 'Y_y'])
+ assert_series_equal(result, expected)
+
+ def test_basic(self, left, right):
+ # we have matching Categorical dtypes in X
+ # so should preserve the merged column
+ merged = pd.merge(left, right, on='X')
+ result = merged.dtypes.sort_index()
+ expected = Series([CategoricalDtype(),
+ np.dtype('O'),
+ np.dtype('int64')],
+ index=['X', 'Y', 'Z'])
+ assert_series_equal(result, expected)
+
+ def test_merge_categorical(self):
+ # GH 9426
+
+ right = DataFrame({'c': {0: 'a',
+ 1: 'b',
+ 2: 'c',
+ 3: 'd',
+ 4: 'e'},
+ 'd': {0: 'null',
+ 1: 'null',
+ 2: 'null',
+ 3: 'null',
+ 4: 'null'}})
+ left = DataFrame({'a': {0: 'f',
+ 1: 'f',
+ 2: 'f',
+ 3: 'f',
+ 4: 'f'},
+ 'b': {0: 'g',
+ 1: 'g',
+ 2: 'g',
+ 3: 'g',
+ 4: 'g'}})
+ df = pd.merge(left, right, how='left', left_on='b', right_on='c')
+
+ # object-object
+ expected = df.copy()
+
+ # object-cat
+ # note that we propagate the category
+ # because we don't have any matching rows
+ cright = right.copy()
+ cright['d'] = cright['d'].astype('category')
+ result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
+ expected['d'] = expected['d'].astype(CategoricalDtype(['null']))
+ tm.assert_frame_equal(result, expected)
+
+ # cat-object
+ cleft = left.copy()
+ cleft['b'] = cleft['b'].astype('category')
+ result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
+ tm.assert_frame_equal(result, expected)
+
+ # cat-cat
+ cright = right.copy()
+ cright['d'] = cright['d'].astype('category')
+ cleft = left.copy()
+ cleft['b'] = cleft['b'].astype('category')
+ result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
+ tm.assert_frame_equal(result, expected)
+
+ def tests_merge_categorical_unordered_equal(self):
+ # GH-19551
+ df1 = DataFrame({
+ 'Foo': Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C']),
+ 'Left': ['A0', 'B0', 'C0'],
+ })
+
+ df2 = DataFrame({
+ 'Foo': Categorical(['C', 'B', 'A'], categories=['C', 'B', 'A']),
+ 'Right': ['C1', 'B1', 'A1'],
+ })
+ result = pd.merge(df1, df2, on=['Foo'])
+ expected = DataFrame({
+ 'Foo': pd.Categorical(['A', 'B', 'C']),
+ 'Left': ['A0', 'B0', 'C0'],
+ 'Right': ['A1', 'B1', 'C1'],
+ })
+ assert_frame_equal(result, expected)
+
+ def test_other_columns(self, left, right):
+ # non-merge columns should preserve their dtypes if possible
+ right = right.assign(Z=right.Z.astype('category'))
+
+ merged = pd.merge(left, right, on='X')
+ result = merged.dtypes.sort_index()
+ expected = Series([CategoricalDtype(),
+ np.dtype('O'),
+ CategoricalDtype()],
+ index=['X', 'Y', 'Z'])
+ assert_series_equal(result, expected)
+
+ # categories are preserved
+ assert left.X.values.is_dtype_equal(merged.X.values)
+ assert right.Z.values.is_dtype_equal(merged.Z.values)
+
+ @pytest.mark.parametrize(
+ 'change', [lambda x: x,
+ lambda x: x.astype(CDT(['foo', 'bar', 'bah'])),
+ lambda x: x.astype(CDT(ordered=True))])
+ def test_dtype_on_merged_different(self, change, join_type, left, right):
+ # our merging column X now has two different dtypes,
+ # so the result must be object
+
+ X = change(right.X.astype('object'))
+ right = right.assign(X=X)
+ assert is_categorical_dtype(left.X.values)
+ # assert not left.X.values.is_dtype_equal(right.X.values)
+
+ merged = pd.merge(left, right, on='X', how=join_type)
+
+ result = merged.dtypes.sort_index()
+ expected = Series([np.dtype('O'),
+ np.dtype('O'),
+ np.dtype('int64')],
+ index=['X', 'Y', 'Z'])
+ assert_series_equal(result, expected)
+
+ def test_self_join_multiple_categories(self):
+ # GH 16767
+ # non-duplicates should work with multiple categories
+ m = 5
+ df = pd.DataFrame({
+ 'a': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] * m,
+ 'b': ['t', 'w', 'x', 'y', 'z'] * 2 * m,
+ 'c': [letter
+ for each in ['m', 'n', 'u', 'p', 'o']
+ for letter in [each] * 2 * m],
+ 'd': [letter
+ for each in ['aa', 'bb', 'cc', 'dd', 'ee',
+ 'ff', 'gg', 'hh', 'ii', 'jj']
+ for letter in [each] * m]})
+
+ # change them all to categorical variables
+ df = df.apply(lambda x: x.astype('category'))
+
+ # self-join should equal ourselves
+ result = pd.merge(df, df, on=list(df.columns))
+
+ assert_frame_equal(result, df)
+
+ def test_dtype_on_categorical_dates(self):
+ # GH 16900
+ # dates should not be coerced to ints
+
+ df = pd.DataFrame(
+ [[date(2001, 1, 1), 1.1],
+ [date(2001, 1, 2), 1.3]],
+ columns=['date', 'num2']
+ )
+ df['date'] = df['date'].astype('category')
+
+ df2 = pd.DataFrame(
+ [[date(2001, 1, 1), 1.3],
+ [date(2001, 1, 3), 1.4]],
+ columns=['date', 'num4']
+ )
+ df2['date'] = df2['date'].astype('category')
+
+ expected_outer = pd.DataFrame([
+ [pd.Timestamp('2001-01-01'), 1.1, 1.3],
+ [pd.Timestamp('2001-01-02'), 1.3, np.nan],
+ [pd.Timestamp('2001-01-03'), np.nan, 1.4]],
+ columns=['date', 'num2', 'num4']
+ )
+ result_outer = pd.merge(df, df2, how='outer', on=['date'])
+ assert_frame_equal(result_outer, expected_outer)
+
+ expected_inner = pd.DataFrame(
+ [[pd.Timestamp('2001-01-01'), 1.1, 1.3]],
+ columns=['date', 'num2', 'num4']
+ )
+ result_inner = pd.merge(df, df2, how='inner', on=['date'])
+ assert_frame_equal(result_inner, expected_inner)
+
+ @pytest.mark.parametrize('ordered', [True, False])
+ @pytest.mark.parametrize('category_column,categories,expected_categories',
+ [([False, True, True, False], [True, False],
+ [True, False]),
+ ([2, 1, 1, 2], [1, 2], [1, 2]),
+ (['False', 'True', 'True', 'False'],
+ ['True', 'False'], ['True', 'False'])])
+ def test_merging_with_bool_or_int_categorical_column(self, category_column,
+ categories,
+ expected_categories,
+ ordered):
+ # GH 17187
+ # merging with a boolean/int categorical column
+ df1 = pd.DataFrame({'id': [1, 2, 3, 4],
+ 'cat': category_column})
+ df1['cat'] = df1['cat'].astype(CDT(categories, ordered=ordered))
+ df2 = pd.DataFrame({'id': [2, 4], 'num': [1, 9]})
+ result = df1.merge(df2)
+ expected = pd.DataFrame({'id': [2, 4], 'cat': expected_categories,
+ 'num': [1, 9]})
+ expected['cat'] = expected['cat'].astype(
+ CDT(categories, ordered=ordered))
+ assert_frame_equal(expected, result)
+
+ def test_merge_on_int_array(self):
+ # GH 23020
+ df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
+ 'B': 1})
+ result = pd.merge(df, df, on='A')
+ expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
+ 'B_x': 1,
+ 'B_y': 1})
+ assert_frame_equal(result, expected)
+
+
[email protected]
+def left_df():
+ return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
+
+
[email protected]
+def right_df():
+ return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2])
+
+
+class TestMergeOnIndexes(object):
+
+ @pytest.mark.parametrize(
+ "how, sort, expected",
+ [('inner', False, DataFrame({'a': [20, 10],
+ 'b': [200, 100]},
+ index=[2, 1])),
+ ('inner', True, DataFrame({'a': [10, 20],
+ 'b': [100, 200]},
+ index=[1, 2])),
+ ('left', False, DataFrame({'a': [20, 10, 0],
+ 'b': [200, 100, np.nan]},
+ index=[2, 1, 0])),
+ ('left', True, DataFrame({'a': [0, 10, 20],
+ 'b': [np.nan, 100, 200]},
+ index=[0, 1, 2])),
+ ('right', False, DataFrame({'a': [np.nan, 10, 20],
+ 'b': [300, 100, 200]},
+ index=[3, 1, 2])),
+ ('right', True, DataFrame({'a': [10, 20, np.nan],
+ 'b': [100, 200, 300]},
+ index=[1, 2, 3])),
+ ('outer', False, DataFrame({'a': [0, 10, 20, np.nan],
+ 'b': [np.nan, 100, 200, 300]},
+ index=[0, 1, 2, 3])),
+ ('outer', True, DataFrame({'a': [0, 10, 20, np.nan],
+ 'b': [np.nan, 100, 200, 300]},
+ index=[0, 1, 2, 3]))])
+ def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
+ result = pd.merge(left_df, right_df,
+ left_index=True,
+ right_index=True,
+ how=how,
+ sort=sort)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]('index', [
+ CategoricalIndex(['A', 'B'], categories=['A', 'B'], name='index_col'),
+ Float64Index([1.0, 2.0], name='index_col'),
+ Int64Index([1, 2], name='index_col'),
+ UInt64Index([1, 2], name='index_col'),
+ RangeIndex(start=0, stop=2, name='index_col'),
+ DatetimeIndex(["2018-01-01", "2018-01-02"], name='index_col'),
+ ], ids=lambda x: type(x).__name__)
+def test_merge_index_types(index):
+ # gh-20777
+ # assert key access is consistent across index types
+ left = DataFrame({"left_data": [1, 2]}, index=index)
+ right = DataFrame({"right_data": [1.0, 2.0]}, index=index)
+
+ result = left.merge(right, on=['index_col'])
+
+ expected = DataFrame(
+ OrderedDict([('left_data', [1, 2]), ('right_data', [1.0, 2.0])]),
+ index=index)
+ assert_frame_equal(result, expected)
+
+
[email protected]("on,left_on,right_on,left_index,right_index,nm", [
+ (['outer', 'inner'], None, None, False, False, 'B'),
+ (None, None, None, True, True, 'B'),
+ (None, ['outer', 'inner'], None, False, True, 'B'),
+ (None, None, ['outer', 'inner'], True, False, 'B'),
+ (['outer', 'inner'], None, None, False, False, None),
+ (None, None, None, True, True, None),
+ (None, ['outer', 'inner'], None, False, True, None),
+ (None, None, ['outer', 'inner'], True, False, None)])
+def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
+ # GH 21220
+ a = pd.DataFrame({"A": [1, 2, 3, 4]},
+ index=pd.MultiIndex.from_product([['a', 'b'], [0, 1]],
+ names=['outer', 'inner']))
+ b = pd.Series([1, 2, 3, 4],
+ index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
+ names=['outer', 'inner']), name=nm)
+ expected = pd.DataFrame({"A": [2, 4], "B": [1, 3]},
+ index=pd.MultiIndex.from_product([['a', 'b'], [1]],
+ names=['outer', 'inner']))
+ if nm is not None:
+ result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on,
+ left_index=left_index, right_index=right_index)
+ tm.assert_frame_equal(result, expected)
+ else:
+ msg = "Cannot merge a Series without a name"
+ with pytest.raises(ValueError, match=msg):
+ result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on,
+ left_index=left_index, right_index=right_index)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_asof.py b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_asof.py
new file mode 100644
index 00000000000..1d1d7d48ada
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_asof.py
@@ -0,0 +1,1038 @@
+import numpy as np
+import pytest
+import pytz
+
+import pandas as pd
+from pandas import Timedelta, merge_asof, read_csv, to_datetime
+from pandas.core.reshape.merge import MergeError
+from pandas.util.testing import assert_frame_equal
+
+
+class TestAsOfMerge(object):
+
+ def read_data(self, datapath, name, dedupe=False):
+ path = datapath('reshape', 'merge', 'data', name)
+ x = read_csv(path)
+ if dedupe:
+ x = (x.drop_duplicates(['time', 'ticker'], keep='last')
+ .reset_index(drop=True)
+ )
+ x.time = to_datetime(x.time)
+ return x
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, datapath):
+
+ self.trades = self.read_data(datapath, 'trades.csv')
+ self.quotes = self.read_data(datapath, 'quotes.csv', dedupe=True)
+ self.asof = self.read_data(datapath, 'asof.csv')
+ self.tolerance = self.read_data(datapath, 'tolerance.csv')
+ self.allow_exact_matches = self.read_data(datapath,
+ 'allow_exact_matches.csv')
+ self.allow_exact_matches_and_tolerance = self.read_data(
+ datapath, 'allow_exact_matches_and_tolerance.csv')
+
+ def test_examples1(self):
+ """ doc-string examples """
+
+ left = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c']})
+ right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
+ 'right_val': [1, 2, 3, 6, 7]})
+
+ expected = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c'],
+ 'right_val': [1, 3, 7]})
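+ # merge_asof defaults to direction='backward': each left key takes the
+ # last right key <= it, hence 1 -> 1, 5 -> 3 and 10 -> 7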
+
+ result = pd.merge_asof(left, right, on='a')
+ assert_frame_equal(result, expected)
+
+ def test_examples2(self):
+ """ doc-string examples """
+
+ trades = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.023',
+ '20160525 13:30:00.038',
+ '20160525 13:30:00.048',
+ '20160525 13:30:00.048',
+ '20160525 13:30:00.048']),
+ 'ticker': ['MSFT', 'MSFT',
+ 'GOOG', 'GOOG', 'AAPL'],
+ 'price': [51.95, 51.95,
+ 720.77, 720.92, 98.00],
+ 'quantity': [75, 155,
+ 100, 100, 100]},
+ columns=['time', 'ticker', 'price', 'quantity'])
+
+ quotes = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.023',
+ '20160525 13:30:00.023',
+ '20160525 13:30:00.030',
+ '20160525 13:30:00.041',
+ '20160525 13:30:00.048',
+ '20160525 13:30:00.049',
+ '20160525 13:30:00.072',
+ '20160525 13:30:00.075']),
+ 'ticker': ['GOOG', 'MSFT', 'MSFT',
+ 'MSFT', 'GOOG', 'AAPL', 'GOOG',
+ 'MSFT'],
+ 'bid': [720.50, 51.95, 51.97, 51.99,
+ 720.50, 97.99, 720.50, 52.01],
+ 'ask': [720.93, 51.96, 51.98, 52.00,
+ 720.93, 98.01, 720.88, 52.03]},
+ columns=['time', 'ticker', 'bid', 'ask'])
+
+ pd.merge_asof(trades, quotes,
+ on='time',
+ by='ticker')
+
+ pd.merge_asof(trades, quotes,
+ on='time',
+ by='ticker',
+ tolerance=pd.Timedelta('2ms'))
+
+ expected = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.023',
+ '20160525 13:30:00.038',
+ '20160525 13:30:00.048',
+ '20160525 13:30:00.048',
+ '20160525 13:30:00.048']),
+ 'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'],
+ 'price': [51.95, 51.95,
+ 720.77, 720.92, 98.00],
+ 'quantity': [75, 155,
+ 100, 100, 100],
+ 'bid': [np.nan, 51.97, np.nan,
+ np.nan, np.nan],
+ 'ask': [np.nan, 51.98, np.nan,
+ np.nan, np.nan]},
+ columns=['time', 'ticker', 'price', 'quantity',
+ 'bid', 'ask'])
+
+ result = pd.merge_asof(trades, quotes,
+ on='time',
+ by='ticker',
+ tolerance=pd.Timedelta('10ms'),
+ allow_exact_matches=False)
+ assert_frame_equal(result, expected)
+
+ def test_examples3(self):
+ """ doc-string examples """
+ # GH14887
+
+ left = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c']})
+ right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
+ 'right_val': [1, 2, 3, 6, 7]})
+
+ expected = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c'],
+ 'right_val': [1, 6, np.nan]})
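+ # direction='forward' takes the first right key >= the left key; no
+ # right key is >= 10, hence the trailing NaN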
+
+ result = pd.merge_asof(left, right, on='a', direction='forward')
+ assert_frame_equal(result, expected)
+
+ def test_examples4(self):
+ """ doc-string examples """
+ # GH14887
+
+ left = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c']})
+ right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
+ 'right_val': [1, 2, 3, 6, 7]})
+
+ expected = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c'],
+ 'right_val': [1, 6, 7]})
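+ # direction='nearest' picks the closest right key in absolute
+ # distance: 1 -> 1, 5 -> 6 (|5 - 6| < |5 - 3|) and 10 -> 7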
+
+ result = pd.merge_asof(left, right, on='a', direction='nearest')
+ assert_frame_equal(result, expected)
+
+ def test_basic(self):
+
+ expected = self.asof
+ trades = self.trades
+ quotes = self.quotes
+
+ result = merge_asof(trades, quotes,
+ on='time',
+ by='ticker')
+ assert_frame_equal(result, expected)
+
+ def test_basic_categorical(self):
+
+ expected = self.asof
+ trades = self.trades.copy()
+ trades.ticker = trades.ticker.astype('category')
+ quotes = self.quotes.copy()
+ quotes.ticker = quotes.ticker.astype('category')
+ expected.ticker = expected.ticker.astype('category')
+
+ result = merge_asof(trades, quotes,
+ on='time',
+ by='ticker')
+ assert_frame_equal(result, expected)
+
+ def test_basic_left_index(self):
+
+ # GH14253
+ expected = self.asof
+ trades = self.trades.set_index('time')
+ quotes = self.quotes
+
+ result = merge_asof(trades, quotes,
+ left_index=True,
+ right_on='time',
+ by='ticker')
+ # left-only index uses right's index, oddly
+ expected.index = result.index
+ # time column appears after left's columns
+ expected = expected[result.columns]
+ assert_frame_equal(result, expected)
+
+ def test_basic_right_index(self):
+
+ expected = self.asof
+ trades = self.trades
+ quotes = self.quotes.set_index('time')
+
+ result = merge_asof(trades, quotes,
+ left_on='time',
+ right_index=True,
+ by='ticker')
+ assert_frame_equal(result, expected)
+
+ def test_basic_left_index_right_index(self):
+
+ expected = self.asof.set_index('time')
+ trades = self.trades.set_index('time')
+ quotes = self.quotes.set_index('time')
+
+ result = merge_asof(trades, quotes,
+ left_index=True,
+ right_index=True,
+ by='ticker')
+ assert_frame_equal(result, expected)
+
+ def test_multi_index(self):
+
+ # MultiIndex is prohibited
+ trades = self.trades.set_index(['time', 'price'])
+ quotes = self.quotes.set_index('time')
+ with pytest.raises(MergeError):
+ merge_asof(trades, quotes,
+ left_index=True,
+ right_index=True)
+
+ trades = self.trades.set_index('time')
+ quotes = self.quotes.set_index(['time', 'bid'])
+ with pytest.raises(MergeError):
+ merge_asof(trades, quotes,
+ left_index=True,
+ right_index=True)
+
+ def test_on_and_index(self):
+
+ # 'on' parameter and index together is prohibited
+ trades = self.trades.set_index('time')
+ quotes = self.quotes.set_index('time')
+ with pytest.raises(MergeError):
+ merge_asof(trades, quotes,
+ left_on='price',
+ left_index=True,
+ right_index=True)
+
+ trades = self.trades.set_index('time')
+ quotes = self.quotes.set_index('time')
+ with pytest.raises(MergeError):
+ merge_asof(trades, quotes,
+ right_on='bid',
+ left_index=True,
+ right_index=True)
+
+ def test_basic_left_by_right_by(self):
+
+ # GH14253
+ expected = self.asof
+ trades = self.trades
+ quotes = self.quotes
+
+ result = merge_asof(trades, quotes,
+ on='time',
+ left_by='ticker',
+ right_by='ticker')
+ assert_frame_equal(result, expected)
+
+ def test_missing_right_by(self):
+
+ expected = self.asof
+ trades = self.trades
+ quotes = self.quotes
+
+ q = quotes[quotes.ticker != 'MSFT']
+ result = merge_asof(trades, q,
+ on='time',
+ by='ticker')
+ expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan
+ assert_frame_equal(result, expected)
+
+ def test_multiby(self):
+ # GH13936
+ trades = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.023',
+ '20160525 13:30:00.023',
+ '20160525 13:30:00.046',
+ '20160525 13:30:00.048',
+ '20160525 13:30:00.050']),
+ 'ticker': ['MSFT', 'MSFT',
+ 'GOOG', 'GOOG', 'AAPL'],
+ 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
+ 'price': [51.95, 51.95,
+ 720.77, 720.92, 98.00],
+ 'quantity': [75, 155,
+ 100, 100, 100]},
+ columns=['time', 'ticker', 'exch',
+ 'price', 'quantity'])
+
+ quotes = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.023',
+ '20160525 13:30:00.023',
+ '20160525 13:30:00.030',
+ '20160525 13:30:00.041',
+ '20160525 13:30:00.045',
+ '20160525 13:30:00.049']),
+ 'ticker': ['GOOG', 'MSFT', 'MSFT',
+ 'MSFT', 'GOOG', 'AAPL'],
+ 'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA',
+ 'NSDQ', 'ARCA'],
+ 'bid': [720.51, 51.95, 51.97, 51.99,
+ 720.50, 97.99],
+ 'ask': [720.92, 51.96, 51.98, 52.00,
+ 720.93, 98.01]},
+ columns=['time', 'ticker', 'exch', 'bid', 'ask'])
+
+ expected = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.023',
+ '20160525 13:30:00.023',
+ '20160525 13:30:00.046',
+ '20160525 13:30:00.048',
+ '20160525 13:30:00.050']),
+ 'ticker': ['MSFT', 'MSFT',
+ 'GOOG', 'GOOG', 'AAPL'],
+ 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
+ 'price': [51.95, 51.95,
+ 720.77, 720.92, 98.00],
+ 'quantity': [75, 155,
+ 100, 100, 100],
+ 'bid': [np.nan, 51.95, 720.50, 720.51, np.nan],
+ 'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]},
+ columns=['time', 'ticker', 'exch',
+ 'price', 'quantity', 'bid', 'ask'])
+
+ result = pd.merge_asof(trades, quotes, on='time',
+ by=['ticker', 'exch'])
+ assert_frame_equal(result, expected)
+
+ def test_multiby_heterogeneous_types(self):
+ # GH13936
+ trades = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.023',
+ '20160525 13:30:00.023',
+ '20160525 13:30:00.046',
+ '20160525 13:30:00.048',
+ '20160525 13:30:00.050']),
+ 'ticker': [0, 0, 1, 1, 2],
+ 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
+ 'price': [51.95, 51.95,
+ 720.77, 720.92, 98.00],
+ 'quantity': [75, 155,
+ 100, 100, 100]},
+ columns=['time', 'ticker', 'exch',
+ 'price', 'quantity'])
+
+ quotes = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.023',
+ '20160525 13:30:00.023',
+ '20160525 13:30:00.030',
+ '20160525 13:30:00.041',
+ '20160525 13:30:00.045',
+ '20160525 13:30:00.049']),
+ 'ticker': [1, 0, 0, 0, 1, 2],
+ 'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA',
+ 'NSDQ', 'ARCA'],
+ 'bid': [720.51, 51.95, 51.97, 51.99,
+ 720.50, 97.99],
+ 'ask': [720.92, 51.96, 51.98, 52.00,
+ 720.93, 98.01]},
+ columns=['time', 'ticker', 'exch', 'bid', 'ask'])
+
+ expected = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.023',
+ '20160525 13:30:00.023',
+ '20160525 13:30:00.046',
+ '20160525 13:30:00.048',
+ '20160525 13:30:00.050']),
+ 'ticker': [0, 0, 1, 1, 2],
+ 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
+ 'price': [51.95, 51.95,
+ 720.77, 720.92, 98.00],
+ 'quantity': [75, 155,
+ 100, 100, 100],
+ 'bid': [np.nan, 51.95, 720.50, 720.51, np.nan],
+ 'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]},
+ columns=['time', 'ticker', 'exch',
+ 'price', 'quantity', 'bid', 'ask'])
+
+ result = pd.merge_asof(trades, quotes, on='time',
+ by=['ticker', 'exch'])
+ assert_frame_equal(result, expected)
+
+ def test_multiby_indexed(self):
+ # GH15676
+ left = pd.DataFrame([
+ [pd.to_datetime('20160602'), 1, 'a'],
+ [pd.to_datetime('20160602'), 2, 'a'],
+ [pd.to_datetime('20160603'), 1, 'b'],
+ [pd.to_datetime('20160603'), 2, 'b']],
+ columns=['time', 'k1', 'k2']).set_index('time')
+
+ right = pd.DataFrame([
+ [pd.to_datetime('20160502'), 1, 'a', 1.0],
+ [pd.to_datetime('20160502'), 2, 'a', 2.0],
+ [pd.to_datetime('20160503'), 1, 'b', 3.0],
+ [pd.to_datetime('20160503'), 2, 'b', 4.0]],
+ columns=['time', 'k1', 'k2', 'value']).set_index('time')
+
+ expected = pd.DataFrame([
+ [pd.to_datetime('20160602'), 1, 'a', 1.0],
+ [pd.to_datetime('20160602'), 2, 'a', 2.0],
+ [pd.to_datetime('20160603'), 1, 'b', 3.0],
+ [pd.to_datetime('20160603'), 2, 'b', 4.0]],
+ columns=['time', 'k1', 'k2', 'value']).set_index('time')
+
+ result = pd.merge_asof(left,
+ right,
+ left_index=True,
+ right_index=True,
+ by=['k1', 'k2'])
+
+ assert_frame_equal(expected, result)
+
+ with pytest.raises(MergeError):
+ pd.merge_asof(left, right, left_index=True, right_index=True,
+ left_by=['k1', 'k2'], right_by=['k1'])
+
+ def test_basic2(self, datapath):
+
+ expected = self.read_data(datapath, 'asof2.csv')
+ trades = self.read_data(datapath, 'trades2.csv')
+ quotes = self.read_data(datapath, 'quotes2.csv', dedupe=True)
+
+ result = merge_asof(trades, quotes,
+ on='time',
+ by='ticker')
+ assert_frame_equal(result, expected)
+
+ def test_basic_no_by(self):
+ f = lambda x: x[x.ticker == 'MSFT'].drop('ticker', axis=1) \
+ .reset_index(drop=True)
+
+ # just use a single ticker
+ expected = f(self.asof)
+ trades = f(self.trades)
+ quotes = f(self.quotes)
+
+ result = merge_asof(trades, quotes,
+ on='time')
+ assert_frame_equal(result, expected)
+
+ def test_valid_join_keys(self):
+
+ trades = self.trades
+ quotes = self.quotes
+
+ with pytest.raises(MergeError):
+ merge_asof(trades, quotes,
+ left_on='time',
+ right_on='bid',
+ by='ticker')
+
+ with pytest.raises(MergeError):
+ merge_asof(trades, quotes,
+ on=['time', 'ticker'],
+ by='ticker')
+
+ with pytest.raises(MergeError):
+ merge_asof(trades, quotes,
+ by='ticker')
+
+ def test_with_duplicates(self, datapath):
+
+ q = pd.concat([self.quotes, self.quotes]).sort_values(
+ ['time', 'ticker']).reset_index(drop=True)
+ result = merge_asof(self.trades, q,
+ on='time',
+ by='ticker')
+ expected = self.read_data(datapath, 'asof.csv')
+ assert_frame_equal(result, expected)
+
+ def test_with_duplicates_no_on(self):
+
+ df1 = pd.DataFrame({'key': [1, 1, 3],
+ 'left_val': [1, 2, 3]})
+ df2 = pd.DataFrame({'key': [1, 2, 2],
+ 'right_val': [1, 2, 3]})
+ result = merge_asof(df1, df2, on='key')
+ expected = pd.DataFrame({'key': [1, 1, 3],
+ 'left_val': [1, 2, 3],
+ 'right_val': [1, 1, 3]})
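+ # with duplicate right keys, the backward match takes the last right
+ # row whose key is <= the left key, hence 3 -> right_val 3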
+ assert_frame_equal(result, expected)
+
+ def test_valid_allow_exact_matches(self):
+
+ trades = self.trades
+ quotes = self.quotes
+
+ with pytest.raises(MergeError):
+ merge_asof(trades, quotes,
+ on='time',
+ by='ticker',
+ allow_exact_matches='foo')
+
+ def test_valid_tolerance(self):
+
+ trades = self.trades
+ quotes = self.quotes
+
+ # dti
+ merge_asof(trades, quotes,
+ on='time',
+ by='ticker',
+ tolerance=Timedelta('1s'))
+
+ # integer
+ merge_asof(trades.reset_index(), quotes.reset_index(),
+ on='index',
+ by='ticker',
+ tolerance=1)
+
+ # incompat
+ with pytest.raises(MergeError):
+ merge_asof(trades, quotes,
+ on='time',
+ by='ticker',
+ tolerance=1)
+
+ # invalid
+ with pytest.raises(MergeError):
+ merge_asof(trades.reset_index(), quotes.reset_index(),
+ on='index',
+ by='ticker',
+ tolerance=1.0)
+
+ # invalid negative
+ with pytest.raises(MergeError):
+ merge_asof(trades, quotes,
+ on='time',
+ by='ticker',
+ tolerance=-Timedelta('1s'))
+
+ with pytest.raises(MergeError):
+ merge_asof(trades.reset_index(), quotes.reset_index(),
+ on='index',
+ by='ticker',
+ tolerance=-1)
+
+ def test_non_sorted(self):
+
+ trades = self.trades.sort_values('time', ascending=False)
+ quotes = self.quotes.sort_values('time', ascending=False)
+
+ # merge_asof requires both frames to already be sorted on the 'on' key
+ assert not trades.time.is_monotonic
+ assert not quotes.time.is_monotonic
+ with pytest.raises(ValueError):
+ merge_asof(trades, quotes,
+ on='time',
+ by='ticker')
+
+ trades = self.trades.sort_values('time')
+ assert trades.time.is_monotonic
+ assert not quotes.time.is_monotonic
+ with pytest.raises(ValueError):
+ merge_asof(trades, quotes,
+ on='time',
+ by='ticker')
+
+ quotes = self.quotes.sort_values('time')
+ assert trades.time.is_monotonic
+ assert quotes.time.is_monotonic
+
+ # ok, though has dupes
+ merge_asof(trades, self.quotes,
+ on='time',
+ by='ticker')
+
+ def test_tolerance(self):
+
+ trades = self.trades
+ quotes = self.quotes
+
+ result = merge_asof(trades, quotes,
+ on='time',
+ by='ticker',
+ tolerance=Timedelta('1day'))
+ expected = self.tolerance
+ assert_frame_equal(result, expected)
+
+ def test_tolerance_forward(self):
+ # GH14887
+
+ left = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c']})
+ right = pd.DataFrame({'a': [1, 2, 3, 7, 11],
+ 'right_val': [1, 2, 3, 7, 11]})
+
+ expected = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c'],
+ 'right_val': [1, np.nan, 11]})
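+ # with tolerance=1, a forward match must lie within 1 of the left key;
+ # the closest forward key to 5 is 7 (too far), hence the NaN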
+
+ result = pd.merge_asof(left, right, on='a', direction='forward',
+ tolerance=1)
+ assert_frame_equal(result, expected)
+
+ def test_tolerance_nearest(self):
+ # GH14887
+
+ left = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c']})
+ right = pd.DataFrame({'a': [1, 2, 3, 7, 11],
+ 'right_val': [1, 2, 3, 7, 11]})
+
+ expected = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c'],
+ 'right_val': [1, np.nan, 11]})
+
+ result = pd.merge_asof(left, right, on='a', direction='nearest',
+ tolerance=1)
+ assert_frame_equal(result, expected)
+
+ def test_tolerance_tz(self):
+ # GH 14844
+ left = pd.DataFrame(
+ {'date': pd.date_range(start=pd.to_datetime('2016-01-02'),
+ freq='D', periods=5,
+ tz=pytz.timezone('UTC')),
+ 'value1': np.arange(5)})
+ right = pd.DataFrame(
+ {'date': pd.date_range(start=pd.to_datetime('2016-01-01'),
+ freq='D', periods=5,
+ tz=pytz.timezone('UTC')),
+ 'value2': list("ABCDE")})
+ result = pd.merge_asof(left, right, on='date',
+ tolerance=pd.Timedelta('1 day'))
+
+ expected = pd.DataFrame(
+ {'date': pd.date_range(start=pd.to_datetime('2016-01-02'),
+ freq='D', periods=5,
+ tz=pytz.timezone('UTC')),
+ 'value1': np.arange(5),
+ 'value2': list("BCDEE")})
+ assert_frame_equal(result, expected)
+
+ def test_tolerance_float(self):
+ # GH22981
+ left = pd.DataFrame({'a': [1.1, 3.5, 10.9],
+ 'left_val': ['a', 'b', 'c']})
+ right = pd.DataFrame({'a': [1.0, 2.5, 3.3, 7.5, 11.5],
+ 'right_val': [1.0, 2.5, 3.3, 7.5, 11.5]})
+
+ expected = pd.DataFrame({'a': [1.1, 3.5, 10.9],
+ 'left_val': ['a', 'b', 'c'],
+ 'right_val': [1, 3.3, np.nan]})
+
+ result = pd.merge_asof(left, right, on='a', direction='nearest',
+ tolerance=0.5)
+ assert_frame_equal(result, expected)
+
+ def test_index_tolerance(self):
+ # GH 15135
+ expected = self.tolerance.set_index('time')
+ trades = self.trades.set_index('time')
+ quotes = self.quotes.set_index('time')
+
+ result = pd.merge_asof(trades, quotes,
+ left_index=True,
+ right_index=True,
+ by='ticker',
+ tolerance=pd.Timedelta('1day'))
+ assert_frame_equal(result, expected)
+
+ def test_allow_exact_matches(self):
+
+ result = merge_asof(self.trades, self.quotes,
+ on='time',
+ by='ticker',
+ allow_exact_matches=False)
+ expected = self.allow_exact_matches
+ assert_frame_equal(result, expected)
+
+ def test_allow_exact_matches_forward(self):
+ # GH14887
+
+ left = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c']})
+ right = pd.DataFrame({'a': [1, 2, 3, 7, 11],
+ 'right_val': [1, 2, 3, 7, 11]})
+
+ expected = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c'],
+ 'right_val': [2, 7, 11]})
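+ # allow_exact_matches=False skips equal keys, so 1 passes over the
+ # exact match at 1 and takes the next forward key, 2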
+
+ result = pd.merge_asof(left, right, on='a', direction='forward',
+ allow_exact_matches=False)
+ assert_frame_equal(result, expected)
+
+ def test_allow_exact_matches_nearest(self):
+ # GH14887
+
+ left = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c']})
+ right = pd.DataFrame({'a': [1, 2, 3, 7, 11],
+ 'right_val': [1, 2, 3, 7, 11]})
+
+ expected = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c'],
+ 'right_val': [2, 3, 11]})
+
+ result = pd.merge_asof(left, right, on='a', direction='nearest',
+ allow_exact_matches=False)
+ assert_frame_equal(result, expected)
+
+ def test_allow_exact_matches_and_tolerance(self):
+
+ result = merge_asof(self.trades, self.quotes,
+ on='time',
+ by='ticker',
+ tolerance=Timedelta('100ms'),
+ allow_exact_matches=False)
+ expected = self.allow_exact_matches_and_tolerance
+ assert_frame_equal(result, expected)
+
+ def test_allow_exact_matches_and_tolerance2(self):
+ # GH 13695
+ df1 = pd.DataFrame({
+ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
+ 'username': ['bob']})
+ df2 = pd.DataFrame({
+ 'time': pd.to_datetime(['2016-07-15 13:30:00.000',
+ '2016-07-15 13:30:00.030']),
+ 'version': [1, 2]})
+
+ result = pd.merge_asof(df1, df2, on='time')
+ expected = pd.DataFrame({
+ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
+ 'username': ['bob'],
+ 'version': [2]})
+ assert_frame_equal(result, expected)
+
+ result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False)
+ expected = pd.DataFrame({
+ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
+ 'username': ['bob'],
+ 'version': [1]})
+ assert_frame_equal(result, expected)
+
+ result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False,
+ tolerance=pd.Timedelta('10ms'))
+ expected = pd.DataFrame({
+ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
+ 'username': ['bob'],
+ 'version': [np.nan]})
+ assert_frame_equal(result, expected)
+
+ def test_allow_exact_matches_and_tolerance3(self):
+ # GH 13709
+ df1 = pd.DataFrame({
+ 'time': pd.to_datetime(['2016-07-15 13:30:00.030',
+ '2016-07-15 13:30:00.030']),
+ 'username': ['bob', 'charlie']})
+ df2 = pd.DataFrame({
+ 'time': pd.to_datetime(['2016-07-15 13:30:00.000',
+ '2016-07-15 13:30:00.030']),
+ 'version': [1, 2]})
+
+ result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False,
+ tolerance=pd.Timedelta('10ms'))
+ expected = pd.DataFrame({
+ 'time': pd.to_datetime(['2016-07-15 13:30:00.030',
+ '2016-07-15 13:30:00.030']),
+ 'username': ['bob', 'charlie'],
+ 'version': [np.nan, np.nan]})
+ assert_frame_equal(result, expected)
+
+ def test_allow_exact_matches_and_tolerance_forward(self):
+ # GH14887
+
+ left = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c']})
+ right = pd.DataFrame({'a': [1, 3, 4, 6, 11],
+ 'right_val': [1, 3, 4, 6, 11]})
+
+ expected = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c'],
+ 'right_val': [np.nan, 6, 11]})
+
+ result = pd.merge_asof(left, right, on='a', direction='forward',
+ allow_exact_matches=False, tolerance=1)
+ assert_frame_equal(result, expected)
+
+ def test_allow_exact_matches_and_tolerance_nearest(self):
+ # GH14887
+
+ left = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c']})
+ right = pd.DataFrame({'a': [1, 3, 4, 6, 11],
+ 'right_val': [1, 3, 4, 7, 11]})
+
+ expected = pd.DataFrame({'a': [1, 5, 10],
+ 'left_val': ['a', 'b', 'c'],
+ 'right_val': [np.nan, 4, 11]})
+
+ result = pd.merge_asof(left, right, on='a', direction='nearest',
+ allow_exact_matches=False, tolerance=1)
+ assert_frame_equal(result, expected)
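+
+ # Illustrative note (not part of the upstream test): with
+ # allow_exact_matches=False and tolerance=1, a left key of 5 may only
+ # match right keys within [4, 6], excluding the exact match 5. Both 4
+ # and 6 sit at distance 1, and the backward candidate wins the tie
+ # here, which is why the expected 'right_val' for a=5 is 4; a=1 has no
+ # admissible right key at all, hence NaN.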
+
+ def test_forward_by(self):
+ # GH14887
+
+ left = pd.DataFrame({'a': [1, 5, 10, 12, 15],
+ 'b': ['X', 'X', 'Y', 'Z', 'Y'],
+ 'left_val': ['a', 'b', 'c', 'd', 'e']})
+ right = pd.DataFrame({'a': [1, 6, 11, 15, 16],
+ 'b': ['X', 'Z', 'Y', 'Z', 'Y'],
+ 'right_val': [1, 6, 11, 15, 16]})
+
+ expected = pd.DataFrame({'a': [1, 5, 10, 12, 15],
+ 'b': ['X', 'X', 'Y', 'Z', 'Y'],
+ 'left_val': ['a', 'b', 'c', 'd', 'e'],
+ 'right_val': [1, np.nan, 11, 15, 16]})
+
+ result = pd.merge_asof(left, right, on='a', by='b',
+ direction='forward')
+ assert_frame_equal(result, expected)
+
+ def test_nearest_by(self):
+ # GH14887
+
+ left = pd.DataFrame({'a': [1, 5, 10, 12, 15],
+ 'b': ['X', 'X', 'Z', 'Z', 'Y'],
+ 'left_val': ['a', 'b', 'c', 'd', 'e']})
+ right = pd.DataFrame({'a': [1, 6, 11, 15, 16],
+ 'b': ['X', 'Z', 'Z', 'Z', 'Y'],
+ 'right_val': [1, 6, 11, 15, 16]})
+
+ expected = pd.DataFrame({'a': [1, 5, 10, 12, 15],
+ 'b': ['X', 'X', 'Z', 'Z', 'Y'],
+ 'left_val': ['a', 'b', 'c', 'd', 'e'],
+ 'right_val': [1, 1, 11, 11, 16]})
+
+ result = pd.merge_asof(left, right, on='a', by='b',
+ direction='nearest')
+ assert_frame_equal(result, expected)
+
+ def test_by_int(self):
+ # we specialize by type, so test that this is correct
+ df1 = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.020',
+ '20160525 13:30:00.030',
+ '20160525 13:30:00.040',
+ '20160525 13:30:00.050',
+ '20160525 13:30:00.060']),
+ 'key': [1, 2, 1, 3, 2],
+ 'value1': [1.1, 1.2, 1.3, 1.4, 1.5]},
+ columns=['time', 'key', 'value1'])
+
+ df2 = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.015',
+ '20160525 13:30:00.020',
+ '20160525 13:30:00.025',
+ '20160525 13:30:00.035',
+ '20160525 13:30:00.040',
+ '20160525 13:30:00.055',
+ '20160525 13:30:00.060',
+ '20160525 13:30:00.065']),
+ 'key': [2, 1, 1, 3, 2, 1, 2, 3],
+ 'value2': [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8]},
+ columns=['time', 'key', 'value2'])
+
+ result = pd.merge_asof(df1, df2, on='time', by='key')
+
+ expected = pd.DataFrame({
+ 'time': pd.to_datetime(['20160525 13:30:00.020',
+ '20160525 13:30:00.030',
+ '20160525 13:30:00.040',
+ '20160525 13:30:00.050',
+ '20160525 13:30:00.060']),
+ 'key': [1, 2, 1, 3, 2],
+ 'value1': [1.1, 1.2, 1.3, 1.4, 1.5],
+ 'value2': [2.2, 2.1, 2.3, 2.4, 2.7]},
+ columns=['time', 'key', 'value1', 'value2'])
+
+ assert_frame_equal(result, expected)
+
+ def test_on_float(self):
+ # mimics how to determine the minimum-price variation
+ df1 = pd.DataFrame({
+ 'price': [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078],
+ 'symbol': list("ABCDEFG")},
+ columns=['symbol', 'price'])
+
+ df2 = pd.DataFrame({
+ 'price': [0.0, 1.0, 100.0],
+ 'mpv': [0.0001, 0.01, 0.05]},
+ columns=['price', 'mpv'])
+
+ df1 = df1.sort_values('price').reset_index(drop=True)
+
+ result = pd.merge_asof(df1, df2, on='price')
+
+ expected = pd.DataFrame({
+ 'symbol': list("BGACEDF"),
+ 'price': [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90],
+ 'mpv': [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05]},
+ columns=['symbol', 'price', 'mpv'])
+
+ assert_frame_equal(result, expected)
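+
+ # Illustrative note (not part of the upstream test): with the default
+ # direction='backward', merge_asof treats df2 as a step function and
+ # assigns each price the 'mpv' of the largest breakpoint <= price.
+ # For instance, 5.01 falls in the [1.0, 100.0) step, so
+ #
+ # >>> pd.merge_asof(pd.DataFrame({'price': [5.01]}), df2, on='price')
+ #
+ # yields mpv 0.01, matching the expected frame above.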
+
+ def test_on_specialized_type(self, any_real_dtype):
+ # see gh-13936
+ dtype = np.dtype(any_real_dtype).type
+
+ df1 = pd.DataFrame({
+ "value": [5, 2, 25, 100, 78, 120, 79],
+ "symbol": list("ABCDEFG")},
+ columns=["symbol", "value"])
+ df1.value = dtype(df1.value)
+
+ df2 = pd.DataFrame({
+ "value": [0, 80, 120, 125],
+ "result": list("xyzw")},
+ columns=["value", "result"])
+ df2.value = dtype(df2.value)
+
+ df1 = df1.sort_values("value").reset_index(drop=True)
+ result = pd.merge_asof(df1, df2, on="value")
+
+ expected = pd.DataFrame(
+ {"symbol": list("BACEGDF"),
+ "value": [2, 5, 25, 78, 79, 100, 120],
+ "result": list("xxxxxyz")
+ }, columns=["symbol", "value", "result"])
+ expected.value = dtype(expected.value)
+
+ assert_frame_equal(result, expected)
+
+ def test_on_specialized_type_by_int(self, any_real_dtype):
+ # see gh-13936
+ dtype = np.dtype(any_real_dtype).type
+
+ df1 = pd.DataFrame({
+ "value": [5, 2, 25, 100, 78, 120, 79],
+ "key": [1, 2, 3, 2, 3, 1, 2],
+ "symbol": list("ABCDEFG")},
+ columns=["symbol", "key", "value"])
+ df1.value = dtype(df1.value)
+
+ df2 = pd.DataFrame({
+ "value": [0, 80, 120, 125],
+ "key": [1, 2, 2, 3],
+ "result": list("xyzw")},
+ columns=["value", "key", "result"])
+ df2.value = dtype(df2.value)
+
+ df1 = df1.sort_values("value").reset_index(drop=True)
+ result = pd.merge_asof(df1, df2, on="value", by="key")
+
+ expected = pd.DataFrame({
+ "symbol": list("BACEGDF"),
+ "key": [2, 1, 3, 3, 2, 2, 1],
+ "value": [2, 5, 25, 78, 79, 100, 120],
+ "result": [np.nan, "x", np.nan, np.nan, np.nan, "y", "x"]},
+ columns=["symbol", "key", "value", "result"])
+ expected.value = dtype(expected.value)
+
+ assert_frame_equal(result, expected)
+
+ def test_on_float_by_int(self):
+ # type specialize both "by" and "on" parameters
+ df1 = pd.DataFrame({
+ 'symbol': list("AAABBBCCC"),
+ 'exch': [1, 2, 3, 1, 2, 3, 1, 2, 3],
+ 'price': [3.26, 3.2599, 3.2598, 12.58, 12.59,
+ 12.5, 378.15, 378.2, 378.25]},
+ columns=['symbol', 'exch', 'price'])
+
+ df2 = pd.DataFrame({
+ 'exch': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+ 'price': [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0],
+ 'mpv': [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0]},
+ columns=['exch', 'price', 'mpv'])
+
+ df1 = df1.sort_values('price').reset_index(drop=True)
+ df2 = df2.sort_values('price').reset_index(drop=True)
+
+ result = pd.merge_asof(df1, df2, on='price', by='exch')
+
+ expected = pd.DataFrame({
+ 'symbol': list("AAABBBCCC"),
+ 'exch': [3, 2, 1, 3, 1, 2, 1, 2, 3],
+ 'price': [3.2598, 3.2599, 3.26, 12.5, 12.58,
+ 12.59, 378.15, 378.2, 378.25],
+ 'mpv': [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25]},
+ columns=['symbol', 'exch', 'price', 'mpv'])
+
+ assert_frame_equal(result, expected)
+
+ def test_merge_datatype_error(self):
+ """ Tests merge datatype mismatch error """
+ msg = r'merge keys \[0\] object and int64, must be the same type'
+
+ left = pd.DataFrame({'left_val': [1, 5, 10],
+ 'a': ['a', 'b', 'c']})
+ right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7],
+ 'a': [1, 2, 3, 6, 7]})
+
+ with pytest.raises(MergeError, match=msg):
+ merge_asof(left, right, on='a')
+
+ @pytest.mark.parametrize('func', [lambda x: x, lambda x: to_datetime(x)],
+ ids=['numeric', 'datetime'])
+ @pytest.mark.parametrize('side', ['left', 'right'])
+ def test_merge_on_nans(self, func, side):
+ # GH 23189
+ msg = "Merge keys contain null values on {} side".format(side)
+ nulls = func([1.0, 5.0, np.nan])
+ non_nulls = func([1.0, 5.0, 10.])
+ df_null = pd.DataFrame({'a': nulls, 'left_val': ['a', 'b', 'c']})
+ df = pd.DataFrame({'a': non_nulls, 'right_val': [1, 6, 11]})
+
+ with pytest.raises(ValueError, match=msg):
+ if side == 'left':
+ merge_asof(df_null, df, on='a')
+ else:
+ merge_asof(df, df_null, on='a')
+
+ def test_merge_by_col_tz_aware(self):
+ # GH 21184
+ left = pd.DataFrame(
+ {'by_col': pd.DatetimeIndex(['2018-01-01']).tz_localize('UTC'),
+ 'on_col': [2], 'values': ['a']})
+ right = pd.DataFrame(
+ {'by_col': pd.DatetimeIndex(['2018-01-01']).tz_localize('UTC'),
+ 'on_col': [1], 'values': ['b']})
+ result = pd.merge_asof(left, right, by='by_col', on='on_col')
+ expected = pd.DataFrame([
+ [pd.Timestamp('2018-01-01', tz='UTC'), 2, 'a', 'b']
+ ], columns=['by_col', 'on_col', 'values_x', 'values_y'])
+ assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_index_as_string.py b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_index_as_string.py
new file mode 100644
index 00000000000..12d9483af87
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_index_as_string.py
@@ -0,0 +1,177 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame
+from pandas.util.testing import assert_frame_equal
+
+
+@pytest.fixture
+def df1():
+ return DataFrame(dict(
+ outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
+ inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
+ v1=np.linspace(0, 1, 11)))
+
+
+@pytest.fixture
+def df2():
+ return DataFrame(dict(
+ outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
+ inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
+ v2=np.linspace(10, 11, 12)))
+
+
+@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']])
+def left_df(request, df1):
+ """ Construct left test DataFrame with specified levels
+ (any of 'outer', 'inner', and 'v1')"""
+ levels = request.param
+ if levels:
+ df1 = df1.set_index(levels)
+
+ return df1
+
+
+@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']])
+def right_df(request, df2):
+ """ Construct right test DataFrame with specified levels
+ (any of 'outer', 'inner', and 'v2')"""
+ levels = request.param
+
+ if levels:
+ df2 = df2.set_index(levels)
+
+ return df2
+
+
+def compute_expected(df_left, df_right,
+ on=None, left_on=None, right_on=None, how=None):
+ """
+ Compute the expected merge result for the test case.
+
+ This method computes the expected result of merging two DataFrames on
+ a combination of their columns and index levels. It does so by
+ explicitly dropping/resetting their named index levels, performing a
+ merge on their columns, and then finally restoring the appropriate
+ index in the result.
+
+ Parameters
+ ----------
+ df_left : DataFrame
+ The left DataFrame (may have zero or more named index levels)
+ df_right : DataFrame
+ The right DataFrame (may have zero or more named index levels)
+ on : list of str
+ The on parameter to the merge operation
+ left_on : list of str
+ The left_on parameter to the merge operation
+ right_on : list of str
+ The right_on parameter to the merge operation
+ how : str
+ The how parameter to the merge operation
+
+ Returns
+ -------
+ DataFrame
+ The expected merge result
+ """
+
+ # Handle on param if specified
+ if on is not None:
+ left_on, right_on = on, on
+
+ # Compute input named index levels
+ left_levels = [n for n in df_left.index.names if n is not None]
+ right_levels = [n for n in df_right.index.names if n is not None]
+
+ # Compute output named index levels
+ output_levels = [i for i in left_on
+ if i in right_levels and i in left_levels]
+
+ # Drop index levels that aren't involved in the merge
+ drop_left = [n for n in left_levels if n not in left_on]
+ if drop_left:
+ df_left = df_left.reset_index(drop_left, drop=True)
+
+ drop_right = [n for n in right_levels if n not in right_on]
+ if drop_right:
+ df_right = df_right.reset_index(drop_right, drop=True)
+
+ # Convert remaining index levels to columns
+ reset_left = [n for n in left_levels if n in left_on]
+ if reset_left:
+ df_left = df_left.reset_index(level=reset_left)
+
+ reset_right = [n for n in right_levels if n in right_on]
+ if reset_right:
+ df_right = df_right.reset_index(level=reset_right)
+
+ # Perform merge
+ expected = df_left.merge(df_right,
+ left_on=left_on,
+ right_on=right_on,
+ how=how)
+
+ # Restore index levels
+ if output_levels:
+ expected = expected.set_index(output_levels)
+
+ return expected
+
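+# Illustrative sketch (not part of the upstream file): when both frames
+# are indexed on ['outer'] and the merge is on=['outer'], the helper
+# above reduces to
+#
+# >>> expected = (df_left.reset_index()
+# ... .merge(df_right.reset_index(), on=['outer'], how='inner')
+# ... .set_index(['outer']))
+#
+# i.e. reset the index levels taking part in the merge, merge on plain
+# columns, then restore the levels common to both sides as the index.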
+
+@pytest.mark.parametrize('on,how',
+ [(['outer'], 'inner'),
+ (['inner'], 'left'),
+ (['outer', 'inner'], 'right'),
+ (['inner', 'outer'], 'outer')])
+def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
+
+ # Construct expected result
+ expected = compute_expected(left_df, right_df, on=on, how=how)
+
+ # Perform merge
+ result = left_df.merge(right_df, on=on, how=how)
+ assert_frame_equal(result, expected, check_like=True)
+
+
+@pytest.mark.parametrize('left_on,right_on,how',
+ [(['outer'], ['outer'], 'inner'),
+ (['inner'], ['inner'], 'right'),
+ (['outer', 'inner'], ['outer', 'inner'], 'left'),
+ (['inner', 'outer'], ['inner', 'outer'], 'outer')])
+def test_merge_indexes_and_columns_lefton_righton(
+ left_df, right_df, left_on, right_on, how):
+
+ # Construct expected result
+ expected = compute_expected(left_df, right_df,
+ left_on=left_on,
+ right_on=right_on,
+ how=how)
+
+ # Perform merge
+ result = left_df.merge(right_df,
+ left_on=left_on, right_on=right_on, how=how)
+ assert_frame_equal(result, expected, check_like=True)
+
+
+@pytest.mark.parametrize('left_index',
+ ['inner', ['inner', 'outer']])
+def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
+
+ # Construct left_df
+ left_df = df1.set_index(left_index)
+
+ # Construct right_df
+ right_df = df2.set_index(['outer', 'inner'])
+
+ # Result
+ expected = (left_df.reset_index()
+ .join(right_df, on=['outer', 'inner'], how=join_type,
+ lsuffix='_x', rsuffix='_y')
+ .set_index(left_index))
+
+ # Perform join
+ result = left_df.join(right_df, on=['outer', 'inner'], how=join_type,
+ lsuffix='_x', rsuffix='_y')
+
+ assert_frame_equal(result, expected, check_like=True)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_ordered.py b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_ordered.py
new file mode 100644
index 00000000000..414f46cdb29
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_merge_ordered.py
@@ -0,0 +1,103 @@
+from numpy import nan
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, merge_ordered
+from pandas.util.testing import assert_frame_equal
+
+
+class TestMergeOrdered(object):
+
+ def setup_method(self, method):
+ self.left = DataFrame({'key': ['a', 'c', 'e'],
+ 'lvalue': [1, 2., 3]})
+
+ self.right = DataFrame({'key': ['b', 'c', 'd', 'f'],
+ 'rvalue': [1, 2, 3., 4]})
+
+ def test_basic(self):
+ result = merge_ordered(self.left, self.right, on='key')
+ expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
+ 'lvalue': [1, nan, 2, nan, 3, nan],
+ 'rvalue': [nan, 1, 2, 3, nan, 4]})
+
+ assert_frame_equal(result, expected)
+
+ def test_ffill(self):
+ result = merge_ordered(
+ self.left, self.right, on='key', fill_method='ffill')
+ expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
+ 'lvalue': [1., 1, 2, 2, 3, 3.],
+ 'rvalue': [nan, 1, 2, 3, 3, 4]})
+ assert_frame_equal(result, expected)
+
+ def test_multigroup(self):
+ left = pd.concat([self.left, self.left], ignore_index=True)
+
+ left['group'] = ['a'] * 3 + ['b'] * 3
+
+ result = merge_ordered(left, self.right, on='key', left_by='group',
+ fill_method='ffill')
+ expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2,
+ 'lvalue': [1., 1, 2, 2, 3, 3.] * 2,
+ 'rvalue': [nan, 1, 2, 3, 3, 4] * 2})
+ expected['group'] = ['a'] * 6 + ['b'] * 6
+
+ assert_frame_equal(result, expected.loc[:, result.columns])
+
+ result2 = merge_ordered(self.right, left, on='key', right_by='group',
+ fill_method='ffill')
+ assert_frame_equal(result, result2.loc[:, result.columns])
+
+ result = merge_ordered(left, self.right, on='key', left_by='group')
+ assert result['group'].notna().all()
+
+ def test_merge_type(self):
+ class NotADataFrame(DataFrame):
+
+ @property
+ def _constructor(self):
+ return NotADataFrame
+
+ nad = NotADataFrame(self.left)
+ result = nad.merge(self.right, on='key')
+
+ assert isinstance(result, NotADataFrame)
+
+ def test_empty_sequence_concat(self):
+ # GH 9157
+ empty_pat = "[Nn]o objects"
+ none_pat = "objects.*None"
+ test_cases = [
+ ((), empty_pat),
+ ([], empty_pat),
+ ({}, empty_pat),
+ ([None], none_pat),
+ ([None, None], none_pat)
+ ]
+ for df_seq, pattern in test_cases:
+ with pytest.raises(ValueError, match=pattern):
+ pd.concat(df_seq)
+
+ pd.concat([pd.DataFrame()])
+ pd.concat([None, pd.DataFrame()])
+ pd.concat([pd.DataFrame(), None])
+
+ def test_doc_example(self):
+ left = DataFrame({'group': list('aaabbb'),
+ 'key': ['a', 'c', 'e', 'a', 'c', 'e'],
+ 'lvalue': [1, 2, 3] * 2,
+ })
+
+ right = DataFrame({'key': ['b', 'c', 'd'],
+ 'rvalue': [1, 2, 3]})
+
+ result = merge_ordered(left, right, fill_method='ffill',
+ left_by='group')
+
+ expected = DataFrame({'group': list('aaaaabbbbb'),
+ 'key': ['a', 'b', 'c', 'd', 'e'] * 2,
+ 'lvalue': [1, 1, 2, 2, 3] * 2,
+ 'rvalue': [nan, 1, 2, 3, 3] * 2})
+
+ assert_frame_equal(result, expected)
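+
+ # Illustrative note (not part of the upstream test): merge_ordered with
+ # left_by='group' splits the left frame by 'group', performs an ordered
+ # outer merge on 'key' within each piece, and fill_method='ffill'
+ # forward-fills the gaps this creates. That is why, in the expected
+ # frame above, group 'a' / key 'b' carries lvalue 1 (filled forward
+ # from key 'a') alongside the right-hand rvalue 1.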
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_multi.py b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_multi.py
new file mode 100644
index 00000000000..7e8b5b1120b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/merge/test_multi.py
@@ -0,0 +1,668 @@
+# pylint: disable=E1103
+
+from collections import OrderedDict
+
+import numpy as np
+from numpy import nan
+from numpy.random import randn
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series
+from pandas.core.reshape.concat import concat
+from pandas.core.reshape.merge import merge
+import pandas.util.testing as tm
+
+
+@pytest.fixture
+def left():
+ """left dataframe (not multi-indexed) for multi-index join tests"""
+ # a little relevant example with NAs
+ key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
+ 'qux', 'snap']
+ key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
+ 'three', 'one']
+
+ data = np.random.randn(len(key1))
+ return DataFrame({'key1': key1, 'key2': key2, 'data': data})
+
+
+@pytest.fixture
+def right():
+ """right dataframe (multi-indexed) for multi-index join tests"""
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
+ ['one', 'two', 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['key1', 'key2'])
+
+ return DataFrame(np.random.randn(10, 3), index=index,
+ columns=['j_one', 'j_two', 'j_three'])
+
+
+@pytest.fixture
+def left_multi():
+ return (
+ DataFrame(
+ dict(Origin=['A', 'A', 'B', 'B', 'C'],
+ Destination=['A', 'B', 'A', 'C', 'A'],
+ Period=['AM', 'AM', 'IP', 'AM', 'OP'],
+ TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
+ Trips=[1987, 3647, 2470, 4296, 4444]),
+ columns=['Origin', 'Destination', 'Period',
+ 'TripPurp', 'Trips'])
+ .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
+
+
+@pytest.fixture
+def right_multi():
+ return (
+ DataFrame(
+ dict(Origin=['A', 'A', 'B', 'B', 'C', 'C', 'E'],
+ Destination=['A', 'B', 'A', 'B', 'A', 'B', 'F'],
+ Period=['AM', 'AM', 'IP', 'AM', 'OP', 'IP', 'AM'],
+ LinkType=['a', 'b', 'c', 'b', 'a', 'b', 'a'],
+ Distance=[100, 80, 90, 80, 75, 35, 55]),
+ columns=['Origin', 'Destination', 'Period',
+ 'LinkType', 'Distance'])
+ .set_index(['Origin', 'Destination', 'Period', 'LinkType']))
+
+
+@pytest.fixture
+def on_cols_multi():
+ return ['Origin', 'Destination', 'Period']
+
+
+@pytest.fixture
+def idx_cols_multi():
+ return ['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType']
+
+
+class TestMergeMulti(object):
+
+ def setup_method(self):
+ self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
+ ['one', 'two', 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
+ columns=['j_one', 'j_two', 'j_three'])
+
+ # a little relevant example with NAs
+ key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
+ 'qux', 'snap']
+ key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
+ 'three', 'one']
+
+ data = np.random.randn(len(key1))
+ self.data = DataFrame({'key1': key1, 'key2': key2,
+ 'data': data})
+
+ def test_merge_on_multikey(self, left, right, join_type):
+ on_cols = ['key1', 'key2']
+ result = (left.join(right, on=on_cols, how=join_type)
+ .reset_index(drop=True))
+
+ expected = pd.merge(left, right.reset_index(),
+ on=on_cols, how=join_type)
+
+ tm.assert_frame_equal(result, expected)
+
+ result = (left.join(right, on=on_cols, how=join_type, sort=True)
+ .reset_index(drop=True))
+
+ expected = pd.merge(left, right.reset_index(),
+ on=on_cols, how=join_type, sort=True)
+
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("sort", [False, True])
+ def test_left_join_multi_index(self, left, right, sort):
+ icols = ['1st', '2nd', '3rd']
+
+ def bind_cols(df):
+ iord = lambda a: 0 if a != a else ord(a)
+ f = lambda ts: ts.map(iord) - ord('a')
+ return (f(df['1st']) + f(df['3rd']) * 1e2 +
+ df['2nd'].fillna(0) * 1e4)
+
+ def run_asserts(left, right, sort):
+ res = left.join(right, on=icols, how='left', sort=sort)
+
+ assert len(left) < len(res) + 1
+ assert not res['4th'].isna().any()
+ assert not res['5th'].isna().any()
+
+ tm.assert_series_equal(
+ res['4th'], - res['5th'], check_names=False)
+ result = bind_cols(res.iloc[:, :-2])
+ tm.assert_series_equal(res['4th'], result, check_names=False)
+ assert result.name is None
+
+ if sort:
+ tm.assert_frame_equal(
+ res, res.sort_values(icols, kind='mergesort'))
+
+ out = merge(left, right.reset_index(), on=icols,
+ sort=sort, how='left')
+
+ res.index = np.arange(len(res))
+ tm.assert_frame_equal(out, res)
+
+ lc = list(map(chr, np.arange(ord('a'), ord('z') + 1)))
+ left = DataFrame(np.random.choice(lc, (5000, 2)),
+ columns=['1st', '3rd'])
+ left.insert(1, '2nd', np.random.randint(0, 1000, len(left)))
+
+ i = np.random.permutation(len(left))
+ right = left.iloc[i].copy()
+
+ left['4th'] = bind_cols(left)
+ right['5th'] = - bind_cols(right)
+ right.set_index(icols, inplace=True)
+
+ run_asserts(left, right, sort)
+
+ # inject some nulls
+ left.loc[1::23, '1st'] = np.nan
+ left.loc[2::37, '2nd'] = np.nan
+ left.loc[3::43, '3rd'] = np.nan
+ left['4th'] = bind_cols(left)
+
+ i = np.random.permutation(len(left))
+ right = left.iloc[i, :-1]
+ right['5th'] = - bind_cols(right)
+ right.set_index(icols, inplace=True)
+
+ run_asserts(left, right, sort)
+
+ @pytest.mark.parametrize("sort", [False, True])
+ def test_merge_right_vs_left(self, left, right, sort):
+ # compare left vs right merge with multikey
+ on_cols = ['key1', 'key2']
+ merged_left_right = left.merge(right,
+ left_on=on_cols, right_index=True,
+ how='left', sort=sort)
+
+ merge_right_left = right.merge(left,
+ right_on=on_cols, left_index=True,
+ how='right', sort=sort)
+
+ # Reorder columns
+ merge_right_left = merge_right_left[merged_left_right.columns]
+
+ tm.assert_frame_equal(merged_left_right, merge_right_left)
+
+ def test_compress_group_combinations(self):
+
+ # ~ 40000000 possible unique groups
+ key1 = tm.rands_array(10, 10000)
+ key1 = np.tile(key1, 2)
+ key2 = key1[::-1]
+
+ df = DataFrame({'key1': key1, 'key2': key2,
+ 'value1': np.random.randn(20000)})
+
+ df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2],
+ 'value2': np.random.randn(10000)})
+
+ # just to hit the label compression code path
+ merge(df, df2, how='outer')
+
+ def test_left_join_index_preserve_order(self):
+
+ on_cols = ['k1', 'k2']
+ left = DataFrame({'k1': [0, 1, 2] * 8,
+ 'k2': ['foo', 'bar'] * 12,
+ 'v': np.array(np.arange(24), dtype=np.int64)})
+
+ index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
+ right = DataFrame({'v2': [5, 7]}, index=index)
+
+ result = left.join(right, on=on_cols)
+
+ expected = left.copy()
+ expected['v2'] = np.nan
+ expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
+ expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
+
+ tm.assert_frame_equal(result, expected)
+
+ result.sort_values(on_cols, kind='mergesort', inplace=True)
+ expected = left.join(right, on=on_cols, sort=True)
+
+ tm.assert_frame_equal(result, expected)
+
+ # test join with multi dtypes blocks
+ left = DataFrame({'k1': [0, 1, 2] * 8,
+ 'k2': ['foo', 'bar'] * 12,
+ 'k3': np.array([0, 1, 2] * 8, dtype=np.float32),
+ 'v': np.array(np.arange(24), dtype=np.int32)})
+
+ index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
+ right = DataFrame({'v2': [5, 7]}, index=index)
+
+ result = left.join(right, on=on_cols)
+
+ expected = left.copy()
+ expected['v2'] = np.nan
+ expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
+ expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
+
+ tm.assert_frame_equal(result, expected)
+
+ result = result.sort_values(on_cols, kind='mergesort')
+ expected = left.join(right, on=on_cols, sort=True)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_left_join_index_multi_match_multiindex(self):
+ left = DataFrame([
+ ['X', 'Y', 'C', 'a'],
+ ['W', 'Y', 'C', 'e'],
+ ['V', 'Q', 'A', 'h'],
+ ['V', 'R', 'D', 'i'],
+ ['X', 'Y', 'D', 'b'],
+ ['X', 'Y', 'A', 'c'],
+ ['W', 'Q', 'B', 'f'],
+ ['W', 'R', 'C', 'g'],
+ ['V', 'Y', 'C', 'j'],
+ ['X', 'Y', 'B', 'd']],
+ columns=['cola', 'colb', 'colc', 'tag'],
+ index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8])
+
+ right = (DataFrame([
+ ['W', 'R', 'C', 0],
+ ['W', 'Q', 'B', 3],
+ ['W', 'Q', 'B', 8],
+ ['X', 'Y', 'A', 1],
+ ['X', 'Y', 'A', 4],
+ ['X', 'Y', 'B', 5],
+ ['X', 'Y', 'C', 6],
+ ['X', 'Y', 'C', 9],
+ ['X', 'Q', 'C', -6],
+ ['X', 'R', 'C', -9],
+ ['V', 'Y', 'C', 7],
+ ['V', 'R', 'D', 2],
+ ['V', 'R', 'D', -1],
+ ['V', 'Q', 'A', -3]],
+ columns=['col1', 'col2', 'col3', 'val'])
+ .set_index(['col1', 'col2', 'col3']))
+
+ result = left.join(right, on=['cola', 'colb', 'colc'], how='left')
+
+ expected = DataFrame([
+ ['X', 'Y', 'C', 'a', 6],
+ ['X', 'Y', 'C', 'a', 9],
+ ['W', 'Y', 'C', 'e', nan],
+ ['V', 'Q', 'A', 'h', -3],
+ ['V', 'R', 'D', 'i', 2],
+ ['V', 'R', 'D', 'i', -1],
+ ['X', 'Y', 'D', 'b', nan],
+ ['X', 'Y', 'A', 'c', 1],
+ ['X', 'Y', 'A', 'c', 4],
+ ['W', 'Q', 'B', 'f', 3],
+ ['W', 'Q', 'B', 'f', 8],
+ ['W', 'R', 'C', 'g', 0],
+ ['V', 'Y', 'C', 'j', 7],
+ ['X', 'Y', 'B', 'd', 5]],
+ columns=['cola', 'colb', 'colc', 'tag', 'val'],
+ index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8])
+
+ tm.assert_frame_equal(result, expected)
+
+ result = left.join(right, on=['cola', 'colb', 'colc'],
+ how='left', sort=True)
+
+ expected = expected.sort_values(['cola', 'colb', 'colc'],
+ kind='mergesort')
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_left_join_index_multi_match(self):
+ left = DataFrame([
+ ['c', 0],
+ ['b', 1],
+ ['a', 2],
+ ['b', 3]],
+ columns=['tag', 'val'],
+ index=[2, 0, 1, 3])
+
+ right = (DataFrame([
+ ['a', 'v'],
+ ['c', 'w'],
+ ['c', 'x'],
+ ['d', 'y'],
+ ['a', 'z'],
+ ['c', 'r'],
+ ['e', 'q'],
+ ['c', 's']],
+ columns=['tag', 'char'])
+ .set_index('tag'))
+
+ result = left.join(right, on='tag', how='left')
+
+ expected = DataFrame([
+ ['c', 0, 'w'],
+ ['c', 0, 'x'],
+ ['c', 0, 'r'],
+ ['c', 0, 's'],
+ ['b', 1, nan],
+ ['a', 2, 'v'],
+ ['a', 2, 'z'],
+ ['b', 3, nan]],
+ columns=['tag', 'val', 'char'],
+ index=[2, 2, 2, 2, 0, 1, 1, 3])
+
+ tm.assert_frame_equal(result, expected)
+
+ result = left.join(right, on='tag', how='left', sort=True)
+ expected2 = expected.sort_values('tag', kind='mergesort')
+
+ tm.assert_frame_equal(result, expected2)
+
+ # GH7331 - maintain left frame order in left merge
+ result = merge(left, right.reset_index(), how='left', on='tag')
+ expected.index = np.arange(len(expected))
+ tm.assert_frame_equal(result, expected)
+
+ def test_left_merge_na_buglet(self):
+ left = DataFrame({'id': list('abcde'), 'v1': randn(5),
+ 'v2': randn(5), 'dummy': list('abcde'),
+ 'v3': randn(5)},
+ columns=['id', 'v1', 'v2', 'dummy', 'v3'])
+ right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan],
+ 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]})
+
+ result = merge(left, right, on='id', how='left')
+
+ rdf = right.drop(['id'], axis=1)
+ expected = left.join(rdf)
+ tm.assert_frame_equal(result, expected)
+
+ def test_merge_na_keys(self):
+ data = [[1950, "A", 1.5],
+ [1950, "B", 1.5],
+ [1955, "B", 1.5],
+ [1960, "B", np.nan],
+ [1970, "B", 4.],
+ [1950, "C", 4.],
+ [1960, "C", np.nan],
+ [1965, "C", 3.],
+ [1970, "C", 4.]]
+
+ frame = DataFrame(data, columns=["year", "panel", "data"])
+
+ other_data = [[1960, 'A', np.nan],
+ [1970, 'A', np.nan],
+ [1955, 'A', np.nan],
+ [1965, 'A', np.nan],
+ [1965, 'B', np.nan],
+ [1955, 'C', np.nan]]
+ other = DataFrame(other_data, columns=['year', 'panel', 'data'])
+
+ result = frame.merge(other, how='outer')
+
+ expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
+ expected = expected.replace(-999, np.nan)
+
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
+ def test_merge_datetime_index(self, klass):
+ # see gh-19038
+ df = DataFrame([1, 2, 3],
+ ["2016-01-01", "2017-01-01", "2018-01-01"],
+ columns=["a"])
+ df.index = pd.to_datetime(df.index)
+ on_vector = df.index.year
+
+ if klass is not None:
+ on_vector = klass(on_vector)
+
+ expected = DataFrame(
+ OrderedDict([
+ ("a", [1, 2, 3]),
+ ("key_1", [2016, 2017, 2018]),
+ ])
+ )
+
+ result = df.merge(df, on=["a", on_vector], how="inner")
+ tm.assert_frame_equal(result, expected)
+
+ expected = DataFrame(
+ OrderedDict([
+ ("key_0", [2016, 2017, 2018]),
+ ("a_x", [1, 2, 3]),
+ ("a_y", [1, 2, 3]),
+ ])
+ )
+
+ result = df.merge(df, on=[df.index.year], how="inner")
+ tm.assert_frame_equal(result, expected)
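+
+ # Illustrative note (not part of the upstream test): when an unnamed
+ # array-like is passed as a join key, pandas labels the resulting key
+ # column 'key_<position>'. Above, on=["a", on_vector] makes the year
+ # vector the second key, hence 'key_1', while on=[df.index.year] alone
+ # produces 'key_0'.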
+
+ def test_join_multi_levels(self):
+
+ # GH 3662
+ # merge multi-levels
+ household = (
+ DataFrame(
+ dict(household_id=[1, 2, 3],
+ male=[0, 1, 0],
+ wealth=[196087.3, 316478.7, 294750]),
+ columns=['household_id', 'male', 'wealth'])
+ .set_index('household_id'))
+ portfolio = (
+ DataFrame(
+ dict(household_id=[1, 2, 2, 3, 3, 3, 4],
+ asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
+ "gb00b03mlx29", "lu0197800237", "nl0000289965",
+ np.nan],
+ name=["ABN Amro", "Robeco", "Royal Dutch Shell",
+ "Royal Dutch Shell",
+ "AAB Eastern Europe Equity Fund",
+ "Postbank BioTech Fonds", np.nan],
+ share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
+ columns=['household_id', 'asset_id', 'name', 'share'])
+ .set_index(['household_id', 'asset_id']))
+ result = household.join(portfolio, how='inner')
+ expected = (
+ DataFrame(
+ dict(male=[0, 1, 1, 0, 0, 0],
+ wealth=[196087.3, 316478.7, 316478.7,
+ 294750.0, 294750.0, 294750.0],
+ name=['ABN Amro', 'Robeco', 'Royal Dutch Shell',
+ 'Royal Dutch Shell',
+ 'AAB Eastern Europe Equity Fund',
+ 'Postbank BioTech Fonds'],
+ share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
+ household_id=[1, 2, 2, 3, 3, 3],
+ asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29',
+ 'gb00b03mlx29', 'lu0197800237',
+ 'nl0000289965']))
+ .set_index(['household_id', 'asset_id'])
+ .reindex(columns=['male', 'wealth', 'name', 'share']))
+ tm.assert_frame_equal(result, expected)
+
+ # equivalency
+ result = (merge(household.reset_index(), portfolio.reset_index(),
+ on=['household_id'], how='inner')
+ .set_index(['household_id', 'asset_id']))
+ tm.assert_frame_equal(result, expected)
+
+ result = household.join(portfolio, how='outer')
+ expected = (concat([
+ expected,
+ (DataFrame(
+ dict(share=[1.00]),
+ index=MultiIndex.from_tuples(
+ [(4, np.nan)],
+ names=['household_id', 'asset_id'])))
+ ], axis=0, sort=True).reindex(columns=expected.columns))
+ tm.assert_frame_equal(result, expected)
+
+ # invalid cases
+ household.index.name = 'foo'
+
+ with pytest.raises(ValueError):
+ household.join(portfolio, how='inner')
+
+ portfolio2 = portfolio.copy()
+ portfolio2.index.set_names(['household_id', 'foo'])
+
+ with pytest.raises(ValueError):
+ portfolio2.join(portfolio, how='inner')
+
+ def test_join_multi_levels2(self):
+
+ # some more advanced merges
+ # GH6360
+ household = (
+ DataFrame(
+ dict(household_id=[1, 2, 2, 3, 3, 3, 4],
+ asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
+ "gb00b03mlx29", "lu0197800237", "nl0000289965",
+ np.nan],
+ share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
+ columns=['household_id', 'asset_id', 'share'])
+ .set_index(['household_id', 'asset_id']))
+
+ log_return = DataFrame(dict(
+ asset_id=["gb00b03mlx29", "gb00b03mlx29",
+ "gb00b03mlx29", "lu0197800237", "lu0197800237"],
+ t=[233, 234, 235, 180, 181],
+ log_return=[.09604978, -.06524096, .03532373, .03025441, .036997]
+ )).set_index(["asset_id", "t"])
+
+ expected = (
+ DataFrame(dict(
+ household_id=[2, 2, 2, 3, 3, 3, 3, 3],
+ asset_id=["gb00b03mlx29", "gb00b03mlx29",
+ "gb00b03mlx29", "gb00b03mlx29",
+ "gb00b03mlx29", "gb00b03mlx29",
+ "lu0197800237", "lu0197800237"],
+ t=[233, 234, 235, 233, 234, 235, 180, 181],
+ share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
+ log_return=[.09604978, -.06524096, .03532373,
+ .09604978, -.06524096, .03532373,
+ .03025441, .036997]
+ ))
+ .set_index(["household_id", "asset_id", "t"])
+ .reindex(columns=['share', 'log_return']))
+
+ # this is the equivalency
+ result = (merge(household.reset_index(), log_return.reset_index(),
+ on=['asset_id'], how='inner')
+ .set_index(['household_id', 'asset_id', 't']))
+ tm.assert_frame_equal(result, expected)
+
+ expected = (
+ DataFrame(dict(
+ household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
+ asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
+ "gb00b03mlx29", "gb00b03mlx29",
+ "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
+ "lu0197800237", "lu0197800237",
+ "nl0000289965", None],
+ t=[None, None, 233, 234, 235, 233, 234,
+ 235, 180, 181, None, None],
+ share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15,
+ 0.15, 0.15, 0.6, 0.6, 0.25, 1.0],
+ log_return=[None, None, .09604978, -.06524096, .03532373,
+ .09604978, -.06524096, .03532373,
+ .03025441, .036997, None, None]
+ ))
+ .set_index(["household_id", "asset_id", "t"])
+ .reindex(columns=['share', 'log_return']))
+
+ result = (merge(household.reset_index(), log_return.reset_index(),
+ on=['asset_id'], how='outer')
+ .set_index(['household_id', 'asset_id', 't']))
+
+ tm.assert_frame_equal(result, expected)
+
+
+class TestJoinMultiMulti(object):
+
+ def test_join_multi_multi(self, left_multi, right_multi, join_type,
+ on_cols_multi, idx_cols_multi):
+ # Multi-index join tests
+ expected = (pd.merge(left_multi.reset_index(),
+ right_multi.reset_index(),
+ how=join_type, on=on_cols_multi).
+ set_index(idx_cols_multi).sort_index())
+
+ result = left_multi.join(right_multi, how=join_type).sort_index()
+ tm.assert_frame_equal(result, expected)
+
+ def test_join_multi_empty_frames(self, left_multi, right_multi, join_type,
+ on_cols_multi, idx_cols_multi):
+
+ left_multi = left_multi.drop(columns=left_multi.columns)
+ right_multi = right_multi.drop(columns=right_multi.columns)
+
+ expected = (pd.merge(left_multi.reset_index(),
+ right_multi.reset_index(),
+ how=join_type, on=on_cols_multi)
+ .set_index(idx_cols_multi).sort_index())
+
+ result = left_multi.join(right_multi, how=join_type).sort_index()
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
+ def test_merge_datetime_index(self, box):
+ # see gh-19038
+ df = DataFrame([1, 2, 3],
+ ["2016-01-01", "2017-01-01", "2018-01-01"],
+ columns=["a"])
+ df.index = pd.to_datetime(df.index)
+ on_vector = df.index.year
+
+ if box is not None:
+ on_vector = box(on_vector)
+
+ expected = DataFrame(
+ OrderedDict([
+ ("a", [1, 2, 3]),
+ ("key_1", [2016, 2017, 2018]),
+ ])
+ )
+
+ result = df.merge(df, on=["a", on_vector], how="inner")
+ tm.assert_frame_equal(result, expected)
+
+ expected = DataFrame(
+ OrderedDict([
+ ("key_0", [2016, 2017, 2018]),
+ ("a_x", [1, 2, 3]),
+ ("a_y", [1, 2, 3]),
+ ])
+ )
+
+ result = df.merge(df, on=[df.index.year], how="inner")
+ tm.assert_frame_equal(result, expected)
+
+ def test_single_common_level(self):
+ index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'),
+ ('K1', 'X2')],
+ names=['key', 'X'])
+
+ left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
+ 'B': ['B0', 'B1', 'B2']},
+ index=index_left)
+
+ index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
+ ('K2', 'Y2'), ('K2', 'Y3')],
+ names=['key', 'Y'])
+
+ right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
+ 'D': ['D0', 'D1', 'D2', 'D3']},
+ index=index_right)
+
+ result = left.join(right)
+ expected = (pd.merge(left.reset_index(), right.reset_index(),
+ on=['key'], how='inner')
+ .set_index(['key', 'X', 'Y']))
+
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/test_concat.py b/contrib/python/pandas/py2/pandas/tests/reshape/test_concat.py
new file mode 100644
index 00000000000..ec6123bae32
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/test_concat.py
@@ -0,0 +1,2600 @@
+from collections import deque
+import datetime as dt
+from datetime import datetime
+from decimal import Decimal
+from itertools import combinations
+from warnings import catch_warnings, simplefilter
+
+import dateutil
+import numpy as np
+from numpy.random import randn
+import pytest
+
+from pandas.compat import PY2, Iterable, StringIO, iteritems
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Panel, Series,
+ Timestamp, concat, date_range, isna, read_csv)
+from pandas.tests.extension.decimal import to_decimal
+from pandas.util import testing as tm
+from pandas.util.testing import assert_frame_equal, makeCustomDataframe as mkdf
+
+
+@pytest.fixture(params=[True, False])
+def sort(request):
+ """Boolean sort keyword for concat and DataFrame.append."""
+ return request.param
+
+
+@pytest.fixture(params=[True, False, None])
+def sort_with_none(request):
+ """Boolean sort keyword for concat and DataFrame.append.
+
+ Includes the default of None
+ """
+ # TODO: Replace with sort once keyword changes.
+ return request.param
+
+
+class ConcatenateBase(object):
+
+ def setup_method(self, method):
+ self.frame = DataFrame(tm.getSeriesData())
+ self.mixed_frame = self.frame.copy()
+ self.mixed_frame['foo'] = 'bar'
+
+
+class TestConcatAppendCommon(ConcatenateBase):
+
+ """
+ Test common dtype coercion rules between concat and append.
+ """
+
+ def setup_method(self, method):
+
+ dt_data = [pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-03')]
+ tz_data = [pd.Timestamp('2011-01-01', tz='US/Eastern'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.Timestamp('2011-01-03', tz='US/Eastern')]
+
+ td_data = [pd.Timedelta('1 days'),
+ pd.Timedelta('2 days'),
+ pd.Timedelta('3 days')]
+
+ period_data = [pd.Period('2011-01', freq='M'),
+ pd.Period('2011-02', freq='M'),
+ pd.Period('2011-03', freq='M')]
+
+ self.data = {'bool': [True, False, True],
+ 'int64': [1, 2, 3],
+ 'float64': [1.1, np.nan, 3.3],
+ 'category': pd.Categorical(['X', 'Y', 'Z']),
+ 'object': ['a', 'b', 'c'],
+ 'datetime64[ns]': dt_data,
+ 'datetime64[ns, US/Eastern]': tz_data,
+ 'timedelta64[ns]': td_data,
+ 'period[M]': period_data}
+
+ def _check_expected_dtype(self, obj, label):
+ """
+ Check whether obj has expected dtype depending on label
+ considering not-supported dtypes
+ """
+ if isinstance(obj, pd.Index):
+ if label == 'bool':
+ assert obj.dtype == 'object'
+ else:
+ assert obj.dtype == label
+ elif isinstance(obj, pd.Series):
+ if label.startswith('period'):
+ assert obj.dtype == 'Period[M]'
+ else:
+ assert obj.dtype == label
+ else:
+ raise ValueError
+
+ def test_dtypes(self):
+ # to confirm test case covers intended dtypes
+ for typ, vals in iteritems(self.data):
+ self._check_expected_dtype(pd.Index(vals), typ)
+ self._check_expected_dtype(pd.Series(vals), typ)
+
+ def test_concatlike_same_dtypes(self):
+ # GH 13660
+ for typ1, vals1 in iteritems(self.data):
+
+ vals2 = vals1
+ vals3 = vals1
+
+ if typ1 == 'category':
+ exp_data = pd.Categorical(list(vals1) + list(vals2))
+ exp_data3 = pd.Categorical(list(vals1) + list(vals2) +
+ list(vals3))
+ else:
+ exp_data = vals1 + vals2
+ exp_data3 = vals1 + vals2 + vals3
+
+ # ----- Index ----- #
+
+ # index.append
+ res = pd.Index(vals1).append(pd.Index(vals2))
+ exp = pd.Index(exp_data)
+ tm.assert_index_equal(res, exp)
+
+ # 3 elements
+ res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)])
+ exp = pd.Index(exp_data3)
+ tm.assert_index_equal(res, exp)
+
+ # index.append name mismatch
+ i1 = pd.Index(vals1, name='x')
+ i2 = pd.Index(vals2, name='y')
+ res = i1.append(i2)
+ exp = pd.Index(exp_data)
+ tm.assert_index_equal(res, exp)
+
+ # index.append name match
+ i1 = pd.Index(vals1, name='x')
+ i2 = pd.Index(vals2, name='x')
+ res = i1.append(i2)
+ exp = pd.Index(exp_data, name='x')
+ tm.assert_index_equal(res, exp)
+
+ # cannot append non-index
+ with pytest.raises(TypeError, match='all inputs must be Index'):
+ pd.Index(vals1).append(vals2)
+
+ with pytest.raises(TypeError, match='all inputs must be Index'):
+ pd.Index(vals1).append([pd.Index(vals2), vals3])
+
+ # ----- Series ----- #
+
+ # series.append
+ res = pd.Series(vals1).append(pd.Series(vals2),
+ ignore_index=True)
+ exp = pd.Series(exp_data)
+ tm.assert_series_equal(res, exp, check_index_type=True)
+
+ # concat
+ res = pd.concat([pd.Series(vals1), pd.Series(vals2)],
+ ignore_index=True)
+ tm.assert_series_equal(res, exp, check_index_type=True)
+
+ # 3 elements
+ res = pd.Series(vals1).append([pd.Series(vals2), pd.Series(vals3)],
+ ignore_index=True)
+ exp = pd.Series(exp_data3)
+ tm.assert_series_equal(res, exp)
+
+ res = pd.concat([pd.Series(vals1), pd.Series(vals2),
+ pd.Series(vals3)], ignore_index=True)
+ tm.assert_series_equal(res, exp)
+
+ # name mismatch
+ s1 = pd.Series(vals1, name='x')
+ s2 = pd.Series(vals2, name='y')
+ res = s1.append(s2, ignore_index=True)
+ exp = pd.Series(exp_data)
+ tm.assert_series_equal(res, exp, check_index_type=True)
+
+ res = pd.concat([s1, s2], ignore_index=True)
+ tm.assert_series_equal(res, exp, check_index_type=True)
+
+ # name match
+ s1 = pd.Series(vals1, name='x')
+ s2 = pd.Series(vals2, name='x')
+ res = s1.append(s2, ignore_index=True)
+ exp = pd.Series(exp_data, name='x')
+ tm.assert_series_equal(res, exp, check_index_type=True)
+
+ res = pd.concat([s1, s2], ignore_index=True)
+ tm.assert_series_equal(res, exp, check_index_type=True)
+
+ # cannot append non-index
+ msg = (r'cannot concatenate object of type \"(.+?)\";'
+ ' only pd.Series, pd.DataFrame, and pd.Panel'
+ r' \(deprecated\) objs are valid')
+ with pytest.raises(TypeError, match=msg):
+ pd.Series(vals1).append(vals2)
+
+ with pytest.raises(TypeError, match=msg):
+ pd.Series(vals1).append([pd.Series(vals2), vals3])
+
+ with pytest.raises(TypeError, match=msg):
+ pd.concat([pd.Series(vals1), vals2])
+
+ with pytest.raises(TypeError, match=msg):
+ pd.concat([pd.Series(vals1), pd.Series(vals2), vals3])
+
+ def test_concatlike_dtypes_coercion(self):
+ # GH 13660
+ for typ1, vals1 in iteritems(self.data):
+ for typ2, vals2 in iteritems(self.data):
+
+ vals3 = vals2
+
+ # basically infer
+ exp_index_dtype = None
+ exp_series_dtype = None
+
+ if typ1 == typ2:
+ # same dtype is tested in test_concatlike_same_dtypes
+ continue
+ elif typ1 == 'category' or typ2 == 'category':
+ # ToDo: suspicious
+ continue
+
+ # specify expected dtype
+ if typ1 == 'bool' and typ2 in ('int64', 'float64'):
+ # series coerces to numeric based on numpy rule
+ # index doesn't because bool is object dtype
+ exp_series_dtype = typ2
+ elif typ2 == 'bool' and typ1 in ('int64', 'float64'):
+ exp_series_dtype = typ1
+ elif (typ1 == 'datetime64[ns, US/Eastern]' or
+ typ2 == 'datetime64[ns, US/Eastern]' or
+ typ1 == 'timedelta64[ns]' or
+ typ2 == 'timedelta64[ns]'):
+ exp_index_dtype = object
+ exp_series_dtype = object
+
+ exp_data = vals1 + vals2
+ exp_data3 = vals1 + vals2 + vals3
+
+ # ----- Index ----- #
+
+ # index.append
+ res = pd.Index(vals1).append(pd.Index(vals2))
+ exp = pd.Index(exp_data, dtype=exp_index_dtype)
+ tm.assert_index_equal(res, exp)
+
+ # 3 elements
+ res = pd.Index(vals1).append([pd.Index(vals2),
+ pd.Index(vals3)])
+ exp = pd.Index(exp_data3, dtype=exp_index_dtype)
+ tm.assert_index_equal(res, exp)
+
+ # ----- Series ----- #
+
+ # series.append
+ res = pd.Series(vals1).append(pd.Series(vals2),
+ ignore_index=True)
+ exp = pd.Series(exp_data, dtype=exp_series_dtype)
+ tm.assert_series_equal(res, exp, check_index_type=True)
+
+ # concat
+ res = pd.concat([pd.Series(vals1), pd.Series(vals2)],
+ ignore_index=True)
+ tm.assert_series_equal(res, exp, check_index_type=True)
+
+ # 3 elements
+ res = pd.Series(vals1).append([pd.Series(vals2),
+ pd.Series(vals3)],
+ ignore_index=True)
+ exp = pd.Series(exp_data3, dtype=exp_series_dtype)
+ tm.assert_series_equal(res, exp)
+
+ res = pd.concat([pd.Series(vals1), pd.Series(vals2),
+ pd.Series(vals3)], ignore_index=True)
+ tm.assert_series_equal(res, exp)
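+
+ # Illustrative note (not part of the upstream test): the bool/numeric
+ # branch above encodes an asymmetry. For bool + int64, e.g.,
+ #
+ # >>> pd.concat([pd.Series([True, False]), pd.Series([1, 2])],
+ # ... ignore_index=True).dtype
+ #
+ # follows the numpy rule and gives int64, whereas appending
+ # pd.Index([True, False]) to pd.Index([1, 2]) stays object dtype,
+ # because a boolean Index is object dtype to begin with.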
+
+ def test_concatlike_common_coerce_to_pandas_object(self):
+ # GH 13626
+ # result must be Timestamp/Timedelta, not datetime.datetime/timedelta
+ dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'])
+ tdi = pd.TimedeltaIndex(['1 days', '2 days'])
+
+ exp = pd.Index([pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-01-02'),
+ pd.Timedelta('1 days'),
+ pd.Timedelta('2 days')])
+
+ res = dti.append(tdi)
+ tm.assert_index_equal(res, exp)
+ assert isinstance(res[0], pd.Timestamp)
+ assert isinstance(res[-1], pd.Timedelta)
+
+ dts = pd.Series(dti)
+ tds = pd.Series(tdi)
+ res = dts.append(tds)
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+ assert isinstance(res.iloc[0], pd.Timestamp)
+ assert isinstance(res.iloc[-1], pd.Timedelta)
+
+ res = pd.concat([dts, tds])
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+ assert isinstance(res.iloc[0], pd.Timestamp)
+ assert isinstance(res.iloc[-1], pd.Timedelta)
+
+ def test_concatlike_datetimetz(self, tz_aware_fixture):
+ tz = tz_aware_fixture
+ # GH 7795
+ dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz)
+ dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], tz=tz)
+
+ exp = pd.DatetimeIndex(['2011-01-01', '2011-01-02',
+ '2012-01-01', '2012-01-02'], tz=tz)
+
+ res = dti1.append(dti2)
+ tm.assert_index_equal(res, exp)
+
+ dts1 = pd.Series(dti1)
+ dts2 = pd.Series(dti2)
+ res = dts1.append(dts2)
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ res = pd.concat([dts1, dts2])
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ @pytest.mark.parametrize('tz',
+ ['UTC', 'US/Eastern', 'Asia/Tokyo', 'EST5EDT'])
+ def test_concatlike_datetimetz_short(self, tz):
+ # GH#7795
+ ix1 = pd.date_range(start='2014-07-15', end='2014-07-17',
+ freq='D', tz=tz)
+ ix2 = pd.DatetimeIndex(['2014-07-11', '2014-07-21'], tz=tz)
+ df1 = pd.DataFrame(0, index=ix1, columns=['A', 'B'])
+ df2 = pd.DataFrame(0, index=ix2, columns=['A', 'B'])
+
+ exp_idx = pd.DatetimeIndex(['2014-07-15', '2014-07-16',
+ '2014-07-17', '2014-07-11',
+ '2014-07-21'], tz=tz)
+ exp = pd.DataFrame(0, index=exp_idx, columns=['A', 'B'])
+
+ tm.assert_frame_equal(df1.append(df2), exp)
+ tm.assert_frame_equal(pd.concat([df1, df2]), exp)
+
+ def test_concatlike_datetimetz_to_object(self, tz_aware_fixture):
+ tz = tz_aware_fixture
+ # GH 13660
+
+ # different tz coerces to object
+ dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz)
+ dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'])
+
+ exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz),
+ pd.Timestamp('2011-01-02', tz=tz),
+ pd.Timestamp('2012-01-01'),
+ pd.Timestamp('2012-01-02')], dtype=object)
+
+ res = dti1.append(dti2)
+ tm.assert_index_equal(res, exp)
+
+ dts1 = pd.Series(dti1)
+ dts2 = pd.Series(dti2)
+ res = dts1.append(dts2)
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ res = pd.concat([dts1, dts2])
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ # different tz
+ dti3 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'],
+ tz='US/Pacific')
+
+ exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz),
+ pd.Timestamp('2011-01-02', tz=tz),
+ pd.Timestamp('2012-01-01', tz='US/Pacific'),
+ pd.Timestamp('2012-01-02', tz='US/Pacific')],
+ dtype=object)
+
+ res = dti1.append(dti3)
+ # tm.assert_index_equal(res, exp)
+
+ dts1 = pd.Series(dti1)
+ dts3 = pd.Series(dti3)
+ res = dts1.append(dts3)
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ res = pd.concat([dts1, dts3])
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ def test_concatlike_common_period(self):
+ # GH 13660
+ pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M')
+ pi2 = pd.PeriodIndex(['2012-01', '2012-02'], freq='M')
+
+ exp = pd.PeriodIndex(['2011-01', '2011-02', '2012-01',
+ '2012-02'], freq='M')
+
+ res = pi1.append(pi2)
+ tm.assert_index_equal(res, exp)
+
+ ps1 = pd.Series(pi1)
+ ps2 = pd.Series(pi2)
+ res = ps1.append(ps2)
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ res = pd.concat([ps1, ps2])
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ def test_concatlike_common_period_diff_freq_to_object(self):
+ # GH 13221
+ pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M')
+ pi2 = pd.PeriodIndex(['2012-01-01', '2012-02-01'], freq='D')
+
+ exp = pd.Index([pd.Period('2011-01', freq='M'),
+ pd.Period('2011-02', freq='M'),
+ pd.Period('2012-01-01', freq='D'),
+ pd.Period('2012-02-01', freq='D')], dtype=object)
+
+ res = pi1.append(pi2)
+ tm.assert_index_equal(res, exp)
+
+ ps1 = pd.Series(pi1)
+ ps2 = pd.Series(pi2)
+ res = ps1.append(ps2)
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ res = pd.concat([ps1, ps2])
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ def test_concatlike_common_period_mixed_dt_to_object(self):
+ # GH 13221
+ # different datetimelike
+ pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M')
+ tdi = pd.TimedeltaIndex(['1 days', '2 days'])
+ exp = pd.Index([pd.Period('2011-01', freq='M'),
+ pd.Period('2011-02', freq='M'),
+ pd.Timedelta('1 days'),
+ pd.Timedelta('2 days')], dtype=object)
+
+ res = pi1.append(tdi)
+ tm.assert_index_equal(res, exp)
+
+ ps1 = pd.Series(pi1)
+ tds = pd.Series(tdi)
+ res = ps1.append(tds)
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ res = pd.concat([ps1, tds])
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ # inverse
+ exp = pd.Index([pd.Timedelta('1 days'),
+ pd.Timedelta('2 days'),
+ pd.Period('2011-01', freq='M'),
+ pd.Period('2011-02', freq='M')], dtype=object)
+
+ res = tdi.append(pi1)
+ tm.assert_index_equal(res, exp)
+
+ ps1 = pd.Series(pi1)
+ tds = pd.Series(tdi)
+ res = tds.append(ps1)
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ res = pd.concat([tds, ps1])
+ tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
+
+ def test_concat_categorical(self):
+ # GH 13524
+
+ # same categories -> category
+ s1 = pd.Series([1, 2, np.nan], dtype='category')
+ s2 = pd.Series([2, 1, 2], dtype='category')
+
+ exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='category')
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ # partially different categories => not-category
+ s1 = pd.Series([3, 2], dtype='category')
+ s2 = pd.Series([2, 1], dtype='category')
+
+ exp = pd.Series([3, 2, 2, 1])
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ # completely different categories (same dtype) => not-category
+ s1 = pd.Series([10, 11, np.nan], dtype='category')
+ s2 = pd.Series([np.nan, 1, 3, 2], dtype='category')
+
+ exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype='object')
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ def test_union_categorical_same_categories_different_order(self):
+ # https://github.com/pandas-dev/pandas/issues/19096
+ a = pd.Series(Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']))
+ b = pd.Series(Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c']))
+ result = pd.concat([a, b], ignore_index=True)
+ expected = pd.Series(Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
+ categories=['a', 'b', 'c']))
+ tm.assert_series_equal(result, expected)
+
+ def test_concat_categorical_coercion(self):
+ # GH 13524
+
+ # category + not-category => not-category
+ s1 = pd.Series([1, 2, np.nan], dtype='category')
+ s2 = pd.Series([2, 1, 2])
+
+ exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='object')
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ # result shouldn't be affected by 1st elem dtype
+ exp = pd.Series([2, 1, 2, 1, 2, np.nan], dtype='object')
+ tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+ tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+ # all values are not in category => not-category
+ s1 = pd.Series([3, 2], dtype='category')
+ s2 = pd.Series([2, 1])
+
+ exp = pd.Series([3, 2, 2, 1])
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ exp = pd.Series([2, 1, 3, 2])
+ tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+ tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+ # completely different categories => not-category
+ s1 = pd.Series([10, 11, np.nan], dtype='category')
+ s2 = pd.Series([1, 3, 2])
+
+ exp = pd.Series([10, 11, np.nan, 1, 3, 2], dtype='object')
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ exp = pd.Series([1, 3, 2, 10, 11, np.nan], dtype='object')
+ tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+ tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+ # different dtype => not-category
+ s1 = pd.Series([10, 11, np.nan], dtype='category')
+ s2 = pd.Series(['a', 'b', 'c'])
+
+ exp = pd.Series([10, 11, np.nan, 'a', 'b', 'c'])
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ exp = pd.Series(['a', 'b', 'c', 10, 11, np.nan])
+ tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+ tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+ # if normal series only contains NaN-likes => not-category
+ s1 = pd.Series([10, 11], dtype='category')
+ s2 = pd.Series([np.nan, np.nan, np.nan])
+
+ exp = pd.Series([10, 11, np.nan, np.nan, np.nan])
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ exp = pd.Series([np.nan, np.nan, np.nan, 10, 11])
+ tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+ tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+ def test_concat_categorical_3elem_coercion(self):
+ # GH 13524
+
+ # mixed dtypes => not-category
+ s1 = pd.Series([1, 2, np.nan], dtype='category')
+ s2 = pd.Series([2, 1, 2], dtype='category')
+ s3 = pd.Series([1, 2, 1, 2, np.nan])
+
+ exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan],
+ dtype='object')
+ tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)
+
+ exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2],
+ dtype='object')
+ tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)
+
+ # values are all in either category => not-category
+ s1 = pd.Series([4, 5, 6], dtype='category')
+ s2 = pd.Series([1, 2, 3], dtype='category')
+ s3 = pd.Series([1, 3, 4])
+
+ exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4])
+ tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)
+
+ exp = pd.Series([1, 3, 4, 4, 5, 6, 1, 2, 3])
+ tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)
+
+ # values are all in either category => not-category
+ s1 = pd.Series([4, 5, 6], dtype='category')
+ s2 = pd.Series([1, 2, 3], dtype='category')
+ s3 = pd.Series([10, 11, 12])
+
+ exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12])
+ tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)
+
+ exp = pd.Series([10, 11, 12, 4, 5, 6, 1, 2, 3])
+ tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)
+
+ def test_concat_categorical_multi_coercion(self):
+ # GH 13524
+
+ s1 = pd.Series([1, 3], dtype='category')
+ s2 = pd.Series([3, 4], dtype='category')
+ s3 = pd.Series([2, 3])
+ s4 = pd.Series([2, 2], dtype='category')
+ s5 = pd.Series([1, np.nan])
+ s6 = pd.Series([1, 3, 2], dtype='category')
+
+ # mixed dtype, values are all in categories => not-category
+ exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2])
+ res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True)
+ tm.assert_series_equal(res, exp)
+ res = s1.append([s2, s3, s4, s5, s6], ignore_index=True)
+ tm.assert_series_equal(res, exp)
+
+ exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3])
+ res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True)
+ tm.assert_series_equal(res, exp)
+ res = s6.append([s5, s4, s3, s2, s1], ignore_index=True)
+ tm.assert_series_equal(res, exp)
+
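+ # Sketch: Series.append with a list argument is shorthand for pd.concat
+ # on the same operands, which is why each case above asserts both
+ # spellings. Illustrative only, not an upstream test.
+ def test_append_is_concat_shorthand_sketch(self):
+ s1 = pd.Series([1, 3], dtype='category')
+ s2 = pd.Series([2, 3])
+ tm.assert_series_equal(s1.append(s2, ignore_index=True),
+ pd.concat([s1, s2], ignore_index=True))
+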
+ def test_concat_categorical_ordered(self):
+ # GH 13524
+
+ s1 = pd.Series(pd.Categorical([1, 2, np.nan], ordered=True))
+ s2 = pd.Series(pd.Categorical([2, 1, 2], ordered=True))
+
+ exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True))
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan],
+ ordered=True))
+ tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp)
+
+ def test_concat_categorical_coercion_nan(self):
+ # GH 13524
+
+ # some edge cases
+ # category + not-category => not category
+ s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64),
+ dtype='category')
+ s2 = pd.Series([np.nan, 1])
+
+ exp = pd.Series([np.nan, np.nan, np.nan, 1])
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ s1 = pd.Series([1, np.nan], dtype='category')
+ s2 = pd.Series([np.nan, np.nan])
+
+ exp = pd.Series([1, np.nan, np.nan, np.nan], dtype='object')
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ # mixed dtype, all nan-likes => not-category
+ s1 = pd.Series([np.nan, np.nan], dtype='category')
+ s2 = pd.Series([np.nan, np.nan])
+
+ exp = pd.Series([np.nan, np.nan, np.nan, np.nan])
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+ tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+ tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+ # all category nan-likes => category
+ s1 = pd.Series([np.nan, np.nan], dtype='category')
+ s2 = pd.Series([np.nan, np.nan], dtype='category')
+
+ exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype='category')
+
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
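+ # Sketch: NaN is stored as a missing code rather than as a category, so
+ # an all-NaN categorical has empty .categories; that is why the
+ # "all category nan-likes" case above can stay categorical.
+ # Illustrative only, not an upstream test.
+ def test_nan_is_not_a_category_sketch(self):
+ s = pd.Series([np.nan, np.nan], dtype='category')
+ assert len(s.cat.categories) == 0
+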
+ def test_concat_categorical_empty(self):
+ # GH 13524
+
+ s1 = pd.Series([], dtype='category')
+ s2 = pd.Series([1, 2], dtype='category')
+
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)
+
+ tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
+ tm.assert_series_equal(s2.append(s1, ignore_index=True), s2)
+
+ s1 = pd.Series([], dtype='category')
+ s2 = pd.Series([], dtype='category')
+
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)
+
+ s1 = pd.Series([], dtype='category')
+ s2 = pd.Series([], dtype='object')
+
+ # different dtype => not-category
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)
+ tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
+ tm.assert_series_equal(s2.append(s1, ignore_index=True), s2)
+
+ s1 = pd.Series([], dtype='category')
+ s2 = pd.Series([np.nan, np.nan])
+
+ # empty Series is ignored
+ exp = pd.Series([np.nan, np.nan])
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+ tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+ tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+ tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+
+class TestAppend(ConcatenateBase):
+
+ def test_append(self, sort):
+ begin_index = self.frame.index[:5]
+ end_index = self.frame.index[5:]
+
+ begin_frame = self.frame.reindex(begin_index)
+ end_frame = self.frame.reindex(end_index)
+
+ appended = begin_frame.append(end_frame)
+ tm.assert_almost_equal(appended['A'], self.frame['A'])
+
+ del end_frame['A']
+ partial_appended = begin_frame.append(end_frame, sort=sort)
+ assert 'A' in partial_appended
+
+ partial_appended = end_frame.append(begin_frame, sort=sort)
+ assert 'A' in partial_appended
+
+ # mixed type handling
+ appended = self.mixed_frame[:5].append(self.mixed_frame[5:])
+ tm.assert_frame_equal(appended, self.mixed_frame)
+
+ # mixed + float frames appended in either order should agree on
+ # the shared columns
+ mixed_appended = self.mixed_frame[:5].append(self.frame[5:], sort=sort)
+ mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:],
+ sort=sort)
+
+ # all equal except 'foo' column
+ tm.assert_frame_equal(
+ mixed_appended.reindex(columns=['A', 'B', 'C', 'D']),
+ mixed_appended2.reindex(columns=['A', 'B', 'C', 'D']))
+
+ # append empty
+ empty = DataFrame({})
+
+ appended = self.frame.append(empty)
+ tm.assert_frame_equal(self.frame, appended)
+ assert appended is not self.frame
+
+ appended = empty.append(self.frame)
+ tm.assert_frame_equal(self.frame, appended)
+ assert appended is not self.frame
+
+ # Overlap
+ msg = "Indexes have overlapping values"
+ with pytest.raises(ValueError, match=msg):
+ self.frame.append(self.frame, verify_integrity=True)
+
+ # see gh-6129: new columns
+ df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}})
+ row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z')
+ expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': {
+ 'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}})
+ result = df.append(row)
+ tm.assert_frame_equal(result, expected)
+
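+ # Sketch (illustrative only): the name of an appended Series becomes
+ # the new row label, as in the gh-6129 case above.
+ def test_append_named_series_row_label_sketch(self):
+ df = DataFrame({'a': [1]}, index=['x'])
+ row = Series({'a': 2}, name='y')
+ result = df.append(row)
+ assert list(result.index) == ['x', 'y']
+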
+ def test_append_length0_frame(self, sort):
+ df = DataFrame(columns=['A', 'B', 'C'])
+ df3 = DataFrame(index=[0, 1], columns=['A', 'B'])
+ df5 = df.append(df3, sort=sort)
+
+ expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C'])
+ assert_frame_equal(df5, expected)
+
+ def test_append_records(self):
+ arr1 = np.zeros((2,), dtype=('i4,f4,a10'))
+ arr1[:] = [(1, 2., 'Hello'), (2, 3., "World")]
+
+ arr2 = np.zeros((3,), dtype=('i4,f4,a10'))
+ arr2[:] = [(3, 4., 'foo'),
+ (5, 6., "bar"),
+ (7., 8., 'baz')]
+
+ df1 = DataFrame(arr1)
+ df2 = DataFrame(arr2)
+
+ result = df1.append(df2, ignore_index=True)
+ expected = DataFrame(np.concatenate((arr1, arr2)))
+ assert_frame_equal(result, expected)
+
+ # rewrite the sort fixture, since we also want to test the default of None
+ def test_append_sorts(self, sort_with_none):
+ df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a'])
+ df2 = pd.DataFrame({"a": [1, 2], 'c': [3, 4]}, index=[2, 3])
+
+ if sort_with_none is None:
+ # only warn if not explicitly specified
+ # don't check stacklevel since it's set for concat, and append
+ # has an extra stack.
+ ctx = tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False)
+ else:
+ ctx = tm.assert_produces_warning(None)
+
+ with ctx:
+ result = df1.append(df2, sort=sort_with_none)
+
+ # for None / True
+ expected = pd.DataFrame({"b": [1, 2, None, None],
+ "a": [1, 2, 1, 2],
+ "c": [None, None, 3, 4]},
+ columns=['a', 'b', 'c'])
+ if sort_with_none is False:
+ expected = expected[['b', 'a', 'c']]
+ tm.assert_frame_equal(result, expected)
+
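+ # Sketch of the sort semantics asserted above: sort=True orders the
+ # column union lexically, sort=False keeps first-seen order; no warning
+ # is raised because sort is passed explicitly. Not an upstream test.
+ def test_append_sort_column_order_sketch(self):
+ df1 = pd.DataFrame({'b': [1], 'a': [1]}, columns=['b', 'a'])
+ df2 = pd.DataFrame({'c': [1]})
+ assert list(df1.append(df2, sort=True).columns) == ['a', 'b', 'c']
+ assert list(df1.append(df2, sort=False).columns) == ['b', 'a', 'c']
+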
+ def test_append_different_columns(self, sort):
+ df = DataFrame({'bools': np.random.randn(10) > 0,
+ 'ints': np.random.randint(0, 10, 10),
+ 'floats': np.random.randn(10),
+ 'strings': ['foo', 'bar'] * 5})
+
+ a = df[:5].loc[:, ['bools', 'ints', 'floats']]
+ b = df[5:].loc[:, ['strings', 'ints', 'floats']]
+
+ appended = a.append(b, sort=sort)
+ assert isna(appended['strings'][0:4]).all()
+ assert isna(appended['bools'][5:]).all()
+
+ def test_append_many(self, sort):
+ chunks = [self.frame[:5], self.frame[5:10],
+ self.frame[10:15], self.frame[15:]]
+
+ result = chunks[0].append(chunks[1:])
+ tm.assert_frame_equal(result, self.frame)
+
+ chunks[-1] = chunks[-1].copy()
+ chunks[-1]['foo'] = 'bar'
+ result = chunks[0].append(chunks[1:], sort=sort)
+ tm.assert_frame_equal(result.loc[:, self.frame.columns], self.frame)
+ assert (result['foo'][15:] == 'bar').all()
+ assert result['foo'][:15].isna().all()
+
+ def test_append_preserve_index_name(self):
+ # #980
+ df1 = DataFrame(data=None, columns=['A', 'B', 'C'])
+ df1 = df1.set_index(['A'])
+ df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]],
+ columns=['A', 'B', 'C'])
+ df2 = df2.set_index(['A'])
+
+ result = df1.append(df2)
+ assert result.index.name == 'A'
+
+ indexes_can_append = [
+ pd.RangeIndex(3),
+ pd.Index([4, 5, 6]),
+ pd.Index([4.5, 5.5, 6.5]),
+ pd.Index(list('abc')),
+ pd.CategoricalIndex('A B C'.split()),
+ pd.CategoricalIndex('D E F'.split(), ordered=True),
+ pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0),
+ dt.datetime(2013, 1, 3, 6, 10),
+ dt.datetime(2013, 1, 3, 7, 12)]),
+ ]
+
+ indexes_cannot_append_with_other = [
+ pd.IntervalIndex.from_breaks([0, 1, 2, 3]),
+ pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]),
+ ]
+
+ all_indexes = indexes_can_append + indexes_cannot_append_with_other
+
+ @pytest.mark.parametrize("index",
+ all_indexes,
+ ids=lambda x: x.__class__.__name__)
+ def test_append_same_columns_type(self, index):
+ # GH18359
+
+ # df wider than ser
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
+ ser_index = index[:2]
+ ser = pd.Series([7, 8], index=ser_index, name=2)
+ result = df.append(ser)
+ expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]],
+ index=[0, 1, 2],
+ columns=index)
+ assert_frame_equal(result, expected)
+
+ # ser wider than df
+ ser_index = index
+ index = index[:2]
+ df = pd.DataFrame([[1, 2], [4, 5]], columns=index)
+ ser = pd.Series([7, 8, 9], index=ser_index, name=2)
+ result = df.append(ser)
+ expected = pd.DataFrame([[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
+ index=[0, 1, 2],
+ columns=ser_index)
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("df_columns, series_index",
+ combinations(indexes_can_append, r=2),
+ ids=lambda x: x.__class__.__name__)
+ def test_append_different_columns_types(self, df_columns, series_index):
+ # GH18359
+ # See also test 'test_append_different_columns_types_raises' below
+ # for errors raised when appending
+
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
+ ser = pd.Series([7, 8, 9], index=series_index, name=2)
+
+ result = df.append(ser)
+ idx_diff = ser.index.difference(df_columns)
+ combined_columns = Index(df_columns.tolist()).append(idx_diff)
+ expected = pd.DataFrame([[1., 2., 3., np.nan, np.nan, np.nan],
+ [4, 5, 6, np.nan, np.nan, np.nan],
+ [np.nan, np.nan, np.nan, 7, 8, 9]],
+ index=[0, 1, 2],
+ columns=combined_columns)
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('index_can_append', indexes_can_append,
+ ids=lambda x: x.__class__.__name__)
+ @pytest.mark.parametrize('index_cannot_append_with_other',
+ indexes_cannot_append_with_other,
+ ids=lambda x: x.__class__.__name__)
+ def test_append_different_columns_types_raises(
+ self, index_can_append, index_cannot_append_with_other):
+ # GH18359
+ # Dataframe.append will raise if IntervalIndex/MultiIndex appends
+ # or is appended to a different index type
+ #
+ # See also test 'test_append_different_columns_types' above for
+ # appending without raising.
+
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append)
+ ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other,
+ name=2)
+ msg = ("the other index needs to be an IntervalIndex too, but was"
+ r" type {}|"
+ r"object of type '(int|long|float|Timestamp)' has no len\(\)|"
+ "Expected tuple, got str")
+ with pytest.raises(TypeError, match=msg.format(
+ index_can_append.__class__.__name__)):
+ df.append(ser)
+
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6]],
+ columns=index_cannot_append_with_other)
+ ser = pd.Series([7, 8, 9], index=index_can_append, name=2)
+ msg = (r"unorderable types: (Interval|int)\(\) > "
+ r"(int|long|float|str)\(\)|"
+ r"Expected tuple, got (int|long|float|str)|"
+ r"Cannot compare type 'Timestamp' with type '(int|long)'|"
+ r"'>' not supported between instances of 'int' and 'str'")
+ with pytest.raises(TypeError, match=msg):
+ df.append(ser)
+
+ def test_append_dtype_coerce(self, sort):
+
+ # GH 4993
+ # appending with datetime will incorrectly convert datetime64
+
+ df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0),
+ dt.datetime(2013, 1, 2, 0, 0)],
+ columns=['start_time'])
+ df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0),
+ dt.datetime(2013, 1, 3, 6, 10)],
+ [dt.datetime(2013, 1, 4, 0, 0),
+ dt.datetime(2013, 1, 4, 7, 10)]],
+ columns=['start_time', 'end_time'])
+
+ expected = concat([Series([pd.NaT,
+ pd.NaT,
+ dt.datetime(2013, 1, 3, 6, 10),
+ dt.datetime(2013, 1, 4, 7, 10)],
+ name='end_time'),
+ Series([dt.datetime(2013, 1, 1, 0, 0),
+ dt.datetime(2013, 1, 2, 0, 0),
+ dt.datetime(2013, 1, 3, 0, 0),
+ dt.datetime(2013, 1, 4, 0, 0)],
+ name='start_time')],
+ axis=1, sort=sort)
+ result = df1.append(df2, ignore_index=True, sort=sort)
+ if sort:
+ expected = expected[['end_time', 'start_time']]
+ else:
+ expected = expected[['start_time', 'end_time']]
+
+ assert_frame_equal(result, expected)
+
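+ # Sketch: when both frames share the column, datetime64[ns] survives
+ # the append unchanged; the coercion above only concerns columns
+ # missing from one side. Illustrative only.
+ def test_append_shared_datetime_column_sketch(self):
+ df1 = DataFrame({'t': [pd.Timestamp('2013-01-01')]})
+ df2 = DataFrame({'t': [pd.Timestamp('2013-01-02')]})
+ res = df1.append(df2, ignore_index=True)
+ assert res['t'].dtype == 'datetime64[ns]'
+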
+ def test_append_missing_column_proper_upcast(self, sort):
+ df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')})
+ df2 = DataFrame({'B': np.array([True, False, True, False],
+ dtype=bool)})
+
+ appended = df1.append(df2, ignore_index=True, sort=sort)
+ assert appended['A'].dtype == 'f8'
+ assert appended['B'].dtype == 'O'
+
+ def test_append_empty_frame_to_series_with_dateutil_tz(self):
+ # GH 23682
+ date = Timestamp('2018-10-24 07:30:00', tz=dateutil.tz.tzutc())
+ s = Series({'date': date, 'a': 1.0, 'b': 2.0})
+ df = DataFrame(columns=['c', 'd'])
+ result = df.append(s, ignore_index=True)
+ # n.b. it's not clear to me that expected is correct here.
+ # It's possible that the `date` column should have
+ # datetime64[ns, tz] dtype for both result and expected.
+ # That would be more consistent with new columns having
+ # their own dtype (float for a and b, datetime64[ns, tz] for date).
+ expected = DataFrame([[np.nan, np.nan, 1., 2., date]],
+ columns=['c', 'd', 'a', 'b', 'date'],
+ dtype=object)
+ # These columns get cast to object after append
+ expected['a'] = expected['a'].astype(float)
+ expected['b'] = expected['b'].astype(float)
+ assert_frame_equal(result, expected)
+
+
+class TestConcatenate(ConcatenateBase):
+
+ def test_concat_copy(self):
+ df = DataFrame(np.random.randn(4, 3))
+ df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1))
+ df3 = DataFrame({5: 'foo'}, index=range(4))
+
+ # These are actual copies.
+ result = concat([df, df2, df3], axis=1, copy=True)
+
+ for b in result._data.blocks:
+ assert b.values.base is None
+
+ # These share the underlying data (no copy).
+ result = concat([df, df2, df3], axis=1, copy=False)
+
+ for b in result._data.blocks:
+ if b.is_float:
+ assert b.values.base is df._data.blocks[0].values.base
+ elif b.is_integer:
+ assert b.values.base is df2._data.blocks[0].values.base
+ elif b.is_object:
+ assert b.values.base is not None
+
+ # Float block was consolidated.
+ df4 = DataFrame(np.random.randn(4, 1))
+ result = concat([df, df2, df3, df4], axis=1, copy=False)
+ for b in result._data.blocks:
+ if b.is_float:
+ assert b.values.base is None
+ elif b.is_integer:
+ assert b.values.base is df2._data.blocks[0].values.base
+ elif b.is_object:
+ assert b.values.base is not None
+
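+ # Sketch: an observable consequence of copy=True checked above is that
+ # later writes to a source frame do not leak into the result.
+ # Illustrative only, not an upstream test.
+ def test_concat_copy_independent_sketch(self):
+ df = DataFrame({'a': [1.0, 2.0]})
+ result = concat([df, df], axis=1, copy=True)
+ df.iloc[0, 0] = 99.0
+ assert (result.iloc[0] == 1.0).all()
+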
+ def test_concat_with_group_keys(self):
+ df = DataFrame(np.random.randn(4, 3))
+ df2 = DataFrame(np.random.randn(4, 4))
+
+ # axis=0
+ df = DataFrame(np.random.randn(3, 4))
+ df2 = DataFrame(np.random.randn(4, 4))
+
+ result = concat([df, df2], keys=[0, 1])
+ exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1],
+ [0, 1, 2, 0, 1, 2, 3]])
+ expected = DataFrame(np.r_[df.values, df2.values],
+ index=exp_index)
+ tm.assert_frame_equal(result, expected)
+
+ result = concat([df, df], keys=[0, 1])
+ exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
+ [0, 1, 2, 0, 1, 2]])
+ expected = DataFrame(np.r_[df.values, df.values],
+ index=exp_index2)
+ tm.assert_frame_equal(result, expected)
+
+ # axis=1
+ df = DataFrame(np.random.randn(4, 3))
+ df2 = DataFrame(np.random.randn(4, 4))
+
+ result = concat([df, df2], keys=[0, 1], axis=1)
+ expected = DataFrame(np.c_[df.values, df2.values],
+ columns=exp_index)
+ tm.assert_frame_equal(result, expected)
+
+ result = concat([df, df], keys=[0, 1], axis=1)
+ expected = DataFrame(np.c_[df.values, df.values],
+ columns=exp_index2)
+ tm.assert_frame_equal(result, expected)
+
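+ # Sketch: keys= adds one outer index level per input, so result.loc[key]
+ # recovers the original piece, as the MultiIndexes above encode.
+ # Illustrative only.
+ def test_concat_keys_roundtrip_sketch(self):
+ df = DataFrame({'a': [1, 2]})
+ result = concat([df, df], keys=['x', 'y'])
+ tm.assert_frame_equal(result.loc['x'], df)
+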
+ def test_concat_keys_specific_levels(self):
+ df = DataFrame(np.random.randn(10, 4))
+ pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]]
+ level = ['three', 'two', 'one', 'zero']
+ result = concat(pieces, axis=1, keys=['one', 'two', 'three'],
+ levels=[level],
+ names=['group_key'])
+
+ tm.assert_index_equal(result.columns.levels[0],
+ Index(level, name='group_key'))
+ assert result.columns.names[0] == 'group_key'
+
+ def test_concat_dataframe_keys_bug(self, sort):
+ t1 = DataFrame({
+ 'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'],
+ name='id'))})
+ t2 = DataFrame({
+ 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))})
+
+ # it works
+ result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=sort)
+ assert list(result.columns) == [('t1', 'value'), ('t2', 'value')]
+
+ def test_concat_series_partial_columns_names(self):
+ # GH10698
+ foo = Series([1, 2], name='foo')
+ bar = Series([1, 2])
+ baz = Series([4, 5])
+
+ result = concat([foo, bar, baz], axis=1)
+ expected = DataFrame({'foo': [1, 2], 0: [1, 2], 1: [4, 5]},
+ columns=['foo', 0, 1])
+ tm.assert_frame_equal(result, expected)
+
+ result = concat([foo, bar, baz], axis=1,
+ keys=['red', 'blue', 'yellow'])
+ expected = DataFrame({'red': [1, 2], 'blue': [1, 2],
+ 'yellow': [4, 5]},
+ columns=['red', 'blue', 'yellow'])
+ tm.assert_frame_equal(result, expected)
+
+ result = concat([foo, bar, baz], axis=1, ignore_index=True)
+ expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]})
+ tm.assert_frame_equal(result, expected)
+
+ def test_concat_dict(self):
+ frames = {'foo': DataFrame(np.random.randn(4, 3)),
+ 'bar': DataFrame(np.random.randn(4, 3)),
+ 'baz': DataFrame(np.random.randn(4, 3)),
+ 'qux': DataFrame(np.random.randn(4, 3))}
+
+ sorted_keys = sorted(frames)
+
+ result = concat(frames)
+ expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys)
+ tm.assert_frame_equal(result, expected)
+
+ result = concat(frames, axis=1)
+ expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys,
+ axis=1)
+ tm.assert_frame_equal(result, expected)
+
+ keys = ['baz', 'foo', 'bar']
+ result = concat(frames, keys=keys)
+ expected = concat([frames[k] for k in keys], keys=keys)
+ tm.assert_frame_equal(result, expected)
+
+ def test_concat_ignore_index(self, sort):
+ frame1 = DataFrame({"test1": ["a", "b", "c"],
+ "test2": [1, 2, 3],
+ "test3": [4.5, 3.2, 1.2]})
+ frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
+ frame1.index = Index(["x", "y", "z"])
+ frame2.index = Index(["x", "y", "q"])
+
+ v1 = concat([frame1, frame2], axis=1,
+ ignore_index=True, sort=sort)
+
+ nan = np.nan
+ expected = DataFrame([[nan, nan, nan, 4.3],
+ ['a', 1, 4.5, 5.2],
+ ['b', 2, 3.2, 2.2],
+ ['c', 3, 1.2, nan]],
+ index=Index(["q", "x", "y", "z"]))
+ if not sort:
+ expected = expected.loc[['x', 'y', 'z', 'q']]
+
+ tm.assert_frame_equal(v1, expected)
+
+ def test_concat_multiindex_with_keys(self):
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
+ ['one', 'two', 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ frame = DataFrame(np.random.randn(10, 3), index=index,
+ columns=Index(['A', 'B', 'C'], name='exp'))
+ result = concat([frame, frame], keys=[0, 1], names=['iteration'])
+
+ assert result.index.names == ('iteration',) + index.names
+ tm.assert_frame_equal(result.loc[0], frame)
+ tm.assert_frame_equal(result.loc[1], frame)
+ assert result.index.nlevels == 3
+
+ def test_concat_multiindex_with_tz(self):
+ # GH 6606
+ df = DataFrame({'dt': [datetime(2014, 1, 1),
+ datetime(2014, 1, 2),
+ datetime(2014, 1, 3)],
+ 'b': ['A', 'B', 'C'],
+ 'c': [1, 2, 3], 'd': [4, 5, 6]})
+ df['dt'] = df['dt'].apply(lambda d: Timestamp(d, tz='US/Pacific'))
+ df = df.set_index(['dt', 'b'])
+
+ exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02',
+ '2014-01-03'] * 2,
+ tz='US/Pacific', name='dt')
+ exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b')
+ exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
+ expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2},
+ index=exp_idx, columns=['c', 'd'])
+
+ result = concat([df, df])
+ tm.assert_frame_equal(result, expected)
+
+ def test_concat_multiindex_with_none_in_index_names(self):
+ # GH 15787
+ index = pd.MultiIndex.from_product([[1], range(5)],
+ names=['level1', None])
+ df = pd.DataFrame({'col': range(5)}, index=index, dtype=np.int32)
+
+ result = concat([df, df], keys=[1, 2], names=['level2'])
+ index = pd.MultiIndex.from_product([[1, 2], [1], range(5)],
+ names=['level2', 'level1', None])
+ expected = pd.DataFrame({'col': list(range(5)) * 2},
+ index=index, dtype=np.int32)
+ assert_frame_equal(result, expected)
+
+ result = concat([df, df[:2]], keys=[1, 2], names=['level2'])
+ level2 = [1] * 5 + [2] * 2
+ level1 = [1] * 7
+ no_name = list(range(5)) + list(range(2))
+ tuples = list(zip(level2, level1, no_name))
+ index = pd.MultiIndex.from_tuples(tuples,
+ names=['level2', 'level1', None])
+ expected = pd.DataFrame({'col': no_name}, index=index,
+ dtype=np.int32)
+ assert_frame_equal(result, expected)
+
+ def test_concat_keys_and_levels(self):
+ df = DataFrame(np.random.randn(1, 3))
+ df2 = DataFrame(np.random.randn(1, 4))
+
+ levels = [['foo', 'baz'], ['one', 'two']]
+ names = ['first', 'second']
+ result = concat([df, df2, df, df2],
+ keys=[('foo', 'one'), ('foo', 'two'),
+ ('baz', 'one'), ('baz', 'two')],
+ levels=levels,
+ names=names)
+ expected = concat([df, df2, df, df2])
+ exp_index = MultiIndex(levels=levels + [[0]],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1],
+ [0, 0, 0, 0]],
+ names=names + [None])
+ expected.index = exp_index
+
+ tm.assert_frame_equal(result, expected)
+
+ # no names
+ result = concat([df, df2, df, df2],
+ keys=[('foo', 'one'), ('foo', 'two'),
+ ('baz', 'one'), ('baz', 'two')],
+ levels=levels)
+ assert result.index.names == (None,) * 3
+
+ # no levels
+ result = concat([df, df2, df, df2],
+ keys=[('foo', 'one'), ('foo', 'two'),
+ ('baz', 'one'), ('baz', 'two')],
+ names=['first', 'second'])
+ assert result.index.names == ('first', 'second') + (None,)
+ tm.assert_index_equal(result.index.levels[0],
+ Index(['baz', 'foo'], name='first'))
+
+ def test_concat_keys_levels_no_overlap(self):
+ # GH #1406
+ df = DataFrame(np.random.randn(1, 3), index=['a'])
+ df2 = DataFrame(np.random.randn(1, 4), index=['b'])
+
+ msg = "Values not found in passed level"
+ with pytest.raises(ValueError, match=msg):
+ concat([df, df],
+ keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
+
+ msg = "Key one not in level"
+ with pytest.raises(ValueError, match=msg):
+ concat([df, df2],
+ keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
+
+ def test_concat_rename_index(self):
+ a = DataFrame(np.random.rand(3, 3),
+ columns=list('ABC'),
+ index=Index(list('abc'), name='index_a'))
+ b = DataFrame(np.random.rand(3, 3),
+ columns=list('ABC'),
+ index=Index(list('abc'), name='index_b'))
+
+ result = concat([a, b], keys=['key0', 'key1'],
+ names=['lvl0', 'lvl1'])
+
+ exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0'])
+ names = list(exp.index.names)
+ names[1] = 'lvl1'
+ exp.index.set_names(names, inplace=True)
+
+ tm.assert_frame_equal(result, exp)
+ assert result.index.names == exp.index.names
+
+ def test_crossed_dtypes_weird_corner(self):
+ columns = ['A', 'B', 'C', 'D']
+ df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='f8'),
+ 'B': np.array([1, 2, 3, 4], dtype='i8'),
+ 'C': np.array([1, 2, 3, 4], dtype='f8'),
+ 'D': np.array([1, 2, 3, 4], dtype='i8')},
+ columns=columns)
+
+ df2 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8'),
+ 'B': np.array([1, 2, 3, 4], dtype='f8'),
+ 'C': np.array([1, 2, 3, 4], dtype='i8'),
+ 'D': np.array([1, 2, 3, 4], dtype='f8')},
+ columns=columns)
+
+ appended = df1.append(df2, ignore_index=True)
+ expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0),
+ columns=columns)
+ tm.assert_frame_equal(appended, expected)
+
+ df = DataFrame(np.random.randn(1, 3), index=['a'])
+ df2 = DataFrame(np.random.randn(1, 4), index=['b'])
+ result = concat(
+ [df, df2], keys=['one', 'two'], names=['first', 'second'])
+ assert result.index.names == ('first', 'second')
+
+ def test_dups_index(self):
+ # GH 4771
+
+ # single dtypes
+ df = DataFrame(np.random.randint(0, 10, size=40).reshape(
+ 10, 4), columns=['A', 'A', 'C', 'C'])
+
+ result = concat([df, df], axis=1)
+ assert_frame_equal(result.iloc[:, :4], df)
+ assert_frame_equal(result.iloc[:, 4:], df)
+
+ result = concat([df, df], axis=0)
+ assert_frame_equal(result.iloc[:10], df)
+ assert_frame_equal(result.iloc[10:], df)
+
+ # multi dtypes
+ df = concat([DataFrame(np.random.randn(10, 4),
+ columns=['A', 'A', 'B', 'B']),
+ DataFrame(np.random.randint(0, 10, size=20)
+ .reshape(10, 2),
+ columns=['A', 'C'])],
+ axis=1)
+
+ result = concat([df, df], axis=1)
+ assert_frame_equal(result.iloc[:, :6], df)
+ assert_frame_equal(result.iloc[:, 6:], df)
+
+ result = concat([df, df], axis=0)
+ assert_frame_equal(result.iloc[:10], df)
+ assert_frame_equal(result.iloc[10:], df)
+
+ # append
+ result = df.iloc[0:8, :].append(df.iloc[8:])
+ assert_frame_equal(result, df)
+
+ result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10])
+ assert_frame_equal(result, df)
+
+ expected = concat([df, df], axis=0)
+ result = df.append(df)
+ assert_frame_equal(result, expected)
+
+ def test_with_mixed_tuples(self, sort):
+ # 10697
+ # columns have mixed tuples, so handle properly
+ df1 = DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2))
+ df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2))
+
+ # it works
+ concat([df1, df2], sort=sort)
+
+ def test_handle_empty_objects(self, sort):
+ df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
+
+ baz = df[:5].copy()
+ baz['foo'] = 'bar'
+ empty = df[5:5]
+
+ frames = [baz, empty, empty, df[5:]]
+ concatted = concat(frames, axis=0, sort=sort)
+
+ expected = df.reindex(columns=['a', 'b', 'c', 'd', 'foo'])
+ expected['foo'] = expected['foo'].astype('O')
+ expected.loc[0:4, 'foo'] = 'bar'
+
+ tm.assert_frame_equal(concatted, expected)
+
+ # empty as first element with time series
+ # GH3259
+ df = DataFrame(dict(A=range(10000)), index=date_range(
+ '20130101', periods=10000, freq='s'))
+ empty = DataFrame()
+ result = concat([df, empty], axis=1)
+ assert_frame_equal(result, df)
+ result = concat([empty, df], axis=1)
+ assert_frame_equal(result, df)
+
+ result = concat([df, empty])
+ assert_frame_equal(result, df)
+ result = concat([empty, df])
+ assert_frame_equal(result, df)
+
+ def test_concat_mixed_objs(self):
+
+ # concat mixed series/frames
+ # GH 2385
+
+ # axis 1
+ index = date_range('01-Jan-2013', periods=10, freq='H')
+ arr = np.arange(10, dtype='int64')
+ s1 = Series(arr, index=index)
+ s2 = Series(arr, index=index)
+ df = DataFrame(arr.reshape(-1, 1), index=index)
+
+ expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2),
+ index=index, columns=[0, 0])
+ result = concat([df, df], axis=1)
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2),
+ index=index, columns=[0, 1])
+ result = concat([s1, s2], axis=1)
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
+ index=index, columns=[0, 1, 2])
+ result = concat([s1, s2, s1], axis=1)
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame(np.repeat(arr, 5).reshape(-1, 5),
+ index=index, columns=[0, 0, 1, 2, 3])
+ result = concat([s1, df, s2, s2, s1], axis=1)
+ assert_frame_equal(result, expected)
+
+ # with names
+ s1.name = 'foo'
+ expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
+ index=index, columns=['foo', 0, 0])
+ result = concat([s1, df, s2], axis=1)
+ assert_frame_equal(result, expected)
+
+ s2.name = 'bar'
+ expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
+ index=index, columns=['foo', 0, 'bar'])
+ result = concat([s1, df, s2], axis=1)
+ assert_frame_equal(result, expected)
+
+ # ignore index
+ expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
+ index=index, columns=[0, 1, 2])
+ result = concat([s1, df, s2], axis=1, ignore_index=True)
+ assert_frame_equal(result, expected)
+
+ # axis 0
+ expected = DataFrame(np.tile(arr, 3).reshape(-1, 1),
+ index=index.tolist() * 3, columns=[0])
+ result = concat([s1, df, s2])
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0])
+ result = concat([s1, df, s2], ignore_index=True)
+ assert_frame_equal(result, expected)
+
+ # invalid concatenation of mixed dims
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ panel = tm.makePanel()
+ msg = ("cannot concatenate unaligned mixed dimensional NDFrame"
+ " objects")
+ with pytest.raises(ValueError, match=msg):
+ concat([panel, s1], axis=1)
+
+ def test_empty_dtype_coerce(self):
+
+ # xref to #12411
+ # xref to #12045
+ # xref to #11594
+ # see below
+
+ # 10571
+ df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b'])
+ df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b'])
+ result = concat([df1, df2])
+ expected = df1.dtypes
+ tm.assert_series_equal(result.dtypes, expected)
+
+ def test_dtype_coercion(self):
+
+ # 12411
+ df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'),
+ pd.NaT]})
+
+ result = concat([df.iloc[[0]], df.iloc[[1]]])
+ tm.assert_series_equal(result.dtypes, df.dtypes)
+
+ # 12045
+ # year 1012 is outside datetime64[ns] bounds, so the column is
+ # object dtype and should remain object through concat
+ df = DataFrame({'date': [datetime(2012, 1, 1),
+ datetime(1012, 1, 2)]})
+ result = concat([df.iloc[[0]], df.iloc[[1]]])
+ tm.assert_series_equal(result.dtypes, df.dtypes)
+
+ # 11594
+ df = DataFrame({'text': ['some words'] + [None] * 9})
+ result = concat([df.iloc[[0]], df.iloc[[1]]])
+ tm.assert_series_equal(result.dtypes, df.dtypes)
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_panel_concat_other_axes(self):
+ panel = tm.makePanel()
+
+ p1 = panel.iloc[:, :5, :]
+ p2 = panel.iloc[:, 5:, :]
+
+ result = concat([p1, p2], axis=1)
+ tm.assert_panel_equal(result, panel)
+
+ p1 = panel.iloc[:, :, :2]
+ p2 = panel.iloc[:, :, 2:]
+
+ result = concat([p1, p2], axis=2)
+ tm.assert_panel_equal(result, panel)
+
+ # if things are a bit misbehaved
+ p1 = panel.iloc[:2, :, :2]
+ p2 = panel.iloc[:, :, 2:]
+ p1['ItemC'] = 'baz'
+
+ result = concat([p1, p2], axis=2)
+
+ expected = panel.copy()
+ expected['ItemC'] = expected['ItemC'].astype('O')
+ expected.loc['ItemC', :, :2] = 'baz'
+ tm.assert_panel_equal(result, expected)
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ # Panel.rename warning we don't care about
+ @pytest.mark.filterwarnings("ignore:Using:FutureWarning")
+ def test_panel_concat_buglet(self, sort):
+ # #2257
+ def make_panel():
+ index = 5
+ cols = 3
+
+ def df():
+ return DataFrame(np.random.randn(index, cols),
+ index=["I%s" % i for i in range(index)],
+ columns=["C%s" % i for i in range(cols)])
+ return Panel({"Item%s" % x: df() for x in ['A', 'B', 'C']})
+
+ panel1 = make_panel()
+ panel2 = make_panel()
+
+ panel2 = panel2.rename(major_axis={x: "%s_1" % x
+ for x in panel2.major_axis})
+
+ panel3 = panel2.rename(major_axis=lambda x: '%s_1' % x)
+ panel3 = panel3.rename(minor_axis=lambda x: '%s_1' % x)
+
+ # it works!
+ concat([panel1, panel3], axis=1, verify_integrity=True, sort=sort)
+
+ def test_concat_series(self):
+
+ ts = tm.makeTimeSeries()
+ ts.name = 'foo'
+
+ pieces = [ts[:5], ts[5:15], ts[15:]]
+
+ result = concat(pieces)
+ tm.assert_series_equal(result, ts)
+ assert result.name == ts.name
+
+ result = concat(pieces, keys=[0, 1, 2])
+ expected = ts.copy()
+
+ ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]'))
+
+ exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]),
+ np.arange(len(ts))]
+ exp_index = MultiIndex(levels=[[0, 1, 2], ts.index],
+ codes=exp_codes)
+ expected.index = exp_index
+ tm.assert_series_equal(result, expected)
+
+ # n.b. the default binds the module-level sort fixture function
+ # itself (truthy); pytest does not inject fixtures for parameters
+ # that have defaults, so this test is not parametrized over sort
+ def test_concat_series_axis1(self, sort=sort):
+ ts = tm.makeTimeSeries()
+
+ pieces = [ts[:-2], ts[2:], ts[2:-2]]
+
+ result = concat(pieces, axis=1)
+ expected = DataFrame(pieces).T
+ assert_frame_equal(result, expected)
+
+ result = concat(pieces, keys=['A', 'B', 'C'], axis=1)
+ expected = DataFrame(pieces, index=['A', 'B', 'C']).T
+ assert_frame_equal(result, expected)
+
+ # preserve series names, #2489
+ s = Series(randn(5), name='A')
+ s2 = Series(randn(5), name='B')
+
+ result = concat([s, s2], axis=1)
+ expected = DataFrame({'A': s, 'B': s2})
+ assert_frame_equal(result, expected)
+
+ s2.name = None
+ result = concat([s, s2], axis=1)
+ tm.assert_index_equal(result.columns,
+ Index(['A', 0], dtype='object'))
+
+ # must reindex, #2603
+ s = Series(randn(3), index=['c', 'a', 'b'], name='A')
+ s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B')
+ result = concat([s, s2], axis=1, sort=sort)
+ expected = DataFrame({'A': s, 'B': s2})
+ assert_frame_equal(result, expected)
+
+ def test_concat_series_axis1_names_applied(self):
+ # ensure names argument is not ignored on axis=1, #23490
+ s = Series([1, 2, 3])
+ s2 = Series([4, 5, 6])
+ result = concat([s, s2], axis=1, keys=['a', 'b'], names=['A'])
+ expected = DataFrame([[1, 4], [2, 5], [3, 6]],
+ columns=pd.Index(['a', 'b'], name='A'))
+ assert_frame_equal(result, expected)
+
+ result = concat([s, s2], axis=1, keys=[('a', 1), ('b', 2)],
+ names=['A', 'B'])
+ expected = DataFrame([[1, 4], [2, 5], [3, 6]],
+ columns=MultiIndex.from_tuples([('a', 1),
+ ('b', 2)],
+ names=['A', 'B']))
+ assert_frame_equal(result, expected)
+
+ def test_concat_single_with_key(self):
+ df = DataFrame(np.random.randn(10, 4))
+
+ result = concat([df], keys=['foo'])
+ expected = concat([df, df], keys=['foo', 'bar'])
+ tm.assert_frame_equal(result, expected[:10])
+
+ def test_concat_exclude_none(self):
+ df = DataFrame(np.random.randn(10, 4))
+
+ pieces = [df[:5], None, None, df[5:]]
+ result = concat(pieces)
+ tm.assert_frame_equal(result, df)
+ with pytest.raises(ValueError, match="All objects passed were None"):
+ concat([None, None])
+
+ def test_concat_datetime64_block(self):
+
+ rng = date_range('1/1/2000', periods=10)
+
+ df = DataFrame({'time': rng})
+
+ result = concat([df, df])
+ assert (result.iloc[:10]['time'] == rng).all()
+ assert (result.iloc[10:]['time'] == rng).all()
+
+ def test_concat_timedelta64_block(self):
+ from pandas import to_timedelta
+
+ rng = to_timedelta(np.arange(10), unit='s')
+
+ df = DataFrame({'time': rng})
+
+ result = concat([df, df])
+ assert (result.iloc[:10]['time'] == rng).all()
+ assert (result.iloc[10:]['time'] == rng).all()
+
+ def test_concat_keys_with_none(self):
+ # #1649
+ df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]])
+
+ result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0))
+ expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0))
+ tm.assert_frame_equal(result, expected)
+
+ result = concat([None, df0, df0[:2], df0[:1], df0],
+ keys=['a', 'b', 'c', 'd', 'e'])
+ expected = concat([df0, df0[:2], df0[:1], df0],
+ keys=['b', 'c', 'd', 'e'])
+ tm.assert_frame_equal(result, expected)
+
+ def test_concat_bug_1719(self):
+ ts1 = tm.makeTimeSeries()
+ ts2 = tm.makeTimeSeries()[::2]
+
+ # to join with union
+ # these two are of different length!
+ left = concat([ts1, ts2], join='outer', axis=1)
+ right = concat([ts2, ts1], join='outer', axis=1)
+
+ assert len(left) == len(right)
+
+ def test_concat_bug_2972(self):
+ ts0 = Series(np.zeros(5))
+ ts1 = Series(np.ones(5))
+ ts0.name = ts1.name = 'same name'
+ result = concat([ts0, ts1], axis=1)
+
+ expected = DataFrame({0: ts0, 1: ts1})
+ expected.columns = ['same name', 'same name']
+ assert_frame_equal(result, expected)
+
+ def test_concat_bug_3602(self):
+
+ # GH 3602, duplicate columns
+ df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'prc': [6, 6, 6, 6],
+ 'stringvar': ['rrr', 'rrr', 'rrr', 'rrr']})
+ df2 = DataFrame({'C': [9, 10, 11, 12], 'misc': [1, 2, 3, 4],
+ 'prc': [6, 6, 6, 6]})
+ expected = DataFrame([[0, 6, 'rrr', 9, 1, 6],
+ [0, 6, 'rrr', 10, 2, 6],
+ [0, 6, 'rrr', 11, 3, 6],
+ [0, 6, 'rrr', 12, 4, 6]])
+ expected.columns = ['firmNo', 'prc', 'stringvar', 'C', 'misc', 'prc']
+
+ result = concat([df1, df2], axis=1)
+ assert_frame_equal(result, expected)
+
+ def test_concat_inner_join_empty(self):
+ # GH 15328
+ df_empty = pd.DataFrame()
+ df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64')
+ df_expected = pd.DataFrame({'a': []}, index=[], dtype='int64')
+
+ for how, expected in [('inner', df_expected), ('outer', df_a)]:
+ result = pd.concat([df_a, df_empty], axis=1, join=how)
+ assert_frame_equal(result, expected)
+
+ def test_concat_series_axis1_same_names_ignore_index(self):
+ dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1]
+ s1 = Series(randn(len(dates)), index=dates, name='value')
+ s2 = Series(randn(len(dates)), index=dates, name='value')
+
+ result = concat([s1, s2], axis=1, ignore_index=True)
+ expected = Index([0, 1])
+
+ tm.assert_index_equal(result.columns, expected)
+
+ def test_concat_iterables(self):
+ # GH 8645: check concat works with tuples, lists, generators, and
+ # other iterables such as deque and custom classes
+ df1 = DataFrame([1, 2, 3])
+ df2 = DataFrame([4, 5, 6])
+ expected = DataFrame([1, 2, 3, 4, 5, 6])
+ assert_frame_equal(concat((df1, df2), ignore_index=True), expected)
+ assert_frame_equal(concat([df1, df2], ignore_index=True), expected)
+ assert_frame_equal(concat((df for df in (df1, df2)),
+ ignore_index=True), expected)
+ assert_frame_equal(
+ concat(deque((df1, df2)), ignore_index=True), expected)
+
+ class CustomIterator1(object):
+
+ def __len__(self):
+ return 2
+
+ def __getitem__(self, index):
+ try:
+ return {0: df1, 1: df2}[index]
+ except KeyError:
+ raise IndexError
+ assert_frame_equal(pd.concat(CustomIterator1(),
+ ignore_index=True), expected)
+
+ class CustomIterator2(Iterable):
+
+ def __iter__(self):
+ yield df1
+ yield df2
+ assert_frame_equal(pd.concat(CustomIterator2(),
+ ignore_index=True), expected)
+
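+ # Sketch: a plain generator also satisfies the iterable contract that
+ # test_concat_iterables exercises with fancier types above.
+ # Illustrative only.
+ def test_concat_generator_sketch(self):
+ gen = (DataFrame({'a': [i]}) for i in range(2))
+ result = concat(gen, ignore_index=True)
+ assert_frame_equal(result, DataFrame({'a': [0, 1]}))
+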
+ def test_concat_invalid(self):
+
+ # trying to concat an ndframe with a non-ndframe
+ df1 = mkdf(10, 2)
+ msg = ('cannot concatenate object of type "{}";'
+ ' only pd.Series, pd.DataFrame, and pd.Panel'
+ r' \(deprecated\) objs are valid')
+ for obj in [1, dict(), [1, 2], (1, 2)]:
+ with pytest.raises(TypeError, match=msg.format(type(obj))):
+ concat([df1, obj])
+
+ def test_concat_invalid_first_argument(self):
+ df1 = mkdf(10, 2)
+ df2 = mkdf(10, 2)
+ msg = ('first argument must be an iterable of pandas '
+ 'objects, you passed an object of type "DataFrame"')
+ with pytest.raises(TypeError, match=msg):
+ concat(df1, df2)
+
+ # generator ok though
+ concat(DataFrame(np.random.rand(5, 5)) for _ in range(3))
+
+ # text reader ok
+ # GH6583
+ data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+
+ reader = read_csv(StringIO(data), chunksize=1)
+ result = concat(reader, ignore_index=True)
+ expected = read_csv(StringIO(data))
+ assert_frame_equal(result, expected)
+
+ def test_concat_NaT_series(self):
+ # GH 11693
+ # test for merging NaT series with datetime series.
+ x = Series(date_range('20151124 08:00', '20151124 09:00',
+ freq='1h', tz='US/Eastern'))
+ y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
+ expected = Series([x[0], x[1], pd.NaT, pd.NaT])
+
+ result = concat([x, y], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+
+ # all NaT with tz
+ expected = Series(pd.NaT, index=range(4),
+ dtype='datetime64[ns, US/Eastern]')
+ result = pd.concat([y, y], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+
+ # without tz
+ x = pd.Series(pd.date_range('20151124 08:00',
+ '20151124 09:00', freq='1h'))
+ y = pd.Series(pd.date_range('20151124 10:00',
+ '20151124 11:00', freq='1h'))
+ y[:] = pd.NaT
+ expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT])
+ result = pd.concat([x, y], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+
+ # all NaT without tz
+ x[:] = pd.NaT
+ expected = pd.Series(pd.NaT, index=range(4),
+ dtype='datetime64[ns]')
+ result = pd.concat([x, y], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+
+ def test_concat_tz_frame(self):
+ df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'),
+ B=pd.Timestamp('20130603', tz='CET')),
+ index=range(5))
+
+ # concat
+ df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
+ assert_frame_equal(df2, df3)
+
+ def test_concat_tz_series(self):
+ # gh-11755: tz and no tz
+ x = Series(date_range('20151124 08:00',
+ '20151124 09:00',
+ freq='1h', tz='UTC'))
+ y = Series(date_range('2012-01-01', '2012-01-02'))
+ expected = Series([x[0], x[1], y[0], y[1]],
+ dtype='object')
+ result = concat([x, y], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+
+ # gh-11887: concat tz and object
+ x = Series(date_range('20151124 08:00',
+ '20151124 09:00',
+ freq='1h', tz='UTC'))
+ y = Series(['a', 'b'])
+ expected = Series([x[0], x[1], y[0], y[1]],
+ dtype='object')
+ result = concat([x, y], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+
+ # see gh-12217 and gh-12306
+ # Concatenating two UTC times
+ first = pd.DataFrame([[datetime(2016, 1, 1)]])
+ first[0] = first[0].dt.tz_localize('UTC')
+
+ second = pd.DataFrame([[datetime(2016, 1, 2)]])
+ second[0] = second[0].dt.tz_localize('UTC')
+
+ result = pd.concat([first, second])
+ assert result[0].dtype == 'datetime64[ns, UTC]'
+
+ # Concatenating two London times
+ first = pd.DataFrame([[datetime(2016, 1, 1)]])
+ first[0] = first[0].dt.tz_localize('Europe/London')
+
+ second = pd.DataFrame([[datetime(2016, 1, 2)]])
+ second[0] = second[0].dt.tz_localize('Europe/London')
+
+ result = pd.concat([first, second])
+ assert result[0].dtype == 'datetime64[ns, Europe/London]'
+
+ # Concatenating 2+1 London times
+ first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]])
+ first[0] = first[0].dt.tz_localize('Europe/London')
+
+ second = pd.DataFrame([[datetime(2016, 1, 3)]])
+ second[0] = second[0].dt.tz_localize('Europe/London')
+
+ result = pd.concat([first, second])
+ assert result[0].dtype == 'datetime64[ns, Europe/London]'
+
+ # Concatenating 1+2 London times
+ first = pd.DataFrame([[datetime(2016, 1, 1)]])
+ first[0] = first[0].dt.tz_localize('Europe/London')
+
+ second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]])
+ second[0] = second[0].dt.tz_localize('Europe/London')
+
+ result = pd.concat([first, second])
+ assert result[0].dtype == 'datetime64[ns, Europe/London]'
+
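+ # Sketch: a timezone shared by every input survives concat (as the
+ # UTC/London cases above show), whereas mixing aware and naive falls
+ # back to object. Illustrative only, not an upstream test.
+ def test_concat_shared_tz_kept_sketch(self):
+ x = Series([pd.Timestamp('2016-01-01', tz='UTC')])
+ y = Series([pd.Timestamp('2016-01-02', tz='UTC')])
+ result = concat([x, y], ignore_index=True)
+ assert result.dtype == 'datetime64[ns, UTC]'
+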
+ def test_concat_tz_series_with_datetimelike(self):
+ # see gh-12620: tz and timedelta
+ x = [pd.Timestamp('2011-01-01', tz='US/Eastern'),
+ pd.Timestamp('2011-02-01', tz='US/Eastern')]
+ y = [pd.Timedelta('1 day'), pd.Timedelta('2 day')]
+ result = concat([pd.Series(x), pd.Series(y)], ignore_index=True)
+ tm.assert_series_equal(result, pd.Series(x + y, dtype='object'))
+
+ # tz and period
+ y = [pd.Period('2011-03', freq='M'), pd.Period('2011-04', freq='M')]
+ result = concat([pd.Series(x), pd.Series(y)], ignore_index=True)
+ tm.assert_series_equal(result, pd.Series(x + y, dtype='object'))
+
+ def test_concat_tz_series_tzlocal(self):
+ # see gh-13583
+ x = [pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()),
+ pd.Timestamp('2011-02-01', tz=dateutil.tz.tzlocal())]
+ y = [pd.Timestamp('2012-01-01', tz=dateutil.tz.tzlocal()),
+ pd.Timestamp('2012-02-01', tz=dateutil.tz.tzlocal())]
+
+ result = concat([pd.Series(x), pd.Series(y)], ignore_index=True)
+ tm.assert_series_equal(result, pd.Series(x + y))
+ assert result.dtype == 'datetime64[ns, tzlocal()]'
+
+ @pytest.mark.parametrize('tz1', [None, 'UTC'])
+ @pytest.mark.parametrize('tz2', [None, 'UTC'])
+ @pytest.mark.parametrize('s', [pd.NaT, pd.Timestamp('20150101')])
+ def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s):
+ # GH 12396
+
+ # tz-naive
+ first = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply(
+ lambda x: x.dt.tz_localize(tz1))
+ second = pd.DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2))
+
+ result = pd.concat([first, second], axis=0)
+ expected = pd.DataFrame(pd.Series(
+ [pd.NaT, pd.NaT, s], index=[0, 1, 0]))
+ expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
+ if tz1 != tz2:
+ expected = expected.astype(object)
+
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('tz1', [None, 'UTC'])
+ @pytest.mark.parametrize('tz2', [None, 'UTC'])
+ def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
+ # GH 12396
+
+ first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1))
+ second = pd.DataFrame(pd.Series(
+ [pd.NaT]).dt.tz_localize(tz2), columns=[1])
+ expected = pd.DataFrame(
+ {0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1),
+ 1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2)}
+ )
+ result = pd.concat([first, second], axis=1)
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('tz1', [None, 'UTC'])
+ @pytest.mark.parametrize('tz2', [None, 'UTC'])
+ def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
+ # GH 12396
+
+ # tz-naive
+ first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
+ second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz2)],
+ [pd.Timestamp('2016/01/01', tz=tz2)]],
+ index=[2, 3])
+
+ expected = pd.DataFrame([pd.NaT, pd.NaT,
+ pd.Timestamp('2015/01/01', tz=tz2),
+ pd.Timestamp('2016/01/01', tz=tz2)])
+ if tz1 != tz2:
+ expected = expected.astype(object)
+
+ result = pd.concat([first, second])
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('tz', [None, 'UTC'])
+ def test_concat_NaT_dataframes(self, tz):
+ # GH 12396
+
+ first = pd.DataFrame([[pd.NaT], [pd.NaT]])
+ first = first.apply(lambda x: x.dt.tz_localize(tz))
+ second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz)],
+ [pd.Timestamp('2016/01/01', tz=tz)]],
+ index=[2, 3])
+ expected = pd.DataFrame([pd.NaT, pd.NaT,
+ pd.Timestamp('2015/01/01', tz=tz),
+ pd.Timestamp('2016/01/01', tz=tz)])
+
+ result = pd.concat([first, second], axis=0)
+ assert_frame_equal(result, expected)
+
+ def test_concat_period_series(self):
+ x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
+ y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D'))
+ expected = Series([x[0], x[1], y[0], y[1]], dtype='Period[D]')
+ result = concat([x, y], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+
+ def test_concat_period_multiple_freq_series(self):
+ x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
+ y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='M'))
+ expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
+ result = concat([x, y], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+ assert result.dtype == 'object'
+
+ def test_concat_period_other_series(self):
+ x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
+ y = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='M'))
+ expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
+ result = concat([x, y], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+ assert result.dtype == 'object'
+
+ # non-period
+ x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
+ y = Series(pd.DatetimeIndex(['2015-11-01', '2015-12-01']))
+ expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
+ result = concat([x, y], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+ assert result.dtype == 'object'
+
+ x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
+ y = Series(['A', 'B'])
+ expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
+ result = concat([x, y], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+ assert result.dtype == 'object'
+
+ def test_concat_empty_series(self):
+ # GH 11082
+ s1 = pd.Series([1, 2, 3], name='x')
+ s2 = pd.Series(name='y')
+ res = pd.concat([s1, s2], axis=1)
+ exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]})
+ tm.assert_frame_equal(res, exp)
+
+ s1 = pd.Series([1, 2, 3], name='x')
+ s2 = pd.Series(name='y')
+ res = pd.concat([s1, s2], axis=0)
+ # name will be reset
+ exp = pd.Series([1, 2, 3])
+ tm.assert_series_equal(res, exp)
+
+ # empty Series with no name
+ s1 = pd.Series([1, 2, 3], name='x')
+ s2 = pd.Series(name=None)
+ res = pd.concat([s1, s2], axis=1)
+ exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]},
+ columns=['x', 0])
+ tm.assert_frame_equal(res, exp)
+
+ @pytest.mark.parametrize('tz', [None, 'UTC'])
+ @pytest.mark.parametrize('values', [[], [1, 2, 3]])
+ def test_concat_empty_series_timelike(self, tz, values):
+ # GH 18447
+
+ first = Series([], dtype='M8[ns]').dt.tz_localize(tz)
+ second = Series(values)
+ expected = DataFrame(
+ {0: pd.Series([pd.NaT] * len(values),
+ dtype='M8[ns]'
+ ).dt.tz_localize(tz),
+ 1: values})
+ result = concat([first, second], axis=1)
+ assert_frame_equal(result, expected)
+
+ def test_default_index(self):
+ # is_series and ignore_index
+ s1 = pd.Series([1, 2, 3], name='x')
+ s2 = pd.Series([4, 5, 6], name='y')
+ res = pd.concat([s1, s2], axis=1, ignore_index=True)
+ assert isinstance(res.columns, pd.RangeIndex)
+ exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]])
+ # use check_index_type=True to check that the result has a
+ # RangeIndex (the default index)
+ tm.assert_frame_equal(res, exp, check_index_type=True,
+ check_column_type=True)
+
+ # is_series and all inputs have no names
+ s1 = pd.Series([1, 2, 3])
+ s2 = pd.Series([4, 5, 6])
+ res = pd.concat([s1, s2], axis=1, ignore_index=False)
+ assert isinstance(res.columns, pd.RangeIndex)
+ exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]])
+ exp.columns = pd.RangeIndex(2)
+ tm.assert_frame_equal(res, exp, check_index_type=True,
+ check_column_type=True)
+
+ # is_dataframe and ignore_index
+ df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]})
+ df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]})
+
+ res = pd.concat([df1, df2], axis=0, ignore_index=True)
+ exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]],
+ columns=['A', 'B'])
+ tm.assert_frame_equal(res, exp, check_index_type=True,
+ check_column_type=True)
+
+ res = pd.concat([df1, df2], axis=1, ignore_index=True)
+ exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
+ tm.assert_frame_equal(res, exp, check_index_type=True,
+ check_column_type=True)
+
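+ # Sketch: ignore_index=True discards labels along the concatenation
+ # axis and renumbers with a fresh RangeIndex, the behaviour asserted
+ # above. Illustrative only.
+ def test_ignore_index_renumbers_sketch(self):
+ s = pd.Series([1], index=['a'])
+ res = pd.concat([s, s], ignore_index=True)
+ assert isinstance(res.index, pd.RangeIndex)
+ assert list(res.index) == [0, 1]
+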
+ def test_concat_multiindex_rangeindex(self):
+ # GH13542
+ # when multi-index levels are RangeIndex objects
+ # there is a bug in concat with objects of len 1
+
+ df = DataFrame(np.random.randn(9, 2))
+ df.index = MultiIndex(levels=[pd.RangeIndex(3), pd.RangeIndex(3)],
+ codes=[np.repeat(np.arange(3), 3),
+ np.tile(np.arange(3), 3)])
+
+ res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]])
+ exp = df.iloc[[2, 3, 4, 5], :]
+ tm.assert_frame_equal(res, exp)
+
+ def test_concat_multiindex_dfs_with_deepcopy(self):
+ # GH 9967
+ from copy import deepcopy
+ example_multiindex1 = pd.MultiIndex.from_product([['a'], ['b']])
+ example_dataframe1 = pd.DataFrame([0], index=example_multiindex1)
+
+ example_multiindex2 = pd.MultiIndex.from_product([['a'], ['c']])
+ example_dataframe2 = pd.DataFrame([1], index=example_multiindex2)
+
+ example_dict = {'s1': example_dataframe1, 's2': example_dataframe2}
+ expected_index = pd.MultiIndex(levels=[['s1', 's2'],
+ ['a'],
+ ['b', 'c']],
+ codes=[[0, 1], [0, 0], [0, 1]],
+ names=['testname', None, None])
+ expected = pd.DataFrame([[0], [1]], index=expected_index)
+ result_copy = pd.concat(deepcopy(example_dict), names=['testname'])
+ tm.assert_frame_equal(result_copy, expected)
+ result_no_copy = pd.concat(example_dict, names=['testname'])
+ tm.assert_frame_equal(result_no_copy, expected)
+
+ def test_categorical_concat_append(self):
+ cat = Categorical(["a", "b"], categories=["a", "b"])
+ vals = [1, 2]
+ df = DataFrame({"cats": cat, "vals": vals})
+ cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"])
+ vals2 = [1, 2, 1, 2]
+ exp = DataFrame({"cats": cat2, "vals": vals2},
+ index=Index([0, 1, 0, 1]))
+
+ tm.assert_frame_equal(pd.concat([df, df]), exp)
+ tm.assert_frame_equal(df.append(df), exp)
+
+ # GH 13524 can concat different categories
+ cat3 = Categorical(["a", "b"], categories=["a", "b", "c"])
+ vals3 = [1, 2]
+ df_different_categories = DataFrame({"cats": cat3, "vals": vals3})
+
+ res = pd.concat([df, df_different_categories], ignore_index=True)
+ exp = DataFrame({"cats": list('abab'), "vals": [1, 2, 1, 2]})
+ tm.assert_frame_equal(res, exp)
+
+ res = df.append(df_different_categories, ignore_index=True)
+ tm.assert_frame_equal(res, exp)
+
+ def test_categorical_concat_dtypes(self):
+
+ # GH8143
+ index = ['cat', 'obj', 'num']
+ cat = Categorical(['a', 'b', 'c'])
+ obj = Series(['a', 'b', 'c'])
+ num = Series([1, 2, 3])
+ df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
+
+ result = df.dtypes == 'object'
+ expected = Series([False, True, False], index=index)
+ tm.assert_series_equal(result, expected)
+
+ result = df.dtypes == 'int64'
+ expected = Series([False, False, True], index=index)
+ tm.assert_series_equal(result, expected)
+
+ result = df.dtypes == 'category'
+ expected = Series([True, False, False], index=index)
+ tm.assert_series_equal(result, expected)
+
+ def test_categorical_concat(self, sort):
+ # See GH 10177
+ df1 = DataFrame(np.arange(18, dtype='int64').reshape(6, 3),
+ columns=["a", "b", "c"])
+
+ df2 = DataFrame(np.arange(14, dtype='int64').reshape(7, 2),
+ columns=["a", "c"])
+
+ cat_values = ["one", "one", "two", "one", "two", "two", "one"]
+ df2['h'] = Series(Categorical(cat_values))
+
+ res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort)
+ exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
+ 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan,
+ np.nan, np.nan, np.nan, np.nan],
+ 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
+ 'h': [None] * 6 + cat_values})
+ tm.assert_frame_equal(res, exp)
+
+ def test_categorical_concat_gh7864(self):
+ # GH 7864
+ # make sure ordering is preserved
+ df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list('abbaae')})
+ df["grade"] = Categorical(df["raw_grade"])
+ # set_categories returns a new Series; assign it back so the
+ # reordered categories actually take effect
+ df['grade'] = df['grade'].cat.set_categories(['e', 'a', 'b'])
+
+ df1 = df[0:3]
+ df2 = df[3:]
+
+ tm.assert_index_equal(df['grade'].cat.categories,
+ df1['grade'].cat.categories)
+ tm.assert_index_equal(df['grade'].cat.categories,
+ df2['grade'].cat.categories)
+
+ dfx = pd.concat([df1, df2])
+ tm.assert_index_equal(df['grade'].cat.categories,
+ dfx['grade'].cat.categories)
+
+ dfa = df1.append(df2)
+ tm.assert_index_equal(df['grade'].cat.categories,
+ dfa['grade'].cat.categories)
+
+ def test_categorical_concat_preserve(self):
+
+ # GH 8641 series concat not preserving category dtype
+ # GH 13524 can concat different categories
+ s = Series(list('abc'), dtype='category')
+ s2 = Series(list('abd'), dtype='category')
+
+ exp = Series(list('abcabd'))
+ res = pd.concat([s, s2], ignore_index=True)
+ tm.assert_series_equal(res, exp)
+
+ exp = Series(list('abcabc'), dtype='category')
+ res = pd.concat([s, s], ignore_index=True)
+ tm.assert_series_equal(res, exp)
+
+ exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2],
+ dtype='category')
+ res = pd.concat([s, s])
+ tm.assert_series_equal(res, exp)
+
+ a = Series(np.arange(6, dtype='int64'))
+ b = Series(list('aabbca'))
+
+ df2 = DataFrame({'A': a,
+ 'B': b.astype(CategoricalDtype(list('cab')))})
+ res = pd.concat([df2, df2])
+ exp = DataFrame(
+ {'A': pd.concat([a, a]),
+ 'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))})
+ tm.assert_frame_equal(res, exp)
+
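+ # Sketch: category dtype survives only when the full CategoricalDtype
+ # (categories and orderedness) matches on every input, the boundary
+ # the surrounding tests probe. Illustrative only.
+ def test_categorical_dtype_exact_match_sketch(self):
+ dtype = CategoricalDtype(list('ab'))
+ s = Series(list('ab'), dtype=dtype)
+ res = pd.concat([s, s], ignore_index=True)
+ assert res.dtype == dtype
+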
+ def test_categorical_index_preserved(self):
+
+ a = Series(np.arange(6, dtype='int64'))
+ b = Series(list('aabbca'))
+
+ df2 = DataFrame({'A': a,
+ 'B': b.astype(CategoricalDtype(list('cab')))
+ }).set_index('B')
+ result = pd.concat([df2, df2])
+ expected = DataFrame(
+ {'A': pd.concat([a, a]),
+ 'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))
+ }).set_index('B')
+ tm.assert_frame_equal(result, expected)
+
+ # wrong categories
+ df3 = DataFrame({'A': a, 'B': Categorical(b, categories=list('abe'))
+ }).set_index('B')
+ msg = "categories must match existing categories when appending"
+ with pytest.raises(TypeError, match=msg):
+ pd.concat([df2, df3])
+
+ def test_concat_categoricalindex(self):
+ # GH 16111, categories that aren't lexsorted
+ categories = [9, 0, 1, 2, 3]
+
+ a = pd.Series(1, index=pd.CategoricalIndex([9, 0],
+ categories=categories))
+ b = pd.Series(2, index=pd.CategoricalIndex([0, 1],
+ categories=categories))
+ c = pd.Series(3, index=pd.CategoricalIndex([1, 2],
+ categories=categories))
+
+ result = pd.concat([a, b, c], axis=1)
+
+ exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories)
+ exp = pd.DataFrame({0: [1, 1, np.nan, np.nan],
+ 1: [np.nan, 2, 2, np.nan],
+ 2: [np.nan, np.nan, 3, 3]},
+ columns=[0, 1, 2],
+ index=exp_idx)
+ tm.assert_frame_equal(result, exp)
+
+ def test_concat_order(self):
+ # GH 17344
+ dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])]
+ dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a'])
+ for i in range(100)]
+
+ result = pd.concat(dfs, sort=True).columns
+
+ if PY2:
+ # Different sort order for incomparable objects between
+ # Python 2 and Python 3 via Index.union.
+ expected = dfs[1].columns
+ else:
+ expected = dfs[0].columns
+ tm.assert_index_equal(result, expected)
+
+ def test_concat_datetime_timezone(self):
+ # GH 18523
+ idx1 = pd.date_range('2011-01-01', periods=3, freq='H',
+ tz='Europe/Paris')
+ idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H')
+ df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1)
+ df2 = pd.DataFrame({'b': [1, 2, 3]}, index=idx2)
+ result = pd.concat([df1, df2], axis=1)
+
+ exp_idx = DatetimeIndex(['2011-01-01 00:00:00+01:00',
+ '2011-01-01 01:00:00+01:00',
+ '2011-01-01 02:00:00+01:00'],
+ freq='H'
+ ).tz_convert('UTC').tz_convert('Europe/Paris')
+
+ expected = pd.DataFrame([[1, 1], [2, 2], [3, 3]],
+ index=exp_idx, columns=['a', 'b'])
+
+ tm.assert_frame_equal(result, expected)
+
+ idx3 = pd.date_range('2011-01-01', periods=3,
+ freq='H', tz='Asia/Tokyo')
+ df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3)
+ result = pd.concat([df1, df3], axis=1)
+
+ exp_idx = DatetimeIndex(['2010-12-31 15:00:00+00:00',
+ '2010-12-31 16:00:00+00:00',
+ '2010-12-31 17:00:00+00:00',
+ '2010-12-31 23:00:00+00:00',
+ '2011-01-01 00:00:00+00:00',
+ '2011-01-01 01:00:00+00:00']
+ )
+
+ expected = pd.DataFrame([[np.nan, 1], [np.nan, 2], [np.nan, 3],
+ [1, np.nan], [2, np.nan], [3, np.nan]],
+ index=exp_idx, columns=['a', 'b'])
+
+ tm.assert_frame_equal(result, expected)
+
+ # GH 13783: Concat after resample
+ result = pd.concat([df1.resample('H').mean(),
+ df2.resample('H').mean()], sort=True)
+ expected = pd.DataFrame({'a': [1, 2, 3] + [np.nan] * 3,
+ 'b': [np.nan] * 3 + [1, 2, 3]},
+ index=idx1.append(idx1))
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.skipif(PY2, reason="Unhashable Decimal dtype")
+ def test_concat_different_extension_dtypes_upcasts(self):
+ a = pd.Series(pd.core.arrays.integer_array([1, 2]))
+ b = pd.Series(to_decimal([1, 2]))
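+
+ # The integer extension array and the decimal extension array share
+ # no common dtype, so concat falls back to object and keeps the
+ # original scalars.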
+
+ result = pd.concat([a, b], ignore_index=True)
+ expected = pd.Series([
+ 1, 2,
+ Decimal(1), Decimal(2)
+ ], dtype=object)
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
+@pytest.mark.parametrize('dt', np.sctypes['float'])
+@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+def test_concat_no_unnecessary_upcast(dt, pdt):
+ # GH 13247
+ dims = pdt().ndim
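+ # ndmin=dims shapes each block to match the container (1-d for
+ # Series, 2-d for DataFrame, 3-d for Panel); every block already has
+ # float dtype `dt`, so concat must not upcast.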
+ dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)),
+ pdt(np.array([np.nan], dtype=dt, ndmin=dims)),
+ pdt(np.array([5], dtype=dt, ndmin=dims))]
+ x = pd.concat(dfs)
+ assert x.values.dtype == dt
+
+
+@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
+@pytest.mark.parametrize('dt', np.sctypes['int'])
+@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+def test_concat_will_upcast(dt, pdt):
+ with catch_warnings(record=True):
+ dims = pdt().ndim
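+ # The NaN block is created without an explicit dtype and therefore
+ # defaults to float64, forcing the integer blocks to upcast.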
+ dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)),
+ pdt(np.array([np.nan], ndmin=dims)),
+ pdt(np.array([5], dtype=dt, ndmin=dims))]
+ x = pd.concat(dfs)
+ assert x.values.dtype == 'float64'
+
+
+def test_concat_empty_and_non_empty_frame_regression():
+ # GH 18178 regression test
+ df1 = pd.DataFrame({'foo': [1]})
+ df2 = pd.DataFrame({'foo': []})
+ expected = pd.DataFrame({'foo': [1.0]})
+ result = pd.concat([df1, df2])
+ assert_frame_equal(result, expected)
+
+
+def test_concat_empty_and_non_empty_series_regression():
+ # GH 18187 regression test
+ s1 = pd.Series([1])
+ s2 = pd.Series([])
+ expected = s1
+ result = pd.concat([s1, s2])
+ tm.assert_series_equal(result, expected)
+
+
+def test_concat_sorts_columns(sort_with_none):
+ # GH-4588
+ df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a'])
+ df2 = pd.DataFrame({"a": [3, 4], "c": [5, 6]})
+
+ # for sort=True/None
+ expected = pd.DataFrame({"a": [1, 2, 3, 4],
+ "b": [1, 2, None, None],
+ "c": [None, None, 5, 6]},
+ columns=['a', 'b', 'c'])
+
+ if sort_with_none is False:
+ expected = expected[['b', 'a', 'c']]
+
+ if sort_with_none is None:
+ # only warn if not explicitly specified
+ ctx = tm.assert_produces_warning(FutureWarning)
+ else:
+ ctx = tm.assert_produces_warning(None)
+
+ # default
+ with ctx:
+ result = pd.concat([df1, df2], ignore_index=True, sort=sort_with_none)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_concat_sorts_index(sort_with_none):
+ df1 = pd.DataFrame({"a": [1, 2, 3]}, index=['c', 'a', 'b'])
+ df2 = pd.DataFrame({"b": [1, 2]}, index=['a', 'b'])
+
+ # For True/None
+ expected = pd.DataFrame({"a": [2, 3, 1], "b": [1, 2, None]},
+ index=['a', 'b', 'c'],
+ columns=['a', 'b'])
+ if sort_with_none is False:
+ expected = expected.loc[['c', 'a', 'b']]
+
+ if sort_with_none is None:
+ # only warn if not explicitly specified
+ ctx = tm.assert_produces_warning(FutureWarning)
+ else:
+ ctx = tm.assert_produces_warning(None)
+
+ # Warn and sort by default
+ with ctx:
+ result = pd.concat([df1, df2], axis=1, sort=sort_with_none)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_concat_inner_sort(sort_with_none):
+ # https://github.com/pandas-dev/pandas/pull/20613
+ df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]},
+ columns=['b', 'a', 'c'])
+ df2 = pd.DataFrame({"a": [1, 2], 'b': [3, 4]}, index=[3, 4])
+
+ with tm.assert_produces_warning(None):
+ # unset sort should *not* warn for inner join
+ # since that never sorted
+ result = pd.concat([df1, df2], sort=sort_with_none,
+ join='inner',
+ ignore_index=True)
+
+ expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]},
+ columns=['b', 'a'])
+ if sort_with_none is True:
+ expected = expected[['a', 'b']]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_concat_aligned_sort():
+ # GH-4588
+ df = pd.DataFrame({"c": [1, 2], "b": [3, 4], 'a': [5, 6]},
+ columns=['c', 'b', 'a'])
+ result = pd.concat([df, df], sort=True, ignore_index=True)
+ expected = pd.DataFrame({'a': [5, 6, 5, 6], 'b': [3, 4, 3, 4],
+ 'c': [1, 2, 1, 2]},
+ columns=['a', 'b', 'c'])
+ tm.assert_frame_equal(result, expected)
+
+ result = pd.concat([df, df[['c', 'b']]], join='inner', sort=True,
+ ignore_index=True)
+ expected = expected[['b', 'c']]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_concat_aligned_sort_does_not_raise():
+ # GH-4588
+ # We catch TypeErrors from sorting internally and do not re-raise.
+ df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, 'a'])
+ expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]},
+ columns=[1, 'a'])
+ result = pd.concat([df, df], ignore_index=True, sort=True)
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("s1name,s2name", [
+ (np.int64(190), (43, 0)), (190, (43, 0))])
+def test_concat_series_name_npscalar_tuple(s1name, s2name):
+ # GH21015
+ s1 = pd.Series({'a': 1, 'b': 2}, name=s1name)
+ s2 = pd.Series({'c': 5, 'd': 6}, name=s2name)
+ result = pd.concat([s1, s2])
+ expected = pd.Series({'a': 1, 'b': 2, 'c': 5, 'd': 6})
+ tm.assert_series_equal(result, expected)
+
+
+def test_concat_categorical_tz():
+ # GH-23816
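+ # Mixing tz-aware datetimes with a categorical has no common dtype,
+ # so the result is object dtype holding the original scalars.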
+ a = pd.Series(pd.date_range('2017-01-01', periods=2, tz='US/Pacific'))
+ b = pd.Series(['a', 'b'], dtype='category')
+ result = pd.concat([a, b], ignore_index=True)
+ expected = pd.Series([
+ pd.Timestamp('2017-01-01', tz="US/Pacific"),
+ pd.Timestamp('2017-01-02', tz="US/Pacific"),
+ 'a', 'b'
+ ])
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/test_cut.py b/contrib/python/pandas/py2/pandas/tests/reshape/test_cut.py
new file mode 100644
index 00000000000..6833460fa51
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/test_cut.py
@@ -0,0 +1,458 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, DatetimeIndex, Index, Interval, IntervalIndex,
+ Series, TimedeltaIndex, Timestamp, cut, date_range, isna, qcut,
+ timedelta_range, to_datetime)
+from pandas.api.types import CategoricalDtype as CDT
+import pandas.core.reshape.tile as tmod
+import pandas.util.testing as tm
+
+
+def test_simple():
+ data = np.ones(5, dtype="int64")
+ result = cut(data, 4, labels=False)
+
+ expected = np.array([1, 1, 1, 1, 1])
+ tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
+
+def test_bins():
+ data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1])
+ result, bins = cut(data, 3, retbins=True)
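+
+ # cut pads the leftmost edge down by 0.1% of the data range
+ # ((9.7 - 0.2) * 0.001 = 0.0095) so the minimum lands inside the
+ # first right-closed bin, hence the 0.1905 below instead of 0.2.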
+
+ intervals = IntervalIndex.from_breaks(bins.round(3))
+ intervals = intervals.take([0, 0, 0, 1, 2, 0])
+ expected = Categorical(intervals, ordered=True)
+
+ tm.assert_categorical_equal(result, expected)
+ tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
+ 6.53333333, 9.7]))
+
+
+def test_right():
+ data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
+ result, bins = cut(data, 4, right=True, retbins=True)
+
+ intervals = IntervalIndex.from_breaks(bins.round(3))
+ expected = Categorical(intervals, ordered=True)
+ expected = expected.take([0, 0, 0, 2, 3, 0, 0])
+
+ tm.assert_categorical_equal(result, expected)
+ tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7]))
+
+
+def test_no_right():
+ data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
+ result, bins = cut(data, 4, right=False, retbins=True)
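+
+ # With right=False the bins are left-closed, so the padding moves to
+ # the rightmost edge instead: 9.7 + 0.0095 = 9.7095 below.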
+
+ intervals = IntervalIndex.from_breaks(bins.round(3), closed="left")
+ intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
+ expected = Categorical(intervals, ordered=True)
+
+ tm.assert_categorical_equal(result, expected)
+ tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))
+
+
+def test_array_like():
+ data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
+ result, bins = cut(data, 3, retbins=True)
+
+ intervals = IntervalIndex.from_breaks(bins.round(3))
+ intervals = intervals.take([0, 0, 0, 1, 2, 0])
+ expected = Categorical(intervals, ordered=True)
+
+ tm.assert_categorical_equal(result, expected)
+ tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
+ 6.53333333, 9.7]))
+
+
+def test_bins_from_interval_index():
+ c = cut(range(5), 3)
+ expected = c
+ result = cut(range(5), bins=expected.categories)
+ tm.assert_categorical_equal(result, expected)
+
+ expected = Categorical.from_codes(np.append(c.codes, -1),
+ categories=c.categories,
+ ordered=True)
+ result = cut(range(6), bins=expected.categories)
+ tm.assert_categorical_equal(result, expected)
+
+
+def test_bins_from_interval_index_doc_example():
+ # Make sure we preserve the bins.
+ ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
+ c = cut(ages, bins=[0, 18, 35, 70])
+ expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
+ tm.assert_index_equal(c.categories, expected)
+
+ result = cut([25, 20, 50], bins=c.categories)
+ tm.assert_index_equal(result.categories, expected)
+ tm.assert_numpy_array_equal(result.codes,
+ np.array([1, 1, 2], dtype="int8"))
+
+
+def test_bins_not_overlapping_from_interval_index():
+ # see gh-23980
+ msg = "Overlapping IntervalIndex is not accepted"
+ ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])
+
+ with pytest.raises(ValueError, match=msg):
+ cut([5, 6], bins=ii)
+
+
+def test_bins_not_monotonic():
+ msg = "bins must increase monotonically"
+ data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
+
+ with pytest.raises(ValueError, match=msg):
+ cut(data, [0.1, 1.5, 1, 10])
+
+
+def test_wrong_num_labels():
+ msg = "Bin labels must be one fewer than the number of bin edges"
+ data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
+
+ with pytest.raises(ValueError, match=msg):
+ cut(data, [0, 1, 10], labels=["foo", "bar", "baz"])
+
+
[email protected]("x,bins,msg", [
+ ([], 2, "Cannot cut empty array"),
+ ([1, 2, 3], 0.5, "`bins` should be a positive integer")
+])
+def test_cut_corner(x, bins, msg):
+ with pytest.raises(ValueError, match=msg):
+ cut(x, bins)
+
+
[email protected]("arg", [2, np.eye(2), DataFrame(np.eye(2))])
[email protected]("cut_func", [cut, qcut])
+def test_cut_not_1d_arg(arg, cut_func):
+ msg = "Input array must be 1 dimensional"
+ with pytest.raises(ValueError, match=msg):
+ cut_func(arg, 2)
+
+
+
+@pytest.mark.parametrize("data", [
+ [0, 1, 2, 3, 4, np.inf],
+ [-np.inf, 0, 1, 2, 3, 4],
+ [-np.inf, 0, 1, 2, 3, 4, np.inf]])
+def test_int_bins_with_inf(data):
+ # GH 24314
+ msg = 'cannot specify integer `bins` when input data contains infinity'
+ with pytest.raises(ValueError, match=msg):
+ cut(data, bins=3)
+
+
+def test_cut_out_of_range_more():
+ # see gh-1511
+ name = "x"
+
+ ser = Series([0, -1, 0, 1, -3], name=name)
+ ind = cut(ser, [0, 1], labels=False)
+
+ exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name=name)
+ tm.assert_series_equal(ind, exp)
+
+
[email protected]("right,breaks,closed", [
+ (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"),
+ (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left")
+])
+def test_labels(right, breaks, closed):
+ arr = np.tile(np.arange(0, 1.01, 0.1), 4)
+
+ result, bins = cut(arr, 4, retbins=True, right=right)
+ ex_levels = IntervalIndex.from_breaks(breaks, closed=closed)
+ tm.assert_index_equal(result.categories, ex_levels)
+
+
+def test_cut_pass_series_name_to_factor():
+ name = "foo"
+ ser = Series(np.random.randn(100), name=name)
+
+ factor = cut(ser, 4)
+ assert factor.name == name
+
+
+def test_label_precision():
+ arr = np.arange(0, 0.73, 0.01)
+ result = cut(arr, 4, precision=2)
+
+ ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72])
+ tm.assert_index_equal(result.categories, ex_levels)
+
+
[email protected]("labels", [None, False])
+def test_na_handling(labels):
+ arr = np.arange(0, 0.75, 0.01)
+ arr[::3] = np.nan
+
+ result = cut(arr, 4, labels=labels)
+ result = np.asarray(result)
+
+ expected = np.where(isna(arr), np.nan, result)
+ tm.assert_almost_equal(result, expected)
+
+
+def test_inf_handling():
+ data = np.arange(6)
+ data_ser = Series(data, dtype="int64")
+
+ bins = [-np.inf, 2, 4, np.inf]
+ result = cut(data, bins)
+ result_ser = cut(data_ser, bins)
+
+ ex_uniques = IntervalIndex.from_breaks(bins)
+ tm.assert_index_equal(result.categories, ex_uniques)
+
+ assert result[5] == Interval(4, np.inf)
+ assert result[0] == Interval(-np.inf, 2)
+ assert result_ser[5] == Interval(4, np.inf)
+ assert result_ser[0] == Interval(-np.inf, 2)
+
+
+def test_cut_out_of_bounds():
+ arr = np.random.randn(100)
+ result = cut(arr, [-1, 0, 1])
+
+ mask = isna(result)
+ ex_mask = (arr < -1) | (arr > 1)
+ tm.assert_numpy_array_equal(mask, ex_mask)
+
+
[email protected]("get_labels,get_expected", [
+ (lambda labels: labels,
+ lambda labels: Categorical(["Medium"] + 4 * ["Small"] +
+ ["Medium", "Large"],
+ categories=labels, ordered=True)),
+ (lambda labels: Categorical.from_codes([0, 1, 2], labels),
+ lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels))
+])
+def test_cut_pass_labels(get_labels, get_expected):
+ bins = [0, 25, 50, 100]
+ arr = [50, 5, 10, 15, 20, 30, 70]
+ labels = ["Small", "Medium", "Large"]
+
+ result = cut(arr, bins, labels=get_labels(labels))
+ tm.assert_categorical_equal(result, get_expected(labels))
+
+
+def test_cut_pass_labels_compat():
+ # see gh-16459
+ arr = [50, 5, 10, 15, 20, 30, 70]
+ labels = ["Good", "Medium", "Bad"]
+
+ result = cut(arr, 3, labels=labels)
+ exp = cut(arr, 3, labels=Categorical(labels, categories=labels,
+ ordered=True))
+ tm.assert_categorical_equal(result, exp)
+
+
[email protected]("x", [np.arange(11.), np.arange(11.) / 1e10])
+def test_round_frac_just_works(x):
+ # It works.
+ cut(x, 2)
+
+
[email protected]("val,precision,expected", [
+ (-117.9998, 3, -118),
+ (117.9998, 3, 118),
+ (117.9998, 2, 118),
+ (0.000123456, 2, 0.00012)
+])
+def test_round_frac(val, precision, expected):
+ # see gh-1979
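+ # For |val| < 1 the rounding digits are extended past the leading
+ # zeros, so roughly `precision` significant digits survive
+ # (0.000123456 -> 0.00012 for precision=2).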
+ result = tmod._round_frac(val, precision=precision)
+ assert result == expected
+
+
+def test_cut_return_intervals():
+ ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
+ result = cut(ser, 3)
+
+ exp_bins = np.linspace(0, 8, num=4).round(3)
+ exp_bins[0] -= 0.008
+
+ expected = Series(IntervalIndex.from_breaks(exp_bins, closed="right").take(
+ [0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True))
+ tm.assert_series_equal(result, expected)
+
+
+def test_series_ret_bins():
+ # see gh-8589
+ ser = Series(np.arange(4))
+ result, bins = cut(ser, 2, retbins=True)
+
+ expected = Series(IntervalIndex.from_breaks(
+ [-0.003, 1.5, 3], closed="right").repeat(2)).astype(CDT(ordered=True))
+ tm.assert_series_equal(result, expected)
+
+
[email protected]("kwargs,msg", [
+ (dict(duplicates="drop"), None),
+ (dict(), "Bin edges must be unique"),
+ (dict(duplicates="raise"), "Bin edges must be unique"),
+ (dict(duplicates="foo"), "invalid value for 'duplicates' parameter")
+])
+def test_cut_duplicates_bin(kwargs, msg):
+ # see gh-20947
+ bins = [0, 2, 4, 6, 10, 10]
+ values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])
+
+ if msg is not None:
+ with pytest.raises(ValueError, match=msg):
+ cut(values, bins, **kwargs)
+ else:
+ result = cut(values, bins, **kwargs)
+ expected = cut(values, pd.unique(bins))
+ tm.assert_series_equal(result, expected)
+
+
[email protected]("data", [9.0, -9.0, 0.0])
[email protected]("length", [1, 2])
+def test_single_bin(data, length):
+ # see gh-14652, gh-15428
+ ser = Series([data] * length)
+ result = cut(ser, 1, labels=False)
+
+ expected = Series([0] * length)
+ tm.assert_series_equal(result, expected)
+
+
+ "array_1_writeable,array_2_writeable",
+ [(True, True), (True, False), (False, False)])
+def test_cut_read_only(array_1_writeable, array_2_writeable):
+ # issue 18773
+ array_1 = np.arange(0, 100, 10)
+ array_1.flags.writeable = array_1_writeable
+
+ array_2 = np.arange(0, 100, 10)
+ array_2.flags.writeable = array_2_writeable
+
+ hundred_elements = np.arange(100)
+ tm.assert_categorical_equal(cut(hundred_elements, array_1),
+ cut(hundred_elements, array_2))
+
+
+@pytest.mark.parametrize("conv", [
+ lambda v: Timestamp(v),
+ lambda v: to_datetime(v),
+ lambda v: np.datetime64(v),
+ lambda v: Timestamp(v).to_pydatetime(),
+])
+def test_datetime_bin(conv):
+ data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")]
+ bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"]
+
+ expected = Series(IntervalIndex([
+ Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
+ Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))])).astype(
+ CDT(ordered=True))
+
+ bins = [conv(v) for v in bin_data]
+ result = Series(cut(data, bins=bins))
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("data", [
+ to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])),
+ [np.datetime64("2013-01-01"), np.datetime64("2013-01-02"),
+ np.datetime64("2013-01-03")],
+ np.array([np.datetime64("2013-01-01"), np.datetime64("2013-01-02"),
+ np.datetime64("2013-01-03")]),
+ DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"])
+])
+def test_datetime_cut(data):
+ # see gh-14714
+ #
+ # Testing time data when it comes in various collection types.
+ result, _ = cut(data, 3, retbins=True)
+ expected = Series(IntervalIndex([
+ Interval(Timestamp("2012-12-31 23:57:07.200000"),
+ Timestamp("2013-01-01 16:00:00")),
+ Interval(Timestamp("2013-01-01 16:00:00"),
+ Timestamp("2013-01-02 08:00:00")),
+ Interval(Timestamp("2013-01-02 08:00:00"),
+ Timestamp("2013-01-03 00:00:00"))])).astype(CDT(ordered=True))
+ tm.assert_series_equal(Series(result), expected)
+
+
+@pytest.mark.parametrize("bins", [
+ 3, [Timestamp("2013-01-01 04:57:07.200000"),
+ Timestamp("2013-01-01 21:00:00"),
+ Timestamp("2013-01-02 13:00:00"),
+ Timestamp("2013-01-03 05:00:00")]])
[email protected]("box", [list, np.array, Index, Series])
+def test_datetime_tz_cut(bins, box):
+ # see gh-19872
+ tz = "US/Eastern"
+ s = Series(date_range("20130101", periods=3, tz=tz))
+
+ if not isinstance(bins, int):
+ bins = box(bins)
+
+ result = cut(s, bins)
+ expected = Series(IntervalIndex([
+ Interval(Timestamp("2012-12-31 23:57:07.200000", tz=tz),
+ Timestamp("2013-01-01 16:00:00", tz=tz)),
+ Interval(Timestamp("2013-01-01 16:00:00", tz=tz),
+ Timestamp("2013-01-02 08:00:00", tz=tz)),
+ Interval(Timestamp("2013-01-02 08:00:00", tz=tz),
+ Timestamp("2013-01-03 00:00:00", tz=tz))])).astype(
+ CDT(ordered=True))
+ tm.assert_series_equal(result, expected)
+
+
+def test_datetime_nan_error():
+ msg = "bins must be of datetime64 dtype"
+
+ with pytest.raises(ValueError, match=msg):
+ cut(date_range("20130101", periods=3), bins=[0, 2, 4])
+
+
+def test_datetime_nan_mask():
+ result = cut(date_range("20130102", periods=5),
+ bins=date_range("20130101", periods=2))
+
+ mask = result.categories.isna()
+ tm.assert_numpy_array_equal(mask, np.array([False]))
+
+ mask = result.isna()
+ tm.assert_numpy_array_equal(mask, np.array([False, True, True,
+ True, True]))
+
+
[email protected]("tz", [None, "UTC", "US/Pacific"])
+def test_datetime_cut_roundtrip(tz):
+ # see gh-19891
+ ser = Series(date_range("20180101", periods=3, tz=tz))
+ result, result_bins = cut(ser, 2, retbins=True)
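+
+ # The 2-day span is padded down by 0.1% (172.8 seconds), which is
+ # where the 2017-12-31 23:57:07.200000 edge below comes from.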
+
+ expected = cut(ser, result_bins)
+ tm.assert_series_equal(result, expected)
+
+ expected_bins = DatetimeIndex(["2017-12-31 23:57:07.200000",
+ "2018-01-02 00:00:00",
+ "2018-01-03 00:00:00"])
+ expected_bins = expected_bins.tz_localize(tz)
+ tm.assert_index_equal(result_bins, expected_bins)
+
+
+def test_timedelta_cut_roundtrip():
+ # see gh-19891
+ ser = Series(timedelta_range("1day", periods=3))
+ result, result_bins = cut(ser, 2, retbins=True)
+
+ expected = cut(ser, result_bins)
+ tm.assert_series_equal(result, expected)
+
+ expected_bins = TimedeltaIndex(["0 days 23:57:07.200000",
+ "2 days 00:00:00",
+ "3 days 00:00:00"])
+ tm.assert_index_equal(result_bins, expected_bins)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/test_melt.py b/contrib/python/pandas/py2/pandas/tests/reshape/test_melt.py
new file mode 100644
index 00000000000..6bd1958633e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/test_melt.py
@@ -0,0 +1,718 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=W0612,E1101
+
+import numpy as np
+from numpy import nan
+import pytest
+
+from pandas.compat import range
+
+import pandas as pd
+from pandas import DataFrame, lreshape, melt, wide_to_long
+import pandas.util.testing as tm
+
+
+class TestMelt(object):
+
+ def setup_method(self, method):
+ self.df = tm.makeTimeDataFrame()[:10]
+ self.df['id1'] = (self.df['A'] > 0).astype(np.int64)
+ self.df['id2'] = (self.df['B'] > 0).astype(np.int64)
+
+ self.var_name = 'var'
+ self.value_name = 'val'
+
+ self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867
+ ], [-1.321405, 0.368915, -1.055342],
+ [-0.807333, 0.08298, -0.873361]])
+ self.df1.columns = [list('ABC'), list('abc')]
+ self.df1.columns.names = ['CAP', 'low']
+
+ def test_top_level_method(self):
+ result = melt(self.df)
+ assert result.columns.tolist() == ['variable', 'value']
+
+ def test_method_signatures(self):
+ tm.assert_frame_equal(self.df.melt(),
+ melt(self.df))
+
+ tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'],
+ value_vars=['A', 'B']),
+ melt(self.df,
+ id_vars=['id1', 'id2'],
+ value_vars=['A', 'B']))
+
+ tm.assert_frame_equal(self.df.melt(var_name=self.var_name,
+ value_name=self.value_name),
+ melt(self.df,
+ var_name=self.var_name,
+ value_name=self.value_name))
+
+ tm.assert_frame_equal(self.df1.melt(col_level=0),
+ melt(self.df1, col_level=0))
+
+ def test_default_col_names(self):
+ result = self.df.melt()
+ assert result.columns.tolist() == ['variable', 'value']
+
+ result1 = self.df.melt(id_vars=['id1'])
+ assert result1.columns.tolist() == ['id1', 'variable', 'value']
+
+ result2 = self.df.melt(id_vars=['id1', 'id2'])
+ assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value']
+
+ def test_value_vars(self):
+ result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A')
+ assert len(result3) == 10
+
+ result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'])
+ expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2,
+ 'id2': self.df['id2'].tolist() * 2,
+ 'variable': ['A'] * 10 + ['B'] * 10,
+ 'value': (self.df['A'].tolist() +
+ self.df['B'].tolist())},
+ columns=['id1', 'id2', 'variable', 'value'])
+ tm.assert_frame_equal(result4, expected4)
+
+ def test_value_vars_types(self):
+ # GH 15348
+ expected = DataFrame({'id1': self.df['id1'].tolist() * 2,
+ 'id2': self.df['id2'].tolist() * 2,
+ 'variable': ['A'] * 10 + ['B'] * 10,
+ 'value': (self.df['A'].tolist() +
+ self.df['B'].tolist())},
+ columns=['id1', 'id2', 'variable', 'value'])
+
+ for type_ in (tuple, list, np.array):
+ result = self.df.melt(id_vars=['id1', 'id2'],
+ value_vars=type_(('A', 'B')))
+ tm.assert_frame_equal(result, expected)
+
+ def test_vars_work_with_multiindex(self):
+ expected = DataFrame({
+ ('A', 'a'): self.df1[('A', 'a')],
+ 'CAP': ['B'] * len(self.df1),
+ 'low': ['b'] * len(self.df1),
+ 'value': self.df1[('B', 'b')],
+ }, columns=[('A', 'a'), 'CAP', 'low', 'value'])
+
+ result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')])
+ tm.assert_frame_equal(result, expected)
+
+ def test_single_vars_work_with_multiindex(self):
+ expected = DataFrame({
+ 'A': {0: 1.067683, 1: -1.321405, 2: -0.807333},
+ 'CAP': {0: 'B', 1: 'B', 2: 'B'},
+ 'value': {0: -1.110463, 1: 0.368915, 2: 0.08298}})
+ result = self.df1.melt(['A'], ['B'], col_level=0)
+ tm.assert_frame_equal(result, expected)
+
+ def test_tuple_vars_fail_with_multiindex(self):
+ # melt should fail with an informative error message if
+ # the columns have a MultiIndex and a tuple is passed
+ # for id_vars or value_vars.
+ tuple_a = ('A', 'a')
+ list_a = [tuple_a]
+ tuple_b = ('B', 'b')
+ list_b = [tuple_b]
+
+ msg = (r"(id|value)_vars must be a list of tuples when columns are"
+ " a MultiIndex")
+ for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b),
+ (tuple_a, tuple_b)):
+ with pytest.raises(ValueError, match=msg):
+ self.df1.melt(id_vars=id_vars, value_vars=value_vars)
+
+ def test_custom_var_name(self):
+ result5 = self.df.melt(var_name=self.var_name)
+ assert result5.columns.tolist() == ['var', 'value']
+
+ result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name)
+ assert result6.columns.tolist() == ['id1', 'var', 'value']
+
+ result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name)
+ assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value']
+
+ result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
+ var_name=self.var_name)
+ assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value']
+
+ result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
+ var_name=self.var_name)
+ expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2,
+ 'id2': self.df['id2'].tolist() * 2,
+ self.var_name: ['A'] * 10 + ['B'] * 10,
+ 'value': (self.df['A'].tolist() +
+ self.df['B'].tolist())},
+ columns=['id1', 'id2', self.var_name, 'value'])
+ tm.assert_frame_equal(result9, expected9)
+
+ def test_custom_value_name(self):
+ result10 = self.df.melt(value_name=self.value_name)
+ assert result10.columns.tolist() == ['variable', 'val']
+
+ result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name)
+ assert result11.columns.tolist() == ['id1', 'variable', 'val']
+
+ result12 = self.df.melt(id_vars=['id1', 'id2'],
+ value_name=self.value_name)
+ assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val']
+
+ result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
+ value_name=self.value_name)
+ assert result13.columns.tolist() == ['id1', 'id2', 'variable', 'val']
+
+ result14 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
+ value_name=self.value_name)
+ expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2,
+ 'id2': self.df['id2'].tolist() * 2,
+ 'variable': ['A'] * 10 + ['B'] * 10,
+ self.value_name: (self.df['A'].tolist() +
+ self.df['B'].tolist())},
+ columns=['id1', 'id2', 'variable',
+ self.value_name])
+ tm.assert_frame_equal(result14, expected14)
+
+ def test_custom_var_and_value_name(self):
+
+ result15 = self.df.melt(var_name=self.var_name,
+ value_name=self.value_name)
+ assert result15.columns.tolist() == ['var', 'val']
+
+ result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name,
+ value_name=self.value_name)
+ assert result16.columns.tolist() == ['id1', 'var', 'val']
+
+ result17 = self.df.melt(id_vars=['id1', 'id2'],
+ var_name=self.var_name,
+ value_name=self.value_name)
+ assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val']
+
+ result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
+ var_name=self.var_name,
+ value_name=self.value_name)
+ assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val']
+
+ result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
+ var_name=self.var_name,
+ value_name=self.value_name)
+ expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2,
+ 'id2': self.df['id2'].tolist() * 2,
+ self.var_name: ['A'] * 10 + ['B'] * 10,
+ self.value_name: (self.df['A'].tolist() +
+ self.df['B'].tolist())},
+ columns=['id1', 'id2', self.var_name,
+ self.value_name])
+ tm.assert_frame_equal(result19, expected19)
+
+ df20 = self.df.copy()
+ df20.columns.name = 'foo'
+ result20 = df20.melt()
+ assert result20.columns.tolist() == ['foo', 'value']
+
+ def test_col_level(self):
+ res1 = self.df1.melt(col_level=0)
+ res2 = self.df1.melt(col_level='CAP')
+ assert res1.columns.tolist() == ['CAP', 'value']
+ assert res2.columns.tolist() == ['CAP', 'value']
+
+ def test_multiindex(self):
+ res = self.df1.melt()
+ assert res.columns.tolist() == ['CAP', 'low', 'value']
+
+ @pytest.mark.parametrize("col", [
+ pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')),
+ pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
+ pd.Series([0, 1, 0, 0, 0])])
+ def test_pandas_dtypes(self, col):
+ # GH 15785
+ df = DataFrame({'klass': range(5),
+ 'col': col,
+ 'attr1': [1, 0, 0, 0, 0],
+ 'attr2': col})
+ expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col],
+ ignore_index=True)
+ result = melt(df, id_vars=['klass', 'col'], var_name='attribute',
+ value_name='value')
+ expected = DataFrame({0: list(range(5)) * 2,
+ 1: pd.concat([col] * 2, ignore_index=True),
+ 2: ['attr1'] * 5 + ['attr2'] * 5,
+ 3: expected_value})
+ expected.columns = ['klass', 'col', 'attribute', 'value']
+ tm.assert_frame_equal(result, expected)
+
+ def test_melt_missing_columns_raises(self):
+ # GH-23575
+ # This test is to ensure that pandas raises an error if melting is
+ # attempted with column names absent from the dataframe
+
+ # Generate data
+ df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd'))
+
+ # Try to melt with missing `value_vars` column name
+ msg = "The following '{Var}' are not present in the DataFrame: {Col}"
+ with pytest.raises(
+ KeyError,
+ match=msg.format(Var='value_vars', Col="\\['C'\\]")):
+ df.melt(['a', 'b'], ['C', 'd'])
+
+ # Try to melt with missing `id_vars` column name
+ with pytest.raises(
+ KeyError,
+ match=msg.format(Var='id_vars', Col="\\['A'\\]")):
+ df.melt(['A', 'b'], ['c', 'd'])
+
+ # Multiple missing
+ with pytest.raises(
+ KeyError,
+ match=msg.format(Var='id_vars',
+ Col="\\['not_here', 'or_there'\\]")):
+ df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd'])
+
+ # Multiindex melt fails if column is missing from multilevel melt
+ multi = df.copy()
+ multi.columns = [list('ABCD'), list('abcd')]
+ with pytest.raises(
+ KeyError,
+ match=msg.format(Var='id_vars',
+ Col="\\['E'\\]")):
+ multi.melt([('E', 'a')], [('B', 'b')])
+ # Multiindex fails if column is missing from single level melt
+ with pytest.raises(
+ KeyError,
+ match=msg.format(Var='value_vars',
+ Col="\\['F'\\]")):
+ multi.melt(['A'], ['F'], col_level=0)
+
+
+class TestLreshape(object):
+
+ def test_pairs(self):
+ data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
+ '11jan2009'],
+ 'birthwt': [1766, 3301, 1454, 3139, 4133],
+ 'id': [101, 102, 103, 104, 105],
+ 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
+ 'visitdt1': ['11jan2009', '22dec2008', '04jan2009',
+ '29dec2008', '20jan2009'],
+ 'visitdt2':
+ ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'],
+ 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
+ 'wt1': [1823, 3338, 1549, 3298, 4306],
+ 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
+ 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]}
+
+ df = DataFrame(data)
+
+ spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
+ 'wt': ['wt%d' % i for i in range(1, 4)]}
+ result = lreshape(df, spec)
+
+ exp_data = {'birthdt':
+ ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
+ '11jan2009', '08jan2009', '30dec2008', '21dec2008',
+ '11jan2009', '08jan2009', '21dec2008', '11jan2009'],
+ 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139,
+ 4133, 1766, 3139, 4133],
+ 'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101,
+ 104, 105],
+ 'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
+ 'Male', 'Female', 'Female', 'Female', 'Male',
+ 'Female', 'Female'],
+ 'visitdt': ['11jan2009', '22dec2008', '04jan2009',
+ '29dec2008', '20jan2009', '21jan2009',
+ '22jan2009', '31dec2008', '03feb2009',
+ '05feb2009', '02jan2009', '15feb2009'],
+ 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
+ 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]}
+ exp = DataFrame(exp_data, columns=result.columns)
+ tm.assert_frame_equal(result, exp)
+
+ result = lreshape(df, spec, dropna=False)
+ exp_data = {'birthdt':
+ ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
+ '11jan2009', '08jan2009', '20dec2008', '30dec2008',
+ '21dec2008', '11jan2009', '08jan2009', '20dec2008',
+ '30dec2008', '21dec2008', '11jan2009'],
+ 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454,
+ 3139, 4133, 1766, 3301, 1454, 3139, 4133],
+ 'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105,
+ 101, 102, 103, 104, 105],
+ 'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
+ 'Male', 'Female', 'Female', 'Female', 'Female',
+ 'Male', 'Female', 'Female', 'Female', 'Female'],
+ 'visitdt': ['11jan2009', '22dec2008', '04jan2009',
+ '29dec2008', '20jan2009', '21jan2009', nan,
+ '22jan2009', '31dec2008', '03feb2009',
+ '05feb2009', nan, nan, '02jan2009',
+ '15feb2009'],
+ 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan,
+ 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0,
+ 4805.0]}
+ exp = DataFrame(exp_data, columns=result.columns)
+ tm.assert_frame_equal(result, exp)
+
+ spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)],
+ 'wt': ['wt%d' % i for i in range(1, 4)]}
+ msg = "All column lists must be same length"
+ with pytest.raises(ValueError, match=msg):
+ lreshape(df, spec)
+
+
+class TestWideToLong(object):
+
+ def test_simple(self):
+ np.random.seed(123)
+ x = np.random.randn(3)
+ df = pd.DataFrame({"A1970": {0: "a",
+ 1: "b",
+ 2: "c"},
+ "A1980": {0: "d",
+ 1: "e",
+ 2: "f"},
+ "B1970": {0: 2.5,
+ 1: 1.2,
+ 2: .7},
+ "B1980": {0: 3.2,
+ 1: 1.3,
+ 2: .1},
+ "X": dict(zip(
+ range(3), x))})
+ df["id"] = df.index
+ exp_data = {"X": x.tolist() + x.tolist(),
+ "A": ['a', 'b', 'c', 'd', 'e', 'f'],
+ "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+ "year": [1970, 1970, 1970, 1980, 1980, 1980],
+ "id": [0, 1, 2, 0, 1, 2]}
+ expected = DataFrame(exp_data)
+ expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
+ result = wide_to_long(df, ["A", "B"], i="id", j="year")
+ tm.assert_frame_equal(result, expected)
+
+ def test_stubs(self):
+ # GH9204
+ df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
+ df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2']
+ stubs = ['inc', 'edu']
+
+ # TODO: unused?
+ df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa
+
+ assert stubs == ['inc', 'edu']
+
+ def test_separating_character(self):
+ # GH14779
+ np.random.seed(123)
+ x = np.random.randn(3)
+ df = pd.DataFrame({"A.1970": {0: "a",
+ 1: "b",
+ 2: "c"},
+ "A.1980": {0: "d",
+ 1: "e",
+ 2: "f"},
+ "B.1970": {0: 2.5,
+ 1: 1.2,
+ 2: .7},
+ "B.1980": {0: 3.2,
+ 1: 1.3,
+ 2: .1},
+ "X": dict(zip(
+ range(3), x))})
+ df["id"] = df.index
+ exp_data = {"X": x.tolist() + x.tolist(),
+ "A": ['a', 'b', 'c', 'd', 'e', 'f'],
+ "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+ "year": [1970, 1970, 1970, 1980, 1980, 1980],
+ "id": [0, 1, 2, 0, 1, 2]}
+ expected = DataFrame(exp_data)
+ expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
+ result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
+ tm.assert_frame_equal(result, expected)
+
+ def test_escapable_characters(self):
+ np.random.seed(123)
+ x = np.random.randn(3)
+ df = pd.DataFrame({"A(quarterly)1970": {0: "a",
+ 1: "b",
+ 2: "c"},
+ "A(quarterly)1980": {0: "d",
+ 1: "e",
+ 2: "f"},
+ "B(quarterly)1970": {0: 2.5,
+ 1: 1.2,
+ 2: .7},
+ "B(quarterly)1980": {0: 3.2,
+ 1: 1.3,
+ 2: .1},
+ "X": dict(zip(
+ range(3), x))})
+ df["id"] = df.index
+ exp_data = {"X": x.tolist() + x.tolist(),
+ "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
+ "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+ "year": [1970, 1970, 1970, 1980, 1980, 1980],
+ "id": [0, 1, 2, 0, 1, 2]}
+ expected = DataFrame(exp_data)
+ expected = expected.set_index(
+ ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
+ result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
+ i="id", j="year")
+ tm.assert_frame_equal(result, expected)
+
+ def test_unbalanced(self):
+ # test that we can have a varying number of time variables
+ df = pd.DataFrame({'A2010': [1.0, 2.0],
+ 'A2011': [3.0, 4.0],
+ 'B2010': [5.0, 6.0],
+ 'X': ['X1', 'X2']})
+ df['id'] = df.index
+ exp_data = {'X': ['X1', 'X1', 'X2', 'X2'],
+ 'A': [1.0, 3.0, 2.0, 4.0],
+ 'B': [5.0, np.nan, 6.0, np.nan],
+ 'id': [0, 0, 1, 1],
+ 'year': [2010, 2011, 2010, 2011]}
+ expected = pd.DataFrame(exp_data)
+ expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
+ result = wide_to_long(df, ['A', 'B'], i='id', j='year')
+ tm.assert_frame_equal(result, expected)
+
+ def test_character_overlap(self):
+ # Test we handle overlapping characters in both id_vars and value_vars
+ df = pd.DataFrame({
+ 'A11': ['a11', 'a22', 'a33'],
+ 'A12': ['a21', 'a22', 'a23'],
+ 'B11': ['b11', 'b12', 'b13'],
+ 'B12': ['b21', 'b22', 'b23'],
+ 'BB11': [1, 2, 3],
+ 'BB12': [4, 5, 6],
+ 'BBBX': [91, 92, 93],
+ 'BBBZ': [91, 92, 93]
+ })
+ df['id'] = df.index
+ expected = pd.DataFrame({
+ 'BBBX': [91, 92, 93, 91, 92, 93],
+ 'BBBZ': [91, 92, 93, 91, 92, 93],
+ 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
+ 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
+ 'BB': [1, 2, 3, 4, 5, 6],
+ 'id': [0, 1, 2, 0, 1, 2],
+ 'year': [11, 11, 11, 12, 12, 12]})
+ expected = expected.set_index(['id', 'year'])[
+ ['BBBX', 'BBBZ', 'A', 'B', 'BB']]
+ result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
+ tm.assert_frame_equal(result.sort_index(axis=1),
+ expected.sort_index(axis=1))
+
+ def test_invalid_separator(self):
+ # if an invalid separator is supplied an empty data frame is returned
+ sep = 'nope!'
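+ # No column matches the pattern <stub><sep><suffix>, so the reshape
+ # produces zero rows while keeping both the original wide columns
+ # and the (empty) stub columns.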
+ df = pd.DataFrame({'A2010': [1.0, 2.0],
+ 'A2011': [3.0, 4.0],
+ 'B2010': [5.0, 6.0],
+ 'X': ['X1', 'X2']})
+ df['id'] = df.index
+ exp_data = {'X': '',
+ 'A2010': [],
+ 'A2011': [],
+ 'B2010': [],
+ 'id': [],
+ 'year': [],
+ 'A': [],
+ 'B': []}
+ expected = pd.DataFrame(exp_data).astype({'year': 'int'})
+ expected = expected.set_index(['id', 'year'])[[
+ 'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
+ expected.index.set_levels([0, 1], level=0, inplace=True)
+ result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
+ tm.assert_frame_equal(result.sort_index(axis=1),
+ expected.sort_index(axis=1))
+
+ def test_num_string_disambiguation(self):
+ # Test that we can disambiguate numeric value_vars from
+ # string value_vars
+ df = pd.DataFrame({
+ 'A11': ['a11', 'a22', 'a33'],
+ 'A12': ['a21', 'a22', 'a23'],
+ 'B11': ['b11', 'b12', 'b13'],
+ 'B12': ['b21', 'b22', 'b23'],
+ 'BB11': [1, 2, 3],
+ 'BB12': [4, 5, 6],
+ 'Arating': [91, 92, 93],
+ 'Arating_old': [91, 92, 93]
+ })
+ df['id'] = df.index
+ expected = pd.DataFrame({
+ 'Arating': [91, 92, 93, 91, 92, 93],
+ 'Arating_old': [91, 92, 93, 91, 92, 93],
+ 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
+ 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
+ 'BB': [1, 2, 3, 4, 5, 6],
+ 'id': [0, 1, 2, 0, 1, 2],
+ 'year': [11, 11, 11, 12, 12, 12]})
+ expected = expected.set_index(['id', 'year'])[
+ ['Arating', 'Arating_old', 'A', 'B', 'BB']]
+ result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
+ tm.assert_frame_equal(result.sort_index(axis=1),
+ expected.sort_index(axis=1))
+
+ def test_invalid_suffixtype(self):
+ # If all stub names end with a string, but a numeric suffix is
+ # assumed, an empty data frame is returned
+ df = pd.DataFrame({'Aone': [1.0, 2.0],
+ 'Atwo': [3.0, 4.0],
+ 'Bone': [5.0, 6.0],
+ 'X': ['X1', 'X2']})
+ df['id'] = df.index
+ exp_data = {'X': '',
+ 'Aone': [],
+ 'Atwo': [],
+ 'Bone': [],
+ 'id': [],
+ 'year': [],
+ 'A': [],
+ 'B': []}
+ expected = pd.DataFrame(exp_data).astype({'year': 'int'})
+
+ expected = expected.set_index(['id', 'year'])
+ expected.index.set_levels([0, 1], level=0, inplace=True)
+ result = wide_to_long(df, ['A', 'B'], i='id', j='year')
+ tm.assert_frame_equal(result.sort_index(axis=1),
+ expected.sort_index(axis=1))
+
+ def test_multiple_id_columns(self):
+ # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
+ df = pd.DataFrame({
+ 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+ 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
+ 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
+ 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
+ })
+ expected = pd.DataFrame({
+ 'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8,
+ 2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
+ 'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
+ 'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
+ 'age': [1, 2, 1, 2, 1, 2, 1, 2, 1,
+ 2, 1, 2, 1, 2, 1, 2, 1, 2]
+ })
+ expected = expected.set_index(['famid', 'birth', 'age'])[['ht']]
+ result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
+ tm.assert_frame_equal(result, expected)
+
+ def test_non_unique_idvars(self):
+ # GH16382
+ # Raise an error message if non-unique id vars (i) are passed
+ df = pd.DataFrame({
+ 'A_A1': [1, 2, 3, 4, 5],
+ 'B_B1': [1, 2, 3, 4, 5],
+ 'x': [1, 1, 1, 1, 1]
+ })
+ msg = "the id variables need to uniquely identify each row"
+ with pytest.raises(ValueError, match=msg):
+ wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')
+
+ def test_cast_j_int(self):
+ df = pd.DataFrame({
+ 'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'],
+ 'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
+ 'actor_fb_likes_1': [1000.0, 40000.0, 11000.0],
+ 'actor_fb_likes_2': [936.0, 5000.0, 393.0],
+ 'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']})
+
+ expected = pd.DataFrame({
+ 'actor': ['CCH Pounder',
+ 'Johnny Depp',
+ 'Christoph Waltz',
+ 'Joel David Moore',
+ 'Orlando Bloom',
+ 'Rory Kinnear'],
+ 'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
+ 'num': [1, 1, 1, 2, 2, 2],
+ 'title': ['Avatar',
+ 'Pirates of the Caribbean',
+ 'Spectre',
+ 'Avatar',
+ 'Pirates of the Caribbean',
+ 'Spectre']}).set_index(['title', 'num'])
+ result = wide_to_long(df, ['actor', 'actor_fb_likes'],
+ i='title', j='num', sep='_')
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_identical_stubnames(self):
+ df = pd.DataFrame({'A2010': [1.0, 2.0],
+ 'A2011': [3.0, 4.0],
+ 'B2010': [5.0, 6.0],
+ 'A': ['X1', 'X2']})
+ msg = "stubname can't be identical to a column name"
+ with pytest.raises(ValueError, match=msg):
+ wide_to_long(df, ['A', 'B'], i='A', j='colname')
+
+ def test_nonnumeric_suffix(self):
+ df = pd.DataFrame({'treatment_placebo': [1.0, 2.0],
+ 'treatment_test': [3.0, 4.0],
+ 'result_placebo': [5.0, 6.0],
+ 'A': ['X1', 'X2']})
+ expected = pd.DataFrame({
+ 'A': ['X1', 'X1', 'X2', 'X2'],
+ 'colname': ['placebo', 'test', 'placebo', 'test'],
+ 'result': [5.0, np.nan, 6.0, np.nan],
+ 'treatment': [1.0, 3.0, 2.0, 4.0]})
+ expected = expected.set_index(['A', 'colname'])
+ result = wide_to_long(df, ['result', 'treatment'],
+ i='A', j='colname', suffix='[a-z]+', sep='_')
+ tm.assert_frame_equal(result, expected)
+
+ def test_mixed_type_suffix(self):
+ df = pd.DataFrame({
+ 'A': ['X1', 'X2'],
+ 'result_1': [0, 9],
+ 'result_foo': [5.0, 6.0],
+ 'treatment_1': [1.0, 2.0],
+ 'treatment_foo': [3.0, 4.0]})
+ expected = pd.DataFrame({
+ 'A': ['X1', 'X2', 'X1', 'X2'],
+ 'colname': ['1', '1', 'foo', 'foo'],
+ 'result': [0.0, 9.0, 5.0, 6.0],
+ 'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname'])
+ result = wide_to_long(df, ['result', 'treatment'],
+ i='A', j='colname', suffix='.+', sep='_')
+ tm.assert_frame_equal(result, expected)
+
+ def test_float_suffix(self):
+ df = pd.DataFrame({
+ 'treatment_1.1': [1.0, 2.0],
+ 'treatment_2.1': [3.0, 4.0],
+ 'result_1.2': [5.0, 6.0],
+ 'result_1': [0, 9],
+ 'A': ['X1', 'X2']})
+ expected = pd.DataFrame({
+ 'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'],
+ 'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
+ 'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
+ 'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]})
+ expected = expected.set_index(['A', 'colname'])
+ result = wide_to_long(df, ['result', 'treatment'],
+ i='A', j='colname', suffix='[0-9.]+', sep='_')
+ tm.assert_frame_equal(result, expected)
+
+ def test_col_substring_of_stubname(self):
+ # GH22468
+ # Don't raise ValueError when a column name is a substring
+ # of a stubname that's been passed as a string
+ wide_data = {'node_id': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
+ 'A': {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81},
+ 'PA0': {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6},
+ 'PA1': {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67},
+ 'PA3': {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67}
+ }
+ wide_df = pd.DataFrame.from_dict(wide_data)
+ expected = pd.wide_to_long(wide_df,
+ stubnames=['PA'],
+ i=['node_id', 'A'],
+ j='time')
+ result = pd.wide_to_long(wide_df,
+ stubnames='PA',
+ i=['node_id', 'A'],
+ j='time')
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/test_pivot.py b/contrib/python/pandas/py2/pandas/tests/reshape/test_pivot.py
new file mode 100644
index 00000000000..e4fbb204af5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/test_pivot.py
@@ -0,0 +1,1798 @@
+# -*- coding: utf-8 -*-
+
+from collections import OrderedDict
+from datetime import date, datetime, timedelta
+
+import numpy as np
+import pytest
+
+from pandas.compat import product, range
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Grouper, Index, MultiIndex, Series, concat,
+ date_range)
+from pandas.api.types import CategoricalDtype as CDT
+from pandas.core.reshape.pivot import crosstab, pivot_table
+import pandas.util.testing as tm
+
+
+@pytest.fixture(params=[True, False])
+def dropna(request):
+ return request.param
+
+
+class TestPivotTable(object):
+
+ def setup_method(self, method):
+ self.data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
+ 'bar', 'bar', 'bar', 'bar',
+ 'foo', 'foo', 'foo'],
+ 'B': ['one', 'one', 'one', 'two',
+ 'one', 'one', 'one', 'two',
+ 'two', 'two', 'one'],
+ 'C': ['dull', 'dull', 'shiny', 'dull',
+ 'dull', 'shiny', 'shiny', 'dull',
+ 'shiny', 'shiny', 'shiny'],
+ 'D': np.random.randn(11),
+ 'E': np.random.randn(11),
+ 'F': np.random.randn(11)})
+
+ def test_pivot_table(self):
+ index = ['A', 'B']
+ columns = 'C'
+ table = pivot_table(self.data, values='D',
+ index=index, columns=columns)
+
+ table2 = self.data.pivot_table(
+ values='D', index=index, columns=columns)
+ tm.assert_frame_equal(table, table2)
+
+ # this works
+ pivot_table(self.data, values='D', index=index)
+
+ if len(index) > 1:
+ assert table.index.names == tuple(index)
+ else:
+ assert table.index.name == index[0]
+
+ if len(columns) > 1:
+ assert table.columns.names == columns
+ else:
+ assert table.columns.name == columns[0]
+
+ expected = self.data.groupby(
+ index + [columns])['D'].agg(np.mean).unstack()
+ tm.assert_frame_equal(table, expected)
+
+ def test_pivot_table_nocols(self):
+ df = DataFrame({'rows': ['a', 'b', 'c'],
+ 'cols': ['x', 'y', 'z'],
+ 'values': [1, 2, 3]})
+ rs = df.pivot_table(columns='cols', aggfunc=np.sum)
+ xp = df.pivot_table(index='cols', aggfunc=np.sum).T
+ tm.assert_frame_equal(rs, xp)
+
+ rs = df.pivot_table(columns='cols', aggfunc={'values': 'mean'})
+ xp = df.pivot_table(index='cols', aggfunc={'values': 'mean'}).T
+ tm.assert_frame_equal(rs, xp)
+
+ def test_pivot_table_dropna(self):
+ df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
+ 'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
+ 'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
+ 'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
+ 'quantity': {0: 2000000, 1: 500000,
+ 2: 1000000, 3: 1000000}})
+ pv_col = df.pivot_table('quantity', 'month', [
+ 'customer', 'product'], dropna=False)
+ pv_ind = df.pivot_table(
+ 'quantity', ['customer', 'product'], 'month', dropna=False)
+
+ m = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('A', 'c'),
+ ('A', 'd'), ('B', 'a'), ('B', 'b'),
+ ('B', 'c'), ('B', 'd'), ('C', 'a'),
+ ('C', 'b'), ('C', 'c'), ('C', 'd')],
+ names=['customer', 'product'])
+ tm.assert_index_equal(pv_col.columns, m)
+ tm.assert_index_equal(pv_ind.index, m)
+
+ def test_pivot_table_categorical(self):
+
+ cat1 = Categorical(["a", "a", "b", "b"],
+ categories=["a", "b", "z"], ordered=True)
+ cat2 = Categorical(["c", "d", "c", "d"],
+ categories=["c", "d", "y"], ordered=True)
+ df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+ result = pd.pivot_table(df, values='values', index=['A', 'B'],
+ dropna=True)
+
+ exp_index = pd.MultiIndex.from_arrays(
+ [cat1, cat2],
+ names=['A', 'B'])
+ expected = DataFrame(
+ {'values': [1, 2, 3, 4]},
+ index=exp_index)
+ tm.assert_frame_equal(result, expected)
+
+ def test_pivot_table_dropna_categoricals(self, dropna):
+ # GH 15193
+ categories = ['a', 'b', 'c', 'd']
+
+ df = DataFrame({'A': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'],
+ 'B': [1, 2, 3, 1, 2, 3, 1, 2, 3],
+ 'C': range(0, 9)})
+
+ df['A'] = df['A'].astype(CDT(categories, ordered=False))
+ result = df.pivot_table(index='B', columns='A', values='C',
+ dropna=dropna)
+ expected_columns = Series(['a', 'b', 'c'], name='A')
+ expected_columns = expected_columns.astype(
+ CDT(categories, ordered=False))
+ expected_index = Series([1, 2, 3], name='B')
+ expected = DataFrame([[0, 3, 6],
+ [1, 4, 7],
+ [2, 5, 8]],
+ index=expected_index,
+ columns=expected_columns,)
+ if not dropna:
+ # add back the non observed to compare
+ expected = expected.reindex(
+ columns=Categorical(categories)).astype('float')
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_pivot_with_non_observable_dropna(self, dropna):
+ # gh-21133
+ df = pd.DataFrame(
+ {'A': pd.Categorical([np.nan, 'low', 'high', 'low', 'high'],
+ categories=['low', 'high'],
+ ordered=True),
+ 'B': range(5)})
+
+ result = df.pivot_table(index='A', values='B', dropna=dropna)
+ expected = pd.DataFrame(
+ {'B': [2, 3]},
+ index=pd.Index(
+ pd.Categorical.from_codes([0, 1],
+ categories=['low', 'high'],
+ ordered=True),
+ name='A'))
+
+ tm.assert_frame_equal(result, expected)
+
+ # gh-21378
+ df = pd.DataFrame(
+ {'A': pd.Categorical(['left', 'low', 'high', 'low', 'high'],
+ categories=['low', 'high', 'left'],
+ ordered=True),
+ 'B': range(5)})
+
+ result = df.pivot_table(index='A', values='B', dropna=dropna)
+ expected = pd.DataFrame(
+ {'B': [2, 3, 0]},
+ index=pd.Index(
+ pd.Categorical.from_codes([0, 1, 2],
+ categories=['low', 'high', 'left'],
+ ordered=True),
+ name='A'))
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_pass_array(self):
+ result = self.data.pivot_table(
+ 'D', index=self.data.A, columns=self.data.C)
+ expected = self.data.pivot_table('D', index='A', columns='C')
+ tm.assert_frame_equal(result, expected)
+
+ def test_pass_function(self):
+ result = self.data.pivot_table('D', index=lambda x: x // 5,
+ columns=self.data.C)
+ expected = self.data.pivot_table('D', index=self.data.index // 5,
+ columns='C')
+ tm.assert_frame_equal(result, expected)
+
+ def test_pivot_table_multiple(self):
+ index = ['A', 'B']
+ columns = 'C'
+ table = pivot_table(self.data, index=index, columns=columns)
+ expected = self.data.groupby(index + [columns]).agg(np.mean).unstack()
+ tm.assert_frame_equal(table, expected)
+
+ def test_pivot_dtypes(self):
+
+ # can convert dtypes
+ f = DataFrame({'a': ['cat', 'bat', 'cat', 'bat'], 'v': [
+ 1, 2, 3, 4], 'i': ['a', 'b', 'a', 'b']})
+ assert f.dtypes['v'] == 'int64'
+
+ z = pivot_table(f, values='v', index=['a'], columns=[
+ 'i'], fill_value=0, aggfunc=np.sum)
+ result = z.get_dtype_counts()
+ expected = Series(dict(int64=2))
+ tm.assert_series_equal(result, expected)
+
+ # cannot convert dtypes
+ f = DataFrame({'a': ['cat', 'bat', 'cat', 'bat'], 'v': [
+ 1.5, 2.5, 3.5, 4.5], 'i': ['a', 'b', 'a', 'b']})
+ assert f.dtypes['v'] == 'float64'
+
+ z = pivot_table(f, values='v', index=['a'], columns=[
+ 'i'], fill_value=0, aggfunc=np.mean)
+ result = z.get_dtype_counts()
+ expected = Series(dict(float64=2))
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('columns,values',
+ [('bool1', ['float1', 'float2']),
+ ('bool1', ['float1', 'float2', 'bool1']),
+ ('bool2', ['float1', 'float2', 'bool1'])])
+ def test_pivot_preserve_dtypes(self, columns, values):
+ # GH 7142 regression test
+ v = np.arange(5, dtype=np.float64)
+ df = DataFrame({'float1': v, 'float2': v + 2.0,
+ 'bool1': v <= 2, 'bool2': v <= 3})
+
+ df_res = df.reset_index().pivot_table(
+ index='index', columns=columns, values=values)
+
+ result = dict(df_res.dtypes)
+ expected = {col: np.dtype('O') if col[0].startswith('b')
+ else np.dtype('float64') for col in df_res}
+ assert result == expected
+
+ def test_pivot_no_values(self):
+ # GH 14380
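+ # pivot_table aggregates with mean by default: month 1 / day 1
+ # covers A values 1 and 4, hence the 2.5 in the expected frame.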
+ idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-01-02',
+ '2011-01-01', '2011-01-02'])
+ df = pd.DataFrame({'A': [1, 2, 3, 4, 5]},
+ index=idx)
+ res = df.pivot_table(index=df.index.month, columns=df.index.day)
+
+ exp_columns = pd.MultiIndex.from_tuples([('A', 1), ('A', 2)])
+ exp = pd.DataFrame([[2.5, 4.0], [2.0, np.nan]],
+ index=[1, 2], columns=exp_columns)
+ tm.assert_frame_equal(res, exp)
+
+ df = pd.DataFrame({'A': [1, 2, 3, 4, 5],
+ 'dt': pd.date_range('2011-01-01', freq='D',
+ periods=5)},
+ index=idx)
+ res = df.pivot_table(index=df.index.month,
+ columns=pd.Grouper(key='dt', freq='M'))
+ exp_columns = pd.MultiIndex.from_tuples([('A',
+ pd.Timestamp('2011-01-31'))])
+ exp_columns.names = [None, 'dt']
+ exp = pd.DataFrame([3.25, 2.0],
+ index=[1, 2], columns=exp_columns)
+ tm.assert_frame_equal(res, exp)
+
+ res = df.pivot_table(index=pd.Grouper(freq='A'),
+ columns=pd.Grouper(key='dt', freq='M'))
+ exp = pd.DataFrame([3],
+ index=pd.DatetimeIndex(['2011-12-31']),
+ columns=exp_columns)
+ tm.assert_frame_equal(res, exp)
+
+ def test_pivot_multi_values(self):
+ result = pivot_table(self.data, values=['D', 'E'],
+ index='A', columns=['B', 'C'], fill_value=0)
+ expected = pivot_table(self.data.drop(['F'], axis=1),
+ index='A', columns=['B', 'C'], fill_value=0)
+ tm.assert_frame_equal(result, expected)
+
+ def test_pivot_multi_functions(self):
+ f = lambda func: pivot_table(self.data, values=['D', 'E'],
+ index=['A', 'B'], columns='C',
+ aggfunc=func)
+ result = f([np.mean, np.std])
+ means = f(np.mean)
+ stds = f(np.std)
+ expected = concat([means, stds], keys=['mean', 'std'], axis=1)
+ tm.assert_frame_equal(result, expected)
+
+ # margins not supported??
+ f = lambda func: pivot_table(self.data, values=['D', 'E'],
+ index=['A', 'B'], columns='C',
+ aggfunc=func, margins=True)
+ result = f([np.mean, np.std])
+ means = f(np.mean)
+ stds = f(np.std)
+ expected = concat([means, stds], keys=['mean', 'std'], axis=1)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('method', [True, False])
+ def test_pivot_index_with_nan(self, method):
+ # GH 3588
+ nan = np.nan
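+ # A NaN key is kept as a row/column label rather than dropped, so it
+ # appears in the expected index below.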
+ df = DataFrame({'a': ['R1', 'R2', nan, 'R4'],
+ 'b': ['C1', 'C2', 'C3', 'C4'],
+ 'c': [10, 15, 17, 20]})
+ if method:
+ result = df.pivot('a', 'b', 'c')
+ else:
+ result = pd.pivot(df, 'a', 'b', 'c')
+ expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan],
+ [nan, 15, nan, nan], [nan, nan, nan, 20]],
+ index=Index([nan, 'R1', 'R2', 'R4'], name='a'),
+ columns=Index(['C1', 'C2', 'C3', 'C4'], name='b'))
+ tm.assert_frame_equal(result, expected)
+ tm.assert_frame_equal(df.pivot('b', 'a', 'c'), expected.T)
+
+ # GH9491
+ df = DataFrame({'a': pd.date_range('2014-02-01', periods=6, freq='D'),
+ 'c': 100 + np.arange(6)})
+ df['b'] = df['a'] - pd.Timestamp('2014-02-02')
+ df.loc[1, 'a'] = df.loc[3, 'a'] = nan
+ df.loc[1, 'b'] = df.loc[4, 'b'] = nan
+
+ if method:
+ pv = df.pivot('a', 'b', 'c')
+ else:
+ pv = pd.pivot(df, 'a', 'b', 'c')
+ assert pv.notna().values.sum() == len(df)
+
+ for _, row in df.iterrows():
+ assert pv.loc[row['a'], row['b']] == row['c']
+
+ if method:
+ result = df.pivot('b', 'a', 'c')
+ else:
+ result = pd.pivot(df, 'b', 'a', 'c')
+ tm.assert_frame_equal(result, pv.T)
+
+ @pytest.mark.parametrize('method', [True, False])
+ def test_pivot_with_tz(self, method):
+ # GH 5878
+ df = DataFrame({'dt1': [datetime(2013, 1, 1, 9, 0),
+ datetime(2013, 1, 2, 9, 0),
+ datetime(2013, 1, 1, 9, 0),
+ datetime(2013, 1, 2, 9, 0)],
+ 'dt2': [datetime(2014, 1, 1, 9, 0),
+ datetime(2014, 1, 1, 9, 0),
+ datetime(2014, 1, 2, 9, 0),
+ datetime(2014, 1, 2, 9, 0)],
+ 'data1': np.arange(4, dtype='int64'),
+ 'data2': np.arange(4, dtype='int64')})
+
+ df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
+ df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))
+
+ exp_col1 = Index(['data1', 'data1', 'data2', 'data2'])
+ exp_col2 = pd.DatetimeIndex(['2014/01/01 09:00',
+ '2014/01/02 09:00'] * 2,
+ name='dt2', tz='Asia/Tokyo')
+ exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2])
+ expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]],
+ index=pd.DatetimeIndex(['2013/01/01 09:00',
+ '2013/01/02 09:00'],
+ name='dt1',
+ tz='US/Pacific'),
+ columns=exp_col)
+
+ if method:
+ pv = df.pivot(index='dt1', columns='dt2')
+ else:
+ pv = pd.pivot(df, index='dt1', columns='dt2')
+ tm.assert_frame_equal(pv, expected)
+
+ expected = DataFrame([[0, 2], [1, 3]],
+ index=pd.DatetimeIndex(['2013/01/01 09:00',
+ '2013/01/02 09:00'],
+ name='dt1',
+ tz='US/Pacific'),
+ columns=pd.DatetimeIndex(['2014/01/01 09:00',
+ '2014/01/02 09:00'],
+ name='dt2',
+ tz='Asia/Tokyo'))
+
+ if method:
+ pv = df.pivot(index='dt1', columns='dt2', values='data1')
+ else:
+ pv = pd.pivot(df, index='dt1', columns='dt2', values='data1')
+ tm.assert_frame_equal(pv, expected)
+
+ @pytest.mark.parametrize('method', [True, False])
+ def test_pivot_periods(self, method):
+ df = DataFrame({'p1': [pd.Period('2013-01-01', 'D'),
+ pd.Period('2013-01-02', 'D'),
+ pd.Period('2013-01-01', 'D'),
+ pd.Period('2013-01-02', 'D')],
+ 'p2': [pd.Period('2013-01', 'M'),
+ pd.Period('2013-01', 'M'),
+ pd.Period('2013-02', 'M'),
+ pd.Period('2013-02', 'M')],
+ 'data1': np.arange(4, dtype='int64'),
+ 'data2': np.arange(4, dtype='int64')})
+
+ exp_col1 = Index(['data1', 'data1', 'data2', 'data2'])
+ exp_col2 = pd.PeriodIndex(['2013-01', '2013-02'] * 2,
+ name='p2', freq='M')
+ exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2])
+ expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]],
+ index=pd.PeriodIndex(['2013-01-01', '2013-01-02'],
+ name='p1', freq='D'),
+ columns=exp_col)
+ if method:
+ pv = df.pivot(index='p1', columns='p2')
+ else:
+ pv = pd.pivot(df, index='p1', columns='p2')
+ tm.assert_frame_equal(pv, expected)
+
+ expected = DataFrame([[0, 2], [1, 3]],
+ index=pd.PeriodIndex(['2013-01-01', '2013-01-02'],
+ name='p1', freq='D'),
+ columns=pd.PeriodIndex(['2013-01', '2013-02'],
+ name='p2', freq='M'))
+ if method:
+ pv = df.pivot(index='p1', columns='p2', values='data1')
+ else:
+ pv = pd.pivot(df, index='p1', columns='p2', values='data1')
+ tm.assert_frame_equal(pv, expected)
+
+ @pytest.mark.parametrize('values', [
+ ['baz', 'zoo'], np.array(['baz', 'zoo']),
+ pd.Series(['baz', 'zoo']), pd.Index(['baz', 'zoo'])
+ ])
+ @pytest.mark.parametrize('method', [True, False])
+ def test_pivot_with_list_like_values(self, values, method):
+ # issue #17160
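+        # list-like `values` yield MultiIndex columns with the value names
+        # as the outer level; mixing int ('baz') and str ('zoo') columns
+        # forces the object dtype asserted below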
+ df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
+ 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+ 'baz': [1, 2, 3, 4, 5, 6],
+ 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
+
+ if method:
+ result = df.pivot(index='foo', columns='bar', values=values)
+ else:
+ result = pd.pivot(df, index='foo', columns='bar', values=values)
+
+ data = [[1, 2, 3, 'x', 'y', 'z'],
+ [4, 5, 6, 'q', 'w', 't']]
+ index = Index(data=['one', 'two'], name='foo')
+ columns = MultiIndex(levels=[['baz', 'zoo'], ['A', 'B', 'C']],
+ codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]],
+ names=[None, 'bar'])
+ expected = DataFrame(data=data, index=index,
+ columns=columns, dtype='object')
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('values', [
+ ['bar', 'baz'], np.array(['bar', 'baz']),
+ pd.Series(['bar', 'baz']), pd.Index(['bar', 'baz'])
+ ])
+ @pytest.mark.parametrize('method', [True, False])
+ def test_pivot_with_list_like_values_nans(self, values, method):
+ # issue #17160
+ df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
+ 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+ 'baz': [1, 2, 3, 4, 5, 6],
+ 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
+
+ if method:
+ result = df.pivot(index='zoo', columns='foo', values=values)
+ else:
+ result = pd.pivot(df, index='zoo', columns='foo', values=values)
+
+ data = [[np.nan, 'A', np.nan, 4],
+ [np.nan, 'C', np.nan, 6],
+ [np.nan, 'B', np.nan, 5],
+ ['A', np.nan, 1, np.nan],
+ ['B', np.nan, 2, np.nan],
+ ['C', np.nan, 3, np.nan]]
+ index = Index(data=['q', 't', 'w', 'x', 'y', 'z'], name='zoo')
+ columns = MultiIndex(levels=[['bar', 'baz'], ['one', 'two']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+ names=[None, 'foo'])
+ expected = DataFrame(data=data, index=index,
+ columns=columns, dtype='object')
+ tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.xfail(reason='MultiIndexed unstack with tuple names fails '
+                              'with KeyError GH#19966')
+ @pytest.mark.parametrize('method', [True, False])
+ def test_pivot_with_multiindex(self, method):
+ # issue #17160
+ index = Index(data=[0, 1, 2, 3, 4, 5])
+ data = [['one', 'A', 1, 'x'],
+ ['one', 'B', 2, 'y'],
+ ['one', 'C', 3, 'z'],
+ ['two', 'A', 4, 'q'],
+ ['two', 'B', 5, 'w'],
+ ['two', 'C', 6, 't']]
+ columns = MultiIndex(levels=[['bar', 'baz'], ['first', 'second']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
+ df = DataFrame(data=data, index=index, columns=columns, dtype='object')
+ if method:
+ result = df.pivot(index=('bar', 'first'),
+ columns=('bar', 'second'),
+ values=('baz', 'first'))
+ else:
+ result = pd.pivot(df,
+ index=('bar', 'first'),
+ columns=('bar', 'second'),
+ values=('baz', 'first'))
+
+ data = {'A': Series([1, 4], index=['one', 'two']),
+ 'B': Series([2, 5], index=['one', 'two']),
+ 'C': Series([3, 6], index=['one', 'two'])}
+ expected = DataFrame(data)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('method', [True, False])
+ def test_pivot_with_tuple_of_values(self, method):
+ # issue #17160
+ df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
+ 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+ 'baz': [1, 2, 3, 4, 5, 6],
+ 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
+ with pytest.raises(KeyError, match=r"^\('bar', 'baz'\)$"):
+ # tuple is seen as a single column name
+ if method:
+ df.pivot(index='zoo', columns='foo', values=('bar', 'baz'))
+ else:
+ pd.pivot(df, index='zoo', columns='foo', values=('bar', 'baz'))
+
+ def test_margins(self):
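+        # _check_output verifies that the margin column holds the mean of
+        # each row group, the margin row the mean of each column group,
+        # and the grand-total cell the mean over all values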
+ def _check_output(result, values_col, index=['A', 'B'],
+ columns=['C'],
+ margins_col='All'):
+ col_margins = result.loc[result.index[:-1], margins_col]
+ expected_col_margins = self.data.groupby(index)[values_col].mean()
+ tm.assert_series_equal(col_margins, expected_col_margins,
+ check_names=False)
+ assert col_margins.name == margins_col
+
+ result = result.sort_index()
+ index_margins = result.loc[(margins_col, '')].iloc[:-1]
+
+ expected_ix_margins = self.data.groupby(columns)[values_col].mean()
+ tm.assert_series_equal(index_margins, expected_ix_margins,
+ check_names=False)
+ assert index_margins.name == (margins_col, '')
+
+ grand_total_margins = result.loc[(margins_col, ''), margins_col]
+ expected_total_margins = self.data[values_col].mean()
+ assert grand_total_margins == expected_total_margins
+
+ # column specified
+ result = self.data.pivot_table(values='D', index=['A', 'B'],
+ columns='C',
+ margins=True, aggfunc=np.mean)
+ _check_output(result, 'D')
+
+ # Set a different margins_name (not 'All')
+ result = self.data.pivot_table(values='D', index=['A', 'B'],
+ columns='C',
+ margins=True, aggfunc=np.mean,
+ margins_name='Totals')
+ _check_output(result, 'D', margins_col='Totals')
+
+ # no column specified
+ table = self.data.pivot_table(index=['A', 'B'], columns='C',
+ margins=True, aggfunc=np.mean)
+ for value_col in table.columns.levels[0]:
+ _check_output(table[value_col], value_col)
+
+        # no columns specified
+
+        # double up the column names to also exercise a rename buglet
+        self.data.columns = [k * 2 for k in self.data.columns]
+ table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
+ aggfunc=np.mean)
+ for value_col in table.columns:
+ totals = table.loc[('All', ''), value_col]
+ assert totals == self.data[value_col].mean()
+
+ # no rows
+ rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True,
+ aggfunc=np.mean)
+ assert isinstance(rtable, Series)
+
+ table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
+ aggfunc='mean')
+ for item in ['DD', 'EE', 'FF']:
+ totals = table.loc[('All', ''), item]
+ assert totals == self.data[item].mean()
+
+ def test_margins_dtype(self):
+ # GH 17013
+
+ df = self.data.copy()
+ df[['D', 'E', 'F']] = np.arange(len(df) * 3).reshape(len(df), 3)
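+        # replace the random float columns with deterministic integers so
+        # the summed margins below are exact and keep an integer dtype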
+
+ mi_val = list(product(['bar', 'foo'], ['one', 'two'])) + [('All', '')]
+ mi = MultiIndex.from_tuples(mi_val, names=('A', 'B'))
+ expected = DataFrame({'dull': [12, 21, 3, 9, 45],
+ 'shiny': [33, 0, 36, 51, 120]},
+ index=mi).rename_axis('C', axis=1)
+ expected['All'] = expected['dull'] + expected['shiny']
+
+ result = df.pivot_table(values='D', index=['A', 'B'],
+ columns='C', margins=True,
+ aggfunc=np.sum, fill_value=0)
+
+ tm.assert_frame_equal(expected, result)
+
+    @pytest.mark.xfail(reason='GH#17035 (len of floats is cast back to '
+                              'floats)')
+ def test_margins_dtype_len(self):
+ mi_val = list(product(['bar', 'foo'], ['one', 'two'])) + [('All', '')]
+ mi = MultiIndex.from_tuples(mi_val, names=('A', 'B'))
+ expected = DataFrame({'dull': [1, 1, 2, 1, 5],
+ 'shiny': [2, 0, 2, 2, 6]},
+ index=mi).rename_axis('C', axis=1)
+ expected['All'] = expected['dull'] + expected['shiny']
+
+ result = self.data.pivot_table(values='D', index=['A', 'B'],
+ columns='C', margins=True,
+ aggfunc=len, fill_value=0)
+
+ tm.assert_frame_equal(expected, result)
+
+ def test_pivot_integer_columns(self):
+ # caused by upstream bug in unstack
+
+ d = date.min
+ data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
+ [d + timedelta(i)
+ for i in range(20)], [1.0]))
+ df = DataFrame(data)
+ table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])
+
+ df2 = df.rename(columns=str)
+ table2 = df2.pivot_table(
+ values='4', index=['0', '1', '3'], columns=['2'])
+
+ tm.assert_frame_equal(table, table2, check_names=False)
+
+ def test_pivot_no_level_overlap(self):
+ # GH #1181
+
+ data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
+ 'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
+ 'c': (['foo'] * 4 + ['bar'] * 4) * 2,
+ 'value': np.random.randn(16)})
+
+ table = data.pivot_table('value', index='a', columns=['b', 'c'])
+
+ grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
+ expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
+ tm.assert_frame_equal(table, expected)
+
+ def test_pivot_columns_lexsorted(self):
+
+ n = 10000
+
+ dtype = np.dtype([
+ ("Index", object),
+ ("Symbol", object),
+ ("Year", int),
+ ("Month", int),
+ ("Day", int),
+ ("Quantity", int),
+ ("Price", float),
+ ])
+
+ products = np.array([
+ ('SP500', 'ADBE'),
+ ('SP500', 'NVDA'),
+ ('SP500', 'ORCL'),
+ ('NDQ100', 'AAPL'),
+ ('NDQ100', 'MSFT'),
+ ('NDQ100', 'GOOG'),
+ ('FTSE', 'DGE.L'),
+ ('FTSE', 'TSCO.L'),
+ ('FTSE', 'GSK.L'),
+ ], dtype=[('Index', object), ('Symbol', object)])
+ items = np.empty(n, dtype=dtype)
+ iproduct = np.random.randint(0, len(products), n)
+ items['Index'] = products['Index'][iproduct]
+ items['Symbol'] = products['Symbol'][iproduct]
+ dr = pd.date_range(date(2000, 1, 1),
+ date(2010, 12, 31))
+ dates = dr[np.random.randint(0, len(dr), n)]
+ items['Year'] = dates.year
+ items['Month'] = dates.month
+ items['Day'] = dates.day
+ items['Price'] = np.random.lognormal(4.0, 2.0, n)
+
+ df = DataFrame(items)
+
+ pivoted = df.pivot_table('Price', index=['Month', 'Day'],
+ columns=['Index', 'Symbol', 'Year'],
+ aggfunc='mean')
+
+ assert pivoted.columns.is_monotonic
+
+ def test_pivot_complex_aggfunc(self):
+ f = OrderedDict([('D', ['std']), ('E', ['sum'])])
+ expected = self.data.groupby(['A', 'B']).agg(f).unstack('B')
+ result = self.data.pivot_table(index='A', columns='B', aggfunc=f)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_margins_no_values_no_cols(self):
+ # Regression test on pivot table: no values or cols passed.
+ result = self.data[['A', 'B']].pivot_table(
+ index=['A', 'B'], aggfunc=len, margins=True)
+ result_list = result.tolist()
+ assert sum(result_list[:-1]) == result_list[-1]
+
+ def test_margins_no_values_two_rows(self):
+ # Regression test on pivot table: no values passed but rows are a
+ # multi-index
+ result = self.data[['A', 'B', 'C']].pivot_table(
+ index=['A', 'B'], columns='C', aggfunc=len, margins=True)
+ assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0]
+
+ def test_margins_no_values_one_row_one_col(self):
+ # Regression test on pivot table: no values passed but row and col
+ # defined
+ result = self.data[['A', 'B']].pivot_table(
+ index='A', columns='B', aggfunc=len, margins=True)
+ assert result.All.tolist() == [4.0, 7.0, 11.0]
+
+ def test_margins_no_values_two_row_two_cols(self):
+ # Regression test on pivot table: no values passed but rows and cols
+ # are multi-indexed
+ self.data['D'] = ['a', 'b', 'c', 'd',
+ 'e', 'f', 'g', 'h', 'i', 'j', 'k']
+ result = self.data[['A', 'B', 'C', 'D']].pivot_table(
+ index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True)
+ assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0]
+
+ @pytest.mark.parametrize(
+ 'margin_name', ['foo', 'one', 666, None, ['a', 'b']])
+ def test_pivot_table_with_margins_set_margin_name(self, margin_name):
+ # see gh-3335
+ msg = (r'Conflicting name "{}" in margins|'
+ "margins_name argument must be a string").format(margin_name)
+ with pytest.raises(ValueError, match=msg):
+ # multi-index index
+ pivot_table(self.data, values='D', index=['A', 'B'],
+ columns=['C'], margins=True,
+ margins_name=margin_name)
+ with pytest.raises(ValueError, match=msg):
+ # multi-index column
+ pivot_table(self.data, values='D', index=['C'],
+ columns=['A', 'B'], margins=True,
+ margins_name=margin_name)
+ with pytest.raises(ValueError, match=msg):
+ # non-multi-index index/column
+ pivot_table(self.data, values='D', index=['A'],
+ columns=['B'], margins=True,
+ margins_name=margin_name)
+
+ def test_pivot_timegrouper(self):
+ df = DataFrame({
+ 'Branch': 'A A A A A A A B'.split(),
+ 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
+ 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
+ 'Date': [datetime(2013, 1, 1),
+ datetime(2013, 1, 1),
+ datetime(2013, 10, 1),
+ datetime(2013, 10, 2),
+ datetime(2013, 10, 1),
+ datetime(2013, 10, 2),
+ datetime(2013, 12, 2),
+ datetime(2013, 12, 2), ]}).set_index('Date')
+
+ expected = DataFrame(np.array([10, 18, 3], dtype='int64')
+ .reshape(1, 3),
+ index=[datetime(2013, 12, 31)],
+ columns='Carl Joe Mark'.split())
+ expected.index.name = 'Date'
+ expected.columns.name = 'Buyer'
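+        # Grouper(freq='A') puts every 2013 date into a single annual
+        # bin labelled with the year end, 2013-12-31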
+
+ result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer',
+ values='Quantity', aggfunc=np.sum)
+ tm.assert_frame_equal(result, expected)
+
+ result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'),
+ values='Quantity', aggfunc=np.sum)
+ tm.assert_frame_equal(result, expected.T)
+
+ expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan])
+ .reshape(2, 3),
+ index=[datetime(2013, 1, 1),
+ datetime(2013, 7, 1)],
+ columns='Carl Joe Mark'.split())
+ expected.index.name = 'Date'
+ expected.columns.name = 'Buyer'
+
+ result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer',
+ values='Quantity', aggfunc=np.sum)
+ tm.assert_frame_equal(result, expected)
+
+ result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'),
+ values='Quantity', aggfunc=np.sum)
+ tm.assert_frame_equal(result, expected.T)
+
+ # passing the name
+ df = df.reset_index()
+ result = pivot_table(df, index=Grouper(freq='6MS', key='Date'),
+ columns='Buyer',
+ values='Quantity', aggfunc=np.sum)
+ tm.assert_frame_equal(result, expected)
+
+ result = pivot_table(df, index='Buyer',
+ columns=Grouper(freq='6MS', key='Date'),
+ values='Quantity', aggfunc=np.sum)
+ tm.assert_frame_equal(result, expected.T)
+
+ msg = "'The grouper name foo is not found'"
+ with pytest.raises(KeyError, match=msg):
+ pivot_table(df, index=Grouper(freq='6MS', key='foo'),
+ columns='Buyer', values='Quantity', aggfunc=np.sum)
+ with pytest.raises(KeyError, match=msg):
+ pivot_table(df, index='Buyer',
+ columns=Grouper(freq='6MS', key='foo'),
+ values='Quantity', aggfunc=np.sum)
+
+ # passing the level
+ df = df.set_index('Date')
+ result = pivot_table(df, index=Grouper(freq='6MS', level='Date'),
+ columns='Buyer', values='Quantity',
+ aggfunc=np.sum)
+ tm.assert_frame_equal(result, expected)
+
+ result = pivot_table(df, index='Buyer',
+ columns=Grouper(freq='6MS', level='Date'),
+ values='Quantity', aggfunc=np.sum)
+ tm.assert_frame_equal(result, expected.T)
+
+ msg = "The level foo is not valid"
+ with pytest.raises(ValueError, match=msg):
+ pivot_table(df, index=Grouper(freq='6MS', level='foo'),
+ columns='Buyer', values='Quantity', aggfunc=np.sum)
+ with pytest.raises(ValueError, match=msg):
+ pivot_table(df, index='Buyer',
+ columns=Grouper(freq='6MS', level='foo'),
+ values='Quantity', aggfunc=np.sum)
+
+ # double grouper
+ df = DataFrame({
+ 'Branch': 'A A A A A A A B'.split(),
+ 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
+ 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
+ 'Date': [datetime(2013, 11, 1, 13, 0), datetime(2013, 9, 1, 13, 5),
+ datetime(2013, 10, 1, 20, 0),
+ datetime(2013, 10, 2, 10, 0),
+ datetime(2013, 11, 1, 20, 0),
+ datetime(2013, 10, 2, 10, 0),
+ datetime(2013, 10, 2, 12, 0),
+ datetime(2013, 12, 5, 14, 0)],
+ 'PayDay': [datetime(2013, 10, 4, 0, 0),
+ datetime(2013, 10, 15, 13, 5),
+ datetime(2013, 9, 5, 20, 0),
+ datetime(2013, 11, 2, 10, 0),
+ datetime(2013, 10, 7, 20, 0),
+ datetime(2013, 9, 5, 10, 0),
+ datetime(2013, 12, 30, 12, 0),
+ datetime(2013, 11, 20, 14, 0), ]})
+
+ result = pivot_table(df, index=Grouper(freq='M', key='Date'),
+ columns=Grouper(freq='M', key='PayDay'),
+ values='Quantity', aggfunc=np.sum)
+ expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan,
+ 6, np.nan, 1, 9,
+ np.nan, 9, np.nan, np.nan, np.nan,
+ np.nan, 3, np.nan]).reshape(4, 4),
+ index=[datetime(2013, 9, 30),
+ datetime(2013, 10, 31),
+ datetime(2013, 11, 30),
+ datetime(2013, 12, 31)],
+ columns=[datetime(2013, 9, 30),
+ datetime(2013, 10, 31),
+ datetime(2013, 11, 30),
+ datetime(2013, 12, 31)])
+ expected.index.name = 'Date'
+ expected.columns.name = 'PayDay'
+
+ tm.assert_frame_equal(result, expected)
+
+ result = pivot_table(df, index=Grouper(freq='M', key='PayDay'),
+ columns=Grouper(freq='M', key='Date'),
+ values='Quantity', aggfunc=np.sum)
+ tm.assert_frame_equal(result, expected.T)
+
+ tuples = [(datetime(2013, 9, 30), datetime(2013, 10, 31)),
+ (datetime(2013, 10, 31),
+ datetime(2013, 9, 30)),
+ (datetime(2013, 10, 31),
+ datetime(2013, 11, 30)),
+ (datetime(2013, 10, 31),
+ datetime(2013, 12, 31)),
+ (datetime(2013, 11, 30),
+ datetime(2013, 10, 31)),
+ (datetime(2013, 12, 31), datetime(2013, 11, 30)), ]
+ idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay'])
+ expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan,
+ 9, np.nan, 9, np.nan,
+ np.nan, 3]).reshape(6, 2),
+ index=idx, columns=['A', 'B'])
+ expected.columns.name = 'Branch'
+
+ result = pivot_table(
+ df, index=[Grouper(freq='M', key='Date'),
+ Grouper(freq='M', key='PayDay')], columns=['Branch'],
+ values='Quantity', aggfunc=np.sum)
+ tm.assert_frame_equal(result, expected)
+
+ result = pivot_table(df, index=['Branch'],
+ columns=[Grouper(freq='M', key='Date'),
+ Grouper(freq='M', key='PayDay')],
+ values='Quantity', aggfunc=np.sum)
+ tm.assert_frame_equal(result, expected.T)
+
+ def test_pivot_datetime_tz(self):
+ dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00',
+ '2011-07-19 07:00:00', '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00']
+ dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00',
+ '2013-01-01 15:00:00',
+ '2013-02-01 15:00:00', '2013-02-01 15:00:00',
+ '2013-02-01 15:00:00']
+ df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
+ 'dt1': dates1, 'dt2': dates2,
+ 'value1': np.arange(6, dtype='int64'),
+ 'value2': [1, 2] * 3})
+ df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
+ df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))
+
+ exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00',
+ '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00'],
+ tz='US/Pacific', name='dt1')
+ exp_col1 = Index(['value1', 'value1'])
+ exp_col2 = Index(['a', 'b'], name='label')
+ exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
+ expected = DataFrame([[0, 3], [1, 4], [2, 5]],
+ index=exp_idx, columns=exp_col)
+ result = pivot_table(df, index=['dt1'], columns=[
+ 'label'], values=['value1'])
+ tm.assert_frame_equal(result, expected)
+
+ exp_col1 = Index(['sum', 'sum', 'sum', 'sum',
+ 'mean', 'mean', 'mean', 'mean'])
+ exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2)
+ exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00',
+ '2013-02-01 15:00:00'] * 4,
+ tz='Asia/Tokyo', name='dt2')
+ exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
+ expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2],
+ [1, 4, 2, 1, 1, 4, 2, 1],
+ [2, 5, 1, 2, 2, 5, 1, 2]],
+ dtype='int64'),
+ index=exp_idx,
+ columns=exp_col)
+
+ result = pivot_table(df, index=['dt1'], columns=['dt2'],
+ values=['value1', 'value2'],
+ aggfunc=[np.sum, np.mean])
+ tm.assert_frame_equal(result, expected)
+
+ def test_pivot_dtaccessor(self):
+ # GH 8103
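+        # .dt accessor results are plain Series, so they can be passed
+        # directly as the index/columns groupers of pivot_table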
+ dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00',
+ '2011-07-19 07:00:00', '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00']
+ dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00',
+ '2013-01-01 15:00:00',
+ '2013-02-01 15:00:00', '2013-02-01 15:00:00',
+ '2013-02-01 15:00:00']
+ df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
+ 'dt1': dates1, 'dt2': dates2,
+ 'value1': np.arange(6, dtype='int64'),
+ 'value2': [1, 2] * 3})
+ df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d))
+ df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d))
+
+ result = pivot_table(df, index='label', columns=df['dt1'].dt.hour,
+ values='value1')
+
+ exp_idx = Index(['a', 'b'], name='label')
+ expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]},
+ index=exp_idx,
+ columns=Index([7, 8, 9], name='dt1'))
+ tm.assert_frame_equal(result, expected)
+
+ result = pivot_table(df, index=df['dt2'].dt.month,
+ columns=df['dt1'].dt.hour,
+ values='value1')
+
+ expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]},
+ index=Index([1, 2], name='dt2'),
+ columns=Index([7, 8, 9], name='dt1'))
+ tm.assert_frame_equal(result, expected)
+
+ result = pivot_table(df, index=df['dt2'].dt.year.values,
+ columns=[df['dt1'].dt.hour, df['dt2'].dt.month],
+ values='value1')
+
+ exp_col = MultiIndex.from_arrays(
+ [[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=['dt1', 'dt2'])
+ expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]], dtype='int64'),
+ index=[2013], columns=exp_col)
+ tm.assert_frame_equal(result, expected)
+
+ result = pivot_table(df, index=np.array(['X', 'X', 'X',
+ 'X', 'Y', 'Y']),
+ columns=[df['dt1'].dt.hour, df['dt2'].dt.month],
+ values='value1')
+ expected = DataFrame(np.array([[0, 3, 1, np.nan, 2, np.nan],
+ [np.nan, np.nan, np.nan,
+ 4, np.nan, 5]]),
+ index=['X', 'Y'], columns=exp_col)
+ tm.assert_frame_equal(result, expected)
+
+ def test_daily(self):
+ rng = date_range('1/1/2000', '12/31/2004', freq='D')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ annual = pivot_table(DataFrame(ts), index=ts.index.year,
+ columns=ts.index.dayofyear)
+ annual.columns = annual.columns.droplevel(0)
+
+ doy = np.asarray(ts.index.dayofyear)
+
+ for i in range(1, 367):
+ subset = ts[doy == i]
+ subset.index = subset.index.year
+
+ result = annual[i].dropna()
+ tm.assert_series_equal(result, subset, check_names=False)
+ assert result.name == i
+
+ def test_monthly(self):
+ rng = date_range('1/1/2000', '12/31/2004', freq='M')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ annual = pivot_table(pd.DataFrame(ts), index=ts.index.year,
+ columns=ts.index.month)
+ annual.columns = annual.columns.droplevel(0)
+
+ month = ts.index.month
+ for i in range(1, 13):
+ subset = ts[month == i]
+ subset.index = subset.index.year
+ result = annual[i].dropna()
+ tm.assert_series_equal(result, subset, check_names=False)
+ assert result.name == i
+
+ def test_pivot_table_with_iterator_values(self):
+ # GH 12017
+ aggs = {'D': 'sum', 'E': 'mean'}
+
+ pivot_values_list = pd.pivot_table(
+ self.data, index=['A'], values=list(aggs.keys()), aggfunc=aggs,
+ )
+
+ pivot_values_keys = pd.pivot_table(
+ self.data, index=['A'], values=aggs.keys(), aggfunc=aggs,
+ )
+ tm.assert_frame_equal(pivot_values_keys, pivot_values_list)
+
+ agg_values_gen = (value for value in aggs.keys())
+ pivot_values_gen = pd.pivot_table(
+ self.data, index=['A'], values=agg_values_gen, aggfunc=aggs,
+ )
+ tm.assert_frame_equal(pivot_values_gen, pivot_values_list)
+
+ def test_pivot_table_margins_name_with_aggfunc_list(self):
+ # GH 13354
+ margins_name = 'Weekly'
+ costs = pd.DataFrame(
+ {'item': ['bacon', 'cheese', 'bacon', 'cheese'],
+ 'cost': [2.5, 4.5, 3.2, 3.3],
+ 'day': ['M', 'M', 'T', 'T']}
+ )
+ table = costs.pivot_table(
+ index="item", columns="day", margins=True,
+ margins_name=margins_name, aggfunc=[np.mean, max]
+ )
+ ix = pd.Index(
+ ['bacon', 'cheese', margins_name], dtype='object', name='item'
+ )
+ tups = [('mean', 'cost', 'M'), ('mean', 'cost', 'T'),
+ ('mean', 'cost', margins_name), ('max', 'cost', 'M'),
+ ('max', 'cost', 'T'), ('max', 'cost', margins_name)]
+ cols = pd.MultiIndex.from_tuples(tups, names=[None, None, 'day'])
+ expected = pd.DataFrame(table.values, index=ix, columns=cols)
+ tm.assert_frame_equal(table, expected)
+
+    @pytest.mark.xfail(reason='GH#17035 (np.mean of ints is cast back to '
+                              'ints)')
+ def test_categorical_margins(self, observed):
+ # GH 10989
+ df = pd.DataFrame({'x': np.arange(8),
+ 'y': np.arange(8) // 4,
+ 'z': np.arange(8) % 2})
+
+ expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]])
+ expected.index = Index([0, 1, 'All'], name='y')
+ expected.columns = Index([0, 1, 'All'], name='z')
+
+ table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True)
+ tm.assert_frame_equal(table, expected)
+
+ @pytest.mark.xfail(reason='GH#17035 (np.mean of ints is casted back to '
+ 'ints)')
+ def test_categorical_margins_category(self, observed):
+ df = pd.DataFrame({'x': np.arange(8),
+ 'y': np.arange(8) // 4,
+ 'z': np.arange(8) % 2})
+
+ expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]])
+ expected.index = Index([0, 1, 'All'], name='y')
+ expected.columns = Index([0, 1, 'All'], name='z')
+
+ df.y = df.y.astype('category')
+ df.z = df.z.astype('category')
+ table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True)
+ tm.assert_frame_equal(table, expected)
+
+ def test_categorical_aggfunc(self, observed):
+ # GH 9534
+ df = pd.DataFrame({"C1": ["A", "B", "C", "C"],
+ "C2": ["a", "a", "b", "b"],
+ "V": [1, 2, 3, 4]})
+ df["C1"] = df["C1"].astype("category")
+ result = df.pivot_table("V", index="C1", columns="C2",
+ dropna=observed, aggfunc="count")
+
+ expected_index = pd.CategoricalIndex(['A', 'B', 'C'],
+ categories=['A', 'B', 'C'],
+ ordered=False,
+ name='C1')
+ expected_columns = pd.Index(['a', 'b'], name='C2')
+ expected_data = np.array([[1., np.nan],
+ [1., np.nan],
+ [np.nan, 2.]])
+ expected = pd.DataFrame(expected_data,
+ index=expected_index,
+ columns=expected_columns)
+ tm.assert_frame_equal(result, expected)
+
+ def test_categorical_pivot_index_ordering(self, observed):
+ # GH 8731
+ df = pd.DataFrame({'Sales': [100, 120, 220],
+ 'Month': ['January', 'January', 'January'],
+ 'Year': [2013, 2014, 2013]})
+ months = ['January', 'February', 'March', 'April', 'May', 'June',
+ 'July', 'August', 'September', 'October', 'November',
+ 'December']
+ df['Month'] = df['Month'].astype('category').cat.set_categories(months)
+ result = df.pivot_table(values='Sales',
+ index='Month',
+ columns='Year',
+ dropna=observed,
+ aggfunc='sum')
+ expected_columns = pd.Int64Index([2013, 2014], name='Year')
+ expected_index = pd.CategoricalIndex(['January'],
+ categories=months,
+ ordered=False,
+ name='Month')
+ expected = pd.DataFrame([[320, 120]],
+ index=expected_index,
+ columns=expected_columns)
+ if not observed:
+ result = result.dropna().astype(np.int64)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_pivot_table_not_series(self):
+ # GH 4386
+ # pivot_table always returns a DataFrame
+ # when values is not list like and columns is None
+ # and aggfunc is not instance of list
+ df = DataFrame({'col1': [3, 4, 5],
+ 'col2': ['C', 'D', 'E'],
+ 'col3': [1, 3, 9]})
+
+ result = df.pivot_table('col1', index=['col3', 'col2'], aggfunc=np.sum)
+ m = MultiIndex.from_arrays([[1, 3, 9],
+ ['C', 'D', 'E']],
+ names=['col3', 'col2'])
+ expected = DataFrame([3, 4, 5],
+ index=m, columns=['col1'])
+
+ tm.assert_frame_equal(result, expected)
+
+ result = df.pivot_table(
+ 'col1', index='col3', columns='col2', aggfunc=np.sum
+ )
+        expected = DataFrame([[3, np.nan, np.nan],
+                              [np.nan, 4, np.nan],
+                              [np.nan, np.nan, 5]],
+ index=Index([1, 3, 9], name='col3'),
+ columns=Index(['C', 'D', 'E'], name='col2'))
+
+ tm.assert_frame_equal(result, expected)
+
+ result = df.pivot_table('col1', index='col3', aggfunc=[np.sum])
+ m = MultiIndex.from_arrays([['sum'],
+ ['col1']])
+ expected = DataFrame([3, 4, 5],
+ index=Index([1, 3, 9], name='col3'),
+ columns=m)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_pivot_margins_name_unicode(self):
+ # issue #13292
+ greek = u'\u0394\u03bf\u03ba\u03b9\u03bc\u03ae'
+ frame = pd.DataFrame({'foo': [1, 2, 3]})
+ table = pd.pivot_table(frame, index=['foo'], aggfunc=len, margins=True,
+ margins_name=greek)
+ index = pd.Index([1, 2, 3, greek], dtype='object', name='foo')
+ expected = pd.DataFrame(index=index)
+ tm.assert_frame_equal(table, expected)
+
+ def test_pivot_string_as_func(self):
+ # GH #18713
+ # for correctness purposes
+ data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar',
+ 'bar', 'bar', 'foo', 'foo', 'foo'],
+ 'B': ['one', 'one', 'one', 'two', 'one', 'one',
+ 'one', 'two', 'two', 'two', 'one'],
+ 'C': range(11)})
+
+ result = pivot_table(data, index='A', columns='B', aggfunc='sum')
+ mi = MultiIndex(levels=[['C'], ['one', 'two']],
+ codes=[[0, 0], [0, 1]], names=[None, 'B'])
+ expected = DataFrame({('C', 'one'): {'bar': 15, 'foo': 13},
+ ('C', 'two'): {'bar': 7, 'foo': 20}},
+ columns=mi).rename_axis('A')
+ tm.assert_frame_equal(result, expected)
+
+ result = pivot_table(data, index='A', columns='B',
+ aggfunc=['sum', 'mean'])
+ mi = MultiIndex(levels=[['sum', 'mean'], ['C'], ['one', 'two']],
+ codes=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]],
+ names=[None, None, 'B'])
+ expected = DataFrame({('mean', 'C', 'one'): {'bar': 5.0, 'foo': 3.25},
+ ('mean', 'C', 'two'): {'bar': 7.0,
+ 'foo': 6.666666666666667},
+ ('sum', 'C', 'one'): {'bar': 15, 'foo': 13},
+ ('sum', 'C', 'two'): {'bar': 7, 'foo': 20}},
+ columns=mi).rename_axis('A')
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('f, f_numpy',
+ [('sum', np.sum),
+ ('mean', np.mean),
+ ('std', np.std),
+ (['sum', 'mean'], [np.sum, np.mean]),
+ (['sum', 'std'], [np.sum, np.std]),
+ (['std', 'mean'], [np.std, np.mean])])
+ def test_pivot_string_func_vs_func(self, f, f_numpy):
+ # GH #18713
+ # for consistency purposes
+ result = pivot_table(self.data, index='A', columns='B', aggfunc=f)
+ expected = pivot_table(self.data, index='A', columns='B',
+ aggfunc=f_numpy)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.slow
+ def test_pivot_number_of_levels_larger_than_int32(self):
+ # GH 20601
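+        # 2 ** 16 labels on each axis give 2 ** 32 cells when unstacked,
+        # which overflows a signed 32-bit integer index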
+ df = DataFrame({'ind1': np.arange(2 ** 16),
+ 'ind2': np.arange(2 ** 16),
+ 'count': 0})
+
+ msg = "Unstacked DataFrame is too big, causing int32 overflow"
+ with pytest.raises(ValueError, match=msg):
+ df.pivot_table(index='ind1', columns='ind2',
+ values='count', aggfunc='count')
+
+
+class TestCrosstab(object):
+
+ def setup_method(self, method):
+ df = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
+ 'bar', 'bar', 'bar', 'bar',
+ 'foo', 'foo', 'foo'],
+ 'B': ['one', 'one', 'one', 'two',
+ 'one', 'one', 'one', 'two',
+ 'two', 'two', 'one'],
+ 'C': ['dull', 'dull', 'shiny', 'dull',
+ 'dull', 'shiny', 'shiny', 'dull',
+ 'shiny', 'shiny', 'shiny'],
+ 'D': np.random.randn(11),
+ 'E': np.random.randn(11),
+ 'F': np.random.randn(11)})
+
+ self.df = df.append(df, ignore_index=True)
+
+ def test_crosstab_single(self):
+ df = self.df
+ result = crosstab(df['A'], df['C'])
+ expected = df.groupby(['A', 'C']).size().unstack()
+ tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64))
+
+ def test_crosstab_multiple(self):
+ df = self.df
+
+ result = crosstab(df['A'], [df['B'], df['C']])
+ expected = df.groupby(['A', 'B', 'C']).size()
+ expected = expected.unstack(
+ 'B').unstack('C').fillna(0).astype(np.int64)
+ tm.assert_frame_equal(result, expected)
+
+ result = crosstab([df['B'], df['C']], df['A'])
+ expected = df.groupby(['B', 'C', 'A']).size()
+ expected = expected.unstack('A').fillna(0).astype(np.int64)
+ tm.assert_frame_equal(result, expected)
+
+ def test_crosstab_ndarray(self):
+ a = np.random.randint(0, 5, size=100)
+ b = np.random.randint(0, 3, size=100)
+ c = np.random.randint(0, 10, size=100)
+
+ df = DataFrame({'a': a, 'b': b, 'c': c})
+
+ result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'))
+ expected = crosstab(df['a'], [df['b'], df['c']])
+ tm.assert_frame_equal(result, expected)
+
+ result = crosstab([b, c], a, colnames=['a'], rownames=('b', 'c'))
+ expected = crosstab([df['b'], df['c']], df['a'])
+ tm.assert_frame_equal(result, expected)
+
+ # assign arbitrary names
+ result = crosstab(self.df['A'].values, self.df['C'].values)
+ assert result.index.name == 'row_0'
+ assert result.columns.name == 'col_0'
+
+ def test_crosstab_non_aligned(self):
+ # GH 17005
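+        # crosstab aligns Series inputs on their index first, so the
+        # extra labels in `b` ('d' and 'f') are dropped before counting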
+ a = pd.Series([0, 1, 1], index=['a', 'b', 'c'])
+ b = pd.Series([3, 4, 3, 4, 3], index=['a', 'b', 'c', 'd', 'f'])
+ c = np.array([3, 4, 3])
+
+ expected = pd.DataFrame([[1, 0], [1, 1]],
+ index=Index([0, 1], name='row_0'),
+ columns=Index([3, 4], name='col_0'))
+
+ result = crosstab(a, b)
+ tm.assert_frame_equal(result, expected)
+
+ result = crosstab(a, c)
+ tm.assert_frame_equal(result, expected)
+
+ def test_crosstab_margins(self):
+ a = np.random.randint(0, 7, size=100)
+ b = np.random.randint(0, 3, size=100)
+ c = np.random.randint(0, 5, size=100)
+
+ df = DataFrame({'a': a, 'b': b, 'c': c})
+
+ result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
+ margins=True)
+
+ assert result.index.names == ('a',)
+ assert result.columns.names == ['b', 'c']
+
+ all_cols = result['All', '']
+ exp_cols = df.groupby(['a']).size().astype('i8')
+ # to keep index.name
+ exp_margin = Series([len(df)], index=Index(['All'], name='a'))
+ exp_cols = exp_cols.append(exp_margin)
+ exp_cols.name = ('All', '')
+
+ tm.assert_series_equal(all_cols, exp_cols)
+
+ all_rows = result.loc['All']
+ exp_rows = df.groupby(['b', 'c']).size().astype('i8')
+ exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')]))
+ exp_rows.name = 'All'
+
+ exp_rows = exp_rows.reindex(all_rows.index)
+ exp_rows = exp_rows.fillna(0).astype(np.int64)
+ tm.assert_series_equal(all_rows, exp_rows)
+
+ def test_crosstab_margins_set_margin_name(self):
+ # GH 15972
+ a = np.random.randint(0, 7, size=100)
+ b = np.random.randint(0, 3, size=100)
+ c = np.random.randint(0, 5, size=100)
+
+ df = DataFrame({'a': a, 'b': b, 'c': c})
+
+ result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
+ margins=True, margins_name='TOTAL')
+
+ assert result.index.names == ('a',)
+ assert result.columns.names == ['b', 'c']
+
+ all_cols = result['TOTAL', '']
+ exp_cols = df.groupby(['a']).size().astype('i8')
+ # to keep index.name
+ exp_margin = Series([len(df)], index=Index(['TOTAL'], name='a'))
+ exp_cols = exp_cols.append(exp_margin)
+ exp_cols.name = ('TOTAL', '')
+
+ tm.assert_series_equal(all_cols, exp_cols)
+
+ all_rows = result.loc['TOTAL']
+ exp_rows = df.groupby(['b', 'c']).size().astype('i8')
+ exp_rows = exp_rows.append(Series([len(df)], index=[('TOTAL', '')]))
+ exp_rows.name = 'TOTAL'
+
+ exp_rows = exp_rows.reindex(all_rows.index)
+ exp_rows = exp_rows.fillna(0).astype(np.int64)
+ tm.assert_series_equal(all_rows, exp_rows)
+
+ msg = "margins_name argument must be a string"
+ for margins_name in [666, None, ['a', 'b']]:
+ with pytest.raises(ValueError, match=msg):
+ crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
+ margins=True, margins_name=margins_name)
+
+ def test_crosstab_pass_values(self):
+ a = np.random.randint(0, 7, size=100)
+ b = np.random.randint(0, 3, size=100)
+ c = np.random.randint(0, 5, size=100)
+ values = np.random.randn(100)
+
+ table = crosstab([a, b], c, values, aggfunc=np.sum,
+ rownames=['foo', 'bar'], colnames=['baz'])
+
+ df = DataFrame({'foo': a, 'bar': b, 'baz': c, 'values': values})
+
+ expected = df.pivot_table('values', index=['foo', 'bar'],
+ columns='baz', aggfunc=np.sum)
+ tm.assert_frame_equal(table, expected)
+
+ def test_crosstab_dropna(self):
+ # GH 3820
+ a = np.array(['foo', 'foo', 'foo', 'bar',
+ 'bar', 'foo', 'foo'], dtype=object)
+ b = np.array(['one', 'one', 'two', 'one',
+ 'two', 'two', 'two'], dtype=object)
+ c = np.array(['dull', 'dull', 'dull', 'dull',
+ 'dull', 'shiny', 'shiny'], dtype=object)
+ res = pd.crosstab(a, [b, c], rownames=['a'],
+ colnames=['b', 'c'], dropna=False)
+ m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'),
+ ('two', 'dull'), ('two', 'shiny')],
+ names=['b', 'c'])
+ tm.assert_index_equal(res.columns, m)
+
+ def test_crosstab_no_overlap(self):
+        # GH 10291
+
+ s1 = pd.Series([1, 2, 3], index=[1, 2, 3])
+ s2 = pd.Series([4, 5, 6], index=[4, 5, 6])
+
+ actual = crosstab(s1, s2)
+ expected = pd.DataFrame()
+
+ tm.assert_frame_equal(actual, expected)
+
+ def test_margin_dropna(self):
+ # GH 12577
+        # pivot_table counts nulls into the margin ('All')
+        # when margins=True and dropna=True
+
+ df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan],
+ 'b': [3, 3, 4, 4, 4, 4]})
+ actual = pd.crosstab(df.a, df.b, margins=True, dropna=True)
+ expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
+ expected.index = Index([1.0, 2.0, 'All'], name='a')
+ expected.columns = Index([3, 4, 'All'], name='b')
+ tm.assert_frame_equal(actual, expected)
+
+ df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan],
+ 'b': [3, np.nan, 4, 4, 4, 4]})
+ actual = pd.crosstab(df.a, df.b, margins=True, dropna=True)
+ expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
+ expected.index = Index([1.0, 2.0, 'All'], name='a')
+ expected.columns = Index([3.0, 4.0, 'All'], name='b')
+ tm.assert_frame_equal(actual, expected)
+
+ df = DataFrame({'a': [1, np.nan, np.nan, np.nan, np.nan, 2],
+ 'b': [3, 3, 4, 4, 4, 4]})
+ actual = pd.crosstab(df.a, df.b, margins=True, dropna=True)
+ expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
+ expected.index = Index([1.0, 2.0, 'All'], name='a')
+ expected.columns = Index([3, 4, 'All'], name='b')
+ tm.assert_frame_equal(actual, expected)
+
+ # GH 12642
+ # _add_margins raises KeyError: Level None not found
+ # when margins=True and dropna=False
+ df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan],
+ 'b': [3, 3, 4, 4, 4, 4]})
+ actual = pd.crosstab(df.a, df.b, margins=True, dropna=False)
+ expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]])
+ expected.index = Index([1.0, 2.0, 'All'], name='a')
+ expected.columns = Index([3, 4, 'All'], name='b')
+ tm.assert_frame_equal(actual, expected)
+
+ df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan],
+ 'b': [3, np.nan, 4, 4, 4, 4]})
+ actual = pd.crosstab(df.a, df.b, margins=True, dropna=False)
+ expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]])
+ expected.index = Index([1.0, 2.0, 'All'], name='a')
+ expected.columns = Index([3.0, 4.0, 'All'], name='b')
+ tm.assert_frame_equal(actual, expected)
+
+ a = np.array(['foo', 'foo', 'foo', 'bar',
+ 'bar', 'foo', 'foo'], dtype=object)
+ b = np.array(['one', 'one', 'two', 'one',
+ 'two', np.nan, 'two'], dtype=object)
+ c = np.array(['dull', 'dull', 'dull', 'dull',
+ 'dull', 'shiny', 'shiny'], dtype=object)
+
+ actual = pd.crosstab(a, [b, c], rownames=['a'],
+ colnames=['b', 'c'], margins=True, dropna=False)
+ m = MultiIndex.from_arrays([['one', 'one', 'two', 'two', 'All'],
+ ['dull', 'shiny', 'dull', 'shiny', '']],
+ names=['b', 'c'])
+ expected = DataFrame([[1, 0, 1, 0, 2], [2, 0, 1, 1, 5],
+ [3, 0, 2, 1, 7]], columns=m)
+ expected.index = Index(['bar', 'foo', 'All'], name='a')
+ tm.assert_frame_equal(actual, expected)
+
+ actual = pd.crosstab([a, b], c, rownames=['a', 'b'],
+ colnames=['c'], margins=True, dropna=False)
+ m = MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo', 'All'],
+ ['one', 'two', 'one', 'two', '']],
+ names=['a', 'b'])
+ expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2],
+ [5, 2, 7]], index=m)
+ expected.columns = Index(['dull', 'shiny', 'All'], name='c')
+ tm.assert_frame_equal(actual, expected)
+
+ actual = pd.crosstab([a, b], c, rownames=['a', 'b'],
+ colnames=['c'], margins=True, dropna=True)
+ m = MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo', 'All'],
+ ['one', 'two', 'one', 'two', '']],
+ names=['a', 'b'])
+ expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2],
+ [5, 1, 6]], index=m)
+ expected.columns = Index(['dull', 'shiny', 'All'], name='c')
+ tm.assert_frame_equal(actual, expected)
+
+ def test_crosstab_normalize(self):
+ # Issue 12578
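+        # normalize='all' divides by the grand total, 'index' by row
+        # sums and 'columns' by column sums; 0 and 1 are axis-number
+        # aliases for 'index' and 'columns' respectively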
+ df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
+ 'c': [1, 1, np.nan, 1, 1]})
+
+ rindex = pd.Index([1, 2], name='a')
+ cindex = pd.Index([3, 4], name='b')
+ full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]],
+ index=rindex, columns=cindex)
+ row_normal = pd.DataFrame([[1.0, 0], [0.25, 0.75]],
+ index=rindex, columns=cindex)
+ col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]],
+ index=rindex, columns=cindex)
+
+ # Check all normalize args
+ tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='all'),
+ full_normal)
+ tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True),
+ full_normal)
+ tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index'),
+ row_normal)
+ tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns'),
+ col_normal)
+ tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=1),
+ pd.crosstab(df.a, df.b, normalize='columns'))
+ tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=0),
+ pd.crosstab(df.a, df.b, normalize='index'))
+
+ row_normal_margins = pd.DataFrame([[1.0, 0],
+ [0.25, 0.75],
+ [0.4, 0.6]],
+ index=pd.Index([1, 2, 'All'],
+ name='a',
+ dtype='object'),
+ columns=pd.Index([3, 4], name='b',
+ dtype='object'))
+ col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
+ index=pd.Index([1, 2], name='a',
+ dtype='object'),
+ columns=pd.Index([3, 4, 'All'],
+ name='b',
+ dtype='object'))
+
+ all_normal_margins = pd.DataFrame([[0.2, 0, 0.2],
+ [0.2, 0.6, 0.8],
+ [0.4, 0.6, 1]],
+ index=pd.Index([1, 2, 'All'],
+ name='a',
+ dtype='object'),
+ columns=pd.Index([3, 4, 'All'],
+ name='b',
+ dtype='object'))
+ tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index',
+ margins=True), row_normal_margins)
+ tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns',
+ margins=True),
+ col_normal_margins)
+ tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True,
+ margins=True), all_normal_margins)
+
+        # Test with array inputs (smoke test: should not raise)
+ pd.crosstab([np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])],
+ np.array([1, 2, 1, 2]))
+
+ # Test with aggfunc
+ norm_counts = pd.DataFrame([[0.25, 0, 0.25],
+ [0.25, 0.5, 0.75],
+ [0.5, 0.5, 1]],
+ index=pd.Index([1, 2, 'All'],
+ name='a',
+ dtype='object'),
+ columns=pd.Index([3, 4, 'All'],
+ name='b'))
+ test_case = pd.crosstab(df.a, df.b, df.c, aggfunc='count',
+ normalize='all',
+ margins=True)
+ tm.assert_frame_equal(test_case, norm_counts)
+
+ df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
+ 'c': [0, 4, np.nan, 3, 3]})
+
+ norm_sum = pd.DataFrame([[0, 0, 0.],
+ [0.4, 0.6, 1],
+ [0.4, 0.6, 1]],
+ index=pd.Index([1, 2, 'All'],
+ name='a',
+ dtype='object'),
+ columns=pd.Index([3, 4, 'All'],
+ name='b',
+ dtype='object'))
+ test_case = pd.crosstab(df.a, df.b, df.c, aggfunc=np.sum,
+ normalize='all',
+ margins=True)
+ tm.assert_frame_equal(test_case, norm_sum)
+
+ def test_crosstab_with_empties(self):
+ # Check handling of empties
+ df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
+ 'c': [np.nan, np.nan, np.nan, np.nan, np.nan]})
+
+ empty = pd.DataFrame([[0.0, 0.0], [0.0, 0.0]],
+ index=pd.Index([1, 2],
+ name='a',
+ dtype='int64'),
+ columns=pd.Index([3, 4], name='b'))
+
+ for i in [True, 'index', 'columns']:
+ calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count',
+ normalize=i)
+ tm.assert_frame_equal(empty, calculated)
+
+ nans = pd.DataFrame([[0.0, np.nan], [0.0, 0.0]],
+ index=pd.Index([1, 2],
+ name='a',
+ dtype='int64'),
+ columns=pd.Index([3, 4], name='b'))
+
+ calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count',
+ normalize=False)
+ tm.assert_frame_equal(nans, calculated)
+
+ def test_crosstab_errors(self):
+ # Issue 12578
+
+ df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
+ 'c': [1, 1, np.nan, 1, 1]})
+
+ error = 'values cannot be used without an aggfunc.'
+ with pytest.raises(ValueError, match=error):
+ pd.crosstab(df.a, df.b, values=df.c)
+
+ error = 'aggfunc cannot be used without values'
+ with pytest.raises(ValueError, match=error):
+ pd.crosstab(df.a, df.b, aggfunc=np.mean)
+
+ error = 'Not a valid normalize argument'
+ with pytest.raises(ValueError, match=error):
+ pd.crosstab(df.a, df.b, normalize='42')
+
+ with pytest.raises(ValueError, match=error):
+ pd.crosstab(df.a, df.b, normalize=42)
+
+ error = 'Not a valid margins argument'
+ with pytest.raises(ValueError, match=error):
+ pd.crosstab(df.a, df.b, normalize='all', margins=42)
+
+    def test_crosstab_with_categorical_columns(self):
+ # GH 8860
+ df = pd.DataFrame({'MAKE': ['Honda', 'Acura', 'Tesla',
+ 'Honda', 'Honda', 'Acura'],
+ 'MODEL': ['Sedan', 'Sedan', 'Electric',
+ 'Pickup', 'Sedan', 'Sedan']})
+ categories = ['Sedan', 'Electric', 'Pickup']
+ df['MODEL'] = (df['MODEL'].astype('category')
+ .cat.set_categories(categories))
+ result = pd.crosstab(df['MAKE'], df['MODEL'])
+
+ expected_index = pd.Index(['Acura', 'Honda', 'Tesla'], name='MAKE')
+ expected_columns = pd.CategoricalIndex(categories,
+ categories=categories,
+ ordered=False,
+ name='MODEL')
+ expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]]
+ expected = pd.DataFrame(expected_data,
+ index=expected_index,
+ columns=expected_columns)
+ tm.assert_frame_equal(result, expected)
+
+ def test_crosstab_with_numpy_size(self):
+ # GH 4003
+ df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6,
+ 'B': ['A', 'B', 'C'] * 8,
+ 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4,
+ 'D': np.random.randn(24),
+ 'E': np.random.randn(24)})
+ result = pd.crosstab(index=[df['A'], df['B']],
+ columns=[df['C']],
+ margins=True,
+ aggfunc=np.size,
+ values=df['D'])
+ expected_index = pd.MultiIndex(levels=[['All', 'one', 'three', 'two'],
+ ['', 'A', 'B', 'C']],
+ codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0],
+ [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]],
+ names=['A', 'B'])
+ expected_column = pd.Index(['bar', 'foo', 'All'],
+ dtype='object',
+ name='C')
+ expected_data = np.array([[2., 2., 4.],
+ [2., 2., 4.],
+ [2., 2., 4.],
+ [2., np.nan, 2.],
+ [np.nan, 2., 2.],
+ [2., np.nan, 2.],
+ [np.nan, 2., 2.],
+ [2., np.nan, 2.],
+ [np.nan, 2., 2.],
+ [12., 12., 24.]])
+ expected = pd.DataFrame(expected_data,
+ index=expected_index,
+ columns=expected_column)
+ tm.assert_frame_equal(result, expected)
+
+ def test_crosstab_dup_index_names(self):
+ # GH 13279
+ s = pd.Series(range(3), name='foo')
+
+ result = pd.crosstab(s, s)
+ expected_index = pd.Index(range(3), name='foo')
+ expected = pd.DataFrame(np.eye(3, dtype=np.int64),
+ index=expected_index,
+ columns=expected_index)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("names", [['a', ('b', 'c')],
+ [('a', 'b'), 'c']])
+ def test_crosstab_tuple_name(self, names):
+ s1 = pd.Series(range(3), name=names[0])
+ s2 = pd.Series(range(1, 4), name=names[1])
+
+ mi = pd.MultiIndex.from_arrays([range(3), range(1, 4)], names=names)
+ expected = pd.Series(1, index=mi).unstack(1, fill_value=0)
+
+ result = pd.crosstab(s1, s2)
+ tm.assert_frame_equal(result, expected)
+
+ def test_crosstab_unsorted_order(self):
+ df = pd.DataFrame({"b": [3, 1, 2], 'a': [5, 4, 6]},
+ index=['C', 'A', 'B'])
+ result = pd.crosstab(df.index, [df.b, df.a])
+ e_idx = pd.Index(['A', 'B', 'C'], name='row_0')
+ e_columns = pd.MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)],
+ names=['b', 'a'])
+ expected = pd.DataFrame([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
+ index=e_idx,
+ columns=e_columns)
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/test_qcut.py b/contrib/python/pandas/py2/pandas/tests/reshape/test_qcut.py
new file mode 100644
index 00000000000..997df7fd7aa
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/test_qcut.py
@@ -0,0 +1,199 @@
+import os
+
+import numpy as np
+import pytest
+
+from pandas.compat import zip
+
+from pandas import (
+ Categorical, DatetimeIndex, Interval, IntervalIndex, NaT, Series,
+ TimedeltaIndex, Timestamp, cut, date_range, isna, qcut, timedelta_range)
+from pandas.api.types import CategoricalDtype as CDT
+from pandas.core.algorithms import quantile
+import pandas.util.testing as tm
+
+from pandas.tseries.offsets import Day, Nano
+
+
+def test_qcut():
+ arr = np.random.randn(1000)
+
+    # We store the bins as an Index that has been rounded,
+    # so comparisons are a bit tricky.
+ labels, bins = qcut(arr, 4, retbins=True)
+ ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
+
+ result = labels.categories.left.values
+ assert np.allclose(result, ex_bins[:-1], atol=1e-2)
+
+ result = labels.categories.right.values
+ assert np.allclose(result, ex_bins[1:], atol=1e-2)
+
+ ex_levels = cut(arr, ex_bins, include_lowest=True)
+ tm.assert_categorical_equal(labels, ex_levels)
+
+
+def test_qcut_bounds():
+ arr = np.random.randn(1000)
+
+ factor = qcut(arr, 10, labels=False)
+ assert len(np.unique(factor)) == 10
+
+
+def test_qcut_specify_quantiles():
+ arr = np.random.randn(100)
+ factor = qcut(arr, [0, .25, .5, .75, 1.])
+
+ expected = qcut(arr, 4)
+ tm.assert_categorical_equal(factor, expected)
+
+
+def test_qcut_all_bins_same():
+ with pytest.raises(ValueError, match="edges.*unique"):
+ qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)
+
+
+def test_qcut_include_lowest():
+ values = np.arange(10)
+ ii = qcut(values, 4)
+
+ ex_levels = IntervalIndex([Interval(-0.001, 2.25), Interval(2.25, 4.5),
+ Interval(4.5, 6.75), Interval(6.75, 9)])
+ tm.assert_index_equal(ii.categories, ex_levels)
+
+
+def test_qcut_nas():
+ arr = np.random.randn(100)
+ arr[:20] = np.nan
+
+ result = qcut(arr, 4)
+ assert isna(result[:20]).all()
+
+
+def test_qcut_index():
+ result = qcut([0, 2], 2)
+ intervals = [Interval(-0.001, 1), Interval(1, 2)]
+
+ expected = Categorical(intervals, ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+
+def test_qcut_binning_issues(datapath):
+ # see gh-1978, gh-1979
+ cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv"))
+ arr = np.loadtxt(cut_file)
+ result = qcut(arr, 20)
+
+ starts = []
+ ends = []
+
+ for lev in np.unique(result):
+ s = lev.left
+ e = lev.right
+ assert s != e
+
+ starts.append(float(s))
+ ends.append(float(e))
+
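+    # consecutive bin starts/ends must be strictly increasing, and
+    # adjacent bins must not overlap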
+ for (sp, sn), (ep, en) in zip(zip(starts[:-1], starts[1:]),
+ zip(ends[:-1], ends[1:])):
+ assert sp < sn
+ assert ep < en
+ assert ep <= sn
+
+
+def test_qcut_return_intervals():
+ ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
+ res = qcut(ser, [0, 0.333, 0.666, 1])
+
+ exp_levels = np.array([Interval(-0.001, 2.664),
+ Interval(2.664, 5.328), Interval(5.328, 8)])
+ exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
+ CDT(ordered=True))
+ tm.assert_series_equal(res, exp)
+
+
[email protected]("kwargs,msg", [
+ (dict(duplicates="drop"), None),
+ (dict(), "Bin edges must be unique"),
+ (dict(duplicates="raise"), "Bin edges must be unique"),
+ (dict(duplicates="foo"), "invalid value for 'duplicates' parameter")
+])
+def test_qcut_duplicates_bin(kwargs, msg):
+ # see gh-7751
+ values = [0, 0, 0, 0, 1, 2, 3]
+
+ if msg is not None:
+ with pytest.raises(ValueError, match=msg):
+ qcut(values, 3, **kwargs)
+ else:
+ result = qcut(values, 3, **kwargs)
+ expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
+ tm.assert_index_equal(result.categories, expected)
+
+
[email protected]("data,start,end", [
+ (9.0, 8.999, 9.0),
+ (0.0, -0.001, 0.0),
+ (-9.0, -9.001, -9.0),
+])
[email protected]("length", [1, 2])
[email protected]("labels", [None, False])
+def test_single_quantile(data, start, end, length, labels):
+ # see gh-15431
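+    # with a single quantile every value falls into one interval whose
+    # edges hug the lone data point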
+ ser = Series([data] * length)
+ result = qcut(ser, 1, labels=labels)
+
+ if labels is None:
+ intervals = IntervalIndex([Interval(start, end)] *
+ length, closed="right")
+ expected = Series(intervals).astype(CDT(ordered=True))
+ else:
+ expected = Series([0] * length)
+
+ tm.assert_series_equal(result, expected)
+
+
[email protected]("ser", [
+    Series(DatetimeIndex(["20180101", NaT, "20180103"])),
+ Series(TimedeltaIndex(["0 days", NaT, "2 days"]))],
+ ids=lambda x: str(x.dtype))
+def test_qcut_nat(ser):
+ # see gh-19768
+ intervals = IntervalIndex.from_tuples([
+ (ser[0] - Nano(), ser[2] - Day()),
+ np.nan, (ser[2] - Day(), ser[2])])
+ expected = Series(Categorical(intervals, ordered=True))
+
+ result = qcut(ser, 2)
+ tm.assert_series_equal(result, expected)
+
+
[email protected]("bins", [3, np.linspace(0, 1, 4)])
+def test_datetime_tz_qcut(bins):
+ # see gh-19872
+ tz = "US/Eastern"
+ ser = Series(date_range("20130101", periods=3, tz=tz))
+
+ result = qcut(ser, bins)
+ expected = Series(IntervalIndex([
+ Interval(Timestamp("2012-12-31 23:59:59.999999999", tz=tz),
+ Timestamp("2013-01-01 16:00:00", tz=tz)),
+ Interval(Timestamp("2013-01-01 16:00:00", tz=tz),
+ Timestamp("2013-01-02 08:00:00", tz=tz)),
+ Interval(Timestamp("2013-01-02 08:00:00", tz=tz),
+ Timestamp("2013-01-03 00:00:00", tz=tz))])).astype(
+ CDT(ordered=True))
+ tm.assert_series_equal(result, expected)
+
+
[email protected]("arg,expected_bins", [
+ [timedelta_range("1day", periods=3),
+ TimedeltaIndex(["1 days", "2 days", "3 days"])],
+ [date_range("20180101", periods=3),
+ DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"])]])
+def test_date_like_qcut_bins(arg, expected_bins):
+ # see gh-19891
+ ser = Series(arg)
+ result, result_bins = qcut(ser, 2, retbins=True)
+ tm.assert_index_equal(result_bins, expected_bins)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/test_reshape.py b/contrib/python/pandas/py2/pandas/tests/reshape/test_reshape.py
new file mode 100644
index 00000000000..7b544b7981c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/test_reshape.py
@@ -0,0 +1,621 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=W0612,E1101
+
+from collections import OrderedDict
+
+import numpy as np
+from numpy import nan
+import pytest
+
+from pandas.compat import u
+
+from pandas.core.dtypes.common import is_integer_dtype
+
+import pandas as pd
+from pandas import Categorical, DataFrame, Index, Series, get_dummies
+from pandas.core.sparse.api import SparseArray, SparseDtype
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal
+
+
+class TestGetDummies(object):
+
+ @pytest.fixture
+ def df(self):
+ return DataFrame({'A': ['a', 'b', 'a'],
+ 'B': ['b', 'b', 'c'],
+ 'C': [1, 2, 3]})
+
+ @pytest.fixture(params=['uint8', 'i8', np.float64, bool, None])
+ def dtype(self, request):
+ return np.dtype(request.param)
+
+ @pytest.fixture(params=['dense', 'sparse'])
+ def sparse(self, request):
+ # params are strings to simplify reading test results,
+ # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
+ return request.param == 'sparse'
+
+ def effective_dtype(self, dtype):
+ if dtype is None:
+ return np.uint8
+ return dtype
+
+ def test_raises_on_dtype_object(self, df):
+ with pytest.raises(ValueError):
+ get_dummies(df, dtype='object')
+
+ def test_basic(self, sparse, dtype):
+ s_list = list('abc')
+ s_series = Series(s_list)
+ s_series_index = Series(s_list, list('ABC'))
+
+ expected = DataFrame({'a': [1, 0, 0],
+ 'b': [0, 1, 0],
+ 'c': [0, 0, 1]},
+ dtype=self.effective_dtype(dtype))
+ if sparse:
+ expected = expected.apply(pd.SparseArray, fill_value=0.0)
+ result = get_dummies(s_list, sparse=sparse, dtype=dtype)
+ assert_frame_equal(result, expected)
+
+ result = get_dummies(s_series, sparse=sparse, dtype=dtype)
+ assert_frame_equal(result, expected)
+
+ expected.index = list('ABC')
+ result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
+ assert_frame_equal(result, expected)
+
+ def test_basic_types(self, sparse, dtype):
+ # GH 10531
+ s_list = list('abc')
+ s_series = Series(s_list)
+ s_df = DataFrame({'a': [0, 1, 0, 1, 2],
+ 'b': ['A', 'A', 'B', 'C', 'C'],
+ 'c': [2, 3, 3, 3, 2]})
+
+ expected = DataFrame({'a': [1, 0, 0],
+ 'b': [0, 1, 0],
+ 'c': [0, 0, 1]},
+ dtype=self.effective_dtype(dtype),
+ columns=list('abc'))
+ if sparse:
+ if is_integer_dtype(dtype):
+ fill_value = 0
+ elif dtype == bool:
+ fill_value = False
+ else:
+ fill_value = 0.0
+
+ expected = expected.apply(SparseArray, fill_value=fill_value)
+ result = get_dummies(s_list, sparse=sparse, dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+ result = get_dummies(s_series, sparse=sparse, dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+ result = get_dummies(s_df, columns=s_df.columns,
+ sparse=sparse, dtype=dtype)
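+ # sparse columns report a composite dtype name, e.g. 'Sparse[uint8, 0]'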
+ if sparse:
+ dtype_name = 'Sparse[{}, {}]'.format(
+ self.effective_dtype(dtype).name,
+ fill_value
+ )
+ else:
+ dtype_name = self.effective_dtype(dtype).name
+
+ expected = Series({dtype_name: 8})
+ tm.assert_series_equal(result.get_dtype_counts(), expected)
+
+ result = get_dummies(s_df, columns=['a'], sparse=sparse, dtype=dtype)
+
+ expected_counts = {'int64': 1, 'object': 1}
+ expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)
+
+ expected = Series(expected_counts).sort_index()
+ tm.assert_series_equal(result.get_dtype_counts().sort_index(),
+ expected)
+
+ def test_just_na(self, sparse):
+ just_na_list = [np.nan]
+ just_na_series = Series(just_na_list)
+ just_na_series_index = Series(just_na_list, index=['A'])
+
+ res_list = get_dummies(just_na_list, sparse=sparse)
+ res_series = get_dummies(just_na_series, sparse=sparse)
+ res_series_index = get_dummies(just_na_series_index, sparse=sparse)
+
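+ # an all-NaN input yields no dummy columns, but the original index survives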
+ assert res_list.empty
+ assert res_series.empty
+ assert res_series_index.empty
+
+ assert res_list.index.tolist() == [0]
+ assert res_series.index.tolist() == [0]
+ assert res_series_index.index.tolist() == ['A']
+
+ def test_include_na(self, sparse, dtype):
+ s = ['a', 'b', np.nan]
+ res = get_dummies(s, sparse=sparse, dtype=dtype)
+ exp = DataFrame({'a': [1, 0, 0],
+ 'b': [0, 1, 0]},
+ dtype=self.effective_dtype(dtype))
+ if sparse:
+ exp = exp.apply(pd.SparseArray, fill_value=0.0)
+ assert_frame_equal(res, exp)
+
+ # Sparse dataframes do not allow nan labelled columns, see #GH8822
+ res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
+ exp_na = DataFrame({nan: [0, 0, 1],
+ 'a': [1, 0, 0],
+ 'b': [0, 1, 0]},
+ dtype=self.effective_dtype(dtype))
+ exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
+ # hack (NaN handling in assert_index_equal)
+ exp_na.columns = res_na.columns
+ if sparse:
+ exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
+ assert_frame_equal(res_na, exp_na)
+
+ res_just_na = get_dummies([nan], dummy_na=True,
+ sparse=sparse, dtype=dtype)
+ exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
+ dtype=self.effective_dtype(dtype))
+ tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
+
+ def test_unicode(self, sparse):
+ # See GH 6885 - get_dummies chokes on unicode values
+ import unicodedata
+ e = 'e'
+ eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
+ s = [e, eacute, eacute]
+ res = get_dummies(s, prefix='letter', sparse=sparse)
+ exp = DataFrame({'letter_e': [1, 0, 0],
+ u('letter_%s') % eacute: [0, 1, 1]},
+ dtype=np.uint8)
+ if sparse:
+ exp = exp.apply(pd.SparseArray, fill_value=0)
+ assert_frame_equal(res, exp)
+
+ def test_dataframe_dummies_all_obj(self, df, sparse):
+ df = df[['A', 'B']]
+ result = get_dummies(df, sparse=sparse)
+ expected = DataFrame({'A_a': [1, 0, 1],
+ 'A_b': [0, 1, 0],
+ 'B_b': [1, 1, 0],
+ 'B_c': [0, 0, 1]},
+ dtype=np.uint8)
+ if sparse:
+ expected = pd.DataFrame({
+ "A_a": pd.SparseArray([1, 0, 1], dtype='uint8'),
+ "A_b": pd.SparseArray([0, 1, 0], dtype='uint8'),
+ "B_b": pd.SparseArray([1, 1, 0], dtype='uint8'),
+ "B_c": pd.SparseArray([0, 0, 1], dtype='uint8'),
+ })
+
+ assert_frame_equal(result, expected)
+
+ def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
+ result = get_dummies(df, sparse=sparse, dtype=dtype)
+ if sparse:
+ arr = SparseArray
+ typ = SparseDtype(dtype, 0)
+ else:
+ arr = np.array
+ typ = dtype
+ expected = DataFrame({'C': [1, 2, 3],
+ 'A_a': arr([1, 0, 1], dtype=typ),
+ 'A_b': arr([0, 1, 0], dtype=typ),
+ 'B_b': arr([1, 1, 0], dtype=typ),
+ 'B_c': arr([0, 0, 1], dtype=typ)})
+ expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
+ assert_frame_equal(result, expected)
+
+ def test_dataframe_dummies_prefix_list(self, df, sparse):
+ prefixes = ['from_A', 'from_B']
+ result = get_dummies(df, prefix=prefixes, sparse=sparse)
+ expected = DataFrame({'C': [1, 2, 3],
+ 'from_A_a': [1, 0, 1],
+ 'from_A_b': [0, 1, 0],
+ 'from_B_b': [1, 1, 0],
+ 'from_B_c': [0, 0, 1]},
+ dtype=np.uint8)
+ expected[['C']] = df[['C']]
+ cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
+ expected = expected[['C'] + cols]
+
+ typ = pd.SparseArray if sparse else pd.Series
+ expected[cols] = expected[cols].apply(lambda x: typ(x))
+ assert_frame_equal(result, expected)
+
+ def test_dataframe_dummies_prefix_str(self, df, sparse):
+ # not that you should do this...
+ result = get_dummies(df, prefix='bad', sparse=sparse)
+ bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c']
+ expected = DataFrame([[1, 1, 0, 1, 0],
+ [2, 0, 1, 1, 0],
+ [3, 1, 0, 0, 1]],
+ columns=['C'] + bad_columns,
+ dtype=np.uint8)
+ expected = expected.astype({"C": np.int64})
+ if sparse:
+ # work around astyping & assigning with duplicate columns
+ # https://github.com/pandas-dev/pandas/issues/14427
+ expected = pd.concat([
+ pd.Series([1, 2, 3], name='C'),
+ pd.Series([1, 0, 1], name='bad_a', dtype='Sparse[uint8]'),
+ pd.Series([0, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
+ pd.Series([1, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
+ pd.Series([0, 0, 1], name='bad_c', dtype='Sparse[uint8]'),
+ ], axis=1)
+
+ assert_frame_equal(result, expected)
+
+ def test_dataframe_dummies_subset(self, df, sparse):
+ result = get_dummies(df, prefix=['from_A'], columns=['A'],
+ sparse=sparse)
+ expected = DataFrame({'B': ['b', 'b', 'c'],
+ 'C': [1, 2, 3],
+ 'from_A_a': [1, 0, 1],
+ 'from_A_b': [0, 1, 0]}, dtype=np.uint8)
+ expected[['C']] = df[['C']]
+ if sparse:
+ cols = ['from_A_a', 'from_A_b']
+ expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x))
+ assert_frame_equal(result, expected)
+
+ def test_dataframe_dummies_prefix_sep(self, df, sparse):
+ result = get_dummies(df, prefix_sep='..', sparse=sparse)
+ expected = DataFrame({'C': [1, 2, 3],
+ 'A..a': [1, 0, 1],
+ 'A..b': [0, 1, 0],
+ 'B..b': [1, 1, 0],
+ 'B..c': [0, 0, 1]},
+ dtype=np.uint8)
+ expected[['C']] = df[['C']]
+ expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
+ if sparse:
+ cols = ['A..a', 'A..b', 'B..b', 'B..c']
+ expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x))
+
+ assert_frame_equal(result, expected)
+
+ result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse)
+ expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
+ assert_frame_equal(result, expected)
+
+ result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'},
+ sparse=sparse)
+ assert_frame_equal(result, expected)
+
+ def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
+ with pytest.raises(ValueError):
+ get_dummies(df, prefix=['too few'], sparse=sparse)
+
+ def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
+ with pytest.raises(ValueError):
+ get_dummies(df, prefix_sep=['bad'], sparse=sparse)
+
+ def test_dataframe_dummies_prefix_dict(self, sparse):
+ prefixes = {'A': 'from_A', 'B': 'from_B'}
+ df = DataFrame({'C': [1, 2, 3],
+ 'A': ['a', 'b', 'a'],
+ 'B': ['b', 'b', 'c']})
+ result = get_dummies(df, prefix=prefixes, sparse=sparse)
+
+ expected = DataFrame({'C': [1, 2, 3],
+ 'from_A_a': [1, 0, 1],
+ 'from_A_b': [0, 1, 0],
+ 'from_B_b': [1, 1, 0],
+ 'from_B_c': [0, 0, 1]})
+
+ columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
+ expected[columns] = expected[columns].astype(np.uint8)
+ if sparse:
+ expected[columns] = expected[columns].apply(
+ lambda x: pd.SparseSeries(x)
+ )
+
+ assert_frame_equal(result, expected)
+
+ def test_dataframe_dummies_with_na(self, df, sparse, dtype):
+ df.loc[3, :] = [np.nan, np.nan, np.nan]
+ result = get_dummies(df, dummy_na=True,
+ sparse=sparse, dtype=dtype).sort_index(axis=1)
+
+ if sparse:
+ arr = SparseArray
+ typ = SparseDtype(dtype, 0)
+ else:
+ arr = np.array
+ typ = dtype
+
+ expected = DataFrame({'C': [1, 2, 3, np.nan],
+ 'A_a': arr([1, 0, 1, 0], dtype=typ),
+ 'A_b': arr([0, 1, 0, 0], dtype=typ),
+ 'A_nan': arr([0, 0, 0, 1], dtype=typ),
+ 'B_b': arr([1, 1, 0, 0], dtype=typ),
+ 'B_c': arr([0, 0, 1, 0], dtype=typ),
+ 'B_nan': arr([0, 0, 0, 1], dtype=typ)
+ }).sort_index(axis=1)
+
+ assert_frame_equal(result, expected)
+
+ result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
+ expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
+ assert_frame_equal(result, expected)
+
+ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
+ df['cat'] = pd.Categorical(['x', 'y', 'y'])
+ result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
+ if sparse:
+ arr = SparseArray
+ typ = SparseDtype(dtype, 0)
+ else:
+ arr = np.array
+ typ = dtype
+
+ expected = DataFrame({'C': [1, 2, 3],
+ 'A_a': arr([1, 0, 1], dtype=typ),
+ 'A_b': arr([0, 1, 0], dtype=typ),
+ 'B_b': arr([1, 1, 0], dtype=typ),
+ 'B_c': arr([0, 0, 1], dtype=typ),
+ 'cat_x': arr([1, 0, 0], dtype=typ),
+ 'cat_y': arr([0, 1, 1], dtype=typ)
+ }).sort_index(axis=1)
+
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('get_dummies_kwargs,expected', [
+ ({'data': pd.DataFrame(({u'ä': ['a']}))},
+ pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),
+
+ ({'data': pd.DataFrame({'x': [u'ä']})},
+ pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8)),
+
+ ({'data': pd.DataFrame({'x': [u'a']}), 'prefix':u'ä'},
+ pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),
+
+ ({'data': pd.DataFrame({'x': [u'a']}), 'prefix_sep':u'ä'},
+ pd.DataFrame({u'xäa': [1]}, dtype=np.uint8))])
+ def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
+ # GH22084 pd.get_dummies incorrectly encodes unicode characters
+ # in dataframe column names
+ result = get_dummies(**get_dummies_kwargs)
+ assert_frame_equal(result, expected)
+
+ def test_basic_drop_first(self, sparse):
+ # GH12402 Add a new parameter `drop_first` to avoid collinearity
+ # Basic case
+ s_list = list('abc')
+ s_series = Series(s_list)
+ s_series_index = Series(s_list, list('ABC'))
+
+ expected = DataFrame({'b': [0, 1, 0],
+ 'c': [0, 0, 1]},
+ dtype=np.uint8)
+
+ result = get_dummies(s_list, drop_first=True, sparse=sparse)
+ if sparse:
+ expected = expected.apply(pd.SparseArray, fill_value=0)
+ assert_frame_equal(result, expected)
+
+ result = get_dummies(s_series, drop_first=True, sparse=sparse)
+ assert_frame_equal(result, expected)
+
+ expected.index = list('ABC')
+ result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
+ assert_frame_equal(result, expected)
+
+ def test_basic_drop_first_one_level(self, sparse):
+ # Test the case that categorical variable only has one level.
+ s_list = list('aaa')
+ s_series = Series(s_list)
+ s_series_index = Series(s_list, list('ABC'))
+
+ expected = DataFrame(index=np.arange(3))
+
+ result = get_dummies(s_list, drop_first=True, sparse=sparse)
+ assert_frame_equal(result, expected)
+
+ result = get_dummies(s_series, drop_first=True, sparse=sparse)
+ assert_frame_equal(result, expected)
+
+ expected = DataFrame(index=list('ABC'))
+ result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
+ assert_frame_equal(result, expected)
+
+ def test_basic_drop_first_NA(self, sparse):
+ # Test NA handling together with drop_first
+ s_NA = ['a', 'b', np.nan]
+ res = get_dummies(s_NA, drop_first=True, sparse=sparse)
+ exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
+ if sparse:
+ exp = exp.apply(pd.SparseArray, fill_value=0)
+
+ assert_frame_equal(res, exp)
+
+ res_na = get_dummies(s_NA, dummy_na=True, drop_first=True,
+ sparse=sparse)
+ exp_na = DataFrame(
+ {'b': [0, 1, 0],
+ nan: [0, 0, 1]},
+ dtype=np.uint8).reindex(['b', nan], axis=1)
+ if sparse:
+ exp_na = exp_na.apply(pd.SparseArray, fill_value=0)
+ assert_frame_equal(res_na, exp_na)
+
+ res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
+ sparse=sparse)
+ exp_just_na = DataFrame(index=np.arange(1))
+ assert_frame_equal(res_just_na, exp_just_na)
+
+ def test_dataframe_dummies_drop_first(self, df, sparse):
+ df = df[['A', 'B']]
+ result = get_dummies(df, drop_first=True, sparse=sparse)
+ expected = DataFrame({'A_b': [0, 1, 0],
+ 'B_c': [0, 0, 1]},
+ dtype=np.uint8)
+ if sparse:
+ expected = expected.apply(pd.SparseArray, fill_value=0)
+ assert_frame_equal(result, expected)
+
+ def test_dataframe_dummies_drop_first_with_categorical(
+ self, df, sparse, dtype):
+ df['cat'] = pd.Categorical(['x', 'y', 'y'])
+ result = get_dummies(df, drop_first=True, sparse=sparse)
+ expected = DataFrame({'C': [1, 2, 3],
+ 'A_b': [0, 1, 0],
+ 'B_c': [0, 0, 1],
+ 'cat_y': [0, 1, 1]})
+ cols = ['A_b', 'B_c', 'cat_y']
+ expected[cols] = expected[cols].astype(np.uint8)
+ expected = expected[['C', 'A_b', 'B_c', 'cat_y']]
+ if sparse:
+ for col in cols:
+ expected[col] = pd.SparseSeries(expected[col])
+ assert_frame_equal(result, expected)
+
+ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
+ df.loc[3, :] = [np.nan, np.nan, np.nan]
+ result = get_dummies(df, dummy_na=True, drop_first=True,
+ sparse=sparse).sort_index(axis=1)
+ expected = DataFrame({'C': [1, 2, 3, np.nan],
+ 'A_b': [0, 1, 0, 0],
+ 'A_nan': [0, 0, 0, 1],
+ 'B_c': [0, 0, 1, 0],
+ 'B_nan': [0, 0, 0, 1]})
+ cols = ['A_b', 'A_nan', 'B_c', 'B_nan']
+ expected[cols] = expected[cols].astype(np.uint8)
+ expected = expected.sort_index(axis=1)
+ if sparse:
+ for col in cols:
+ expected[col] = pd.SparseSeries(expected[col])
+
+ assert_frame_equal(result, expected)
+
+ result = get_dummies(df, dummy_na=False, drop_first=True,
+ sparse=sparse)
+ expected = expected[['C', 'A_b', 'B_c']]
+ assert_frame_equal(result, expected)
+
+ def test_int_int(self):
+ data = Series([1, 2, 1])
+ result = pd.get_dummies(data)
+ expected = DataFrame([[1, 0],
+ [0, 1],
+ [1, 0]],
+ columns=[1, 2],
+ dtype=np.uint8)
+ tm.assert_frame_equal(result, expected)
+
+ data = Series(pd.Categorical(['a', 'b', 'a']))
+ result = pd.get_dummies(data)
+ expected = DataFrame([[1, 0],
+ [0, 1],
+ [1, 0]],
+ columns=pd.Categorical(['a', 'b']),
+ dtype=np.uint8)
+ tm.assert_frame_equal(result, expected)
+
+ def test_int_df(self, dtype):
+ data = DataFrame(
+ {'A': [1, 2, 1],
+ 'B': pd.Categorical(['a', 'b', 'a']),
+ 'C': [1, 2, 1],
+ 'D': [1., 2., 1.]
+ }
+ )
+ columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b']
+ expected = DataFrame([
+ [1, 1., 1, 0, 1, 0],
+ [2, 2., 0, 1, 0, 1],
+ [1, 1., 1, 0, 1, 0]
+ ], columns=columns)
+ expected[columns[2:]] = expected[columns[2:]].astype(dtype)
+ result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype)
+ tm.assert_frame_equal(result, expected)
+
+ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
+ # GH13854
+ for ordered in [False, True]:
+ cat = pd.Categorical(list("xy"), categories=list("xyz"),
+ ordered=ordered)
+ result = get_dummies(cat, dtype=dtype)
+
+ data = np.array([[1, 0, 0], [0, 1, 0]],
+ dtype=self.effective_dtype(dtype))
+ cols = pd.CategoricalIndex(cat.categories,
+ categories=cat.categories,
+ ordered=ordered)
+ expected = DataFrame(data, columns=cols,
+ dtype=self.effective_dtype(dtype))
+
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('sparse', [True, False])
+ def test_get_dummies_dont_sparsify_all_columns(self, sparse):
+ # GH18914
+ df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]),
+ ('Nation', ['AB', 'CD'])]))
+ df = get_dummies(df, columns=['Nation'], sparse=sparse)
+ df2 = df.reindex(columns=['GDP'])
+
+ tm.assert_frame_equal(df[['GDP']], df2)
+
+ def test_get_dummies_duplicate_columns(self, df):
+ # GH20839
+ df.columns = ["A", "A", "A"]
+ result = get_dummies(df).sort_index(axis=1)
+
+ expected = DataFrame([[1, 1, 0, 1, 0],
+ [2, 0, 1, 1, 0],
+ [3, 1, 0, 0, 1]],
+ columns=['A', 'A_a', 'A_b', 'A_b', 'A_c'],
+ dtype=np.uint8).sort_index(axis=1)
+
+ expected = expected.astype({"A": np.int64})
+
+ tm.assert_frame_equal(result, expected)
+
+
+class TestCategoricalReshape(object):
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_reshaping_panel_categorical(self):
+
+ p = tm.makePanel()
+ p['str'] = 'foo'
+ df = p.to_frame()
+
+ df['category'] = df['str'].astype('category')
+ result = df['category'].unstack()
+
+ c = Categorical(['foo'] * len(p.major_axis))
+ expected = DataFrame({'A': c.copy(),
+ 'B': c.copy(),
+ 'C': c.copy(),
+ 'D': c.copy()},
+ columns=Index(list('ABCD'), name='minor'),
+ index=p.major_axis.set_names('major'))
+ tm.assert_frame_equal(result, expected)
+
+
+class TestMakeAxisDummies(object):
+
+ def test_preserve_categorical_dtype(self):
+ # GH13854
+ for ordered in [False, True]:
+ cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered)
+ midx = pd.MultiIndex(levels=[['a'], cidx],
+ codes=[[0, 0], [0, 1]])
+ df = DataFrame([[10, 11]], index=midx)
+
+ expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
+ index=midx, columns=cidx)
+
+ from pandas.core.reshape.reshape import make_axis_dummies
+ result = make_axis_dummies(df)
+ tm.assert_frame_equal(result, expected)
+
+ result = make_axis_dummies(df, transform=lambda x: x)
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/test_union_categoricals.py b/contrib/python/pandas/py2/pandas/tests/reshape/test_union_categoricals.py
new file mode 100644
index 00000000000..9b2b8bf9ed4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/test_union_categoricals.py
@@ -0,0 +1,346 @@
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.concat import union_categoricals
+
+import pandas as pd
+from pandas import Categorical, CategoricalIndex, Series
+from pandas.util import testing as tm
+
+
+class TestUnionCategoricals(object):
+
+ def test_union_categorical(self):
+ # GH 13361
+ data = [
+ (list('abc'), list('abd'), list('abcabd')),
+ ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
+ ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
+
+ (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
+ ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),
+
+ (pd.date_range('2014-01-01', '2014-01-05'),
+ pd.date_range('2014-01-06', '2014-01-07'),
+ pd.date_range('2014-01-01', '2014-01-07')),
+
+ (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
+ pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
+ pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),
+
+ (pd.period_range('2014-01-01', '2014-01-05'),
+ pd.period_range('2014-01-06', '2014-01-07'),
+ pd.period_range('2014-01-01', '2014-01-07')),
+ ]
+
+ for a, b, combined in data:
+ for box in [Categorical, CategoricalIndex, Series]:
+ result = union_categoricals([box(Categorical(a)),
+ box(Categorical(b))])
+ expected = Categorical(combined)
+ tm.assert_categorical_equal(result, expected,
+ check_category_order=True)
+
+ # new categories ordered by appearance
+ s = Categorical(['x', 'y', 'z'])
+ s2 = Categorical(['a', 'b', 'c'])
+ result = union_categoricals([s, s2])
+ expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
+ categories=['x', 'y', 'z', 'a', 'b', 'c'])
+ tm.assert_categorical_equal(result, expected)
+
+ s = Categorical([0, 1.2, 2], ordered=True)
+ s2 = Categorical([0, 1.2, 2], ordered=True)
+ result = union_categoricals([s, s2])
+ expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+ # must exactly match types
+ s = Categorical([0, 1.2, 2])
+ s2 = Categorical([2, 3, 4])
+ msg = 'dtype of categories must be the same'
+ with pytest.raises(TypeError, match=msg):
+ union_categoricals([s, s2])
+
+ msg = 'No Categoricals to union'
+ with pytest.raises(ValueError, match=msg):
+ union_categoricals([])
+
+ def test_union_categoricals_nan(self):
+ # GH 13759
+ res = union_categoricals([pd.Categorical([1, 2, np.nan]),
+ pd.Categorical([3, 2, np.nan])])
+ exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
+ tm.assert_categorical_equal(res, exp)
+
+ res = union_categoricals([pd.Categorical(['A', 'B']),
+ pd.Categorical(['B', 'B', np.nan])])
+ exp = Categorical(['A', 'B', 'B', 'B', np.nan])
+ tm.assert_categorical_equal(res, exp)
+
+ val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'),
+ pd.NaT]
+ val2 = [pd.NaT, pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-02-01')]
+
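+ # the union's categories follow first-appearance order across both inputs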
+ res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
+ exp = Categorical(val1 + val2,
+ categories=[pd.Timestamp('2011-01-01'),
+ pd.Timestamp('2011-03-01'),
+ pd.Timestamp('2011-02-01')])
+ tm.assert_categorical_equal(res, exp)
+
+ # all NaN
+ res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan],
+ dtype=object)),
+ pd.Categorical(['X'])])
+ exp = Categorical([np.nan, np.nan, 'X'])
+ tm.assert_categorical_equal(res, exp)
+
+ res = union_categoricals([pd.Categorical([np.nan, np.nan]),
+ pd.Categorical([np.nan, np.nan])])
+ exp = Categorical([np.nan, np.nan, np.nan, np.nan])
+ tm.assert_categorical_equal(res, exp)
+
+ def test_union_categoricals_empty(self):
+ # GH 13759
+ res = union_categoricals([pd.Categorical([]),
+ pd.Categorical([])])
+ exp = Categorical([])
+ tm.assert_categorical_equal(res, exp)
+
+ res = union_categoricals([Categorical([]),
+ Categorical(['1'])])
+ exp = Categorical(['1'])
+ tm.assert_categorical_equal(res, exp)
+
+ def test_union_categorical_same_category(self):
+ # check fastpath
+ c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
+ c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
+ res = union_categoricals([c1, c2])
+ exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
+ categories=[1, 2, 3, 4])
+ tm.assert_categorical_equal(res, exp)
+
+ c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
+ c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
+ res = union_categoricals([c1, c2])
+ exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
+ categories=['x', 'y', 'z'])
+ tm.assert_categorical_equal(res, exp)
+
+ def test_union_categorical_same_categories_different_order(self):
+ # https://github.com/pandas-dev/pandas/issues/19096
+ c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])
+ c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])
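+ # same category set, different orderings: the first operand's order wins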
+ result = union_categoricals([c1, c2])
+ expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
+ categories=['a', 'b', 'c'])
+ tm.assert_categorical_equal(result, expected)
+
+ def test_union_categoricals_ordered(self):
+ c1 = Categorical([1, 2, 3], ordered=True)
+ c2 = Categorical([1, 2, 3], ordered=False)
+
+ msg = 'Categorical.ordered must be the same'
+ with pytest.raises(TypeError, match=msg):
+ union_categoricals([c1, c2])
+
+ res = union_categoricals([c1, c1])
+ exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
+ tm.assert_categorical_equal(res, exp)
+
+ c1 = Categorical([1, 2, 3, np.nan], ordered=True)
+ c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
+
+ res = union_categoricals([c1, c2])
+ exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
+ tm.assert_categorical_equal(res, exp)
+
+ c1 = Categorical([1, 2, 3], ordered=True)
+ c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
+
+ msg = "to union ordered Categoricals, all categories must be the same"
+ with pytest.raises(TypeError, match=msg):
+ union_categoricals([c1, c2])
+
+ def test_union_categoricals_ignore_order(self):
+ # GH 15219
+ c1 = Categorical([1, 2, 3], ordered=True)
+ c2 = Categorical([1, 2, 3], ordered=False)
+
+ res = union_categoricals([c1, c2], ignore_order=True)
+ exp = Categorical([1, 2, 3, 1, 2, 3])
+ tm.assert_categorical_equal(res, exp)
+
+ msg = 'Categorical.ordered must be the same'
+ with pytest.raises(TypeError, match=msg):
+ union_categoricals([c1, c2], ignore_order=False)
+
+ res = union_categoricals([c1, c1], ignore_order=True)
+ exp = Categorical([1, 2, 3, 1, 2, 3])
+ tm.assert_categorical_equal(res, exp)
+
+ res = union_categoricals([c1, c1], ignore_order=False)
+ exp = Categorical([1, 2, 3, 1, 2, 3],
+ categories=[1, 2, 3], ordered=True)
+ tm.assert_categorical_equal(res, exp)
+
+ c1 = Categorical([1, 2, 3, np.nan], ordered=True)
+ c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
+
+ res = union_categoricals([c1, c2], ignore_order=True)
+ exp = Categorical([1, 2, 3, np.nan, 3, 2])
+ tm.assert_categorical_equal(res, exp)
+
+ c1 = Categorical([1, 2, 3], ordered=True)
+ c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
+
+ res = union_categoricals([c1, c2], ignore_order=True)
+ exp = Categorical([1, 2, 3, 1, 2, 3])
+ tm.assert_categorical_equal(res, exp)
+
+ res = union_categoricals([c2, c1], ignore_order=True,
+ sort_categories=True)
+ exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
+ tm.assert_categorical_equal(res, exp)
+
+ c1 = Categorical([1, 2, 3], ordered=True)
+ c2 = Categorical([4, 5, 6], ordered=True)
+ result = union_categoricals([c1, c2], ignore_order=True)
+ expected = Categorical([1, 2, 3, 4, 5, 6])
+ tm.assert_categorical_equal(result, expected)
+
+ msg = "to union ordered Categoricals, all categories must be the same"
+ with pytest.raises(TypeError, match=msg):
+ union_categoricals([c1, c2], ignore_order=False)
+
+ with pytest.raises(TypeError, match=msg):
+ union_categoricals([c1, c2])
+
+ def test_union_categoricals_sort(self):
+ # GH 13846
+ c1 = Categorical(['x', 'y', 'z'])
+ c2 = Categorical(['a', 'b', 'c'])
+ result = union_categoricals([c1, c2], sort_categories=True)
+ expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
+ categories=['a', 'b', 'c', 'x', 'y', 'z'])
+ tm.assert_categorical_equal(result, expected)
+
+ # fastpath
+ c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
+ c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
+ result = union_categoricals([c1, c2], sort_categories=True)
+ expected = Categorical(['a', 'b', 'b', 'c'],
+ categories=['a', 'b', 'c'])
+ tm.assert_categorical_equal(result, expected)
+
+ c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b'])
+ c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b'])
+ result = union_categoricals([c1, c2], sort_categories=True)
+ expected = Categorical(['a', 'b', 'b', 'c'],
+ categories=['a', 'b', 'c'])
+ tm.assert_categorical_equal(result, expected)
+
+ # fastpath - skip resort
+ c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+ c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
+ result = union_categoricals([c1, c2], sort_categories=True)
+ expected = Categorical(['a', 'b', 'b', 'c'],
+ categories=['a', 'b', 'c'])
+ tm.assert_categorical_equal(result, expected)
+
+ c1 = Categorical(['x', np.nan])
+ c2 = Categorical([np.nan, 'b'])
+ result = union_categoricals([c1, c2], sort_categories=True)
+ expected = Categorical(['x', np.nan, np.nan, 'b'],
+ categories=['b', 'x'])
+ tm.assert_categorical_equal(result, expected)
+
+ c1 = Categorical([np.nan])
+ c2 = Categorical([np.nan])
+ result = union_categoricals([c1, c2], sort_categories=True)
+ expected = Categorical([np.nan, np.nan])
+ tm.assert_categorical_equal(result, expected)
+
+ c1 = Categorical([])
+ c2 = Categorical([])
+ result = union_categoricals([c1, c2], sort_categories=True)
+ expected = Categorical([])
+ tm.assert_categorical_equal(result, expected)
+
+ c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
+ c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
+ with pytest.raises(TypeError):
+ union_categoricals([c1, c2], sort_categories=True)
+
+ def test_union_categoricals_sort_false(self):
+ # GH 13846
+ c1 = Categorical(['x', 'y', 'z'])
+ c2 = Categorical(['a', 'b', 'c'])
+ result = union_categoricals([c1, c2], sort_categories=False)
+ expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
+ categories=['x', 'y', 'z', 'a', 'b', 'c'])
+ tm.assert_categorical_equal(result, expected)
+
+ # fastpath
+ c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
+ c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
+ result = union_categoricals([c1, c2], sort_categories=False)
+ expected = Categorical(['a', 'b', 'b', 'c'],
+ categories=['b', 'a', 'c'])
+ tm.assert_categorical_equal(result, expected)
+
+ # fastpath - skip resort
+ c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+ c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
+ result = union_categoricals([c1, c2], sort_categories=False)
+ expected = Categorical(['a', 'b', 'b', 'c'],
+ categories=['a', 'b', 'c'])
+ tm.assert_categorical_equal(result, expected)
+
+ c1 = Categorical(['x', np.nan])
+ c2 = Categorical([np.nan, 'b'])
+ result = union_categoricals([c1, c2], sort_categories=False)
+ expected = Categorical(['x', np.nan, np.nan, 'b'],
+ categories=['x', 'b'])
+ tm.assert_categorical_equal(result, expected)
+
+ c1 = Categorical([np.nan])
+ c2 = Categorical([np.nan])
+ result = union_categoricals([c1, c2], sort_categories=False)
+ expected = Categorical([np.nan, np.nan])
+ tm.assert_categorical_equal(result, expected)
+
+ c1 = Categorical([])
+ c2 = Categorical([])
+ result = union_categoricals([c1, c2], sort_categories=False)
+ expected = Categorical([])
+ tm.assert_categorical_equal(result, expected)
+
+ c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
+ c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
+ result = union_categoricals([c1, c2], sort_categories=False)
+ expected = Categorical(['b', 'a', 'a', 'c'],
+ categories=['b', 'a', 'c'], ordered=True)
+ tm.assert_categorical_equal(result, expected)
+
+ def test_union_categorical_unwrap(self):
+ # GH 14173
+ c1 = Categorical(['a', 'b'])
+ c2 = pd.Series(['b', 'c'], dtype='category')
+ result = union_categoricals([c1, c2])
+ expected = Categorical(['a', 'b', 'b', 'c'])
+ tm.assert_categorical_equal(result, expected)
+
+ c2 = CategoricalIndex(c2)
+ result = union_categoricals([c1, c2])
+ tm.assert_categorical_equal(result, expected)
+
+ c1 = Series(c1)
+ result = union_categoricals([c1, c2])
+ tm.assert_categorical_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ union_categoricals([c1, ['a', 'b', 'c']])
diff --git a/contrib/python/pandas/py2/pandas/tests/reshape/test_util.py b/contrib/python/pandas/py2/pandas/tests/reshape/test_util.py
new file mode 100644
index 00000000000..a8d9e7a7754
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/reshape/test_util.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pytest
+
+from pandas import Index, date_range
+from pandas.core.reshape.util import cartesian_product
+import pandas.util.testing as tm
+
+
+class TestCartesianProduct(object):
+
+ def test_simple(self):
+ x, y = list('ABC'), [1, 22]
+ result1, result2 = cartesian_product([x, y])
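+ # each element of the first factor is repeated len(y) times, while the
+ # second factor is tiled, giving row-major (C-order) pairs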
+ expected1 = np.array(['A', 'A', 'B', 'B', 'C', 'C'])
+ expected2 = np.array([1, 22, 1, 22, 1, 22])
+ tm.assert_numpy_array_equal(result1, expected1)
+ tm.assert_numpy_array_equal(result2, expected2)
+
+ def test_datetimeindex(self):
+ # regression test for GitHub issue #6439
+ # make sure that the ordering on datetimeindex is consistent
+ x = date_range('2000-01-01', periods=2)
+ result1, result2 = [Index(y).day for y in cartesian_product([x, x])]
+ expected1 = Index([1, 1, 2, 2])
+ expected2 = Index([1, 2, 1, 2])
+ tm.assert_index_equal(result1, expected1)
+ tm.assert_index_equal(result2, expected2)
+
+ def test_empty(self):
+ # product of empty factors
+ X = [[], [0, 1], []]
+ Y = [[], [], ['a', 'b', 'c']]
+ for x, y in zip(X, Y):
+ expected1 = np.array([], dtype=np.asarray(x).dtype)
+ expected2 = np.array([], dtype=np.asarray(y).dtype)
+ result1, result2 = cartesian_product([x, y])
+ tm.assert_numpy_array_equal(result1, expected1)
+ tm.assert_numpy_array_equal(result2, expected2)
+
+ # empty product (empty input):
+ result = cartesian_product([])
+ expected = []
+ assert result == expected
+
+ @pytest.mark.parametrize("X", [
+ 1, [1], [1, 2], [[1], 2],
+ 'a', ['a'], ['a', 'b'], [['a'], 'b']
+ ])
+ def test_invalid_input(self, X):
+ msg = "Input must be a list-like of list-likes"
+
+ with pytest.raises(TypeError, match=msg):
+ cartesian_product(X=X)
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/__init__.py b/contrib/python/pandas/py2/pandas/tests/scalar/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/interval/__init__.py b/contrib/python/pandas/py2/pandas/tests/scalar/interval/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/interval/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/interval/test_interval.py b/contrib/python/pandas/py2/pandas/tests/scalar/interval/test_interval.py
new file mode 100644
index 00000000000..432f44725e2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/interval/test_interval.py
@@ -0,0 +1,225 @@
+from __future__ import division
+
+import numpy as np
+import pytest
+
+from pandas import Interval, Timedelta, Timestamp
+import pandas.core.common as com
+
+
[email protected]
+def interval():
+ return Interval(0, 1)
+
+
+class TestInterval(object):
+
+ def test_properties(self, interval):
+ assert interval.closed == 'right'
+ assert interval.left == 0
+ assert interval.right == 1
+ assert interval.mid == 0.5
+
+ def test_repr(self, interval):
+ assert repr(interval) == "Interval(0, 1, closed='right')"
+ assert str(interval) == "(0, 1]"
+
+ interval_left = Interval(0, 1, closed='left')
+ assert repr(interval_left) == "Interval(0, 1, closed='left')"
+ assert str(interval_left) == "[0, 1)"
+
+ def test_contains(self, interval):
+ assert 0.5 in interval
+ assert 1 in interval
+ assert 0 not in interval
+
+ msg = "__contains__ not defined for two intervals"
+ with pytest.raises(TypeError, match=msg):
+ interval in interval
+
+ interval_both = Interval(0, 1, closed='both')
+ assert 0 in interval_both
+ assert 1 in interval_both
+
+ interval_neither = Interval(0, 1, closed='neither')
+ assert 0 not in interval_neither
+ assert 0.5 in interval_neither
+ assert 1 not in interval_neither
+
+ def test_equal(self):
+ assert Interval(0, 1) == Interval(0, 1, closed='right')
+ assert Interval(0, 1) != Interval(0, 1, closed='left')
+ assert Interval(0, 1) != 0
+
+ def test_comparison(self):
+ with pytest.raises(TypeError, match='unorderable types'):
+ Interval(0, 1) < 2
+
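+ # interval-vs-interval ordering behaves lexicographically on
+ # (left, right) in the cases below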
+ assert Interval(0, 1) < Interval(1, 2)
+ assert Interval(0, 1) < Interval(0, 2)
+ assert Interval(0, 1) < Interval(0.5, 1.5)
+ assert Interval(0, 1) <= Interval(0, 1)
+ assert Interval(0, 1) > Interval(-1, 2)
+ assert Interval(0, 1) >= Interval(0, 1)
+
+ def test_hash(self, interval):
+ # should not raise
+ hash(interval)
+
+ @pytest.mark.parametrize('left, right, expected', [
+ (0, 5, 5),
+ (-2, 5.5, 7.5),
+ (10, 10, 0),
+ (10, np.inf, np.inf),
+ (-np.inf, -5, np.inf),
+ (-np.inf, np.inf, np.inf),
+ (Timedelta('0 days'), Timedelta('5 days'), Timedelta('5 days')),
+ (Timedelta('10 days'), Timedelta('10 days'), Timedelta('0 days')),
+ (Timedelta('1H10M'), Timedelta('5H5M'), Timedelta('3H55M')),
+ (Timedelta('5S'), Timedelta('1H'), Timedelta('59M55S'))])
+ def test_length(self, left, right, expected):
+ # GH 18789
+ iv = Interval(left, right)
+ result = iv.length
+ assert result == expected
+
+ @pytest.mark.parametrize('left, right, expected', [
+ ('2017-01-01', '2017-01-06', '5 days'),
+ ('2017-01-01', '2017-01-01 12:00:00', '12 hours'),
+ ('2017-01-01 12:00', '2017-01-01 12:00:00', '0 days'),
+ ('2017-01-01 12:01', '2017-01-05 17:31:00', '4 days 5 hours 30 min')])
+ @pytest.mark.parametrize('tz', (None, 'UTC', 'CET', 'US/Eastern'))
+ def test_length_timestamp(self, tz, left, right, expected):
+ # GH 18789
+ iv = Interval(Timestamp(left, tz=tz), Timestamp(right, tz=tz))
+ result = iv.length
+ expected = Timedelta(expected)
+ assert result == expected
+
+ @pytest.mark.parametrize('left, right', [
+ ('a', 'z'),
+ (('a', 'b'), ('c', 'd')),
+ (list('AB'), list('ab')),
+ (Interval(0, 1), Interval(1, 2))])
+ def test_length_errors(self, left, right):
+ # GH 18789
+ iv = Interval(left, right)
+ msg = 'cannot compute length between .* and .*'
+ with pytest.raises(TypeError, match=msg):
+ iv.length
+
+ def test_math_add(self, closed):
+ interval = Interval(0, 1, closed=closed)
+ expected = Interval(1, 2, closed=closed)
+
+ result = interval + 1
+ assert result == expected
+
+ result = 1 + interval
+ assert result == expected
+
+ result = interval
+ result += 1
+ assert result == expected
+
+ msg = r"unsupported operand type\(s\) for \+"
+ with pytest.raises(TypeError, match=msg):
+ interval + interval
+
+ with pytest.raises(TypeError, match=msg):
+ interval + 'foo'
+
+ def test_math_sub(self, closed):
+ interval = Interval(0, 1, closed=closed)
+ expected = Interval(-1, 0, closed=closed)
+
+ result = interval - 1
+ assert result == expected
+
+ result = interval
+ result -= 1
+ assert result == expected
+
+ msg = r"unsupported operand type\(s\) for -"
+ with pytest.raises(TypeError, match=msg):
+ interval - interval
+
+ with pytest.raises(TypeError, match=msg):
+ interval - 'foo'
+
+ def test_math_mult(self, closed):
+ interval = Interval(0, 1, closed=closed)
+ expected = Interval(0, 2, closed=closed)
+
+ result = interval * 2
+ assert result == expected
+
+ result = 2 * interval
+ assert result == expected
+
+ result = interval
+ result *= 2
+ assert result == expected
+
+ msg = r"unsupported operand type\(s\) for \*"
+ with pytest.raises(TypeError, match=msg):
+ interval * interval
+
+ msg = r"can\'t multiply sequence by non-int"
+ with pytest.raises(TypeError, match=msg):
+ interval * 'foo'
+
+ def test_math_div(self, closed):
+ interval = Interval(0, 1, closed=closed)
+ expected = Interval(0, 0.5, closed=closed)
+
+ result = interval / 2.0
+ assert result == expected
+
+ result = interval
+ result /= 2.0
+ assert result == expected
+
+ msg = r"unsupported operand type\(s\) for /"
+ with pytest.raises(TypeError, match=msg):
+ interval / interval
+
+ with pytest.raises(TypeError, match=msg):
+ interval / 'foo'
+
+ def test_math_floordiv(self, closed):
+ interval = Interval(1, 2, closed=closed)
+ expected = Interval(0, 1, closed=closed)
+
+ result = interval // 2
+ assert result == expected
+
+ result = interval
+ result //= 2
+ assert result == expected
+
+ msg = r"unsupported operand type\(s\) for //"
+ with pytest.raises(TypeError, match=msg):
+ interval // interval
+
+ with pytest.raises(TypeError, match=msg):
+ interval // 'foo'
+
+ def test_constructor_errors(self):
+ msg = "invalid option for 'closed': foo"
+ with pytest.raises(ValueError, match=msg):
+ Interval(0, 1, closed='foo')
+
+ msg = 'left side of interval must be <= right side'
+ with pytest.raises(ValueError, match=msg):
+ Interval(1, 0)
+
+ @pytest.mark.parametrize('tz_left, tz_right', [
+ (None, 'UTC'), ('UTC', None), ('UTC', 'US/Eastern')])
+ def test_constructor_errors_tz(self, tz_left, tz_right):
+ # GH 18538
+ left = Timestamp('2017-01-01', tz=tz_left)
+ right = Timestamp('2017-01-02', tz=tz_right)
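+ # mixing tz-naive and tz-aware endpoints raises TypeError; two
+ # different timezones raise ValueError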
+ error = TypeError if com._any_none(tz_left, tz_right) else ValueError
+ with pytest.raises(error):
+ Interval(left, right)
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/interval/test_ops.py b/contrib/python/pandas/py2/pandas/tests/scalar/interval/test_ops.py
new file mode 100644
index 00000000000..869ff205c2f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/interval/test_ops.py
@@ -0,0 +1,60 @@
+"""Tests for Interval-Interval operations, such as overlaps, contains, etc."""
+import pytest
+
+from pandas import Interval, Timedelta, Timestamp
+
+
[email protected](params=[
+ (Timedelta('0 days'), Timedelta('1 day')),
+ (Timestamp('2018-01-01'), Timedelta('1 day')),
+ (0, 1)], ids=lambda x: type(x[0]).__name__)
+def start_shift(request):
+ """
+ Fixture for generating intervals of types from a start value and a shift
+ value that can be added to start to generate an endpoint
+ """
+ return request.param
+
+
+class TestOverlaps(object):
+
+ def test_overlaps_self(self, start_shift, closed):
+ start, shift = start_shift
+ interval = Interval(start, start + shift, closed)
+ assert interval.overlaps(interval)
+
+ def test_overlaps_nested(self, start_shift, closed, other_closed):
+ start, shift = start_shift
+ interval1 = Interval(start, start + 3 * shift, other_closed)
+ interval2 = Interval(start + shift, start + 2 * shift, closed)
+
+ # nested intervals should always overlap
+ assert interval1.overlaps(interval2)
+
+ def test_overlaps_disjoint(self, start_shift, closed, other_closed):
+ start, shift = start_shift
+ interval1 = Interval(start, start + shift, other_closed)
+ interval2 = Interval(start + 2 * shift, start + 3 * shift, closed)
+
+ # disjoint intervals should never overlap
+ assert not interval1.overlaps(interval2)
+
+ def test_overlaps_endpoint(self, start_shift, closed, other_closed):
+ start, shift = start_shift
+ interval1 = Interval(start, start + shift, other_closed)
+ interval2 = Interval(start + shift, start + 2 * shift, closed)
+
+ # overlap if shared endpoint is closed for both (overlap at a point)
+ result = interval1.overlaps(interval2)
+ expected = interval1.closed_right and interval2.closed_left
+ assert result == expected
+
+ @pytest.mark.parametrize('other', [
+ 10, True, 'foo', Timedelta('1 day'), Timestamp('2018-01-01')],
+ ids=lambda x: type(x).__name__)
+ def test_overlaps_invalid_type(self, other):
+ interval = Interval(0, 1)
+ msg = '`other` must be an Interval, got {other}'.format(
+ other=type(other).__name__)
+ with pytest.raises(TypeError, match=msg):
+ interval.overlaps(other)
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/period/__init__.py b/contrib/python/pandas/py2/pandas/tests/scalar/period/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/period/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/period/test_asfreq.py b/contrib/python/pandas/py2/pandas/tests/scalar/period/test_asfreq.py
new file mode 100644
index 00000000000..f46f2da6c07
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/period/test_asfreq.py
@@ -0,0 +1,747 @@
+import pytest
+
+from pandas._libs.tslibs.frequencies import (
+ INVALID_FREQ_ERR_MSG, _period_code_map)
+from pandas.errors import OutOfBoundsDatetime
+
+from pandas import Period, offsets
+
+
+class TestFreqConversion(object):
+ """Test frequency conversion of date objects"""
+ @pytest.mark.parametrize('freq', ['A', 'Q', 'M', 'W', 'B', 'D'])
+ def test_asfreq_near_zero(self, freq):
+ # GH#19643, GH#19650
+ per = Period('0001-01-01', freq=freq)
+ tup1 = (per.year, per.month, per.day)
+
+ prev = per - 1
+ assert prev.ordinal == per.ordinal - 1
+ tup2 = (prev.year, prev.month, prev.day)
+ assert tup2 < tup1
+
+ def test_asfreq_near_zero_weekly(self):
+ # GH#19834
+ per1 = Period('0001-01-01', 'D') + 6
+ per2 = Period('0001-01-01', 'D') - 6
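+ # days just before and after the origin must fall into distinct weekly
+ # periods, and each week's span must still contain its day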
+ week1 = per1.asfreq('W')
+ week2 = per2.asfreq('W')
+ assert week1 != week2
+ assert week1.asfreq('D', 'E') >= per1
+ assert week2.asfreq('D', 'S') <= per2
+
+ @pytest.mark.xfail(reason='GH#19643 period_helper asfreq functions fail '
+ 'to check for overflows')
+ def test_to_timestamp_out_of_bounds(self):
+ # GH#19643, currently gives Timestamp('1754-08-30 22:43:41.128654848')
+ per = Period('0001-01-01', freq='B')
+ with pytest.raises(OutOfBoundsDatetime):
+ per.to_timestamp()
+
+ def test_asfreq_corner(self):
+ val = Period(freq='A', year=2007)
+ result1 = val.asfreq('5t')
+ result2 = val.asfreq('t')
+ expected = Period('2007-12-31 23:59', freq='t')
+ assert result1.ordinal == expected.ordinal
+ assert result1.freqstr == '5T'
+ assert result2.ordinal == expected.ordinal
+ assert result2.freqstr == 'T'
+
+ def test_conv_annual(self):
+ # frequency conversion tests: from Annual Frequency
+
+ ival_A = Period(freq='A', year=2007)
+
+ ival_AJAN = Period(freq="A-JAN", year=2007)
+ ival_AJUN = Period(freq="A-JUN", year=2007)
+ ival_ANOV = Period(freq="A-NOV", year=2007)
+
+ ival_A_to_Q_start = Period(freq='Q', year=2007, quarter=1)
+ ival_A_to_Q_end = Period(freq='Q', year=2007, quarter=4)
+ ival_A_to_M_start = Period(freq='M', year=2007, month=1)
+ ival_A_to_M_end = Period(freq='M', year=2007, month=12)
+ ival_A_to_W_start = Period(freq='W', year=2007, month=1, day=1)
+ ival_A_to_W_end = Period(freq='W', year=2007, month=12, day=31)
+ ival_A_to_B_start = Period(freq='B', year=2007, month=1, day=1)
+ ival_A_to_B_end = Period(freq='B', year=2007, month=12, day=31)
+ ival_A_to_D_start = Period(freq='D', year=2007, month=1, day=1)
+ ival_A_to_D_end = Period(freq='D', year=2007, month=12, day=31)
+ ival_A_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0)
+ ival_A_to_H_end = Period(freq='H', year=2007, month=12, day=31,
+ hour=23)
+ ival_A_to_T_start = Period(freq='Min', year=2007, month=1, day=1,
+ hour=0, minute=0)
+ ival_A_to_T_end = Period(freq='Min', year=2007, month=12, day=31,
+ hour=23, minute=59)
+ ival_A_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0,
+ minute=0, second=0)
+ ival_A_to_S_end = Period(freq='S', year=2007, month=12, day=31,
+ hour=23, minute=59, second=59)
+
+ ival_AJAN_to_D_end = Period(freq='D', year=2007, month=1, day=31)
+ ival_AJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1)
+ ival_AJUN_to_D_end = Period(freq='D', year=2007, month=6, day=30)
+ ival_AJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1)
+ ival_ANOV_to_D_end = Period(freq='D', year=2007, month=11, day=30)
+ ival_ANOV_to_D_start = Period(freq='D', year=2006, month=12, day=1)
+
+ assert ival_A.asfreq('Q', 'S') == ival_A_to_Q_start
+ assert ival_A.asfreq('Q', 'e') == ival_A_to_Q_end
+ assert ival_A.asfreq('M', 's') == ival_A_to_M_start
+ assert ival_A.asfreq('M', 'E') == ival_A_to_M_end
+ assert ival_A.asfreq('W', 'S') == ival_A_to_W_start
+ assert ival_A.asfreq('W', 'E') == ival_A_to_W_end
+ assert ival_A.asfreq('B', 'S') == ival_A_to_B_start
+ assert ival_A.asfreq('B', 'E') == ival_A_to_B_end
+ assert ival_A.asfreq('D', 'S') == ival_A_to_D_start
+ assert ival_A.asfreq('D', 'E') == ival_A_to_D_end
+ assert ival_A.asfreq('H', 'S') == ival_A_to_H_start
+ assert ival_A.asfreq('H', 'E') == ival_A_to_H_end
+ assert ival_A.asfreq('min', 'S') == ival_A_to_T_start
+ assert ival_A.asfreq('min', 'E') == ival_A_to_T_end
+ assert ival_A.asfreq('T', 'S') == ival_A_to_T_start
+ assert ival_A.asfreq('T', 'E') == ival_A_to_T_end
+ assert ival_A.asfreq('S', 'S') == ival_A_to_S_start
+ assert ival_A.asfreq('S', 'E') == ival_A_to_S_end
+
+ assert ival_AJAN.asfreq('D', 'S') == ival_AJAN_to_D_start
+ assert ival_AJAN.asfreq('D', 'E') == ival_AJAN_to_D_end
+
+ assert ival_AJUN.asfreq('D', 'S') == ival_AJUN_to_D_start
+ assert ival_AJUN.asfreq('D', 'E') == ival_AJUN_to_D_end
+
+ assert ival_ANOV.asfreq('D', 'S') == ival_ANOV_to_D_start
+ assert ival_ANOV.asfreq('D', 'E') == ival_ANOV_to_D_end
+
+ assert ival_A.asfreq('A') == ival_A
+
+ def test_conv_quarterly(self):
+ # frequency conversion tests: from Quarterly Frequency
+
+ ival_Q = Period(freq='Q', year=2007, quarter=1)
+ ival_Q_end_of_year = Period(freq='Q', year=2007, quarter=4)
+
+ ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1)
+ ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1)
+
+ ival_Q_to_A = Period(freq='A', year=2007)
+ ival_Q_to_M_start = Period(freq='M', year=2007, month=1)
+ ival_Q_to_M_end = Period(freq='M', year=2007, month=3)
+ ival_Q_to_W_start = Period(freq='W', year=2007, month=1, day=1)
+ ival_Q_to_W_end = Period(freq='W', year=2007, month=3, day=31)
+ ival_Q_to_B_start = Period(freq='B', year=2007, month=1, day=1)
+ ival_Q_to_B_end = Period(freq='B', year=2007, month=3, day=30)
+ ival_Q_to_D_start = Period(freq='D', year=2007, month=1, day=1)
+ ival_Q_to_D_end = Period(freq='D', year=2007, month=3, day=31)
+ ival_Q_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0)
+ ival_Q_to_H_end = Period(freq='H', year=2007, month=3, day=31, hour=23)
+ ival_Q_to_T_start = Period(freq='Min', year=2007, month=1, day=1,
+ hour=0, minute=0)
+ ival_Q_to_T_end = Period(freq='Min', year=2007, month=3, day=31,
+ hour=23, minute=59)
+ ival_Q_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0,
+ minute=0, second=0)
+ ival_Q_to_S_end = Period(freq='S', year=2007, month=3, day=31, hour=23,
+ minute=59, second=59)
+
+ ival_QEJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1)
+ ival_QEJAN_to_D_end = Period(freq='D', year=2006, month=4, day=30)
+
+ ival_QEJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1)
+ ival_QEJUN_to_D_end = Period(freq='D', year=2006, month=9, day=30)
+
+ assert ival_Q.asfreq('A') == ival_Q_to_A
+ assert ival_Q_end_of_year.asfreq('A') == ival_Q_to_A
+
+ assert ival_Q.asfreq('M', 'S') == ival_Q_to_M_start
+ assert ival_Q.asfreq('M', 'E') == ival_Q_to_M_end
+ assert ival_Q.asfreq('W', 'S') == ival_Q_to_W_start
+ assert ival_Q.asfreq('W', 'E') == ival_Q_to_W_end
+ assert ival_Q.asfreq('B', 'S') == ival_Q_to_B_start
+ assert ival_Q.asfreq('B', 'E') == ival_Q_to_B_end
+ assert ival_Q.asfreq('D', 'S') == ival_Q_to_D_start
+ assert ival_Q.asfreq('D', 'E') == ival_Q_to_D_end
+ assert ival_Q.asfreq('H', 'S') == ival_Q_to_H_start
+ assert ival_Q.asfreq('H', 'E') == ival_Q_to_H_end
+ assert ival_Q.asfreq('Min', 'S') == ival_Q_to_T_start
+ assert ival_Q.asfreq('Min', 'E') == ival_Q_to_T_end
+ assert ival_Q.asfreq('S', 'S') == ival_Q_to_S_start
+ assert ival_Q.asfreq('S', 'E') == ival_Q_to_S_end
+
+ assert ival_QEJAN.asfreq('D', 'S') == ival_QEJAN_to_D_start
+ assert ival_QEJAN.asfreq('D', 'E') == ival_QEJAN_to_D_end
+ assert ival_QEJUN.asfreq('D', 'S') == ival_QEJUN_to_D_start
+ assert ival_QEJUN.asfreq('D', 'E') == ival_QEJUN_to_D_end
+
+ assert ival_Q.asfreq('Q') == ival_Q
+
+ def test_conv_monthly(self):
+ # frequency conversion tests: from Monthly Frequency
+
+ ival_M = Period(freq='M', year=2007, month=1)
+ ival_M_end_of_year = Period(freq='M', year=2007, month=12)
+ ival_M_end_of_quarter = Period(freq='M', year=2007, month=3)
+ ival_M_to_A = Period(freq='A', year=2007)
+ ival_M_to_Q = Period(freq='Q', year=2007, quarter=1)
+ ival_M_to_W_start = Period(freq='W', year=2007, month=1, day=1)
+ ival_M_to_W_end = Period(freq='W', year=2007, month=1, day=31)
+ ival_M_to_B_start = Period(freq='B', year=2007, month=1, day=1)
+ ival_M_to_B_end = Period(freq='B', year=2007, month=1, day=31)
+ ival_M_to_D_start = Period(freq='D', year=2007, month=1, day=1)
+ ival_M_to_D_end = Period(freq='D', year=2007, month=1, day=31)
+ ival_M_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0)
+ ival_M_to_H_end = Period(freq='H', year=2007, month=1, day=31, hour=23)
+ ival_M_to_T_start = Period(freq='Min', year=2007, month=1, day=1,
+ hour=0, minute=0)
+ ival_M_to_T_end = Period(freq='Min', year=2007, month=1, day=31,
+ hour=23, minute=59)
+ ival_M_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0,
+ minute=0, second=0)
+ ival_M_to_S_end = Period(freq='S', year=2007, month=1, day=31, hour=23,
+ minute=59, second=59)
+
+ assert ival_M.asfreq('A') == ival_M_to_A
+ assert ival_M_end_of_year.asfreq('A') == ival_M_to_A
+ assert ival_M.asfreq('Q') == ival_M_to_Q
+ assert ival_M_end_of_quarter.asfreq('Q') == ival_M_to_Q
+
+ assert ival_M.asfreq('W', 'S') == ival_M_to_W_start
+ assert ival_M.asfreq('W', 'E') == ival_M_to_W_end
+ assert ival_M.asfreq('B', 'S') == ival_M_to_B_start
+ assert ival_M.asfreq('B', 'E') == ival_M_to_B_end
+ assert ival_M.asfreq('D', 'S') == ival_M_to_D_start
+ assert ival_M.asfreq('D', 'E') == ival_M_to_D_end
+ assert ival_M.asfreq('H', 'S') == ival_M_to_H_start
+ assert ival_M.asfreq('H', 'E') == ival_M_to_H_end
+ assert ival_M.asfreq('Min', 'S') == ival_M_to_T_start
+ assert ival_M.asfreq('Min', 'E') == ival_M_to_T_end
+ assert ival_M.asfreq('S', 'S') == ival_M_to_S_start
+ assert ival_M.asfreq('S', 'E') == ival_M_to_S_end
+
+ assert ival_M.asfreq('M') == ival_M
+
+ def test_conv_weekly(self):
+ # frequency conversion tests: from Weekly Frequency
+ ival_W = Period(freq='W', year=2007, month=1, day=1)
+
+ ival_WSUN = Period(freq='W', year=2007, month=1, day=7)
+ ival_WSAT = Period(freq='W-SAT', year=2007, month=1, day=6)
+ ival_WFRI = Period(freq='W-FRI', year=2007, month=1, day=5)
+ ival_WTHU = Period(freq='W-THU', year=2007, month=1, day=4)
+ ival_WWED = Period(freq='W-WED', year=2007, month=1, day=3)
+ ival_WTUE = Period(freq='W-TUE', year=2007, month=1, day=2)
+ ival_WMON = Period(freq='W-MON', year=2007, month=1, day=1)
+
+ ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1)
+ ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7)
+ ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31)
+ ival_WSAT_to_D_end = Period(freq='D', year=2007, month=1, day=6)
+ ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30)
+ ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5)
+ ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29)
+ ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4)
+ ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28)
+ ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3)
+ ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27)
+ ival_WTUE_to_D_end = Period(freq='D', year=2007, month=1, day=2)
+ ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26)
+ ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1)
+
+ ival_W_end_of_year = Period(freq='W', year=2007, month=12, day=31)
+ ival_W_end_of_quarter = Period(freq='W', year=2007, month=3, day=31)
+ ival_W_end_of_month = Period(freq='W', year=2007, month=1, day=31)
+ ival_W_to_A = Period(freq='A', year=2007)
+ ival_W_to_Q = Period(freq='Q', year=2007, quarter=1)
+ ival_W_to_M = Period(freq='M', year=2007, month=1)
+
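+ # a weekly period is assigned to the yearly/quarterly/monthly period
+ # that contains the week's last day, hence the weekday checks below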
+ if Period(freq='D', year=2007, month=12, day=31).weekday == 6:
+ ival_W_to_A_end_of_year = Period(freq='A', year=2007)
+ else:
+ ival_W_to_A_end_of_year = Period(freq='A', year=2008)
+
+ if Period(freq='D', year=2007, month=3, day=31).weekday == 6:
+ ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=1)
+ else:
+ ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=2)
+
+ if Period(freq='D', year=2007, month=1, day=31).weekday == 6:
+ ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1)
+ else:
+ ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2)
+
+ ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1)
+ ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5)
+ ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1)
+ ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7)
+ ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0)
+ ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, hour=23)
+ ival_W_to_T_start = Period(freq='Min', year=2007, month=1, day=1,
+ hour=0, minute=0)
+ ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7,
+ hour=23, minute=59)
+ ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0,
+ minute=0, second=0)
+ ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23,
+ minute=59, second=59)
+
+ assert ival_W.asfreq('A') == ival_W_to_A
+ assert ival_W_end_of_year.asfreq('A') == ival_W_to_A_end_of_year
+
+ assert ival_W.asfreq('Q') == ival_W_to_Q
+ assert ival_W_end_of_quarter.asfreq('Q') == ival_W_to_Q_end_of_quarter
+
+ assert ival_W.asfreq('M') == ival_W_to_M
+ assert ival_W_end_of_month.asfreq('M') == ival_W_to_M_end_of_month
+
+ assert ival_W.asfreq('B', 'S') == ival_W_to_B_start
+ assert ival_W.asfreq('B', 'E') == ival_W_to_B_end
+
+ assert ival_W.asfreq('D', 'S') == ival_W_to_D_start
+ assert ival_W.asfreq('D', 'E') == ival_W_to_D_end
+
+ assert ival_WSUN.asfreq('D', 'S') == ival_WSUN_to_D_start
+ assert ival_WSUN.asfreq('D', 'E') == ival_WSUN_to_D_end
+ assert ival_WSAT.asfreq('D', 'S') == ival_WSAT_to_D_start
+ assert ival_WSAT.asfreq('D', 'E') == ival_WSAT_to_D_end
+ assert ival_WFRI.asfreq('D', 'S') == ival_WFRI_to_D_start
+ assert ival_WFRI.asfreq('D', 'E') == ival_WFRI_to_D_end
+ assert ival_WTHU.asfreq('D', 'S') == ival_WTHU_to_D_start
+ assert ival_WTHU.asfreq('D', 'E') == ival_WTHU_to_D_end
+ assert ival_WWED.asfreq('D', 'S') == ival_WWED_to_D_start
+ assert ival_WWED.asfreq('D', 'E') == ival_WWED_to_D_end
+ assert ival_WTUE.asfreq('D', 'S') == ival_WTUE_to_D_start
+ assert ival_WTUE.asfreq('D', 'E') == ival_WTUE_to_D_end
+ assert ival_WMON.asfreq('D', 'S') == ival_WMON_to_D_start
+ assert ival_WMON.asfreq('D', 'E') == ival_WMON_to_D_end
+
+ assert ival_W.asfreq('H', 'S') == ival_W_to_H_start
+ assert ival_W.asfreq('H', 'E') == ival_W_to_H_end
+ assert ival_W.asfreq('Min', 'S') == ival_W_to_T_start
+ assert ival_W.asfreq('Min', 'E') == ival_W_to_T_end
+ assert ival_W.asfreq('S', 'S') == ival_W_to_S_start
+ assert ival_W.asfreq('S', 'E') == ival_W_to_S_end
+
+ assert ival_W.asfreq('W') == ival_W
+
+ msg = INVALID_FREQ_ERR_MSG
+ with pytest.raises(ValueError, match=msg):
+ ival_W.asfreq('WK')
+
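+ def test_conv_weekly_anchor_sketch(self):
+ # A minimal sketch of the anchoring rule exercised above: how='S'
+ # resolves a span to its first sub-period and how='E' to its last,
+ # so a weekly span covers exactly seven days.
+ ival_W = Period(freq='W', year=2007, month=1, day=1)
+ assert ival_W.asfreq('D', 'S') + 6 == ival_W.asfreq('D', 'E')
+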
+ def test_conv_weekly_legacy(self):
+ # frequency conversion tests: from Weekly Frequency
+ msg = INVALID_FREQ_ERR_MSG
+ with pytest.raises(ValueError, match=msg):
+ Period(freq='WK', year=2007, month=1, day=1)
+
+ with pytest.raises(ValueError, match=msg):
+ Period(freq='WK-SAT', year=2007, month=1, day=6)
+ with pytest.raises(ValueError, match=msg):
+ Period(freq='WK-FRI', year=2007, month=1, day=5)
+ with pytest.raises(ValueError, match=msg):
+ Period(freq='WK-THU', year=2007, month=1, day=4)
+ with pytest.raises(ValueError, match=msg):
+ Period(freq='WK-WED', year=2007, month=1, day=3)
+ with pytest.raises(ValueError, match=msg):
+ Period(freq='WK-TUE', year=2007, month=1, day=2)
+ with pytest.raises(ValueError, match=msg):
+ Period(freq='WK-MON', year=2007, month=1, day=1)
+
+ def test_conv_business(self):
+ # frequency conversion tests: from Business Frequency
+
+ ival_B = Period(freq='B', year=2007, month=1, day=1)
+ ival_B_end_of_year = Period(freq='B', year=2007, month=12, day=31)
+ ival_B_end_of_quarter = Period(freq='B', year=2007, month=3, day=30)
+ ival_B_end_of_month = Period(freq='B', year=2007, month=1, day=31)
+ ival_B_end_of_week = Period(freq='B', year=2007, month=1, day=5)
+
+ ival_B_to_A = Period(freq='A', year=2007)
+ ival_B_to_Q = Period(freq='Q', year=2007, quarter=1)
+ ival_B_to_M = Period(freq='M', year=2007, month=1)
+ ival_B_to_W = Period(freq='W', year=2007, month=1, day=7)
+ ival_B_to_D = Period(freq='D', year=2007, month=1, day=1)
+ ival_B_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0)
+ ival_B_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23)
+ ival_B_to_T_start = Period(freq='Min', year=2007, month=1, day=1,
+ hour=0, minute=0)
+ ival_B_to_T_end = Period(freq='Min', year=2007, month=1, day=1,
+ hour=23, minute=59)
+ ival_B_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0,
+ minute=0, second=0)
+ ival_B_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23,
+ minute=59, second=59)
+
+ assert ival_B.asfreq('A') == ival_B_to_A
+ assert ival_B_end_of_year.asfreq('A') == ival_B_to_A
+ assert ival_B.asfreq('Q') == ival_B_to_Q
+ assert ival_B_end_of_quarter.asfreq('Q') == ival_B_to_Q
+ assert ival_B.asfreq('M') == ival_B_to_M
+ assert ival_B_end_of_month.asfreq('M') == ival_B_to_M
+ assert ival_B.asfreq('W') == ival_B_to_W
+ assert ival_B_end_of_week.asfreq('W') == ival_B_to_W
+
+ assert ival_B.asfreq('D') == ival_B_to_D
+
+ assert ival_B.asfreq('H', 'S') == ival_B_to_H_start
+ assert ival_B.asfreq('H', 'E') == ival_B_to_H_end
+ assert ival_B.asfreq('Min', 'S') == ival_B_to_T_start
+ assert ival_B.asfreq('Min', 'E') == ival_B_to_T_end
+ assert ival_B.asfreq('S', 'S') == ival_B_to_S_start
+ assert ival_B.asfreq('S', 'E') == ival_B_to_S_end
+
+ assert ival_B.asfreq('B') == ival_B
+
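+ def test_conv_business_single_day_sketch(self):
+ # A minimal sketch: a business-day period spans one calendar day, so
+ # converting B -> D gives the same day for either anchor.
+ ival_B = Period(freq='B', year=2007, month=1, day=1)
+ assert ival_B.asfreq('D', 'S') == ival_B.asfreq('D', 'E')
+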
+ def test_conv_daily(self):
+ # frequency conversion tests: from Daily Frequency
+
+ ival_D = Period(freq='D', year=2007, month=1, day=1)
+ ival_D_end_of_year = Period(freq='D', year=2007, month=12, day=31)
+ ival_D_end_of_quarter = Period(freq='D', year=2007, month=3, day=31)
+ ival_D_end_of_month = Period(freq='D', year=2007, month=1, day=31)
+ ival_D_end_of_week = Period(freq='D', year=2007, month=1, day=7)
+
+ ival_D_friday = Period(freq='D', year=2007, month=1, day=5)
+ ival_D_saturday = Period(freq='D', year=2007, month=1, day=6)
+ ival_D_sunday = Period(freq='D', year=2007, month=1, day=7)
+
+ # TODO: unused?
+ # ival_D_monday = Period(freq='D', year=2007, month=1, day=8)
+
+ ival_B_friday = Period(freq='B', year=2007, month=1, day=5)
+ ival_B_monday = Period(freq='B', year=2007, month=1, day=8)
+
+ ival_D_to_A = Period(freq='A', year=2007)
+
+ ival_Deoq_to_AJAN = Period(freq='A-JAN', year=2008)
+ ival_Deoq_to_AJUN = Period(freq='A-JUN', year=2007)
+ ival_Deoq_to_ADEC = Period(freq='A-DEC', year=2007)
+
+ ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4)
+ ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3)
+ ival_D_to_QEDEC = Period(freq="Q-DEC", year=2007, quarter=1)
+
+ ival_D_to_M = Period(freq='M', year=2007, month=1)
+ ival_D_to_W = Period(freq='W', year=2007, month=1, day=7)
+
+ ival_D_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0)
+ ival_D_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23)
+ ival_D_to_T_start = Period(freq='Min', year=2007, month=1, day=1,
+ hour=0, minute=0)
+ ival_D_to_T_end = Period(freq='Min', year=2007, month=1, day=1,
+ hour=23, minute=59)
+ ival_D_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0,
+ minute=0, second=0)
+ ival_D_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23,
+ minute=59, second=59)
+
+ assert ival_D.asfreq('A') == ival_D_to_A
+
+ assert ival_D_end_of_quarter.asfreq('A-JAN') == ival_Deoq_to_AJAN
+ assert ival_D_end_of_quarter.asfreq('A-JUN') == ival_Deoq_to_AJUN
+ assert ival_D_end_of_quarter.asfreq('A-DEC') == ival_Deoq_to_ADEC
+
+ assert ival_D_end_of_year.asfreq('A') == ival_D_to_A
+ assert ival_D_end_of_quarter.asfreq('Q') == ival_D_to_QEDEC
+ assert ival_D.asfreq("Q-JAN") == ival_D_to_QEJAN
+ assert ival_D.asfreq("Q-JUN") == ival_D_to_QEJUN
+ assert ival_D.asfreq("Q-DEC") == ival_D_to_QEDEC
+ assert ival_D.asfreq('M') == ival_D_to_M
+ assert ival_D_end_of_month.asfreq('M') == ival_D_to_M
+ assert ival_D.asfreq('W') == ival_D_to_W
+ assert ival_D_end_of_week.asfreq('W') == ival_D_to_W
+
+ assert ival_D_friday.asfreq('B') == ival_B_friday
+ assert ival_D_saturday.asfreq('B', 'S') == ival_B_friday
+ assert ival_D_saturday.asfreq('B', 'E') == ival_B_monday
+ assert ival_D_sunday.asfreq('B', 'S') == ival_B_friday
+ assert ival_D_sunday.asfreq('B', 'E') == ival_B_monday
+
+ assert ival_D.asfreq('H', 'S') == ival_D_to_H_start
+ assert ival_D.asfreq('H', 'E') == ival_D_to_H_end
+ assert ival_D.asfreq('Min', 'S') == ival_D_to_T_start
+ assert ival_D.asfreq('Min', 'E') == ival_D_to_T_end
+ assert ival_D.asfreq('S', 'S') == ival_D_to_S_start
+ assert ival_D.asfreq('S', 'E') == ival_D_to_S_end
+
+ assert ival_D.asfreq('D') == ival_D
+
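+ def test_conv_daily_weekend_roll_sketch(self):
+ # A minimal sketch of the weekend rule checked above: Saturday has no
+ # business-day counterpart, so how='S' rolls back to Friday and
+ # how='E' rolls forward to Monday.
+ sat = Period(freq='D', year=2007, month=1, day=6)
+ assert sat.asfreq('B', 'S') == Period(freq='B', year=2007, month=1,
+ day=5)
+ assert sat.asfreq('B', 'E') == Period(freq='B', year=2007, month=1,
+ day=8)
+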
+ def test_conv_hourly(self):
+ # frequency conversion tests: from Hourly Frequency
+
+ ival_H = Period(freq='H', year=2007, month=1, day=1, hour=0)
+ ival_H_end_of_year = Period(freq='H', year=2007, month=12, day=31,
+ hour=23)
+ ival_H_end_of_quarter = Period(freq='H', year=2007, month=3, day=31,
+ hour=23)
+ ival_H_end_of_month = Period(freq='H', year=2007, month=1, day=31,
+ hour=23)
+ ival_H_end_of_week = Period(freq='H', year=2007, month=1, day=7,
+ hour=23)
+ ival_H_end_of_day = Period(freq='H', year=2007, month=1, day=1,
+ hour=23)
+ ival_H_end_of_bus = Period(freq='H', year=2007, month=1, day=1,
+ hour=23)
+
+ ival_H_to_A = Period(freq='A', year=2007)
+ ival_H_to_Q = Period(freq='Q', year=2007, quarter=1)
+ ival_H_to_M = Period(freq='M', year=2007, month=1)
+ ival_H_to_W = Period(freq='W', year=2007, month=1, day=7)
+ ival_H_to_D = Period(freq='D', year=2007, month=1, day=1)
+ ival_H_to_B = Period(freq='B', year=2007, month=1, day=1)
+
+ ival_H_to_T_start = Period(freq='Min', year=2007, month=1, day=1,
+ hour=0, minute=0)
+ ival_H_to_T_end = Period(freq='Min', year=2007, month=1, day=1, hour=0,
+ minute=59)
+ ival_H_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0,
+ minute=0, second=0)
+ ival_H_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0,
+ minute=59, second=59)
+
+ assert ival_H.asfreq('A') == ival_H_to_A
+ assert ival_H_end_of_year.asfreq('A') == ival_H_to_A
+ assert ival_H.asfreq('Q') == ival_H_to_Q
+ assert ival_H_end_of_quarter.asfreq('Q') == ival_H_to_Q
+ assert ival_H.asfreq('M') == ival_H_to_M
+ assert ival_H_end_of_month.asfreq('M') == ival_H_to_M
+ assert ival_H.asfreq('W') == ival_H_to_W
+ assert ival_H_end_of_week.asfreq('W') == ival_H_to_W
+ assert ival_H.asfreq('D') == ival_H_to_D
+ assert ival_H_end_of_day.asfreq('D') == ival_H_to_D
+ assert ival_H.asfreq('B') == ival_H_to_B
+ assert ival_H_end_of_bus.asfreq('B') == ival_H_to_B
+
+ assert ival_H.asfreq('Min', 'S') == ival_H_to_T_start
+ assert ival_H.asfreq('Min', 'E') == ival_H_to_T_end
+ assert ival_H.asfreq('S', 'S') == ival_H_to_S_start
+ assert ival_H.asfreq('S', 'E') == ival_H_to_S_end
+
+ assert ival_H.asfreq('H') == ival_H
+
+ def test_conv_minutely(self):
+ # frequency conversion tests: from Minutely Frequency
+
+ ival_T = Period(freq='Min', year=2007, month=1, day=1, hour=0,
+ minute=0)
+ ival_T_end_of_year = Period(freq='Min', year=2007, month=12, day=31,
+ hour=23, minute=59)
+ ival_T_end_of_quarter = Period(freq='Min', year=2007, month=3, day=31,
+ hour=23, minute=59)
+ ival_T_end_of_month = Period(freq='Min', year=2007, month=1, day=31,
+ hour=23, minute=59)
+ ival_T_end_of_week = Period(freq='Min', year=2007, month=1, day=7,
+ hour=23, minute=59)
+ ival_T_end_of_day = Period(freq='Min', year=2007, month=1, day=1,
+ hour=23, minute=59)
+ ival_T_end_of_bus = Period(freq='Min', year=2007, month=1, day=1,
+ hour=23, minute=59)
+ ival_T_end_of_hour = Period(freq='Min', year=2007, month=1, day=1,
+ hour=0, minute=59)
+
+ ival_T_to_A = Period(freq='A', year=2007)
+ ival_T_to_Q = Period(freq='Q', year=2007, quarter=1)
+ ival_T_to_M = Period(freq='M', year=2007, month=1)
+ ival_T_to_W = Period(freq='W', year=2007, month=1, day=7)
+ ival_T_to_D = Period(freq='D', year=2007, month=1, day=1)
+ ival_T_to_B = Period(freq='B', year=2007, month=1, day=1)
+ ival_T_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0)
+
+ ival_T_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0,
+ minute=0, second=0)
+ ival_T_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0,
+ minute=0, second=59)
+
+ assert ival_T.asfreq('A') == ival_T_to_A
+ assert ival_T_end_of_year.asfreq('A') == ival_T_to_A
+ assert ival_T.asfreq('Q') == ival_T_to_Q
+ assert ival_T_end_of_quarter.asfreq('Q') == ival_T_to_Q
+ assert ival_T.asfreq('M') == ival_T_to_M
+ assert ival_T_end_of_month.asfreq('M') == ival_T_to_M
+ assert ival_T.asfreq('W') == ival_T_to_W
+ assert ival_T_end_of_week.asfreq('W') == ival_T_to_W
+ assert ival_T.asfreq('D') == ival_T_to_D
+ assert ival_T_end_of_day.asfreq('D') == ival_T_to_D
+ assert ival_T.asfreq('B') == ival_T_to_B
+ assert ival_T_end_of_bus.asfreq('B') == ival_T_to_B
+ assert ival_T.asfreq('H') == ival_T_to_H
+ assert ival_T_end_of_hour.asfreq('H') == ival_T_to_H
+
+ assert ival_T.asfreq('S', 'S') == ival_T_to_S_start
+ assert ival_T.asfreq('S', 'E') == ival_T_to_S_end
+
+ assert ival_T.asfreq('Min') == ival_T
+
+ def test_conv_secondly(self):
+ # frequency conversion tests: from Secondly Frequency
+
+ ival_S = Period(freq='S', year=2007, month=1, day=1, hour=0, minute=0,
+ second=0)
+ ival_S_end_of_year = Period(freq='S', year=2007, month=12, day=31,
+ hour=23, minute=59, second=59)
+ ival_S_end_of_quarter = Period(freq='S', year=2007, month=3, day=31,
+ hour=23, minute=59, second=59)
+ ival_S_end_of_month = Period(freq='S', year=2007, month=1, day=31,
+ hour=23, minute=59, second=59)
+ ival_S_end_of_week = Period(freq='S', year=2007, month=1, day=7,
+ hour=23, minute=59, second=59)
+ ival_S_end_of_day = Period(freq='S', year=2007, month=1, day=1,
+ hour=23, minute=59, second=59)
+ ival_S_end_of_bus = Period(freq='S', year=2007, month=1, day=1,
+ hour=23, minute=59, second=59)
+ ival_S_end_of_hour = Period(freq='S', year=2007, month=1, day=1,
+ hour=0, minute=59, second=59)
+ ival_S_end_of_minute = Period(freq='S', year=2007, month=1, day=1,
+ hour=0, minute=0, second=59)
+
+ ival_S_to_A = Period(freq='A', year=2007)
+ ival_S_to_Q = Period(freq='Q', year=2007, quarter=1)
+ ival_S_to_M = Period(freq='M', year=2007, month=1)
+ ival_S_to_W = Period(freq='W', year=2007, month=1, day=7)
+ ival_S_to_D = Period(freq='D', year=2007, month=1, day=1)
+ ival_S_to_B = Period(freq='B', year=2007, month=1, day=1)
+ ival_S_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0)
+ ival_S_to_T = Period(freq='Min', year=2007, month=1, day=1, hour=0,
+ minute=0)
+
+ assert ival_S.asfreq('A') == ival_S_to_A
+ assert ival_S_end_of_year.asfreq('A') == ival_S_to_A
+ assert ival_S.asfreq('Q') == ival_S_to_Q
+ assert ival_S_end_of_quarter.asfreq('Q') == ival_S_to_Q
+ assert ival_S.asfreq('M') == ival_S_to_M
+ assert ival_S_end_of_month.asfreq('M') == ival_S_to_M
+ assert ival_S.asfreq('W') == ival_S_to_W
+ assert ival_S_end_of_week.asfreq('W') == ival_S_to_W
+ assert ival_S.asfreq('D') == ival_S_to_D
+ assert ival_S_end_of_day.asfreq('D') == ival_S_to_D
+ assert ival_S.asfreq('B') == ival_S_to_B
+ assert ival_S_end_of_bus.asfreq('B') == ival_S_to_B
+ assert ival_S.asfreq('H') == ival_S_to_H
+ assert ival_S_end_of_hour.asfreq('H') == ival_S_to_H
+ assert ival_S.asfreq('Min') == ival_S_to_T
+ assert ival_S_end_of_minute.asfreq('Min') == ival_S_to_T
+
+ assert ival_S.asfreq('S') == ival_S
+
+ def test_asfreq_mult(self):
+ # normal freq to mult freq
+ p = Period(freq='A', year=2007)
+ # ordinal will not change
+ for freq in ['3A', offsets.YearEnd(3)]:
+ result = p.asfreq(freq)
+ expected = Period('2007', freq='3A')
+
+ assert result == expected
+ assert result.ordinal == expected.ordinal
+ assert result.freq == expected.freq
+ # ordinal will not change
+ for freq in ['3A', offsets.YearEnd(3)]:
+ result = p.asfreq(freq, how='S')
+ expected = Period('2007', freq='3A')
+
+ assert result == expected
+ assert result.ordinal == expected.ordinal
+ assert result.freq == expected.freq
+
+ # mult freq to normal freq
+ p = Period(freq='3A', year=2007)
+ # ordinal will change because how=E is the default
+ for freq in ['A', offsets.YearEnd()]:
+ result = p.asfreq(freq)
+ expected = Period('2009', freq='A')
+
+ assert result == expected
+ assert result.ordinal == expected.ordinal
+ assert result.freq == expected.freq
+ # ordinal will not change
+ for freq in ['A', offsets.YearEnd()]:
+ result = p.asfreq(freq, how='S')
+ expected = Period('2007', freq='A')
+
+ assert result == expected
+ assert result.ordinal == expected.ordinal
+ assert result.freq == expected.freq
+
+ p = Period(freq='A', year=2007)
+ for freq in ['2M', offsets.MonthEnd(2)]:
+ result = p.asfreq(freq)
+ expected = Period('2007-12', freq='2M')
+
+ assert result == expected
+ assert result.ordinal == expected.ordinal
+ assert result.freq == expected.freq
+ for freq in ['2M', offsets.MonthEnd(2)]:
+ result = p.asfreq(freq, how='S')
+ expected = Period('2007-01', freq='2M')
+
+ assert result == expected
+ assert result.ordinal == expected.ordinal
+ assert result.freq == expected.freq
+
+ p = Period(freq='3A', year=2007)
+ for freq in ['2M', offsets.MonthEnd(2)]:
+ result = p.asfreq(freq)
+ expected = Period('2009-12', freq='2M')
+
+ assert result == expected
+ assert result.ordinal == expected.ordinal
+ assert result.freq == expected.freq
+ for freq in ['2M', offsets.MonthEnd(2)]:
+ result = p.asfreq(freq, how='S')
+ expected = Period('2007-01', freq='2M')
+
+ assert result == expected
+ assert result.ordinal == expected.ordinal
+ assert result.freq == expected.freq
+
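+ def test_asfreq_mult_anchor_sketch(self):
+ # A minimal sketch of the ordinal rule above: a '3A' span anchored at
+ # 2007 covers 2007-2009, so the default how='E' lands on 2009 while
+ # how='S' stays on 2007.
+ p = Period(freq='3A', year=2007)
+ assert p.asfreq('A') == Period('2009', freq='A')
+ assert p.asfreq('A', how='S') == Period('2007', freq='A')
+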
+ def test_asfreq_combined(self):
+ # normal freq to combined freq
+ p = Period('2007', freq='H')
+
+ # ordinal will not change
+ expected = Period('2007', freq='25H')
+ for freq, how in zip(['1D1H', '1H1D'], ['E', 'S']):
+ result = p.asfreq(freq, how=how)
+ assert result == expected
+ assert result.ordinal == expected.ordinal
+ assert result.freq == expected.freq
+
+ # combined freq to normal freq
+ p1 = Period(freq='1D1H', year=2007)
+ p2 = Period(freq='1H1D', year=2007)
+
+ # ordinal will change because how=E is the default
+ result1 = p1.asfreq('H')
+ result2 = p2.asfreq('H')
+ expected = Period('2007-01-02', freq='H')
+ assert result1 == expected
+ assert result1.ordinal == expected.ordinal
+ assert result1.freq == expected.freq
+ assert result2 == expected
+ assert result2.ordinal == expected.ordinal
+ assert result2.freq == expected.freq
+
+ # ordinal will not change
+ result1 = p1.asfreq('H', how='S')
+ result2 = p2.asfreq('H', how='S')
+ expected = Period('2007-01-01', freq='H')
+ assert result1 == expected
+ assert result1.ordinal == expected.ordinal
+ assert result1.freq == expected.freq
+ assert result2 == expected
+ assert result2.ordinal == expected.ordinal
+ assert result2.freq == expected.freq
+
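+ def test_asfreq_combined_span_sketch(self):
+ # A minimal sketch: combined day/intraday frequencies reduce to a
+ # single tick, so '1D1H' and '1H1D' both denote a 25-hour span.
+ assert Period('2007', freq='1D1H').freqstr == '25H'
+ assert Period('2007', freq='1H1D').freqstr == '25H'
+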
+ def test_asfreq_MS(self):
+ initial = Period("2013")
+
+ assert initial.asfreq(freq="M", how="S") == Period('2013-01', 'M')
+
+ msg = INVALID_FREQ_ERR_MSG
+ with pytest.raises(ValueError, match=msg):
+ initial.asfreq(freq="MS", how="S")
+
+ with pytest.raises(ValueError, match=msg):
+ Period('2013-01', 'MS')
+
+ assert _period_code_map.get("MS") is None
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/period/test_period.py b/contrib/python/pandas/py2/pandas/tests/scalar/period/test_period.py
new file mode 100644
index 00000000000..d0f87618ad3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/period/test_period.py
@@ -0,0 +1,1495 @@
+from datetime import date, datetime, timedelta
+
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs.tslibs import iNaT, period as libperiod
+from pandas._libs.tslibs.ccalendar import DAYS, MONTHS
+from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG
+from pandas._libs.tslibs.parsing import DateParseError
+from pandas._libs.tslibs.timezones import dateutil_gettz, maybe_get_tz
+from pandas.compat import iteritems, text_type
+from pandas.compat.numpy import np_datetime64_compat
+
+import pandas as pd
+from pandas import NaT, Period, Timedelta, Timestamp, offsets
+import pandas.core.indexes.period as period
+import pandas.util.testing as tm
+
+
+class TestPeriodConstruction(object):
+ def test_construction(self):
+ i1 = Period('1/1/2005', freq='M')
+ i2 = Period('Jan 2005')
+
+ assert i1 == i2
+
+ i1 = Period('2005', freq='A')
+ i2 = Period('2005')
+ i3 = Period('2005', freq='a')
+
+ assert i1 == i2
+ assert i1 == i3
+
+ i4 = Period('2005', freq='M')
+ i5 = Period('2005', freq='m')
+
+ pytest.raises(ValueError, i1.__ne__, i4)
+ assert i4 == i5
+
+ i1 = Period.now('Q')
+ i2 = Period(datetime.now(), freq='Q')
+ i3 = Period.now('q')
+
+ assert i1 == i2
+ assert i1 == i3
+
+ i1 = Period('1982', freq='min')
+ i2 = Period('1982', freq='MIN')
+ assert i1 == i2
+ i2 = Period('1982', freq=('Min', 1))
+ assert i1 == i2
+
+ i1 = Period(year=2005, month=3, day=1, freq='D')
+ i2 = Period('3/1/2005', freq='D')
+ assert i1 == i2
+
+ i3 = Period(year=2005, month=3, day=1, freq='d')
+ assert i1 == i3
+
+ i1 = Period('2007-01-01 09:00:00.001')
+ expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L')
+ assert i1 == expected
+
+ expected = Period(np_datetime64_compat(
+ '2007-01-01 09:00:00.001Z'), freq='L')
+ assert i1 == expected
+
+ i1 = Period('2007-01-01 09:00:00.00101')
+ expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U')
+ assert i1 == expected
+
+ expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'),
+ freq='U')
+ assert i1 == expected
+
+ pytest.raises(ValueError, Period, ordinal=200701)
+
+ pytest.raises(ValueError, Period, '2007-1-1', freq='X')
+
+ def test_construction_bday(self):
+
+ # Biz day construction, roll forward if non-weekday
+ i1 = Period('3/10/12', freq='B')
+ i2 = Period('3/10/12', freq='D')
+ assert i1 == i2.asfreq('B')
+ i2 = Period('3/11/12', freq='D')
+ assert i1 == i2.asfreq('B')
+ i2 = Period('3/12/12', freq='D')
+ assert i1 == i2.asfreq('B')
+
+ i3 = Period('3/10/12', freq='b')
+ assert i1 == i3
+
+ i1 = Period(year=2012, month=3, day=10, freq='B')
+ i2 = Period('3/12/12', freq='B')
+ assert i1 == i2
+
+ def test_construction_quarter(self):
+
+ i1 = Period(year=2005, quarter=1, freq='Q')
+ i2 = Period('1/1/2005', freq='Q')
+ assert i1 == i2
+
+ i1 = Period(year=2005, quarter=3, freq='Q')
+ i2 = Period('9/1/2005', freq='Q')
+ assert i1 == i2
+
+ i1 = Period('2005Q1')
+ i2 = Period(year=2005, quarter=1, freq='Q')
+ i3 = Period('2005q1')
+ assert i1 == i2
+ assert i1 == i3
+
+ i1 = Period('05Q1')
+ assert i1 == i2
+ lower = Period('05q1')
+ assert i1 == lower
+
+ i1 = Period('1Q2005')
+ assert i1 == i2
+ lower = Period('1q2005')
+ assert i1 == lower
+
+ i1 = Period('1Q05')
+ assert i1 == i2
+ lower = Period('1q05')
+ assert i1 == lower
+
+ i1 = Period('4Q1984')
+ assert i1.year == 1984
+ lower = Period('4q1984')
+ assert i1 == lower
+
+ def test_construction_month(self):
+
+ expected = Period('2007-01', freq='M')
+ i1 = Period('200701', freq='M')
+ assert i1 == expected
+
+ i1 = Period('200701', freq='M')
+ assert i1 == expected
+
+ i1 = Period(200701, freq='M')
+ assert i1 == expected
+
+ i1 = Period(ordinal=200701, freq='M')
+ assert i1.year == 18695
+
+ i1 = Period(datetime(2007, 1, 1), freq='M')
+ i2 = Period('200701', freq='M')
+ assert i1 == i2
+
+ i1 = Period(date(2007, 1, 1), freq='M')
+ i2 = Period(datetime(2007, 1, 1), freq='M')
+ i3 = Period(np.datetime64('2007-01-01'), freq='M')
+ i4 = Period(np_datetime64_compat('2007-01-01 00:00:00Z'), freq='M')
+ i5 = Period(np_datetime64_compat('2007-01-01 00:00:00.000Z'), freq='M')
+ assert i1 == i2
+ assert i1 == i3
+ assert i1 == i4
+ assert i1 == i5
+
+ def test_period_constructor_offsets(self):
+ assert (Period('1/1/2005', freq=offsets.MonthEnd()) ==
+ Period('1/1/2005', freq='M'))
+ assert (Period('2005', freq=offsets.YearEnd()) ==
+ Period('2005', freq='A'))
+ assert (Period('2005', freq=offsets.MonthEnd()) ==
+ Period('2005', freq='M'))
+ assert (Period('3/10/12', freq=offsets.BusinessDay()) ==
+ Period('3/10/12', freq='B'))
+ assert (Period('3/10/12', freq=offsets.Day()) ==
+ Period('3/10/12', freq='D'))
+
+ assert (Period(year=2005, quarter=1,
+ freq=offsets.QuarterEnd(startingMonth=12)) ==
+ Period(year=2005, quarter=1, freq='Q'))
+ assert (Period(year=2005, quarter=2,
+ freq=offsets.QuarterEnd(startingMonth=12)) ==
+ Period(year=2005, quarter=2, freq='Q'))
+
+ assert (Period(year=2005, month=3, day=1, freq=offsets.Day()) ==
+ Period(year=2005, month=3, day=1, freq='D'))
+ assert (Period(year=2012, month=3, day=10, freq=offsets.BDay()) ==
+ Period(year=2012, month=3, day=10, freq='B'))
+
+ expected = Period('2005-03-01', freq='3D')
+ assert (Period(year=2005, month=3, day=1,
+ freq=offsets.Day(3)) == expected)
+ assert Period(year=2005, month=3, day=1, freq='3D') == expected
+
+ assert (Period(year=2012, month=3, day=10,
+ freq=offsets.BDay(3)) ==
+ Period(year=2012, month=3, day=10, freq='3B'))
+
+ assert (Period(200701, freq=offsets.MonthEnd()) ==
+ Period(200701, freq='M'))
+
+ i1 = Period(ordinal=200701, freq=offsets.MonthEnd())
+ i2 = Period(ordinal=200701, freq='M')
+ assert i1 == i2
+ assert i1.year == 18695
+ assert i2.year == 18695
+
+ i1 = Period(datetime(2007, 1, 1), freq='M')
+ i2 = Period('200701', freq='M')
+ assert i1 == i2
+
+ i1 = Period(date(2007, 1, 1), freq='M')
+ i2 = Period(datetime(2007, 1, 1), freq='M')
+ i3 = Period(np.datetime64('2007-01-01'), freq='M')
+ i4 = Period(np_datetime64_compat('2007-01-01 00:00:00Z'), freq='M')
+ i5 = Period(np_datetime64_compat('2007-01-01 00:00:00.000Z'), freq='M')
+ assert i1 == i2
+ assert i1 == i3
+ assert i1 == i4
+ assert i1 == i5
+
+ i1 = Period('2007-01-01 09:00:00.001')
+ expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L')
+ assert i1 == expected
+
+ expected = Period(np_datetime64_compat(
+ '2007-01-01 09:00:00.001Z'), freq='L')
+ assert i1 == expected
+
+ i1 = Period('2007-01-01 09:00:00.00101')
+ expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U')
+ assert i1 == expected
+
+ expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'),
+ freq='U')
+ assert i1 == expected
+
+ pytest.raises(ValueError, Period, ordinal=200701)
+
+ pytest.raises(ValueError, Period, '2007-1-1', freq='X')
+
+ def test_invalid_arguments(self):
+ with pytest.raises(ValueError):
+ Period(datetime.now())
+ with pytest.raises(ValueError):
+ Period(datetime.now().date())
+
+ with pytest.raises(ValueError):
+ Period(1.6, freq='D')
+ with pytest.raises(ValueError):
+ Period(ordinal=1.6, freq='D')
+ with pytest.raises(ValueError):
+ Period(ordinal=2, value=1, freq='D')
+
+ with pytest.raises(ValueError):
+ Period(month=1)
+
+ with pytest.raises(ValueError):
+ Period('-2000', 'A')
+ with pytest.raises(DateParseError):
+ Period('0', 'A')
+ with pytest.raises(DateParseError):
+ Period('1/1/-2000', 'A')
+
+ def test_constructor_corner(self):
+ expected = Period('2007-01', freq='2M')
+ assert Period(year=2007, month=1, freq='2M') == expected
+
+ assert Period(None) is NaT
+
+ p = Period('2007-01-01', freq='D')
+
+ result = Period(p, freq='A')
+ exp = Period('2007', freq='A')
+ assert result == exp
+
+ def test_constructor_infer_freq(self):
+ p = Period('2007-01-01')
+ assert p.freq == 'D'
+
+ p = Period('2007-01-01 07')
+ assert p.freq == 'H'
+
+ p = Period('2007-01-01 07:10')
+ assert p.freq == 'T'
+
+ p = Period('2007-01-01 07:10:15')
+ assert p.freq == 'S'
+
+ p = Period('2007-01-01 07:10:15.123')
+ assert p.freq == 'L'
+
+ p = Period('2007-01-01 07:10:15.123000')
+ assert p.freq == 'L'
+
+ p = Period('2007-01-01 07:10:15.123400')
+ assert p.freq == 'U'
+
+ def test_multiples(self):
+ result1 = Period('1989', freq='2A')
+ result2 = Period('1989', freq='A')
+ assert result1.ordinal == result2.ordinal
+ assert result1.freqstr == '2A-DEC'
+ assert result2.freqstr == 'A-DEC'
+ assert result1.freq == offsets.YearEnd(2)
+ assert result2.freq == offsets.YearEnd()
+
+ assert (result1 + 1).ordinal == result1.ordinal + 2
+ assert (1 + result1).ordinal == result1.ordinal + 2
+ assert (result1 - 1).ordinal == result2.ordinal - 2
+ assert (-1 + result1).ordinal == result2.ordinal - 2
+
+ @pytest.mark.parametrize('month', MONTHS)
+ def test_period_cons_quarterly(self, month):
+ # bugs in scikits.timeseries
+ freq = 'Q-%s' % month
+ exp = Period('1989Q3', freq=freq)
+ assert '1989Q3' in str(exp)
+ stamp = exp.to_timestamp('D', how='end')
+ p = Period(stamp, freq=freq)
+ assert p == exp
+
+ stamp = exp.to_timestamp('3D', how='end')
+ p = Period(stamp, freq=freq)
+ assert p == exp
+
+ @pytest.mark.parametrize('month', MONTHS)
+ def test_period_cons_annual(self, month):
+ # bugs in scikits.timeseries
+ freq = 'A-%s' % month
+ exp = Period('1989', freq=freq)
+ stamp = exp.to_timestamp('D', how='end') + timedelta(days=30)
+ p = Period(stamp, freq=freq)
+
+ assert p == exp + 1
+ assert isinstance(p, Period)
+
+ @pytest.mark.parametrize('day', DAYS)
+ @pytest.mark.parametrize('num', range(10, 17))
+ def test_period_cons_weekly(self, num, day):
+ daystr = '2011-02-%d' % num
+ freq = 'W-%s' % day
+
+ result = Period(daystr, freq=freq)
+ expected = Period(daystr, freq='D').asfreq(freq)
+ assert result == expected
+ assert isinstance(result, Period)
+
+ def test_period_from_ordinal(self):
+ p = Period('2011-01', freq='M')
+ res = Period._from_ordinal(p.ordinal, freq='M')
+ assert p == res
+ assert isinstance(res, Period)
+
+ def test_period_cons_nat(self):
+ p = Period('NaT', freq='M')
+ assert p is NaT
+
+ p = Period('nat', freq='W-SUN')
+ assert p is NaT
+
+ p = Period(iNaT, freq='D')
+ assert p is NaT
+
+ p = Period(iNaT, freq='3D')
+ assert p is NaT
+
+ p = Period(iNaT, freq='1D1H')
+ assert p is NaT
+
+ p = Period('NaT')
+ assert p is NaT
+
+ p = Period(iNaT)
+ assert p is NaT
+
+ def test_period_cons_mult(self):
+ p1 = Period('2011-01', freq='3M')
+ p2 = Period('2011-01', freq='M')
+ assert p1.ordinal == p2.ordinal
+
+ assert p1.freq == offsets.MonthEnd(3)
+ assert p1.freqstr == '3M'
+
+ assert p2.freq == offsets.MonthEnd()
+ assert p2.freqstr == 'M'
+
+ result = p1 + 1
+ assert result.ordinal == (p2 + 3).ordinal
+
+ assert result.freq == p1.freq
+ assert result.freqstr == '3M'
+
+ result = p1 - 1
+ assert result.ordinal == (p2 - 3).ordinal
+ assert result.freq == p1.freq
+ assert result.freqstr == '3M'
+
+ msg = ('Frequency must be positive, because it'
+ ' represents span: -3M')
+ with pytest.raises(ValueError, match=msg):
+ Period('2011-01', freq='-3M')
+
+ msg = ('Frequency must be positive, because it'
+ ' represents span: 0M')
+ with pytest.raises(ValueError, match=msg):
+ Period('2011-01', freq='0M')
+
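+ def test_period_cons_mult_step_sketch(self):
+ # A minimal sketch of the arithmetic rule above: for a multiplied
+ # frequency, an integer step moves by one full span, so '3M'
+ # advances three months at a time.
+ p = Period('2011-01', freq='3M')
+ assert (p + 1).ordinal == Period('2011-04', freq='M').ordinal
+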
+ def test_period_cons_combined(self):
+ p = [(Period('2011-01', freq='1D1H'),
+ Period('2011-01', freq='1H1D'),
+ Period('2011-01', freq='H')),
+ (Period(ordinal=1, freq='1D1H'),
+ Period(ordinal=1, freq='1H1D'),
+ Period(ordinal=1, freq='H'))]
+
+ for p1, p2, p3 in p:
+ assert p1.ordinal == p3.ordinal
+ assert p2.ordinal == p3.ordinal
+
+ assert p1.freq == offsets.Hour(25)
+ assert p1.freqstr == '25H'
+
+ assert p2.freq == offsets.Hour(25)
+ assert p2.freqstr == '25H'
+
+ assert p3.freq == offsets.Hour()
+ assert p3.freqstr == 'H'
+
+ result = p1 + 1
+ assert result.ordinal == (p3 + 25).ordinal
+ assert result.freq == p1.freq
+ assert result.freqstr == '25H'
+
+ result = p2 + 1
+ assert result.ordinal == (p3 + 25).ordinal
+ assert result.freq == p2.freq
+ assert result.freqstr == '25H'
+
+ result = p1 - 1
+ assert result.ordinal == (p3 - 25).ordinal
+ assert result.freq == p1.freq
+ assert result.freqstr == '25H'
+
+ result = p2 - 1
+ assert result.ordinal == (p3 - 25).ordinal
+ assert result.freq == p2.freq
+ assert result.freqstr == '25H'
+
+ msg = ('Frequency must be positive, because it'
+ ' represents span: -25H')
+ with pytest.raises(ValueError, match=msg):
+ Period('2011-01', freq='-1D1H')
+ with pytest.raises(ValueError, match=msg):
+ Period('2011-01', freq='-1H1D')
+ with pytest.raises(ValueError, match=msg):
+ Period(ordinal=1, freq='-1D1H')
+ with pytest.raises(ValueError, match=msg):
+ Period(ordinal=1, freq='-1H1D')
+
+ msg = ('Frequency must be positive, because it'
+ ' represents span: 0D')
+ with pytest.raises(ValueError, match=msg):
+ Period('2011-01', freq='0D0H')
+ with pytest.raises(ValueError, match=msg):
+ Period(ordinal=1, freq='0D0H')
+
+ # You can only combine day and intraday offsets
+ msg = ('Invalid frequency: 1W1D')
+ with pytest.raises(ValueError, match=msg):
+ Period('2011-01', freq='1W1D')
+ msg = ('Invalid frequency: 1D1W')
+ with pytest.raises(ValueError, match=msg):
+ Period('2011-01', freq='1D1W')
+
+
+class TestPeriodMethods(object):
+ def test_round_trip(self):
+ p = Period('2000Q1')
+ new_p = tm.round_trip_pickle(p)
+ assert new_p == p
+
+ def test_hash(self):
+ assert (hash(Period('2011-01', freq='M')) ==
+ hash(Period('2011-01', freq='M')))
+
+ assert (hash(Period('2011-01-01', freq='D')) !=
+ hash(Period('2011-01', freq='M')))
+
+ assert (hash(Period('2011-01', freq='3M')) !=
+ hash(Period('2011-01', freq='2M')))
+
+ assert (hash(Period('2011-01', freq='M')) !=
+ hash(Period('2011-02', freq='M')))
+
+ # --------------------------------------------------------------
+ # to_timestamp
+
+ @pytest.mark.parametrize('tzstr', ['Europe/Brussels',
+ 'Asia/Tokyo', 'US/Pacific'])
+ def test_to_timestamp_tz_arg(self, tzstr):
+ p = Period('1/1/2005', freq='M').to_timestamp(tz=tzstr)
+ exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr)
+ exp_zone = pytz.timezone(tzstr).normalize(p)
+
+ assert p == exp
+ assert p.tz == exp_zone.tzinfo
+ assert p.tz == exp.tz
+
+ p = Period('1/1/2005', freq='3H').to_timestamp(tz=tzstr)
+ exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr)
+ exp_zone = pytz.timezone(tzstr).normalize(p)
+
+ assert p == exp
+ assert p.tz == exp_zone.tzinfo
+ assert p.tz == exp.tz
+
+ p = Period('1/1/2005', freq='A').to_timestamp(freq='A', tz=tzstr)
+ exp = Timestamp('31/12/2005', tz='UTC').tz_convert(tzstr)
+ exp_zone = pytz.timezone(tzstr).normalize(p)
+
+ assert p == exp
+ assert p.tz == exp_zone.tzinfo
+ assert p.tz == exp.tz
+
+ p = Period('1/1/2005', freq='A').to_timestamp(freq='3H', tz=tzstr)
+ exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr)
+ exp_zone = pytz.timezone(tzstr).normalize(p)
+
+ assert p == exp
+ assert p.tz == exp_zone.tzinfo
+ assert p.tz == exp.tz
+
+ @pytest.mark.parametrize('tzstr', ['dateutil/Europe/Brussels',
+ 'dateutil/Asia/Tokyo',
+ 'dateutil/US/Pacific'])
+ def test_to_timestamp_tz_arg_dateutil(self, tzstr):
+ tz = maybe_get_tz(tzstr)
+ p = Period('1/1/2005', freq='M').to_timestamp(tz=tz)
+ exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr)
+ assert p == exp
+ assert p.tz == dateutil_gettz(tzstr.split('/', 1)[1])
+ assert p.tz == exp.tz
+
+ p = Period('1/1/2005', freq='M').to_timestamp(freq='3H', tz=tz)
+ exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr)
+ assert p == exp
+ assert p.tz == dateutil_gettz(tzstr.split('/', 1)[1])
+ assert p.tz == exp.tz
+
+ def test_to_timestamp_tz_arg_dateutil_from_string(self):
+ p = Period('1/1/2005',
+ freq='M').to_timestamp(tz='dateutil/Europe/Brussels')
+ assert p.tz == dateutil_gettz('Europe/Brussels')
+
+ def test_to_timestamp_mult(self):
+ p = Period('2011-01', freq='M')
+ assert p.to_timestamp(how='S') == Timestamp('2011-01-01')
+ expected = Timestamp('2011-02-01') - Timedelta(1, 'ns')
+ assert p.to_timestamp(how='E') == expected
+
+ p = Period('2011-01', freq='3M')
+ assert p.to_timestamp(how='S') == Timestamp('2011-01-01')
+ expected = Timestamp('2011-04-01') - Timedelta(1, 'ns')
+ assert p.to_timestamp(how='E') == expected
+
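+ def test_to_timestamp_end_anchor_sketch(self):
+ # A minimal sketch of the convention behind the expected values
+ # above: the end timestamp of a period is the first instant of the
+ # next period minus one nanosecond.
+ p = Period('2011-01', freq='M')
+ expected = (p + 1).to_timestamp(how='S') - Timedelta(1, 'ns')
+ assert p.to_timestamp(how='E') == expected
+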
+ def test_to_timestamp(self):
+ p = Period('1982', freq='A')
+ start_ts = p.to_timestamp(how='S')
+ aliases = ['s', 'StarT', 'BEGIn']
+ for a in aliases:
+ assert start_ts == p.to_timestamp('D', how=a)
+ # freq with mult should not affect the result
+ assert start_ts == p.to_timestamp('3D', how=a)
+
+ end_ts = p.to_timestamp(how='E')
+ aliases = ['e', 'end', 'FINIsH']
+ for a in aliases:
+ assert end_ts == p.to_timestamp('D', how=a)
+ assert end_ts == p.to_timestamp('3D', how=a)
+
+ from_lst = ['A', 'Q', 'M', 'W', 'B', 'D', 'H', 'Min', 'S']
+
+ def _ex(p):
+ return Timestamp((p + p.freq).start_time.value - 1)
+
+ for i, fcode in enumerate(from_lst):
+ p = Period('1982', freq=fcode)
+ result = p.to_timestamp().to_period(fcode)
+ assert result == p
+
+ assert p.start_time == p.to_timestamp(how='S')
+
+ assert p.end_time == _ex(p)
+
+ # Frequency other than daily
+
+ p = Period('1985', freq='A')
+
+ result = p.to_timestamp('H', how='end')
+ expected = Timestamp(1986, 1, 1) - Timedelta(1, 'ns')
+ assert result == expected
+ result = p.to_timestamp('3H', how='end')
+ assert result == expected
+
+ result = p.to_timestamp('T', how='end')
+ expected = Timestamp(1986, 1, 1) - Timedelta(1, 'ns')
+ assert result == expected
+ result = p.to_timestamp('2T', how='end')
+ assert result == expected
+
+ result = p.to_timestamp(how='end')
+ expected = Timestamp(1986, 1, 1) - Timedelta(1, 'ns')
+ assert result == expected
+
+ expected = datetime(1985, 1, 1)
+ result = p.to_timestamp('H', how='start')
+ assert result == expected
+ result = p.to_timestamp('T', how='start')
+ assert result == expected
+ result = p.to_timestamp('S', how='start')
+ assert result == expected
+ result = p.to_timestamp('3H', how='start')
+ assert result == expected
+ result = p.to_timestamp('5S', how='start')
+ assert result == expected
+
+ # --------------------------------------------------------------
+ # Rendering: __repr__, strftime, etc
+
+ def test_repr(self):
+ p = Period('Jan-2000')
+ assert '2000-01' in repr(p)
+
+ p = Period('2000-12-15')
+ assert '2000-12-15' in repr(p)
+
+ def test_repr_nat(self):
+ p = Period('nat', freq='M')
+ assert repr(NaT) in repr(p)
+
+ def test_millisecond_repr(self):
+ p = Period('2000-01-01 12:15:02.123')
+
+ assert repr(p) == "Period('2000-01-01 12:15:02.123', 'L')"
+
+ def test_microsecond_repr(self):
+ p = Period('2000-01-01 12:15:02.123567')
+
+ assert repr(p) == "Period('2000-01-01 12:15:02.123567', 'U')"
+
+ def test_strftime(self):
+ # GH#3363
+ p = Period('2000-1-1 12:34:12', freq='S')
+ res = p.strftime('%Y-%m-%d %H:%M:%S')
+ assert res == '2000-01-01 12:34:12'
+ assert isinstance(res, text_type)
+
+
+class TestPeriodProperties(object):
+ "Test properties such as year, month, weekday, etc...."
+
+ @pytest.mark.parametrize('freq', ['A', 'M', 'D', 'H'])
+ def test_is_leap_year(self, freq):
+ # GH 13727
+ p = Period('2000-01-01 00:00:00', freq=freq)
+ assert p.is_leap_year
+ assert isinstance(p.is_leap_year, bool)
+
+ p = Period('1999-01-01 00:00:00', freq=freq)
+ assert not p.is_leap_year
+
+ p = Period('2004-01-01 00:00:00', freq=freq)
+ assert p.is_leap_year
+
+ p = Period('2100-01-01 00:00:00', freq=freq)
+ assert not p.is_leap_year
+
+ def test_quarterly_negative_ordinals(self):
+ p = Period(ordinal=-1, freq='Q-DEC')
+ assert p.year == 1969
+ assert p.quarter == 4
+ assert isinstance(p, Period)
+
+ p = Period(ordinal=-2, freq='Q-DEC')
+ assert p.year == 1969
+ assert p.quarter == 3
+ assert isinstance(p, Period)
+
+ p = Period(ordinal=-2, freq='M')
+ assert p.year == 1969
+ assert p.month == 11
+ assert isinstance(p, Period)
+
+ def test_freq_str(self):
+ i1 = Period('1982', freq='Min')
+ assert i1.freq == offsets.Minute()
+ assert i1.freqstr == 'T'
+
+ def test_period_deprecated_freq(self):
+ cases = {"M": ["MTH", "MONTH", "MONTHLY", "Mth", "month", "monthly"],
+ "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"],
+ "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"],
+ "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"],
+ "T": ["minute", "MINUTE", "MINUTELY", "minutely"],
+ "S": ["sec", "SEC", "SECOND", "SECONDLY", "second"],
+ "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"],
+ "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"],
+ "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"]}
+
+ msg = INVALID_FREQ_ERR_MSG
+ for exp, freqs in iteritems(cases):
+ for freq in freqs:
+ with pytest.raises(ValueError, match=msg):
+ Period('2016-03-01 09:00', freq=freq)
+ with pytest.raises(ValueError, match=msg):
+ Period(ordinal=1, freq=freq)
+
+ # check supported freq-aliases still works
+ p1 = Period('2016-03-01 09:00', freq=exp)
+ p2 = Period(ordinal=1, freq=exp)
+ assert isinstance(p1, Period)
+ assert isinstance(p2, Period)
+
+ def test_start_time(self):
+ freq_lst = ['A', 'Q', 'M', 'D', 'H', 'T', 'S']
+ xp = datetime(2012, 1, 1)
+ for f in freq_lst:
+ p = Period('2012', freq=f)
+ assert p.start_time == xp
+ assert Period('2012', freq='B').start_time == datetime(2012, 1, 2)
+ assert Period('2012', freq='W').start_time == datetime(2011, 12, 26)
+
+ def test_end_time(self):
+ p = Period('2012', freq='A')
+
+ def _ex(*args):
+ return Timestamp(Timestamp(datetime(*args)).value - 1)
+
+ xp = _ex(2013, 1, 1)
+ assert xp == p.end_time
+
+ p = Period('2012', freq='Q')
+ xp = _ex(2012, 4, 1)
+ assert xp == p.end_time
+
+ p = Period('2012', freq='M')
+ xp = _ex(2012, 2, 1)
+ assert xp == p.end_time
+
+ p = Period('2012', freq='D')
+ xp = _ex(2012, 1, 2)
+ assert xp == p.end_time
+
+ p = Period('2012', freq='H')
+ xp = _ex(2012, 1, 1, 1)
+ assert xp == p.end_time
+
+ p = Period('2012', freq='B')
+ xp = _ex(2012, 1, 3)
+ assert xp == p.end_time
+
+ p = Period('2012', freq='W')
+ xp = _ex(2012, 1, 2)
+ assert xp == p.end_time
+
+ # Test for GH 11738
+ p = Period('2012', freq='15D')
+ xp = _ex(2012, 1, 16)
+ assert xp == p.end_time
+
+ p = Period('2012', freq='1D1H')
+ xp = _ex(2012, 1, 2, 1)
+ assert xp == p.end_time
+
+ p = Period('2012', freq='1H1D')
+ xp = _ex(2012, 1, 2, 1)
+ assert xp == p.end_time
+
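+ def test_end_time_next_start_sketch(self):
+ # A minimal sketch of what the _ex helper encodes: end_time is one
+ # nanosecond before the start_time of the following period.
+ p = Period('2012', freq='A')
+ assert p.end_time == (p + 1).start_time - Timedelta(1, 'ns')
+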
+ def test_anchor_week_end_time(self):
+ def _ex(*args):
+ return Timestamp(Timestamp(datetime(*args)).value - 1)
+
+ p = Period('2013-1-1', 'W-SAT')
+ xp = _ex(2013, 1, 6)
+ assert p.end_time == xp
+
+ def test_properties_annually(self):
+ # Test properties on Periods with annual frequency.
+ a_date = Period(freq='A', year=2007)
+ assert a_date.year == 2007
+
+ def test_properties_quarterly(self):
+ # Test properties on Periods with quarterly frequency.
+ qedec_date = Period(freq="Q-DEC", year=2007, quarter=1)
+ qejan_date = Period(freq="Q-JAN", year=2007, quarter=1)
+ qejun_date = Period(freq="Q-JUN", year=2007, quarter=1)
+ for x in range(3):
+ for qd in (qedec_date, qejan_date, qejun_date):
+ assert (qd + x).qyear == 2007
+ assert (qd + x).quarter == x + 1
+
+ def test_properties_monthly(self):
+ # Test properties on Periods with monthly frequency.
+ m_date = Period(freq='M', year=2007, month=1)
+ for x in range(11):
+ m_ival_x = m_date + x
+ assert m_ival_x.year == 2007
+ if 1 <= x + 1 <= 3:
+ assert m_ival_x.quarter == 1
+ elif 4 <= x + 1 <= 6:
+ assert m_ival_x.quarter == 2
+ elif 7 <= x + 1 <= 9:
+ assert m_ival_x.quarter == 3
+ elif 10 <= x + 1 <= 12:
+ assert m_ival_x.quarter == 4
+ assert m_ival_x.month == x + 1
+
+ def test_properties_weekly(self):
+ # Test properties on Periods with weekly frequency.
+ w_date = Period(freq='W', year=2007, month=1, day=7)
+ assert w_date.year == 2007
+ assert w_date.quarter == 1
+ assert w_date.month == 1
+ assert w_date.week == 1
+ assert (w_date - 1).week == 52
+ assert w_date.days_in_month == 31
+ assert Period(freq='W', year=2012,
+ month=2, day=1).days_in_month == 29
+
+ def test_properties_weekly_legacy(self):
+ # Test properties on Periods with weekly frequency (legacy aliases).
+ w_date = Period(freq='W', year=2007, month=1, day=7)
+ assert w_date.year == 2007
+ assert w_date.quarter == 1
+ assert w_date.month == 1
+ assert w_date.week == 1
+ assert (w_date - 1).week == 52
+ assert w_date.days_in_month == 31
+
+ exp = Period(freq='W', year=2012, month=2, day=1)
+ assert exp.days_in_month == 29
+
+ msg = INVALID_FREQ_ERR_MSG
+ with pytest.raises(ValueError, match=msg):
+ Period(freq='WK', year=2007, month=1, day=7)
+
+ def test_properties_daily(self):
+ # Test properties on Periods with daily frequency.
+ b_date = Period(freq='B', year=2007, month=1, day=1)
+ assert b_date.year == 2007
+ assert b_date.quarter == 1
+ assert b_date.month == 1
+ assert b_date.day == 1
+ assert b_date.weekday == 0
+ assert b_date.dayofyear == 1
+ assert b_date.days_in_month == 31
+ assert Period(freq='B', year=2012,
+ month=2, day=1).days_in_month == 29
+
+ d_date = Period(freq='D', year=2007, month=1, day=1)
+
+ assert d_date.year == 2007
+ assert d_date.quarter == 1
+ assert d_date.month == 1
+ assert d_date.day == 1
+ assert d_date.weekday == 0
+ assert d_date.dayofyear == 1
+ assert d_date.days_in_month == 31
+ assert Period(freq='D', year=2012, month=2,
+ day=1).days_in_month == 29
+
+ def test_properties_hourly(self):
+ # Test properties on Periods with hourly frequency.
+ h_date1 = Period(freq='H', year=2007, month=1, day=1, hour=0)
+ h_date2 = Period(freq='2H', year=2007, month=1, day=1, hour=0)
+
+ for h_date in [h_date1, h_date2]:
+ assert h_date.year == 2007
+ assert h_date.quarter == 1
+ assert h_date.month == 1
+ assert h_date.day == 1
+ assert h_date.weekday == 0
+ assert h_date.dayofyear == 1
+ assert h_date.hour == 0
+ assert h_date.days_in_month == 31
+ assert Period(freq='H', year=2012, month=2, day=1,
+ hour=0).days_in_month == 29
+
+ def test_properties_minutely(self):
+ # Test properties on Periods with minutely frequency.
+ t_date = Period(freq='Min', year=2007, month=1, day=1, hour=0,
+ minute=0)
+ assert t_date.quarter == 1
+ assert t_date.month == 1
+ assert t_date.day == 1
+ assert t_date.weekday == 0
+ assert t_date.dayofyear == 1
+ assert t_date.hour == 0
+ assert t_date.minute == 0
+ assert t_date.days_in_month == 31
+ assert Period(freq='Min', year=2012, month=2, day=1, hour=0,
+ minute=0).days_in_month == 29
+
+ def test_properties_secondly(self):
+ # Test properties on Periods with secondly frequency.
+ s_date = Period(freq='S', year=2007, month=1, day=1, hour=0,
+ minute=0, second=0)
+ assert s_date.year == 2007
+ assert s_date.quarter == 1
+ assert s_date.month == 1
+ assert s_date.day == 1
+ assert s_date.weekday == 0
+ assert s_date.dayofyear == 1
+ assert s_date.hour == 0
+ assert s_date.minute == 0
+ assert s_date.second == 0
+ assert s_date.days_in_month == 31
+ assert Period(freq='S', year=2012, month=2, day=1, hour=0,
+ minute=0, second=0).days_in_month == 29
+
+
+class TestPeriodField(object):
+
+ def test_get_period_field_array_raises_on_out_of_range(self):
+ pytest.raises(ValueError, libperiod.get_period_field_arr, -1,
+ np.empty(1), 0)
+
+
+class TestComparisons(object):
+
+ def setup_method(self, method):
+ self.january1 = Period('2000-01', 'M')
+ self.january2 = Period('2000-01', 'M')
+ self.february = Period('2000-02', 'M')
+ self.march = Period('2000-03', 'M')
+ self.day = Period('2012-01-01', 'D')
+
+ def test_equal(self):
+ assert self.january1 == self.january2
+
+ def test_equal_Raises_Value(self):
+ with pytest.raises(period.IncompatibleFrequency):
+ self.january1 == self.day
+
+ def test_notEqual(self):
+ assert self.january1 != 1
+ assert self.january1 != self.february
+
+ def test_greater(self):
+ assert self.february > self.january1
+
+ def test_greater_Raises_Value(self):
+ with pytest.raises(period.IncompatibleFrequency):
+ self.january1 > self.day
+
+ def test_greater_Raises_Type(self):
+ with pytest.raises(TypeError):
+ self.january1 > 1
+
+ def test_greaterEqual(self):
+ assert self.january1 >= self.january2
+
+ def test_greaterEqual_Raises_Value(self):
+ with pytest.raises(period.IncompatibleFrequency):
+ self.january1 >= self.day
+
+ with pytest.raises(TypeError):
+ print(self.january1 >= 1)
+
+ def test_smallerEqual(self):
+ assert self.january1 <= self.january2
+
+ def test_smallerEqual_Raises_Value(self):
+ with pytest.raises(period.IncompatibleFrequency):
+ self.january1 <= self.day
+
+ def test_smallerEqual_Raises_Type(self):
+ with pytest.raises(TypeError):
+ self.january1 <= 1
+
+ def test_smaller(self):
+ assert self.january1 < self.february
+
+ def test_smaller_Raises_Value(self):
+ with pytest.raises(period.IncompatibleFrequency):
+ self.january1 < self.day
+
+ def test_smaller_Raises_Type(self):
+ with pytest.raises(TypeError):
+ self.january1 < 1
+
+ def test_sort(self):
+ periods = [self.march, self.january1, self.february]
+ correctPeriods = [self.january1, self.february, self.march]
+ assert sorted(periods) == correctPeriods
+
+ def test_period_nat_comp(self):
+ p_nat = Period('NaT', freq='D')
+ p = Period('2011-01-01', freq='D')
+
+ nat = Timestamp('NaT')
+ t = Timestamp('2011-01-01')
+ # confirm Period('NaT') works identically to Timestamp('NaT')
+ for left, right in [(p_nat, p), (p, p_nat), (p_nat, p_nat), (nat, t),
+ (t, nat), (nat, nat)]:
+ assert not left < right
+ assert not left > right
+ assert not left == right
+ assert left != right
+ assert not left <= right
+ assert not left >= right
+
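+ def test_incompatible_frequency_is_value_error_sketch(self):
+ # A minimal sketch: IncompatibleFrequency subclasses ValueError,
+ # which is why the mismatched-frequency comparisons above can be
+ # caught under either exception type.
+ with pytest.raises(ValueError):
+ self.january1 > self.day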
+
+class TestArithmetic(object):
+
+ def test_sub_delta(self):
+ left, right = Period('2011', freq='A'), Period('2007', freq='A')
+ result = left - right
+ assert result == 4 * right.freq
+
+ with pytest.raises(period.IncompatibleFrequency):
+ left - Period('2007-01', freq='M')
+
+ def test_add_integer(self):
+ per1 = Period(freq='D', year=2008, month=1, day=1)
+ per2 = Period(freq='D', year=2008, month=1, day=2)
+ assert per1 + 1 == per2
+ assert 1 + per1 == per2
+
+ def test_add_sub_nat(self):
+ # GH#13071
+ p = Period('2011-01', freq='M')
+ assert p + NaT is NaT
+ assert NaT + p is NaT
+ assert p - NaT is NaT
+ assert NaT - p is NaT
+
+ p = Period('NaT', freq='M')
+ assert p + NaT is NaT
+ assert NaT + p is NaT
+ assert p - NaT is NaT
+ assert NaT - p is NaT
+
+ def test_add_invalid(self):
+ # GH#4731
+ per1 = Period(freq='D', year=2008, month=1, day=1)
+ per2 = Period(freq='D', year=2008, month=1, day=2)
+
+ msg = r"unsupported operand type\(s\)"
+ with pytest.raises(TypeError, match=msg):
+ per1 + "str"
+ with pytest.raises(TypeError, match=msg):
+ "str" + per1
+ with pytest.raises(TypeError, match=msg):
+ per1 + per2
+
+ boxes = [lambda x: x, lambda x: pd.Series([x]), lambda x: pd.Index([x])]
+ ids = ['identity', 'Series', 'Index']
+
+ @pytest.mark.parametrize('lbox', boxes, ids=ids)
+ @pytest.mark.parametrize('rbox', boxes, ids=ids)
+ def test_add_timestamp_raises(self, rbox, lbox):
+ # GH#17983
+ ts = Timestamp('2017')
+ per = Period('2017', freq='M')
+
+ # We may get a different message depending on which class raises
+ # the error.
+ msg = (r"cannot add|unsupported operand|"
+ r"can only operate on a|incompatible type|"
+ r"ufunc add cannot use operands")
+ with pytest.raises(TypeError, match=msg):
+ lbox(ts) + rbox(per)
+
+ with pytest.raises(TypeError, match=msg):
+ lbox(per) + rbox(ts)
+
+ with pytest.raises(TypeError, match=msg):
+ lbox(per) + rbox(per)
+
+ def test_sub(self):
+ per1 = Period('2011-01-01', freq='D')
+ per2 = Period('2011-01-15', freq='D')
+
+ off = per1.freq
+ assert per1 - per2 == -14 * off
+ assert per2 - per1 == 14 * off
+
+ msg = r"Input has different freq=M from Period\(freq=D\)"
+ with pytest.raises(period.IncompatibleFrequency, match=msg):
+ per1 - Period('2011-02', freq='M')
+
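+ def test_sub_same_freq_sketch(self):
+ # A minimal sketch of the convention checked above: subtracting two
+ # periods of the same frequency yields the elapsed span as a
+ # multiple of the shared freq offset.
+ p = Period('2011-01-01', freq='D')
+ assert (p + 3) - p == 3 * p.freq
+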
+ @pytest.mark.parametrize('n', [1, 2, 3, 4])
+ def test_sub_n_gt_1_ticks(self, tick_classes, n):
+ # GH 23878
+ p1 = pd.Period('19910905', freq=tick_classes(n))
+ p2 = pd.Period('19920406', freq=tick_classes(n))
+
+ expected = (pd.Period(str(p2), freq=p2.freq.base)
+ - pd.Period(str(p1), freq=p1.freq.base))
+
+ assert (p2 - p1) == expected
+
+ @pytest.mark.parametrize('normalize', [True, False])
+ @pytest.mark.parametrize('n', [1, 2, 3, 4])
+ @pytest.mark.parametrize('offset, kwd_name', [
+ (pd.offsets.YearEnd, 'month'),
+ (pd.offsets.QuarterEnd, 'startingMonth'),
+ (pd.offsets.MonthEnd, None),
+ (pd.offsets.Week, 'weekday')
+ ])
+ def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize):
+ # GH 23878
+ kwds = {kwd_name: 3} if kwd_name is not None else {}
+ p1_d = '19910905'
+ p2_d = '19920406'
+ p1 = pd.Period(p1_d, freq=offset(n, normalize, **kwds))
+ p2 = pd.Period(p2_d, freq=offset(n, normalize, **kwds))
+
+ expected = (pd.Period(p2_d, freq=p2.freq.base)
+ - pd.Period(p1_d, freq=p1.freq.base))
+
+ assert (p2 - p1) == expected
+
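+ def test_freq_base_sketch(self):
+ # A minimal sketch of the .base trick used above: offset.base is the
+ # same offset with n=1, so expected deltas can be computed in
+ # single-tick units.
+ assert offsets.Day(3).base == offsets.Day()
+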
+ def test_add_offset(self):
+ # freq is DateOffset
+ for freq in ['A', '2A', '3A']:
+ p = Period('2011', freq=freq)
+ exp = Period('2013', freq=freq)
+ assert p + offsets.YearEnd(2) == exp
+ assert offsets.YearEnd(2) + p == exp
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(365, 'D'),
+ timedelta(365)]:
+ with pytest.raises(period.IncompatibleFrequency):
+ p + o
+
+ if isinstance(o, np.timedelta64):
+ with pytest.raises(TypeError):
+ o + p
+ else:
+ with pytest.raises(period.IncompatibleFrequency):
+ o + p
+
+ for freq in ['M', '2M', '3M']:
+ p = Period('2011-03', freq=freq)
+ exp = Period('2011-05', freq=freq)
+ assert p + offsets.MonthEnd(2) == exp
+ assert offsets.MonthEnd(2) + p == exp
+
+ exp = Period('2012-03', freq=freq)
+ assert p + offsets.MonthEnd(12) == exp
+ assert offsets.MonthEnd(12) + p == exp
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(365, 'D'),
+ timedelta(365)]:
+ with pytest.raises(period.IncompatibleFrequency):
+ p + o
+
+ if isinstance(o, np.timedelta64):
+ with pytest.raises(TypeError):
+ o + p
+ else:
+ with pytest.raises(period.IncompatibleFrequency):
+ o + p
+
+ # freq is Tick
+ for freq in ['D', '2D', '3D']:
+ p = Period('2011-04-01', freq=freq)
+
+ exp = Period('2011-04-06', freq=freq)
+ assert p + offsets.Day(5) == exp
+ assert offsets.Day(5) + p == exp
+
+ exp = Period('2011-04-02', freq=freq)
+ assert p + offsets.Hour(24) == exp
+ assert offsets.Hour(24) + p == exp
+
+ exp = Period('2011-04-03', freq=freq)
+ assert p + np.timedelta64(2, 'D') == exp
+ with pytest.raises(TypeError):
+ np.timedelta64(2, 'D') + p
+
+ exp = Period('2011-04-02', freq=freq)
+ assert p + np.timedelta64(3600 * 24, 's') == exp
+ with pytest.raises(TypeError):
+ np.timedelta64(3600 * 24, 's') + p
+
+ exp = Period('2011-03-30', freq=freq)
+ assert p + timedelta(-2) == exp
+ assert timedelta(-2) + p == exp
+
+ exp = Period('2011-04-03', freq=freq)
+ assert p + timedelta(hours=48) == exp
+ assert timedelta(hours=48) + p == exp
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(4, 'h'),
+ timedelta(hours=23)]:
+ with pytest.raises(period.IncompatibleFrequency):
+ p + o
+
+ if isinstance(o, np.timedelta64):
+ with pytest.raises(TypeError):
+ o + p
+ else:
+ with pytest.raises(period.IncompatibleFrequency):
+ o + p
+
+ for freq in ['H', '2H', '3H']:
+ p = Period('2011-04-01 09:00', freq=freq)
+
+ exp = Period('2011-04-03 09:00', freq=freq)
+ assert p + offsets.Day(2) == exp
+ assert offsets.Day(2) + p == exp
+
+ exp = Period('2011-04-01 12:00', freq=freq)
+ assert p + offsets.Hour(3) == exp
+ assert offsets.Hour(3) + p == exp
+
+ exp = Period('2011-04-01 12:00', freq=freq)
+ assert p + np.timedelta64(3, 'h') == exp
+ with pytest.raises(TypeError):
+ np.timedelta64(3, 'h') + p
+
+ exp = Period('2011-04-01 10:00', freq=freq)
+ assert p + np.timedelta64(3600, 's') == exp
+ with pytest.raises(TypeError):
+ np.timedelta64(3600, 's') + p
+
+ exp = Period('2011-04-01 11:00', freq=freq)
+ assert p + timedelta(minutes=120) == exp
+ assert timedelta(minutes=120) + p == exp
+
+ exp = Period('2011-04-05 12:00', freq=freq)
+ assert p + timedelta(days=4, minutes=180) == exp
+ assert timedelta(days=4, minutes=180) + p == exp
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(3200, 's'),
+ timedelta(hours=23, minutes=30)]:
+ with pytest.raises(period.IncompatibleFrequency):
+ p + o
+
+ if isinstance(o, np.timedelta64):
+ with pytest.raises(TypeError):
+ o + p
+ else:
+ with pytest.raises(period.IncompatibleFrequency):
+ o + p
+
+ def test_add_offset_nat(self):
+ # freq is DateOffset
+ for freq in ['A', '2A', '3A']:
+ p = Period('NaT', freq=freq)
+ for o in [offsets.YearEnd(2)]:
+ assert p + o is NaT
+ assert o + p is NaT
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(365, 'D'),
+ timedelta(365)]:
+ assert p + o is NaT
+
+ if isinstance(o, np.timedelta64):
+ with pytest.raises(TypeError):
+ o + p
+ else:
+ assert o + p is NaT
+
+ for freq in ['M', '2M', '3M']:
+ p = Period('NaT', freq=freq)
+ for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]:
+ assert p + o is NaT
+
+ if isinstance(o, np.timedelta64):
+ with pytest.raises(TypeError):
+ o + p
+ else:
+ assert o + p is NaT
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(365, 'D'),
+ timedelta(365)]:
+ assert p + o is NaT
+
+ if isinstance(o, np.timedelta64):
+ with pytest.raises(TypeError):
+ o + p
+ else:
+ assert o + p is NaT
+
+ # freq is Tick
+ for freq in ['D', '2D', '3D']:
+ p = Period('NaT', freq=freq)
+ for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'),
+ np.timedelta64(3600 * 24, 's'), timedelta(-2),
+ timedelta(hours=48)]:
+ assert p + o is NaT
+
+ if isinstance(o, np.timedelta64):
+ with pytest.raises(TypeError):
+ o + p
+ else:
+ assert o + p is NaT
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(4, 'h'),
+ timedelta(hours=23)]:
+ assert p + o is NaT
+
+ if isinstance(o, np.timedelta64):
+ with pytest.raises(TypeError):
+ o + p
+ else:
+ assert o + p is NaT
+
+ for freq in ['H', '2H', '3H']:
+ p = Period('NaT', freq=freq)
+ for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'),
+ np.timedelta64(3600, 's'), timedelta(minutes=120),
+ timedelta(days=4, minutes=180)]:
+ assert p + o is NaT
+
+ if not isinstance(o, np.timedelta64):
+ assert o + p is NaT
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(3200, 's'),
+ timedelta(hours=23, minutes=30)]:
+ assert p + o is NaT
+
+ if isinstance(o, np.timedelta64):
+ with pytest.raises(TypeError):
+ o + p
+ else:
+ assert o + p is NaT
+
+ def test_sub_offset(self):
+ # freq is DateOffset
+ for freq in ['A', '2A', '3A']:
+ p = Period('2011', freq=freq)
+ assert p - offsets.YearEnd(2) == Period('2009', freq=freq)
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(365, 'D'),
+ timedelta(365)]:
+ with pytest.raises(period.IncompatibleFrequency):
+ p - o
+
+ for freq in ['M', '2M', '3M']:
+ p = Period('2011-03', freq=freq)
+ assert p - offsets.MonthEnd(2) == Period('2011-01', freq=freq)
+ assert p - offsets.MonthEnd(12) == Period('2010-03', freq=freq)
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(365, 'D'),
+ timedelta(365)]:
+ with pytest.raises(period.IncompatibleFrequency):
+ p - o
+
+ # freq is Tick
+ for freq in ['D', '2D', '3D']:
+ p = Period('2011-04-01', freq=freq)
+ assert p - offsets.Day(5) == Period('2011-03-27', freq=freq)
+ assert p - offsets.Hour(24) == Period('2011-03-31', freq=freq)
+ assert p - np.timedelta64(2, 'D') == Period(
+ '2011-03-30', freq=freq)
+ assert p - np.timedelta64(3600 * 24, 's') == Period(
+ '2011-03-31', freq=freq)
+ assert p - timedelta(-2) == Period('2011-04-03', freq=freq)
+ assert p - timedelta(hours=48) == Period('2011-03-30', freq=freq)
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(4, 'h'),
+ timedelta(hours=23)]:
+ with pytest.raises(period.IncompatibleFrequency):
+ p - o
+
+ for freq in ['H', '2H', '3H']:
+ p = Period('2011-04-01 09:00', freq=freq)
+ assert p - offsets.Day(2) == Period('2011-03-30 09:00', freq=freq)
+ assert p - offsets.Hour(3) == Period('2011-04-01 06:00', freq=freq)
+ assert p - np.timedelta64(3, 'h') == Period(
+ '2011-04-01 06:00', freq=freq)
+ assert p - np.timedelta64(3600, 's') == Period(
+ '2011-04-01 08:00', freq=freq)
+ assert p - timedelta(minutes=120) == Period(
+ '2011-04-01 07:00', freq=freq)
+ assert p - timedelta(days=4, minutes=180) == Period(
+ '2011-03-28 06:00', freq=freq)
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(3200, 's'),
+ timedelta(hours=23, minutes=30)]:
+ with pytest.raises(period.IncompatibleFrequency):
+ p - o
+
+ def test_sub_offset_nat(self):
+ # freq is DateOffset
+ for freq in ['A', '2A', '3A']:
+ p = Period('NaT', freq=freq)
+ for o in [offsets.YearEnd(2)]:
+ assert p - o is NaT
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(365, 'D'),
+ timedelta(365)]:
+ assert p - o is NaT
+
+ for freq in ['M', '2M', '3M']:
+ p = Period('NaT', freq=freq)
+ for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]:
+ assert p - o is NaT
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(365, 'D'),
+ timedelta(365)]:
+ assert p - o is NaT
+
+ # freq is Tick
+ for freq in ['D', '2D', '3D']:
+ p = Period('NaT', freq=freq)
+ for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'),
+ np.timedelta64(3600 * 24, 's'), timedelta(-2),
+ timedelta(hours=48)]:
+ assert p - o is NaT
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(4, 'h'),
+ timedelta(hours=23)]:
+ assert p - o is NaT
+
+ for freq in ['H', '2H', '3H']:
+ p = Period('NaT', freq=freq)
+ for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'),
+ np.timedelta64(3600, 's'), timedelta(minutes=120),
+ timedelta(days=4, minutes=180)]:
+ assert p - o is NaT
+
+ for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
+ offsets.Minute(), np.timedelta64(3200, 's'),
+ timedelta(hours=23, minutes=30)]:
+ assert p - o is NaT
+
+ @pytest.mark.parametrize('freq', ['M', '2M', '3M'])
+ def test_nat_ops(self, freq):
+ p = Period('NaT', freq=freq)
+ assert p + 1 is NaT
+ assert 1 + p is NaT
+ assert p - 1 is NaT
+ assert p - Period('2011-01', freq=freq) is NaT
+ assert Period('2011-01', freq=freq) - p is NaT
+
+ def test_period_ops_offset(self):
+ p = Period('2011-04-01', freq='D')
+ result = p + offsets.Day()
+ exp = Period('2011-04-02', freq='D')
+ assert result == exp
+
+ result = p - offsets.Day(2)
+ exp = Period('2011-03-30', freq='D')
+ assert result == exp
+
+ msg = r"Input cannot be converted to Period\(freq=D\)"
+ with pytest.raises(period.IncompatibleFrequency, match=msg):
+ p + offsets.Hour(2)
+
+ with pytest.raises(period.IncompatibleFrequency, match=msg):
+ p - offsets.Hour(2)
+
+
+def test_period_immutable():
+ # see gh-17116
+ per = Period('2014Q1')
+ with pytest.raises(AttributeError):
+ per.ordinal = 14
+
+ freq = per.freq
+ with pytest.raises(AttributeError):
+ per.freq = 2 * freq
+
+
+# TODO: This doesn't fail on all systems; track down which
[email protected](reason="Parses as Jan 1, 0007 on some systems",
+ strict=False)
+def test_small_year_parsing():
+ per1 = Period('0001-01-07', 'D')
+ assert per1.year == 1
+ assert per1.day == 7
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/test_nat.py b/contrib/python/pandas/py2/pandas/tests/scalar/test_nat.py
new file mode 100644
index 00000000000..abf95b276cd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/test_nat.py
@@ -0,0 +1,341 @@
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs.tslibs import iNaT
+import pandas.compat as compat
+
+from pandas import (
+ DatetimeIndex, Index, NaT, Period, Series, Timedelta, TimedeltaIndex,
+ Timestamp)
+from pandas.core.arrays import PeriodArray
+from pandas.util import testing as tm
+
+
[email protected]("nat,idx", [(Timestamp("NaT"), DatetimeIndex),
+ (Timedelta("NaT"), TimedeltaIndex),
+ (Period("NaT", freq="M"), PeriodArray)])
+def test_nat_fields(nat, idx):
+
+ for field in idx._field_ops:
+ # weekday is a property of DTI, but a method
+ # on NaT/Timestamp for compat with datetime
+ if field == "weekday":
+ continue
+
+ result = getattr(NaT, field)
+ assert np.isnan(result)
+
+ result = getattr(nat, field)
+ assert np.isnan(result)
+
+ for field in idx._bool_ops:
+
+ result = getattr(NaT, field)
+ assert result is False
+
+ result = getattr(nat, field)
+ assert result is False
+
+
+def test_nat_vector_field_access():
+ idx = DatetimeIndex(["1/1/2000", None, None, "1/4/2000"])
+
+ for field in DatetimeIndex._field_ops:
+ # weekday is a property of DTI, but a method
+ # on NaT/Timestamp for compat with datetime
+ if field == "weekday":
+ continue
+
+ result = getattr(idx, field)
+ expected = Index([getattr(x, field) for x in idx])
+ tm.assert_index_equal(result, expected)
+
+ ser = Series(idx)
+
+ for field in DatetimeIndex._field_ops:
+ # weekday is a property of DTI, but a method
+ # on NaT/Timestamp for compat with datetime
+ if field == "weekday":
+ continue
+
+ result = getattr(ser.dt, field)
+ expected = [getattr(x, field) for x in idx]
+ tm.assert_series_equal(result, Series(expected))
+
+ for field in DatetimeIndex._bool_ops:
+ result = getattr(ser.dt, field)
+ expected = [getattr(x, field) for x in idx]
+ tm.assert_series_equal(result, Series(expected))
+
+
[email protected]("klass", [Timestamp, Timedelta, Period])
[email protected]("value", [None, np.nan, iNaT, float("nan"),
+ NaT, "NaT", "nat"])
+def test_identity(klass, value):
+ assert klass(value) is NaT
+
+
[email protected]("klass", [Timestamp, Timedelta, Period])
[email protected]("value", ["", "nat", "NAT", None, np.nan])
+def test_equality(klass, value):
+ if klass is Period and value == "":
+ pytest.skip("Period cannot parse empty string")
+
+ assert klass(value).value == iNaT
+
+
[email protected]("klass", [Timestamp, Timedelta])
[email protected]("method", ["round", "floor", "ceil"])
[email protected]("freq", ["s", "5s", "min", "5min", "h", "5h"])
+def test_round_nat(klass, method, freq):
+ # see gh-14940
+ ts = klass("nat")
+
+ round_method = getattr(ts, method)
+ assert round_method(freq) is ts
+
+
[email protected]("method", [
+ "astimezone", "combine", "ctime", "dst", "fromordinal",
+ "fromtimestamp", "isocalendar", "strftime", "strptime",
+ "time", "timestamp", "timetuple", "timetz", "toordinal",
+ "tzname", "utcfromtimestamp", "utcnow", "utcoffset",
+ "utctimetuple", "timestamp"
+])
+def test_nat_methods_raise(method):
+ # see gh-9513, gh-17329
+ msg = "NaTType does not support {method}".format(method=method)
+
+ with pytest.raises(ValueError, match=msg):
+ getattr(NaT, method)()
+
+
[email protected]("method", [
+ "weekday", "isoweekday"
+])
+def test_nat_methods_nan(method):
+ # see gh-9513, gh-17329
+ assert np.isnan(getattr(NaT, method)())
+
+
[email protected]("method", [
+ "date", "now", "replace", "today",
+ "tz_convert", "tz_localize"
+])
+def test_nat_methods_nat(method):
+ # see gh-8254, gh-9513, gh-17329
+ assert getattr(NaT, method)() is NaT
+
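+# A compact sketch of the three NaT-method behaviors tested above
+# (illustrative only, using no-argument calls as in the tests):
+#
+#   >>> NaT.fromordinal()   # raises ValueError("NaTType does not support ...")
+#   >>> NaT.weekday()       # returns nan
+#   >>> NaT.date()          # returns NaT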
+
[email protected]("get_nat", [
+ lambda x: NaT,
+ lambda x: Timedelta(x),
+ lambda x: Timestamp(x)
+])
+def test_nat_iso_format(get_nat):
+ # see gh-12300
+ assert get_nat("NaT").isoformat() == "NaT"
+
+
[email protected]("klass,expected", [
+ (Timestamp, ["freqstr", "normalize", "to_julian_date", "to_period", "tz"]),
+ (Timedelta, ["components", "delta", "is_populated", "to_pytimedelta",
+ "to_timedelta64", "view"])
+])
+def test_missing_public_nat_methods(klass, expected):
+ # see gh-17327
+ #
+ # NaT should have *most* of the Timestamp and Timedelta methods.
+ # Here, we check which public methods NaT does not have. We
+ # ignore any missing private methods.
+ nat_names = dir(NaT)
+ klass_names = dir(klass)
+
+ missing = [x for x in klass_names if x not in nat_names and
+ not x.startswith("_")]
+ missing.sort()
+
+ assert missing == expected
+
+
+def _get_overlap_public_nat_methods(klass, as_tuple=False):
+ """
+ Get overlapping public methods between NaT and another class.
+
+ Parameters
+ ----------
+ klass : type
+ The class to compare with NaT
+ as_tuple : bool, default False
+ Whether to return a list of tuples of the form (klass, method).
+
+ Returns
+ -------
+ overlap : list
+ """
+ nat_names = dir(NaT)
+ klass_names = dir(klass)
+
+ overlap = [x for x in nat_names if x in klass_names and
+ not x.startswith("_") and
+ callable(getattr(klass, x))]
+
+ # Timestamp takes precedence over Timedelta in terms of overlap.
+ if klass is Timedelta:
+ ts_names = dir(Timestamp)
+ overlap = [x for x in overlap if x not in ts_names]
+
+ if as_tuple:
+ overlap = [(klass, method) for method in overlap]
+
+ overlap.sort()
+ return overlap
+
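+# For example (illustrative, matching the expectations asserted below),
+# the helper reports:
+#
+#   >>> _get_overlap_public_nat_methods(Timedelta)
+#   ['total_seconds']
+#   >>> _get_overlap_public_nat_methods(Timedelta, as_tuple=True)
+#   [(Timedelta, 'total_seconds')]      # (klass, method) pairs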
+
[email protected]("klass,expected", [
+ (Timestamp, ["astimezone", "ceil", "combine", "ctime", "date", "day_name",
+ "dst", "floor", "fromisoformat", "fromordinal",
+ "fromtimestamp", "isocalendar", "isoformat", "isoweekday",
+ "month_name", "now", "replace", "round", "strftime",
+ "strptime", "time", "timestamp", "timetuple", "timetz",
+ "to_datetime64", "to_pydatetime", "today", "toordinal",
+ "tz_convert", "tz_localize", "tzname", "utcfromtimestamp",
+ "utcnow", "utcoffset", "utctimetuple", "weekday"]),
+ (Timedelta, ["total_seconds"])
+])
+def test_overlap_public_nat_methods(klass, expected):
+ # see gh-17327
+ #
+ # NaT should have *most* of the Timestamp and Timedelta methods.
+    # When the Timestamp, Timedelta, and NaT methods overlap, the overlap
+    # is attributed to Timestamp and NaT, not Timedelta.
+
+ # "fromisoformat" was introduced in 3.7
+ if klass is Timestamp and not compat.PY37:
+ expected.remove("fromisoformat")
+
+ assert _get_overlap_public_nat_methods(klass) == expected
+
+
[email protected]("compare", (
+ _get_overlap_public_nat_methods(Timestamp, True) +
+ _get_overlap_public_nat_methods(Timedelta, True))
+)
+def test_nat_doc_strings(compare):
+ # see gh-17327
+ #
+ # The docstrings for overlapping methods should match.
+ klass, method = compare
+ klass_doc = getattr(klass, method).__doc__
+
+ nat_doc = getattr(NaT, method).__doc__
+ assert klass_doc == nat_doc
+
+
+_ops = {
+ "left_plus_right": lambda a, b: a + b,
+ "right_plus_left": lambda a, b: b + a,
+ "left_minus_right": lambda a, b: a - b,
+ "right_minus_left": lambda a, b: b - a,
+ "left_times_right": lambda a, b: a * b,
+ "right_times_left": lambda a, b: b * a,
+ "left_div_right": lambda a, b: a / b,
+ "right_div_left": lambda a, b: b / a,
+}
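+
+# The name -> lambda table above lets each test exercise both operand
+# orders by name. For instance (illustrative):
+#
+#   >>> _ops["left_plus_right"](NaT, Timedelta("5s"))
+#   NaT
+#   >>> _ops["right_minus_left"](NaT, Timestamp("2014-01-01"))
+#   NaT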
+
+
[email protected]("op_name", list(_ops.keys()))
[email protected]("value,val_type", [
+ (2, "scalar"),
+ (1.5, "scalar"),
+ (np.nan, "scalar"),
+ (timedelta(3600), "timedelta"),
+ (Timedelta("5s"), "timedelta"),
+ (datetime(2014, 1, 1), "timestamp"),
+ (Timestamp("2014-01-01"), "timestamp"),
+ (Timestamp("2014-01-01", tz="UTC"), "timestamp"),
+ (Timestamp("2014-01-01", tz="US/Eastern"), "timestamp"),
+ (pytz.timezone("Asia/Tokyo").localize(datetime(2014, 1, 1)), "timestamp"),
+])
+def test_nat_arithmetic_scalar(op_name, value, val_type):
+ # see gh-6873
+ invalid_ops = {
+ "scalar": {"right_div_left"},
+ "timedelta": {"left_times_right", "right_times_left"},
+ "timestamp": {"left_times_right", "right_times_left",
+ "left_div_right", "right_div_left"}
+ }
+
+ op = _ops[op_name]
+
+ if op_name in invalid_ops.get(val_type, set()):
+ if (val_type == "timedelta" and "times" in op_name and
+ isinstance(value, Timedelta)):
+ msg = "Cannot multiply"
+ else:
+ msg = "unsupported operand type"
+
+ with pytest.raises(TypeError, match=msg):
+ op(NaT, value)
+ else:
+ if val_type == "timedelta" and "div" in op_name:
+ expected = np.nan
+ else:
+ expected = NaT
+
+ assert op(NaT, value) is expected
+
+
[email protected]("val,expected", [
+ (np.nan, NaT),
+ (NaT, np.nan),
+ (np.timedelta64("NaT"), np.nan)
+])
+def test_nat_rfloordiv_timedelta(val, expected):
+    # see gh-18846
+ #
+ # See also test_timedelta.TestTimedeltaArithmetic.test_floordiv
+ td = Timedelta(hours=3, minutes=4)
+ assert td // val is expected
+
+
[email protected]("op_name", [
+ "left_plus_right", "right_plus_left",
+ "left_minus_right", "right_minus_left"
+])
+ DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"),
+ DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"),
+ TimedeltaIndex(["1 day", "2 day"], name="x"),
+])
+def test_nat_arithmetic_index(op_name, value):
+ # see gh-11718
+ exp_name = "x"
+ exp_data = [NaT] * 2
+
+ if isinstance(value, DatetimeIndex) and "plus" in op_name:
+ expected = DatetimeIndex(exp_data, name=exp_name, tz=value.tz)
+ else:
+ expected = TimedeltaIndex(exp_data, name=exp_name)
+
+ tm.assert_index_equal(_ops[op_name](NaT, value), expected)
+
+
[email protected]("op_name", [
+ "left_plus_right", "right_plus_left",
+ "left_minus_right", "right_minus_left"
+])
[email protected]("box", [TimedeltaIndex, Series])
+def test_nat_arithmetic_td64_vector(op_name, box):
+ # see gh-19124
+ vec = box(["1 day", "2 day"], dtype="timedelta64[ns]")
+ box_nat = box([NaT, NaT], dtype="timedelta64[ns]")
+ tm.assert_equal(_ops[op_name](vec, NaT), box_nat)
+
+
+def test_nat_pinned_docstrings():
+ # see gh-17327
+ assert NaT.ctime.__doc__ == datetime.ctime.__doc__
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/__init__.py b/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_arithmetic.py b/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_arithmetic.py
new file mode 100644
index 00000000000..b6ad251d598
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_arithmetic.py
@@ -0,0 +1,691 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for scalar Timedelta arithmetic ops
+"""
+from datetime import datetime, timedelta
+import operator
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import NaT, Timedelta, Timestamp
+from pandas.core import ops
+import pandas.util.testing as tm
+
+
+class TestTimedeltaAdditionSubtraction(object):
+ """
+ Tests for Timedelta methods:
+
+ __add__, __radd__,
+ __sub__, __rsub__
+ """
+ @pytest.mark.parametrize('ten_seconds', [
+ Timedelta(10, unit='s'),
+ timedelta(seconds=10),
+ np.timedelta64(10, 's'),
+ np.timedelta64(10000000000, 'ns'),
+ pd.offsets.Second(10)])
+ def test_td_add_sub_ten_seconds(self, ten_seconds):
+ # GH#6808
+ base = Timestamp('20130101 09:01:12.123456')
+ expected_add = Timestamp('20130101 09:01:22.123456')
+ expected_sub = Timestamp('20130101 09:01:02.123456')
+
+ result = base + ten_seconds
+ assert result == expected_add
+
+ result = base - ten_seconds
+ assert result == expected_sub
+
+ @pytest.mark.parametrize('one_day_ten_secs', [
+ Timedelta('1 day, 00:00:10'),
+ Timedelta('1 days, 00:00:10'),
+ timedelta(days=1, seconds=10),
+ np.timedelta64(1, 'D') + np.timedelta64(10, 's'),
+ pd.offsets.Day() + pd.offsets.Second(10)])
+ def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs):
+ # GH#6808
+ base = Timestamp('20130102 09:01:12.123456')
+ expected_add = Timestamp('20130103 09:01:22.123456')
+ expected_sub = Timestamp('20130101 09:01:02.123456')
+
+ result = base + one_day_ten_secs
+ assert result == expected_add
+
+ result = base - one_day_ten_secs
+ assert result == expected_sub
+
+ @pytest.mark.parametrize('op', [operator.add, ops.radd])
+ def test_td_add_datetimelike_scalar(self, op):
+ # GH#19738
+ td = Timedelta(10, unit='d')
+
+ result = op(td, datetime(2016, 1, 1))
+ if op is operator.add:
+            # datetime + Timedelta does _not_ call Timedelta.__radd__, so
+            # the radd case returns a plain datetime rather than a
+            # Timestamp; only check the type for the __add__ case.
+ assert isinstance(result, Timestamp)
+ assert result == Timestamp(2016, 1, 11)
+
+ result = op(td, Timestamp('2018-01-12 18:09'))
+ assert isinstance(result, Timestamp)
+ assert result == Timestamp('2018-01-22 18:09')
+
+ result = op(td, np.datetime64('2018-01-12'))
+ assert isinstance(result, Timestamp)
+ assert result == Timestamp('2018-01-22')
+
+ result = op(td, NaT)
+ assert result is NaT
+
+ @pytest.mark.parametrize('op', [operator.add, ops.radd])
+ def test_td_add_td(self, op):
+ td = Timedelta(10, unit='d')
+
+ result = op(td, Timedelta(days=10))
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(days=20)
+
+ @pytest.mark.parametrize('op', [operator.add, ops.radd])
+ def test_td_add_pytimedelta(self, op):
+ td = Timedelta(10, unit='d')
+ result = op(td, timedelta(days=9))
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(days=19)
+
+ @pytest.mark.parametrize('op', [operator.add, ops.radd])
+ def test_td_add_timedelta64(self, op):
+ td = Timedelta(10, unit='d')
+ result = op(td, np.timedelta64(-4, 'D'))
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(days=6)
+
+ @pytest.mark.parametrize('op', [operator.add, ops.radd])
+ def test_td_add_offset(self, op):
+ td = Timedelta(10, unit='d')
+
+ result = op(td, pd.offsets.Hour(6))
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(days=10, hours=6)
+
+ def test_td_sub_td(self):
+ td = Timedelta(10, unit='d')
+ expected = Timedelta(0, unit='ns')
+ result = td - td
+ assert isinstance(result, Timedelta)
+ assert result == expected
+
+ def test_td_sub_pytimedelta(self):
+ td = Timedelta(10, unit='d')
+ expected = Timedelta(0, unit='ns')
+
+ result = td - td.to_pytimedelta()
+ assert isinstance(result, Timedelta)
+ assert result == expected
+
+ result = td.to_pytimedelta() - td
+ assert isinstance(result, Timedelta)
+ assert result == expected
+
+ def test_td_sub_timedelta64(self):
+ td = Timedelta(10, unit='d')
+ expected = Timedelta(0, unit='ns')
+
+ result = td - td.to_timedelta64()
+ assert isinstance(result, Timedelta)
+ assert result == expected
+
+ result = td.to_timedelta64() - td
+ assert isinstance(result, Timedelta)
+ assert result == expected
+
+ def test_td_sub_nat(self):
+ # In this context pd.NaT is treated as timedelta-like
+ td = Timedelta(10, unit='d')
+ result = td - NaT
+ assert result is NaT
+
+ def test_td_sub_td64_nat(self):
+ td = Timedelta(10, unit='d')
+ td_nat = np.timedelta64('NaT')
+
+ result = td - td_nat
+ assert result is NaT
+
+ result = td_nat - td
+ assert result is NaT
+
+ def test_td_sub_offset(self):
+ td = Timedelta(10, unit='d')
+ result = td - pd.offsets.Hour(1)
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(239, unit='h')
+
+ def test_td_add_sub_numeric_raises(self):
+ td = Timedelta(10, unit='d')
+ for other in [2, 2.0, np.int64(2), np.float64(2)]:
+ with pytest.raises(TypeError):
+ td + other
+ with pytest.raises(TypeError):
+ other + td
+ with pytest.raises(TypeError):
+ td - other
+ with pytest.raises(TypeError):
+ other - td
+
+ def test_td_rsub_nat(self):
+ td = Timedelta(10, unit='d')
+ result = NaT - td
+ assert result is NaT
+
+ result = np.datetime64('NaT') - td
+ assert result is NaT
+
+ def test_td_rsub_offset(self):
+ result = pd.offsets.Hour(1) - Timedelta(10, unit='d')
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(-239, unit='h')
+
+ def test_td_sub_timedeltalike_object_dtype_array(self):
+ # GH#21980
+ arr = np.array([Timestamp('20130101 9:01'),
+ Timestamp('20121230 9:02')])
+ exp = np.array([Timestamp('20121231 9:01'),
+ Timestamp('20121229 9:02')])
+ res = arr - Timedelta('1D')
+ tm.assert_numpy_array_equal(res, exp)
+
+ def test_td_sub_mixed_most_timedeltalike_object_dtype_array(self):
+ # GH#21980
+ now = Timestamp.now()
+ arr = np.array([now,
+ Timedelta('1D'),
+ np.timedelta64(2, 'h')])
+ exp = np.array([now - Timedelta('1D'),
+ Timedelta('0D'),
+ np.timedelta64(2, 'h') - Timedelta('1D')])
+ res = arr - Timedelta('1D')
+ tm.assert_numpy_array_equal(res, exp)
+
+ def test_td_rsub_mixed_most_timedeltalike_object_dtype_array(self):
+ # GH#21980
+ now = Timestamp.now()
+ arr = np.array([now,
+ Timedelta('1D'),
+ np.timedelta64(2, 'h')])
+ with pytest.raises(TypeError):
+ Timedelta('1D') - arr
+
+ @pytest.mark.parametrize('op', [operator.add, ops.radd])
+ def test_td_add_timedeltalike_object_dtype_array(self, op):
+ # GH#21980
+ arr = np.array([Timestamp('20130101 9:01'),
+ Timestamp('20121230 9:02')])
+ exp = np.array([Timestamp('20130102 9:01'),
+ Timestamp('20121231 9:02')])
+ res = op(arr, Timedelta('1D'))
+ tm.assert_numpy_array_equal(res, exp)
+
+ @pytest.mark.parametrize('op', [operator.add, ops.radd])
+ def test_td_add_mixed_timedeltalike_object_dtype_array(self, op):
+ # GH#21980
+ now = Timestamp.now()
+ arr = np.array([now,
+ Timedelta('1D')])
+ exp = np.array([now + Timedelta('1D'),
+ Timedelta('2D')])
+ res = op(arr, Timedelta('1D'))
+ tm.assert_numpy_array_equal(res, exp)
+
+
+class TestTimedeltaMultiplicationDivision(object):
+ """
+ Tests for Timedelta methods:
+
+ __mul__, __rmul__,
+ __div__, __rdiv__,
+ __truediv__, __rtruediv__,
+ __floordiv__, __rfloordiv__,
+ __mod__, __rmod__,
+ __divmod__, __rdivmod__
+ """
+
+ # ---------------------------------------------------------------
+ # Timedelta.__mul__, __rmul__
+
+ @pytest.mark.parametrize('td_nat', [NaT,
+ np.timedelta64('NaT', 'ns'),
+ np.timedelta64('NaT')])
+ @pytest.mark.parametrize('op', [operator.mul, ops.rmul])
+ def test_td_mul_nat(self, op, td_nat):
+ # GH#19819
+ td = Timedelta(10, unit='d')
+ with pytest.raises(TypeError):
+ op(td, td_nat)
+
+ @pytest.mark.parametrize('nan', [np.nan, np.float64('NaN'), float('nan')])
+ @pytest.mark.parametrize('op', [operator.mul, ops.rmul])
+ def test_td_mul_nan(self, op, nan):
+ # np.float64('NaN') has a 'dtype' attr, avoid treating as array
+ td = Timedelta(10, unit='d')
+ result = op(td, nan)
+ assert result is NaT
+
+ @pytest.mark.parametrize('op', [operator.mul, ops.rmul])
+ def test_td_mul_scalar(self, op):
+ # GH#19738
+ td = Timedelta(minutes=3)
+
+ result = op(td, 2)
+ assert result == Timedelta(minutes=6)
+
+ result = op(td, 1.5)
+ assert result == Timedelta(minutes=4, seconds=30)
+
+ assert op(td, np.nan) is NaT
+
+ assert op(-1, td).value == -1 * td.value
+ assert op(-1.0, td).value == -1.0 * td.value
+
+ with pytest.raises(TypeError):
+ # timedelta * datetime is gibberish
+ op(td, Timestamp(2016, 1, 2))
+
+ with pytest.raises(TypeError):
+ # invalid multiply with another timedelta
+ op(td, td)
+
+ # ---------------------------------------------------------------
+ # Timedelta.__div__, __truediv__
+
+ def test_td_div_timedeltalike_scalar(self):
+ # GH#19738
+ td = Timedelta(10, unit='d')
+
+ result = td / pd.offsets.Hour(1)
+ assert result == 240
+
+ assert td / td == 1
+ assert td / np.timedelta64(60, 'h') == 4
+
+ assert np.isnan(td / NaT)
+
+ def test_td_div_numeric_scalar(self):
+ # GH#19738
+ td = Timedelta(10, unit='d')
+
+ result = td / 2
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(days=5)
+
+ result = td / 5.0
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(days=2)
+
+ @pytest.mark.parametrize('nan', [np.nan, np.float64('NaN'), float('nan')])
+ def test_td_div_nan(self, nan):
+ # np.float64('NaN') has a 'dtype' attr, avoid treating as array
+ td = Timedelta(10, unit='d')
+ result = td / nan
+ assert result is NaT
+
+ result = td // nan
+ assert result is NaT
+
+ # ---------------------------------------------------------------
+ # Timedelta.__rdiv__
+
+ def test_td_rdiv_timedeltalike_scalar(self):
+ # GH#19738
+ td = Timedelta(10, unit='d')
+ result = pd.offsets.Hour(1) / td
+ assert result == 1 / 240.0
+
+ assert np.timedelta64(60, 'h') / td == 0.25
+
+ # ---------------------------------------------------------------
+ # Timedelta.__floordiv__
+
+ def test_td_floordiv_timedeltalike_scalar(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=4)
+ scalar = Timedelta(hours=3, minutes=3)
+
+ assert td // scalar == 1
+ assert -td // scalar.to_pytimedelta() == -2
+ assert (2 * td) // scalar.to_timedelta64() == 2
+
+ def test_td_floordiv_null_scalar(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=4)
+
+ assert td // np.nan is NaT
+ assert np.isnan(td // NaT)
+ assert np.isnan(td // np.timedelta64('NaT'))
+
+ def test_td_floordiv_offsets(self):
+ # GH#19738
+ td = Timedelta(hours=3, minutes=4)
+ assert td // pd.offsets.Hour(1) == 3
+ assert td // pd.offsets.Minute(2) == 92
+
+ def test_td_floordiv_invalid_scalar(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=4)
+
+ with pytest.raises(TypeError):
+            td // np.datetime64('2016-01-01', 'us')
+
+ def test_td_floordiv_numeric_scalar(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=4)
+
+ expected = Timedelta(hours=1, minutes=32)
+ assert td // 2 == expected
+ assert td // 2.0 == expected
+ assert td // np.float64(2.0) == expected
+ assert td // np.int32(2.0) == expected
+ assert td // np.uint8(2.0) == expected
+
+ def test_td_floordiv_timedeltalike_array(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=4)
+ scalar = Timedelta(hours=3, minutes=3)
+
+ # Array-like others
+ assert td // np.array(scalar.to_timedelta64()) == 1
+
+ res = (3 * td) // np.array([scalar.to_timedelta64()])
+ expected = np.array([3], dtype=np.int64)
+ tm.assert_numpy_array_equal(res, expected)
+
+ res = (10 * td) // np.array([scalar.to_timedelta64(),
+ np.timedelta64('NaT')])
+ expected = np.array([10, np.nan])
+ tm.assert_numpy_array_equal(res, expected)
+
+ def test_td_floordiv_numeric_series(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=4)
+ ser = pd.Series([1], dtype=np.int64)
+ res = td // ser
+ assert res.dtype.kind == 'm'
+
+ # ---------------------------------------------------------------
+ # Timedelta.__rfloordiv__
+
+ def test_td_rfloordiv_timedeltalike_scalar(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=3)
+ scalar = Timedelta(hours=3, minutes=4)
+
+ # scalar others
+ # x // Timedelta is defined only for timedelta-like x. int-like,
+ # float-like, and date-like, in particular, should all either
+ # a) raise TypeError directly or
+ # b) return NotImplemented, following which the reversed
+ # operation will raise TypeError.
+ assert td.__rfloordiv__(scalar) == 1
+ assert (-td).__rfloordiv__(scalar.to_pytimedelta()) == -2
+ assert (2 * td).__rfloordiv__(scalar.to_timedelta64()) == 0
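+        # e.g. (illustrative) td.__rfloordiv__(3.5) returns NotImplemented,
+        # so Python's fallback for ``3.5 // td`` ends in a TypeError; see
+        # test_td_rfloordiv_numeric_scalar below.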
+
+ def test_td_rfloordiv_null_scalar(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=3)
+
+ assert np.isnan(td.__rfloordiv__(NaT))
+ assert np.isnan(td.__rfloordiv__(np.timedelta64('NaT')))
+
+ def test_td_rfloordiv_offsets(self):
+ # GH#19738
+ assert pd.offsets.Hour(1) // Timedelta(minutes=25) == 2
+
+ def test_td_rfloordiv_invalid_scalar(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=3)
+
+        dt64 = np.datetime64('2016-01-01', 'us')
+ with pytest.raises(TypeError):
+ td.__rfloordiv__(dt64)
+
+ def test_td_rfloordiv_numeric_scalar(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=3)
+
+ assert td.__rfloordiv__(np.nan) is NotImplemented
+ assert td.__rfloordiv__(3.5) is NotImplemented
+ assert td.__rfloordiv__(2) is NotImplemented
+
+ with pytest.raises(TypeError):
+ td.__rfloordiv__(np.float64(2.0))
+ with pytest.raises(TypeError):
+ td.__rfloordiv__(np.uint8(9))
+ with tm.assert_produces_warning(FutureWarning):
+ # GH-19761: Change to TypeError.
+ td.__rfloordiv__(np.int32(2.0))
+
+ def test_td_rfloordiv_timedeltalike_array(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=3)
+ scalar = Timedelta(hours=3, minutes=4)
+
+ # Array-like others
+ assert td.__rfloordiv__(np.array(scalar.to_timedelta64())) == 1
+
+ res = td.__rfloordiv__(np.array([(3 * scalar).to_timedelta64()]))
+ expected = np.array([3], dtype=np.int64)
+ tm.assert_numpy_array_equal(res, expected)
+
+ arr = np.array([(10 * scalar).to_timedelta64(),
+ np.timedelta64('NaT')])
+ res = td.__rfloordiv__(arr)
+ expected = np.array([10, np.nan])
+ tm.assert_numpy_array_equal(res, expected)
+
+ def test_td_rfloordiv_numeric_series(self):
+ # GH#18846
+ td = Timedelta(hours=3, minutes=3)
+ ser = pd.Series([1], dtype=np.int64)
+ res = td.__rfloordiv__(ser)
+ assert res is NotImplemented
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # TODO: GH-19761. Change to TypeError.
+ ser // td
+
+ # ----------------------------------------------------------------
+ # Timedelta.__mod__, __rmod__
+
+ def test_mod_timedeltalike(self):
+ # GH#19365
+ td = Timedelta(hours=37)
+
+ # Timedelta-like others
+ result = td % Timedelta(hours=6)
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(hours=1)
+
+ result = td % timedelta(minutes=60)
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(0)
+
+ result = td % NaT
+ assert result is NaT
+
+ def test_mod_timedelta64_nat(self):
+ # GH#19365
+ td = Timedelta(hours=37)
+
+ result = td % np.timedelta64('NaT', 'ns')
+ assert result is NaT
+
+ def test_mod_timedelta64(self):
+ # GH#19365
+ td = Timedelta(hours=37)
+
+ result = td % np.timedelta64(2, 'h')
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(hours=1)
+
+ def test_mod_offset(self):
+ # GH#19365
+ td = Timedelta(hours=37)
+
+ result = td % pd.offsets.Hour(5)
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(hours=2)
+
+ def test_mod_numeric(self):
+ # GH#19365
+ td = Timedelta(hours=37)
+
+ # Numeric Others
+ result = td % 2
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(0)
+
+ result = td % 1e12
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(minutes=3, seconds=20)
+
+ result = td % int(1e12)
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(minutes=3, seconds=20)
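+        # (illustrative) numeric mods act on the nanosecond value:
+        # 37h == 133,200e9 ns, and 133,200e9 % 1e12 == 200e9 ns, i.e. 3m20s.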
+
+ def test_mod_invalid(self):
+ # GH#19365
+ td = Timedelta(hours=37)
+
+ with pytest.raises(TypeError):
+ td % Timestamp('2018-01-22')
+
+ with pytest.raises(TypeError):
+ td % []
+
+ def test_rmod_pytimedelta(self):
+ # GH#19365
+ td = Timedelta(minutes=3)
+
+ result = timedelta(minutes=4) % td
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(minutes=1)
+
+ def test_rmod_timedelta64(self):
+ # GH#19365
+ td = Timedelta(minutes=3)
+ result = np.timedelta64(5, 'm') % td
+ assert isinstance(result, Timedelta)
+ assert result == Timedelta(minutes=2)
+
+ def test_rmod_invalid(self):
+ # GH#19365
+ td = Timedelta(minutes=3)
+
+ with pytest.raises(TypeError):
+ Timestamp('2018-01-22') % td
+
+ with pytest.raises(TypeError):
+ 15 % td
+
+ with pytest.raises(TypeError):
+ 16.0 % td
+
+ with pytest.raises(TypeError):
+ np.array([22, 24]) % td
+
+ # ----------------------------------------------------------------
+ # Timedelta.__divmod__, __rdivmod__
+
+ def test_divmod_numeric(self):
+ # GH#19365
+ td = Timedelta(days=2, hours=6)
+
+ result = divmod(td, 53 * 3600 * 1e9)
+ assert result[0] == Timedelta(1, unit='ns')
+ assert isinstance(result[1], Timedelta)
+ assert result[1] == Timedelta(hours=1)
+
+ assert result
+ result = divmod(td, np.nan)
+ assert result[0] is NaT
+ assert result[1] is NaT
+
+ def test_divmod(self):
+ # GH#19365
+ td = Timedelta(days=2, hours=6)
+
+ result = divmod(td, timedelta(days=1))
+ assert result[0] == 2
+ assert isinstance(result[1], Timedelta)
+ assert result[1] == Timedelta(hours=6)
+
+ result = divmod(td, 54)
+ assert result[0] == Timedelta(hours=1)
+ assert isinstance(result[1], Timedelta)
+ assert result[1] == Timedelta(0)
+
+ result = divmod(td, NaT)
+ assert np.isnan(result[0])
+ assert result[1] is NaT
+
+ def test_divmod_offset(self):
+ # GH#19365
+ td = Timedelta(days=2, hours=6)
+
+ result = divmod(td, pd.offsets.Hour(-4))
+ assert result[0] == -14
+ assert isinstance(result[1], Timedelta)
+ assert result[1] == Timedelta(hours=-2)
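+        # (illustrative) Python floor-division rounds toward -inf, so
+        # 54h // -4h == -14 and the remainder 54 - (-14 * -4) == -2 hours
+        # carries the divisor's sign.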
+
+ def test_divmod_invalid(self):
+ # GH#19365
+ td = Timedelta(days=2, hours=6)
+
+ with pytest.raises(TypeError):
+ divmod(td, Timestamp('2018-01-22'))
+
+ def test_rdivmod_pytimedelta(self):
+ # GH#19365
+ result = divmod(timedelta(days=2, hours=6), Timedelta(days=1))
+ assert result[0] == 2
+ assert isinstance(result[1], Timedelta)
+ assert result[1] == Timedelta(hours=6)
+
+ def test_rdivmod_offset(self):
+ result = divmod(pd.offsets.Hour(54), Timedelta(hours=-4))
+ assert result[0] == -14
+ assert isinstance(result[1], Timedelta)
+ assert result[1] == Timedelta(hours=-2)
+
+ def test_rdivmod_invalid(self):
+ # GH#19365
+ td = Timedelta(minutes=3)
+
+ with pytest.raises(TypeError):
+ divmod(Timestamp('2018-01-22'), td)
+
+ with pytest.raises(TypeError):
+ divmod(15, td)
+
+ with pytest.raises(TypeError):
+ divmod(16.0, td)
+
+ with pytest.raises(TypeError):
+ divmod(np.array([22, 24]), td)
+
+ # ----------------------------------------------------------------
+
+ @pytest.mark.parametrize('op', [
+ operator.mul,
+ ops.rmul,
+ operator.truediv,
+ ops.rdiv,
+ ops.rsub])
+ @pytest.mark.parametrize('arr', [
+ np.array([Timestamp('20130101 9:01'), Timestamp('20121230 9:02')]),
+ np.array([Timestamp.now(), Timedelta('1D')])
+ ])
+ def test_td_op_timedelta_timedeltalike_array(self, op, arr):
+ with pytest.raises(TypeError):
+ op(arr, Timedelta('1D'))
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_construction.py b/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_construction.py
new file mode 100644
index 00000000000..880eca91474
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_construction.py
@@ -0,0 +1,210 @@
+# -*- coding: utf-8 -*-
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+from pandas import Timedelta, offsets, to_timedelta
+
+
+def test_construction():
+ expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8')
+ assert Timedelta(10, unit='d').value == expected
+ assert Timedelta(10.0, unit='d').value == expected
+ assert Timedelta('10 days').value == expected
+ assert Timedelta(days=10).value == expected
+ assert Timedelta(days=10.0).value == expected
+
+ expected += np.timedelta64(10, 's').astype('m8[ns]').view('i8')
+ assert Timedelta('10 days 00:00:10').value == expected
+ assert Timedelta(days=10, seconds=10).value == expected
+ assert Timedelta(days=10, milliseconds=10 * 1000).value == expected
+ assert Timedelta(days=10,
+ microseconds=10 * 1000 * 1000).value == expected
+
+ # rounding cases
+ assert Timedelta(82739999850000).value == 82739999850000
+ assert ('0 days 22:58:59.999850' in str(Timedelta(82739999850000)))
+ assert Timedelta(123072001000000).value == 123072001000000
+ assert ('1 days 10:11:12.001' in str(Timedelta(123072001000000)))
+
+ # string conversion with/without leading zero
+ # GH#9570
+ assert Timedelta('0:00:00') == timedelta(hours=0)
+ assert Timedelta('00:00:00') == timedelta(hours=0)
+ assert Timedelta('-1:00:00') == -timedelta(hours=1)
+ assert Timedelta('-01:00:00') == -timedelta(hours=1)
+
+ # more strings & abbrevs
+ # GH#8190
+ assert Timedelta('1 h') == timedelta(hours=1)
+ assert Timedelta('1 hour') == timedelta(hours=1)
+ assert Timedelta('1 hr') == timedelta(hours=1)
+ assert Timedelta('1 hours') == timedelta(hours=1)
+ assert Timedelta('-1 hours') == -timedelta(hours=1)
+ assert Timedelta('1 m') == timedelta(minutes=1)
+ assert Timedelta('1.5 m') == timedelta(seconds=90)
+ assert Timedelta('1 minute') == timedelta(minutes=1)
+ assert Timedelta('1 minutes') == timedelta(minutes=1)
+ assert Timedelta('1 s') == timedelta(seconds=1)
+ assert Timedelta('1 second') == timedelta(seconds=1)
+ assert Timedelta('1 seconds') == timedelta(seconds=1)
+ assert Timedelta('1 ms') == timedelta(milliseconds=1)
+ assert Timedelta('1 milli') == timedelta(milliseconds=1)
+ assert Timedelta('1 millisecond') == timedelta(milliseconds=1)
+ assert Timedelta('1 us') == timedelta(microseconds=1)
+ assert Timedelta('1 micros') == timedelta(microseconds=1)
+ assert Timedelta('1 microsecond') == timedelta(microseconds=1)
+ assert Timedelta('1.5 microsecond') == Timedelta('00:00:00.000001500')
+ assert Timedelta('1 ns') == Timedelta('00:00:00.000000001')
+ assert Timedelta('1 nano') == Timedelta('00:00:00.000000001')
+ assert Timedelta('1 nanosecond') == Timedelta('00:00:00.000000001')
+
+ # combos
+ assert Timedelta('10 days 1 hour') == timedelta(days=10, hours=1)
+ assert Timedelta('10 days 1 h') == timedelta(days=10, hours=1)
+ assert Timedelta('10 days 1 h 1m 1s') == timedelta(
+ days=10, hours=1, minutes=1, seconds=1)
+ assert Timedelta('-10 days 1 h 1m 1s') == -timedelta(
+ days=10, hours=1, minutes=1, seconds=1)
+ assert Timedelta('-10 days 1 h 1m 1s') == -timedelta(
+ days=10, hours=1, minutes=1, seconds=1)
+ assert Timedelta('-10 days 1 h 1m 1s 3us') == -timedelta(
+ days=10, hours=1, minutes=1, seconds=1, microseconds=3)
+ assert Timedelta('-10 days 1 h 1.5m 1s 3us') == -timedelta(
+ days=10, hours=1, minutes=1, seconds=31, microseconds=3)
+
+ # Currently invalid as it has a - on the hh:mm:dd part
+ # (only allowed on the days)
+ with pytest.raises(ValueError):
+ Timedelta('-10 days -1 h 1.5m 1s 3us')
+
+ # only leading neg signs are allowed
+ with pytest.raises(ValueError):
+ Timedelta('10 days -1 h 1.5m 1s 3us')
+
+ # no units specified
+ with pytest.raises(ValueError):
+ Timedelta('3.1415')
+
+ # invalid construction
+ with pytest.raises(ValueError, match="cannot construct a Timedelta"):
+ Timedelta()
+
+ with pytest.raises(ValueError, match="unit abbreviation w/o a number"):
+ Timedelta('foo')
+
+ msg = ("cannot construct a Timedelta from "
+ "the passed arguments, allowed keywords are ")
+ with pytest.raises(ValueError, match=msg):
+ Timedelta(day=10)
+
+ # floats
+ expected = np.timedelta64(
+ 10, 's').astype('m8[ns]').view('i8') + np.timedelta64(
+ 500, 'ms').astype('m8[ns]').view('i8')
+ assert Timedelta(10.5, unit='s').value == expected
+
+ # offset
+ assert to_timedelta(offsets.Hour(2)) == Timedelta(hours=2)
+ assert Timedelta(offsets.Hour(2)) == Timedelta(hours=2)
+ assert Timedelta(offsets.Second(2)) == Timedelta(seconds=2)
+
+ # GH#11995: unicode
+ expected = Timedelta('1H')
+ result = Timedelta(u'1H')
+ assert result == expected
+ assert to_timedelta(offsets.Hour(2)) == Timedelta(u'0 days, 02:00:00')
+
+ with pytest.raises(ValueError):
+ Timedelta(u'foo bar')
+
+
[email protected]('item', list({'days': 'D',
+ 'seconds': 's',
+ 'microseconds': 'us',
+ 'milliseconds': 'ms',
+ 'minutes': 'm',
+ 'hours': 'h',
+ 'weeks': 'W'}.items()))
[email protected]('npdtype', [np.int64, np.int32, np.int16,
+ np.float64, np.float32, np.float16])
+def test_td_construction_with_np_dtypes(npdtype, item):
+ # GH#8757: test construction with np dtypes
+ pykwarg, npkwarg = item
+ expected = np.timedelta64(1, npkwarg).astype('m8[ns]').view('i8')
+ assert Timedelta(**{pykwarg: npdtype(1)}).value == expected
+
+
[email protected]('val', [
+    '1s', '-1s', '1us', '-1us', '1 day', '-1 day',
+ '-23:59:59.999999', '-1 days +23:59:59.999999', '-1ns',
+ '1ns', '-23:59:59.999999999'])
+def test_td_from_repr_roundtrip(val):
+ # round-trip both for string and value
+ td = Timedelta(val)
+ assert Timedelta(td.value) == td
+
+ # str does not normally display nanos
+ if not td.nanoseconds:
+ assert Timedelta(str(td)) == td
+ assert Timedelta(td._repr_base(format='all')) == td
+
+
+def test_overflow_on_construction():
+ # GH#3374
+ value = Timedelta('1day').value * 20169940
+ with pytest.raises(OverflowError):
+ Timedelta(value)
+
+ # xref GH#17637
+ with pytest.raises(OverflowError):
+ Timedelta(7 * 19999, unit='D')
+
+ with pytest.raises(OverflowError):
+ Timedelta(timedelta(days=13 * 19999))
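+
+# Background for the bounds above (illustrative): Timedelta stores int64
+# nanoseconds, so it can represent at most about 2**63 ns ~= 106,752 days
+# (~292 years); both 7 * 19999 and 13 * 19999 days exceed that.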
+
+
[email protected]('fmt,exp', [
+ ('P6DT0H50M3.010010012S', Timedelta(days=6, minutes=50, seconds=3,
+ milliseconds=10, microseconds=10,
+ nanoseconds=12)),
+ ('P-6DT0H50M3.010010012S', Timedelta(days=-6, minutes=50, seconds=3,
+ milliseconds=10, microseconds=10,
+ nanoseconds=12)),
+ ('P4DT12H30M5S', Timedelta(days=4, hours=12, minutes=30, seconds=5)),
+ ('P0DT0H0M0.000000123S', Timedelta(nanoseconds=123)),
+ ('P0DT0H0M0.00001S', Timedelta(microseconds=10)),
+ ('P0DT0H0M0.001S', Timedelta(milliseconds=1)),
+ ('P0DT0H1M0S', Timedelta(minutes=1)),
+ ('P1DT25H61M61S', Timedelta(days=1, hours=25, minutes=61, seconds=61))
+])
+def test_iso_constructor(fmt, exp):
+ assert Timedelta(fmt) == exp
+
+
[email protected]('fmt', [
+    'PPPPPPPPPPPP', 'PDTHMS', 'P0DT999H999M999S',
+ 'P1DT0H0M0.0000000000000S', 'P1DT0H0M00000000000S',
+ 'P1DT0H0M0.S'])
+def test_iso_constructor_raises(fmt):
+ with pytest.raises(ValueError, match=('Invalid ISO 8601 Duration '
+ 'format - {}'.format(fmt))):
+ Timedelta(fmt)
+
+
[email protected]('constructed_td, conversion', [
+ (Timedelta(nanoseconds=100), '100ns'),
+ (Timedelta(days=1, hours=1, minutes=1, weeks=1, seconds=1, milliseconds=1,
+ microseconds=1, nanoseconds=1), 694861001001001),
+ (Timedelta(microseconds=1) + Timedelta(nanoseconds=1), '1us1ns'),
+ (Timedelta(microseconds=1) - Timedelta(nanoseconds=1), '999ns'),
+ (Timedelta(microseconds=1) + 5 * Timedelta(nanoseconds=-2), '990ns')])
+def test_td_constructor_on_nanoseconds(constructed_td, conversion):
+ # GH#9273
+ assert constructed_td == Timedelta(conversion)
+
+
+def test_td_constructor_value_error():
+ with pytest.raises(TypeError):
+ Timedelta(nanoseconds='abc')
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_formats.py b/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_formats.py
new file mode 100644
index 00000000000..0d0b24f192f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_formats.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+from pandas import Timedelta
+
+
[email protected]('td, expected_repr', [
+ (Timedelta(10, unit='d'), "Timedelta('10 days 00:00:00')"),
+ (Timedelta(10, unit='s'), "Timedelta('0 days 00:00:10')"),
+ (Timedelta(10, unit='ms'), "Timedelta('0 days 00:00:00.010000')"),
+ (Timedelta(-10, unit='ms'), "Timedelta('-1 days +23:59:59.990000')")])
+def test_repr(td, expected_repr):
+ assert repr(td) == expected_repr
+
+
[email protected]('td, expected_iso', [
+ (Timedelta(days=6, minutes=50, seconds=3, milliseconds=10, microseconds=10,
+ nanoseconds=12), 'P6DT0H50M3.010010012S'),
+ (Timedelta(days=4, hours=12, minutes=30, seconds=5), 'P4DT12H30M5S'),
+ (Timedelta(nanoseconds=123), 'P0DT0H0M0.000000123S'),
+ # trim nano
+ (Timedelta(microseconds=10), 'P0DT0H0M0.00001S'),
+ # trim micro
+ (Timedelta(milliseconds=1), 'P0DT0H0M0.001S'),
+ # don't strip every 0
+ (Timedelta(minutes=1), 'P0DT0H1M0S')])
+def test_isoformat(td, expected_iso):
+ assert td.isoformat() == expected_iso
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_timedelta.py b/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_timedelta.py
new file mode 100644
index 00000000000..e1838e0160f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timedelta/test_timedelta.py
@@ -0,0 +1,715 @@
+""" test the scalar Timedelta """
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import NaT, iNaT
+import pandas.compat as compat
+
+import pandas as pd
+from pandas import (
+ Series, Timedelta, TimedeltaIndex, timedelta_range, to_timedelta)
+import pandas.util.testing as tm
+
+
+class TestTimedeltaArithmetic(object):
+
+ def test_arithmetic_overflow(self):
+ with pytest.raises(OverflowError):
+ pd.Timestamp('1700-01-01') + pd.Timedelta(13 * 19999, unit='D')
+
+ with pytest.raises(OverflowError):
+ pd.Timestamp('1700-01-01') + timedelta(days=13 * 19999)
+
+ def test_array_timedelta_floordiv(self):
+ # https://github.com/pandas-dev/pandas/issues/19761
+ ints = pd.date_range('2012-10-08', periods=4, freq='D').view('i8')
+ msg = r"Use 'array // timedelta.value'"
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ints // pd.Timedelta(1, unit='s')
+
+ assert msg in str(m[0].message)
+ expected = np.array([1349654400, 1349740800, 1349827200, 1349913600],
+ dtype='i8')
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_ops_error_str(self):
+ # GH 13624
+ td = Timedelta('1 day')
+
+ for left, right in [(td, 'a'), ('a', td)]:
+
+ with pytest.raises(TypeError):
+ left + right
+
+ with pytest.raises(TypeError):
+ left > right
+
+ assert not left == right
+ assert left != right
+
+ def test_ops_notimplemented(self):
+ class Other(object):
+ pass
+
+ other = Other()
+
+ td = Timedelta('1 day')
+ assert td.__add__(other) is NotImplemented
+ assert td.__sub__(other) is NotImplemented
+ assert td.__truediv__(other) is NotImplemented
+ assert td.__mul__(other) is NotImplemented
+ assert td.__floordiv__(other) is NotImplemented
+
+ def test_unary_ops(self):
+ td = Timedelta(10, unit='d')
+
+ # __neg__, __pos__
+ assert -td == Timedelta(-10, unit='d')
+ assert -td == Timedelta('-10d')
+ assert +td == Timedelta(10, unit='d')
+
+ # __abs__, __abs__(__neg__)
+ assert abs(td) == td
+ assert abs(-td) == td
+ assert abs(-td) == Timedelta('10d')
+
+
+class TestTimedeltaComparison(object):
+ def test_compare_tick(self, tick_classes):
+ cls = tick_classes
+
+ off = cls(4)
+ td = off.delta
+ assert isinstance(td, Timedelta)
+
+ assert td == off
+ assert not td != off
+ assert td <= off
+ assert td >= off
+ assert not td < off
+ assert not td > off
+
+ assert not td == 2 * off
+ assert td != 2 * off
+ assert td <= 2 * off
+ assert td < 2 * off
+ assert not td >= 2 * off
+ assert not td > 2 * off
+
+ def test_comparison_object_array(self):
+ # analogous to GH#15183
+ td = Timedelta('2 days')
+ other = Timedelta('3 hours')
+
+ arr = np.array([other, td], dtype=object)
+ res = arr == td
+ expected = np.array([False, True], dtype=bool)
+ assert (res == expected).all()
+
+ # 2D case
+ arr = np.array([[other, td],
+ [td, other]],
+ dtype=object)
+ res = arr != td
+ expected = np.array([[True, False], [False, True]], dtype=bool)
+ assert res.shape == expected.shape
+ assert (res == expected).all()
+
+ def test_compare_timedelta_ndarray(self):
+ # GH11835
+ periods = [Timedelta('0 days 01:00:00'), Timedelta('0 days 01:00:00')]
+ arr = np.array(periods)
+ result = arr[0] > arr
+ expected = np.array([False, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.skip(reason="GH#20829 is reverted until after 0.24.0")
+ def test_compare_custom_object(self):
+ """
+        Make sure unsupported operations on Timedelta return NotImplemented
+        and yield to the other operand (GH#20829).
+ """
+ class CustomClass(object):
+
+ def __init__(self, cmp_result=None):
+ self.cmp_result = cmp_result
+
+ def generic_result(self):
+ if self.cmp_result is None:
+ return NotImplemented
+ else:
+ return self.cmp_result
+
+ def __eq__(self, other):
+ return self.generic_result()
+
+ def __gt__(self, other):
+ return self.generic_result()
+
+ t = Timedelta('1s')
+
+ assert not (t == "string")
+ assert not (t == 1)
+ assert not (t == CustomClass())
+ assert not (t == CustomClass(cmp_result=False))
+
+ assert t < CustomClass(cmp_result=True)
+ assert not (t < CustomClass(cmp_result=False))
+
+ assert t == CustomClass(cmp_result=True)
+
+ @pytest.mark.parametrize("val", ["string", 1])
+ def test_compare_unknown_type(self, val):
+ # GH20829
+ t = Timedelta('1s')
+ with pytest.raises(TypeError):
+ t >= val
+ with pytest.raises(TypeError):
+ t > val
+ with pytest.raises(TypeError):
+ t <= val
+ with pytest.raises(TypeError):
+ t < val
+
+
+class TestTimedeltas(object):
+
+ @pytest.mark.parametrize("unit, value, expected", [
+ ('us', 9.999, 9999), ('ms', 9.999999, 9999999),
+ ('s', 9.999999999, 9999999999)])
+ def test_rounding_on_int_unit_construction(self, unit, value, expected):
+ # GH 12690
+ result = Timedelta(value, unit=unit)
+ assert result.value == expected
+ result = Timedelta(str(value) + unit)
+ assert result.value == expected
+
+ def test_total_seconds_scalar(self):
+ # see gh-10939
+ rng = Timedelta('1 days, 10:11:12.100123456')
+ expt = 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9
+ tm.assert_almost_equal(rng.total_seconds(), expt)
+
+ rng = Timedelta(np.nan)
+ assert np.isnan(rng.total_seconds())
+
+ def test_conversion(self):
+
+ for td in [Timedelta(10, unit='d'),
+ Timedelta('1 days, 10:11:12.012345')]:
+ pydt = td.to_pytimedelta()
+ assert td == Timedelta(pydt)
+ assert td == pydt
+ assert (isinstance(pydt, timedelta) and not isinstance(
+ pydt, Timedelta))
+
+ assert td == np.timedelta64(td.value, 'ns')
+ td64 = td.to_timedelta64()
+
+ assert td64 == np.timedelta64(td.value, 'ns')
+ assert td == td64
+
+ assert isinstance(td64, np.timedelta64)
+
+        # this is NOT equal and cannot be round-tripped (because of the nanos)
+ td = Timedelta('1 days, 10:11:12.012345678')
+ assert td != td.to_pytimedelta()
+
+ def test_freq_conversion(self):
+
+ # truediv
+ td = Timedelta('1 days 2 hours 3 ns')
+ result = td / np.timedelta64(1, 'D')
+ assert result == td.value / float(86400 * 1e9)
+ result = td / np.timedelta64(1, 's')
+ assert result == td.value / float(1e9)
+ result = td / np.timedelta64(1, 'ns')
+ assert result == td.value
+
+ # floordiv
+ td = Timedelta('1 days 2 hours 3 ns')
+ result = td // np.timedelta64(1, 'D')
+ assert result == 1
+ result = td // np.timedelta64(1, 's')
+ assert result == 93600
+ result = td // np.timedelta64(1, 'ns')
+ assert result == td.value
+
+ def test_fields(self):
+ def check(value):
+            # check that the value is int/long-like
+ assert isinstance(value, (int, compat.long))
+
+ # compat to datetime.timedelta
+ rng = to_timedelta('1 days, 10:11:12')
+ assert rng.days == 1
+ assert rng.seconds == 10 * 3600 + 11 * 60 + 12
+ assert rng.microseconds == 0
+ assert rng.nanoseconds == 0
+
+ pytest.raises(AttributeError, lambda: rng.hours)
+ pytest.raises(AttributeError, lambda: rng.minutes)
+ pytest.raises(AttributeError, lambda: rng.milliseconds)
+
+ # GH 10050
+ check(rng.days)
+ check(rng.seconds)
+ check(rng.microseconds)
+ check(rng.nanoseconds)
+
+ td = Timedelta('-1 days, 10:11:12')
+ assert abs(td) == Timedelta('13:48:48')
+ assert str(td) == "-1 days +10:11:12"
+ assert -td == Timedelta('0 days 13:48:48')
+ assert -Timedelta('-1 days, 10:11:12').value == 49728000000000
+ assert Timedelta('-1 days, 10:11:12').value == -49728000000000
+
+ rng = to_timedelta('-1 days, 10:11:12.100123456')
+ assert rng.days == -1
+ assert rng.seconds == 10 * 3600 + 11 * 60 + 12
+ assert rng.microseconds == 100 * 1000 + 123
+ assert rng.nanoseconds == 456
+ pytest.raises(AttributeError, lambda: rng.hours)
+ pytest.raises(AttributeError, lambda: rng.minutes)
+ pytest.raises(AttributeError, lambda: rng.milliseconds)
+
+ # components
+ tup = pd.to_timedelta(-1, 'us').components
+ assert tup.days == -1
+ assert tup.hours == 23
+ assert tup.minutes == 59
+ assert tup.seconds == 59
+ assert tup.milliseconds == 999
+ assert tup.microseconds == 999
+ assert tup.nanoseconds == 0
+
+ # GH 10050
+ check(tup.days)
+ check(tup.hours)
+ check(tup.minutes)
+ check(tup.seconds)
+ check(tup.milliseconds)
+ check(tup.microseconds)
+ check(tup.nanoseconds)
+
+ tup = Timedelta('-1 days 1 us').components
+ assert tup.days == -2
+ assert tup.hours == 23
+ assert tup.minutes == 59
+ assert tup.seconds == 59
+ assert tup.milliseconds == 999
+ assert tup.microseconds == 999
+ assert tup.nanoseconds == 0
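+        # (illustrative) like datetime.timedelta, .components keeps every
+        # field non-negative except days, so -(1 day 1 us) normalizes to
+        # -2 days +23:59:59.999999.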
+
+ def test_iso_conversion(self):
+ # GH #21877
+ expected = Timedelta(1, unit='s')
+ assert to_timedelta('P0DT0H0M1S') == expected
+
+ def test_nat_converters(self):
+ result = to_timedelta('nat', box=False)
+ assert result.dtype.kind == 'm'
+ assert result.astype('int64') == iNaT
+
+ result = to_timedelta('nan', box=False)
+ assert result.dtype.kind == 'm'
+ assert result.astype('int64') == iNaT
+
+ @pytest.mark.parametrize('units, np_unit',
+ [(['Y', 'y'], 'Y'),
+ (['M'], 'M'),
+ (['W', 'w'], 'W'),
+ (['D', 'd', 'days', 'day', 'Days', 'Day'], 'D'),
+ (['m', 'minute', 'min', 'minutes', 't',
+ 'Minute', 'Min', 'Minutes', 'T'], 'm'),
+ (['s', 'seconds', 'sec', 'second',
+ 'S', 'Seconds', 'Sec', 'Second'], 's'),
+ (['ms', 'milliseconds', 'millisecond', 'milli',
+ 'millis', 'l', 'MS', 'Milliseconds',
+ 'Millisecond', 'Milli', 'Millis', 'L'], 'ms'),
+ (['us', 'microseconds', 'microsecond', 'micro',
+ 'micros', 'u', 'US', 'Microseconds',
+ 'Microsecond', 'Micro', 'Micros', 'U'], 'us'),
+ (['ns', 'nanoseconds', 'nanosecond', 'nano',
+ 'nanos', 'n', 'NS', 'Nanoseconds',
+ 'Nanosecond', 'Nano', 'Nanos', 'N'], 'ns')])
+ @pytest.mark.parametrize('wrapper', [np.array, list, pd.Index])
+ def test_unit_parser(self, units, np_unit, wrapper):
+ # validate all units, GH 6855, GH 21762
+ for unit in units:
+ # array-likes
+ expected = TimedeltaIndex([np.timedelta64(i, np_unit)
+ for i in np.arange(5).tolist()])
+ result = to_timedelta(wrapper(range(5)), unit=unit)
+ tm.assert_index_equal(result, expected)
+ result = TimedeltaIndex(wrapper(range(5)), unit=unit)
+ tm.assert_index_equal(result, expected)
+
+ if unit == 'M':
+ # M is treated as minutes in string repr
+ expected = TimedeltaIndex([np.timedelta64(i, 'm')
+ for i in np.arange(5).tolist()])
+
+ str_repr = ['{}{}'.format(x, unit) for x in np.arange(5)]
+ result = to_timedelta(wrapper(str_repr))
+ tm.assert_index_equal(result, expected)
+ result = TimedeltaIndex(wrapper(str_repr))
+ tm.assert_index_equal(result, expected)
+
+ # scalar
+ expected = Timedelta(np.timedelta64(2, np_unit).astype(
+ 'timedelta64[ns]'))
+
+ result = to_timedelta(2, unit=unit)
+ assert result == expected
+ result = Timedelta(2, unit=unit)
+ assert result == expected
+
+ if unit == 'M':
+ expected = Timedelta(np.timedelta64(2, 'm').astype(
+ 'timedelta64[ns]'))
+
+ result = to_timedelta('2{}'.format(unit))
+ assert result == expected
+ result = Timedelta('2{}'.format(unit))
+ assert result == expected
+
+ def test_numeric_conversions(self):
+ assert Timedelta(0) == np.timedelta64(0, 'ns')
+ assert Timedelta(10) == np.timedelta64(10, 'ns')
+ assert Timedelta(10, unit='ns') == np.timedelta64(10, 'ns')
+
+ assert Timedelta(10, unit='us') == np.timedelta64(10, 'us')
+ assert Timedelta(10, unit='ms') == np.timedelta64(10, 'ms')
+ assert Timedelta(10, unit='s') == np.timedelta64(10, 's')
+ assert Timedelta(10, unit='d') == np.timedelta64(10, 'D')
+
+ def test_timedelta_conversions(self):
+ assert (Timedelta(timedelta(seconds=1)) ==
+ np.timedelta64(1, 's').astype('m8[ns]'))
+ assert (Timedelta(timedelta(microseconds=1)) ==
+ np.timedelta64(1, 'us').astype('m8[ns]'))
+ assert (Timedelta(timedelta(days=1)) ==
+ np.timedelta64(1, 'D').astype('m8[ns]'))
+
+ def test_round(self):
+
+ t1 = Timedelta('1 days 02:34:56.789123456')
+ t2 = Timedelta('-1 days 02:34:56.789123456')
+
+ for (freq, s1, s2) in [('N', t1, t2),
+ ('U', Timedelta('1 days 02:34:56.789123000'),
+ Timedelta('-1 days 02:34:56.789123000')),
+ ('L', Timedelta('1 days 02:34:56.789000000'),
+ Timedelta('-1 days 02:34:56.789000000')),
+ ('S', Timedelta('1 days 02:34:57'),
+ Timedelta('-1 days 02:34:57')),
+ ('2S', Timedelta('1 days 02:34:56'),
+ Timedelta('-1 days 02:34:56')),
+ ('5S', Timedelta('1 days 02:34:55'),
+ Timedelta('-1 days 02:34:55')),
+ ('T', Timedelta('1 days 02:35:00'),
+ Timedelta('-1 days 02:35:00')),
+ ('12T', Timedelta('1 days 02:36:00'),
+ Timedelta('-1 days 02:36:00')),
+ ('H', Timedelta('1 days 03:00:00'),
+ Timedelta('-1 days 03:00:00')),
+ ('d', Timedelta('1 days'),
+ Timedelta('-1 days'))]:
+ r1 = t1.round(freq)
+ assert r1 == s1
+ r2 = t2.round(freq)
+ assert r2 == s2
+
+ # invalid
+ for freq in ['Y', 'M', 'foobar']:
+ pytest.raises(ValueError, lambda: t1.round(freq))
+
+ t1 = timedelta_range('1 days', periods=3, freq='1 min 2 s 3 us')
+ t2 = -1 * t1
+ t1a = timedelta_range('1 days', periods=3, freq='1 min 2 s')
+ t1c = pd.TimedeltaIndex([1, 1, 1], unit='D')
+
+        # note that negative times round DOWN, so these don't give whole numbers
+ for (freq, s1, s2) in [('N', t1, t2),
+ ('U', t1, t2),
+ ('L', t1a,
+ TimedeltaIndex(['-1 days +00:00:00',
+ '-2 days +23:58:58',
+ '-2 days +23:57:56'],
+ dtype='timedelta64[ns]',
+ freq=None)
+ ),
+ ('S', t1a,
+ TimedeltaIndex(['-1 days +00:00:00',
+ '-2 days +23:58:58',
+ '-2 days +23:57:56'],
+ dtype='timedelta64[ns]',
+ freq=None)
+ ),
+ ('12T', t1c,
+ TimedeltaIndex(['-1 days',
+ '-1 days',
+ '-1 days'],
+ dtype='timedelta64[ns]',
+ freq=None)
+ ),
+ ('H', t1c,
+ TimedeltaIndex(['-1 days',
+ '-1 days',
+ '-1 days'],
+ dtype='timedelta64[ns]',
+ freq=None)
+ ),
+ ('d', t1c,
+ pd.TimedeltaIndex([-1, -1, -1], unit='D')
+ )]:
+
+ r1 = t1.round(freq)
+ tm.assert_index_equal(r1, s1)
+ r2 = t2.round(freq)
+ tm.assert_index_equal(r2, s2)
+
+ # invalid
+ for freq in ['Y', 'M', 'foobar']:
+ pytest.raises(ValueError, lambda: t1.round(freq))
+
+ def test_contains(self):
+ # Checking for any NaT-like objects
+ # GH 13603
+ td = to_timedelta(range(5), unit='d') + pd.offsets.Hour(1)
+ for v in [pd.NaT, None, float('nan'), np.nan]:
+ assert not (v in td)
+
+ td = to_timedelta([pd.NaT])
+ for v in [pd.NaT, None, float('nan'), np.nan]:
+ assert (v in td)
+
+ def test_identity(self):
+
+ td = Timedelta(10, unit='d')
+ assert isinstance(td, Timedelta)
+ assert isinstance(td, timedelta)
+
+ def test_short_format_converters(self):
+ def conv(v):
+ return v.astype('m8[ns]')
+
+ assert Timedelta('10') == np.timedelta64(10, 'ns')
+ assert Timedelta('10ns') == np.timedelta64(10, 'ns')
+ assert Timedelta('100') == np.timedelta64(100, 'ns')
+ assert Timedelta('100ns') == np.timedelta64(100, 'ns')
+
+ assert Timedelta('1000') == np.timedelta64(1000, 'ns')
+ assert Timedelta('1000ns') == np.timedelta64(1000, 'ns')
+ assert Timedelta('1000NS') == np.timedelta64(1000, 'ns')
+
+ assert Timedelta('10us') == np.timedelta64(10000, 'ns')
+ assert Timedelta('100us') == np.timedelta64(100000, 'ns')
+ assert Timedelta('1000us') == np.timedelta64(1000000, 'ns')
+ assert Timedelta('1000Us') == np.timedelta64(1000000, 'ns')
+ assert Timedelta('1000uS') == np.timedelta64(1000000, 'ns')
+
+ assert Timedelta('1ms') == np.timedelta64(1000000, 'ns')
+ assert Timedelta('10ms') == np.timedelta64(10000000, 'ns')
+ assert Timedelta('100ms') == np.timedelta64(100000000, 'ns')
+ assert Timedelta('1000ms') == np.timedelta64(1000000000, 'ns')
+
+ assert Timedelta('-1s') == -np.timedelta64(1000000000, 'ns')
+ assert Timedelta('1s') == np.timedelta64(1000000000, 'ns')
+ assert Timedelta('10s') == np.timedelta64(10000000000, 'ns')
+ assert Timedelta('100s') == np.timedelta64(100000000000, 'ns')
+ assert Timedelta('1000s') == np.timedelta64(1000000000000, 'ns')
+
+ assert Timedelta('1d') == conv(np.timedelta64(1, 'D'))
+ assert Timedelta('-1d') == -conv(np.timedelta64(1, 'D'))
+ assert Timedelta('1D') == conv(np.timedelta64(1, 'D'))
+ assert Timedelta('10D') == conv(np.timedelta64(10, 'D'))
+ assert Timedelta('100D') == conv(np.timedelta64(100, 'D'))
+ assert Timedelta('1000D') == conv(np.timedelta64(1000, 'D'))
+ assert Timedelta('10000D') == conv(np.timedelta64(10000, 'D'))
+
+ # space
+ assert Timedelta(' 10000D ') == conv(np.timedelta64(10000, 'D'))
+ assert Timedelta(' - 10000D ') == -conv(np.timedelta64(10000, 'D'))
+
+ # invalid
+ with pytest.raises(ValueError):
+ Timedelta('1foo')
+ with pytest.raises(ValueError):
+ Timedelta('foo')
+
+ def test_full_format_converters(self):
+ def conv(v):
+ return v.astype('m8[ns]')
+
+ d1 = np.timedelta64(1, 'D')
+
+ assert Timedelta('1days') == conv(d1)
+ assert Timedelta('1days,') == conv(d1)
+ assert Timedelta('- 1days,') == -conv(d1)
+
+ assert Timedelta('00:00:01') == conv(np.timedelta64(1, 's'))
+ assert Timedelta('06:00:01') == conv(np.timedelta64(6 * 3600 + 1, 's'))
+ assert Timedelta('06:00:01.0') == conv(
+ np.timedelta64(6 * 3600 + 1, 's'))
+ assert Timedelta('06:00:01.01') == conv(np.timedelta64(
+ 1000 * (6 * 3600 + 1) + 10, 'ms'))
+
+ assert (Timedelta('- 1days, 00:00:01') ==
+ conv(-d1 + np.timedelta64(1, 's')))
+ assert (Timedelta('1days, 06:00:01') ==
+ conv(d1 + np.timedelta64(6 * 3600 + 1, 's')))
+ assert (Timedelta('1days, 06:00:01.01') ==
+ conv(d1 + np.timedelta64(1000 * (6 * 3600 + 1) + 10, 'ms')))
+
+ # invalid
+ with pytest.raises(ValueError):
+ Timedelta('- 1days, 00')
+
+ def test_overflow(self):
+ # GH 9442
+ s = Series(pd.date_range('20130101', periods=100000, freq='H'))
+ s[0] += pd.Timedelta('1s 1ms')
+
+ # mean
+ result = (s - s.min()).mean()
+ expected = pd.Timedelta((pd.TimedeltaIndex((s - s.min())).asi8 / len(s)
+ ).sum())
+
+        # the computation is converted to float, so there may be
+        # some loss of precision
+ assert np.allclose(result.value / 1000, expected.value / 1000)
+
+ # sum
+ pytest.raises(ValueError, lambda: (s - s.min()).sum())
+ s1 = s[0:10000]
+ pytest.raises(ValueError, lambda: (s1 - s1.min()).sum())
+        s2 = s[0:1000]
+        # a slice this small stays within int64 bounds, so sum succeeds
+        result = (s2 - s2.min()).sum()
+
+ def test_pickle(self):
+
+ v = Timedelta('1 days 10:11:12.0123456')
+ v_p = tm.round_trip_pickle(v)
+ assert v == v_p
+
+ def test_timedelta_hash_equality(self):
+ # GH 11129
+ v = Timedelta(1, 'D')
+ td = timedelta(days=1)
+ assert hash(v) == hash(td)
+
+ d = {td: 2}
+ assert d[v] == 2
+
+ tds = timedelta_range('1 second', periods=20)
+ assert all(hash(td) == hash(td.to_pytimedelta()) for td in tds)
+
+ # python timedeltas drop ns resolution
+ ns_td = Timedelta(1, 'ns')
+ assert hash(ns_td) != hash(ns_td.to_pytimedelta())
+
+ def test_implementation_limits(self):
+ min_td = Timedelta(Timedelta.min)
+ max_td = Timedelta(Timedelta.max)
+
+ # GH 12727
+ # timedelta limits correspond to int64 boundaries
+ assert min_td.value == np.iinfo(np.int64).min + 1
+ assert max_td.value == np.iinfo(np.int64).max
+
+        # one nanosecond below the lower limit yields NaT rather than
+        # raising OverflowError
+ assert (min_td - Timedelta(1, 'ns')) is NaT
+
+ with pytest.raises(OverflowError):
+ min_td - Timedelta(2, 'ns')
+
+ with pytest.raises(OverflowError):
+ max_td + Timedelta(1, 'ns')
+
+ # Same tests using the internal nanosecond values
+ td = Timedelta(min_td.value - 1, 'ns')
+ assert td is NaT
+
+ with pytest.raises(OverflowError):
+ Timedelta(min_td.value - 2, 'ns')
+
+ with pytest.raises(OverflowError):
+ Timedelta(max_td.value + 1, 'ns')
+
+ def test_total_seconds_precision(self):
+ # GH 19458
+ assert Timedelta('30S').total_seconds() == 30.0
+ assert Timedelta('0').total_seconds() == 0.0
+ assert Timedelta('-2S').total_seconds() == -2.0
+ assert Timedelta('5.324S').total_seconds() == 5.324
+ assert (Timedelta('30S').total_seconds() - 30.0) < 1e-20
+ assert (30.0 - Timedelta('30S').total_seconds()) < 1e-20
+
+ def test_timedelta_arithmetic(self):
+ data = pd.Series(['nat', '32 days'], dtype='timedelta64[ns]')
+ deltas = [timedelta(days=1), Timedelta(1, unit='D')]
+ for delta in deltas:
+ result_method = data.add(delta)
+ result_operator = data + delta
+ expected = pd.Series(['nat', '33 days'], dtype='timedelta64[ns]')
+ tm.assert_series_equal(result_operator, expected)
+ tm.assert_series_equal(result_method, expected)
+
+ result_method = data.sub(delta)
+ result_operator = data - delta
+ expected = pd.Series(['nat', '31 days'], dtype='timedelta64[ns]')
+ tm.assert_series_equal(result_operator, expected)
+ tm.assert_series_equal(result_method, expected)
+ # GH 9396
+ result_method = data.div(delta)
+ result_operator = data / delta
+ expected = pd.Series([np.nan, 32.], dtype='float64')
+ tm.assert_series_equal(result_operator, expected)
+ tm.assert_series_equal(result_method, expected)
+
+ def test_apply_to_timedelta(self):
+ timedelta_NaT = pd.to_timedelta('NaT')
+
+ list_of_valid_strings = ['00:00:01', '00:00:02']
+ a = pd.to_timedelta(list_of_valid_strings)
+ b = Series(list_of_valid_strings).apply(pd.to_timedelta)
+ # Can't compare until apply on a Series gives the correct dtype
+ # assert_series_equal(a, b)
+
+ list_of_strings = ['00:00:01', np.nan, pd.NaT, timedelta_NaT]
+
+ # TODO: unused?
+ a = pd.to_timedelta(list_of_strings) # noqa
+ b = Series(list_of_strings).apply(pd.to_timedelta) # noqa
+ # Can't compare until apply on a Series gives the correct dtype
+ # assert_series_equal(a, b)
+
+ def test_components(self):
+ rng = timedelta_range('1 days, 10:11:12', periods=2, freq='s')
+ rng.components
+
+ # with nat
+ s = Series(rng)
+ s[1] = np.nan
+
+ result = s.dt.components
+ assert not result.iloc[0].isna().all()
+ assert result.iloc[1].isna().all()
+
+
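+# A minimal standalone sketch of the behaviour test_components relies on
+# above (assumes only the public pandas API): Series.dt.components expands
+# each timedelta into integer fields (days, hours, ..., nanoseconds), and
+# a NaT entry comes back as an all-NaN row.
+def _demo_components_sketch():
+    import numpy as np
+    import pandas as pd
+
+    s = pd.Series(pd.timedelta_range('1 days, 10:11:12', periods=2,
+                                     freq='s'))
+    s[1] = np.nan
+    comps = s.dt.components
+    assert not comps.iloc[0].isna().all()  # valid row has real values
+    assert comps.iloc[1].isna().all()      # NaT row is all-NaN
+
+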
[email protected]('value, expected', [
+ (Timedelta('10S'), True),
+ (Timedelta('-10S'), True),
+ (Timedelta(10, unit='ns'), True),
+ (Timedelta(0, unit='ns'), False),
+ (Timedelta(-10, unit='ns'), True),
+ (Timedelta(None), True),
+ (pd.NaT, True),
+])
+def test_truthiness(value, expected):
+ # https://github.com/pandas-dev/pandas/issues/21484
+ assert bool(value) is expected
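+
+
+# A minimal standalone sketch of the rule test_truthiness checks above
+# (assumes only the public pandas API): only an exactly-zero Timedelta is
+# falsey; negative values and even NaT evaluate truthy.
+def _demo_truthiness_sketch():
+    import pandas as pd
+
+    assert bool(pd.Timedelta(0)) is False
+    assert bool(pd.Timedelta('-10S')) is True
+    assert bool(pd.NaT) is True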
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/__init__.py b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_arithmetic.py b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_arithmetic.py
new file mode 100644
index 00000000000..331d6658980
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_arithmetic.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+
+from pandas.compat import long
+
+from pandas import Timedelta, Timestamp
+import pandas.util.testing as tm
+
+from pandas.tseries import offsets
+from pandas.tseries.frequencies import to_offset
+
+
+class TestTimestampArithmetic(object):
+ def test_overflow_offset(self):
+ # no overflow expected
+
+ stamp = Timestamp("2000/1/1")
+ offset_no_overflow = to_offset("D") * 100
+
+ expected = Timestamp("2000/04/10")
+ assert stamp + offset_no_overflow == expected
+
+ assert offset_no_overflow + stamp == expected
+
+ expected = Timestamp("1999/09/23")
+ assert stamp - offset_no_overflow == expected
+
+ def test_overflow_offset_raises(self):
+ # xref https://github.com/statsmodels/statsmodels/issues/3374
+ # ends up multiplying really large numbers which overflow
+
+ stamp = Timestamp('2017-01-13 00:00:00', freq='D')
+ offset_overflow = 20169940 * offsets.Day(1)
+ msg = ("the add operation between "
+ r"\<-?\d+ \* Days\> and \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} "
+ "will overflow")
+
+ with pytest.raises(OverflowError, match=msg):
+ stamp + offset_overflow
+
+ with pytest.raises(OverflowError, match=msg):
+ offset_overflow + stamp
+
+ with pytest.raises(OverflowError, match=msg):
+ stamp - offset_overflow
+
+ # xref https://github.com/pandas-dev/pandas/issues/14080
+ # used to crash, so check for proper overflow exception
+
+ stamp = Timestamp("2000/1/1")
+ offset_overflow = to_offset("D") * 100 ** 25
+
+ with pytest.raises(OverflowError, match=msg):
+ stamp + offset_overflow
+
+ with pytest.raises(OverflowError, match=msg):
+ offset_overflow + stamp
+
+ with pytest.raises(OverflowError, match=msg):
+ stamp - offset_overflow
+
+ def test_delta_preserve_nanos(self):
+ val = Timestamp(long(1337299200000000123))
+ result = val + timedelta(1)
+ assert result.nanosecond == val.nanosecond
+
+ def test_timestamp_sub_datetime(self):
+ dt = datetime(2013, 10, 12)
+ ts = Timestamp(datetime(2013, 10, 13))
+ assert (ts - dt).days == 1
+ assert (dt - ts).days == -1
+
+ def test_addition_subtraction_types(self):
+ # Assert on the types resulting from Timestamp +/- various date/time
+ # objects
+ dt = datetime(2014, 3, 4)
+ td = timedelta(seconds=1)
+        # build a Timestamp with a frequency, since those support
+        # addition/subtraction of integers
+ ts = Timestamp(dt, freq='D')
+
+ with tm.assert_produces_warning(FutureWarning):
+ # GH#22535 add/sub with integers is deprecated
+ assert type(ts + 1) == Timestamp
+ assert type(ts - 1) == Timestamp
+
+        # Timestamp + datetime is not supported, though subtraction is
+        # and yields Timedelta; more tests in tseries/base/tests/test_base.py
+ assert type(ts - dt) == Timedelta
+ assert type(ts + td) == Timestamp
+ assert type(ts - td) == Timestamp
+
+        # Timestamp +/- datetime64 is not supported, so it is not tested
+        # here (could possibly assert an error is raised?); timedelta64
+        # arithmetic does work:
+ td64 = np.timedelta64(1, 'D')
+ assert type(ts + td64) == Timestamp
+ assert type(ts - td64) == Timestamp
+
+ def test_addition_subtraction_preserve_frequency(self):
+ ts = Timestamp('2014-03-05', freq='D')
+ td = timedelta(days=1)
+ original_freq = ts.freq
+
+ with tm.assert_produces_warning(FutureWarning):
+ # GH#22535 add/sub with integers is deprecated
+ assert (ts + 1).freq == original_freq
+ assert (ts - 1).freq == original_freq
+
+ assert (ts + td).freq == original_freq
+ assert (ts - td).freq == original_freq
+
+ td64 = np.timedelta64(1, 'D')
+ assert (ts + td64).freq == original_freq
+ assert (ts - td64).freq == original_freq
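+
+
+# A minimal standalone sketch of the overflow guard exercised above
+# (assumes only the public pandas API): an offset large enough to leave
+# the int64-nanosecond range raises OverflowError instead of wrapping.
+def _demo_offset_overflow_sketch():
+    import pytest
+    from pandas import Timestamp
+    from pandas.tseries.frequencies import to_offset
+
+    stamp = Timestamp("2000/1/1")
+    offset_overflow = to_offset("D") * 100 ** 25
+    with pytest.raises(OverflowError):
+        stamp + offset_overflow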
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_comparisons.py b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_comparisons.py
new file mode 100644
index 00000000000..74dd52c4815
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_comparisons.py
@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime
+import operator
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY2, long
+
+from pandas import Timestamp
+
+
+class TestTimestampComparison(object):
+ def test_comparison_object_array(self):
+ # GH#15183
+ ts = Timestamp('2011-01-03 00:00:00-0500', tz='US/Eastern')
+ other = Timestamp('2011-01-01 00:00:00-0500', tz='US/Eastern')
+ naive = Timestamp('2011-01-01 00:00:00')
+
+ arr = np.array([other, ts], dtype=object)
+ res = arr == ts
+ expected = np.array([False, True], dtype=bool)
+ assert (res == expected).all()
+
+ # 2D case
+ arr = np.array([[other, ts],
+ [ts, other]],
+ dtype=object)
+ res = arr != ts
+ expected = np.array([[True, False], [False, True]], dtype=bool)
+ assert res.shape == expected.shape
+ assert (res == expected).all()
+
+ # tzaware mismatch
+ arr = np.array([naive], dtype=object)
+ with pytest.raises(TypeError):
+ arr < ts
+
+ def test_comparison(self):
+ # 5-18-2012 00:00:00.000
+ stamp = long(1337299200000000000)
+
+ val = Timestamp(stamp)
+
+ assert val == val
+ assert not val != val
+ assert not val < val
+ assert val <= val
+ assert not val > val
+ assert val >= val
+
+ other = datetime(2012, 5, 18)
+ assert val == other
+ assert not val != other
+ assert not val < other
+ assert val <= other
+ assert not val > other
+ assert val >= other
+
+ other = Timestamp(stamp + 100)
+
+        assert val != other
+        assert not val == other
+ assert val < other
+ assert val <= other
+ assert other > val
+ assert other >= val
+
+ def test_compare_invalid(self):
+ # GH#8058
+ val = Timestamp('20130101 12:01:02')
+ assert not val == 'foo'
+ assert not val == 10.0
+ assert not val == 1
+ assert not val == long(1)
+ assert not val == []
+ assert not val == {'foo': 1}
+ assert not val == np.float64(1)
+ assert not val == np.int64(1)
+
+ assert val != 'foo'
+ assert val != 10.0
+ assert val != 1
+ assert val != long(1)
+ assert val != []
+ assert val != {'foo': 1}
+ assert val != np.float64(1)
+ assert val != np.int64(1)
+
+ def test_cant_compare_tz_naive_w_aware(self, utc_fixture):
+ # see GH#1404
+ a = Timestamp('3/12/2012')
+ b = Timestamp('3/12/2012', tz=utc_fixture)
+
+ with pytest.raises(TypeError):
+ a == b
+ with pytest.raises(TypeError):
+ a != b
+ with pytest.raises(TypeError):
+ a < b
+ with pytest.raises(TypeError):
+ a <= b
+ with pytest.raises(TypeError):
+ a > b
+ with pytest.raises(TypeError):
+ a >= b
+
+ with pytest.raises(TypeError):
+ b == a
+ with pytest.raises(TypeError):
+ b != a
+ with pytest.raises(TypeError):
+ b < a
+ with pytest.raises(TypeError):
+ b <= a
+ with pytest.raises(TypeError):
+ b > a
+ with pytest.raises(TypeError):
+ b >= a
+
+ if PY2:
+ with pytest.raises(TypeError):
+ a == b.to_pydatetime()
+ with pytest.raises(TypeError):
+ a.to_pydatetime() == b
+ else:
+ assert not a == b.to_pydatetime()
+ assert not a.to_pydatetime() == b
+
+ def test_timestamp_compare_scalars(self):
+ # case where ndim == 0
+ lhs = np.datetime64(datetime(2013, 12, 6))
+ rhs = Timestamp('now')
+ nat = Timestamp('nat')
+
+ ops = {'gt': 'lt',
+ 'lt': 'gt',
+ 'ge': 'le',
+ 'le': 'ge',
+ 'eq': 'eq',
+ 'ne': 'ne'}
+
+ for left, right in ops.items():
+ left_f = getattr(operator, left)
+ right_f = getattr(operator, right)
+ expected = left_f(lhs, rhs)
+
+ result = right_f(rhs, lhs)
+ assert result == expected
+
+ expected = left_f(rhs, nat)
+ result = right_f(nat, rhs)
+ assert result == expected
+
+ def test_timestamp_compare_with_early_datetime(self):
+ # e.g. datetime.min
+ stamp = Timestamp('2012-01-01')
+
+ assert not stamp == datetime.min
+ assert not stamp == datetime(1600, 1, 1)
+ assert not stamp == datetime(2700, 1, 1)
+ assert stamp != datetime.min
+ assert stamp != datetime(1600, 1, 1)
+ assert stamp != datetime(2700, 1, 1)
+ assert stamp > datetime(1600, 1, 1)
+ assert stamp >= datetime(1600, 1, 1)
+ assert stamp < datetime(2700, 1, 1)
+ assert stamp <= datetime(2700, 1, 1)
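+
+
+# A minimal standalone sketch of the tz-mixing rule tested above (assumes
+# only the public pandas API): ordering comparisons between tz-naive and
+# tz-aware stamps raise TypeError on both Python 2 and 3.
+def _demo_naive_aware_compare_sketch():
+    import pytest
+    from pandas import Timestamp
+
+    naive = Timestamp('3/12/2012')
+    aware = Timestamp('3/12/2012', tz='UTC')
+    with pytest.raises(TypeError):
+        naive < aware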
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_rendering.py b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_rendering.py
new file mode 100644
index 00000000000..29b65ee4df7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_rendering.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+from distutils.version import LooseVersion
+import pprint
+
+import dateutil
+import pytest
+import pytz  # noqa  # a test below uses pytz but only inside an `eval` call
+
+from pandas import Timestamp
+
+
+class TestTimestampRendering(object):
+
+ # dateutil zone change (only matters for repr)
+ if LooseVersion(dateutil.__version__) >= LooseVersion('2.6.0'):
+ timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern',
+ 'dateutil/US/Pacific']
+ else:
+ timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern',
+ 'dateutil/America/Los_Angeles']
+
+ @pytest.mark.parametrize('tz', timezones)
+ @pytest.mark.parametrize('freq', ['D', 'M', 'S', 'N'])
+ @pytest.mark.parametrize('date', ['2014-03-07', '2014-01-01 09:00',
+ '2014-01-01 00:00:00.000000001'])
+ def test_repr(self, date, freq, tz):
+        # quote the freq so it does not spuriously match the timezone name
+ freq_repr = "'{0}'".format(freq)
+ if tz.startswith('dateutil'):
+ tz_repr = tz.replace('dateutil', '')
+ else:
+ tz_repr = tz
+
+ date_only = Timestamp(date)
+ assert date in repr(date_only)
+ assert tz_repr not in repr(date_only)
+ assert freq_repr not in repr(date_only)
+ assert date_only == eval(repr(date_only))
+
+ date_tz = Timestamp(date, tz=tz)
+ assert date in repr(date_tz)
+ assert tz_repr in repr(date_tz)
+ assert freq_repr not in repr(date_tz)
+ assert date_tz == eval(repr(date_tz))
+
+ date_freq = Timestamp(date, freq=freq)
+ assert date in repr(date_freq)
+ assert tz_repr not in repr(date_freq)
+ assert freq_repr in repr(date_freq)
+ assert date_freq == eval(repr(date_freq))
+
+ date_tz_freq = Timestamp(date, tz=tz, freq=freq)
+ assert date in repr(date_tz_freq)
+ assert tz_repr in repr(date_tz_freq)
+ assert freq_repr in repr(date_tz_freq)
+ assert date_tz_freq == eval(repr(date_tz_freq))
+
+ def test_repr_utcoffset(self):
+ # This can cause the tz field to be populated, but it's redundant to
+ # include this information in the date-string.
+ date_with_utc_offset = Timestamp('2014-03-13 00:00:00-0400', tz=None)
+ assert '2014-03-13 00:00:00-0400' in repr(date_with_utc_offset)
+ assert 'tzoffset' not in repr(date_with_utc_offset)
+ assert 'pytz.FixedOffset(-240)' in repr(date_with_utc_offset)
+ expr = repr(date_with_utc_offset).replace("'pytz.FixedOffset(-240)'",
+ 'pytz.FixedOffset(-240)')
+ assert date_with_utc_offset == eval(expr)
+
+ def test_timestamp_repr_pre1900(self):
+ # pre-1900
+ stamp = Timestamp('1850-01-01', tz='US/Eastern')
+ repr(stamp)
+
+ iso8601 = '1850-01-01 01:23:45.012345'
+ stamp = Timestamp(iso8601, tz='US/Eastern')
+ result = repr(stamp)
+ assert iso8601 in result
+
+ def test_pprint(self):
+ # GH#12622
+ nested_obj = {'foo': 1,
+ 'bar': [{'w': {'a': Timestamp('2011-01-01')}}] * 10}
+ result = pprint.pformat(nested_obj, width=50)
+ expected = r"""{'bar': [{'w': {'a': Timestamp('2011-01-01 00:00:00')}},
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}}],
+ 'foo': 1}"""
+ assert result == expected
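+
+
+# A minimal standalone sketch of the round-trip invariant test_repr asserts
+# above (assumes only the public pandas API): eval of the repr reconstructs
+# an equal Timestamp.
+def _demo_repr_roundtrip_sketch():
+    from pandas import Timestamp  # noqa: F401  # used inside eval
+
+    ts = Timestamp('2014-03-07')
+    assert ts == eval(repr(ts))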
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_timestamp.py b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_timestamp.py
new file mode 100644
index 00000000000..b2c05d1564a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_timestamp.py
@@ -0,0 +1,964 @@
+""" test the scalar Timestamp """
+
+import calendar
+from datetime import datetime, timedelta
+import locale
+import unicodedata
+
+import dateutil
+from dateutil.tz import tzutc
+import numpy as np
+import pytest
+import pytz
+from pytz import timezone, utc
+
+from pandas._libs.tslibs import conversion
+from pandas._libs.tslibs.timezones import dateutil_gettz as gettz, get_timezone
+from pandas.compat import PY2, PY3, long
+from pandas.compat.numpy import np_datetime64_compat
+from pandas.errors import OutOfBoundsDatetime
+import pandas.util._test_decorators as td
+
+from pandas import NaT, Period, Timedelta, Timestamp
+import pandas.util.testing as tm
+
+from pandas.tseries import offsets
+
+
+class TestTimestampProperties(object):
+
+ def test_properties_business(self):
+ ts = Timestamp('2017-10-01', freq='B')
+ control = Timestamp('2017-10-01')
+ assert ts.dayofweek == 6
+ assert not ts.is_month_start # not a weekday
+ assert not ts.is_quarter_start # not a weekday
+ # Control case: non-business is month/qtr start
+ assert control.is_month_start
+ assert control.is_quarter_start
+
+ ts = Timestamp('2017-09-30', freq='B')
+ control = Timestamp('2017-09-30')
+ assert ts.dayofweek == 5
+ assert not ts.is_month_end # not a weekday
+ assert not ts.is_quarter_end # not a weekday
+        # Control case: non-business is month/qtr end
+ assert control.is_month_end
+ assert control.is_quarter_end
+
+ def test_fields(self):
+ def check(value, equal):
+            # check that the value is int/long-like
+ assert isinstance(value, (int, long))
+ assert value == equal
+
+ # GH 10050
+ ts = Timestamp('2015-05-10 09:06:03.000100001')
+ check(ts.year, 2015)
+ check(ts.month, 5)
+ check(ts.day, 10)
+ check(ts.hour, 9)
+ check(ts.minute, 6)
+ check(ts.second, 3)
+ pytest.raises(AttributeError, lambda: ts.millisecond)
+ check(ts.microsecond, 100)
+ check(ts.nanosecond, 1)
+ check(ts.dayofweek, 6)
+ check(ts.quarter, 2)
+ check(ts.dayofyear, 130)
+ check(ts.week, 19)
+        check(ts.daysinmonth, 31)
+
+ # GH 13303
+ ts = Timestamp('2014-12-31 23:59:00-05:00', tz='US/Eastern')
+ check(ts.year, 2014)
+ check(ts.month, 12)
+ check(ts.day, 31)
+ check(ts.hour, 23)
+ check(ts.minute, 59)
+ check(ts.second, 0)
+ pytest.raises(AttributeError, lambda: ts.millisecond)
+ check(ts.microsecond, 0)
+ check(ts.nanosecond, 0)
+ check(ts.dayofweek, 2)
+ check(ts.quarter, 4)
+ check(ts.dayofyear, 365)
+ check(ts.week, 1)
+ check(ts.daysinmonth, 31)
+
+ ts = Timestamp('2014-01-01 00:00:00+01:00')
+ starts = ['is_month_start', 'is_quarter_start', 'is_year_start']
+ for start in starts:
+ assert getattr(ts, start)
+ ts = Timestamp('2014-12-31 23:59:59+01:00')
+ ends = ['is_month_end', 'is_year_end', 'is_quarter_end']
+ for end in ends:
+ assert getattr(ts, end)
+
+ # GH 12806
+ @pytest.mark.parametrize('data',
+ [Timestamp('2017-08-28 23:00:00'),
+ Timestamp('2017-08-28 23:00:00', tz='EST')])
+ @pytest.mark.parametrize('time_locale', [
+ None] if tm.get_locales() is None else [None] + tm.get_locales())
+ def test_names(self, data, time_locale):
+ # GH 17354
+ # Test .weekday_name, .day_name(), .month_name
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ assert data.weekday_name == 'Monday'
+ if time_locale is None:
+ expected_day = 'Monday'
+ expected_month = 'August'
+ else:
+ with tm.set_locale(time_locale, locale.LC_TIME):
+ expected_day = calendar.day_name[0].capitalize()
+ expected_month = calendar.month_name[8].capitalize()
+
+ result_day = data.day_name(time_locale)
+ result_month = data.month_name(time_locale)
+
+ # Work around https://github.com/pandas-dev/pandas/issues/22342
+ # different normalizations
+
+ if not PY2:
+ expected_day = unicodedata.normalize("NFD", expected_day)
+ expected_month = unicodedata.normalize("NFD", expected_month)
+
+            result_day = unicodedata.normalize("NFD", result_day)
+ result_month = unicodedata.normalize("NFD", result_month)
+
+ assert result_day == expected_day
+ assert result_month == expected_month
+
+ # Test NaT
+ nan_ts = Timestamp(NaT)
+ assert np.isnan(nan_ts.day_name(time_locale))
+ assert np.isnan(nan_ts.month_name(time_locale))
+
+ def test_is_leap_year(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ # GH 13727
+ dt = Timestamp('2000-01-01 00:00:00', tz=tz)
+ assert dt.is_leap_year
+ assert isinstance(dt.is_leap_year, bool)
+
+ dt = Timestamp('1999-01-01 00:00:00', tz=tz)
+ assert not dt.is_leap_year
+
+ dt = Timestamp('2004-01-01 00:00:00', tz=tz)
+ assert dt.is_leap_year
+
+ dt = Timestamp('2100-01-01 00:00:00', tz=tz)
+ assert not dt.is_leap_year
+
+ def test_woy_boundary(self):
+ # make sure weeks at year boundaries are correct
+ d = datetime(2013, 12, 31)
+ result = Timestamp(d).week
+ expected = 1 # ISO standard
+ assert result == expected
+
+ d = datetime(2008, 12, 28)
+ result = Timestamp(d).week
+ expected = 52 # ISO standard
+ assert result == expected
+
+ d = datetime(2009, 12, 31)
+ result = Timestamp(d).week
+ expected = 53 # ISO standard
+ assert result == expected
+
+ d = datetime(2010, 1, 1)
+ result = Timestamp(d).week
+ expected = 53 # ISO standard
+ assert result == expected
+
+ d = datetime(2010, 1, 3)
+ result = Timestamp(d).week
+ expected = 53 # ISO standard
+ assert result == expected
+
+ result = np.array([Timestamp(datetime(*args)).week
+ for args in [(2000, 1, 1), (2000, 1, 2), (
+ 2005, 1, 1), (2005, 1, 2)]])
+ assert (result == [52, 52, 53, 53]).all()
+
+ def test_resolution(self):
+ # GH#21336, GH#21365
+ dt = Timestamp('2100-01-01 00:00:00')
+ assert dt.resolution == Timedelta(nanoseconds=1)
+
+
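+# A minimal standalone sketch of the ISO-8601 week numbering that
+# test_woy_boundary relies on above (assumes only the public pandas API):
+# days around New Year can belong to week 52/53 of the prior ISO year or
+# to week 1 of the next.
+def _demo_iso_week_sketch():
+    from datetime import datetime
+
+    from pandas import Timestamp
+
+    assert Timestamp(datetime(2010, 1, 1)).week == 53  # still ISO 2009
+    assert Timestamp(datetime(2013, 12, 31)).week == 1  # already ISO 2014
+
+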
+class TestTimestampConstructors(object):
+
+ def test_constructor(self):
+ base_str = '2014-07-01 09:00'
+ base_dt = datetime(2014, 7, 1, 9)
+ base_expected = 1404205200000000000
+
+ # confirm base representation is correct
+ import calendar
+ assert (calendar.timegm(base_dt.timetuple()) * 1000000000 ==
+ base_expected)
+
+ tests = [(base_str, base_dt, base_expected),
+ ('2014-07-01 10:00', datetime(2014, 7, 1, 10),
+ base_expected + 3600 * 1000000000),
+ ('2014-07-01 09:00:00.000008000',
+ datetime(2014, 7, 1, 9, 0, 0, 8),
+ base_expected + 8000),
+ ('2014-07-01 09:00:00.000000005',
+ Timestamp('2014-07-01 09:00:00.000000005'),
+ base_expected + 5)]
+
+ timezones = [(None, 0), ('UTC', 0), (pytz.utc, 0), ('Asia/Tokyo', 9),
+ ('US/Eastern', -4), ('dateutil/US/Pacific', -7),
+ (pytz.FixedOffset(-180), -3),
+ (dateutil.tz.tzoffset(None, 18000), 5)]
+
+ for date_str, date, expected in tests:
+ for result in [Timestamp(date_str), Timestamp(date)]:
+ # only with timestring
+ assert result.value == expected
+ assert conversion.pydt_to_i8(result) == expected
+
+                # re-creation shouldn't affect the internal value
+ result = Timestamp(result)
+ assert result.value == expected
+ assert conversion.pydt_to_i8(result) == expected
+
+ # with timezone
+ for tz, offset in timezones:
+ for result in [Timestamp(date_str, tz=tz), Timestamp(date,
+ tz=tz)]:
+ expected_tz = expected - offset * 3600 * 1000000000
+ assert result.value == expected_tz
+ assert conversion.pydt_to_i8(result) == expected_tz
+
+ # should preserve tz
+ result = Timestamp(result)
+ assert result.value == expected_tz
+ assert conversion.pydt_to_i8(result) == expected_tz
+
+ # should convert to UTC
+ if tz is not None:
+ result = Timestamp(result).tz_convert('UTC')
+ else:
+ result = Timestamp(result, tz='UTC')
+ expected_utc = expected - offset * 3600 * 1000000000
+ assert result.value == expected_utc
+ assert conversion.pydt_to_i8(result) == expected_utc
+
+ def test_constructor_with_stringoffset(self):
+ # GH 7833
+ base_str = '2014-07-01 11:00:00+02:00'
+ base_dt = datetime(2014, 7, 1, 9)
+ base_expected = 1404205200000000000
+
+ # confirm base representation is correct
+ import calendar
+ assert (calendar.timegm(base_dt.timetuple()) * 1000000000 ==
+ base_expected)
+
+ tests = [(base_str, base_expected),
+ ('2014-07-01 12:00:00+02:00',
+ base_expected + 3600 * 1000000000),
+ ('2014-07-01 11:00:00.000008000+02:00', base_expected + 8000),
+ ('2014-07-01 11:00:00.000000005+02:00', base_expected + 5)]
+
+ timezones = [(None, 0), ('UTC', 0), (pytz.utc, 0), ('Asia/Tokyo', 9),
+ ('US/Eastern', -4), ('dateutil/US/Pacific', -7),
+ (pytz.FixedOffset(-180), -3),
+ (dateutil.tz.tzoffset(None, 18000), 5)]
+
+ for date_str, expected in tests:
+ for result in [Timestamp(date_str)]:
+ # only with timestring
+ assert result.value == expected
+ assert conversion.pydt_to_i8(result) == expected
+
+                # re-creation shouldn't affect the internal value
+ result = Timestamp(result)
+ assert result.value == expected
+ assert conversion.pydt_to_i8(result) == expected
+
+ # with timezone
+ for tz, offset in timezones:
+ result = Timestamp(date_str, tz=tz)
+ expected_tz = expected
+ assert result.value == expected_tz
+ assert conversion.pydt_to_i8(result) == expected_tz
+
+ # should preserve tz
+ result = Timestamp(result)
+ assert result.value == expected_tz
+ assert conversion.pydt_to_i8(result) == expected_tz
+
+ # should convert to UTC
+ result = Timestamp(result).tz_convert('UTC')
+ expected_utc = expected
+ assert result.value == expected_utc
+ assert conversion.pydt_to_i8(result) == expected_utc
+
+ # This should be 2013-11-01 05:00 in UTC
+ # converted to Chicago tz
+ result = Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')
+ assert result.value == Timestamp('2013-11-01 05:00').value
+ expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" # noqa
+ assert repr(result) == expected
+ assert result == eval(repr(result))
+
+ # This should be 2013-11-01 05:00 in UTC
+ # converted to Tokyo tz (+09:00)
+ result = Timestamp('2013-11-01 00:00:00-0500', tz='Asia/Tokyo')
+ assert result.value == Timestamp('2013-11-01 05:00').value
+ expected = "Timestamp('2013-11-01 14:00:00+0900', tz='Asia/Tokyo')"
+ assert repr(result) == expected
+ assert result == eval(repr(result))
+
+ # GH11708
+ # This should be 2015-11-18 10:00 in UTC
+ # converted to Asia/Katmandu
+ result = Timestamp("2015-11-18 15:45:00+05:45", tz="Asia/Katmandu")
+ assert result.value == Timestamp("2015-11-18 10:00").value
+ expected = "Timestamp('2015-11-18 15:45:00+0545', tz='Asia/Katmandu')"
+ assert repr(result) == expected
+ assert result == eval(repr(result))
+
+ # This should be 2015-11-18 10:00 in UTC
+ # converted to Asia/Kolkata
+ result = Timestamp("2015-11-18 15:30:00+05:30", tz="Asia/Kolkata")
+ assert result.value == Timestamp("2015-11-18 10:00").value
+ expected = "Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata')"
+ assert repr(result) == expected
+ assert result == eval(repr(result))
+
+ def test_constructor_invalid(self):
+ with pytest.raises(TypeError, match='Cannot convert input'):
+ Timestamp(slice(2))
+ with pytest.raises(ValueError, match='Cannot convert Period'):
+ Timestamp(Period('1000-01-01'))
+
+ def test_constructor_invalid_tz(self):
+ # GH#17690
+ with pytest.raises(TypeError, match='must be a datetime.tzinfo'):
+ Timestamp('2017-10-22', tzinfo='US/Eastern')
+
+ with pytest.raises(ValueError, match='at most one of'):
+ Timestamp('2017-10-22', tzinfo=utc, tz='UTC')
+
+ with pytest.raises(ValueError, match="Invalid frequency:"):
+ # GH#5168
+            # case where a user tries to pass tz as a positional arg,
+            # not a kwarg; it then gets interpreted as a `freq`
+ Timestamp('2012-01-01', 'US/Pacific')
+
+ def test_constructor_tz_or_tzinfo(self):
+ # GH#17943, GH#17690, GH#5168
+ stamps = [Timestamp(year=2017, month=10, day=22, tz='UTC'),
+ Timestamp(year=2017, month=10, day=22, tzinfo=utc),
+ Timestamp(year=2017, month=10, day=22, tz=utc),
+ Timestamp(datetime(2017, 10, 22), tzinfo=utc),
+ Timestamp(datetime(2017, 10, 22), tz='UTC'),
+ Timestamp(datetime(2017, 10, 22), tz=utc)]
+ assert all(ts == stamps[0] for ts in stamps)
+
+ def test_constructor_positional(self):
+ # see gh-10758
+ with pytest.raises(TypeError):
+ Timestamp(2000, 1)
+ with pytest.raises(ValueError):
+ Timestamp(2000, 0, 1)
+ with pytest.raises(ValueError):
+ Timestamp(2000, 13, 1)
+ with pytest.raises(ValueError):
+ Timestamp(2000, 1, 0)
+ with pytest.raises(ValueError):
+ Timestamp(2000, 1, 32)
+
+ # see gh-11630
+ assert (repr(Timestamp(2015, 11, 12)) ==
+ repr(Timestamp('20151112')))
+ assert (repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) ==
+ repr(Timestamp('2015-11-12 01:02:03.999999')))
+
+ def test_constructor_keyword(self):
+ # GH 10758
+ with pytest.raises(TypeError):
+ Timestamp(year=2000, month=1)
+ with pytest.raises(ValueError):
+ Timestamp(year=2000, month=0, day=1)
+ with pytest.raises(ValueError):
+ Timestamp(year=2000, month=13, day=1)
+ with pytest.raises(ValueError):
+ Timestamp(year=2000, month=1, day=0)
+ with pytest.raises(ValueError):
+ Timestamp(year=2000, month=1, day=32)
+
+ assert (repr(Timestamp(year=2015, month=11, day=12)) ==
+ repr(Timestamp('20151112')))
+
+ assert (repr(Timestamp(year=2015, month=11, day=12, hour=1, minute=2,
+ second=3, microsecond=999999)) ==
+ repr(Timestamp('2015-11-12 01:02:03.999999')))
+
+ def test_constructor_fromordinal(self):
+ base = datetime(2000, 1, 1)
+
+ ts = Timestamp.fromordinal(base.toordinal(), freq='D')
+ assert base == ts
+ assert ts.freq == 'D'
+ assert base.toordinal() == ts.toordinal()
+
+ ts = Timestamp.fromordinal(base.toordinal(), tz='US/Eastern')
+ assert Timestamp('2000-01-01', tz='US/Eastern') == ts
+ assert base.toordinal() == ts.toordinal()
+
+ # GH#3042
+ dt = datetime(2011, 4, 16, 0, 0)
+ ts = Timestamp.fromordinal(dt.toordinal())
+ assert ts.to_pydatetime() == dt
+
+ # with a tzinfo
+ stamp = Timestamp('2011-4-16', tz='US/Eastern')
+ dt_tz = stamp.to_pydatetime()
+ ts = Timestamp.fromordinal(dt_tz.toordinal(), tz='US/Eastern')
+ assert ts.to_pydatetime() == dt_tz
+
+ @pytest.mark.parametrize('result', [
+ Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1),
+ Timestamp(year=2000, month=1, day=2, hour=3, minute=4, second=5,
+ microsecond=6, nanosecond=1),
+ Timestamp(year=2000, month=1, day=2, hour=3, minute=4, second=5,
+ microsecond=6, nanosecond=1, tz='UTC'),
+ Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None),
+ Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC)])
+ def test_constructor_nanosecond(self, result):
+ # GH 18898
+ expected = Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), tz=result.tz)
+ expected = expected + Timedelta(nanoseconds=1)
+ assert result == expected
+
+ @pytest.mark.parametrize('z', ['Z0', 'Z00'])
+ def test_constructor_invalid_Z0_isostring(self, z):
+ # GH 8910
+ with pytest.raises(ValueError):
+ Timestamp('2014-11-02 01:00{}'.format(z))
+
+ @pytest.mark.parametrize('arg', ['year', 'month', 'day', 'hour', 'minute',
+ 'second', 'microsecond', 'nanosecond'])
+ def test_invalid_date_kwarg_with_string_input(self, arg):
+ kwarg = {arg: 1}
+ with pytest.raises(ValueError):
+ Timestamp('2010-10-10 12:59:59.999999999', **kwarg)
+
+ def test_out_of_bounds_value(self):
+ one_us = np.timedelta64(1).astype('timedelta64[us]')
+
+        # By definition we can't go out of bounds in [ns], so convert
+        # the datetime64s to [us], where going out of bounds is possible
+ min_ts_us = np.datetime64(Timestamp.min).astype('M8[us]')
+ max_ts_us = np.datetime64(Timestamp.max).astype('M8[us]')
+
+ # No error for the min/max datetimes
+ Timestamp(min_ts_us)
+ Timestamp(max_ts_us)
+
+ # One us less than the minimum is an error
+ with pytest.raises(ValueError):
+ Timestamp(min_ts_us - one_us)
+
+ # One us more than the maximum is an error
+ with pytest.raises(ValueError):
+ Timestamp(max_ts_us + one_us)
+
+ def test_out_of_bounds_string(self):
+ with pytest.raises(ValueError):
+ Timestamp('1676-01-01')
+ with pytest.raises(ValueError):
+ Timestamp('2263-01-01')
+
+ def test_barely_out_of_bounds(self):
+ # GH#19529
+ # GH#19382 close enough to bounds that dropping nanos would result
+ # in an in-bounds datetime
+ with pytest.raises(OutOfBoundsDatetime):
+ Timestamp('2262-04-11 23:47:16.854775808')
+
+ def test_bounds_with_different_units(self):
+ out_of_bounds_dates = ('1677-09-21', '2262-04-12')
+
+ time_units = ('D', 'h', 'm', 's', 'ms', 'us')
+
+ for date_string in out_of_bounds_dates:
+ for unit in time_units:
+                dt64 = np.datetime64(date_string, unit)
+ with pytest.raises(ValueError):
+ Timestamp(dt64)
+
+ in_bounds_dates = ('1677-09-23', '2262-04-11')
+
+ for date_string in in_bounds_dates:
+ for unit in time_units:
+                dt64 = np.datetime64(date_string, unit)
+ Timestamp(dt64)
+
+ def test_min_valid(self):
+ # Ensure that Timestamp.min is a valid Timestamp
+ Timestamp(Timestamp.min)
+
+ def test_max_valid(self):
+ # Ensure that Timestamp.max is a valid Timestamp
+ Timestamp(Timestamp.max)
+
+ def test_now(self):
+ # GH#9000
+ ts_from_string = Timestamp('now')
+ ts_from_method = Timestamp.now()
+ ts_datetime = datetime.now()
+
+ ts_from_string_tz = Timestamp('now', tz='US/Eastern')
+ ts_from_method_tz = Timestamp.now(tz='US/Eastern')
+
+ # Check that the delta between the times is less than 1s (arbitrarily
+ # small)
+ delta = Timedelta(seconds=1)
+ assert abs(ts_from_method - ts_from_string) < delta
+ assert abs(ts_datetime - ts_from_method) < delta
+ assert abs(ts_from_method_tz - ts_from_string_tz) < delta
+ assert (abs(ts_from_string_tz.tz_localize(None) -
+ ts_from_method_tz.tz_localize(None)) < delta)
+
+ def test_today(self):
+ ts_from_string = Timestamp('today')
+ ts_from_method = Timestamp.today()
+ ts_datetime = datetime.today()
+
+ ts_from_string_tz = Timestamp('today', tz='US/Eastern')
+ ts_from_method_tz = Timestamp.today(tz='US/Eastern')
+
+ # Check that the delta between the times is less than 1s (arbitrarily
+ # small)
+ delta = Timedelta(seconds=1)
+ assert abs(ts_from_method - ts_from_string) < delta
+ assert abs(ts_datetime - ts_from_method) < delta
+ assert abs(ts_from_method_tz - ts_from_string_tz) < delta
+ assert (abs(ts_from_string_tz.tz_localize(None) -
+ ts_from_method_tz.tz_localize(None)) < delta)
+
+ @pytest.mark.parametrize('tz', [None, pytz.timezone('US/Pacific')])
+ def test_disallow_setting_tz(self, tz):
+ # GH 3746
+ ts = Timestamp('2010')
+ with pytest.raises(AttributeError):
+ ts.tz = tz
+
+ @pytest.mark.parametrize('offset', ['+0300', '+0200'])
+ def test_construct_timestamp_near_dst(self, offset):
+ # GH 20854
+ expected = Timestamp('2016-10-30 03:00:00{}'.format(offset),
+ tz='Europe/Helsinki')
+ result = Timestamp(expected).tz_convert('Europe/Helsinki')
+ assert result == expected
+
+ @pytest.mark.parametrize('arg', [
+ '2013/01/01 00:00:00+09:00', '2013-01-01 00:00:00+09:00'])
+ def test_construct_with_different_string_format(self, arg):
+ # GH 12064
+ result = Timestamp(arg)
+ expected = Timestamp(datetime(2013, 1, 1), tz=pytz.FixedOffset(540))
+ assert result == expected
+
+ def test_construct_timestamp_preserve_original_frequency(self):
+ # GH 22311
+ result = Timestamp(Timestamp('2010-08-08', freq='D')).freq
+ expected = offsets.Day()
+ assert result == expected
+
+ def test_constructor_invalid_frequency(self):
+ # GH 22311
+ with pytest.raises(ValueError, match="Invalid frequency:"):
+ Timestamp('2012-01-01', freq=[])
+
+ @pytest.mark.parametrize('box', [datetime, Timestamp])
+ def test_depreciate_tz_and_tzinfo_in_datetime_input(self, box):
+ # GH 23579
+ kwargs = {'year': 2018, 'month': 1, 'day': 1, 'tzinfo': utc}
+ with tm.assert_produces_warning(FutureWarning):
+ Timestamp(box(**kwargs), tz='US/Pacific')
+
+ def test_dont_convert_dateutil_utc_to_pytz_utc(self):
+ result = Timestamp(datetime(2018, 1, 1), tz=tzutc())
+ expected = Timestamp(datetime(2018, 1, 1)).tz_localize(tzutc())
+ assert result == expected
+
+
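+# A minimal standalone sketch of the equivalence checked by
+# test_constructor_tz_or_tzinfo above (assumes only the public pandas and
+# pytz APIs): the tz= and tzinfo= spellings construct equal timestamps.
+def _demo_tz_kwarg_sketch():
+    from pytz import utc
+
+    from pandas import Timestamp
+
+    a = Timestamp(year=2017, month=10, day=22, tz='UTC')
+    b = Timestamp(year=2017, month=10, day=22, tzinfo=utc)
+    assert a == b
+
+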
+class TestTimestamp(object):
+
+ def test_tz(self):
+ tstr = '2014-02-01 09:00'
+ ts = Timestamp(tstr)
+ local = ts.tz_localize('Asia/Tokyo')
+ assert local.hour == 9
+ assert local == Timestamp(tstr, tz='Asia/Tokyo')
+ conv = local.tz_convert('US/Eastern')
+ assert conv == Timestamp('2014-01-31 19:00', tz='US/Eastern')
+ assert conv.hour == 19
+
+ # preserves nanosecond
+ ts = Timestamp(tstr) + offsets.Nano(5)
+ local = ts.tz_localize('Asia/Tokyo')
+ assert local.hour == 9
+ assert local.nanosecond == 5
+ conv = local.tz_convert('US/Eastern')
+ assert conv.nanosecond == 5
+ assert conv.hour == 19
+
+ def test_utc_z_designator(self):
+ assert get_timezone(Timestamp('2014-11-02 01:00Z').tzinfo) is utc
+
+ def test_asm8(self):
+ np.random.seed(7960929)
+ ns = [Timestamp.min.value, Timestamp.max.value, 1000]
+
+ for n in ns:
+ assert (Timestamp(n).asm8.view('i8') ==
+ np.datetime64(n, 'ns').view('i8') == n)
+
+ assert (Timestamp('nat').asm8.view('i8') ==
+ np.datetime64('nat', 'ns').view('i8'))
+
+ def test_class_ops_pytz(self):
+ def compare(x, y):
+ assert (int(Timestamp(x).value / 1e9) ==
+ int(Timestamp(y).value / 1e9))
+
+ compare(Timestamp.now(), datetime.now())
+ compare(Timestamp.now('UTC'), datetime.now(timezone('UTC')))
+ compare(Timestamp.utcnow(), datetime.utcnow())
+ compare(Timestamp.today(), datetime.today())
+ current_time = calendar.timegm(datetime.now().utctimetuple())
+ compare(Timestamp.utcfromtimestamp(current_time),
+ datetime.utcfromtimestamp(current_time))
+ compare(Timestamp.fromtimestamp(current_time),
+ datetime.fromtimestamp(current_time))
+
+ date_component = datetime.utcnow()
+ time_component = (date_component + timedelta(minutes=10)).time()
+ compare(Timestamp.combine(date_component, time_component),
+ datetime.combine(date_component, time_component))
+
+ def test_class_ops_dateutil(self):
+ def compare(x, y):
+ assert (int(np.round(Timestamp(x).value / 1e9)) ==
+ int(np.round(Timestamp(y).value / 1e9)))
+
+ compare(Timestamp.now(), datetime.now())
+ compare(Timestamp.now('UTC'), datetime.now(tzutc()))
+ compare(Timestamp.utcnow(), datetime.utcnow())
+ compare(Timestamp.today(), datetime.today())
+ current_time = calendar.timegm(datetime.now().utctimetuple())
+ compare(Timestamp.utcfromtimestamp(current_time),
+ datetime.utcfromtimestamp(current_time))
+ compare(Timestamp.fromtimestamp(current_time),
+ datetime.fromtimestamp(current_time))
+
+ date_component = datetime.utcnow()
+ time_component = (date_component + timedelta(minutes=10)).time()
+ compare(Timestamp.combine(date_component, time_component),
+ datetime.combine(date_component, time_component))
+
+ def test_basics_nanos(self):
+ val = np.int64(946684800000000000).view('M8[ns]')
+ stamp = Timestamp(val.view('i8') + 500)
+ assert stamp.year == 2000
+ assert stamp.month == 1
+ assert stamp.microsecond == 0
+ assert stamp.nanosecond == 500
+
+ # GH 14415
+ val = np.iinfo(np.int64).min + 80000000000000
+ stamp = Timestamp(val)
+ assert stamp.year == 1677
+ assert stamp.month == 9
+ assert stamp.day == 21
+ assert stamp.microsecond == 145224
+ assert stamp.nanosecond == 192
+
+ @pytest.mark.parametrize('value, check_kwargs', [
+ [946688461000000000, {}],
+ [946688461000000000 / long(1000), dict(unit='us')],
+ [946688461000000000 / long(1000000), dict(unit='ms')],
+ [946688461000000000 / long(1000000000), dict(unit='s')],
+ [10957, dict(unit='D', h=0)],
+ pytest.param((946688461000000000 + 500000) / long(1000000000),
+ dict(unit='s', us=499, ns=964),
+ marks=pytest.mark.skipif(not PY3,
+ reason='using truediv, so these'
+ ' are like floats')),
+ pytest.param((946688461000000000 + 500000000) / long(1000000000),
+ dict(unit='s', us=500000),
+ marks=pytest.mark.skipif(not PY3,
+ reason='using truediv, so these'
+ ' are like floats')),
+ pytest.param((946688461000000000 + 500000) / long(1000000),
+ dict(unit='ms', us=500),
+ marks=pytest.mark.skipif(not PY3,
+ reason='using truediv, so these'
+ ' are like floats')),
+ pytest.param((946688461000000000 + 500000) / long(1000000000),
+ dict(unit='s'),
+ marks=pytest.mark.skipif(PY3,
+ reason='get chopped in py2')),
+ pytest.param((946688461000000000 + 500000000) / long(1000000000),
+ dict(unit='s'),
+ marks=pytest.mark.skipif(PY3,
+ reason='get chopped in py2')),
+ pytest.param((946688461000000000 + 500000) / long(1000000),
+ dict(unit='ms'),
+ marks=pytest.mark.skipif(PY3,
+ reason='get chopped in py2')),
+ [(946688461000000000 + 500000) / long(1000), dict(unit='us', us=500)],
+ [(946688461000000000 + 500000000) / long(1000000),
+ dict(unit='ms', us=500000)],
+ [946688461000000000 / 1000.0 + 5, dict(unit='us', us=5)],
+ [946688461000000000 / 1000.0 + 5000, dict(unit='us', us=5000)],
+ [946688461000000000 / 1000000.0 + 0.5, dict(unit='ms', us=500)],
+ [946688461000000000 / 1000000.0 + 0.005, dict(unit='ms', us=5, ns=5)],
+ [946688461000000000 / 1000000000.0 + 0.5, dict(unit='s', us=500000)],
+ [10957 + 0.5, dict(unit='D', h=12)]])
+ def test_unit(self, value, check_kwargs):
+ def check(value, unit=None, h=1, s=1, us=0, ns=0):
+ stamp = Timestamp(value, unit=unit)
+ assert stamp.year == 2000
+ assert stamp.month == 1
+ assert stamp.day == 1
+ assert stamp.hour == h
+ if unit != 'D':
+ assert stamp.minute == 1
+ assert stamp.second == s
+ assert stamp.microsecond == us
+ else:
+ assert stamp.minute == 0
+ assert stamp.second == 0
+ assert stamp.microsecond == 0
+ assert stamp.nanosecond == ns
+
+ check(value, **check_kwargs)
+
+ def test_roundtrip(self):
+
+ # test value to string and back conversions
+ # further test accessors
+ base = Timestamp('20140101 00:00:00')
+
+ result = Timestamp(base.value + Timedelta('5ms').value)
+ assert result == Timestamp(str(base) + ".005000")
+ assert result.microsecond == 5000
+
+ result = Timestamp(base.value + Timedelta('5us').value)
+ assert result == Timestamp(str(base) + ".000005")
+ assert result.microsecond == 5
+
+ result = Timestamp(base.value + Timedelta('5ns').value)
+ assert result == Timestamp(str(base) + ".000000005")
+ assert result.nanosecond == 5
+ assert result.microsecond == 0
+
+ result = Timestamp(base.value + Timedelta('6ms 5us').value)
+ assert result == Timestamp(str(base) + ".006005")
+ assert result.microsecond == 5 + 6 * 1000
+
+ result = Timestamp(base.value + Timedelta('200ms 5us').value)
+ assert result == Timestamp(str(base) + ".200005")
+ assert result.microsecond == 5 + 200 * 1000
+
+ def test_hash_equivalent(self):
+ d = {datetime(2011, 1, 1): 5}
+ stamp = Timestamp(datetime(2011, 1, 1))
+ assert d[stamp] == 5
+
+
+class TestTimestampNsOperations(object):
+
+ def setup_method(self, method):
+ self.timestamp = Timestamp(datetime.utcnow())
+
+ def assert_ns_timedelta(self, modified_timestamp, expected_value):
+ value = self.timestamp.value
+ modified_value = modified_timestamp.value
+
+ assert modified_value - value == expected_value
+
+ def test_timedelta_ns_arithmetic(self):
+ self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'ns'),
+ -123)
+
+ def test_timedelta_ns_based_arithmetic(self):
+ self.assert_ns_timedelta(self.timestamp + np.timedelta64(
+ 1234567898, 'ns'), 1234567898)
+
+ def test_timedelta_us_arithmetic(self):
+ self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'us'),
+ -123000)
+
+ def test_timedelta_ms_arithmetic(self):
+ time = self.timestamp + np.timedelta64(-123, 'ms')
+ self.assert_ns_timedelta(time, -123000000)
+
+ def test_nanosecond_string_parsing(self):
+ ts = Timestamp('2013-05-01 07:15:45.123456789')
+ # GH 7878
+ expected_repr = '2013-05-01 07:15:45.123456789'
+ expected_value = 1367392545123456789
+ assert ts.value == expected_value
+ assert expected_repr in repr(ts)
+
+ ts = Timestamp('2013-05-01 07:15:45.123456789+09:00', tz='Asia/Tokyo')
+ assert ts.value == expected_value - 9 * 3600 * 1000000000
+ assert expected_repr in repr(ts)
+
+ ts = Timestamp('2013-05-01 07:15:45.123456789', tz='UTC')
+ assert ts.value == expected_value
+ assert expected_repr in repr(ts)
+
+ ts = Timestamp('2013-05-01 07:15:45.123456789', tz='US/Eastern')
+ assert ts.value == expected_value + 4 * 3600 * 1000000000
+ assert expected_repr in repr(ts)
+
+ # GH 10041
+ ts = Timestamp('20130501T071545.123456789')
+ assert ts.value == expected_value
+ assert expected_repr in repr(ts)
+
+ def test_nanosecond_timestamp(self):
+ # GH 7610
+ expected = 1293840000000000005
+ t = Timestamp('2011-01-01') + offsets.Nano(5)
+ assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')"
+ assert t.value == expected
+ assert t.nanosecond == 5
+
+ t = Timestamp(t)
+ assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')"
+ assert t.value == expected
+ assert t.nanosecond == 5
+
+ t = Timestamp(np_datetime64_compat('2011-01-01 00:00:00.000000005Z'))
+ assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')"
+ assert t.value == expected
+ assert t.nanosecond == 5
+
+ expected = 1293840000000000010
+ t = t + offsets.Nano(5)
+ assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000010')"
+ assert t.value == expected
+ assert t.nanosecond == 10
+
+ t = Timestamp(t)
+ assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000010')"
+ assert t.value == expected
+ assert t.nanosecond == 10
+
+ t = Timestamp(np_datetime64_compat('2011-01-01 00:00:00.000000010Z'))
+ assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000010')"
+ assert t.value == expected
+ assert t.nanosecond == 10
+
+
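+# A minimal standalone sketch of the nanosecond handling exercised above
+# (assumes only the public pandas API): offsets.Nano carries
+# sub-microsecond precision that datetime.datetime cannot represent.
+def _demo_nanosecond_sketch():
+    from pandas import Timestamp
+    from pandas.tseries import offsets
+
+    t = Timestamp('2011-01-01') + offsets.Nano(5)
+    assert t.nanosecond == 5
+    assert t.value == 1293840000000000005
+
+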
+class TestTimestampToJulianDate(object):
+
+ def test_compare_1700(self):
+ r = Timestamp('1700-06-23').to_julian_date()
+ assert r == 2342145.5
+
+ def test_compare_2000(self):
+ r = Timestamp('2000-04-12').to_julian_date()
+ assert r == 2451646.5
+
+ def test_compare_2100(self):
+ r = Timestamp('2100-08-12').to_julian_date()
+ assert r == 2488292.5
+
+ def test_compare_hour01(self):
+ r = Timestamp('2000-08-12T01:00:00').to_julian_date()
+ assert r == 2451768.5416666666666666
+
+ def test_compare_hour13(self):
+ r = Timestamp('2000-08-12T13:00:00').to_julian_date()
+ assert r == 2451769.0416666666666666
+
+
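+# A minimal standalone sketch of the convention TestTimestampToJulianDate
+# checks above (assumes only the public pandas API): Julian dates count
+# days from the Julian epoch, so civil midnight lands on a .5 boundary
+# and each hour adds 1/24 of a day.
+def _demo_julian_date_sketch():
+    from pandas import Timestamp
+
+    assert Timestamp('2000-01-01').to_julian_date() == 2451544.5
+    jd0 = Timestamp('2000-08-12').to_julian_date()
+    jd1 = Timestamp('2000-08-12T01:00:00').to_julian_date()
+    assert abs((jd1 - jd0) - 1.0 / 24) < 1e-9
+
+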
+class TestTimestampConversion(object):
+ def test_conversion(self):
+ # GH#9255
+ ts = Timestamp('2000-01-01')
+
+ result = ts.to_pydatetime()
+ expected = datetime(2000, 1, 1)
+ assert result == expected
+ assert type(result) == type(expected)
+
+ result = ts.to_datetime64()
+ expected = np.datetime64(ts.value, 'ns')
+ assert result == expected
+ assert type(result) == type(expected)
+ assert result.dtype == expected.dtype
+
+ def test_to_pydatetime_nonzero_nano(self):
+ ts = Timestamp('2011-01-01 9:00:00.123456789')
+
+ # Warn the user of data loss (nanoseconds).
+ with tm.assert_produces_warning(UserWarning,
+ check_stacklevel=False):
+ expected = datetime(2011, 1, 1, 9, 0, 0, 123456)
+ result = ts.to_pydatetime()
+ assert result == expected
+
+ def test_timestamp_to_datetime(self):
+ stamp = Timestamp('20090415', tz='US/Eastern', freq='D')
+ dtval = stamp.to_pydatetime()
+ assert stamp == dtval
+ assert stamp.tzinfo == dtval.tzinfo
+
+ def test_timestamp_to_datetime_dateutil(self):
+ stamp = Timestamp('20090415', tz='dateutil/US/Eastern', freq='D')
+ dtval = stamp.to_pydatetime()
+ assert stamp == dtval
+ assert stamp.tzinfo == dtval.tzinfo
+
+ def test_timestamp_to_datetime_explicit_pytz(self):
+ stamp = Timestamp('20090415', tz=pytz.timezone('US/Eastern'), freq='D')
+ dtval = stamp.to_pydatetime()
+ assert stamp == dtval
+ assert stamp.tzinfo == dtval.tzinfo
+
+ @td.skip_if_windows_python_3
+ def test_timestamp_to_datetime_explicit_dateutil(self):
+ stamp = Timestamp('20090415', tz=gettz('US/Eastern'), freq='D')
+ dtval = stamp.to_pydatetime()
+ assert stamp == dtval
+ assert stamp.tzinfo == dtval.tzinfo
+
+ def test_to_datetime_bijective(self):
+ # Ensure that converting to datetime and back only loses precision
+ # by going from nanoseconds to microseconds.
+ exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning
+ with tm.assert_produces_warning(exp_warning, check_stacklevel=False):
+ assert (Timestamp(Timestamp.max.to_pydatetime()).value / 1000 ==
+ Timestamp.max.value / 1000)
+
+ exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning
+ with tm.assert_produces_warning(exp_warning, check_stacklevel=False):
+ assert (Timestamp(Timestamp.min.to_pydatetime()).value / 1000 ==
+ Timestamp.min.value / 1000)
+
+ def test_to_period_tz_warning(self):
+ # GH#21333 make sure a warning is issued when timezone
+ # info is lost
+ ts = Timestamp('2009-04-15 16:17:18', tz='US/Eastern')
+ with tm.assert_produces_warning(UserWarning):
+ # warning that timezone info will be lost
+ ts.to_period('D')
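+
+
+# A minimal standalone sketch of the unit= handling test_unit exercises
+# above (assumes only the public pandas API): the unit keyword scales a
+# numeric input, so one instant can be spelled in seconds or milliseconds.
+def _demo_unit_constructor_sketch():
+    from pandas import Timestamp
+
+    from_seconds = Timestamp(946684800, unit='s')
+    from_millis = Timestamp(946684800000, unit='ms')
+    assert from_seconds == from_millis == Timestamp('2000-01-01')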
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_timezones.py b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_timezones.py
new file mode 100644
index 00000000000..bc67a3e72f8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_timezones.py
@@ -0,0 +1,389 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for Timestamp timezone-related methods
+"""
+from datetime import date, datetime, timedelta
+from distutils.version import LooseVersion
+
+import dateutil
+from dateutil.tz import gettz, tzoffset
+import pytest
+import pytz
+from pytz.exceptions import AmbiguousTimeError, NonExistentTimeError
+
+from pandas._libs.tslibs import timezones
+from pandas.errors import OutOfBoundsDatetime
+import pandas.util._test_decorators as td
+
+from pandas import NaT, Timestamp
+import pandas.util.testing as tm
+
+
+class TestTimestampTZOperations(object):
+ # --------------------------------------------------------------
+ # Timestamp.tz_localize
+
+ def test_tz_localize_pushes_out_of_bounds(self):
+ # GH#12677
+ # tz_localize that pushes away from the boundary is OK
+ pac = Timestamp.min.tz_localize('US/Pacific')
+ assert pac.value > Timestamp.min.value
+ pac.tz_convert('Asia/Tokyo') # tz_convert doesn't change value
+ with pytest.raises(OutOfBoundsDatetime):
+ Timestamp.min.tz_localize('Asia/Tokyo')
+
+ # tz_localize that pushes away from the boundary is OK
+ tokyo = Timestamp.max.tz_localize('Asia/Tokyo')
+ assert tokyo.value < Timestamp.max.value
+ tokyo.tz_convert('US/Pacific') # tz_convert doesn't change value
+ with pytest.raises(OutOfBoundsDatetime):
+ Timestamp.max.tz_localize('US/Pacific')
+
+ def test_tz_localize_ambiguous_bool(self):
+ # make sure that we are correctly accepting bool values as ambiguous
+ # GH#14402
+ ts = Timestamp('2015-11-01 01:00:03')
+ expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central')
+ expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central')
+
+ with pytest.raises(pytz.AmbiguousTimeError):
+ ts.tz_localize('US/Central')
+
+ result = ts.tz_localize('US/Central', ambiguous=True)
+ assert result == expected0
+
+ result = ts.tz_localize('US/Central', ambiguous=False)
+ assert result == expected1
+
+ def test_tz_localize_ambiguous(self):
+ ts = Timestamp('2014-11-02 01:00')
+ ts_dst = ts.tz_localize('US/Eastern', ambiguous=True)
+ ts_no_dst = ts.tz_localize('US/Eastern', ambiguous=False)
+
+ assert (ts_no_dst.value - ts_dst.value) / 1e9 == 3600
+ with pytest.raises(ValueError):
+ ts.tz_localize('US/Eastern', ambiguous='infer')
+
+ # GH#8025
+ msg = ('Cannot localize tz-aware Timestamp, '
+ 'use tz_convert for conversions')
+ with pytest.raises(TypeError, match=msg):
+ Timestamp('2011-01-01', tz='US/Eastern').tz_localize('Asia/Tokyo')
+
+ msg = ('Cannot convert tz-naive Timestamp, '
+ 'use tz_localize to localize')
+ with pytest.raises(TypeError, match=msg):
+ Timestamp('2011-01-01').tz_convert('Asia/Tokyo')
+
+ @pytest.mark.parametrize('stamp, tz', [
+ ('2015-03-08 02:00', 'US/Eastern'),
+ ('2015-03-08 02:30', 'US/Pacific'),
+ ('2015-03-29 02:00', 'Europe/Paris'),
+ ('2015-03-29 02:30', 'Europe/Belgrade')])
+ @pytest.mark.filterwarnings('ignore::FutureWarning')
+ def test_tz_localize_nonexistent(self, stamp, tz):
+ # GH#13057
+ ts = Timestamp(stamp)
+ with pytest.raises(NonExistentTimeError):
+ ts.tz_localize(tz)
+ # GH 22644
+ with pytest.raises(NonExistentTimeError):
+ with tm.assert_produces_warning(FutureWarning):
+ ts.tz_localize(tz, errors='raise')
+ with tm.assert_produces_warning(FutureWarning):
+ assert ts.tz_localize(tz, errors='coerce') is NaT
+
+ def test_tz_localize_errors_ambiguous(self):
+ # GH#13057
+ ts = Timestamp('2015-11-1 01:00')
+ with pytest.raises(AmbiguousTimeError):
+ with tm.assert_produces_warning(FutureWarning):
+ ts.tz_localize('US/Pacific', errors='coerce')
+
+ @pytest.mark.filterwarnings('ignore::FutureWarning')
+ def test_tz_localize_errors_invalid_arg(self):
+ # GH 22644
+ tz = 'Europe/Warsaw'
+ ts = Timestamp('2015-03-29 02:00:00')
+ with pytest.raises(ValueError):
+ with tm.assert_produces_warning(FutureWarning):
+ ts.tz_localize(tz, errors='foo')
+
+ def test_tz_localize_errors_coerce(self):
+ # GH 22644
+ # make sure errors='coerce' gets mapped correctly to nonexistent
+ tz = 'Europe/Warsaw'
+ ts = Timestamp('2015-03-29 02:00:00')
+ with tm.assert_produces_warning(FutureWarning):
+ result = ts.tz_localize(tz, errors='coerce')
+ expected = ts.tz_localize(tz, nonexistent='NaT')
+ assert result is expected
+
+ @pytest.mark.parametrize('stamp', ['2014-02-01 09:00', '2014-07-08 09:00',
+ '2014-11-01 17:00', '2014-11-05 00:00'])
+ def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture):
+ tz = tz_aware_fixture
+ ts = Timestamp(stamp)
+ localized = ts.tz_localize(tz)
+ assert localized == Timestamp(stamp, tz=tz)
+
+ with pytest.raises(TypeError):
+ localized.tz_localize(tz)
+
+ reset = localized.tz_localize(None)
+ assert reset == ts
+ assert reset.tzinfo is None
+
+ def test_tz_localize_ambiguous_compat(self):
+ # validate that pytz and dateutil are compat for dst
+ # when the transition happens
+ naive = Timestamp('2013-10-27 01:00:00')
+
+ pytz_zone = 'Europe/London'
+ dateutil_zone = 'dateutil/Europe/London'
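+        # 2013-10-27 01:00 occurs twice in Europe/London (BST falls back to GMT)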
+ result_pytz = naive.tz_localize(pytz_zone, ambiguous=0)
+ result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=0)
+ assert result_pytz.value == result_dateutil.value
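+        # ambiguous=0 selects standard time (GMT), i.e. 2013-10-27 01:00 UTC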
+ assert result_pytz.value == 1382835600000000000
+
+ if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'):
+ # dateutil 2.6 buggy w.r.t. ambiguous=0
+ # see gh-14621
+ # see https://github.com/dateutil/dateutil/issues/321
+ assert (result_pytz.to_pydatetime().tzname() ==
+ result_dateutil.to_pydatetime().tzname())
+ assert str(result_pytz) == str(result_dateutil)
+ elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'):
+ # fixed ambiguous behavior
+ assert result_pytz.to_pydatetime().tzname() == 'GMT'
+ assert result_dateutil.to_pydatetime().tzname() == 'BST'
+ assert str(result_pytz) != str(result_dateutil)
+
+ # 1 hour difference
+ result_pytz = naive.tz_localize(pytz_zone, ambiguous=1)
+ result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=1)
+ assert result_pytz.value == result_dateutil.value
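+        # ambiguous=1 selects DST (BST, UTC+1), i.e. 2013-10-27 00:00 UTC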
+ assert result_pytz.value == 1382832000000000000
+
+ # dateutil < 2.6 is buggy w.r.t. ambiguous timezones
+ if LooseVersion(dateutil.__version__) > LooseVersion('2.5.3'):
+ # see gh-14621
+ assert str(result_pytz) == str(result_dateutil)
+ assert (result_pytz.to_pydatetime().tzname() ==
+ result_dateutil.to_pydatetime().tzname())
+
+ @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'),
+ gettz('US/Eastern'),
+ 'US/Eastern', 'dateutil/US/Eastern'])
+ def test_timestamp_tz_localize(self, tz):
+ stamp = Timestamp('3/11/2012 04:00')
+
+ result = stamp.tz_localize(tz)
+ expected = Timestamp('3/11/2012 04:00', tz=tz)
+ assert result.hour == expected.hour
+ assert result == expected
+
+ @pytest.mark.parametrize('start_ts, tz, end_ts, shift', [
+ ['2015-03-29 02:20:00', 'Europe/Warsaw', '2015-03-29 03:00:00',
+ 'forward'],
+ ['2015-03-29 02:20:00', 'Europe/Warsaw',
+ '2015-03-29 01:59:59.999999999', 'backward'],
+ ['2015-03-29 02:20:00', 'Europe/Warsaw',
+ '2015-03-29 03:20:00', timedelta(hours=1)],
+ ['2015-03-29 02:20:00', 'Europe/Warsaw',
+ '2015-03-29 01:20:00', timedelta(hours=-1)],
+ ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:00:00',
+ 'forward'],
+ ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:59:59.999999999',
+ 'backward'],
+ ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:33:00',
+ timedelta(hours=1)],
+ ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:33:00',
+ timedelta(hours=-1)]
+ ])
+ @pytest.mark.parametrize('tz_type', ['', 'dateutil/'])
+ def test_timestamp_tz_localize_nonexistent_shift(self, start_ts, tz,
+ end_ts, shift,
+ tz_type):
+ # GH 8917, 24466
+ tz = tz_type + tz
+ if isinstance(shift, str):
+ shift = 'shift_' + shift
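+        # 'forward'/'backward' map to the 'shift_forward'/'shift_backward' options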
+ ts = Timestamp(start_ts)
+ result = ts.tz_localize(tz, nonexistent=shift)
+ expected = Timestamp(end_ts).tz_localize(tz)
+ assert result == expected
+
+ @pytest.mark.parametrize('offset', [-1, 1])
+ @pytest.mark.parametrize('tz_type', ['', 'dateutil/'])
+ def test_timestamp_tz_localize_nonexistent_shift_invalid(self, offset,
+ tz_type):
+ # GH 8917, 24466
+ tz = tz_type + 'Europe/Warsaw'
+ ts = Timestamp('2015-03-29 02:20:00')
+ msg = "The provided timedelta will relocalize on a nonexistent time"
+ with pytest.raises(ValueError, match=msg):
+ ts.tz_localize(tz, nonexistent=timedelta(seconds=offset))
+
+ @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw'])
+ def test_timestamp_tz_localize_nonexistent_NaT(self, tz):
+ # GH 8917
+ ts = Timestamp('2015-03-29 02:20:00')
+ result = ts.tz_localize(tz, nonexistent='NaT')
+ assert result is NaT
+
+ @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw'])
+ def test_timestamp_tz_localize_nonexistent_raise(self, tz):
+ # GH 8917
+ ts = Timestamp('2015-03-29 02:20:00')
+ with pytest.raises(pytz.NonExistentTimeError):
+ ts.tz_localize(tz, nonexistent='raise')
+ with pytest.raises(ValueError):
+ ts.tz_localize(tz, nonexistent='foo')
+
+ # ------------------------------------------------------------------
+ # Timestamp.tz_convert
+
+ @pytest.mark.parametrize('stamp', ['2014-02-01 09:00', '2014-07-08 09:00',
+ '2014-11-01 17:00', '2014-11-05 00:00'])
+ def test_tz_convert_roundtrip(self, stamp, tz_aware_fixture):
+ tz = tz_aware_fixture
+
+ ts = Timestamp(stamp, tz='UTC')
+ converted = ts.tz_convert(tz)
+
+ reset = converted.tz_convert(None)
+ assert reset == Timestamp(stamp)
+ assert reset.tzinfo is None
+ assert reset == converted.tz_convert('UTC').tz_localize(None)
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_astimezone(self, tzstr):
+ # astimezone is an alias for tz_convert, so keep it with
+ # the tz_convert tests
+ utcdate = Timestamp('3/11/2012 22:00', tz='UTC')
+ expected = utcdate.tz_convert(tzstr)
+ result = utcdate.astimezone(tzstr)
+ assert expected == result
+ assert isinstance(result, Timestamp)
+
+ @td.skip_if_windows
+ def test_tz_convert_utc_with_system_utc(self):
+ from pandas._libs.tslibs.timezones import maybe_get_tz
+
+ # from system utc to real utc
+ ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC'))
+ # check that the time hasn't changed.
+ assert ts == ts.tz_convert(dateutil.tz.tzutc())
+
+ # ------------------------------------------------------------------
+ # Timestamp.__init__ with tz str or tzinfo
+
+ def test_timestamp_constructor_tz_utc(self):
+ utc_stamp = Timestamp('3/11/2012 05:00', tz='utc')
+ assert utc_stamp.tzinfo is pytz.utc
+ assert utc_stamp.hour == 5
+
+ utc_stamp = Timestamp('3/11/2012 05:00').tz_localize('utc')
+ assert utc_stamp.hour == 5
+
+ def test_timestamp_to_datetime_tzoffset(self):
+ tzinfo = tzoffset(None, 7200)
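+        # tzoffset(None, 7200) is a fixed UTC+02:00 offset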
+ expected = Timestamp('3/11/2012 04:00', tz=tzinfo)
+ result = Timestamp(expected.to_pydatetime())
+ assert expected == result
+
+ def test_timestamp_constructor_near_dst_boundary(self):
+ # GH#11481 & GH#15777
+ # Naive string timestamps were being localized incorrectly
+ # with tz_convert_single instead of tz_localize_to_utc
+
+ for tz in ['Europe/Brussels', 'Europe/Prague']:
+ result = Timestamp('2015-10-25 01:00', tz=tz)
+ expected = Timestamp('2015-10-25 01:00').tz_localize(tz)
+ assert result == expected
+
+ with pytest.raises(pytz.AmbiguousTimeError):
+ Timestamp('2015-10-25 02:00', tz=tz)
+
+ result = Timestamp('2017-03-26 01:00', tz='Europe/Paris')
+ expected = Timestamp('2017-03-26 01:00').tz_localize('Europe/Paris')
+ assert result == expected
+
+ with pytest.raises(pytz.NonExistentTimeError):
+ Timestamp('2017-03-26 02:00', tz='Europe/Paris')
+
+ # GH#11708
+ naive = Timestamp('2015-11-18 10:00:00')
+ result = naive.tz_localize('UTC').tz_convert('Asia/Kolkata')
+ expected = Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata')
+ assert result == expected
+
+ # GH#15823
+ result = Timestamp('2017-03-26 00:00', tz='Europe/Paris')
+ expected = Timestamp('2017-03-26 00:00:00+0100', tz='Europe/Paris')
+ assert result == expected
+
+ result = Timestamp('2017-03-26 01:00', tz='Europe/Paris')
+ expected = Timestamp('2017-03-26 01:00:00+0100', tz='Europe/Paris')
+ assert result == expected
+
+ with pytest.raises(pytz.NonExistentTimeError):
+ Timestamp('2017-03-26 02:00', tz='Europe/Paris')
+
+ result = Timestamp('2017-03-26 02:00:00+0100', tz='Europe/Paris')
+ naive = Timestamp(result.value)
+ expected = naive.tz_localize('UTC').tz_convert('Europe/Paris')
+ assert result == expected
+
+ result = Timestamp('2017-03-26 03:00', tz='Europe/Paris')
+ expected = Timestamp('2017-03-26 03:00:00+0200', tz='Europe/Paris')
+ assert result == expected
+
+ @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'),
+ gettz('US/Eastern'),
+ 'US/Eastern', 'dateutil/US/Eastern'])
+ def test_timestamp_constructed_by_date_and_tz(self, tz):
+ # GH#2993, Timestamp cannot be constructed by datetime.date
+ # and tz correctly
+
+ result = Timestamp(date(2012, 3, 11), tz=tz)
+
+ expected = Timestamp('3/11/2012', tz=tz)
+ assert result.hour == expected.hour
+ assert result == expected
+
+ @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'),
+ gettz('US/Eastern'),
+ 'US/Eastern', 'dateutil/US/Eastern'])
+ def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz):
+ # GH#1389
+
+ # 4 hours before DST transition
+ stamp = Timestamp('3/10/2012 22:00', tz=tz)
+
+ result = stamp + timedelta(hours=6)
+
+        # spring forward: 6 elapsed hours cross the DST gap, so the
+        # local clock reads 7 hours later
+ expected = Timestamp('3/11/2012 05:00', tz=tz)
+
+ assert result == expected
+
+ def test_timestamp_timetz_equivalent_with_datetime_tz(self,
+ tz_naive_fixture):
+ # GH21358
+ tz = timezones.maybe_get_tz(tz_naive_fixture)
+
+ stamp = Timestamp('2018-06-04 10:20:30', tz=tz)
+ _datetime = datetime(2018, 6, 4, hour=10,
+ minute=20, second=30, tzinfo=tz)
+
+ result = stamp.timetz()
+ expected = _datetime.timetz()
+
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_unary_ops.py b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_unary_ops.py
new file mode 100644
index 00000000000..3f9a30d2541
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/scalar/timestamp/test_unary_ops.py
@@ -0,0 +1,364 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime
+
+from dateutil.tz import gettz
+import pytest
+import pytz
+from pytz import utc
+
+from pandas._libs.tslibs import conversion
+from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG
+from pandas.compat import PY3
+import pandas.util._test_decorators as td
+
+from pandas import NaT, Timestamp
+import pandas.util.testing as tm
+
+from pandas.tseries.frequencies import to_offset
+
+
+class TestTimestampUnaryOps(object):
+
+ # --------------------------------------------------------------
+ # Timestamp.round
+ @pytest.mark.parametrize('timestamp, freq, expected', [
+ ('20130101 09:10:11', 'D', '20130101'),
+ ('20130101 19:10:11', 'D', '20130102'),
+ ('20130201 12:00:00', 'D', '20130202'),
+ ('20130104 12:00:00', 'D', '20130105'),
+ ('2000-01-05 05:09:15.13', 'D', '2000-01-05 00:00:00'),
+ ('2000-01-05 05:09:15.13', 'H', '2000-01-05 05:00:00'),
+ ('2000-01-05 05:09:15.13', 'S', '2000-01-05 05:09:15')
+ ])
+ def test_round_frequencies(self, timestamp, freq, expected):
+ dt = Timestamp(timestamp)
+ result = dt.round(freq)
+ expected = Timestamp(expected)
+ assert result == expected
+
+ def test_round_tzaware(self):
+ dt = Timestamp('20130101 09:10:11', tz='US/Eastern')
+ result = dt.round('D')
+ expected = Timestamp('20130101', tz='US/Eastern')
+ assert result == expected
+
+ dt = Timestamp('20130101 09:10:11', tz='US/Eastern')
+ result = dt.round('s')
+ assert result == dt
+
+ def test_round_30min(self):
+ # round
+ dt = Timestamp('20130104 12:32:00')
+ result = dt.round('30Min')
+ expected = Timestamp('20130104 12:30:00')
+ assert result == expected
+
+ def test_round_subsecond(self):
+ # GH#14440 & GH#15578
+ result = Timestamp('2016-10-17 12:00:00.0015').round('ms')
+ expected = Timestamp('2016-10-17 12:00:00.002000')
+ assert result == expected
+
+ result = Timestamp('2016-10-17 12:00:00.00149').round('ms')
+ expected = Timestamp('2016-10-17 12:00:00.001000')
+ assert result == expected
+
+ ts = Timestamp('2016-10-17 12:00:00.0015')
+ for freq in ['us', 'ns']:
+ assert ts == ts.round(freq)
+
+ result = Timestamp('2016-10-17 12:00:00.001501031').round('10ns')
+ expected = Timestamp('2016-10-17 12:00:00.001501030')
+ assert result == expected
+
+ def test_round_nonstandard_freq(self):
+ with tm.assert_produces_warning(False):
+ Timestamp('2016-10-17 12:00:00.001501031').round('1010ns')
+
+ def test_round_invalid_arg(self):
+ stamp = Timestamp('2000-01-05 05:09:15.13')
+ with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG):
+ stamp.round('foo')
+
+ @pytest.mark.parametrize('test_input, rounder, freq, expected', [
+ ('2117-01-01 00:00:45', 'floor', '15s', '2117-01-01 00:00:45'),
+ ('2117-01-01 00:00:45', 'ceil', '15s', '2117-01-01 00:00:45'),
+ ('2117-01-01 00:00:45.000000012', 'floor', '10ns',
+ '2117-01-01 00:00:45.000000010'),
+ ('1823-01-01 00:00:01.000000012', 'ceil', '10ns',
+ '1823-01-01 00:00:01.000000020'),
+ ('1823-01-01 00:00:01', 'floor', '1s', '1823-01-01 00:00:01'),
+ ('1823-01-01 00:00:01', 'ceil', '1s', '1823-01-01 00:00:01'),
+ ('NaT', 'floor', '1s', 'NaT'),
+ ('NaT', 'ceil', '1s', 'NaT')
+ ])
+ def test_ceil_floor_edge(self, test_input, rounder, freq, expected):
+ dt = Timestamp(test_input)
+ func = getattr(dt, rounder)
+ result = func(freq)
+
+ if dt is NaT:
+ assert result is NaT
+ else:
+ expected = Timestamp(expected)
+ assert result == expected
+
+ @pytest.mark.parametrize('test_input, freq, expected', [
+ ('2018-01-01 00:02:06', '2s', '2018-01-01 00:02:06'),
+ ('2018-01-01 00:02:00', '2T', '2018-01-01 00:02:00'),
+ ('2018-01-01 00:04:00', '4T', '2018-01-01 00:04:00'),
+ ('2018-01-01 00:15:00', '15T', '2018-01-01 00:15:00'),
+ ('2018-01-01 00:20:00', '20T', '2018-01-01 00:20:00'),
+ ('2018-01-01 03:00:00', '3H', '2018-01-01 03:00:00'),
+ ])
+ @pytest.mark.parametrize('rounder', ['ceil', 'floor', 'round'])
+ def test_round_minute_freq(self, test_input, freq, expected, rounder):
+        # Ensure timestamps that shouldn't round don't!
+ # GH#21262
+
+ dt = Timestamp(test_input)
+ expected = Timestamp(expected)
+ func = getattr(dt, rounder)
+ result = func(freq)
+ assert result == expected
+
+ def test_ceil(self):
+ dt = Timestamp('20130101 09:10:11')
+ result = dt.ceil('D')
+ expected = Timestamp('20130102')
+ assert result == expected
+
+ def test_floor(self):
+ dt = Timestamp('20130101 09:10:11')
+ result = dt.floor('D')
+ expected = Timestamp('20130101')
+ assert result == expected
+
+ @pytest.mark.parametrize('method', ['ceil', 'round', 'floor'])
+ def test_round_dst_border_ambiguous(self, method):
+ # GH 18946 round near "fall back" DST
+ ts = Timestamp('2017-10-29 00:00:00', tz='UTC').tz_convert(
+ 'Europe/Madrid'
+ )
+ #
+ result = getattr(ts, method)('H', ambiguous=True)
+ assert result == ts
+
+ result = getattr(ts, method)('H', ambiguous=False)
+ expected = Timestamp('2017-10-29 01:00:00', tz='UTC').tz_convert(
+ 'Europe/Madrid'
+ )
+ assert result == expected
+
+ result = getattr(ts, method)('H', ambiguous='NaT')
+ assert result is NaT
+
+ with pytest.raises(pytz.AmbiguousTimeError):
+ getattr(ts, method)('H', ambiguous='raise')
+
+ @pytest.mark.parametrize('method, ts_str, freq', [
+ ['ceil', '2018-03-11 01:59:00-0600', '5min'],
+ ['round', '2018-03-11 01:59:00-0600', '5min'],
+ ['floor', '2018-03-11 03:01:00-0500', '2H']])
+ def test_round_dst_border_nonexistent(self, method, ts_str, freq):
+ # GH 23324 round near "spring forward" DST
+ ts = Timestamp(ts_str, tz='America/Chicago')
+ result = getattr(ts, method)(freq, nonexistent='shift_forward')
+ expected = Timestamp('2018-03-11 03:00:00', tz='America/Chicago')
+ assert result == expected
+
+ result = getattr(ts, method)(freq, nonexistent='NaT')
+ assert result is NaT
+
+ with pytest.raises(pytz.NonExistentTimeError,
+ match='2018-03-11 02:00:00'):
+ getattr(ts, method)(freq, nonexistent='raise')
+
+ @pytest.mark.parametrize('timestamp', [
+ '2018-01-01 0:0:0.124999360',
+ '2018-01-01 0:0:0.125000367',
+ '2018-01-01 0:0:0.125500',
+ '2018-01-01 0:0:0.126500',
+ '2018-01-01 12:00:00',
+ '2019-01-01 12:00:00',
+ ])
+ @pytest.mark.parametrize('freq', [
+ '2ns', '3ns', '4ns', '5ns', '6ns', '7ns',
+ '250ns', '500ns', '750ns',
+ '1us', '19us', '250us', '500us', '750us',
+ '1s', '2s', '3s',
+ '1D',
+ ])
+ def test_round_int64(self, timestamp, freq):
+ """check that all rounding modes are accurate to int64 precision
+ see GH#22591
+ """
+ dt = Timestamp(timestamp)
+ unit = to_offset(freq).nanos
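+        # the rounding unit expressed as a whole number of nanoseconds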
+
+ # test floor
+ result = dt.floor(freq)
+ assert result.value % unit == 0, "floor not a {} multiple".format(freq)
+ assert 0 <= dt.value - result.value < unit, "floor error"
+
+ # test ceil
+ result = dt.ceil(freq)
+ assert result.value % unit == 0, "ceil not a {} multiple".format(freq)
+ assert 0 <= result.value - dt.value < unit, "ceil error"
+
+ # test round
+ result = dt.round(freq)
+ assert result.value % unit == 0, "round not a {} multiple".format(freq)
+ assert abs(result.value - dt.value) <= unit // 2, "round error"
+ if unit % 2 == 0 and abs(result.value - dt.value) == unit // 2:
+ # round half to even
+ assert result.value // unit % 2 == 0, "round half to even error"
+
+ # --------------------------------------------------------------
+ # Timestamp.replace
+
+ def test_replace_naive(self):
+ # GH#14621, GH#7825
+ ts = Timestamp('2016-01-01 09:00:00')
+ result = ts.replace(hour=0)
+ expected = Timestamp('2016-01-01 00:00:00')
+ assert result == expected
+
+ def test_replace_aware(self, tz_aware_fixture):
+ tz = tz_aware_fixture
+ # GH#14621, GH#7825
+ # replacing datetime components with and w/o presence of a timezone
+ ts = Timestamp('2016-01-01 09:00:00', tz=tz)
+ result = ts.replace(hour=0)
+ expected = Timestamp('2016-01-01 00:00:00', tz=tz)
+ assert result == expected
+
+ def test_replace_preserves_nanos(self, tz_aware_fixture):
+ tz = tz_aware_fixture
+ # GH#14621, GH#7825
+ ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz)
+ result = ts.replace(hour=0)
+ expected = Timestamp('2016-01-01 00:00:00.000000123', tz=tz)
+ assert result == expected
+
+ def test_replace_multiple(self, tz_aware_fixture):
+ tz = tz_aware_fixture
+ # GH#14621, GH#7825
+ # replacing datetime components with and w/o presence of a timezone
+ # test all
+ ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz)
+ result = ts.replace(year=2015, month=2, day=2, hour=0, minute=5,
+ second=5, microsecond=5, nanosecond=5)
+ expected = Timestamp('2015-02-02 00:05:05.000005005', tz=tz)
+ assert result == expected
+
+ def test_replace_invalid_kwarg(self, tz_aware_fixture):
+ tz = tz_aware_fixture
+ # GH#14621, GH#7825
+ ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz)
+ with pytest.raises(TypeError):
+ ts.replace(foo=5)
+
+ def test_replace_integer_args(self, tz_aware_fixture):
+ tz = tz_aware_fixture
+ # GH#14621, GH#7825
+ ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz)
+ with pytest.raises(ValueError):
+ ts.replace(hour=0.1)
+
+ def test_replace_tzinfo_equiv_tz_localize_none(self):
+ # GH#14621, GH#7825
+ # assert conversion to naive is the same as replacing tzinfo with None
+ ts = Timestamp('2013-11-03 01:59:59.999999-0400', tz='US/Eastern')
+ assert ts.tz_localize(None) == ts.replace(tzinfo=None)
+
+ @td.skip_if_windows
+ def test_replace_tzinfo(self):
+ # GH#15683
+ dt = datetime(2016, 3, 27, 1)
+ tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo
+
+ result_dt = dt.replace(tzinfo=tzinfo)
+ result_pd = Timestamp(dt).replace(tzinfo=tzinfo)
+
+ if PY3:
+ # datetime.timestamp() converts in the local timezone
+ with tm.set_timezone('UTC'):
+ assert result_dt.timestamp() == result_pd.timestamp()
+
+ assert result_dt == result_pd
+ assert result_dt == result_pd.to_pydatetime()
+
+ result_dt = dt.replace(tzinfo=tzinfo).replace(tzinfo=None)
+ result_pd = Timestamp(dt).replace(tzinfo=tzinfo).replace(tzinfo=None)
+
+ if PY3:
+ # datetime.timestamp() converts in the local timezone
+ with tm.set_timezone('UTC'):
+ assert result_dt.timestamp() == result_pd.timestamp()
+
+ assert result_dt == result_pd
+ assert result_dt == result_pd.to_pydatetime()
+
+ @pytest.mark.parametrize('tz, normalize', [
+ (pytz.timezone('US/Eastern'), lambda x: x.tzinfo.normalize(x)),
+ (gettz('US/Eastern'), lambda x: x)])
+ def test_replace_across_dst(self, tz, normalize):
+ # GH#18319 check that 1) timezone is correctly normalized and
+ # 2) that hour is not incorrectly changed by this normalization
+ ts_naive = Timestamp('2017-12-03 16:03:30')
+ ts_aware = conversion.localize_pydatetime(ts_naive, tz)
+
+ # Preliminary sanity-check
+ assert ts_aware == normalize(ts_aware)
+
+ # Replace across DST boundary
+ ts2 = ts_aware.replace(month=6)
+
+ # Check that `replace` preserves hour literal
+ assert (ts2.hour, ts2.minute) == (ts_aware.hour, ts_aware.minute)
+
+ # Check that post-replace object is appropriately normalized
+ ts2b = normalize(ts2)
+ assert ts2 == ts2b
+
+ def test_replace_dst_border(self):
+ # Gh 7825
+ t = Timestamp('2013-11-3', tz='America/Chicago')
+ result = t.replace(hour=3)
+ expected = Timestamp('2013-11-3 03:00:00', tz='America/Chicago')
+ assert result == expected
+
+ # --------------------------------------------------------------
+ # Timestamp.normalize
+
+ @pytest.mark.parametrize('arg', ['2013-11-30', '2013-11-30 12:00:00'])
+ def test_normalize(self, tz_naive_fixture, arg):
+ tz = tz_naive_fixture
+ ts = Timestamp(arg, tz=tz)
+ result = ts.normalize()
+ expected = Timestamp('2013-11-30', tz=tz)
+ assert result == expected
+
+ # --------------------------------------------------------------
+
+ @td.skip_if_windows
+ def test_timestamp(self):
+ # GH#17329
+ # tz-naive --> treat it as if it were UTC for purposes of timestamp()
+ ts = Timestamp.now()
+ uts = ts.replace(tzinfo=utc)
+ assert ts.timestamp() == uts.timestamp()
+
+ tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central')
+ utsc = tsc.tz_convert('UTC')
+
+ # utsc is a different representation of the same time
+ assert tsc.timestamp() == utsc.timestamp()
+
+ if PY3:
+ # datetime.timestamp() converts in the local timezone
+ with tm.set_timezone('UTC'):
+ # should agree with datetime.timestamp method
+ dt = ts.to_pydatetime()
+ assert dt.timestamp() == ts.timestamp()
diff --git a/contrib/python/pandas/py2/pandas/tests/series/__init__.py b/contrib/python/pandas/py2/pandas/tests/series/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/series/common.py b/contrib/python/pandas/py2/pandas/tests/series/common.py
new file mode 100644
index 00000000000..cacca38b2d6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/common.py
@@ -0,0 +1,31 @@
+from pandas.util._decorators import cache_readonly
+
+import pandas as pd
+import pandas.util.testing as tm
+
+_ts = tm.makeTimeSeries()
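+# module-level template; the ts property below returns a named copy of it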
+
+
+class TestData(object):
+
+ @cache_readonly
+ def ts(self):
+ ts = _ts.copy()
+ ts.name = 'ts'
+ return ts
+
+ @cache_readonly
+ def series(self):
+ series = tm.makeStringSeries()
+ series.name = 'series'
+ return series
+
+ @cache_readonly
+ def objSeries(self):
+ objSeries = tm.makeObjectSeries()
+ objSeries.name = 'objects'
+ return objSeries
+
+ @cache_readonly
+ def empty(self):
+ return pd.Series([], index=[])
diff --git a/contrib/python/pandas/py2/pandas/tests/series/conftest.py b/contrib/python/pandas/py2/pandas/tests/series/conftest.py
new file mode 100644
index 00000000000..431aacb1c8d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/conftest.py
@@ -0,0 +1,42 @@
+import pytest
+
+from pandas import Series
+import pandas.util.testing as tm
+
+
+@pytest.fixture
+def datetime_series():
+ """
+ Fixture for Series of floats with DatetimeIndex
+ """
+ s = tm.makeTimeSeries()
+ s.name = 'ts'
+ return s
+
+
+@pytest.fixture
+def string_series():
+ """
+ Fixture for Series of floats with Index of unique strings
+ """
+ s = tm.makeStringSeries()
+ s.name = 'series'
+ return s
+
+
+@pytest.fixture
+def object_series():
+ """
+    Fixture for Series of datetime objects (dtype object) with Index of unique strings
+ """
+ s = tm.makeObjectSeries()
+ s.name = 'objects'
+ return s
+
+
+@pytest.fixture
+def empty_series():
+ """
+ Fixture for empty Series
+ """
+ return Series([], index=[])
diff --git a/contrib/python/pandas/py2/pandas/tests/series/indexing/__init__.py b/contrib/python/pandas/py2/pandas/tests/series/indexing/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/indexing/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/series/indexing/conftest.py b/contrib/python/pandas/py2/pandas/tests/series/indexing/conftest.py
new file mode 100644
index 00000000000..0e06f6b8e46
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/indexing/conftest.py
@@ -0,0 +1,8 @@
+import pytest
+
+from pandas.tests.series.common import TestData
+
+
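+# one TestData instance is shared by all tests in a module (scope='module')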
+@pytest.fixture(scope='module')
+def test_data():
+ return TestData()
diff --git a/contrib/python/pandas/py2/pandas/tests/series/indexing/test_alter_index.py b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_alter_index.py
new file mode 100644
index 00000000000..a826a0644fa
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_alter_index.py
@@ -0,0 +1,564 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime
+
+import numpy as np
+from numpy import nan
+import pytest
+
+import pandas.compat as compat
+from pandas.compat import lrange, range
+
+import pandas as pd
+from pandas import Categorical, Series, date_range, isna
+import pandas.util.testing as tm
+from pandas.util.testing import assert_series_equal
+
+
+@pytest.mark.parametrize(
+ 'first_slice,second_slice', [
+ [[2, None], [None, -5]],
+ [[None, 0], [None, -5]],
+ [[None, -5], [None, 0]],
+ [[None, 0], [None, 0]]
+ ])
+@pytest.mark.parametrize('fill', [None, -1])
+def test_align(test_data, first_slice, second_slice, join_type, fill):
+ a = test_data.ts[slice(*first_slice)]
+ b = test_data.ts[slice(*second_slice)]
+
+ aa, ab = a.align(b, join=join_type, fill_value=fill)
+
+ join_index = a.index.join(b.index, how=join_type)
+ if fill is not None:
+ diff_a = aa.index.difference(join_index)
+ diff_b = ab.index.difference(join_index)
+ if len(diff_a) > 0:
+ assert (aa.reindex(diff_a) == fill).all()
+ if len(diff_b) > 0:
+ assert (ab.reindex(diff_b) == fill).all()
+
+ ea = a.reindex(join_index)
+ eb = b.reindex(join_index)
+
+ if fill is not None:
+ ea = ea.fillna(fill)
+ eb = eb.fillna(fill)
+
+ assert_series_equal(aa, ea)
+ assert_series_equal(ab, eb)
+ assert aa.name == 'ts'
+ assert ea.name == 'ts'
+ assert ab.name == 'ts'
+ assert eb.name == 'ts'
+
+
+@pytest.mark.parametrize(
+ 'first_slice,second_slice', [
+ [[2, None], [None, -5]],
+ [[None, 0], [None, -5]],
+ [[None, -5], [None, 0]],
+ [[None, 0], [None, 0]]
+ ])
+@pytest.mark.parametrize('method', ['pad', 'bfill'])
+@pytest.mark.parametrize('limit', [None, 1])
+def test_align_fill_method(test_data,
+ first_slice, second_slice,
+ join_type, method, limit):
+ a = test_data.ts[slice(*first_slice)]
+ b = test_data.ts[slice(*second_slice)]
+
+ aa, ab = a.align(b, join=join_type, method=method, limit=limit)
+
+ join_index = a.index.join(b.index, how=join_type)
+ ea = a.reindex(join_index)
+ eb = b.reindex(join_index)
+
+ ea = ea.fillna(method=method, limit=limit)
+ eb = eb.fillna(method=method, limit=limit)
+
+ assert_series_equal(aa, ea)
+ assert_series_equal(ab, eb)
+
+
+def test_align_nocopy(test_data):
+ b = test_data.ts[:5].copy()
+
+ # do copy
+ a = test_data.ts.copy()
+ ra, _ = a.align(b, join='left')
+ ra[:5] = 5
+ assert not (a[:5] == 5).any()
+
+ # do not copy
+ a = test_data.ts.copy()
+ ra, _ = a.align(b, join='left', copy=False)
+ ra[:5] = 5
+ assert (a[:5] == 5).all()
+
+ # do copy
+ a = test_data.ts.copy()
+ b = test_data.ts[:5].copy()
+ _, rb = a.align(b, join='right')
+ rb[:3] = 5
+ assert not (b[:3] == 5).any()
+
+ # do not copy
+ a = test_data.ts.copy()
+ b = test_data.ts[:5].copy()
+ _, rb = a.align(b, join='right', copy=False)
+ rb[:2] = 5
+ assert (b[:2] == 5).all()
+
+
+def test_align_same_index(test_data):
+ a, b = test_data.ts.align(test_data.ts, copy=False)
+ assert a.index is test_data.ts.index
+ assert b.index is test_data.ts.index
+
+ a, b = test_data.ts.align(test_data.ts, copy=True)
+ assert a.index is not test_data.ts.index
+ assert b.index is not test_data.ts.index
+
+
+def test_align_multiindex():
+ # GH 10665
+
+ midx = pd.MultiIndex.from_product([range(2), range(3), range(2)],
+ names=('a', 'b', 'c'))
+ idx = pd.Index(range(2), name='b')
+ s1 = pd.Series(np.arange(12, dtype='int64'), index=midx)
+ s2 = pd.Series(np.arange(2, dtype='int64'), index=idx)
+
+ # these must be the same results (but flipped)
+ res1l, res1r = s1.align(s2, join='left')
+ res2l, res2r = s2.align(s1, join='right')
+
+ expl = s1
+ tm.assert_series_equal(expl, res1l)
+ tm.assert_series_equal(expl, res2r)
+ expr = pd.Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx)
+ tm.assert_series_equal(expr, res1r)
+ tm.assert_series_equal(expr, res2l)
+
+ res1l, res1r = s1.align(s2, join='right')
+ res2l, res2r = s2.align(s1, join='left')
+
+ exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)],
+ names=('a', 'b', 'c'))
+ expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx)
+ tm.assert_series_equal(expl, res1l)
+ tm.assert_series_equal(expl, res2r)
+ expr = pd.Series([0, 0, 1, 1] * 2, index=exp_idx)
+ tm.assert_series_equal(expr, res1r)
+ tm.assert_series_equal(expr, res2l)
+
+
+def test_reindex(test_data):
+ identity = test_data.series.reindex(test_data.series.index)
+
+ # __array_interface__ is not defined for older numpies
+ # and on some pythons
+ try:
+ assert np.may_share_memory(test_data.series.index, identity.index)
+ except AttributeError:
+ pass
+
+ assert identity.index.is_(test_data.series.index)
+ assert identity.index.identical(test_data.series.index)
+
+ subIndex = test_data.series.index[10:20]
+ subSeries = test_data.series.reindex(subIndex)
+
+ for idx, val in compat.iteritems(subSeries):
+ assert val == test_data.series[idx]
+
+ subIndex2 = test_data.ts.index[10:20]
+ subTS = test_data.ts.reindex(subIndex2)
+
+ for idx, val in compat.iteritems(subTS):
+ assert val == test_data.ts[idx]
+ stuffSeries = test_data.ts.reindex(subIndex)
+
+ assert np.isnan(stuffSeries).all()
+
+ # This is extremely important for the Cython code to not screw up
+ nonContigIndex = test_data.ts.index[::2]
+ subNonContig = test_data.ts.reindex(nonContigIndex)
+ for idx, val in compat.iteritems(subNonContig):
+ assert val == test_data.ts[idx]
+
+    # return a copy with the same index here
+ result = test_data.ts.reindex()
+    assert result is not test_data.ts
+
+
+def test_reindex_nan():
+ ts = Series([2, 3, 5, 7], index=[1, 4, nan, 8])
+
+ i, j = [nan, 1, nan, 8, 4, nan], [2, 0, 2, 3, 1, 2]
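+    # each label in i (NaN included) matches the value at position j of ts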
+ assert_series_equal(ts.reindex(i), ts.iloc[j])
+
+ ts.index = ts.index.astype('object')
+
+ # reindex coerces index.dtype to float, loc/iloc doesn't
+ assert_series_equal(ts.reindex(i), ts.iloc[j], check_index_type=False)
+
+
+def test_reindex_series_add_nat():
+ rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s')
+ series = Series(rng)
+
+ result = series.reindex(lrange(15))
+ assert np.issubdtype(result.dtype, np.dtype('M8[ns]'))
+
+ mask = result.isna()
+ assert mask[-5:].all()
+ assert not mask[:-5].any()
+
+
+def test_reindex_with_datetimes():
+ rng = date_range('1/1/2000', periods=20)
+ ts = Series(np.random.randn(20), index=rng)
+
+ result = ts.reindex(list(ts.index[5:10]))
+ expected = ts[5:10]
+ tm.assert_series_equal(result, expected)
+
+ result = ts[list(ts.index[5:10])]
+ tm.assert_series_equal(result, expected)
+
+
+def test_reindex_corner(test_data):
+    # padding from an empty series used to be broken; it works now
+ test_data.empty.reindex(test_data.ts.index, method='pad') # it works
+
+ # corner case: pad empty series
+ reindexed = test_data.empty.reindex(test_data.ts.index, method='pad')
+
+ # pass non-Index
+ reindexed = test_data.ts.reindex(list(test_data.ts.index))
+ assert_series_equal(test_data.ts, reindexed)
+
+ # bad fill method
+ ts = test_data.ts[::2]
+ msg = (r"Invalid fill method\. Expecting pad \(ffill\), backfill"
+ r" \(bfill\) or nearest\. Got foo")
+ with pytest.raises(ValueError, match=msg):
+ ts.reindex(test_data.ts.index, method='foo')
+
+
+def test_reindex_pad():
+ s = Series(np.arange(10), dtype='int64')
+ s2 = s[::2]
+
+ reindexed = s2.reindex(s.index, method='pad')
+ reindexed2 = s2.reindex(s.index, method='ffill')
+ assert_series_equal(reindexed, reindexed2)
+
+ expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], index=np.arange(10))
+ assert_series_equal(reindexed, expected)
+
+ # GH4604
+ s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
+ new_index = ['a', 'g', 'c', 'f']
+ expected = Series([1, 1, 3, 3], index=new_index)
+
+ # this changes dtype because the ffill happens after
+ result = s.reindex(new_index).ffill()
+ assert_series_equal(result, expected.astype('float64'))
+
+ result = s.reindex(new_index).ffill(downcast='infer')
+ assert_series_equal(result, expected)
+
+ expected = Series([1, 5, 3, 5], index=new_index)
+ result = s.reindex(new_index, method='ffill')
+ assert_series_equal(result, expected)
+
+ # inference of new dtype
+ s = Series([True, False, False, True], index=list('abcd'))
+ new_index = 'agc'
+ result = s.reindex(list(new_index)).ffill()
+ expected = Series([True, True, False], index=list(new_index))
+ assert_series_equal(result, expected)
+
+ # GH4618 shifted series downcasting
+ s = Series(False, index=lrange(0, 5))
+ result = s.shift(1).fillna(method='bfill')
+ expected = Series(False, index=lrange(0, 5))
+ assert_series_equal(result, expected)
+
+
+def test_reindex_nearest():
+ s = Series(np.arange(10, dtype='int64'))
+ target = [0.1, 0.9, 1.5, 2.0]
+ actual = s.reindex(target, method='nearest')
+ expected = Series(np.around(target).astype('int64'), target)
+ assert_series_equal(expected, actual)
+
+ actual = s.reindex_like(actual, method='nearest')
+ assert_series_equal(expected, actual)
+
+ actual = s.reindex_like(actual, method='nearest', tolerance=1)
+ assert_series_equal(expected, actual)
+ actual = s.reindex_like(actual, method='nearest',
+ tolerance=[1, 2, 3, 4])
+ assert_series_equal(expected, actual)
+
+ actual = s.reindex(target, method='nearest', tolerance=0.2)
+ expected = Series([0, 1, np.nan, 2], target)
+ assert_series_equal(expected, actual)
+
+ actual = s.reindex(target, method='nearest',
+ tolerance=[0.3, 0.01, 0.4, 3])
+ expected = Series([0, np.nan, np.nan, 2], target)
+ assert_series_equal(expected, actual)
+
+
+def test_reindex_backfill():
+ pass
+
+
+def test_reindex_int(test_data):
+ ts = test_data.ts[::2]
+ int_ts = Series(np.zeros(len(ts), dtype=int), index=ts.index)
+
+ # this should work fine
+ reindexed_int = int_ts.reindex(test_data.ts.index)
+
+    # NaNs introduced by the reindex force an upcast to float
+    assert reindexed_int.dtype == np.float_
+
+ # NO NaNs introduced
+ reindexed_int = int_ts.reindex(int_ts.index[::2])
+ assert reindexed_int.dtype == np.int_
+
+
+def test_reindex_bool(test_data):
+ # A series other than float, int, string, or object
+ ts = test_data.ts[::2]
+ bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index)
+
+ # this should work fine
+ reindexed_bool = bool_ts.reindex(test_data.ts.index)
+
+    # NaNs introduced by the reindex force an upcast to object
+    assert reindexed_bool.dtype == np.object_
+
+ # NO NaNs introduced
+ reindexed_bool = bool_ts.reindex(bool_ts.index[::2])
+ assert reindexed_bool.dtype == np.bool_
+
+
+def test_reindex_bool_pad(test_data):
+    # values before the start of the source series cannot be padded and stay NaN
+ ts = test_data.ts[5:]
+ bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index)
+ filled_bool = bool_ts.reindex(test_data.ts.index, method='pad')
+ assert isna(filled_bool[:5]).all()
+
+
+def test_reindex_categorical():
+ index = date_range('20000101', periods=3)
+
+ # reindexing to an invalid Categorical
+ s = Series(['a', 'b', 'c'], dtype='category')
+ result = s.reindex(index)
+ expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
+ categories=['a', 'b', 'c']))
+ expected.index = index
+ tm.assert_series_equal(result, expected)
+
+ # partial reindexing
+ expected = Series(Categorical(values=['b', 'c'], categories=['a', 'b',
+ 'c']))
+ expected.index = [1, 2]
+ result = s.reindex([1, 2])
+ tm.assert_series_equal(result, expected)
+
+ expected = Series(Categorical(
+ values=['c', np.nan], categories=['a', 'b', 'c']))
+ expected.index = [2, 3]
+ result = s.reindex([2, 3])
+ tm.assert_series_equal(result, expected)
+
+
+def test_reindex_like(test_data):
+ other = test_data.ts[::2]
+ assert_series_equal(test_data.ts.reindex(other.index),
+ test_data.ts.reindex_like(other))
+
+ # GH 7179
+ day1 = datetime(2013, 3, 5)
+ day2 = datetime(2013, 5, 5)
+ day3 = datetime(2014, 3, 5)
+
+ series1 = Series([5, None, None], [day1, day2, day3])
+ series2 = Series([None, None], [day1, day3])
+
+ result = series1.reindex_like(series2, method='pad')
+ expected = Series([5, np.nan], index=[day1, day3])
+ assert_series_equal(result, expected)
+
+
+def test_reindex_fill_value():
+ # -----------------------------------------------------------
+ # floats
+ floats = Series([1., 2., 3.])
+ result = floats.reindex([1, 2, 3])
+ expected = Series([2., 3., np.nan], index=[1, 2, 3])
+ assert_series_equal(result, expected)
+
+ result = floats.reindex([1, 2, 3], fill_value=0)
+ expected = Series([2., 3., 0], index=[1, 2, 3])
+ assert_series_equal(result, expected)
+
+ # -----------------------------------------------------------
+ # ints
+ ints = Series([1, 2, 3])
+
+ result = ints.reindex([1, 2, 3])
+ expected = Series([2., 3., np.nan], index=[1, 2, 3])
+ assert_series_equal(result, expected)
+
+ # don't upcast
+ result = ints.reindex([1, 2, 3], fill_value=0)
+ expected = Series([2, 3, 0], index=[1, 2, 3])
+ assert issubclass(result.dtype.type, np.integer)
+ assert_series_equal(result, expected)
+
+ # -----------------------------------------------------------
+ # objects
+ objects = Series([1, 2, 3], dtype=object)
+
+ result = objects.reindex([1, 2, 3])
+ expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object)
+ assert_series_equal(result, expected)
+
+ result = objects.reindex([1, 2, 3], fill_value='foo')
+ expected = Series([2, 3, 'foo'], index=[1, 2, 3], dtype=object)
+ assert_series_equal(result, expected)
+
+ # ------------------------------------------------------------
+ # bools
+ bools = Series([True, False, True])
+
+ result = bools.reindex([1, 2, 3])
+ expected = Series([False, True, np.nan], index=[1, 2, 3], dtype=object)
+ assert_series_equal(result, expected)
+
+ result = bools.reindex([1, 2, 3], fill_value=False)
+ expected = Series([False, True, False], index=[1, 2, 3])
+ assert_series_equal(result, expected)
+
+
+def test_reindex_datetimeindexes_tz_naive_and_aware():
+ # GH 8306
+ idx = date_range('20131101', tz='America/Chicago', periods=7)
+ newidx = date_range('20131103', periods=10, freq='H')
+ s = Series(range(7), index=idx)
+ with pytest.raises(TypeError):
+ s.reindex(newidx, method='ffill')
+
+
+def test_reindex_empty_series_tz_dtype():
+ # GH 20869
+ result = Series(dtype='datetime64[ns, UTC]').reindex([0, 1])
+ expected = Series([pd.NaT] * 2, dtype='datetime64[ns, UTC]')
+ tm.assert_equal(result, expected)
+
+
+def test_rename():
+ # GH 17407
+ s = Series(range(1, 6), index=pd.Index(range(2, 7), name='IntIndex'))
+ result = s.rename(str)
+ expected = s.rename(lambda i: str(i))
+ assert_series_equal(result, expected)
+
+ assert result.name == expected.name
+
+
+@pytest.mark.parametrize(
+ 'data, index, drop_labels,'
+ ' axis, expected_data, expected_index',
+ [
+ # Unique Index
+ ([1, 2], ['one', 'two'], ['two'],
+ 0, [1], ['one']),
+ ([1, 2], ['one', 'two'], ['two'],
+ 'rows', [1], ['one']),
+ ([1, 1, 2], ['one', 'two', 'one'], ['two'],
+ 0, [1, 2], ['one', 'one']),
+
+ # GH 5248 Non-Unique Index
+ ([1, 1, 2], ['one', 'two', 'one'], 'two',
+ 0, [1, 2], ['one', 'one']),
+ ([1, 1, 2], ['one', 'two', 'one'], ['one'],
+ 0, [1], ['two']),
+ ([1, 1, 2], ['one', 'two', 'one'], 'one',
+ 0, [1], ['two'])])
+def test_drop_unique_and_non_unique_index(data, index, axis, drop_labels,
+ expected_data, expected_index):
+
+ s = Series(data=data, index=index)
+ result = s.drop(drop_labels, axis=axis)
+ expected = Series(data=expected_data, index=expected_index)
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ 'data, index, drop_labels,'
+ ' axis, error_type, error_desc',
+ [
+ # single string/tuple-like
+ (range(3), list('abc'), 'bc',
+ 0, KeyError, 'not found in axis'),
+        (range(3), list('abc'), ('a',),
+         0, KeyError, 'not found in axis'),
+
+        # bad axis
+ (range(3), list('abc'), 'one',
+ 'columns', ValueError, 'No axis named columns')])
+def test_drop_exception_raised(data, index, drop_labels,
+ axis, error_type, error_desc):
+
+ with pytest.raises(error_type, match=error_desc):
+ Series(data, index=index).drop(drop_labels, axis=axis)
+
+
+def test_drop_with_ignore_errors():
+ # errors='ignore'
+ s = Series(range(3), index=list('abc'))
+ result = s.drop('bc', errors='ignore')
+ tm.assert_series_equal(result, s)
+ result = s.drop(['a', 'd'], errors='ignore')
+ expected = s.iloc[1:]
+ tm.assert_series_equal(result, expected)
+
+ # GH 8522
+ s = Series([2, 3], index=[True, False])
+ assert s.index.is_object()
+ result = s.drop(True)
+ expected = Series([3], index=[False])
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 3]])
+@pytest.mark.parametrize('drop_labels', [[], [1], [3]])
+def test_drop_empty_list(index, drop_labels):
+ # GH 21494
+ expected_index = [i for i in index if i not in drop_labels]
+ series = pd.Series(index=index).drop(drop_labels)
+ tm.assert_series_equal(series, pd.Series(index=expected_index))
+
+
+@pytest.mark.parametrize('data, index, drop_labels', [
+ (None, [1, 2, 3], [1, 4]),
+ (None, [1, 2, 2], [1, 4]),
+ ([2, 3], [0, 1], [False, True])
+])
+def test_drop_non_empty_list(data, index, drop_labels):
+ # GH 21494 and GH 16877
+ with pytest.raises(KeyError, match='not found in axis'):
+ pd.Series(data=data, index=index).drop(drop_labels)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/indexing/test_boolean.py b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_boolean.py
new file mode 100644
index 00000000000..9017d13051b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_boolean.py
@@ -0,0 +1,634 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange, range
+
+from pandas.core.dtypes.common import is_integer
+
+import pandas as pd
+from pandas import Index, Series, Timestamp, date_range, isna
+from pandas.core.indexing import IndexingError
+import pandas.util.testing as tm
+from pandas.util.testing import assert_series_equal
+
+from pandas.tseries.offsets import BDay
+
+
+def test_getitem_boolean(test_data):
+ s = test_data.series
+ mask = s > s.median()
+
+ # passing list is OK
+ result = s[list(mask)]
+ expected = s[mask]
+ assert_series_equal(result, expected)
+ tm.assert_index_equal(result.index, s.index[mask])
+
+
+def test_getitem_boolean_empty():
+ s = Series([], dtype=np.int64)
+ s.index.name = 'index_name'
+ s = s[s.isna()]
+ assert s.index.name == 'index_name'
+ assert s.dtype == np.int64
+
+ # GH5877
+ # indexing with empty series
+ s = Series(['A', 'B'])
+ expected = Series(np.nan, index=['C'], dtype=object)
+ result = s[Series(['C'], dtype=object)]
+ assert_series_equal(result, expected)
+
+ s = Series(['A', 'B'])
+ expected = Series(dtype=object, index=Index([], dtype='int64'))
+ result = s[Series([], dtype=object)]
+ assert_series_equal(result, expected)
+
+ # invalid because of the boolean indexer
+ # that's empty or not-aligned
+ msg = (r"Unalignable boolean Series provided as indexer \(index of"
+ r" the boolean Series and of the indexed object do not match")
+ with pytest.raises(IndexingError, match=msg):
+ s[Series([], dtype=bool)]
+
+ with pytest.raises(IndexingError, match=msg):
+ s[Series([True], dtype=bool)]
+
+
+def test_getitem_boolean_object(test_data):
+ # using column from DataFrame
+
+ s = test_data.series
+ mask = s > s.median()
+ omask = mask.astype(object)
+
+ # getitem
+ result = s[omask]
+ expected = s[mask]
+ assert_series_equal(result, expected)
+
+ # setitem
+ s2 = s.copy()
+ cop = s.copy()
+ cop[omask] = 5
+ s2[mask] = 5
+ assert_series_equal(cop, s2)
+
+ # nans raise exception
+ omask[5:10] = np.nan
+ msg = "cannot index with vector containing NA / NaN values"
+ with pytest.raises(ValueError, match=msg):
+ s[omask]
+ with pytest.raises(ValueError, match=msg):
+ s[omask] = 5
+
+
+def test_getitem_setitem_boolean_corner(test_data):
+ ts = test_data.ts
+ mask_shifted = ts.shift(1, freq=BDay()) > ts.median()
+
+ # these used to raise...??
+
+ msg = (r"Unalignable boolean Series provided as indexer \(index of"
+ r" the boolean Series and of the indexed object do not match")
+ with pytest.raises(IndexingError, match=msg):
+ ts[mask_shifted]
+ with pytest.raises(IndexingError, match=msg):
+ ts[mask_shifted] = 1
+
+ with pytest.raises(IndexingError, match=msg):
+ ts.loc[mask_shifted]
+ with pytest.raises(IndexingError, match=msg):
+ ts.loc[mask_shifted] = 1
+
+
+def test_setitem_boolean(test_data):
+ mask = test_data.series > test_data.series.median()
+
+ # similar indexed series
+ result = test_data.series.copy()
+ result[mask] = test_data.series * 2
+ expected = test_data.series * 2
+ assert_series_equal(result[mask], expected[mask])
+
+ # needs alignment
+ result = test_data.series.copy()
+ result[mask] = (test_data.series * 2)[0:5]
+ expected = (test_data.series * 2)[0:5].reindex_like(test_data.series)
+ expected[-mask] = test_data.series[mask]
+ assert_series_equal(result[mask], expected[mask])
+
+
+def test_get_set_boolean_different_order(test_data):
+ ordered = test_data.series.sort_values()
+
+ # setting
+ copy = test_data.series.copy()
+ copy[ordered > 0] = 0
+
+ expected = test_data.series.copy()
+ expected[expected > 0] = 0
+
+ assert_series_equal(copy, expected)
+
+ # getting
+ sel = test_data.series[ordered > 0]
+ exp = test_data.series[test_data.series > 0]
+ assert_series_equal(sel, exp)
+
+
+def test_where_unsafe_int(sint_dtype):
+ s = Series(np.arange(10), dtype=sint_dtype)
+ mask = s < 5
+
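+    # a same-length list assignment keeps the original integer dtype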
+ s[mask] = lrange(2, 7)
+ expected = Series(lrange(2, 7) + lrange(5, 10), dtype=sint_dtype)
+
+ assert_series_equal(s, expected)
+
+
+def test_where_unsafe_float(float_dtype):
+ s = Series(np.arange(10), dtype=float_dtype)
+ mask = s < 5
+
+ s[mask] = lrange(2, 7)
+ expected = Series(lrange(2, 7) + lrange(5, 10), dtype=float_dtype)
+
+ assert_series_equal(s, expected)
+
+
[email protected]("dtype,expected_dtype", [
+ (np.int8, np.float64),
+ (np.int16, np.float64),
+ (np.int32, np.float64),
+ (np.int64, np.float64),
+ (np.float32, np.float32),
+ (np.float64, np.float64)
+])
+def test_where_unsafe_upcast(dtype, expected_dtype):
+ # see gh-9743
+ s = Series(np.arange(10), dtype=dtype)
+ values = [2.5, 3.5, 4.5, 5.5, 6.5]
+ mask = s < 5
+ expected = Series(values + lrange(5, 10), dtype=expected_dtype)
+ s[mask] = values
+ assert_series_equal(s, expected)
+
+
+def test_where_unsafe():
+ # see gh-9731
+ s = Series(np.arange(10), dtype="int64")
+ values = [2.5, 3.5, 4.5, 5.5]
+
+ mask = s > 5
+ expected = Series(lrange(6) + values, dtype="float64")
+
+ s[mask] = values
+ assert_series_equal(s, expected)
+
+ # see gh-3235
+ s = Series(np.arange(10), dtype='int64')
+ mask = s < 5
+ s[mask] = lrange(2, 7)
+ expected = Series(lrange(2, 7) + lrange(5, 10), dtype='int64')
+ assert_series_equal(s, expected)
+ assert s.dtype == expected.dtype
+
+ s = Series(np.arange(10), dtype='int64')
+ mask = s > 5
+ s[mask] = [0] * 4
+ expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64')
+ assert_series_equal(s, expected)
+
+ s = Series(np.arange(10))
+ mask = s > 5
+
+ msg = "cannot assign mismatch length to masked array"
+ with pytest.raises(ValueError, match=msg):
+ s[mask] = [5, 4, 3, 2, 1]
+
+ with pytest.raises(ValueError, match=msg):
+ s[mask] = [0] * 5
+
+ # dtype changes
+ s = Series([1, 2, 3, 4])
+ result = s.where(s > 2, np.nan)
+ expected = Series([np.nan, np.nan, 3, 4])
+ assert_series_equal(result, expected)
+
+ # GH 4667
+ # setting with None changes dtype
+ s = Series(range(10)).astype(float)
+ s[8] = None
+ result = s[8]
+ assert isna(result)
+
+ s = Series(range(10)).astype(float)
+ s[s > 8] = None
+ result = s[isna(s)]
+ expected = Series(np.nan, index=[9])
+ assert_series_equal(result, expected)
+
+
+def test_where_raise_on_error_deprecation():
+ # gh-14968
+ # deprecation of raise_on_error
+ s = Series(np.random.randn(5))
+ cond = s > 0
+ with tm.assert_produces_warning(FutureWarning):
+ s.where(cond, raise_on_error=True)
+ with tm.assert_produces_warning(FutureWarning):
+ s.mask(cond, raise_on_error=True)
+
+
+def test_where():
+ s = Series(np.random.randn(5))
+ cond = s > 0
+
+ rs = s.where(cond).dropna()
+ rs2 = s[cond]
+ assert_series_equal(rs, rs2)
+
+ rs = s.where(cond, -s)
+ assert_series_equal(rs, s.abs())
+
+ rs = s.where(cond)
+ assert (s.shape == rs.shape)
+ assert (rs is not s)
+
+ # test alignment
+ cond = Series([True, False, False, True, False], index=s.index)
+ s2 = -(s.abs())
+
+ expected = s2[cond].reindex(s2.index[:3]).reindex(s2.index)
+ rs = s2.where(cond[:3])
+ assert_series_equal(rs, expected)
+
+ expected = s2.abs()
+ expected.iloc[0] = s2[0]
+ rs = s2.where(cond[:3], -s2)
+ assert_series_equal(rs, expected)
+
+
+def test_where_error():
+ s = Series(np.random.randn(5))
+ cond = s > 0
+
+ msg = "Array conditional must be same shape as self"
+ with pytest.raises(ValueError, match=msg):
+ s.where(1)
+ with pytest.raises(ValueError, match=msg):
+ s.where(cond[:3].values, -s)
+
+ # GH 2745
+ s = Series([1, 2])
+ s[[True, False]] = [0, 1]
+ expected = Series([0, 2])
+ assert_series_equal(s, expected)
+
+ # failures
+ msg = "cannot assign mismatch length to masked array"
+ with pytest.raises(ValueError, match=msg):
+ s[[True, False]] = [0, 2, 3]
+ msg = ("NumPy boolean array indexing assignment cannot assign 0 input"
+ " values to the 1 output values where the mask is true")
+ with pytest.raises(ValueError, match=msg):
+ s[[True, False]] = []
+
+
+@pytest.mark.parametrize('klass', [list, tuple, np.array, Series])
+def test_where_array_like(klass):
+ # see gh-15414
+ s = Series([1, 2, 3])
+ cond = [False, True, True]
+ expected = Series([np.nan, 2, 3])
+
+ result = s.where(klass(cond))
+ assert_series_equal(result, expected)
+
+
+ [1, 0, 1],
+ Series([2, 5, 7]),
+ ["True", "False", "True"],
+ [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")]
+])
+def test_where_invalid_input(cond):
+ # see gh-15414: only boolean arrays accepted
+ s = Series([1, 2, 3])
+ msg = "Boolean array expected for the condition"
+
+ with pytest.raises(ValueError, match=msg):
+ s.where(cond)
+
+ msg = "Array conditional must be same shape as self"
+ with pytest.raises(ValueError, match=msg):
+ s.where([True])
+
+
+def test_where_ndframe_align():
+ msg = "Array conditional must be same shape as self"
+ s = Series([1, 2, 3])
+
+ cond = [True]
+ with pytest.raises(ValueError, match=msg):
+ s.where(cond)
+
+ expected = Series([1, np.nan, np.nan])
+
+ out = s.where(Series(cond))
+ tm.assert_series_equal(out, expected)
+
+ cond = np.array([False, True, False, True])
+ with pytest.raises(ValueError, match=msg):
+ s.where(cond)
+
+ expected = Series([np.nan, 2, np.nan])
+
+ out = s.where(Series(cond))
+ tm.assert_series_equal(out, expected)
+
+
+def test_where_setitem_invalid():
+ # GH 2702
+ # make sure correct exceptions are raised on invalid list assignment
+
+ msg = ("cannot set using a {} indexer with a different length than"
+ " the value")
+
+ # slice
+ s = Series(list('abc'))
+
+ with pytest.raises(ValueError, match=msg.format('slice')):
+ s[0:3] = list(range(27))
+
+ s[0:3] = list(range(3))
+ expected = Series([0, 1, 2])
+    assert_series_equal(s.astype(np.int64), expected)
+
+ # slice with step
+ s = Series(list('abcdef'))
+
+ with pytest.raises(ValueError, match=msg.format('slice')):
+ s[0:4:2] = list(range(27))
+
+ s = Series(list('abcdef'))
+ s[0:4:2] = list(range(2))
+ expected = Series([0, 'b', 1, 'd', 'e', 'f'])
+ assert_series_equal(s, expected)
+
+ # neg slices
+ s = Series(list('abcdef'))
+
+ with pytest.raises(ValueError, match=msg.format('slice')):
+ s[:-1] = list(range(27))
+
+ s[-3:-1] = list(range(2))
+ expected = Series(['a', 'b', 'c', 0, 1, 'f'])
+ assert_series_equal(s, expected)
+
+ # list
+ s = Series(list('abc'))
+
+ with pytest.raises(ValueError, match=msg.format('list-like')):
+ s[[0, 1, 2]] = list(range(27))
+
+ s = Series(list('abc'))
+
+ with pytest.raises(ValueError, match=msg.format('list-like')):
+ s[[0, 1, 2]] = list(range(2))
+
+ # scalar
+ s = Series(list('abc'))
+ s[0] = list(range(10))
+ expected = Series([list(range(10)), 'b', 'c'])
+ assert_series_equal(s, expected)
+
+
+@pytest.mark.parametrize('size', range(2, 6))
+@pytest.mark.parametrize('mask', [
+    [True, False, False, False, False],
+    [True, False],
+    [False]
+])
+@pytest.mark.parametrize('item', [
+    2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min
+])
+# Test numpy arrays, lists and tuples as the input to be
+# broadcast
+@pytest.mark.parametrize('box', [
+    lambda x: np.array([x]),
+    lambda x: [x],
+    lambda x: (x,)
+])
+def test_broadcast(size, mask, item, box):
+ selection = np.resize(mask, size)
+
+ data = np.arange(size, dtype=float)
+
+ # Construct the expected series by taking the source
+ # data or item based on the selection
+ expected = Series([item if use_item else data[
+ i] for i, use_item in enumerate(selection)])
+
+ s = Series(data)
+ s[selection] = box(item)
+ assert_series_equal(s, expected)
+
+ s = Series(data)
+ result = s.where(~selection, box(item))
+ assert_series_equal(result, expected)
+
+ s = Series(data)
+ result = s.mask(selection, box(item))
+ assert_series_equal(result, expected)
+
+
+def test_where_inplace():
+ s = Series(np.random.randn(5))
+ cond = s > 0
+
+ rs = s.copy()
+
+ rs.where(cond, inplace=True)
+ assert_series_equal(rs.dropna(), s[cond])
+ assert_series_equal(rs, s.where(cond))
+
+ rs = s.copy()
+ rs.where(cond, -s, inplace=True)
+ assert_series_equal(rs, s.where(cond, -s))
+
+
+def test_where_dups():
+ # GH 4550
+ # where crashes with dups in index
+ s1 = Series(list(range(3)))
+ s2 = Series(list(range(3)))
+ comb = pd.concat([s1, s2])
+ result = comb.where(comb < 2)
+ expected = Series([0, 1, np.nan, 0, 1, np.nan],
+ index=[0, 1, 2, 0, 1, 2])
+ assert_series_equal(result, expected)
+
+ # GH 4548
+ # inplace updating not working with dups
+ comb[comb < 1] = 5
+ expected = Series([5, 1, 2, 5, 1, 2], index=[0, 1, 2, 0, 1, 2])
+ assert_series_equal(comb, expected)
+
+ comb[comb < 2] += 10
+ expected = Series([5, 11, 2, 5, 11, 2], index=[0, 1, 2, 0, 1, 2])
+ assert_series_equal(comb, expected)
+
+
+def test_where_numeric_with_string():
+ # GH 9280
+ s = pd.Series([1, 2, 3])
+ w = s.where(s > 1, 'X')
+
+ assert not is_integer(w[0])
+ assert is_integer(w[1])
+ assert is_integer(w[2])
+ assert isinstance(w[0], str)
+ assert w.dtype == 'object'
+
+ w = s.where(s > 1, ['X', 'Y', 'Z'])
+ assert not is_integer(w[0])
+ assert is_integer(w[1])
+ assert is_integer(w[2])
+ assert isinstance(w[0], str)
+ assert w.dtype == 'object'
+
+ w = s.where(s > 1, np.array(['X', 'Y', 'Z']))
+ assert not is_integer(w[0])
+ assert is_integer(w[1])
+ assert is_integer(w[2])
+ assert isinstance(w[0], str)
+ assert w.dtype == 'object'
+
+
+def test_where_timedelta_coerce():
+ s = Series([1, 2], dtype='timedelta64[ns]')
+ expected = Series([10, 10])
+ mask = np.array([False, False])
+
+ rs = s.where(mask, [10, 10])
+ assert_series_equal(rs, expected)
+
+ rs = s.where(mask, 10)
+ assert_series_equal(rs, expected)
+
+ rs = s.where(mask, 10.0)
+ assert_series_equal(rs, expected)
+
+ rs = s.where(mask, [10.0, 10.0])
+ assert_series_equal(rs, expected)
+
+ rs = s.where(mask, [10.0, np.nan])
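+    # mixing NaN into the replacement values forces a coercion to object dtype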
+ expected = Series([10, None], dtype='object')
+ assert_series_equal(rs, expected)
+
+
+def test_where_datetime_conversion():
+ s = Series(date_range('20130102', periods=2))
+ expected = Series([10, 10])
+ mask = np.array([False, False])
+
+ rs = s.where(mask, [10, 10])
+ assert_series_equal(rs, expected)
+
+ rs = s.where(mask, 10)
+ assert_series_equal(rs, expected)
+
+ rs = s.where(mask, 10.0)
+ assert_series_equal(rs, expected)
+
+ rs = s.where(mask, [10.0, 10.0])
+ assert_series_equal(rs, expected)
+
+ rs = s.where(mask, [10.0, np.nan])
+ expected = Series([10, None], dtype='object')
+ assert_series_equal(rs, expected)
+
+ # GH 15701
+ timestamps = ['2016-12-31 12:00:04+00:00',
+ '2016-12-31 12:00:04.010000+00:00']
+ s = Series([pd.Timestamp(t) for t in timestamps])
+ rs = s.where(Series([False, True]))
+ expected = Series([pd.NaT, s[1]])
+ assert_series_equal(rs, expected)
+
+
+def test_where_dt_tz_values(tz_naive_fixture):
+ ser1 = pd.Series(pd.DatetimeIndex(['20150101', '20150102', '20150103'],
+ tz=tz_naive_fixture))
+ ser2 = pd.Series(pd.DatetimeIndex(['20160514', '20160515', '20160516'],
+ tz=tz_naive_fixture))
+ mask = pd.Series([True, True, False])
+ result = ser1.where(mask, ser2)
+ exp = pd.Series(pd.DatetimeIndex(['20150101', '20150102', '20160516'],
+ tz=tz_naive_fixture))
+ assert_series_equal(exp, result)
+
+
+def test_mask():
+ # compare with tested results in test_where
+ s = Series(np.random.randn(5))
+ cond = s > 0
+
+ rs = s.where(~cond, np.nan)
+ assert_series_equal(rs, s.mask(cond))
+
+ rs = s.where(~cond)
+ rs2 = s.mask(cond)
+ assert_series_equal(rs, rs2)
+
+ rs = s.where(~cond, -s)
+ rs2 = s.mask(cond, -s)
+ assert_series_equal(rs, rs2)
+
+ cond = Series([True, False, False, True, False], index=s.index)
+ s2 = -(s.abs())
+ rs = s2.where(~cond[:3])
+ rs2 = s2.mask(cond[:3])
+ assert_series_equal(rs, rs2)
+
+ rs = s2.where(~cond[:3], -s2)
+ rs2 = s2.mask(cond[:3], -s2)
+ assert_series_equal(rs, rs2)
+
+ msg = "Array conditional must be same shape as self"
+ with pytest.raises(ValueError, match=msg):
+ s.mask(1)
+ with pytest.raises(ValueError, match=msg):
+ s.mask(cond[:3].values, -s)
+
+ # dtype changes
+ s = Series([1, 2, 3, 4])
+ result = s.mask(s > 2, np.nan)
+ expected = Series([1, 2, np.nan, np.nan])
+ assert_series_equal(result, expected)
+
+ # see gh-21891
+ s = Series([1, 2])
+ res = s.mask([True, False])
+
+ exp = Series([np.nan, 2])
+ tm.assert_series_equal(res, exp)
+
+
+def test_mask_inplace():
+ s = Series(np.random.randn(5))
+ cond = s > 0
+
+ rs = s.copy()
+ rs.mask(cond, inplace=True)
+ assert_series_equal(rs.dropna(), s[~cond])
+ assert_series_equal(rs, s.mask(cond))
+
+ rs = s.copy()
+ rs.mask(cond, -s, inplace=True)
+ assert_series_equal(rs, s.mask(cond, -s))
diff --git a/contrib/python/pandas/py2/pandas/tests/series/indexing/test_callable.py b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_callable.py
new file mode 100644
index 00000000000..b6561375459
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_callable.py
@@ -0,0 +1,33 @@
+import pandas as pd
+import pandas.util.testing as tm
+
+
+def test_getitem_callable():
+ # GH 12533
+ s = pd.Series(4, index=list('ABCD'))
+ result = s[lambda x: 'A']
+ assert result == s.loc['A']
+
+ result = s[lambda x: ['A', 'B']]
+ tm.assert_series_equal(result, s.loc[['A', 'B']])
+
+ result = s[lambda x: [True, False, True, True]]
+ tm.assert_series_equal(result, s.iloc[[0, 2, 3]])
+
+
+def test_setitem_callable():
+ # GH 12533
+ s = pd.Series([1, 2, 3, 4], index=list('ABCD'))
+ s[lambda x: 'A'] = -1
+ tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list('ABCD')))
+
+
+def test_setitem_other_callable():
+ # GH 13299
+ inc = lambda x: x + 1
+
+ s = pd.Series([1, 2, -1, 4])
+ s[s < 0] = inc
+
+ expected = pd.Series([1, 2, inc, 4])
+ tm.assert_series_equal(s, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/indexing/test_datetime.py b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_datetime.py
new file mode 100644
index 00000000000..0efc9feb0db
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_datetime.py
@@ -0,0 +1,714 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+
+from pandas._libs import iNaT
+import pandas._libs.index as _index
+from pandas.compat import lrange, range
+
+import pandas as pd
+from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal)
+
+
+"""
+Also test support for datetime64[ns] in Series / DataFrame
+"""
+
+
+def test_fancy_getitem():
+ dti = date_range(freq='WOM-1FRI', start=datetime(2005, 1, 1),
+ end=datetime(2010, 1, 1))
+
+ s = Series(np.arange(len(dti)), index=dti)
+
+ assert s[48] == 48
+ assert s['1/2/2009'] == 48
+ assert s['2009-1-2'] == 48
+ assert s[datetime(2009, 1, 2)] == 48
+ assert s[Timestamp(datetime(2009, 1, 2))] == 48
+ with pytest.raises(KeyError, match=r"^'2009-1-3'$"):
+ s['2009-1-3']
+ assert_series_equal(s['3/6/2009':'2009-06-05'],
+ s[datetime(2009, 3, 6):datetime(2009, 6, 5)])
+
+
+def test_fancy_setitem():
+ dti = date_range(freq='WOM-1FRI', start=datetime(2005, 1, 1),
+ end=datetime(2010, 1, 1))
+
+ s = Series(np.arange(len(dti)), index=dti)
+ s[48] = -1
+ assert s[48] == -1
+ s['1/2/2009'] = -2
+ assert s[48] == -2
+ s['1/2/2009':'2009-06-05'] = -3
+ assert (s[48:54] == -3).all()
+
+
+def test_dti_snap():
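+ # DatetimeIndex.snap rounds each timestamp to the nearest occurrence of the target frequency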
+ dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/4/2002',
+ '1/5/2002', '1/6/2002', '1/7/2002'], freq='D')
+
+ res = dti.snap(freq='W-MON')
+ exp = date_range('12/31/2001', '1/7/2002', freq='w-mon')
+ exp = exp.repeat([3, 4])
+ assert (res == exp).all()
+
+ res = dti.snap(freq='B')
+
+ exp = date_range('1/1/2002', '1/7/2002', freq='b')
+ exp = exp.repeat([1, 1, 1, 2, 2])
+ assert (res == exp).all()
+
+
+def test_dti_reset_index_round_trip():
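+ # round-tripping through reset_index/set_index should preserve datetime64[ns]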
+ dti = date_range(start='1/1/2001', end='6/1/2001', freq='D')
+ d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti)
+ d2 = d1.reset_index()
+ assert d2.dtypes[0] == np.dtype('M8[ns]')
+ d3 = d2.set_index('index')
+ assert_frame_equal(d1, d3, check_names=False)
+
+ # #2329
+ stamp = datetime(2012, 11, 22)
+ df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value'])
+ df = df.set_index('Date')
+
+ assert df.index[0] == stamp
+ assert df.reset_index()['Date'][0] == stamp
+
+
+def test_series_set_value():
+ # #1561
+
+ dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)]
+ index = DatetimeIndex(dates)
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ s = Series().set_value(dates[0], 1.)
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ s2 = s.set_value(dates[1], np.nan)
+
+ exp = Series([1., np.nan], index=index)
+
+ assert_series_equal(s2, exp)
+
+ # s = Series(index[:1], index[:1])
+ # s2 = s.set_value(dates[1], index[1])
+ # assert s2.values.dtype == 'M8[ns]'
+
+
+def test_slice_locs_indexerror():
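+ # slice bounds far outside the index should be clipped, not raise IndexError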
+ times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10)
+ for i in range(100000)]
+ s = Series(lrange(100000), times)
+ s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)]
+
+
+def test_slicing_datetimes():
+ # GH 7523
+
+ # unique
+ df = DataFrame(np.arange(4., dtype='float64'),
+ index=[datetime(2001, 1, i, 10, 00)
+ for i in [1, 2, 3, 4]])
+ result = df.loc[datetime(2001, 1, 1, 10):]
+ assert_frame_equal(result, df)
+ result = df.loc[:datetime(2001, 1, 4, 10)]
+ assert_frame_equal(result, df)
+ result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)]
+ assert_frame_equal(result, df)
+
+ result = df.loc[datetime(2001, 1, 1, 11):]
+ expected = df.iloc[1:]
+ assert_frame_equal(result, expected)
+ result = df.loc['20010101 11':]
+ assert_frame_equal(result, expected)
+
+ # duplicates
+ df = pd.DataFrame(np.arange(5., dtype='float64'),
+ index=[datetime(2001, 1, i, 10, 00)
+ for i in [1, 2, 2, 3, 4]])
+
+ result = df.loc[datetime(2001, 1, 1, 10):]
+ assert_frame_equal(result, df)
+ result = df.loc[:datetime(2001, 1, 4, 10)]
+ assert_frame_equal(result, df)
+ result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)]
+ assert_frame_equal(result, df)
+
+ result = df.loc[datetime(2001, 1, 1, 11):]
+ expected = df.iloc[1:]
+ assert_frame_equal(result, expected)
+ result = df.loc['20010101 11':]
+ assert_frame_equal(result, expected)
+
+
+def test_frame_datetime64_duplicated():
+ dates = date_range('2010-07-01', end='2010-08-05')
+
+ tst = DataFrame({'symbol': 'AAA', 'date': dates})
+ result = tst.duplicated(['date', 'symbol'])
+ assert (-result).all()
+
+ tst = DataFrame({'date': dates})
+ result = tst.duplicated()
+ assert (-result).all()
+
+
+def test_getitem_setitem_datetime_tz_pytz():
+ from pytz import timezone as tz
+ from pandas import date_range
+
+ N = 50
+ # testing with timezone, GH #2785
+ rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern')
+ ts = Series(np.random.randn(N), index=rng)
+
+ # also test Timestamp tz handling, GH #2789
+ result = ts.copy()
+ result["1990-01-01 09:00:00+00:00"] = 0
+ result["1990-01-01 09:00:00+00:00"] = ts[4]
+ assert_series_equal(result, ts)
+
+ result = ts.copy()
+ result["1990-01-01 03:00:00-06:00"] = 0
+ result["1990-01-01 03:00:00-06:00"] = ts[4]
+ assert_series_equal(result, ts)
+
+ # repeat with datetimes
+ result = ts.copy()
+ result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0
+ result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4]
+ assert_series_equal(result, ts)
+
+ result = ts.copy()
+
+ # comparison dates with datetime MUST be localized!
+ date = tz('US/Central').localize(datetime(1990, 1, 1, 3))
+ result[date] = 0
+ result[date] = ts[4]
+ assert_series_equal(result, ts)
+
+
+def test_getitem_setitem_datetime_tz_dateutil():
+ from dateutil.tz import tzutc
+ from pandas._libs.tslibs.timezones import dateutil_gettz as gettz
+
+ # dateutil treats UTC as a special case, so map 'UTC' to tzutc() and everything else through gettz
+ tz = lambda x: tzutc() if x == 'UTC' else gettz(x)
+
+ from pandas import date_range
+
+ N = 50
+
+ # testing with timezone, GH #2785
+ rng = date_range('1/1/1990', periods=N, freq='H',
+ tz='America/New_York')
+ ts = Series(np.random.randn(N), index=rng)
+
+ # also test Timestamp tz handling, GH #2789
+ result = ts.copy()
+ result["1990-01-01 09:00:00+00:00"] = 0
+ result["1990-01-01 09:00:00+00:00"] = ts[4]
+ assert_series_equal(result, ts)
+
+ result = ts.copy()
+ result["1990-01-01 03:00:00-06:00"] = 0
+ result["1990-01-01 03:00:00-06:00"] = ts[4]
+ assert_series_equal(result, ts)
+
+ # repeat with datetimes
+ result = ts.copy()
+ result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0
+ result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4]
+ assert_series_equal(result, ts)
+
+ result = ts.copy()
+ result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = 0
+ result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = ts[4]
+ assert_series_equal(result, ts)
+
+
+def test_getitem_setitem_datetimeindex():
+ N = 50
+ # testing with timezone, GH #2785
+ rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern')
+ ts = Series(np.random.randn(N), index=rng)
+
+ result = ts["1990-01-01 04:00:00"]
+ expected = ts[4]
+ assert result == expected
+
+ result = ts.copy()
+ result["1990-01-01 04:00:00"] = 0
+ result["1990-01-01 04:00:00"] = ts[4]
+ assert_series_equal(result, ts)
+
+ result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"]
+ expected = ts[4:8]
+ assert_series_equal(result, expected)
+
+ result = ts.copy()
+ result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0
+ result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8]
+ assert_series_equal(result, ts)
+
+ lb = "1990-01-01 04:00:00"
+ rb = "1990-01-01 07:00:00"
+ # GH#18435 strings get a pass from tzawareness compat
+ result = ts[(ts.index >= lb) & (ts.index <= rb)]
+ expected = ts[4:8]
+ assert_series_equal(result, expected)
+
+ lb = "1990-01-01 04:00:00-0500"
+ rb = "1990-01-01 07:00:00-0500"
+ result = ts[(ts.index >= lb) & (ts.index <= rb)]
+ expected = ts[4:8]
+ assert_series_equal(result, expected)
+
+ # repeat all the above with naive datetimes
+ result = ts[datetime(1990, 1, 1, 4)]
+ expected = ts[4]
+ assert result == expected
+
+ result = ts.copy()
+ result[datetime(1990, 1, 1, 4)] = 0
+ result[datetime(1990, 1, 1, 4)] = ts[4]
+ assert_series_equal(result, ts)
+
+ result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)]
+ expected = ts[4:8]
+ assert_series_equal(result, expected)
+
+ result = ts.copy()
+ result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0
+ result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8]
+ assert_series_equal(result, ts)
+
+ lb = datetime(1990, 1, 1, 4)
+ rb = datetime(1990, 1, 1, 7)
+ msg = "Cannot compare tz-naive and tz-aware datetime-like objects"
+ with pytest.raises(TypeError, match=msg):
+ # tznaive vs tzaware comparison is invalid
+ # see GH#18376, GH#18162
+ ts[(ts.index >= lb) & (ts.index <= rb)]
+
+ lb = pd.Timestamp(datetime(1990, 1, 1, 4)).tz_localize(rng.tzinfo)
+ rb = pd.Timestamp(datetime(1990, 1, 1, 7)).tz_localize(rng.tzinfo)
+ result = ts[(ts.index >= lb) & (ts.index <= rb)]
+ expected = ts[4:8]
+ assert_series_equal(result, expected)
+
+ result = ts[ts.index[4]]
+ expected = ts[4]
+ assert result == expected
+
+ result = ts[ts.index[4:8]]
+ expected = ts[4:8]
+ assert_series_equal(result, expected)
+
+ result = ts.copy()
+ result[ts.index[4:8]] = 0
+ result[4:8] = ts[4:8]
+ assert_series_equal(result, ts)
+
+ # also test partial date slicing
+ result = ts["1990-01-02"]
+ expected = ts[24:48]
+ assert_series_equal(result, expected)
+
+ result = ts.copy()
+ result["1990-01-02"] = 0
+ result["1990-01-02"] = ts[24:48]
+ assert_series_equal(result, ts)
+
+
+def test_getitem_setitem_periodindex():
+ from pandas import period_range
+
+ N = 50
+ rng = period_range('1/1/1990', periods=N, freq='H')
+ ts = Series(np.random.randn(N), index=rng)
+
+ result = ts["1990-01-01 04"]
+ expected = ts[4]
+ assert result == expected
+
+ result = ts.copy()
+ result["1990-01-01 04"] = 0
+ result["1990-01-01 04"] = ts[4]
+ assert_series_equal(result, ts)
+
+ result = ts["1990-01-01 04":"1990-01-01 07"]
+ expected = ts[4:8]
+ assert_series_equal(result, expected)
+
+ result = ts.copy()
+ result["1990-01-01 04":"1990-01-01 07"] = 0
+ result["1990-01-01 04":"1990-01-01 07"] = ts[4:8]
+ assert_series_equal(result, ts)
+
+ lb = "1990-01-01 04"
+ rb = "1990-01-01 07"
+ result = ts[(ts.index >= lb) & (ts.index <= rb)]
+ expected = ts[4:8]
+ assert_series_equal(result, expected)
+
+ # GH 2782
+ result = ts[ts.index[4]]
+ expected = ts[4]
+ assert result == expected
+
+ result = ts[ts.index[4:8]]
+ expected = ts[4:8]
+ assert_series_equal(result, expected)
+
+ result = ts.copy()
+ result[ts.index[4:8]] = 0
+ result[4:8] = ts[4:8]
+ assert_series_equal(result, ts)
+
+
+# FutureWarning from NumPy.
[email protected]("ignore:Using a non-tuple:FutureWarning")
+def test_getitem_median_slice_bug():
+ index = date_range('20090415', '20090519', freq='2B')
+ s = Series(np.random.randn(13), index=index)
+
+ indexer = [slice(6, 7, None)]
+ result = s[indexer]
+ expected = s[indexer[0]]
+ assert_series_equal(result, expected)
+
+
+def test_datetime_indexing():
+ from pandas import date_range
+
+ index = date_range('1/1/2000', '1/7/2000')
+ index = index.repeat(3)
+
+ s = Series(len(index), index=index)
+ stamp = Timestamp('1/8/2000')
+
+ with pytest.raises(KeyError, match=r"^947289600000000000L?$"):
+ s[stamp]
+ s[stamp] = 0
+ assert s[stamp] == 0
+
+ # not monotonic
+ s = Series(len(index), index=index)
+ s = s[::-1]
+
+ with pytest.raises(KeyError, match=r"^947289600000000000L?$"):
+ s[stamp]
+ s[stamp] = 0
+ assert s[stamp] == 0
+
+
+"""
+test duplicates in time series
+"""
+
+
[email protected](scope='module')
+def dups():
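+ # a Series over a monotonic DatetimeIndex that contains duplicate dates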
+ dates = [datetime(2000, 1, 2), datetime(2000, 1, 2),
+ datetime(2000, 1, 2), datetime(2000, 1, 3),
+ datetime(2000, 1, 3), datetime(2000, 1, 3),
+ datetime(2000, 1, 4), datetime(2000, 1, 4),
+ datetime(2000, 1, 4), datetime(2000, 1, 5)]
+
+ return Series(np.random.randn(len(dates)), index=dates)
+
+
+def test_constructor(dups):
+ assert isinstance(dups, Series)
+ assert isinstance(dups.index, DatetimeIndex)
+
+
+def test_is_unique_monotonic(dups):
+ assert not dups.index.is_unique
+
+
+def test_index_unique(dups):
+ uniques = dups.index.unique()
+ expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3),
+ datetime(2000, 1, 4), datetime(2000, 1, 5)])
+ assert uniques.dtype == 'M8[ns]' # sanity
+ tm.assert_index_equal(uniques, expected)
+ assert dups.index.nunique() == 4
+
+ # #2563
+ assert isinstance(uniques, DatetimeIndex)
+
+ dups_local = dups.index.tz_localize('US/Eastern')
+ dups_local.name = 'foo'
+ result = dups_local.unique()
+ expected = DatetimeIndex(expected, name='foo')
+ expected = expected.tz_localize('US/Eastern')
+ assert result.tz is not None
+ assert result.name == 'foo'
+ tm.assert_index_equal(result, expected)
+
+ # NaT, note this is excluded
+ arr = [1370745748 + t for t in range(20)] + [iNaT]
+ idx = DatetimeIndex(arr * 3)
+ tm.assert_index_equal(idx.unique(), DatetimeIndex(arr))
+ assert idx.nunique() == 20
+ assert idx.nunique(dropna=False) == 21
+
+ arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t)
+ for t in range(20)] + [NaT]
+ idx = DatetimeIndex(arr * 3)
+ tm.assert_index_equal(idx.unique(), DatetimeIndex(arr))
+ assert idx.nunique() == 20
+ assert idx.nunique(dropna=False) == 21
+
+
+def test_index_dupes_contains():
+ d = datetime(2011, 12, 5, 20, 30)
+ ix = DatetimeIndex([d, d])
+ assert d in ix
+
+
+def test_duplicate_dates_indexing(dups):
+ ts = dups
+
+ uniques = ts.index.unique()
+ for date in uniques:
+ result = ts[date]
+
+ mask = ts.index == date
+ total = (ts.index == date).sum()
+ expected = ts[mask]
+ if total > 1:
+ assert_series_equal(result, expected)
+ else:
+ assert_almost_equal(result, expected[0])
+
+ cp = ts.copy()
+ cp[date] = 0
+ expected = Series(np.where(mask, 0, ts), index=ts.index)
+ assert_series_equal(cp, expected)
+
+ with pytest.raises(KeyError, match=r"^947116800000000000L?$"):
+ ts[datetime(2000, 1, 6)]
+
+ # new index
+ ts[datetime(2000, 1, 6)] = 0
+ assert ts[datetime(2000, 1, 6)] == 0
+
+
+def test_range_slice():
+ idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000',
+ '1/4/2000'])
+
+ ts = Series(np.random.randn(len(idx)), index=idx)
+
+ result = ts['1/2/2000':]
+ expected = ts[1:]
+ assert_series_equal(result, expected)
+
+ result = ts['1/2/2000':'1/3/2000']
+ expected = ts[1:4]
+ assert_series_equal(result, expected)
+
+
+def test_groupby_average_dup_values(dups):
+ result = dups.groupby(level=0).mean()
+ expected = dups.groupby(dups.index).mean()
+ assert_series_equal(result, expected)
+
+
+def test_indexing_over_size_cutoff():
+ import datetime
+ # #1821
+
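+ # shrink the indexing engine's size cutoff so this ~4400-row frame takes the large-index path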
+ old_cutoff = _index._SIZE_CUTOFF
+ try:
+ _index._SIZE_CUTOFF = 1000
+
+ # create large list of non periodic datetime
+ dates = []
+ sec = datetime.timedelta(seconds=1)
+ half_sec = datetime.timedelta(microseconds=500000)
+ d = datetime.datetime(2011, 12, 5, 20, 30)
+ n = 1100
+ for i in range(n):
+ dates.append(d)
+ dates.append(d + sec)
+ dates.append(d + sec + half_sec)
+ dates.append(d + sec + sec + half_sec)
+ d += 3 * sec
+
+ # duplicate some values in the list
+ duplicate_positions = np.random.randint(0, len(dates) - 1, 20)
+ for p in duplicate_positions:
+ dates[p + 1] = dates[p]
+
+ df = DataFrame(np.random.randn(len(dates), 4),
+ index=dates,
+ columns=list('ABCD'))
+
+ pos = n * 3
+ timestamp = df.index[pos]
+ assert timestamp in df.index
+
+ # it works!
+ df.loc[timestamp]
+ assert len(df.loc[[timestamp]]) > 0
+ finally:
+ _index._SIZE_CUTOFF = old_cutoff
+
+
+def test_indexing_unordered():
+ # GH 2437
+ rng = date_range(start='2011-01-01', end='2011-01-15')
+ ts = Series(np.random.rand(len(rng)), index=rng)
+ ts2 = pd.concat([ts[0:4], ts[-4:], ts[4:-4]])
+
+ for t in ts.index:
+ # TODO: unused?
+ s = str(t) # noqa
+
+ expected = ts[t]
+ result = ts2[t]
+ assert expected == result
+
+ # GH 3448 (ranges)
+ def compare(slobj):
+ result = ts2[slobj].copy()
+ result = result.sort_index()
+ expected = ts[slobj]
+ assert_series_equal(result, expected)
+
+ compare(slice('2011-01-01', '2011-01-15'))
+ compare(slice('2010-12-30', '2011-01-15'))
+ compare(slice('2011-01-01', '2011-01-16'))
+
+ # partial ranges
+ compare(slice('2011-01-01', '2011-01-6'))
+ compare(slice('2011-01-06', '2011-01-8'))
+ compare(slice('2011-01-06', '2011-01-12'))
+
+ # single values
+ result = ts2['2011'].sort_index()
+ expected = ts['2011']
+ assert_series_equal(result, expected)
+
+ # diff freq
+ rng = date_range(datetime(2005, 1, 1), periods=20, freq='M')
+ ts = Series(np.arange(len(rng)), index=rng)
+ ts = ts.take(np.random.permutation(20))
+
+ result = ts['2005']
+ for t in result.index:
+ assert t.year == 2005
+
+
+def test_indexing():
+ idx = date_range("2001-1-1", periods=20, freq='M')
+ ts = Series(np.random.rand(len(idx)), index=idx)
+
+ # getting
+
+ # GH 3070, make sure semantics work on Series/Frame
+ expected = ts['2001']
+ expected.name = 'A'
+
+ df = DataFrame(dict(A=ts))
+ result = df['2001']['A']
+ assert_series_equal(expected, result)
+
+ # setting
+ ts['2001'] = 1
+ expected = ts['2001']
+ expected.name = 'A'
+
+ df.loc['2001', 'A'] = 1
+
+ result = df['2001']['A']
+ assert_series_equal(expected, result)
+
+ # GH3546 (not including times on the last day)
+ idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00',
+ freq='H')
+ ts = Series(lrange(len(idx)), index=idx)
+ expected = ts['2013-05']
+ assert_series_equal(expected, ts)
+
+ idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59',
+ freq='S')
+ ts = Series(lrange(len(idx)), index=idx)
+ expected = ts['2013-05']
+ assert_series_equal(expected, ts)
+
+ idx = [Timestamp('2013-05-31 00:00'),
+ Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999))]
+ ts = Series(lrange(len(idx)), index=idx)
+ expected = ts['2013']
+ assert_series_equal(expected, ts)
+
+ # GH14826, indexing with a seconds resolution string / datetime object
+ df = DataFrame(np.random.rand(5, 5),
+ columns=['open', 'high', 'low', 'close', 'volume'],
+ index=date_range('2012-01-02 18:01:00',
+ periods=5, tz='US/Central', freq='s'))
+ expected = df.loc[[df.index[2]]]
+
+ # this is a single date, so will raise
+ with pytest.raises(KeyError, match=r"^'2012-01-02 18:01:02'$"):
+ df['2012-01-02 18:01:02']
+ msg = r"Timestamp\('2012-01-02 18:01:02-0600', tz='US/Central', freq='S'\)"
+ with pytest.raises(KeyError, match=msg):
+ df[df.index[2]]
+
+
+"""
+test NaT support
+"""
+
+
+def test_set_none_nan():
+ series = Series(date_range('1/1/2000', periods=10))
+ series[3] = None
+ assert series[3] is NaT
+
+ series[3:5] = None
+ assert series[4] is NaT
+
+ series[5] = np.nan
+ assert series[5] is NaT
+
+ series[5:7] = np.nan
+ assert series[6] is NaT
+
+
+def test_nat_operations():
+ # GH 8617
+ s = Series([0, pd.NaT], dtype='m8[ns]')
+ exp = s[0]
+ assert s.median() == exp
+ assert s.min() == exp
+ assert s.max() == exp
+
+
[email protected]('method', ["round", "floor", "ceil"])
[email protected]('freq', ["s", "5s", "min", "5min", "h", "5h"])
+def test_round_nat(method, freq):
+ # GH14940
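+ # rounding NaT is a no-op regardless of frequency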
+ s = Series([pd.NaT])
+ expected = Series(pd.NaT)
+ round_method = getattr(s.dt, method)
+ assert_series_equal(round_method(freq), expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/indexing/test_iloc.py b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_iloc.py
new file mode 100644
index 00000000000..fa85da6a70d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_iloc.py
@@ -0,0 +1,37 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+import numpy as np
+
+from pandas.compat import lrange, range
+
+from pandas import Series
+from pandas.util.testing import assert_almost_equal, assert_series_equal
+
+
+def test_iloc():
+ s = Series(np.random.randn(10), index=lrange(0, 20, 2))
+
+ for i in range(len(s)):
+ result = s.iloc[i]
+ exp = s[s.index[i]]
+ assert_almost_equal(result, exp)
+
+ # pass a slice
+ result = s.iloc[slice(1, 3)]
+ expected = s.loc[2:4]
+ assert_series_equal(result, expected)
+
+ # test slice is a view
+ result[:] = 0
+ assert (s[1:3] == 0).all()
+
+ # list of integers
+ result = s.iloc[[0, 2, 3, 4, 5]]
+ expected = s.reindex(s.index[[0, 2, 3, 4, 5]])
+ assert_series_equal(result, expected)
+
+
+def test_iloc_nonunique():
+ s = Series([0, 1, 2], index=[0, 1, 0])
+ assert s.iloc[2] == 2
diff --git a/contrib/python/pandas/py2/pandas/tests/series/indexing/test_indexing.py b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_indexing.py
new file mode 100644
index 00000000000..a5855f68127
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_indexing.py
@@ -0,0 +1,840 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+""" test get/set & misc """
+
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange, range
+
+from pandas.core.dtypes.common import is_scalar
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, MultiIndex, Series, Timedelta, Timestamp)
+import pandas.util.testing as tm
+from pandas.util.testing import assert_series_equal
+
+from pandas.tseries.offsets import BDay
+
+
+def test_basic_indexing():
+ s = Series(np.random.randn(5), index=['a', 'b', 'a', 'a', 'b'])
+
+ msg = "index out of bounds"
+ with pytest.raises(IndexError, match=msg):
+ s[5]
+ msg = "index 5 is out of bounds for axis 0 with size 5"
+ with pytest.raises(IndexError, match=msg):
+ s[5] = 0
+
+ with pytest.raises(KeyError, match=r"^'c'$"):
+ s['c']
+
+ s = s.sort_index()
+
+ msg = r"index out of bounds|^5$"
+ with pytest.raises(IndexError, match=msg):
+ s[5]
+ msg = r"index 5 is out of bounds for axis (0|1) with size 5|^5$"
+ with pytest.raises(IndexError, match=msg):
+ s[5] = 0
+
+
+def test_basic_getitem_with_labels(test_data):
+ indices = test_data.ts.index[[5, 10, 15]]
+
+ result = test_data.ts[indices]
+ expected = test_data.ts.reindex(indices)
+ assert_series_equal(result, expected)
+
+ result = test_data.ts[indices[0]:indices[2]]
+ expected = test_data.ts.loc[indices[0]:indices[2]]
+ assert_series_equal(result, expected)
+
+ # integer indexes, be careful
+ s = Series(np.random.randn(10), index=lrange(0, 20, 2))
+ inds = [0, 2, 5, 7, 8]
+ arr_inds = np.array([0, 2, 5, 7, 8])
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = s[inds]
+ expected = s.reindex(inds)
+ assert_series_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = s[arr_inds]
+ expected = s.reindex(arr_inds)
+ assert_series_equal(result, expected)
+
+ # GH12089
+ # with tz for values
+ s = Series(pd.date_range("2011-01-01", periods=3, tz="US/Eastern"),
+ index=['a', 'b', 'c'])
+ expected = Timestamp('2011-01-01', tz='US/Eastern')
+ result = s.loc['a']
+ assert result == expected
+ result = s.iloc[0]
+ assert result == expected
+ result = s['a']
+ assert result == expected
+
+
+def test_getitem_setitem_ellipsis():
+ s = Series(np.random.randn(10))
+
+ np.fix(s)
+
+ result = s[...]
+ assert_series_equal(result, s)
+
+ s[...] = 5
+ assert (result == 5).all()
+
+
+def test_getitem_get(test_data):
+ test_series = test_data.series
+ test_obj_series = test_data.objSeries
+
+ idx1 = test_series.index[5]
+ idx2 = test_obj_series.index[5]
+
+ assert test_series[idx1] == test_series.get(idx1)
+ assert test_obj_series[idx2] == test_obj_series.get(idx2)
+
+ assert test_series[idx1] == test_series[5]
+ assert test_obj_series[idx2] == test_obj_series[5]
+
+ assert test_series.get(-1) == test_series.get(test_series.index[-1])
+ assert test_series[5] == test_series.get(test_series.index[5])
+
+ # missing
+ d = test_data.ts.index[0] - BDay()
+ with pytest.raises(KeyError, match=r"Timestamp\('1999-12-31 00:00:00'\)"):
+ test_data.ts[d]
+
+ # None
+ # GH 5652
+ for s in [Series(), Series(index=list('abc'))]:
+ result = s.get(None)
+ assert result is None
+
+
+def test_getitem_fancy(test_data):
+ slice1 = test_data.series[[1, 2, 3]]
+ slice2 = test_data.objSeries[[1, 2, 3]]
+ assert test_data.series.index[2] == slice1.index[1]
+ assert test_data.objSeries.index[2] == slice2.index[1]
+ assert test_data.series[2] == slice1[1]
+ assert test_data.objSeries[2] == slice2[1]
+
+
+def test_getitem_generator(test_data):
+ gen = (x > 0 for x in test_data.series)
+ result = test_data.series[gen]
+ result2 = test_data.series[iter(test_data.series > 0)]
+ expected = test_data.series[test_data.series > 0]
+ assert_series_equal(result, expected)
+ assert_series_equal(result2, expected)
+
+
+def test_type_promotion():
+ # GH12599
+ s = pd.Series()
+ s["a"] = pd.Timestamp("2016-01-01")
+ s["b"] = 3.0
+ s["c"] = "foo"
+ expected = Series([pd.Timestamp("2016-01-01"), 3.0, "foo"],
+ index=["a", "b", "c"])
+ assert_series_equal(s, expected)
+
+
[email protected](
+ 'result_1, duplicate_item, expected_1',
+ [
+ [
+ pd.Series({1: 12, 2: [1, 2, 2, 3]}), pd.Series({1: 313}),
+ pd.Series({1: 12, }, dtype=object),
+ ],
+ [
+ pd.Series({1: [1, 2, 3], 2: [1, 2, 2, 3]}),
+ pd.Series({1: [1, 2, 3]}), pd.Series({1: [1, 2, 3], }),
+ ],
+ ])
+def test_getitem_with_duplicates_indices(
+ result_1, duplicate_item, expected_1):
+ # GH 17610
+ result = result_1.append(duplicate_item)
+ expected = expected_1.append(duplicate_item)
+ assert_series_equal(result[1], expected)
+ assert result[2] == result_1[2]
+
+
+def test_getitem_out_of_bounds(test_data):
+ # don't segfault, GH #495
+ msg = "index out of bounds"
+ with pytest.raises(IndexError, match=msg):
+ test_data.ts[len(test_data.ts)]
+
+ # GH #917
+ s = Series([])
+ with pytest.raises(IndexError, match=msg):
+ s[-1]
+
+
+def test_getitem_setitem_integers():
+ # previously caused a bug that had no test coverage
+ s = Series([1, 2, 3], ['a', 'b', 'c'])
+
+ assert s.iloc[0] == s['a']
+ s.iloc[0] = 5
+ tm.assert_almost_equal(s['a'], 5)
+
+
+def test_getitem_box_float64(test_data):
+ value = test_data.ts[5]
+ assert isinstance(value, np.float64)
+
+
[email protected](
+ 'arr',
+ [
+ np.random.randn(10),
+ tm.makeDateIndex(10, name='a').tz_localize(
+ tz='US/Eastern'),
+ ])
+def test_get(arr):
+ # GH 21260
+ s = Series(arr, index=[2 * i for i in range(len(arr))])
+ assert s.get(4) == s.iloc[2]
+
+ result = s.get([4, 6])
+ expected = s.iloc[[2, 3]]
+ tm.assert_series_equal(result, expected)
+
+ result = s.get(slice(2))
+ expected = s.iloc[[0, 1]]
+ tm.assert_series_equal(result, expected)
+
+ assert s.get(-1) is None
+ assert s.get(s.index.max() + 1) is None
+
+ s = Series(arr[:6], index=list('abcdef'))
+ assert s.get('c') == s.iloc[2]
+
+ result = s.get(slice('b', 'd'))
+ expected = s.iloc[[1, 2, 3]]
+ tm.assert_series_equal(result, expected)
+
+ result = s.get('Z')
+ assert result is None
+
+ assert s.get(4) == s.iloc[4]
+ assert s.get(-1) == s.iloc[-1]
+ assert s.get(len(s)) is None
+
+ # GH 21257
+ s = pd.Series(arr)
+ s2 = s[::2]
+ assert s2.get(1) is None
+
+
+def test_series_box_timestamp():
+ rng = pd.date_range('20090415', '20090519', freq='B')
+ ser = Series(rng)
+
+ assert isinstance(ser[5], pd.Timestamp)
+
+ rng = pd.date_range('20090415', '20090519', freq='B')
+ ser = Series(rng, index=rng)
+ assert isinstance(ser[5], pd.Timestamp)
+
+ assert isinstance(ser.iat[5], pd.Timestamp)
+
+
+def test_getitem_ambiguous_keyerror():
+ s = Series(lrange(10), index=lrange(0, 20, 2))
+ with pytest.raises(KeyError, match=r"^1L?$"):
+ s[1]
+ with pytest.raises(KeyError, match=r"^1L?$"):
+ s.loc[1]
+
+
+def test_getitem_unordered_dup():
+ obj = Series(lrange(5), index=['c', 'a', 'a', 'b', 'b'])
+ assert is_scalar(obj['c'])
+ assert obj['c'] == 0
+
+
+def test_getitem_dups_with_missing():
+ # breaks reindex, so need to use .loc internally
+ # GH 4246
+ s = Series([1, 2, 3, 4], ['foo', 'bar', 'foo', 'bah'])
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ expected = s.loc[['foo', 'bar', 'bah', 'bam']]
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = s[['foo', 'bar', 'bah', 'bam']]
+ assert_series_equal(result, expected)
+
+
+def test_getitem_dups():
+ s = Series(range(5), index=['A', 'A', 'B', 'C', 'C'], dtype=np.int64)
+ expected = Series([3, 4], index=['C', 'C'], dtype=np.int64)
+ result = s['C']
+ assert_series_equal(result, expected)
+
+
+def test_setitem_ambiguous_keyerror():
+ s = Series(lrange(10), index=lrange(0, 20, 2))
+
+ # equivalent of an append
+ s2 = s.copy()
+ s2[1] = 5
+ expected = s.append(Series([5], index=[1]))
+ assert_series_equal(s2, expected)
+
+ s2 = s.copy()
+ s2.loc[1] = 5
+ expected = s.append(Series([5], index=[1]))
+ assert_series_equal(s2, expected)
+
+
+def test_getitem_dataframe():
+ rng = list(range(10))
+ s = pd.Series(10, index=rng)
+ df = pd.DataFrame(rng, index=rng)
+ msg = ("Indexing a Series with DataFrame is not supported,"
+ " use the appropriate DataFrame column")
+ with pytest.raises(TypeError, match=msg):
+ s[df > 5]
+
+
+def test_setitem(test_data):
+ test_data.ts[test_data.ts.index[5]] = np.NaN
+ test_data.ts[[1, 2, 17]] = np.NaN
+ test_data.ts[6] = np.NaN
+ assert np.isnan(test_data.ts[6])
+ assert np.isnan(test_data.ts[2])
+ test_data.ts[np.isnan(test_data.ts)] = 5
+ assert not np.isnan(test_data.ts[2])
+
+ # caught this bug when writing tests
+ series = Series(tm.makeIntIndex(20).astype(float),
+ index=tm.makeIntIndex(20))
+
+ series[::2] = 0
+ assert (series[::2] == 0).all()
+
+ # set item that's not contained
+ s = test_data.series.copy()
+ s['foobar'] = 1
+
+ app = Series([1], index=['foobar'], name='series')
+ expected = test_data.series.append(app)
+ assert_series_equal(s, expected)
+
+ # Test for issue #10193
+ key = pd.Timestamp('2012-01-01')
+ series = pd.Series()
+ series[key] = 47
+ expected = pd.Series(47, [key])
+ assert_series_equal(series, expected)
+
+ series = pd.Series([], pd.DatetimeIndex([], freq='D'))
+ series[key] = 47
+ expected = pd.Series(47, pd.DatetimeIndex([key], freq='D'))
+ assert_series_equal(series, expected)
+
+
+def test_setitem_dtypes():
+ # change dtypes
+ # GH 4463
+ expected = Series([np.nan, 2, 3])
+
+ s = Series([1, 2, 3])
+ s.iloc[0] = np.nan
+ assert_series_equal(s, expected)
+
+ s = Series([1, 2, 3])
+ s.loc[0] = np.nan
+ assert_series_equal(s, expected)
+
+ s = Series([1, 2, 3])
+ s[0] = np.nan
+ assert_series_equal(s, expected)
+
+ s = Series([False])
+ s.loc[0] = np.nan
+ assert_series_equal(s, Series([np.nan]))
+
+ s = Series([False, True])
+ s.loc[0] = np.nan
+ assert_series_equal(s, Series([np.nan, 1.0]))
+
+
+def test_set_value(test_data):
+ idx = test_data.ts.index[10]
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ res = test_data.ts.set_value(idx, 0)
+ assert res is test_data.ts
+ assert test_data.ts[idx] == 0
+
+ # equiv
+ s = test_data.series.copy()
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ res = s.set_value('foobar', 0)
+ assert res is s
+ assert res.index[-1] == 'foobar'
+ assert res['foobar'] == 0
+
+ s = test_data.series.copy()
+ s.loc['foobar'] = 0
+ assert s.index[-1] == 'foobar'
+ assert s['foobar'] == 0
+
+
+def test_setslice(test_data):
+ sl = test_data.ts[5:20]
+ assert len(sl) == len(sl.index)
+ assert sl.index.is_unique is True
+
+
+# FutureWarning from NumPy about indexing with [slice(None, 5)].
[email protected]("ignore:Using a non-tuple:FutureWarning")
+def test_basic_getitem_setitem_corner(test_data):
+ # invalid tuples, e.g. td.ts[:, None] vs. td.ts[:, 2]
+ msg = "Can only tuple-index with a MultiIndex"
+ with pytest.raises(ValueError, match=msg):
+ test_data.ts[:, 2]
+ with pytest.raises(ValueError, match=msg):
+ test_data.ts[:, 2] = 2
+
+ # weird lists. [slice(0, 5)] will work but not two slices
+ result = test_data.ts[[slice(None, 5)]]
+ expected = test_data.ts[:5]
+ assert_series_equal(result, expected)
+
+ # but a list mixing a scalar with a slice is unhashable and raises
+ msg = r"unhashable type(: 'slice')?"
+ with pytest.raises(TypeError, match=msg):
+ test_data.ts[[5, slice(None, None)]]
+ with pytest.raises(TypeError, match=msg):
+ test_data.ts[[5, slice(None, None)]] = 2
+
+
[email protected]('tz', ['US/Eastern', 'UTC', 'Asia/Tokyo'])
+def test_setitem_with_tz(tz):
+ orig = pd.Series(pd.date_range('2016-01-01', freq='H', periods=3,
+ tz=tz))
+ assert orig.dtype == 'datetime64[ns, {0}]'.format(tz)
+
+ # scalar
+ s = orig.copy()
+ s[1] = pd.Timestamp('2011-01-01', tz=tz)
+ exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz),
+ pd.Timestamp('2011-01-01 00:00', tz=tz),
+ pd.Timestamp('2016-01-01 02:00', tz=tz)])
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s.loc[1] = pd.Timestamp('2011-01-01', tz=tz)
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz)
+ tm.assert_series_equal(s, exp)
+
+ # vector
+ vals = pd.Series([pd.Timestamp('2011-01-01', tz=tz),
+ pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2])
+ assert vals.dtype == 'datetime64[ns, {0}]'.format(tz)
+
+ s[[1, 2]] = vals
+ exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz),
+ pd.Timestamp('2011-01-01 00:00', tz=tz),
+ pd.Timestamp('2012-01-01 00:00', tz=tz)])
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s.loc[[1, 2]] = vals
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s.iloc[[1, 2]] = vals
+ tm.assert_series_equal(s, exp)
+
+
+def test_setitem_with_tz_dst():
+ # GH XXX
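+ # 2016-11-06 is the US/Eastern DST fall-back date, so fixed offsets change mid-series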
+ tz = 'US/Eastern'
+ orig = pd.Series(pd.date_range('2016-11-06', freq='H', periods=3,
+ tz=tz))
+ assert orig.dtype == 'datetime64[ns, {0}]'.format(tz)
+
+ # scalar
+ s = orig.copy()
+ s[1] = pd.Timestamp('2011-01-01', tz=tz)
+ exp = pd.Series([pd.Timestamp('2016-11-06 00:00-04:00', tz=tz),
+ pd.Timestamp('2011-01-01 00:00-05:00', tz=tz),
+ pd.Timestamp('2016-11-06 01:00-05:00', tz=tz)])
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s.loc[1] = pd.Timestamp('2011-01-01', tz=tz)
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz)
+ tm.assert_series_equal(s, exp)
+
+ # vector
+ vals = pd.Series([pd.Timestamp('2011-01-01', tz=tz),
+ pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2])
+ assert vals.dtype == 'datetime64[ns, {0}]'.format(tz)
+
+ s[[1, 2]] = vals
+ exp = pd.Series([pd.Timestamp('2016-11-06 00:00', tz=tz),
+ pd.Timestamp('2011-01-01 00:00', tz=tz),
+ pd.Timestamp('2012-01-01 00:00', tz=tz)])
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s.loc[[1, 2]] = vals
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s.iloc[[1, 2]] = vals
+ tm.assert_series_equal(s, exp)
+
+
+def test_categorical_assigning_ops():
+ orig = Series(Categorical(["b", "b"], categories=["a", "b"]))
+ s = orig.copy()
+ s[:] = "a"
+ exp = Series(Categorical(["a", "a"], categories=["a", "b"]))
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s[1] = "a"
+ exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s[s.index > 0] = "a"
+ exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s[[False, True]] = "a"
+ exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
+ tm.assert_series_equal(s, exp)
+
+ s = orig.copy()
+ s.index = ["x", "y"]
+ s["y"] = "a"
+ exp = Series(Categorical(["b", "a"], categories=["a", "b"]),
+ index=["x", "y"])
+ tm.assert_series_equal(s, exp)
+
+ # ensure that one can set something to np.nan
+ s = Series(Categorical([1, 2, 3]))
+ exp = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3]))
+ s[1] = np.nan
+ tm.assert_series_equal(s, exp)
+
+
+def test_slice(test_data):
+ numSlice = test_data.series[10:20]
+ numSliceEnd = test_data.series[-10:]
+ objSlice = test_data.objSeries[10:20]
+
+ assert test_data.series.index[9] not in numSlice.index
+ assert test_data.objSeries.index[9] not in objSlice.index
+
+ assert len(numSlice) == len(numSlice.index)
+ assert test_data.series[numSlice.index[0]] == numSlice[numSlice.index[0]]
+
+ assert numSlice.index[1] == test_data.series.index[11]
+ assert tm.equalContents(numSliceEnd, np.array(test_data.series)[-10:])
+
+ # Test return view.
+ sl = test_data.series[10:20]
+ sl[:] = 0
+
+ assert (test_data.series[10:20] == 0).all()
+
+
+def test_slice_can_reorder_not_uniquely_indexed():
+ s = Series(1, index=['a', 'a', 'b', 'b', 'c'])
+ s[::-1] # it works!
+
+
+def test_ix_setitem(test_data):
+ inds = test_data.series.index[[3, 4, 7]]
+
+ result = test_data.series.copy()
+ result.loc[inds] = 5
+
+ expected = test_data.series.copy()
+ expected[[3, 4, 7]] = 5
+ assert_series_equal(result, expected)
+
+ result.iloc[5:10] = 10
+ expected[5:10] = 10
+ assert_series_equal(result, expected)
+
+ # set slice with indices
+ d1, d2 = test_data.series.index[[5, 15]]
+ result.loc[d1:d2] = 6
+ expected[5:16] = 6 # because it's inclusive
+ assert_series_equal(result, expected)
+
+ # set index value
+ test_data.series.loc[d1] = 4
+ test_data.series.loc[d2] = 6
+ assert test_data.series[d1] == 4
+ assert test_data.series[d2] == 6
+
+
+def test_setitem_na():
+ # these induce dtype changes
+ expected = Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan])
+ s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10])
+ s[::2] = np.nan
+ assert_series_equal(s, expected)
+
+ # booleans are coerced to float once NaN is introduced
+ expected = Series([np.nan, 1, np.nan, 0])
+ s = Series([True, True, False, False])
+ s[::2] = np.nan
+ assert_series_equal(s, expected)
+
+ expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8,
+ 9])
+ s = Series(np.arange(10))
+ s[:5] = np.nan
+ assert_series_equal(s, expected)
+
+
+def test_timedelta_assignment():
+ # GH 8209
+ s = Series([])
+ s.loc['B'] = timedelta(1)
+ tm.assert_series_equal(s, Series(Timedelta('1 days'), index=['B']))
+
+ s = s.reindex(s.index.insert(0, 'A'))
+ tm.assert_series_equal(s, Series(
+ [np.nan, Timedelta('1 days')], index=['A', 'B']))
+
+ result = s.fillna(timedelta(1))
+ expected = Series(Timedelta('1 days'), index=['A', 'B'])
+ tm.assert_series_equal(result, expected)
+
+ s.loc['A'] = timedelta(1)
+ tm.assert_series_equal(s, expected)
+
+ # GH 14155
+ s = Series(10 * [np.timedelta64(10, 'm')])
+ s.loc[[1, 2, 3]] = np.timedelta64(20, 'm')
+ expected = pd.Series(10 * [np.timedelta64(10, 'm')])
+ expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, 'm'))
+ tm.assert_series_equal(s, expected)
+
+
+def test_underlying_data_conversion():
+ # GH 4080
+ df = DataFrame({c: [1, 2, 3] for c in ['a', 'b', 'c']})
+ df.set_index(['a', 'b', 'c'], inplace=True)
+ s = Series([1], index=[(2, 2, 2)])
+ df['val'] = 0
+ df
+ df['val'].update(s)
+
+ expected = DataFrame(
+ dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0]))
+ expected.set_index(['a', 'b', 'c'], inplace=True)
+ tm.assert_frame_equal(df, expected)
+
+ # GH 3970
+ # these are chained assignments as well
+ pd.set_option('chained_assignment', None)
+ df = DataFrame({"aa": range(5), "bb": [2.2] * 5})
+ df["cc"] = 0.0
+
+ ck = [True] * len(df)
+
+ df["bb"].iloc[0] = .13
+
+ # TODO: unused
+ df_tmp = df.iloc[ck] # noqa
+
+ df["bb"].iloc[0] = .15
+ assert df['bb'].iloc[0] == 0.15
+ pd.set_option('chained_assignment', 'raise')
+
+ # GH 3217
+ df = DataFrame(dict(a=[1, 3], b=[np.nan, 2]))
+ df['c'] = np.nan
+ df['c'].update(pd.Series(['foo'], index=[0]))
+
+ expected = DataFrame(dict(a=[1, 3], b=[np.nan, 2], c=['foo', np.nan]))
+ tm.assert_frame_equal(df, expected)
+
+
+def test_preserve_refs(test_data):
+ seq = test_data.ts[[5, 10, 15]]
+ seq[1] = np.NaN
+ assert not np.isnan(test_data.ts[10])
+
+
+def test_cast_on_putmask():
+ # GH 2746
+
+ # need to upcast
+ s = Series([1, 2], index=[1, 2], dtype='int64')
+ s[[True, False]] = Series([0], index=[1], dtype='int64')
+ expected = Series([0, 2], index=[1, 2], dtype='int64')
+
+ assert_series_equal(s, expected)
+
+
+def test_type_promote_putmask():
+ # GH8387: test that changing types does not break alignment
+ ts = Series(np.random.randn(100), index=np.arange(100, 0, -1)).round(5)
+ left, mask = ts.copy(), ts > 0
+ right = ts[mask].copy().map(str)
+ left[mask] = right
+ assert_series_equal(left, ts.map(lambda t: str(t) if t > 0 else t))
+
+ s = Series([0, 1, 2, 0])
+ mask = s > 0
+ s2 = s[mask].map(str)
+ s[mask] = s2
+ assert_series_equal(s, Series([0, '1', '2', 0]))
+
+ s = Series([0, 'foo', 'bar', 0])
+ mask = Series([False, True, True, False])
+ s2 = s[mask]
+ s[mask] = s2
+ assert_series_equal(s, Series([0, 'foo', 'bar', 0]))
+
+
+def test_multilevel_preserve_name():
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
+ 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ s = Series(np.random.randn(len(index)), index=index, name='sth')
+
+ result = s['foo']
+ result2 = s.loc['foo']
+ assert result.name == s.name
+ assert result2.name == s.name
+
+
+def test_setitem_scalar_into_readonly_backing_data():
+ # GH14359: test that you cannot mutate a read only buffer
+
+ array = np.zeros(5)
+ array.flags.writeable = False # make the array immutable
+ series = Series(array)
+
+ for n in range(len(series)):
+ msg = "assignment destination is read-only"
+ with pytest.raises(ValueError, match=msg):
+ series[n] = 1
+
+ assert array[n] == 0
+
+
+def test_setitem_slice_into_readonly_backing_data():
+ # GH14359: test that you cannot mutate a read only buffer
+
+ array = np.zeros(5)
+ array.flags.writeable = False # make the array immutable
+ series = Series(array)
+
+ msg = "assignment destination is read-only"
+ with pytest.raises(ValueError, match=msg):
+ series[1:3] = 1
+
+ assert not array.any()
+
+
+"""
+miscellaneous methods
+"""
+
+
+def test_select(test_data):
+ # deprecated: gh-12410
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ n = len(test_data.ts)
+ result = test_data.ts.select(lambda x: x >= test_data.ts.index[n // 2])
+ expected = test_data.ts.reindex(test_data.ts.index[n // 2:])
+ assert_series_equal(result, expected)
+
+ result = test_data.ts.select(lambda x: x.weekday() == 2)
+ expected = test_data.ts[test_data.ts.index.weekday == 2]
+ assert_series_equal(result, expected)
+
+
+def test_pop():
+ # GH 6600
+ df = DataFrame({'A': 0, 'B': np.arange(5, dtype='int64'), 'C': 0, })
+ k = df.iloc[4]
+
+ result = k.pop('B')
+ assert result == 4
+
+ expected = Series([0, 0], index=['A', 'C'], name=4)
+ assert_series_equal(k, expected)
+
+
+def test_take():
+ s = Series([-1, 5, 6, 2, 4])
+
+ actual = s.take([1, 3, 4])
+ expected = Series([5, 2, 4], index=[1, 3, 4])
+ tm.assert_series_equal(actual, expected)
+
+ actual = s.take([-1, 3, 4])
+ expected = Series([4, 2, 4], index=[4, 3, 4])
+ tm.assert_series_equal(actual, expected)
+
+ msg = "index {} is out of bounds for size 5"
+ with pytest.raises(IndexError, match=msg.format(10)):
+ s.take([1, 10])
+ with pytest.raises(IndexError, match=msg.format(5)):
+ s.take([2, 5])
+
+ with tm.assert_produces_warning(FutureWarning):
+ s.take([-1, 3, 4], convert=False)
+
+
+def test_take_categorical():
+ # https://github.com/pandas-dev/pandas/issues/20664
+ s = Series(pd.Categorical(['a', 'b', 'c']))
+ result = s.take([-2, -2, 0])
+ expected = Series(pd.Categorical(['b', 'b', 'a'],
+ categories=['a', 'b', 'c']),
+ index=[1, 1, 0])
+ assert_series_equal(result, expected)
+
+
+def test_head_tail(test_data):
+ assert_series_equal(test_data.series.head(), test_data.series[:5])
+ assert_series_equal(test_data.series.head(0), test_data.series[0:0])
+ assert_series_equal(test_data.series.tail(), test_data.series[-5:])
+ assert_series_equal(test_data.series.tail(0), test_data.series[0:0])
diff --git a/contrib/python/pandas/py2/pandas/tests/series/indexing/test_loc.py b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_loc.py
new file mode 100644
index 00000000000..8c1709ff016
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_loc.py
@@ -0,0 +1,168 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange
+
+import pandas as pd
+from pandas import Series, Timestamp
+from pandas.util.testing import assert_series_equal
+
+
[email protected]("val,expected", [
+ (2**63 - 1, 3),
+ (2**63, 4),
+])
+def test_loc_uint64(val, expected):
+ # see gh-19399
+ s = Series({2**63 - 1: 3, 2**63: 4})
+ assert s.loc[val] == expected
+
+
+def test_loc_getitem(test_data):
+ inds = test_data.series.index[[3, 4, 7]]
+ assert_series_equal(
+ test_data.series.loc[inds],
+ test_data.series.reindex(inds))
+ assert_series_equal(test_data.series.iloc[5::2], test_data.series[5::2])
+
+ # slice with indices
+ d1, d2 = test_data.ts.index[[5, 15]]
+ result = test_data.ts.loc[d1:d2]
+ expected = test_data.ts.truncate(d1, d2)
+ assert_series_equal(result, expected)
+
+ # boolean
+ mask = test_data.series > test_data.series.median()
+ assert_series_equal(test_data.series.loc[mask], test_data.series[mask])
+
+ # ask for index value
+ assert test_data.ts.loc[d1] == test_data.ts[d1]
+ assert test_data.ts.loc[d2] == test_data.ts[d2]
+
+
+def test_loc_getitem_not_monotonic(test_data):
+ d1, d2 = test_data.ts.index[[5, 15]]
+
+ ts2 = test_data.ts[::2][[1, 2, 0]]
+
+ msg = r"Timestamp\('2000-01-10 00:00:00'\)"
+ with pytest.raises(KeyError, match=msg):
+ ts2.loc[d1:d2]
+ with pytest.raises(KeyError, match=msg):
+ ts2.loc[d1:d2] = 0
+
+
+def test_loc_getitem_setitem_integer_slice_keyerrors():
+ s = Series(np.random.randn(10), index=lrange(0, 20, 2))
+
+ # this is OK
+ cp = s.copy()
+ cp.iloc[4:10] = 0
+ assert (cp.iloc[4:10] == 0).all()
+
+ # so is this
+ cp = s.copy()
+ cp.iloc[3:11] = 0
+ assert (cp.iloc[3:11] == 0).values.all()
+
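+ # positional iloc[2:6] and label-based loc[3:11] both pick labels 4, 6, 8 and 10 here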
+ result = s.iloc[2:6]
+ result2 = s.loc[3:11]
+ expected = s.reindex([4, 6, 8, 10])
+
+ assert_series_equal(result, expected)
+ assert_series_equal(result2, expected)
+
+ # non-monotonic, raise KeyError
+ s2 = s.iloc[lrange(5) + lrange(5, 10)[::-1]]
+ with pytest.raises(KeyError, match=r"^3L?$"):
+ s2.loc[3:11]
+ with pytest.raises(KeyError, match=r"^3L?$"):
+ s2.loc[3:11] = 0
+
+
+def test_loc_getitem_iterator(test_data):
+ idx = iter(test_data.series.index[:10])
+ result = test_data.series.loc[idx]
+ assert_series_equal(result, test_data.series[:10])
+
+
+def test_loc_setitem_boolean(test_data):
+ mask = test_data.series > test_data.series.median()
+
+ result = test_data.series.copy()
+ result.loc[mask] = 0
+ expected = test_data.series
+ expected[mask] = 0
+ assert_series_equal(result, expected)
+
+
+def test_loc_setitem_corner(test_data):
+ inds = list(test_data.series.index[[5, 8, 12]])
+ test_data.series.loc[inds] = 5
+ msg = r"\['foo'\] not in index"
+ with pytest.raises(KeyError, match=msg):
+ test_data.series.loc[inds + ['foo']] = 5
+
+
+def test_basic_setitem_with_labels(test_data):
+ indices = test_data.ts.index[[5, 10, 15]]
+
+ cp = test_data.ts.copy()
+ exp = test_data.ts.copy()
+ cp[indices] = 0
+ exp.loc[indices] = 0
+ assert_series_equal(cp, exp)
+
+ cp = test_data.ts.copy()
+ exp = test_data.ts.copy()
+ cp[indices[0]:indices[2]] = 0
+ exp.loc[indices[0]:indices[2]] = 0
+ assert_series_equal(cp, exp)
+
+ # integer indexes, be careful
+ s = Series(np.random.randn(10), index=lrange(0, 20, 2))
+ inds = [0, 4, 6]
+ arr_inds = np.array([0, 4, 6])
+
+ cp = s.copy()
+ exp = s.copy()
+ cp[inds] = 0
+ exp.loc[inds] = 0
+ assert_series_equal(cp, exp)
+
+ cp = s.copy()
+ exp = s.copy()
+ cp[arr_inds] = 0
+ exp.loc[arr_inds] = 0
+ assert_series_equal(cp, exp)
+
+ inds_notfound = [0, 4, 5, 6]
+ arr_inds_notfound = np.array([0, 4, 5, 6])
+ msg = r"\[5\] not contained in the index"
+ with pytest.raises(ValueError, match=msg):
+ s[inds_notfound] = 0
+ with pytest.raises(Exception, match=msg):
+ s[arr_inds_notfound] = 0
+
+ # GH12089
+ # with tz for values
+ s = Series(pd.date_range("2011-01-01", periods=3, tz="US/Eastern"),
+ index=['a', 'b', 'c'])
+ s2 = s.copy()
+ expected = Timestamp('2011-01-03', tz='US/Eastern')
+ s2.loc['a'] = expected
+ result = s2.loc['a']
+ assert result == expected
+
+ s2 = s.copy()
+ s2.iloc[0] = expected
+ result = s2.iloc[0]
+ assert result == expected
+
+ s2 = s.copy()
+ s2['a'] = expected
+ result = s2['a']
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/series/indexing/test_numeric.py b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_numeric.py
new file mode 100644
index 00000000000..e4afb0e4567
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/indexing/test_numeric.py
@@ -0,0 +1,259 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange, range
+
+import pandas as pd
+from pandas import DataFrame, Index, Series
+import pandas.util.testing as tm
+from pandas.util.testing import assert_series_equal
+
+
+def test_get():
+ # GH 6383
+ s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, 45,
+ 51, 39, 55, 43, 54, 52, 51, 54]))
+
+ result = s.get(25, 0)
+ expected = 0
+ assert result == expected
+
+ s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56,
+ 45, 51, 39, 55, 43, 54, 52, 51, 54]),
+ index=pd.Float64Index(
+ [25.0, 36.0, 49.0, 64.0, 81.0, 100.0,
+ 121.0, 144.0, 169.0, 196.0, 1225.0,
+ 1296.0, 1369.0, 1444.0, 1521.0, 1600.0,
+ 1681.0, 1764.0, 1849.0, 1936.0],
+ dtype='object'))
+
+ result = s.get(25, 0)
+ expected = 43
+ assert result == expected
+
+ # GH 7407
+ # with a boolean accessor
+ df = pd.DataFrame({'i': [0] * 3, 'b': [False] * 3})
+ vc = df.i.value_counts()
+ result = vc.get(99, default='Missing')
+ assert result == 'Missing'
+
+ vc = df.b.value_counts()
+ result = vc.get(False, default='Missing')
+ assert result == 3
+
+ result = vc.get(True, default='Missing')
+ assert result == 'Missing'
+
+
+def test_get_nan():
+ # GH 8569
+ s = pd.Float64Index(range(10)).to_series()
+ assert s.get(np.nan) is None
+ assert s.get(np.nan, default='Missing') == 'Missing'
+
+
+def test_get_nan_multiple():
+ # GH 8569
+ # ensure that fixing "test_get_nan" above hasn't broken get
+ # with multiple elements
+ s = pd.Float64Index(range(10)).to_series()
+
+ idx = [2, 30]
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ assert_series_equal(s.get(idx),
+ Series([2, np.nan], index=idx))
+
+ idx = [2, np.nan]
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ assert_series_equal(s.get(idx),
+ Series([2, np.nan], index=idx))
+
+ # GH 17295 - all missing keys
+ idx = [20, 30]
+ assert s.get(idx) is None
+
+ idx = [np.nan, np.nan]
+ assert s.get(idx) is None
+
+
+def test_delitem():
+ # GH 5542
+ # should delete the item inplace
+ s = Series(lrange(5))
+ del s[0]
+
+ expected = Series(lrange(1, 5), index=lrange(1, 5))
+ assert_series_equal(s, expected)
+
+ del s[1]
+ expected = Series(lrange(2, 5), index=lrange(2, 5))
+ assert_series_equal(s, expected)
+
+ # empty
+ s = Series()
+
+ with pytest.raises(KeyError, match=r"^0$"):
+ del s[0]
+
+ # only 1 left, del, add, del
+ s = Series(1)
+ del s[0]
+ assert_series_equal(s, Series(dtype='int64', index=Index(
+ [], dtype='int64')))
+ s[0] = 1
+ assert_series_equal(s, Series(1))
+ del s[0]
+ assert_series_equal(s, Series(dtype='int64', index=Index(
+ [], dtype='int64')))
+
+ # Index(dtype=object)
+ s = Series(1, index=['a'])
+ del s['a']
+ assert_series_equal(s, Series(dtype='int64', index=Index(
+ [], dtype='object')))
+ s['a'] = 1
+ assert_series_equal(s, Series(1, index=['a']))
+ del s['a']
+ assert_series_equal(s, Series(dtype='int64', index=Index(
+ [], dtype='object')))
+
+
+def test_slice_float64():
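+ # label-based slices on a float index include both endpoints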
+ values = np.arange(10., 50., 2)
+ index = Index(values)
+
+ start, end = values[[5, 15]]
+
+ s = Series(np.random.randn(20), index=index)
+
+ result = s[start:end]
+ expected = s.iloc[5:16]
+ assert_series_equal(result, expected)
+
+ result = s.loc[start:end]
+ assert_series_equal(result, expected)
+
+ df = DataFrame(np.random.randn(20, 3), index=index)
+
+ result = df[start:end]
+ expected = df.iloc[5:16]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.loc[start:end]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_getitem_negative_out_of_bounds():
+ s = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))
+
+ msg = "index out of bounds"
+ with pytest.raises(IndexError, match=msg):
+ s[-11]
+ msg = "index -11 is out of bounds for axis 0 with size 10"
+ with pytest.raises(IndexError, match=msg):
+ s[-11] = 'foo'
+
+
+def test_getitem_regression():
+ s = Series(lrange(5), index=lrange(5))
+ result = s[lrange(5)]
+ assert_series_equal(result, s)
+
+
+def test_getitem_setitem_slice_bug():
+ s = Series(lrange(10), lrange(10))
+ result = s[-12:]
+ assert_series_equal(result, s)
+
+ result = s[-7:]
+ assert_series_equal(result, s[3:])
+
+ result = s[:-12]
+ assert_series_equal(result, s[:0])
+
+ s = Series(lrange(10), lrange(10))
+ s[-12:] = 0
+ assert (s == 0).all()
+
+ s[:-12] = 5
+ assert (s == 0).all()
+
+
+def test_getitem_setitem_slice_integers():
+ s = Series(np.random.randn(8), index=[2, 4, 6, 8, 10, 12, 14, 16])
+
+ result = s[:4]
+ expected = s.reindex([2, 4, 6, 8])
+ assert_series_equal(result, expected)
+
+ s[:4] = 0
+ assert (s[:4] == 0).all()
+ assert not (s[4:] == 0).any()
+
+
+def test_setitem_float_labels():
+ # note labels are floats
+ s = Series(['a', 'b', 'c'], index=[0, 0.5, 1])
+ tmp = s.copy()
+
+ s.loc[1] = 'zoo'
+ tmp.iloc[2] = 'zoo'
+
+ assert_series_equal(s, tmp)
+
+
+def test_slice_float_get_set(test_data):
+ msg = (r"cannot do slice indexing on <class 'pandas\.core\.indexes"
+ r"\.datetimes\.DatetimeIndex'> with these indexers \[{key}\]"
+ r" of <(class|type) 'float'>")
+ with pytest.raises(TypeError, match=msg.format(key=r"4\.0")):
+ test_data.ts[4.0:10.0]
+
+ with pytest.raises(TypeError, match=msg.format(key=r"4\.0")):
+ test_data.ts[4.0:10.0] = 0
+
+ with pytest.raises(TypeError, match=msg.format(key=r"4\.5")):
+ test_data.ts[4.5:10.0]
+ with pytest.raises(TypeError, match=msg.format(key=r"4\.5")):
+ test_data.ts[4.5:10.0] = 0
+
+
+def test_slice_floats2():
+ s = Series(np.random.rand(10), index=np.arange(10, 20, dtype=float))
+
+ assert len(s.loc[12.0:]) == 8
+ assert len(s.loc[12.5:]) == 7
+
+ i = np.arange(10, 20, dtype=float)
+ i[2] = 12.2
+ s.index = i
+ assert len(s.loc[12.0:]) == 8
+ assert len(s.loc[12.5:]) == 7
+
+
+def test_int_indexing():
+ s = Series(np.random.randn(6), index=[0, 0, 1, 1, 2, 2])
+
+ with pytest.raises(KeyError, match=r"^5$"):
+ s[5]
+
+ with pytest.raises(KeyError, match=r"^'c'$"):
+ s['c']
+
+ # not monotonic
+ s = Series(np.random.randn(6), index=[2, 2, 0, 0, 1, 1])
+
+ with pytest.raises(KeyError, match=r"^5$"):
+ s[5]
+
+ with pytest.raises(KeyError, match=r"^'c'$"):
+ s['c']
+
+
+def test_getitem_int64(test_data):
+ idx = np.int64(5)
+ assert test_data.ts[idx] == test_data.ts[5]
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_alter_axes.py b/contrib/python/pandas/py2/pandas/tests/series/test_alter_axes.py
new file mode 100644
index 00000000000..73adc7d4bf8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_alter_axes.py
@@ -0,0 +1,347 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.compat import lrange, range, zip
+
+from pandas import DataFrame, Index, MultiIndex, RangeIndex, Series
+import pandas.util.testing as tm
+
+
+class TestSeriesAlterAxes(object):
+
+ def test_setindex(self, string_series):
+ # wrong type
+ msg = (r"Index\(\.\.\.\) must be called with a collection of some"
+ r" kind, None was passed")
+ with pytest.raises(TypeError, match=msg):
+ string_series.index = None
+
+ # wrong length
+ msg = ("Length mismatch: Expected axis has 30 elements, new"
+ " values have 29 elements")
+ with pytest.raises(ValueError, match=msg):
+ string_series.index = np.arange(len(string_series) - 1)
+
+ # works
+ string_series.index = np.arange(len(string_series))
+ assert isinstance(string_series.index, Index)
+
+ # Renaming
+
+ def test_rename(self, datetime_series):
+ ts = datetime_series
+ renamer = lambda x: x.strftime('%Y%m%d')
+ renamed = ts.rename(renamer)
+ assert renamed.index[0] == renamer(ts.index[0])
+
+ # dict
+ rename_dict = dict(zip(ts.index, renamed.index))
+ renamed2 = ts.rename(rename_dict)
+ tm.assert_series_equal(renamed, renamed2)
+
+ # partial dict
+ s = Series(np.arange(4), index=['a', 'b', 'c', 'd'], dtype='int64')
+ renamed = s.rename({'b': 'foo', 'd': 'bar'})
+ tm.assert_index_equal(renamed.index, Index(['a', 'foo', 'c', 'bar']))
+
+ # index with name
+ renamer = Series(np.arange(4),
+ index=Index(['a', 'b', 'c', 'd'], name='name'),
+ dtype='int64')
+ renamed = renamer.rename({})
+ assert renamed.index.name == renamer.index.name
+
+ def test_rename_by_series(self):
+ s = Series(range(5), name='foo')
+ renamer = Series({1: 10, 2: 20})
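+        # labels absent from the renamer (0, 3, 4) are left unchanged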
+ result = s.rename(renamer)
+ expected = Series(range(5), index=[0, 10, 20, 3, 4], name='foo')
+ tm.assert_series_equal(result, expected)
+
+ def test_rename_set_name(self):
+ s = Series(range(4), index=list('abcd'))
+ for name in ['foo', 123, 123., datetime(2001, 11, 11), ('foo',)]:
+ result = s.rename(name)
+ assert result.name == name
+ tm.assert_numpy_array_equal(result.index.values, s.index.values)
+ assert s.name is None
+
+ def test_rename_set_name_inplace(self):
+ s = Series(range(3), index=list('abc'))
+ for name in ['foo', 123, 123., datetime(2001, 11, 11), ('foo',)]:
+ s.rename(name, inplace=True)
+ assert s.name == name
+
+ exp = np.array(['a', 'b', 'c'], dtype=np.object_)
+ tm.assert_numpy_array_equal(s.index.values, exp)
+
+ def test_rename_axis_supported(self):
+ # Supporting axis for compatibility, detailed in GH-18589
+ s = Series(range(5))
+ s.rename({}, axis=0)
+ s.rename({}, axis='index')
+ with pytest.raises(ValueError, match='No axis named 5'):
+ s.rename({}, axis=5)
+
+ def test_set_name_attribute(self):
+ s = Series([1, 2, 3])
+ s2 = Series([1, 2, 3], name='bar')
+ for name in [7, 7., 'name', datetime(2001, 1, 1), (1,), u"\u05D0"]:
+ s.name = name
+ assert s.name == name
+ s2.name = name
+ assert s2.name == name
+
+ def test_set_name(self):
+ s = Series([1, 2, 3])
+ s2 = s._set_name('foo')
+ assert s2.name == 'foo'
+ assert s.name is None
+ assert s is not s2
+
+ def test_rename_inplace(self, datetime_series):
+ renamer = lambda x: x.strftime('%Y%m%d')
+ expected = renamer(datetime_series.index[0])
+
+ datetime_series.rename(renamer, inplace=True)
+ assert datetime_series.index[0] == expected
+
+ def test_set_index_makes_timeseries(self):
+ idx = tm.makeDateIndex(10)
+
+ s = Series(lrange(10))
+ s.index = idx
+ assert s.index.is_all_dates
+
+ def test_reset_index(self):
+ df = tm.makeDataFrame()[:5]
+ ser = df.stack()
+ ser.index.names = ['hash', 'category']
+
+ ser.name = 'value'
+ df = ser.reset_index()
+ assert 'value' in df
+
+ df = ser.reset_index(name='value2')
+ assert 'value2' in df
+
+ # check inplace
+ s = ser.reset_index(drop=True)
+ s2 = ser
+ s2.reset_index(drop=True, inplace=True)
+ tm.assert_series_equal(s, s2)
+
+ # level
+ index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
+ codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
+ [0, 1, 0, 1, 0, 1]])
+ s = Series(np.random.randn(6), index=index)
+ rs = s.reset_index(level=1)
+ assert len(rs.columns) == 2
+
+ rs = s.reset_index(level=[0, 2], drop=True)
+ tm.assert_index_equal(rs.index, Index(index.get_level_values(1)))
+ assert isinstance(rs, Series)
+
+ def test_reset_index_name(self):
+ s = Series([1, 2, 3], index=Index(range(3), name='x'))
+ assert s.reset_index().index.name is None
+ assert s.reset_index(drop=True).index.name is None
+
+ def test_reset_index_level(self):
+ df = DataFrame([[1, 2, 3], [4, 5, 6]],
+ columns=['A', 'B', 'C'])
+
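+        # exercise level selection both by name and by position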
+ for levels in ['A', 'B'], [0, 1]:
+ # With MultiIndex
+ s = df.set_index(['A', 'B'])['C']
+
+ result = s.reset_index(level=levels[0])
+ tm.assert_frame_equal(result, df.set_index('B'))
+
+ result = s.reset_index(level=levels[:1])
+ tm.assert_frame_equal(result, df.set_index('B'))
+
+ result = s.reset_index(level=levels)
+ tm.assert_frame_equal(result, df)
+
+ result = df.set_index(['A', 'B']).reset_index(level=levels,
+ drop=True)
+ tm.assert_frame_equal(result, df[['C']])
+
+ with pytest.raises(KeyError, match='Level E '):
+ s.reset_index(level=['A', 'E'])
+
+ # With single-level Index
+ s = df.set_index('A')['B']
+
+ result = s.reset_index(level=levels[0])
+ tm.assert_frame_equal(result, df[['A', 'B']])
+
+ result = s.reset_index(level=levels[:1])
+ tm.assert_frame_equal(result, df[['A', 'B']])
+
+ result = s.reset_index(level=levels[0], drop=True)
+ tm.assert_series_equal(result, df['B'])
+
+ with pytest.raises(IndexError, match='Too many levels'):
+ s.reset_index(level=[0, 1, 2])
+
+        # Check that .reset_index([], drop=True) doesn't fail
+ result = Series(range(4)).reset_index([], drop=True)
+ expected = Series(range(4))
+ tm.assert_series_equal(result, expected)
+
+ def test_reset_index_range(self):
+ # GH 12071
+ s = Series(range(2), name='A', dtype='int64')
+ series_result = s.reset_index()
+ assert isinstance(series_result.index, RangeIndex)
+ series_expected = DataFrame([[0, 0], [1, 1]],
+ columns=['index', 'A'],
+ index=RangeIndex(stop=2))
+ tm.assert_frame_equal(series_result, series_expected)
+
+ def test_reorder_levels(self):
+ index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
+ codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
+ [0, 1, 0, 1, 0, 1]],
+ names=['L0', 'L1', 'L2'])
+ s = Series(np.arange(6), index=index)
+
+ # no change, position
+ result = s.reorder_levels([0, 1, 2])
+ tm.assert_series_equal(s, result)
+
+ # no change, labels
+ result = s.reorder_levels(['L0', 'L1', 'L2'])
+ tm.assert_series_equal(s, result)
+
+ # rotate, position
+ result = s.reorder_levels([1, 2, 0])
+ e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']],
+ codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1],
+ [0, 0, 0, 0, 0, 0]],
+ names=['L1', 'L2', 'L0'])
+ expected = Series(np.arange(6), index=e_idx)
+ tm.assert_series_equal(result, expected)
+
+ def test_rename_axis_mapper(self):
+ # GH 19978
+ mi = MultiIndex.from_product([['a', 'b', 'c'], [1, 2]],
+ names=['ll', 'nn'])
+ s = Series([i for i in range(len(mi))], index=mi)
+
+ result = s.rename_axis(index={'ll': 'foo'})
+ assert result.index.names == ['foo', 'nn']
+
+ result = s.rename_axis(index=str.upper, axis=0)
+ assert result.index.names == ['LL', 'NN']
+
+ result = s.rename_axis(index=['foo', 'goo'])
+ assert result.index.names == ['foo', 'goo']
+
+ with pytest.raises(TypeError, match='unexpected'):
+ s.rename_axis(columns='wrong')
+
+ def test_rename_axis_inplace(self, datetime_series):
+ # GH 15704
+ expected = datetime_series.rename_axis('foo')
+ result = datetime_series
+ no_return = result.rename_axis('foo', inplace=True)
+
+ assert no_return is None
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('kwargs', [{'mapper': None}, {'index': None}, {}])
+ def test_rename_axis_none(self, kwargs):
+ # GH 25034
+ index = Index(list('abc'), name='foo')
+ df = Series([1, 2, 3], index=index)
+
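+        # an explicit None mapper removes the index name; no kwargs is a no-op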
+ result = df.rename_axis(**kwargs)
+ expected_index = index.rename(None) if kwargs else index
+ expected = Series([1, 2, 3], index=expected_index)
+ tm.assert_series_equal(result, expected)
+
+ def test_set_axis_inplace_axes(self, axis_series):
+ # GH14636
+ ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64')
+
+ expected = ser.copy()
+ expected.index = list('abcd')
+
+ # inplace=True
+        # The FutureWarning is emitted because inplace is expected to
+        # default to False in a future version
+ for inplace, warn in [(None, FutureWarning), (True, None)]:
+ result = ser.copy()
+ kwargs = {'inplace': inplace}
+ with tm.assert_produces_warning(warn):
+ result.set_axis(list('abcd'), axis=axis_series, **kwargs)
+ tm.assert_series_equal(result, expected)
+
+ def test_set_axis_inplace(self):
+ # GH14636
+
+ s = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64')
+
+ expected = s.copy()
+ expected.index = list('abcd')
+
+ # inplace=False
+ result = s.set_axis(list('abcd'), axis=0, inplace=False)
+ tm.assert_series_equal(expected, result)
+
+ # omitting the "axis" parameter
+ with tm.assert_produces_warning(None):
+ result = s.set_axis(list('abcd'), inplace=False)
+ tm.assert_series_equal(result, expected)
+
+ # wrong values for the "axis" parameter
+ for axis in [2, 'foo']:
+ with pytest.raises(ValueError, match='No axis named'):
+ s.set_axis(list('abcd'), axis=axis, inplace=False)
+
+ def test_set_axis_prior_to_deprecation_signature(self):
+ s = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64')
+
+ expected = s.copy()
+ expected.index = list('abcd')
+
+ for axis in [0, 'index']:
+ with tm.assert_produces_warning(FutureWarning):
+                result = s.set_axis(axis, list('abcd'), inplace=False)
+ tm.assert_series_equal(result, expected)
+
+ def test_reset_index_drop_errors(self):
+ # GH 20925
+
+ # KeyError raised for series index when passed level name is missing
+ s = Series(range(4))
+ with pytest.raises(KeyError, match='must be same as name'):
+ s.reset_index('wrong', drop=True)
+ with pytest.raises(KeyError, match='must be same as name'):
+ s.reset_index('wrong')
+
+ # KeyError raised for series when level to be dropped is missing
+ s = Series(range(4), index=MultiIndex.from_product([[1, 2]] * 2))
+ with pytest.raises(KeyError, match='not found'):
+ s.reset_index('wrong', drop=True)
+
+ def test_droplevel(self):
+ # GH20342
+ ser = Series([1, 2, 3, 4])
+ ser.index = MultiIndex.from_arrays([(1, 2, 3, 4), (5, 6, 7, 8)],
+ names=['a', 'b'])
+ expected = ser.reset_index('b', drop=True)
+ result = ser.droplevel('b', axis='index')
+ tm.assert_series_equal(result, expected)
+ # test that droplevel raises ValueError on axis != 0
+ with pytest.raises(ValueError):
+ ser.droplevel(1, axis='columns')
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_analytics.py b/contrib/python/pandas/py2/pandas/tests/series/test_analytics.py
new file mode 100644
index 00000000000..6811e370726
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_analytics.py
@@ -0,0 +1,1499 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from distutils.version import LooseVersion
+from itertools import product
+import operator
+
+import numpy as np
+from numpy import nan
+import pytest
+
+from pandas.compat import PY35, lrange, range
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+ Categorical, CategoricalIndex, DataFrame, Series, compat, date_range, isna,
+ notna)
+from pandas.api.types import is_scalar
+from pandas.core.index import MultiIndex
+from pandas.core.indexes.datetimes import Timestamp
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_index_equal,
+ assert_series_equal)
+
+
+class TestSeriesAnalytics(object):
+
+ def test_describe(self):
+ s = Series([0, 1, 2, 3, 4], name='int_data')
+ result = s.describe()
+ expected = Series([5, 2, s.std(), 0, 1, 2, 3, 4],
+ name='int_data',
+ index=['count', 'mean', 'std', 'min', '25%',
+ '50%', '75%', 'max'])
+ tm.assert_series_equal(result, expected)
+
+ s = Series([True, True, False, False, False], name='bool_data')
+ result = s.describe()
+ expected = Series([5, 2, False, 3], name='bool_data',
+ index=['count', 'unique', 'top', 'freq'])
+ tm.assert_series_equal(result, expected)
+
+ s = Series(['a', 'a', 'b', 'c', 'd'], name='str_data')
+ result = s.describe()
+ expected = Series([5, 4, 'a', 2], name='str_data',
+ index=['count', 'unique', 'top', 'freq'])
+ tm.assert_series_equal(result, expected)
+
+ def test_describe_with_tz(self, tz_naive_fixture):
+ # GH 21332
+ tz = tz_naive_fixture
+ name = str(tz_naive_fixture)
+ start = Timestamp(2018, 1, 1)
+ end = Timestamp(2018, 1, 5)
+ s = Series(date_range(start, end, tz=tz), name=name)
+ result = s.describe()
+ expected = Series(
+ [5, 5, s.value_counts().index[0], 1, start.tz_localize(tz),
+ end.tz_localize(tz)
+ ],
+ name=name,
+ index=['count', 'unique', 'top', 'freq', 'first', 'last']
+ )
+ tm.assert_series_equal(result, expected)
+
+ def test_argsort(self, datetime_series):
+ self._check_accum_op('argsort', datetime_series, check_dtype=False)
+ argsorted = datetime_series.argsort()
+ assert issubclass(argsorted.dtype.type, np.integer)
+
+ # GH 2967 (introduced bug in 0.11-dev I think)
+ s = Series([Timestamp('201301%02d' % (i + 1)) for i in range(5)])
+ assert s.dtype == 'datetime64[ns]'
+ shifted = s.shift(-1)
+ assert shifted.dtype == 'datetime64[ns]'
+ assert isna(shifted[4])
+
+ result = s.argsort()
+ expected = Series(lrange(5), dtype='int64')
+ assert_series_equal(result, expected)
+
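+        # the trailing NaT is assigned -1 by argsort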
+ result = shifted.argsort()
+ expected = Series(lrange(4) + [-1], dtype='int64')
+ assert_series_equal(result, expected)
+
+ def test_argsort_stable(self):
+ s = Series(np.random.randint(0, 100, size=10000))
+ mindexer = s.argsort(kind='mergesort')
+ qindexer = s.argsort()
+
+ mexpected = np.argsort(s.values, kind='mergesort')
+ qexpected = np.argsort(s.values, kind='quicksort')
+
+ tm.assert_series_equal(mindexer, Series(mexpected),
+ check_dtype=False)
+ tm.assert_series_equal(qindexer, Series(qexpected),
+ check_dtype=False)
+ msg = (r"ndarray Expected type <(class|type) 'numpy\.ndarray'>,"
+ r" found <class 'pandas\.core\.series\.Series'> instead")
+ with pytest.raises(AssertionError, match=msg):
+ tm.assert_numpy_array_equal(qindexer, mindexer)
+
+ def test_cumsum(self, datetime_series):
+ self._check_accum_op('cumsum', datetime_series)
+
+ def test_cumprod(self, datetime_series):
+ self._check_accum_op('cumprod', datetime_series)
+
+ def test_cummin(self, datetime_series):
+ tm.assert_numpy_array_equal(datetime_series.cummin().values,
+ np.minimum
+ .accumulate(np.array(datetime_series)))
+ ts = datetime_series.copy()
+ ts[::2] = np.NaN
+ result = ts.cummin()[1::2]
+ expected = np.minimum.accumulate(ts.dropna())
+
+ tm.assert_series_equal(result, expected)
+
+ def test_cummax(self, datetime_series):
+ tm.assert_numpy_array_equal(datetime_series.cummax().values,
+ np.maximum
+ .accumulate(np.array(datetime_series)))
+ ts = datetime_series.copy()
+ ts[::2] = np.NaN
+ result = ts.cummax()[1::2]
+ expected = np.maximum.accumulate(ts.dropna())
+
+ tm.assert_series_equal(result, expected)
+
+ def test_cummin_datetime64(self):
+ s = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', '2000-1-1',
+ 'NaT', '2000-1-3']))
+
+ expected = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT',
+ '2000-1-1', 'NaT', '2000-1-1']))
+ result = s.cummin(skipna=True)
+ tm.assert_series_equal(expected, result)
+
+ expected = pd.Series(pd.to_datetime(
+ ['NaT', '2000-1-2', '2000-1-2', '2000-1-1', '2000-1-1', '2000-1-1'
+ ]))
+ result = s.cummin(skipna=False)
+ tm.assert_series_equal(expected, result)
+
+ def test_cummax_datetime64(self):
+ s = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', '2000-1-1',
+ 'NaT', '2000-1-3']))
+
+ expected = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT',
+ '2000-1-2', 'NaT', '2000-1-3']))
+ result = s.cummax(skipna=True)
+ tm.assert_series_equal(expected, result)
+
+ expected = pd.Series(pd.to_datetime(
+ ['NaT', '2000-1-2', '2000-1-2', '2000-1-2', '2000-1-2', '2000-1-3'
+ ]))
+ result = s.cummax(skipna=False)
+ tm.assert_series_equal(expected, result)
+
+ def test_cummin_timedelta64(self):
+ s = pd.Series(pd.to_timedelta(['NaT',
+ '2 min',
+ 'NaT',
+ '1 min',
+ 'NaT',
+ '3 min', ]))
+
+ expected = pd.Series(pd.to_timedelta(['NaT',
+ '2 min',
+ 'NaT',
+ '1 min',
+ 'NaT',
+ '1 min', ]))
+ result = s.cummin(skipna=True)
+ tm.assert_series_equal(expected, result)
+
+ expected = pd.Series(pd.to_timedelta(['NaT',
+ '2 min',
+ '2 min',
+ '1 min',
+ '1 min',
+ '1 min', ]))
+ result = s.cummin(skipna=False)
+ tm.assert_series_equal(expected, result)
+
+ def test_cummax_timedelta64(self):
+ s = pd.Series(pd.to_timedelta(['NaT',
+ '2 min',
+ 'NaT',
+ '1 min',
+ 'NaT',
+ '3 min', ]))
+
+ expected = pd.Series(pd.to_timedelta(['NaT',
+ '2 min',
+ 'NaT',
+ '2 min',
+ 'NaT',
+ '3 min', ]))
+ result = s.cummax(skipna=True)
+ tm.assert_series_equal(expected, result)
+
+ expected = pd.Series(pd.to_timedelta(['NaT',
+ '2 min',
+ '2 min',
+ '2 min',
+ '2 min',
+ '3 min', ]))
+ result = s.cummax(skipna=False)
+ tm.assert_series_equal(expected, result)
+
+ def test_npdiff(self):
+ pytest.skip("skipping due to Series no longer being an "
+ "ndarray")
+
+        # no longer works as the return type of np.diff is now np.ndarray
+ s = Series(np.arange(5))
+
+ r = np.diff(s)
+ assert_series_equal(Series([nan, 0, 0, 0, nan]), r)
+
+ def _check_accum_op(self, name, datetime_series_, check_dtype=True):
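+        # the numpy function applied to the Series should match applying it
+        # to the underlying ndarray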
+ func = getattr(np, name)
+ tm.assert_numpy_array_equal(func(datetime_series_).values,
+ func(np.array(datetime_series_)),
+ check_dtype=check_dtype)
+
+ # with missing values
+ ts = datetime_series_.copy()
+ ts[::2] = np.NaN
+
+ result = func(ts)[1::2]
+ expected = func(np.array(ts.dropna()))
+
+ tm.assert_numpy_array_equal(result.values, expected,
+ check_dtype=False)
+
+ def test_compress(self):
+ cond = [True, False, True, False, False]
+ s = Series([1, -1, 5, 8, 7],
+ index=list('abcde'), name='foo')
+ expected = Series(s.values.compress(cond),
+ index=list('ac'), name='foo')
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.compress(cond)
+ tm.assert_series_equal(result, expected)
+
+ def test_numpy_compress(self):
+ cond = [True, False, True, False, False]
+ s = Series([1, -1, 5, 8, 7],
+ index=list('abcde'), name='foo')
+ expected = Series(s.values.compress(cond),
+ index=list('ac'), name='foo')
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ tm.assert_series_equal(np.compress(cond, s), expected)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ msg = "the 'axis' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.compress(cond, s, axis=1)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.compress(cond, s, out=s)
+
+ def test_round(self, datetime_series):
+ datetime_series.index.name = "index_name"
+ result = datetime_series.round(2)
+ expected = Series(np.round(datetime_series.values, 2),
+ index=datetime_series.index, name='ts')
+ assert_series_equal(result, expected)
+ assert result.name == datetime_series.name
+
+ def test_numpy_round(self):
+ # See gh-12600
+ s = Series([1.53, 1.36, 0.06])
+ out = np.round(s, decimals=0)
+ expected = Series([2., 1., 0.])
+ assert_series_equal(out, expected)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.round(s, decimals=0, out=s)
+
+ def test_built_in_round(self):
+ if not compat.PY3:
+ pytest.skip(
+                'built-in round cannot be overridden prior to Python 3')
+
+ s = Series([1.123, 2.123, 3.123], index=lrange(3))
+ result = round(s)
+ expected_rounded0 = Series([1., 2., 3.], index=lrange(3))
+ tm.assert_series_equal(result, expected_rounded0)
+
+ decimals = 2
+ expected_rounded = Series([1.12, 2.12, 3.12], index=lrange(3))
+ result = round(s, decimals)
+ tm.assert_series_equal(result, expected_rounded)
+
+ def test_prod_numpy16_bug(self):
+ s = Series([1., 1., 1.], index=lrange(3))
+ result = s.prod()
+
+ assert not isinstance(result, Series)
+
+ @td.skip_if_no_scipy
+ def test_corr(self, datetime_series):
+ import scipy.stats as stats
+
+ # full overlap
+ tm.assert_almost_equal(datetime_series.corr(datetime_series), 1)
+
+ # partial overlap
+ tm.assert_almost_equal(datetime_series[:15].corr(datetime_series[5:]),
+ 1)
+
+ assert isna(datetime_series[:15].corr(datetime_series[5:],
+ min_periods=12))
+
+ ts1 = datetime_series[:15].reindex(datetime_series.index)
+ ts2 = datetime_series[5:].reindex(datetime_series.index)
+ assert isna(ts1.corr(ts2, min_periods=12))
+
+ # No overlap
+ assert np.isnan(datetime_series[::2].corr(datetime_series[1::2]))
+
+ # all NA
+ cp = datetime_series[:10].copy()
+ cp[:] = np.nan
+ assert isna(cp.corr(cp))
+
+ A = tm.makeTimeSeries()
+ B = tm.makeTimeSeries()
+ result = A.corr(B)
+ expected, _ = stats.pearsonr(A, B)
+ tm.assert_almost_equal(result, expected)
+
+ @td.skip_if_no_scipy
+ def test_corr_rank(self):
+ import scipy
+ import scipy.stats as stats
+
+ # kendall and spearman
+ A = tm.makeTimeSeries()
+ B = tm.makeTimeSeries()
+ A[-5:] = A[:5]
+ result = A.corr(B, method='kendall')
+ expected = stats.kendalltau(A, B)[0]
+ tm.assert_almost_equal(result, expected)
+
+ result = A.corr(B, method='spearman')
+ expected = stats.spearmanr(A, B)[0]
+ tm.assert_almost_equal(result, expected)
+
+ # these methods got rewritten in 0.8
+ if LooseVersion(scipy.__version__) < LooseVersion('0.9'):
+ pytest.skip("skipping corr rank because of scipy version "
+ "{0}".format(scipy.__version__))
+
+ # results from R
+ A = Series(
+ [-0.89926396, 0.94209606, -1.03289164, -0.95445587, 0.76910310, -
+ 0.06430576, -2.09704447, 0.40660407, -0.89926396, 0.94209606])
+ B = Series(
+ [-1.01270225, -0.62210117, -1.56895827, 0.59592943, -0.01680292,
+ 1.17258718, -1.06009347, -0.10222060, -0.89076239, 0.89372375])
+ kexp = 0.4319297
+ sexp = 0.5853767
+ tm.assert_almost_equal(A.corr(B, method='kendall'), kexp)
+ tm.assert_almost_equal(A.corr(B, method='spearman'), sexp)
+
+ def test_corr_invalid_method(self):
+ # GH PR #22298
+ s1 = pd.Series(np.random.randn(10))
+ s2 = pd.Series(np.random.randn(10))
+ msg = ("method must be either 'pearson', 'spearman', "
+ "or 'kendall'")
+ with pytest.raises(ValueError, match=msg):
+ s1.corr(s2, method="____")
+
+ def test_corr_callable_method(self, datetime_series):
+ # simple correlation example
+ # returns 1 if exact equality, 0 otherwise
+ my_corr = lambda a, b: 1. if (a == b).all() else 0.
+
+ # simple example
+ s1 = Series([1, 2, 3, 4, 5])
+ s2 = Series([5, 4, 3, 2, 1])
+ expected = 0
+ tm.assert_almost_equal(
+ s1.corr(s2, method=my_corr),
+ expected)
+
+ # full overlap
+ tm.assert_almost_equal(datetime_series.corr(
+ datetime_series, method=my_corr), 1.)
+
+ # partial overlap
+ tm.assert_almost_equal(datetime_series[:15].corr(
+ datetime_series[5:], method=my_corr), 1.)
+
+ # No overlap
+ assert np.isnan(datetime_series[::2].corr(
+ datetime_series[1::2], method=my_corr))
+
+ # dataframe example
+ df = pd.DataFrame([s1, s2])
+ expected = pd.DataFrame([
+ {0: 1., 1: 0}, {0: 0, 1: 1.}])
+ tm.assert_almost_equal(
+ df.transpose().corr(method=my_corr), expected)
+
+ def test_cov(self, datetime_series):
+ # full overlap
+ tm.assert_almost_equal(datetime_series.cov(datetime_series),
+ datetime_series.std() ** 2)
+
+ # partial overlap
+ tm.assert_almost_equal(datetime_series[:15].cov(datetime_series[5:]),
+ datetime_series[5:15].std() ** 2)
+
+ # No overlap
+ assert np.isnan(datetime_series[::2].cov(datetime_series[1::2]))
+
+ # all NA
+ cp = datetime_series[:10].copy()
+ cp[:] = np.nan
+ assert isna(cp.cov(cp))
+
+ # min_periods
+ assert isna(datetime_series[:15].cov(datetime_series[5:],
+ min_periods=12))
+
+ ts1 = datetime_series[:15].reindex(datetime_series.index)
+ ts2 = datetime_series[5:].reindex(datetime_series.index)
+ assert isna(ts1.cov(ts2, min_periods=12))
+
+ def test_count(self, datetime_series):
+ assert datetime_series.count() == len(datetime_series)
+
+ datetime_series[::2] = np.NaN
+
+ assert datetime_series.count() == np.isfinite(datetime_series).sum()
+
+ mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]])
+ ts = Series(np.arange(len(mi)), index=mi)
+
+ left = ts.count(level=1)
+ right = Series([2, 3, 1], index=[1, 2, nan])
+ assert_series_equal(left, right)
+
+ ts.iloc[[0, 3, 5]] = nan
+ assert_series_equal(ts.count(level=1), right - 1)
+
+ def test_dot(self):
+ a = Series(np.random.randn(4), index=['p', 'q', 'r', 's'])
+ b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'],
+ columns=['p', 'q', 'r', 's']).T
+
+ result = a.dot(b)
+ expected = Series(np.dot(a.values, b.values), index=['1', '2', '3'])
+ assert_series_equal(result, expected)
+
+ # Check index alignment
+ b2 = b.reindex(index=reversed(b.index))
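+        # Series.dot aligns on the index, so reversed rows give the same result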
+        result = a.dot(b2)
+ assert_series_equal(result, expected)
+
+ # Check ndarray argument
+ result = a.dot(b.values)
+ assert np.all(result == expected.values)
+ assert_almost_equal(a.dot(b['2'].values), expected['2'])
+
+ # Check series argument
+ assert_almost_equal(a.dot(b['1']), expected['1'])
+ assert_almost_equal(a.dot(b2['1']), expected['1'])
+
+ msg = r"Dot product shape mismatch, \(4L?,\) vs \(3L?,\)"
+ # exception raised is of type Exception
+ with pytest.raises(Exception, match=msg):
+ a.dot(a.values[:3])
+ msg = "matrices are not aligned"
+ with pytest.raises(ValueError, match=msg):
+ a.dot(b.T)
+
+ @pytest.mark.skipif(not PY35,
+ reason='matmul supported for Python>=3.5')
+ def test_matmul(self):
+ # matmul test is for GH #10259
+ a = Series(np.random.randn(4), index=['p', 'q', 'r', 's'])
+ b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'],
+ columns=['p', 'q', 'r', 's']).T
+
+ # Series @ DataFrame
+ result = operator.matmul(a, b)
+ expected = Series(np.dot(a.values, b.values), index=['1', '2', '3'])
+ assert_series_equal(result, expected)
+
+ # DataFrame @ Series
+ result = operator.matmul(b.T, a)
+ expected = Series(np.dot(b.T.values, a.T.values),
+ index=['1', '2', '3'])
+ assert_series_equal(result, expected)
+
+ # Series @ Series
+ result = operator.matmul(a, a)
+ expected = np.dot(a.values, a.values)
+ assert_almost_equal(result, expected)
+
+ # GH 21530
+ # vector (1D np.array) @ Series (__rmatmul__)
+ result = operator.matmul(a.values, a)
+ expected = np.dot(a.values, a.values)
+ assert_almost_equal(result, expected)
+
+ # GH 21530
+ # vector (1D list) @ Series (__rmatmul__)
+ result = operator.matmul(a.values.tolist(), a)
+ expected = np.dot(a.values, a.values)
+ assert_almost_equal(result, expected)
+
+ # GH 21530
+ # matrix (2D np.array) @ Series (__rmatmul__)
+ result = operator.matmul(b.T.values, a)
+ expected = np.dot(b.T.values, a.values)
+ assert_almost_equal(result, expected)
+
+ # GH 21530
+ # matrix (2D nested lists) @ Series (__rmatmul__)
+ result = operator.matmul(b.T.values.tolist(), a)
+ expected = np.dot(b.T.values, a.values)
+ assert_almost_equal(result, expected)
+
+ # mixed dtype DataFrame @ Series
+ a['p'] = int(a.p)
+ result = operator.matmul(b.T, a)
+ expected = Series(np.dot(b.T.values, a.T.values),
+ index=['1', '2', '3'])
+ assert_series_equal(result, expected)
+
+ # different dtypes DataFrame @ Series
+ a = a.astype(int)
+ result = operator.matmul(b.T, a)
+ expected = Series(np.dot(b.T.values, a.T.values),
+ index=['1', '2', '3'])
+ assert_series_equal(result, expected)
+
+ msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)"
+ # exception raised is of type Exception
+ with pytest.raises(Exception, match=msg):
+ a.dot(a.values[:3])
+ msg = "matrices are not aligned"
+ with pytest.raises(ValueError, match=msg):
+ a.dot(b.T)
+
+ def test_clip(self, datetime_series):
+ val = datetime_series.median()
+
+ with tm.assert_produces_warning(FutureWarning):
+ assert datetime_series.clip_lower(val).min() == val
+ with tm.assert_produces_warning(FutureWarning):
+ assert datetime_series.clip_upper(val).max() == val
+
+ assert datetime_series.clip(lower=val).min() == val
+ assert datetime_series.clip(upper=val).max() == val
+
+ result = datetime_series.clip(-0.5, 0.5)
+ expected = np.clip(datetime_series, -0.5, 0.5)
+ assert_series_equal(result, expected)
+ assert isinstance(expected, Series)
+
+ def test_clip_types_and_nulls(self):
+
+ sers = [Series([np.nan, 1.0, 2.0, 3.0]), Series([None, 'a', 'b', 'c']),
+ Series(pd.to_datetime(
+ [np.nan, 1, 2, 3], unit='D'))]
+
+ for s in sers:
+ thresh = s[2]
+ with tm.assert_produces_warning(FutureWarning):
+ lower = s.clip_lower(thresh)
+ with tm.assert_produces_warning(FutureWarning):
+ upper = s.clip_upper(thresh)
+ assert lower[notna(lower)].min() == thresh
+ assert upper[notna(upper)].max() == thresh
+ assert list(isna(s)) == list(isna(lower))
+ assert list(isna(s)) == list(isna(upper))
+
+ def test_clip_with_na_args(self):
+ """Should process np.nan argument as None """
+ # GH # 17276
+ s = Series([1, 2, 3])
+
+ assert_series_equal(s.clip(np.nan), Series([1, 2, 3]))
+ assert_series_equal(s.clip(upper=np.nan, lower=np.nan),
+ Series([1, 2, 3]))
+
+ # GH #19992
+ assert_series_equal(s.clip(lower=[0, 4, np.nan]),
+ Series([1, 4, np.nan]))
+ assert_series_equal(s.clip(upper=[1, np.nan, 1]),
+ Series([1, np.nan, 1]))
+
+ def test_clip_against_series(self):
+ # GH #6966
+
+ s = Series([1.0, 1.0, 4.0])
+ threshold = Series([1.0, 2.0, 3.0])
+
+ with tm.assert_produces_warning(FutureWarning):
+ assert_series_equal(s.clip_lower(threshold),
+ Series([1.0, 2.0, 4.0]))
+ with tm.assert_produces_warning(FutureWarning):
+ assert_series_equal(s.clip_upper(threshold),
+ Series([1.0, 1.0, 3.0]))
+
+ lower = Series([1.0, 2.0, 3.0])
+ upper = Series([1.5, 2.5, 3.5])
+
+ assert_series_equal(s.clip(lower, upper), Series([1.0, 2.0, 3.5]))
+ assert_series_equal(s.clip(1.5, upper), Series([1.5, 1.5, 3.5]))
+
+ @pytest.mark.parametrize("inplace", [True, False])
+ @pytest.mark.parametrize("upper", [[1, 2, 3], np.asarray([1, 2, 3])])
+ def test_clip_against_list_like(self, inplace, upper):
+ # GH #15390
+ original = pd.Series([5, 6, 7])
+ result = original.clip(upper=upper, inplace=inplace)
+ expected = pd.Series([1, 2, 3])
+
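+        # clip(..., inplace=True) returns None, so check the mutated original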
+ if inplace:
+ result = original
+ tm.assert_series_equal(result, expected, check_exact=True)
+
+ def test_clip_with_datetimes(self):
+
+ # GH 11838
+ # naive and tz-aware datetimes
+
+ t = Timestamp('2015-12-01 09:30:30')
+ s = Series([Timestamp('2015-12-01 09:30:00'),
+ Timestamp('2015-12-01 09:31:00')])
+ result = s.clip(upper=t)
+ expected = Series([Timestamp('2015-12-01 09:30:00'),
+ Timestamp('2015-12-01 09:30:30')])
+ assert_series_equal(result, expected)
+
+ t = Timestamp('2015-12-01 09:30:30', tz='US/Eastern')
+ s = Series([Timestamp('2015-12-01 09:30:00', tz='US/Eastern'),
+ Timestamp('2015-12-01 09:31:00', tz='US/Eastern')])
+ result = s.clip(upper=t)
+ expected = Series([Timestamp('2015-12-01 09:30:00', tz='US/Eastern'),
+ Timestamp('2015-12-01 09:30:30', tz='US/Eastern')])
+ assert_series_equal(result, expected)
+
+ def test_cummethods_bool(self):
+ # GH 6270
+
+ a = pd.Series([False, False, False, True, True, False, False])
+ b = ~a
+ c = pd.Series([False] * len(b))
+ d = ~c
+ methods = {'cumsum': np.cumsum,
+ 'cumprod': np.cumprod,
+ 'cummin': np.minimum.accumulate,
+ 'cummax': np.maximum.accumulate}
+ args = product((a, b, c, d), methods)
+ for s, method in args:
+ expected = Series(methods[method](s.values))
+ result = getattr(s, method)()
+ assert_series_equal(result, expected)
+
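+        # with NaN present the results upcast and the NaN propagates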
+ e = pd.Series([False, True, nan, False])
+ cse = pd.Series([0, 1, nan, 1], dtype=object)
+ cpe = pd.Series([False, 0, nan, 0])
+ cmin = pd.Series([False, False, nan, False])
+ cmax = pd.Series([False, True, nan, True])
+ expecteds = {'cumsum': cse,
+ 'cumprod': cpe,
+ 'cummin': cmin,
+ 'cummax': cmax}
+
+ for method in methods:
+ res = getattr(e, method)()
+ assert_series_equal(res, expecteds[method])
+
+ def test_isin(self):
+ s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C'])
+
+ result = s.isin(['A', 'C'])
+ expected = Series([True, False, True, False, False, False, True, True])
+ assert_series_equal(result, expected)
+
+ # GH: 16012
+        # Reproducing this issue requires a Series longer than 1e6 elements
+        # and a comparison array (in_list) large enough that numpy does not
+        # fall back to its manual masking trick, which would avoid the issue
+ s = Series(list('abcdefghijk' * 10 ** 5))
+        # Without that manual comparison/mask, these unorderable mixed
+        # types are what trigger the exception inside numpy
+ in_list = [-1, 'a', 'b', 'G', 'Y', 'Z', 'E',
+ 'K', 'E', 'S', 'I', 'R', 'R'] * 6
+
+ assert s.isin(in_list).sum() == 200000
+
+ def test_isin_with_string_scalar(self):
+ # GH4763
+ s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C'])
+ msg = (r"only list-like objects are allowed to be passed to isin\(\),"
+ r" you passed a \[str\]")
+ with pytest.raises(TypeError, match=msg):
+ s.isin('a')
+
+ s = Series(['aaa', 'b', 'c'])
+ with pytest.raises(TypeError, match=msg):
+ s.isin('aaa')
+
+ def test_isin_with_i8(self):
+ # GH 5021
+
+ expected = Series([True, True, False, False, False])
+ expected2 = Series([False, True, False, False, False])
+
+ # datetime64[ns]
+ s = Series(date_range('jan-01-2013', 'jan-05-2013'))
+
+ result = s.isin(s[0:2])
+ assert_series_equal(result, expected)
+
+ result = s.isin(s[0:2].values)
+ assert_series_equal(result, expected)
+
+ # fails on dtype conversion in the first place
+ result = s.isin(s[0:2].values.astype('datetime64[D]'))
+ assert_series_equal(result, expected)
+
+ result = s.isin([s[1]])
+ assert_series_equal(result, expected2)
+
+ result = s.isin([np.datetime64(s[1])])
+ assert_series_equal(result, expected2)
+
+ result = s.isin(set(s[0:2]))
+ assert_series_equal(result, expected)
+
+ # timedelta64[ns]
+ s = Series(pd.to_timedelta(lrange(5), unit='d'))
+ result = s.isin(s[0:2])
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("empty", [[], Series(), np.array([])])
+ def test_isin_empty(self, empty):
+ # see gh-16991
+ s = Series(["a", "b"])
+ expected = Series([False, False])
+
+ result = s.isin(empty)
+ tm.assert_series_equal(expected, result)
+
+ def test_ptp(self):
+ # GH21614
+ N = 1000
+ arr = np.random.randn(N)
+ ser = Series(arr)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ assert np.ptp(ser) == np.ptp(arr)
+
+ # GH11163
+ s = Series([3, 5, np.nan, -3, 10])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ assert s.ptp() == 13
+ assert pd.isna(s.ptp(skipna=False))
+
+ mi = pd.MultiIndex.from_product([['a', 'b'], [1, 2, 3]])
+ s = pd.Series([1, np.nan, 7, 3, 5, np.nan], index=mi)
+
+ expected = pd.Series([6, 2], index=['a', 'b'], dtype=np.float64)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ tm.assert_series_equal(s.ptp(level=0), expected)
+
+ expected = pd.Series([np.nan, np.nan], index=['a', 'b'])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ tm.assert_series_equal(s.ptp(level=0, skipna=False), expected)
+
+ msg = r"No axis named 1 for object type <(class|type) 'type'>"
+ with pytest.raises(ValueError, match=msg):
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ s.ptp(axis=1)
+
+ s = pd.Series(['a', 'b', 'c', 'd', 'e'])
+ msg = r"unsupported operand type\(s\) for -: 'str' and 'str'"
+ with pytest.raises(TypeError, match=msg):
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ s.ptp()
+
+ msg = r"Series\.ptp does not implement numeric_only\."
+ with pytest.raises(NotImplementedError, match=msg):
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ s.ptp(numeric_only=True)
+
+ def test_repeat(self):
+ s = Series(np.random.randn(3), index=['a', 'b', 'c'])
+
+ reps = s.repeat(5)
+ exp = Series(s.values.repeat(5), index=s.index.values.repeat(5))
+ assert_series_equal(reps, exp)
+
+ to_rep = [2, 3, 4]
+ reps = s.repeat(to_rep)
+ exp = Series(s.values.repeat(to_rep),
+ index=s.index.values.repeat(to_rep))
+ assert_series_equal(reps, exp)
+
+ def test_numpy_repeat(self):
+ s = Series(np.arange(3), name='x')
+ expected = Series(s.values.repeat(2), name='x',
+ index=s.index.values.repeat(2))
+ assert_series_equal(np.repeat(s, 2), expected)
+
+ msg = "the 'axis' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.repeat(s, 2, axis=0)
+
+ def test_searchsorted(self):
+ s = Series([1, 2, 3])
+
+ result = s.searchsorted(1, side='left')
+ assert is_scalar(result)
+ assert result == 0
+
+ result = s.searchsorted(1, side='right')
+ assert is_scalar(result)
+ assert result == 1
+
+ def test_searchsorted_numeric_dtypes_scalar(self):
+ s = Series([1, 2, 90, 1000, 3e9])
+ r = s.searchsorted(30)
+ assert is_scalar(r)
+ assert r == 2
+
+ r = s.searchsorted([30])
+ e = np.array([2], dtype=np.intp)
+ tm.assert_numpy_array_equal(r, e)
+
+ def test_searchsorted_numeric_dtypes_vector(self):
+ s = Series([1, 2, 90, 1000, 3e9])
+ r = s.searchsorted([91, 2e6])
+ e = np.array([3, 4], dtype=np.intp)
+ tm.assert_numpy_array_equal(r, e)
+
+ def test_search_sorted_datetime64_scalar(self):
+ s = Series(pd.date_range('20120101', periods=10, freq='2D'))
+ v = pd.Timestamp('20120102')
+ r = s.searchsorted(v)
+ assert is_scalar(r)
+ assert r == 1
+
+ def test_search_sorted_datetime64_list(self):
+ s = Series(pd.date_range('20120101', periods=10, freq='2D'))
+ v = [pd.Timestamp('20120102'), pd.Timestamp('20120104')]
+ r = s.searchsorted(v)
+ e = np.array([1, 2], dtype=np.intp)
+ tm.assert_numpy_array_equal(r, e)
+
+ def test_searchsorted_sorter(self):
+ # GH8490
+ s = Series([3, 1, 2])
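+        # sorter gives the indirect sort order, so the search is over [1, 2, 3]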
+ r = s.searchsorted([0, 3], sorter=np.argsort(s))
+ e = np.array([0, 2], dtype=np.intp)
+ tm.assert_numpy_array_equal(r, e)
+
+ def test_is_monotonic(self):
+
+ s = Series(np.random.randint(0, 10, size=1000))
+ assert not s.is_monotonic
+ s = Series(np.arange(1000))
+ assert s.is_monotonic is True
+ assert s.is_monotonic_increasing is True
+ s = Series(np.arange(1000, 0, -1))
+ assert s.is_monotonic_decreasing is True
+
+ s = Series(pd.date_range('20130101', periods=10))
+ assert s.is_monotonic is True
+ assert s.is_monotonic_increasing is True
+ s = Series(list(reversed(s.tolist())))
+ assert s.is_monotonic is False
+ assert s.is_monotonic_decreasing is True
+
+ def test_sort_index_level(self):
+ mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
+ s = Series([1, 2], mi)
+ backwards = s.iloc[[1, 0]]
+
+ res = s.sort_index(level='A')
+ assert_series_equal(backwards, res)
+
+ res = s.sort_index(level=['A', 'B'])
+ assert_series_equal(backwards, res)
+
+ res = s.sort_index(level='A', sort_remaining=False)
+ assert_series_equal(s, res)
+
+ res = s.sort_index(level=['A', 'B'], sort_remaining=False)
+ assert_series_equal(s, res)
+
+ def test_apply_categorical(self):
+ values = pd.Categorical(list('ABBABCD'), categories=list('DCBA'),
+ ordered=True)
+ s = pd.Series(values, name='XX', index=list('abcdefg'))
+ result = s.apply(lambda x: x.lower())
+
+        # the result should be categorical dtype when the number of
+        # categories is the same
+ values = pd.Categorical(list('abbabcd'), categories=list('dcba'),
+ ordered=True)
+ exp = pd.Series(values, name='XX', index=list('abcdefg'))
+ tm.assert_series_equal(result, exp)
+ tm.assert_categorical_equal(result.values, exp.values)
+
+ result = s.apply(lambda x: 'A')
+ exp = pd.Series(['A'] * 7, name='XX', index=list('abcdefg'))
+ tm.assert_series_equal(result, exp)
+ assert result.dtype == np.object
+
+ def test_shift_int(self, datetime_series):
+ ts = datetime_series.astype(int)
+ shifted = ts.shift(1)
+ expected = ts.astype(float).shift(1)
+ assert_series_equal(shifted, expected)
+
+ def test_shift_categorical(self):
+ # GH 9416
+ s = pd.Series(['a', 'b', 'c', 'd'], dtype='category')
+
+ assert_series_equal(s.iloc[:-1], s.shift(1).shift(-1).dropna())
+
+ sp1 = s.shift(1)
+ assert_index_equal(s.index, sp1.index)
+ assert np.all(sp1.values.codes[:1] == -1)
+ assert np.all(s.values.codes[:-1] == sp1.values.codes[1:])
+
+ sn2 = s.shift(-2)
+ assert_index_equal(s.index, sn2.index)
+ assert np.all(sn2.values.codes[-2:] == -1)
+ assert np.all(s.values.codes[2:] == sn2.values.codes[:-2])
+
+ assert_index_equal(s.values.categories, sp1.values.categories)
+ assert_index_equal(s.values.categories, sn2.values.categories)
+
+ def test_unstack(self):
+
+ index = MultiIndex(levels=[['bar', 'foo'], ['one', 'three', 'two']],
+ codes=[[1, 1, 0, 0], [0, 1, 0, 2]])
+
+ s = Series(np.arange(4.), index=index)
+ unstacked = s.unstack()
+
+ expected = DataFrame([[2., nan, 3.], [0., 1., nan]],
+ index=['bar', 'foo'],
+ columns=['one', 'three', 'two'])
+
+ assert_frame_equal(unstacked, expected)
+
+ unstacked = s.unstack(level=0)
+ assert_frame_equal(unstacked, expected.T)
+
+ index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
+ codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
+ [0, 1, 0, 1, 0, 1]])
+ s = Series(np.random.randn(6), index=index)
+ exp_index = MultiIndex(levels=[['one', 'two', 'three'], [0, 1]],
+ codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]])
+ expected = DataFrame({'bar': s.values},
+ index=exp_index).sort_index(level=0)
+ unstacked = s.unstack(0).sort_index()
+ assert_frame_equal(unstacked, expected)
+
+ # GH5873
+ idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]])
+ ts = pd.Series([1, 2], index=idx)
+ left = ts.unstack()
+ right = DataFrame([[nan, 1], [2, nan]], index=[101, 102],
+ columns=[nan, 3.5])
+ assert_frame_equal(left, right)
+
+ idx = pd.MultiIndex.from_arrays([['cat', 'cat', 'cat', 'dog', 'dog'
+ ], ['a', 'a', 'b', 'a', 'b'],
+ [1, 2, 1, 1, np.nan]])
+ ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx)
+ right = DataFrame([[1.0, 1.3], [1.1, nan], [nan, 1.4], [1.2, nan]],
+ columns=['cat', 'dog'])
+ tpls = [('a', 1), ('a', 2), ('b', nan), ('b', 1)]
+ right.index = pd.MultiIndex.from_tuples(tpls)
+ assert_frame_equal(ts.unstack(level=0), right)
+
+ def test_value_counts_datetime(self):
+ # most dtypes are tested in test_base.py
+ values = [pd.Timestamp('2011-01-01 09:00'),
+ pd.Timestamp('2011-01-01 10:00'),
+ pd.Timestamp('2011-01-01 11:00'),
+ pd.Timestamp('2011-01-01 09:00'),
+ pd.Timestamp('2011-01-01 09:00'),
+ pd.Timestamp('2011-01-01 11:00')]
+
+ exp_idx = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 11:00',
+ '2011-01-01 10:00'])
+ exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')
+
+ s = pd.Series(values, name='xxx')
+ tm.assert_series_equal(s.value_counts(), exp)
+ # check DatetimeIndex outputs the same result
+ idx = pd.DatetimeIndex(values, name='xxx')
+ tm.assert_series_equal(idx.value_counts(), exp)
+
+ # normalize
+ exp = pd.Series(np.array([3., 2., 1]) / 6.,
+ index=exp_idx, name='xxx')
+ tm.assert_series_equal(s.value_counts(normalize=True), exp)
+ tm.assert_series_equal(idx.value_counts(normalize=True), exp)
+
+ def test_value_counts_datetime_tz(self):
+ values = [pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'),
+ pd.Timestamp('2011-01-01 10:00', tz='US/Eastern'),
+ pd.Timestamp('2011-01-01 11:00', tz='US/Eastern'),
+ pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'),
+ pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'),
+ pd.Timestamp('2011-01-01 11:00', tz='US/Eastern')]
+
+ exp_idx = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 11:00',
+ '2011-01-01 10:00'], tz='US/Eastern')
+ exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')
+
+ s = pd.Series(values, name='xxx')
+ tm.assert_series_equal(s.value_counts(), exp)
+ idx = pd.DatetimeIndex(values, name='xxx')
+ tm.assert_series_equal(idx.value_counts(), exp)
+
+ exp = pd.Series(np.array([3., 2., 1]) / 6.,
+ index=exp_idx, name='xxx')
+ tm.assert_series_equal(s.value_counts(normalize=True), exp)
+ tm.assert_series_equal(idx.value_counts(normalize=True), exp)
+
+ def test_value_counts_period(self):
+ values = [pd.Period('2011-01', freq='M'),
+ pd.Period('2011-02', freq='M'),
+ pd.Period('2011-03', freq='M'),
+ pd.Period('2011-01', freq='M'),
+ pd.Period('2011-01', freq='M'),
+ pd.Period('2011-03', freq='M')]
+
+ exp_idx = pd.PeriodIndex(['2011-01', '2011-03', '2011-02'], freq='M')
+ exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')
+
+ s = pd.Series(values, name='xxx')
+ tm.assert_series_equal(s.value_counts(), exp)
+ # check DatetimeIndex outputs the same result
+ idx = pd.PeriodIndex(values, name='xxx')
+ tm.assert_series_equal(idx.value_counts(), exp)
+
+ # normalize
+ exp = pd.Series(np.array([3., 2., 1]) / 6.,
+ index=exp_idx, name='xxx')
+ tm.assert_series_equal(s.value_counts(normalize=True), exp)
+ tm.assert_series_equal(idx.value_counts(normalize=True), exp)
+
+ def test_value_counts_categorical_ordered(self):
+ # most dtypes are tested in test_base.py
+ values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True)
+
+ exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3],
+ ordered=True)
+ exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')
+
+ s = pd.Series(values, name='xxx')
+ tm.assert_series_equal(s.value_counts(), exp)
+ # check CategoricalIndex outputs the same result
+ idx = pd.CategoricalIndex(values, name='xxx')
+ tm.assert_series_equal(idx.value_counts(), exp)
+
+ # normalize
+ exp = pd.Series(np.array([3., 2., 1]) / 6.,
+ index=exp_idx, name='xxx')
+ tm.assert_series_equal(s.value_counts(normalize=True), exp)
+ tm.assert_series_equal(idx.value_counts(normalize=True), exp)
+
+ def test_value_counts_categorical_not_ordered(self):
+ values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False)
+
+ exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3],
+ ordered=False)
+ exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')
+
+ s = pd.Series(values, name='xxx')
+ tm.assert_series_equal(s.value_counts(), exp)
+ # check CategoricalIndex outputs the same result
+ idx = pd.CategoricalIndex(values, name='xxx')
+ tm.assert_series_equal(idx.value_counts(), exp)
+
+ # normalize
+ exp = pd.Series(np.array([3., 2., 1]) / 6.,
+ index=exp_idx, name='xxx')
+ tm.assert_series_equal(s.value_counts(normalize=True), exp)
+ tm.assert_series_equal(idx.value_counts(normalize=True), exp)
+
+ @pytest.mark.parametrize("func", [np.any, np.all])
+ @pytest.mark.parametrize("kwargs", [
+ dict(keepdims=True),
+ dict(out=object()),
+ ])
+ @td.skip_if_np_lt_115
+ def test_validate_any_all_out_keepdims_raises(self, kwargs, func):
+ s = pd.Series([1, 2])
+ param = list(kwargs)[0]
+ name = func.__name__
+
+ msg = (r"the '{arg}' parameter is not "
+ r"supported in the pandas "
+ r"implementation of {fname}\(\)").format(arg=param, fname=name)
+ with pytest.raises(ValueError, match=msg):
+ func(s, **kwargs)
+
+ @td.skip_if_np_lt_115
+ def test_validate_sum_initial(self):
+ s = pd.Series([1, 2])
+ msg = (r"the 'initial' parameter is not "
+ r"supported in the pandas "
+ r"implementation of sum\(\)")
+ with pytest.raises(ValueError, match=msg):
+ np.sum(s, initial=10)
+
+ def test_validate_median_initial(self):
+ s = pd.Series([1, 2])
+ msg = (r"the 'overwrite_input' parameter is not "
+ r"supported in the pandas "
+ r"implementation of median\(\)")
+ with pytest.raises(ValueError, match=msg):
+            # np.median does not dispatch to pandas, so we exercise the
+            # method instead of the numpy function
+ s.median(overwrite_input=True)
+
+ @td.skip_if_np_lt_115
+ def test_validate_stat_keepdims(self):
+ s = pd.Series([1, 2])
+ msg = (r"the 'keepdims' parameter is not "
+ r"supported in the pandas "
+ r"implementation of sum\(\)")
+ with pytest.raises(ValueError, match=msg):
+ np.sum(s, keepdims=True)
+
+
+main_dtypes = [
+ 'datetime',
+ 'datetimetz',
+ 'timedelta',
+ 'int8',
+ 'int16',
+ 'int32',
+ 'int64',
+ 'float32',
+ 'float64',
+ 'uint8',
+ 'uint16',
+ 'uint32',
+ 'uint64'
+]
+
+
+@pytest.fixture
+def s_main_dtypes():
+ """A DataFrame with many dtypes
+
+ * datetime
+ * datetimetz
+ * timedelta
+ * [u]int{8,16,32,64}
+ * float{32,64}
+
+ The columns are the name of the dtype.
+ """
+ df = pd.DataFrame(
+ {'datetime': pd.to_datetime(['2003', '2002',
+ '2001', '2002',
+ '2005']),
+ 'datetimetz': pd.to_datetime(
+ ['2003', '2002',
+ '2001', '2002',
+ '2005']).tz_localize('US/Eastern'),
+ 'timedelta': pd.to_timedelta(['3d', '2d', '1d',
+ '2d', '5d'])})
+
+ for dtype in ['int8', 'int16', 'int32', 'int64',
+ 'float32', 'float64',
+ 'uint8', 'uint16', 'uint32', 'uint64']:
+ df[dtype] = Series([3, 2, 1, 2, 5], dtype=dtype)
+
+ return df
+
+
+@pytest.fixture(params=main_dtypes)
+def s_main_dtypes_split(request, s_main_dtypes):
+ """Each series in s_main_dtypes."""
+ return s_main_dtypes[request.param]
+
+
+def assert_check_nselect_boundary(vals, dtype, method):
+ # helper function for 'test_boundary_{dtype}' tests
+ s = Series(vals, dtype=dtype)
+ result = getattr(s, method)(3)
+ expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1]
+ expected = s.loc[expected_idxr]
+ tm.assert_series_equal(result, expected)
+
+
+class TestNLargestNSmallest(object):
+
+ @pytest.mark.parametrize(
+ "r", [Series([3., 2, 1, 2, '5'], dtype='object'),
+ Series([3., 2, 1, 2, 5], dtype='object'),
+ # not supported on some archs
+ # Series([3., 2, 1, 2, 5], dtype='complex256'),
+ Series([3., 2, 1, 2, 5], dtype='complex128'),
+ Series(list('abcde')),
+ Series(list('abcde'), dtype='category')])
+ def test_error(self, r):
+ dt = r.dtype
+ msg = ("Cannot use method 'n(larg|small)est' with "
+ "dtype {dt}".format(dt=dt))
+ args = 2, len(r), 0, -1
+ methods = r.nlargest, r.nsmallest
+ for method, arg in product(methods, args):
+ with pytest.raises(TypeError, match=msg):
+ method(arg)
+
+ def test_nsmallest_nlargest(self, s_main_dtypes_split):
+        # float, int, datetime64 (use i8), timedelta64 (same),
+        # objects that are numbers, objects that are strings
+ s = s_main_dtypes_split
+
+ assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]])
+ assert_series_equal(s.nsmallest(2, keep='last'), s.iloc[[2, 3]])
+
+ empty = s.iloc[0:0]
+ assert_series_equal(s.nsmallest(0), empty)
+ assert_series_equal(s.nsmallest(-1), empty)
+ assert_series_equal(s.nlargest(0), empty)
+ assert_series_equal(s.nlargest(-1), empty)
+
+ assert_series_equal(s.nsmallest(len(s)), s.sort_values())
+ assert_series_equal(s.nsmallest(len(s) + 1), s.sort_values())
+ assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]])
+ assert_series_equal(s.nlargest(len(s) + 1),
+ s.iloc[[4, 0, 1, 3, 2]])
+
+ def test_misc(self):
+
+ s = Series([3., np.nan, 1, 2, 5])
+ assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]])
+ assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]])
+
+ msg = 'keep must be either "first", "last"'
+ with pytest.raises(ValueError, match=msg):
+ s.nsmallest(keep='invalid')
+ with pytest.raises(ValueError, match=msg):
+ s.nlargest(keep='invalid')
+
+ # GH 15297
+ s = Series([1] * 5, index=[1, 2, 3, 4, 5])
+ expected_first = Series([1] * 3, index=[1, 2, 3])
+ expected_last = Series([1] * 3, index=[5, 4, 3])
+
+ result = s.nsmallest(3)
+ assert_series_equal(result, expected_first)
+
+ result = s.nsmallest(3, keep='last')
+ assert_series_equal(result, expected_last)
+
+ result = s.nlargest(3)
+ assert_series_equal(result, expected_first)
+
+ result = s.nlargest(3, keep='last')
+ assert_series_equal(result, expected_last)
+
+ @pytest.mark.parametrize('n', range(1, 5))
+ def test_n(self, n):
+
+ # GH 13412
+ s = Series([1, 4, 3, 2], index=[0, 0, 1, 1])
+ result = s.nlargest(n)
+ expected = s.sort_values(ascending=False).head(n)
+ assert_series_equal(result, expected)
+
+ result = s.nsmallest(n)
+ expected = s.sort_values().head(n)
+ assert_series_equal(result, expected)
+
+ def test_boundary_integer(self, nselect_method, any_int_dtype):
+ # GH 21426
+ dtype_info = np.iinfo(any_int_dtype)
+ min_val, max_val = dtype_info.min, dtype_info.max
+ vals = [min_val, min_val + 1, max_val - 1, max_val]
+ assert_check_nselect_boundary(vals, any_int_dtype, nselect_method)
+
+ def test_boundary_float(self, nselect_method, float_dtype):
+ # GH 21426
+ dtype_info = np.finfo(float_dtype)
+ min_val, max_val = dtype_info.min, dtype_info.max
+ min_2nd, max_2nd = np.nextafter(
+ [min_val, max_val], 0, dtype=float_dtype)
+ vals = [min_val, min_2nd, max_2nd, max_val]
+ assert_check_nselect_boundary(vals, float_dtype, nselect_method)
+
+ @pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]'])
+ def test_boundary_datetimelike(self, nselect_method, dtype):
+ # GH 21426
+ # use int64 bounds and +1 to min_val since true minimum is NaT
+ # (include min_val/NaT at end to maintain same expected_idxr)
+ dtype_info = np.iinfo('int64')
+ min_val, max_val = dtype_info.min, dtype_info.max
+ vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val]
+ assert_check_nselect_boundary(vals, dtype, nselect_method)
+
+ def test_duplicate_keep_all_ties(self):
+ # see gh-16818
+ s = Series([10, 9, 8, 7, 7, 7, 7, 6])
+ result = s.nlargest(4, keep='all')
+ expected = Series([10, 9, 8, 7, 7, 7, 7])
+ assert_series_equal(result, expected)
+
+ result = s.nsmallest(2, keep='all')
+ expected = Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6])
+ assert_series_equal(result, expected)
+
+
+class TestCategoricalSeriesAnalytics(object):
+
+ def test_count(self):
+
+ s = Series(Categorical([np.nan, 1, 2, np.nan],
+ categories=[5, 4, 3, 2, 1], ordered=True))
+ result = s.count()
+ assert result == 2
+
+ def test_value_counts(self):
+ # GH 12835
+ cats = Categorical(list('abcccb'), categories=list('cabd'))
+ s = Series(cats, name='xxx')
+ res = s.value_counts(sort=False)
+
+ exp_index = CategoricalIndex(list('cabd'), categories=cats.categories)
+ exp = Series([3, 1, 2, 0], name='xxx', index=exp_index)
+ tm.assert_series_equal(res, exp)
+
+ res = s.value_counts(sort=True)
+
+ exp_index = CategoricalIndex(list('cbad'), categories=cats.categories)
+ exp = Series([3, 2, 1, 0], name='xxx', index=exp_index)
+ tm.assert_series_equal(res, exp)
+
+ # check object dtype handles the Series.name as the same
+ # (tested in test_base.py)
+ s = Series(["a", "b", "c", "c", "c", "b"], name='xxx')
+ res = s.value_counts()
+ exp = Series([3, 2, 1], name='xxx', index=["c", "b", "a"])
+ tm.assert_series_equal(res, exp)
+
+ def test_value_counts_with_nan(self):
+ # see gh-9443
+
+ # sanity check
+ s = Series(["a", "b", "a"], dtype="category")
+ exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
+
+ res = s.value_counts(dropna=True)
+ tm.assert_series_equal(res, exp)
+
+ res = s.value_counts(dropna=True)
+ tm.assert_series_equal(res, exp)
+
+ # same Series via two different constructions --> same behaviour
+ series = [
+ Series(["a", "b", None, "a", None, None], dtype="category"),
+ Series(Categorical(["a", "b", None, "a", None, None],
+ categories=["a", "b"]))
+ ]
+
+ for s in series:
+            # None is treated as a NaN value, so its count is excluded here
+ exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
+ res = s.value_counts(dropna=True)
+ tm.assert_series_equal(res, exp)
+
+            # with dropna=False the None count is included; sorted by counts
+ exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]))
+ res = s.value_counts(dropna=False)
+ tm.assert_series_equal(res, exp)
+
+ # When we aren't sorting by counts, and np.nan isn't a
+ # category, it should be last.
+ exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]))
+ res = s.value_counts(dropna=False, sort=False)
+ tm.assert_series_equal(res, exp)
+
+ @pytest.mark.parametrize(
+ "dtype",
+ ["int_", "uint", "float_", "unicode_", "timedelta64[h]",
+ pytest.param("datetime64[D]",
+ marks=pytest.mark.xfail(reason="GH#7996"))]
+ )
+ @pytest.mark.parametrize("is_ordered", [True, False])
+ def test_drop_duplicates_categorical_non_bool(self, dtype, is_ordered):
+ cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
+
+ # Test case 1
+ input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
+ tc1 = Series(Categorical(input1, categories=cat_array,
+ ordered=is_ordered))
+
+ expected = Series([False, False, False, True])
+ tm.assert_series_equal(tc1.duplicated(), expected)
+ tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
+ sc = tc1.copy()
+ sc.drop_duplicates(inplace=True)
+ tm.assert_series_equal(sc, tc1[~expected])
+
+ expected = Series([False, False, True, False])
+ tm.assert_series_equal(tc1.duplicated(keep='last'), expected)
+ tm.assert_series_equal(tc1.drop_duplicates(keep='last'),
+ tc1[~expected])
+ sc = tc1.copy()
+ sc.drop_duplicates(keep='last', inplace=True)
+ tm.assert_series_equal(sc, tc1[~expected])
+
+ expected = Series([False, False, True, True])
+ tm.assert_series_equal(tc1.duplicated(keep=False), expected)
+ tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
+ sc = tc1.copy()
+ sc.drop_duplicates(keep=False, inplace=True)
+ tm.assert_series_equal(sc, tc1[~expected])
+
+ # Test case 2
+ input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
+        tc2 = Series(Categorical(input2, categories=cat_array,
+                                 ordered=is_ordered))
+
+ expected = Series([False, False, False, False, True, True, False])
+ tm.assert_series_equal(tc2.duplicated(), expected)
+ tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
+ sc = tc2.copy()
+ sc.drop_duplicates(inplace=True)
+ tm.assert_series_equal(sc, tc2[~expected])
+
+ expected = Series([False, True, True, False, False, False, False])
+ tm.assert_series_equal(tc2.duplicated(keep='last'), expected)
+ tm.assert_series_equal(tc2.drop_duplicates(keep='last'),
+ tc2[~expected])
+ sc = tc2.copy()
+ sc.drop_duplicates(keep='last', inplace=True)
+ tm.assert_series_equal(sc, tc2[~expected])
+
+ expected = Series([False, True, True, False, True, True, False])
+ tm.assert_series_equal(tc2.duplicated(keep=False), expected)
+ tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
+ sc = tc2.copy()
+ sc.drop_duplicates(keep=False, inplace=True)
+ tm.assert_series_equal(sc, tc2[~expected])
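+
+        # Note (not part of the original pandas suite): the three keep modes
+        # exercised above follow the generic Series.duplicated() contract --
+        # keep='first' marks later repeats, keep='last' marks earlier
+        # repeats, and keep=False marks every member of a duplicated group:
+        # >>> Series([1, 2, 1]).duplicated(keep=False).tolist()
+        # [True, False, True]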
+
+ @pytest.mark.parametrize("is_ordered", [True, False])
+ def test_drop_duplicates_categorical_bool(self, is_ordered):
+ tc = Series(Categorical([True, False, True, False],
+ categories=[True, False], ordered=is_ordered))
+
+ expected = Series([False, False, True, True])
+ tm.assert_series_equal(tc.duplicated(), expected)
+ tm.assert_series_equal(tc.drop_duplicates(), tc[~expected])
+ sc = tc.copy()
+ sc.drop_duplicates(inplace=True)
+ tm.assert_series_equal(sc, tc[~expected])
+
+ expected = Series([True, True, False, False])
+ tm.assert_series_equal(tc.duplicated(keep='last'), expected)
+ tm.assert_series_equal(tc.drop_duplicates(keep='last'), tc[~expected])
+ sc = tc.copy()
+ sc.drop_duplicates(keep='last', inplace=True)
+ tm.assert_series_equal(sc, tc[~expected])
+
+ expected = Series([True, True, True, True])
+ tm.assert_series_equal(tc.duplicated(keep=False), expected)
+ tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
+ sc = tc.copy()
+ sc.drop_duplicates(keep=False, inplace=True)
+ tm.assert_series_equal(sc, tc[~expected])
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_api.py b/contrib/python/pandas/py2/pandas/tests/series/test_api.py
new file mode 100644
index 00000000000..3ad9d54175f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_api.py
@@ -0,0 +1,719 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+from collections import OrderedDict
+import pydoc
+import warnings
+
+import numpy as np
+import pytest
+
+import pandas.compat as compat
+from pandas.compat import isidentifier, lzip, range, string_types
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, DatetimeIndex, Index, Series, TimedeltaIndex,
+ date_range, period_range, timedelta_range)
+from pandas.core.arrays import PeriodArray
+from pandas.core.indexes.datetimes import Timestamp
+import pandas.util.testing as tm
+from pandas.util.testing import assert_series_equal, ensure_clean
+
+import pandas.io.formats.printing as printing
+
+from .common import TestData
+
+
+class SharedWithSparse(object):
+ """
+ A collection of tests Series and SparseSeries can share.
+
+ In generic tests on this class, use ``self._assert_series_equal()``
+ which is implemented in sub-classes.
+ """
+ def _assert_series_equal(self, left, right):
+ """Dispatch to series class dependent assertion"""
+ raise NotImplementedError
+
+ def test_scalarop_preserve_name(self):
+ result = self.ts * 2
+ assert result.name == self.ts.name
+
+ def test_copy_name(self):
+ result = self.ts.copy()
+ assert result.name == self.ts.name
+
+ def test_copy_index_name_checking(self):
+ # don't want to be able to modify the index stored elsewhere after
+ # making a copy
+
+ self.ts.index.name = None
+ assert self.ts.index.name is None
+ assert self.ts is self.ts
+
+ cp = self.ts.copy()
+ cp.index.name = 'foo'
+ printing.pprint_thing(self.ts.index.name)
+ assert self.ts.index.name is None
+
+ def test_append_preserve_name(self):
+ result = self.ts[:5].append(self.ts[5:])
+ assert result.name == self.ts.name
+
+ def test_binop_maybe_preserve_name(self):
+ # names match, preserve
+ result = self.ts * self.ts
+ assert result.name == self.ts.name
+ result = self.ts.mul(self.ts)
+ assert result.name == self.ts.name
+
+ result = self.ts * self.ts[:-2]
+ assert result.name == self.ts.name
+
+ # names don't match, don't preserve
+ cp = self.ts.copy()
+ cp.name = 'something else'
+ result = self.ts + cp
+ assert result.name is None
+ result = self.ts.add(cp)
+ assert result.name is None
+
+ ops = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', 'pow']
+ ops = ops + ['r' + op for op in ops]
+ for op in ops:
+ # names match, preserve
+ s = self.ts.copy()
+ result = getattr(s, op)(s)
+ assert result.name == self.ts.name
+
+ # names don't match, don't preserve
+ cp = self.ts.copy()
+ cp.name = 'changed'
+ result = getattr(s, op)(cp)
+ assert result.name is None
+
+ def test_combine_first_name(self):
+ result = self.ts.combine_first(self.ts[:5])
+ assert result.name == self.ts.name
+
+ def test_getitem_preserve_name(self):
+ result = self.ts[self.ts > 0]
+ assert result.name == self.ts.name
+
+ result = self.ts[[0, 2, 4]]
+ assert result.name == self.ts.name
+
+ result = self.ts[5:10]
+ assert result.name == self.ts.name
+
+ def test_pickle(self):
+ unp_series = self._pickle_roundtrip(self.series)
+ unp_ts = self._pickle_roundtrip(self.ts)
+ assert_series_equal(unp_series, self.series)
+ assert_series_equal(unp_ts, self.ts)
+
+ def _pickle_roundtrip(self, obj):
+
+ with ensure_clean() as path:
+ obj.to_pickle(path)
+ unpickled = pd.read_pickle(path)
+ return unpickled
+
+ def test_argsort_preserve_name(self):
+ result = self.ts.argsort()
+ assert result.name == self.ts.name
+
+ def test_sort_index_name(self):
+ result = self.ts.sort_index(ascending=False)
+ assert result.name == self.ts.name
+
+ def test_to_sparse_pass_name(self):
+ result = self.ts.to_sparse()
+ assert result.name == self.ts.name
+
+ def test_constructor_dict(self):
+ d = {'a': 0., 'b': 1., 'c': 2.}
+ result = self.series_klass(d)
+ expected = self.series_klass(d, index=sorted(d.keys()))
+ self._assert_series_equal(result, expected)
+
+ result = self.series_klass(d, index=['b', 'c', 'd', 'a'])
+ expected = self.series_klass([1, 2, np.nan, 0],
+ index=['b', 'c', 'd', 'a'])
+ self._assert_series_equal(result, expected)
+
+ def test_constructor_subclass_dict(self):
+ data = tm.TestSubDict((x, 10.0 * x) for x in range(10))
+ series = self.series_klass(data)
+ expected = self.series_klass(dict(compat.iteritems(data)))
+ self._assert_series_equal(series, expected)
+
+ def test_constructor_ordereddict(self):
+ # GH3283
+ data = OrderedDict(
+ ('col%s' % i, np.random.random()) for i in range(12))
+
+ series = self.series_klass(data)
+ expected = self.series_klass(list(data.values()), list(data.keys()))
+ self._assert_series_equal(series, expected)
+
+ # Test with subclass
+ class A(OrderedDict):
+ pass
+
+ series = self.series_klass(A(data))
+ self._assert_series_equal(series, expected)
+
+ def test_constructor_dict_multiindex(self):
+ d = {('a', 'a'): 0., ('b', 'a'): 1., ('b', 'c'): 2.}
+ _d = sorted(d.items())
+ result = self.series_klass(d)
+ expected = self.series_klass(
+ [x[1] for x in _d],
+ index=pd.MultiIndex.from_tuples([x[0] for x in _d]))
+ self._assert_series_equal(result, expected)
+
+ d['z'] = 111.
+ _d.insert(0, ('z', d['z']))
+ result = self.series_klass(d)
+ expected = self.series_klass([x[1] for x in _d],
+ index=pd.Index([x[0] for x in _d],
+ tupleize_cols=False))
+ result = result.reindex(index=expected.index)
+ self._assert_series_equal(result, expected)
+
+ def test_constructor_dict_timedelta_index(self):
+ # GH #12169 : Resample category data with timedelta index
+        # constructing a Series from a dict as data with a TimedeltaIndex
+        # as the index used to produce NaN values in the resulting Series
+ expected = self.series_klass(
+ data=['A', 'B', 'C'],
+ index=pd.to_timedelta([0, 10, 20], unit='s')
+ )
+
+ result = self.series_klass(
+ data={pd.to_timedelta(0, unit='s'): 'A',
+ pd.to_timedelta(10, unit='s'): 'B',
+ pd.to_timedelta(20, unit='s'): 'C'},
+ index=pd.to_timedelta([0, 10, 20], unit='s')
+ )
+ self._assert_series_equal(result, expected)
+
+ def test_from_array_deprecated(self):
+
+ with tm.assert_produces_warning(FutureWarning):
+ self.series_klass.from_array([1, 2, 3])
+
+ def test_sparse_accessor_updates_on_inplace(self):
+ s = pd.Series([1, 1, 2, 3], dtype="Sparse[int]")
+ s.drop([0, 1], inplace=True)
+ assert s.sparse.density == 1.0
+
+
+class TestSeriesMisc(TestData, SharedWithSparse):
+
+ series_klass = Series
+ # SharedWithSparse tests use generic, series_klass-agnostic assertion
+ _assert_series_equal = staticmethod(tm.assert_series_equal)
+
+ def test_tab_completion(self):
+ # GH 9910
+ s = Series(list('abcd'))
+ # Series of str values should have .str but not .dt/.cat in __dir__
+ assert 'str' in dir(s)
+ assert 'dt' not in dir(s)
+ assert 'cat' not in dir(s)
+
+ # similarly for .dt
+ s = Series(date_range('1/1/2015', periods=5))
+ assert 'dt' in dir(s)
+ assert 'str' not in dir(s)
+ assert 'cat' not in dir(s)
+
+        # Similarly for .cat, but with the twist that str and dt should be
+        # there if the categories are of that type. First cat and str:
+ s = Series(list('abbcd'), dtype="category")
+ assert 'cat' in dir(s)
+ assert 'str' in dir(s) # as it is a string categorical
+ assert 'dt' not in dir(s)
+
+ # similar to cat and str
+ s = Series(date_range('1/1/2015', periods=5)).astype("category")
+ assert 'cat' in dir(s)
+ assert 'str' not in dir(s)
+ assert 'dt' in dir(s) # as it is a datetime categorical
+
+ def test_tab_completion_with_categorical(self):
+ # test the tab completion display
+ ok_for_cat = ['name', 'index', 'categorical', 'categories', 'codes',
+ 'ordered', 'set_categories', 'add_categories',
+ 'remove_categories', 'rename_categories',
+ 'reorder_categories', 'remove_unused_categories',
+ 'as_ordered', 'as_unordered']
+
+ def get_dir(s):
+ results = [r for r in s.cat.__dir__() if not r.startswith('_')]
+ return list(sorted(set(results)))
+
+ s = Series(list('aabbcde')).astype('category')
+ results = get_dir(s)
+ tm.assert_almost_equal(results, list(sorted(set(ok_for_cat))))
+
+ @pytest.mark.parametrize("index", [
+ tm.makeUnicodeIndex(10),
+ tm.makeStringIndex(10),
+ tm.makeCategoricalIndex(10),
+ Index(['foo', 'bar', 'baz'] * 2),
+ tm.makeDateIndex(10),
+ tm.makePeriodIndex(10),
+ tm.makeTimedeltaIndex(10),
+ tm.makeIntIndex(10),
+ tm.makeUIntIndex(10),
+ tm.makeIntIndex(10),
+ tm.makeFloatIndex(10),
+ Index([True, False]),
+ Index(['a{}'.format(i) for i in range(101)]),
+ pd.MultiIndex.from_tuples(lzip('ABCD', 'EFGH')),
+ pd.MultiIndex.from_tuples(lzip([0, 1, 2, 3], 'EFGH')), ])
+ def test_index_tab_completion(self, index):
+ # dir contains string-like values of the Index.
+ s = pd.Series(index=index)
+ dir_s = dir(s)
+ for i, x in enumerate(s.index.unique(level=0)):
+ if i < 100:
+ assert (not isinstance(x, string_types) or
+ not isidentifier(x) or x in dir_s)
+ else:
+ assert x not in dir_s
+
+ def test_not_hashable(self):
+ s_empty = Series()
+ s = Series([1])
+ msg = "'Series' objects are mutable, thus they cannot be hashed"
+ with pytest.raises(TypeError, match=msg):
+ hash(s_empty)
+ with pytest.raises(TypeError, match=msg):
+ hash(s)
+
+ def test_contains(self):
+ tm.assert_contains_all(self.ts.index, self.ts)
+
+ def test_iter(self):
+ for i, val in enumerate(self.series):
+ assert val == self.series[i]
+
+ for i, val in enumerate(self.ts):
+ assert val == self.ts[i]
+
+ def test_keys(self):
+ # HACK: By doing this in two stages, we avoid 2to3 wrapping the call
+ # to .keys() in a list()
+ getkeys = self.ts.keys
+ assert getkeys() is self.ts.index
+
+ def test_values(self):
+ tm.assert_almost_equal(self.ts.values, self.ts, check_dtype=False)
+
+ def test_iteritems(self):
+ for idx, val in compat.iteritems(self.series):
+ assert val == self.series[idx]
+
+ for idx, val in compat.iteritems(self.ts):
+ assert val == self.ts[idx]
+
+        # assert it is lazy (generators don't define reverse, lists do)
+ assert not hasattr(self.series.iteritems(), 'reverse')
+
+ def test_items(self):
+ for idx, val in self.series.items():
+ assert val == self.series[idx]
+
+ for idx, val in self.ts.items():
+ assert val == self.ts[idx]
+
+        # assert it is lazy (generators don't define reverse, lists do)
+ assert not hasattr(self.series.items(), 'reverse')
+
+ def test_raise_on_info(self):
+ s = Series(np.random.randn(10))
+ msg = "'Series' object has no attribute 'info'"
+ with pytest.raises(AttributeError, match=msg):
+ s.info()
+
+ def test_copy(self):
+
+ for deep in [None, False, True]:
+ s = Series(np.arange(10), dtype='float64')
+
+ # default deep is True
+ if deep is None:
+ s2 = s.copy()
+ else:
+ s2 = s.copy(deep=deep)
+
+ s2[::2] = np.NaN
+
+ if deep is None or deep is True:
+ # Did not modify original Series
+ assert np.isnan(s2[0])
+ assert not np.isnan(s[0])
+ else:
+ # we DID modify the original Series
+ assert np.isnan(s2[0])
+ assert np.isnan(s[0])
+
+ # GH 11794
+ # copy of tz-aware
+ expected = Series([Timestamp('2012/01/01', tz='UTC')])
+ expected2 = Series([Timestamp('1999/01/01', tz='UTC')])
+
+ for deep in [None, False, True]:
+
+ s = Series([Timestamp('2012/01/01', tz='UTC')])
+
+ if deep is None:
+ s2 = s.copy()
+ else:
+ s2 = s.copy(deep=deep)
+
+ s2[0] = pd.Timestamp('1999/01/01', tz='UTC')
+
+ # default deep is True
+ if deep is None or deep is True:
+ # Did not modify original Series
+ assert_series_equal(s2, expected2)
+ assert_series_equal(s, expected)
+ else:
+ # we DID modify the original Series
+ assert_series_equal(s2, expected2)
+ assert_series_equal(s, expected2)
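+
+        # Minimal sketch (not part of the original pandas suite) of the
+        # semantics verified above: copy(deep=True), the default, gives an
+        # independent Series, while copy(deep=False) shares the data:
+        # >>> s = Series([1.0]); v = s.copy(deep=False); v[0] = 9.0; s[0]
+        # 9.0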
+
+ def test_axis_alias(self):
+ s = Series([1, 2, np.nan])
+ assert_series_equal(s.dropna(axis='rows'), s.dropna(axis='index'))
+ assert s.dropna().sum('rows') == 3
+ assert s._get_axis_number('rows') == 0
+ assert s._get_axis_name('rows') == 'index'
+
+ def test_class_axis(self):
+ # https://github.com/pandas-dev/pandas/issues/18147
+ # no exception and no empty docstring
+ assert pydoc.getdoc(Series.index)
+
+ def test_numpy_unique(self):
+ # it works!
+ np.unique(self.ts)
+
+ def test_ndarray_compat(self):
+
+ # test numpy compat with Series as sub-class of NDFrame
+ tsdf = DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'],
+ index=date_range('1/1/2000', periods=1000))
+
+ def f(x):
+ return x[x.idxmax()]
+
+ result = tsdf.apply(f)
+ expected = tsdf.max()
+ tm.assert_series_equal(result, expected)
+
+ # .item()
+ s = Series([1])
+ result = s.item()
+ assert result == 1
+ assert s.item() == s.iloc[0]
+
+        # using an ndarray-like function
+ s = Series(np.random.randn(10))
+ result = Series(np.ones_like(s))
+ expected = Series(1, index=range(10), dtype='float64')
+ tm.assert_series_equal(result, expected)
+
+ # ravel
+ s = Series(np.random.randn(10))
+ tm.assert_almost_equal(s.ravel(order='F'), s.values.ravel(order='F'))
+
+ # compress
+ # GH 6658
+ s = Series([0, 1., -1], index=list('abc'))
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = np.compress(s > 0, s)
+ tm.assert_series_equal(result, Series([1.], index=['b']))
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = np.compress(s < -1, s)
+        # the result is empty, with Index(dtype=object) matching the original
+ exp = Series([], dtype='float64', index=Index([], dtype='object'))
+ tm.assert_series_equal(result, exp)
+
+ s = Series([0, 1., -1], index=[.1, .2, .3])
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = np.compress(s > 0, s)
+ tm.assert_series_equal(result, Series([1.], index=[.2]))
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = np.compress(s < -1, s)
+        # the result is empty, with Float64Index matching the original
+ exp = Series([], dtype='float64', index=Index([], dtype='float64'))
+ tm.assert_series_equal(result, exp)
+
+ def test_str_accessor_updates_on_inplace(self):
+ s = pd.Series(list('abc'))
+ s.drop([0], inplace=True)
+ assert len(s.str.lower()) == 2
+
+ def test_str_attribute(self):
+ # GH9068
+ methods = ['strip', 'rstrip', 'lstrip']
+ s = Series([' jack', 'jill ', ' jesse ', 'frank'])
+ for method in methods:
+ expected = Series([getattr(str, method)(x) for x in s.values])
+ assert_series_equal(getattr(Series.str, method)(s.str), expected)
+
+ # str accessor only valid with string values
+ s = Series(range(5))
+ with pytest.raises(AttributeError, match='only use .str accessor'):
+ s.str.repeat(2)
+
+ def test_empty_method(self):
+ s_empty = pd.Series()
+ assert s_empty.empty
+
+ for full_series in [pd.Series([1]), pd.Series(index=[1])]:
+ assert not full_series.empty
+
+ def test_tab_complete_warning(self, ip):
+ # https://github.com/pandas-dev/pandas/issues/16409
+ pytest.importorskip('IPython', minversion="6.0.0")
+ from IPython.core.completer import provisionalcompleter
+
+ code = "import pandas as pd; s = pd.Series()"
+ ip.run_code(code)
+ with tm.assert_produces_warning(None):
+ with provisionalcompleter('ignore'):
+ list(ip.Completer.completions('s.', 1))
+
+ def test_integer_series_size(self):
+ # GH 25580
+ s = Series(range(9))
+ assert s.size == 9
+ s = Series(range(9), dtype="Int64")
+ assert s.size == 9
+
+
+class TestCategoricalSeries(object):
+
+ @pytest.mark.parametrize(
+ "method",
+ [
+ lambda x: x.cat.set_categories([1, 2, 3]),
+ lambda x: x.cat.reorder_categories([2, 3, 1], ordered=True),
+ lambda x: x.cat.rename_categories([1, 2, 3]),
+ lambda x: x.cat.remove_unused_categories(),
+ lambda x: x.cat.remove_categories([2]),
+ lambda x: x.cat.add_categories([4]),
+ lambda x: x.cat.as_ordered(),
+ lambda x: x.cat.as_unordered(),
+ ])
+ def test_getname_categorical_accessor(self, method):
+ # GH 17509
+ s = Series([1, 2, 3], name='A').astype('category')
+ expected = 'A'
+ result = method(s).name
+ assert result == expected
+
+ def test_cat_accessor(self):
+ s = Series(Categorical(["a", "b", np.nan, "a"]))
+ tm.assert_index_equal(s.cat.categories, Index(["a", "b"]))
+        assert not s.cat.ordered
+
+ exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"])
+ s.cat.set_categories(["b", "a"], inplace=True)
+ tm.assert_categorical_equal(s.values, exp)
+
+ res = s.cat.set_categories(["b", "a"])
+ tm.assert_categorical_equal(res.values, exp)
+
+ s[:] = "a"
+ s = s.cat.remove_unused_categories()
+ tm.assert_index_equal(s.cat.categories, Index(["a"]))
+
+ def test_cat_accessor_api(self):
+ # GH 9322
+ from pandas.core.arrays.categorical import CategoricalAccessor
+ assert Series.cat is CategoricalAccessor
+ s = Series(list('aabbcde')).astype('category')
+ assert isinstance(s.cat, CategoricalAccessor)
+
+ invalid = Series([1])
+ with pytest.raises(AttributeError, match="only use .cat accessor"):
+ invalid.cat
+ assert not hasattr(invalid, 'cat')
+
+ def test_cat_accessor_no_new_attributes(self):
+ # https://github.com/pandas-dev/pandas/issues/10673
+ c = Series(list('aabbcde')).astype('category')
+ with pytest.raises(AttributeError,
+ match="You cannot add any new attribute"):
+ c.cat.xlabel = "a"
+
+ def test_cat_accessor_updates_on_inplace(self):
+ s = Series(list('abc')).astype('category')
+ s.drop(0, inplace=True)
+ s.cat.remove_unused_categories(inplace=True)
+ assert len(s.cat.categories) == 2
+
+ def test_categorical_delegations(self):
+
+ # invalid accessor
+ msg = r"Can only use \.cat accessor with a 'category' dtype"
+ with pytest.raises(AttributeError, match=msg):
+ Series([1, 2, 3]).cat
+ with pytest.raises(AttributeError, match=msg):
+ Series([1, 2, 3]).cat()
+ with pytest.raises(AttributeError, match=msg):
+ Series(['a', 'b', 'c']).cat
+ with pytest.raises(AttributeError, match=msg):
+ Series(np.arange(5.)).cat
+ with pytest.raises(AttributeError, match=msg):
+ Series([Timestamp('20130101')]).cat
+
+        # Series should delegate calls to '.categories', '.codes', '.ordered'
+        # and the methods '.set_categories()', '.remove_unused_categories()'
+        # to the categorical
+ s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
+ exp_categories = Index(["a", "b", "c"])
+ tm.assert_index_equal(s.cat.categories, exp_categories)
+ s.cat.categories = [1, 2, 3]
+ exp_categories = Index([1, 2, 3])
+ tm.assert_index_equal(s.cat.categories, exp_categories)
+
+ exp_codes = Series([0, 1, 2, 0], dtype='int8')
+ tm.assert_series_equal(s.cat.codes, exp_codes)
+
+ assert s.cat.ordered
+ s = s.cat.as_unordered()
+ assert not s.cat.ordered
+ s.cat.as_ordered(inplace=True)
+ assert s.cat.ordered
+
+ # reorder
+ s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
+ exp_categories = Index(["c", "b", "a"])
+ exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
+ s = s.cat.set_categories(["c", "b", "a"])
+ tm.assert_index_equal(s.cat.categories, exp_categories)
+ tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
+ tm.assert_numpy_array_equal(s.__array__(), exp_values)
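+
+        # Sketch (not part of the original pandas suite): set_categories
+        # reorders the dtype's categories without touching the stored
+        # values, so only s.cat.categories changes:
+        # >>> Series(Categorical(["a", "b"])).cat.set_categories(["b", "a"])
+        # still holds the values ["a", "b"], with categories ["b", "a"]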
+
+ # remove unused categories
+ s = Series(Categorical(["a", "b", "b", "a"], categories=["a", "b", "c"
+ ]))
+ exp_categories = Index(["a", "b"])
+ exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_)
+ s = s.cat.remove_unused_categories()
+ tm.assert_index_equal(s.cat.categories, exp_categories)
+ tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
+ tm.assert_numpy_array_equal(s.__array__(), exp_values)
+
+        # set_categories is easy to call on the Series itself by mistake
+        # (instead of through .cat), so test that doing so raises an error:
+ msg = "'Series' object has no attribute 'set_categories'"
+ with pytest.raises(AttributeError, match=msg):
+ s.set_categories([4, 3, 2, 1])
+
+ # right: s.cat.set_categories([4,3,2,1])
+
+ # GH18862 (let Series.cat.rename_categories take callables)
+ s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
+ result = s.cat.rename_categories(lambda x: x.upper())
+ expected = Series(Categorical(["A", "B", "C", "A"],
+ categories=["A", "B", "C"],
+ ordered=True))
+ tm.assert_series_equal(result, expected)
+
+ def test_dt_accessor_api_for_categorical(self):
+ # https://github.com/pandas-dev/pandas/issues/10661
+ from pandas.core.indexes.accessors import Properties
+
+ s_dr = Series(date_range('1/1/2015', periods=5, tz="MET"))
+ c_dr = s_dr.astype("category")
+
+ s_pr = Series(period_range('1/1/2015', freq='D', periods=5))
+ c_pr = s_pr.astype("category")
+
+ s_tdr = Series(timedelta_range('1 days', '10 days'))
+ c_tdr = s_tdr.astype("category")
+
+ # only testing field (like .day)
+ # and bool (is_month_start)
+ get_ops = lambda x: x._datetimelike_ops
+
+ test_data = [
+ ("Datetime", get_ops(DatetimeIndex), s_dr, c_dr),
+ ("Period", get_ops(PeriodArray), s_pr, c_pr),
+ ("Timedelta", get_ops(TimedeltaIndex), s_tdr, c_tdr)]
+
+ assert isinstance(c_dr.dt, Properties)
+
+ special_func_defs = [
+ ('strftime', ("%Y-%m-%d",), {}),
+ ('tz_convert', ("EST",), {}),
+ ('round', ("D",), {}),
+ ('floor', ("D",), {}),
+ ('ceil', ("D",), {}),
+ ('asfreq', ("D",), {}),
+ # ('tz_localize', ("UTC",), {}),
+ ]
+ _special_func_names = [f[0] for f in special_func_defs]
+
+ # the series is already localized
+ _ignore_names = ['tz_localize', 'components']
+
+ for name, attr_names, s, c in test_data:
+ func_names = [f
+ for f in dir(s.dt)
+ if not (f.startswith("_") or f in attr_names or f in
+ _special_func_names or f in _ignore_names)]
+
+ func_defs = [(f, (), {}) for f in func_names]
+ for f_def in special_func_defs:
+ if f_def[0] in dir(s.dt):
+ func_defs.append(f_def)
+
+ for func, args, kwargs in func_defs:
+ with warnings.catch_warnings():
+ if func == 'to_period':
+ # dropping TZ
+ warnings.simplefilter("ignore", UserWarning)
+ res = getattr(c.dt, func)(*args, **kwargs)
+ exp = getattr(s.dt, func)(*args, **kwargs)
+
+ if isinstance(res, DataFrame):
+ tm.assert_frame_equal(res, exp)
+ elif isinstance(res, Series):
+ tm.assert_series_equal(res, exp)
+ else:
+ tm.assert_almost_equal(res, exp)
+
+ for attr in attr_names:
+ try:
+ res = getattr(c.dt, attr)
+ exp = getattr(s.dt, attr)
+ except Exception as e:
+ print(name, attr)
+ raise e
+
+ if isinstance(res, DataFrame):
+ tm.assert_frame_equal(res, exp)
+ elif isinstance(res, Series):
+ tm.assert_series_equal(res, exp)
+ else:
+ tm.assert_almost_equal(res, exp)
+
+ invalid = Series([1, 2, 3]).astype('category')
+ msg = "Can only use .dt accessor with datetimelike"
+
+ with pytest.raises(AttributeError, match=msg):
+ invalid.dt
+ assert not hasattr(invalid, 'str')
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_apply.py b/contrib/python/pandas/py2/pandas/tests/series/test_apply.py
new file mode 100644
index 00000000000..90cf6916df0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_apply.py
@@ -0,0 +1,667 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from collections import Counter, OrderedDict, defaultdict
+from itertools import chain
+
+import numpy as np
+import pytest
+
+import pandas.compat as compat
+from pandas.compat import lrange
+
+import pandas as pd
+from pandas import DataFrame, Index, Series, isna
+from pandas.conftest import _get_cython_table_params
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestSeriesApply(object):
+
+ def test_apply(self, datetime_series):
+ with np.errstate(all='ignore'):
+ tm.assert_series_equal(datetime_series.apply(np.sqrt),
+ np.sqrt(datetime_series))
+
+ # element-wise apply
+ import math
+ tm.assert_series_equal(datetime_series.apply(math.exp),
+ np.exp(datetime_series))
+
+ # empty series
+ s = Series(dtype=object, name='foo', index=pd.Index([], name='bar'))
+ rs = s.apply(lambda x: x)
+ tm.assert_series_equal(s, rs)
+
+ # check all metadata (GH 9322)
+ assert s is not rs
+ assert s.index is rs.index
+ assert s.dtype == rs.dtype
+ assert s.name == rs.name
+
+ # index but no data
+ s = Series(index=[1, 2, 3])
+ rs = s.apply(lambda x: x)
+ tm.assert_series_equal(s, rs)
+
+ def test_apply_same_length_inference_bug(self):
+ s = Series([1, 2])
+ f = lambda x: (x, x + 1)
+
+ result = s.apply(f)
+ expected = s.map(f)
+ assert_series_equal(result, expected)
+
+ s = Series([1, 2, 3])
+ result = s.apply(f)
+ expected = s.map(f)
+ assert_series_equal(result, expected)
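+
+        # Note (not part of the original pandas suite): when the applied
+        # function returns a tuple per element, apply() falls back to
+        # element-wise map() semantics instead of expanding the tuples:
+        # >>> Series([1, 2]).apply(lambda x: (x, x + 1))[0]
+        # (1, 2)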
+
+ def test_apply_dont_convert_dtype(self):
+ s = Series(np.random.randn(10))
+
+ f = lambda x: x if x > 0 else np.nan
+ result = s.apply(f, convert_dtype=False)
+ assert result.dtype == object
+
+ def test_with_string_args(self, datetime_series):
+
+ for arg in ['sum', 'mean', 'min', 'max', 'std']:
+ result = datetime_series.apply(arg)
+ expected = getattr(datetime_series, arg)()
+ assert result == expected
+
+ def test_apply_args(self):
+ s = Series(['foo,bar'])
+
+ result = s.apply(str.split, args=(',', ))
+ assert result[0] == ['foo', 'bar']
+ assert isinstance(result[0], list)
+
+ def test_series_map_box_timestamps(self):
+ # GH#2689, GH#2627
+ ser = Series(pd.date_range('1/1/2000', periods=10))
+
+ def func(x):
+ return (x.hour, x.day, x.month)
+
+ # it works!
+ ser.map(func)
+ ser.apply(func)
+
+ def test_apply_box(self):
+ # ufunc will not be boxed. Same test cases as the test_map_box
+ vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]
+ s = pd.Series(vals)
+ assert s.dtype == 'datetime64[ns]'
+ # boxed value must be Timestamp instance
+ res = s.apply(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__,
+ x.day, x.tz))
+ exp = pd.Series(['Timestamp_1_None', 'Timestamp_2_None'])
+ tm.assert_series_equal(res, exp)
+
+ vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern')]
+ s = pd.Series(vals)
+ assert s.dtype == 'datetime64[ns, US/Eastern]'
+ res = s.apply(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__,
+ x.day, x.tz))
+ exp = pd.Series(['Timestamp_1_US/Eastern', 'Timestamp_2_US/Eastern'])
+ tm.assert_series_equal(res, exp)
+
+ # timedelta
+ vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')]
+ s = pd.Series(vals)
+ assert s.dtype == 'timedelta64[ns]'
+ res = s.apply(lambda x: '{0}_{1}'.format(x.__class__.__name__, x.days))
+ exp = pd.Series(['Timedelta_1', 'Timedelta_2'])
+ tm.assert_series_equal(res, exp)
+
+ # period
+ vals = [pd.Period('2011-01-01', freq='M'),
+ pd.Period('2011-01-02', freq='M')]
+ s = pd.Series(vals)
+ assert s.dtype == 'Period[M]'
+ res = s.apply(lambda x: '{0}_{1}'.format(x.__class__.__name__,
+ x.freqstr))
+ exp = pd.Series(['Period_M', 'Period_M'])
+ tm.assert_series_equal(res, exp)
+
+ def test_apply_datetimetz(self):
+ values = pd.date_range('2011-01-01', '2011-01-02',
+ freq='H').tz_localize('Asia/Tokyo')
+ s = pd.Series(values, name='XX')
+
+ result = s.apply(lambda x: x + pd.offsets.Day())
+ exp_values = pd.date_range('2011-01-02', '2011-01-03',
+ freq='H').tz_localize('Asia/Tokyo')
+ exp = pd.Series(exp_values, name='XX')
+ tm.assert_series_equal(result, exp)
+
+ # change dtype
+ # GH 14506 : Returned dtype changed from int32 to int64
+ result = s.apply(lambda x: x.hour)
+ exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64)
+ tm.assert_series_equal(result, exp)
+
+ # not vectorized
+ def f(x):
+ if not isinstance(x, pd.Timestamp):
+ raise ValueError
+ return str(x.tz)
+
+ result = s.map(f)
+ exp = pd.Series(['Asia/Tokyo'] * 25, name='XX')
+ tm.assert_series_equal(result, exp)
+
+ def test_apply_dict_depr(self):
+
+ tsdf = pd.DataFrame(np.random.randn(10, 3),
+ columns=['A', 'B', 'C'],
+ index=pd.date_range('1/1/2000', periods=10))
+ with tm.assert_produces_warning(FutureWarning):
+ tsdf.A.agg({'foo': ['sum', 'mean']})
+
+
+class TestSeriesAggregate(object):
+
+ def test_transform(self, string_series):
+ # transforming functions
+
+ with np.errstate(all='ignore'):
+
+ f_sqrt = np.sqrt(string_series)
+ f_abs = np.abs(string_series)
+
+ # ufunc
+ result = string_series.transform(np.sqrt)
+ expected = f_sqrt.copy()
+ assert_series_equal(result, expected)
+
+ result = string_series.apply(np.sqrt)
+ assert_series_equal(result, expected)
+
+ # list-like
+ result = string_series.transform([np.sqrt])
+ expected = f_sqrt.to_frame().copy()
+ expected.columns = ['sqrt']
+ assert_frame_equal(result, expected)
+
+            result = string_series.apply([np.sqrt])
+ assert_frame_equal(result, expected)
+
+ result = string_series.transform(['sqrt'])
+ assert_frame_equal(result, expected)
+
+ # multiple items in list
+        # these appear in the order we'd get by applying each function to
+        # the series and then concatenating the results
+ expected = pd.concat([f_sqrt, f_abs], axis=1)
+ expected.columns = ['sqrt', 'absolute']
+ result = string_series.apply([np.sqrt, np.abs])
+ assert_frame_equal(result, expected)
+
+ result = string_series.transform(['sqrt', 'abs'])
+ expected.columns = ['sqrt', 'abs']
+ assert_frame_equal(result, expected)
+
+ # dict, provide renaming
+ expected = pd.concat([f_sqrt, f_abs], axis=1)
+ expected.columns = ['foo', 'bar']
+ expected = expected.unstack().rename('series')
+
+ result = string_series.apply({'foo': np.sqrt, 'bar': np.abs})
+ assert_series_equal(result.reindex_like(expected), expected)
+
+ def test_transform_and_agg_error(self, string_series):
+ # we are trying to transform with an aggregator
+ with pytest.raises(ValueError):
+ string_series.transform(['min', 'max'])
+
+ with pytest.raises(ValueError):
+ with np.errstate(all='ignore'):
+ string_series.agg(['sqrt', 'max'])
+
+ with pytest.raises(ValueError):
+ with np.errstate(all='ignore'):
+ string_series.transform(['sqrt', 'max'])
+
+ with pytest.raises(ValueError):
+ with np.errstate(all='ignore'):
+ string_series.agg({'foo': np.sqrt, 'bar': 'sum'})
+
+ def test_demo(self):
+ # demonstration tests
+ s = Series(range(6), dtype='int64', name='series')
+
+ result = s.agg(['min', 'max'])
+ expected = Series([0, 5], index=['min', 'max'], name='series')
+ tm.assert_series_equal(result, expected)
+
+ result = s.agg({'foo': 'min'})
+ expected = Series([0], index=['foo'], name='series')
+ tm.assert_series_equal(result, expected)
+
+ # nested renaming
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.agg({'foo': ['min', 'max']})
+
+ expected = DataFrame(
+ {'foo': [0, 5]},
+ index=['min', 'max']).unstack().rename('series')
+ tm.assert_series_equal(result, expected)
+
+ def test_multiple_aggregators_with_dict_api(self):
+
+ s = Series(range(6), dtype='int64', name='series')
+ # nested renaming
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.agg({'foo': ['min', 'max'], 'bar': ['sum', 'mean']})
+
+ expected = DataFrame(
+ {'foo': [5.0, np.nan, 0.0, np.nan],
+ 'bar': [np.nan, 2.5, np.nan, 15.0]},
+ columns=['foo', 'bar'],
+ index=['max', 'mean',
+ 'min', 'sum']).unstack().rename('series')
+ tm.assert_series_equal(result.reindex_like(expected), expected)
+
+ def test_agg_apply_evaluate_lambdas_the_same(self, string_series):
+ # test that we are evaluating row-by-row first
+ # before vectorized evaluation
+ result = string_series.apply(lambda x: str(x))
+ expected = string_series.agg(lambda x: str(x))
+ tm.assert_series_equal(result, expected)
+
+ result = string_series.apply(str)
+ expected = string_series.agg(str)
+ tm.assert_series_equal(result, expected)
+
+ def test_with_nested_series(self, datetime_series):
+ # GH 2316
+ # .agg with a reducer and a transform, what to do
+ result = datetime_series.apply(lambda x: Series(
+ [x, x ** 2], index=['x', 'x^2']))
+ expected = DataFrame({'x': datetime_series,
+ 'x^2': datetime_series ** 2})
+ tm.assert_frame_equal(result, expected)
+
+ result = datetime_series.agg(lambda x: Series(
+ [x, x ** 2], index=['x', 'x^2']))
+ tm.assert_frame_equal(result, expected)
+
+ def test_replicate_describe(self, string_series):
+ # this also tests a result set that is all scalars
+ expected = string_series.describe()
+ result = string_series.apply(OrderedDict(
+ [('count', 'count'),
+ ('mean', 'mean'),
+ ('std', 'std'),
+ ('min', 'min'),
+ ('25%', lambda x: x.quantile(0.25)),
+ ('50%', 'median'),
+ ('75%', lambda x: x.quantile(0.75)),
+ ('max', 'max')]))
+ assert_series_equal(result, expected)
+
+ def test_reduce(self, string_series):
+ # reductions with named functions
+ result = string_series.agg(['sum', 'mean'])
+ expected = Series([string_series.sum(),
+ string_series.mean()],
+ ['sum', 'mean'],
+ name=string_series.name)
+ assert_series_equal(result, expected)
+
+ def test_non_callable_aggregates(self):
+ # test agg using non-callable series attributes
+ s = Series([1, 2, None])
+
+        # Calling agg with just a string arg is the same as calling s.arg
+ result = s.agg('size')
+ expected = s.size
+ assert result == expected
+
+ # test when mixed w/ callable reducers
+ result = s.agg(['size', 'count', 'mean'])
+ expected = Series(OrderedDict([('size', 3.0),
+ ('count', 2.0),
+ ('mean', 1.5)]))
+ assert_series_equal(result[expected.index], expected)
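+
+        # Sketch (not part of the original pandas suite): a single string is
+        # resolved as a Series attribute and returns the scalar s.size,
+        # while a list mixing attribute names and reducers returns a Series
+        # indexed by the function names, as asserted above.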
+
+ @pytest.mark.parametrize("series, func, expected", chain(
+ _get_cython_table_params(Series(), [
+ ('sum', 0),
+ ('max', np.nan),
+ ('min', np.nan),
+ ('all', True),
+ ('any', False),
+ ('mean', np.nan),
+ ('prod', 1),
+ ('std', np.nan),
+ ('var', np.nan),
+ ('median', np.nan),
+ ]),
+ _get_cython_table_params(Series([np.nan, 1, 2, 3]), [
+ ('sum', 6),
+ ('max', 3),
+ ('min', 1),
+ ('all', True),
+ ('any', True),
+ ('mean', 2),
+ ('prod', 6),
+ ('std', 1),
+ ('var', 1),
+ ('median', 2),
+ ]),
+ _get_cython_table_params(Series('a b c'.split()), [
+ ('sum', 'abc'),
+ ('max', 'c'),
+ ('min', 'a'),
+ ('all', 'c'), # see GH12863
+ ('any', 'a'),
+ ]),
+ ))
+ def test_agg_cython_table(self, series, func, expected):
+ # GH21224
+ # test reducing functions in
+ # pandas.core.base.SelectionMixin._cython_table
+ result = series.agg(func)
+ if tm.is_number(expected):
+ assert np.isclose(result, expected, equal_nan=True)
+ else:
+ assert result == expected
+
+ @pytest.mark.parametrize("series, func, expected", chain(
+ _get_cython_table_params(Series(), [
+ ('cumprod', Series([], Index([]))),
+ ('cumsum', Series([], Index([]))),
+ ]),
+ _get_cython_table_params(Series([np.nan, 1, 2, 3]), [
+ ('cumprod', Series([np.nan, 1, 2, 6])),
+ ('cumsum', Series([np.nan, 1, 3, 6])),
+ ]),
+ _get_cython_table_params(Series('a b c'.split()), [
+ ('cumsum', Series(['a', 'ab', 'abc'])),
+ ]),
+ ))
+ def test_agg_cython_table_transform(self, series, func, expected):
+ # GH21224
+ # test transforming functions in
+ # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
+ result = series.agg(func)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("series, func, expected", chain(
+ _get_cython_table_params(Series('a b c'.split()), [
+ ('mean', TypeError), # mean raises TypeError
+ ('prod', TypeError),
+ ('std', TypeError),
+ ('var', TypeError),
+ ('median', TypeError),
+ ('cumprod', TypeError),
+ ])
+ ))
+ def test_agg_cython_table_raises(self, series, func, expected):
+ # GH21224
+ with pytest.raises(expected):
+ # e.g. Series('a b'.split()).cumprod() will raise
+ series.agg(func)
+
+
+class TestSeriesMap(object):
+
+ def test_map(self, datetime_series):
+ index, data = tm.getMixedTypeDict()
+
+ source = Series(data['B'], index=data['C'])
+ target = Series(data['C'][:4], index=data['D'][:4])
+
+ merged = target.map(source)
+
+ for k, v in compat.iteritems(merged):
+ assert v == source[target[k]]
+
+ # input could be a dict
+ merged = target.map(source.to_dict())
+
+ for k, v in compat.iteritems(merged):
+ assert v == source[target[k]]
+
+ # function
+ result = datetime_series.map(lambda x: x * 2)
+ tm.assert_series_equal(result, datetime_series * 2)
+
+ # GH 10324
+ a = Series([1, 2, 3, 4])
+ b = Series(["even", "odd", "even", "odd"], dtype="category")
+ c = Series(["even", "odd", "even", "odd"])
+
+ exp = Series(["odd", "even", "odd", np.nan], dtype="category")
+ tm.assert_series_equal(a.map(b), exp)
+ exp = Series(["odd", "even", "odd", np.nan])
+ tm.assert_series_equal(a.map(c), exp)
+
+ a = Series(['a', 'b', 'c', 'd'])
+ b = Series([1, 2, 3, 4],
+ index=pd.CategoricalIndex(['b', 'c', 'd', 'e']))
+ c = Series([1, 2, 3, 4], index=Index(['b', 'c', 'd', 'e']))
+
+ exp = Series([np.nan, 1, 2, 3])
+ tm.assert_series_equal(a.map(b), exp)
+ exp = Series([np.nan, 1, 2, 3])
+ tm.assert_series_equal(a.map(c), exp)
+
+ a = Series(['a', 'b', 'c', 'd'])
+ b = Series(['B', 'C', 'D', 'E'], dtype='category',
+ index=pd.CategoricalIndex(['b', 'c', 'd', 'e']))
+ c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e']))
+
+ exp = Series(pd.Categorical([np.nan, 'B', 'C', 'D'],
+ categories=['B', 'C', 'D', 'E']))
+ tm.assert_series_equal(a.map(b), exp)
+ exp = Series([np.nan, 'B', 'C', 'D'])
+ tm.assert_series_equal(a.map(c), exp)
+
+ @pytest.mark.parametrize("index", tm.all_index_generator(10))
+ def test_map_empty(self, index):
+ s = Series(index)
+ result = s.map({})
+
+ expected = pd.Series(np.nan, index=s.index)
+ tm.assert_series_equal(result, expected)
+
+ def test_map_compat(self):
+ # related GH 8024
+ s = Series([True, True, False], index=[1, 2, 3])
+ result = s.map({True: 'foo', False: 'bar'})
+ expected = Series(['foo', 'foo', 'bar'], index=[1, 2, 3])
+ assert_series_equal(result, expected)
+
+ def test_map_int(self):
+ left = Series({'a': 1., 'b': 2., 'c': 3., 'd': 4})
+ right = Series({1: 11, 2: 22, 3: 33})
+
+ assert left.dtype == np.float_
+ assert issubclass(right.dtype.type, np.integer)
+
+ merged = left.map(right)
+ assert merged.dtype == np.float_
+ assert isna(merged['d'])
+ assert not isna(merged['c'])
+
+ def test_map_type_inference(self):
+ s = Series(lrange(3))
+ s2 = s.map(lambda x: np.where(x == 0, 0, 1))
+ assert issubclass(s2.dtype.type, np.integer)
+
+ def test_map_decimal(self, string_series):
+ from decimal import Decimal
+
+ result = string_series.map(lambda x: Decimal(str(x)))
+ assert result.dtype == np.object_
+ assert isinstance(result[0], Decimal)
+
+ def test_map_na_exclusion(self):
+ s = Series([1.5, np.nan, 3, np.nan, 5])
+
+ result = s.map(lambda x: x * 2, na_action='ignore')
+ exp = s * 2
+ assert_series_equal(result, exp)
+
+ def test_map_dict_with_tuple_keys(self):
+ """
+ Due to new MultiIndex-ing behaviour in v0.14.0,
+ dicts with tuple keys passed to map were being
+ converted to a multi-index, preventing tuple values
+ from being mapped properly.
+ """
+ # GH 18496
+ df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]})
+ label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'}
+
+ df['labels'] = df['a'].map(label_mappings)
+ df['expected_labels'] = pd.Series(['A', 'B', 'A', 'B'], index=df.index)
+ # All labels should be filled now
+ tm.assert_series_equal(df['labels'], df['expected_labels'],
+ check_names=False)
+
+ def test_map_counter(self):
+ s = Series(['a', 'b', 'c'], index=[1, 2, 3])
+ counter = Counter()
+ counter['b'] = 5
+ counter['c'] += 1
+ result = s.map(counter)
+ expected = Series([0, 5, 1], index=[1, 2, 3])
+ assert_series_equal(result, expected)
+
+ def test_map_defaultdict(self):
+ s = Series([1, 2, 3], index=['a', 'b', 'c'])
+ default_dict = defaultdict(lambda: 'blank')
+ default_dict[1] = 'stuff'
+ result = s.map(default_dict)
+ expected = Series(['stuff', 'blank', 'blank'], index=['a', 'b', 'c'])
+ assert_series_equal(result, expected)
+
+ def test_map_dict_subclass_with_missing(self):
+ """
+ Test Series.map with a dictionary subclass that defines __missing__,
+ i.e. sets a default value (GH #15999).
+ """
+ class DictWithMissing(dict):
+ def __missing__(self, key):
+ return 'missing'
+ s = Series([1, 2, 3])
+ dictionary = DictWithMissing({3: 'three'})
+ result = s.map(dictionary)
+ expected = Series(['missing', 'missing', 'three'])
+ assert_series_equal(result, expected)
+
+ def test_map_dict_subclass_without_missing(self):
+ class DictWithoutMissing(dict):
+ pass
+ s = Series([1, 2, 3])
+ dictionary = DictWithoutMissing({3: 'three'})
+ result = s.map(dictionary)
+ expected = Series([np.nan, np.nan, 'three'])
+ assert_series_equal(result, expected)
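+
+        # Note (not part of the original pandas suite): map() goes through
+        # __getitem__ for dict subclasses defining __missing__, so defaults
+        # are honoured (DictWithMissing above), while plain dicts and
+        # subclasses without __missing__ fall back to NaN for absent keys.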
+
+ def test_map_box(self):
+ vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]
+ s = pd.Series(vals)
+ assert s.dtype == 'datetime64[ns]'
+ # boxed value must be Timestamp instance
+ res = s.map(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__,
+ x.day, x.tz))
+ exp = pd.Series(['Timestamp_1_None', 'Timestamp_2_None'])
+ tm.assert_series_equal(res, exp)
+
+ vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern')]
+ s = pd.Series(vals)
+ assert s.dtype == 'datetime64[ns, US/Eastern]'
+ res = s.map(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__,
+ x.day, x.tz))
+ exp = pd.Series(['Timestamp_1_US/Eastern', 'Timestamp_2_US/Eastern'])
+ tm.assert_series_equal(res, exp)
+
+ # timedelta
+ vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')]
+ s = pd.Series(vals)
+ assert s.dtype == 'timedelta64[ns]'
+ res = s.map(lambda x: '{0}_{1}'.format(x.__class__.__name__, x.days))
+ exp = pd.Series(['Timedelta_1', 'Timedelta_2'])
+ tm.assert_series_equal(res, exp)
+
+ # period
+ vals = [pd.Period('2011-01-01', freq='M'),
+ pd.Period('2011-01-02', freq='M')]
+ s = pd.Series(vals)
+ assert s.dtype == 'Period[M]'
+ res = s.map(lambda x: '{0}_{1}'.format(x.__class__.__name__,
+ x.freqstr))
+ exp = pd.Series(['Period_M', 'Period_M'])
+ tm.assert_series_equal(res, exp)
+
+ def test_map_categorical(self):
+ values = pd.Categorical(list('ABBABCD'), categories=list('DCBA'),
+ ordered=True)
+ s = pd.Series(values, name='XX', index=list('abcdefg'))
+
+ result = s.map(lambda x: x.lower())
+ exp_values = pd.Categorical(list('abbabcd'), categories=list('dcba'),
+ ordered=True)
+ exp = pd.Series(exp_values, name='XX', index=list('abcdefg'))
+ tm.assert_series_equal(result, exp)
+ tm.assert_categorical_equal(result.values, exp_values)
+
+ result = s.map(lambda x: 'A')
+ exp = pd.Series(['A'] * 7, name='XX', index=list('abcdefg'))
+ tm.assert_series_equal(result, exp)
+ assert result.dtype == np.object
+
+ with pytest.raises(NotImplementedError):
+ s.map(lambda x: x, na_action='ignore')
+
+ def test_map_datetimetz(self):
+ values = pd.date_range('2011-01-01', '2011-01-02',
+ freq='H').tz_localize('Asia/Tokyo')
+ s = pd.Series(values, name='XX')
+
+ # keep tz
+ result = s.map(lambda x: x + pd.offsets.Day())
+ exp_values = pd.date_range('2011-01-02', '2011-01-03',
+ freq='H').tz_localize('Asia/Tokyo')
+ exp = pd.Series(exp_values, name='XX')
+ tm.assert_series_equal(result, exp)
+
+ # change dtype
+ # GH 14506 : Returned dtype changed from int32 to int64
+ result = s.map(lambda x: x.hour)
+ exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64)
+ tm.assert_series_equal(result, exp)
+
+ with pytest.raises(NotImplementedError):
+ s.map(lambda x: x, na_action='ignore')
+
+ # not vectorized
+ def f(x):
+ if not isinstance(x, pd.Timestamp):
+ raise ValueError
+ return str(x.tz)
+
+ result = s.map(f)
+ exp = pd.Series(['Asia/Tokyo'] * 25, name='XX')
+ tm.assert_series_equal(result, exp)
+
+ @pytest.mark.parametrize("vals,mapping,exp", [
+ (list('abc'), {np.nan: 'not NaN'}, [np.nan] * 3 + ['not NaN']),
+ (list('abc'), {'a': 'a letter'}, ['a letter'] + [np.nan] * 3),
+ (list(range(3)), {0: 42}, [42] + [np.nan] * 3)])
+ def test_map_missing_mixed(self, vals, mapping, exp):
+ # GH20495
+ s = pd.Series(vals + [np.nan])
+ result = s.map(mapping)
+
+ tm.assert_series_equal(result, pd.Series(exp))
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_arithmetic.py b/contrib/python/pandas/py2/pandas/tests/series/test_arithmetic.py
new file mode 100644
index 00000000000..687ed59772d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_arithmetic.py
@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+import operator
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Series, compat
+from pandas.core.indexes.period import IncompatibleFrequency
+import pandas.util.testing as tm
+
+
+def _permute(obj):
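+    # shuffle obj, keeping each index/value pair intact; used to check that
+    # arithmetic aligns on index labels rather than on position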
+ return obj.take(np.random.permutation(len(obj)))
+
+
+class TestSeriesFlexArithmetic(object):
+ @pytest.mark.parametrize(
+ 'ts',
+ [
+ (lambda x: x, lambda x: x * 2, False),
+ (lambda x: x, lambda x: x[::2], False),
+ (lambda x: x, lambda x: 5, True),
+ (lambda x: tm.makeFloatSeries(),
+ lambda x: tm.makeFloatSeries(),
+ True)
+ ])
+ @pytest.mark.parametrize('opname', ['add', 'sub', 'mul', 'floordiv',
+ 'truediv', 'div', 'pow'])
+ def test_flex_method_equivalence(self, opname, ts):
+ # check that Series.{opname} behaves like Series.__{opname}__,
+ tser = tm.makeTimeSeries().rename('ts')
+
+ series = ts[0](tser)
+ other = ts[1](tser)
+ check_reverse = ts[2]
+
+ if opname == 'div' and compat.PY3:
+ pytest.skip('div test only for Py3')
+
+ op = getattr(Series, opname)
+
+        if opname == 'div':
+ alt = operator.truediv
+ else:
+ alt = getattr(operator, opname)
+
+ result = op(series, other)
+ expected = alt(series, other)
+ tm.assert_almost_equal(result, expected)
+ if check_reverse:
+ rop = getattr(Series, "r" + opname)
+ result = rop(series, other)
+ expected = alt(other, series)
+ tm.assert_almost_equal(result, expected)
+
+
+class TestSeriesArithmetic(object):
+ # Some of these may end up in tests/arithmetic, but are not yet sorted
+
+ def test_add_series_with_period_index(self):
+ rng = pd.period_range('1/1/2000', '1/1/2010', freq='A')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ result = ts + ts[::2]
+ expected = ts + ts
+ expected[1::2] = np.nan
+ tm.assert_series_equal(result, expected)
+
+ result = ts + _permute(ts[::2])
+ tm.assert_series_equal(result, expected)
+
+ msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)"
+ with pytest.raises(IncompatibleFrequency, match=msg):
+ ts + ts.asfreq('D', how="end")
+
+
+# ------------------------------------------------------------------
+# Comparisons
+
+class TestSeriesFlexComparison(object):
+ def test_comparison_flex_basic(self):
+ left = pd.Series(np.random.randn(10))
+ right = pd.Series(np.random.randn(10))
+
+ tm.assert_series_equal(left.eq(right), left == right)
+ tm.assert_series_equal(left.ne(right), left != right)
+        tm.assert_series_equal(left.le(right), left <= right)
+        tm.assert_series_equal(left.lt(right), left < right)
+ tm.assert_series_equal(left.gt(right), left > right)
+ tm.assert_series_equal(left.ge(right), left >= right)
+
+ # axis
+ for axis in [0, None, 'index']:
+ tm.assert_series_equal(left.eq(right, axis=axis), left == right)
+ tm.assert_series_equal(left.ne(right, axis=axis), left != right)
+            tm.assert_series_equal(left.le(right, axis=axis), left <= right)
+            tm.assert_series_equal(left.lt(right, axis=axis), left < right)
+ tm.assert_series_equal(left.gt(right, axis=axis), left > right)
+ tm.assert_series_equal(left.ge(right, axis=axis), left >= right)
+
+        # axis=1 is not a valid axis for a Series
+ msg = 'No axis named 1 for object type'
+        for op in ['eq', 'ne', 'le', 'lt', 'gt', 'ge']:
+ with pytest.raises(ValueError, match=msg):
+ getattr(left, op)(right, axis=1)
+
+
+class TestSeriesComparison(object):
+ def test_comparison_different_length(self):
+ a = Series(['a', 'b', 'c'])
+ b = Series(['b', 'a'])
+ with pytest.raises(ValueError):
+ a < b
+
+ a = Series([1, 2])
+ b = Series([2, 3, 4])
+ with pytest.raises(ValueError):
+ a == b
+
+ @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
+ def test_ser_flex_cmp_return_dtypes(self, opname):
+ # GH#15115
+ ser = Series([1, 3, 2], index=range(3))
+ const = 2
+
+ result = getattr(ser, opname)(const).get_dtype_counts()
+ tm.assert_series_equal(result, Series([1], ['bool']))
+
+ @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
+ def test_ser_flex_cmp_return_dtypes_empty(self, opname):
+ # GH#15115 empty Series case
+ ser = Series([1, 3, 2], index=range(3))
+ empty = ser.iloc[:0]
+ const = 2
+
+ result = getattr(empty, opname)(const).get_dtype_counts()
+ tm.assert_series_equal(result, Series([1], ['bool']))
+
+ @pytest.mark.parametrize('op', [operator.eq, operator.ne,
+ operator.le, operator.lt,
+ operator.ge, operator.gt])
+ @pytest.mark.parametrize('names', [(None, None, None),
+ ('foo', 'bar', None),
+ ('baz', 'baz', 'baz')])
+ def test_ser_cmp_result_names(self, names, op):
+ # datetime64 dtype
+ dti = pd.date_range('1949-06-07 03:00:00',
+ freq='H', periods=5, name=names[0])
+ ser = Series(dti).rename(names[1])
+ result = op(ser, dti)
+ assert result.name == names[2]
+
+ # datetime64tz dtype
+ dti = dti.tz_localize('US/Central')
+ ser = Series(dti).rename(names[1])
+ result = op(ser, dti)
+ assert result.name == names[2]
+
+ # timedelta64 dtype
+ tdi = dti - dti.shift(1)
+ ser = Series(tdi).rename(names[1])
+ result = op(ser, tdi)
+ assert result.name == names[2]
+
+ # categorical
+ if op in [operator.eq, operator.ne]:
+ # categorical dtype comparisons raise for inequalities
+ cidx = tdi.astype('category')
+ ser = Series(cidx).rename(names[1])
+ result = op(ser, cidx)
+ assert result.name == names[2]
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_asof.py b/contrib/python/pandas/py2/pandas/tests/series/test_asof.py
new file mode 100644
index 00000000000..488fc894b95
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_asof.py
@@ -0,0 +1,174 @@
+# coding=utf-8
+
+import numpy as np
+import pytest
+
+from pandas import Series, Timestamp, date_range, isna, notna, offsets
+import pandas.util.testing as tm
+
+
+class TestSeriesAsof(object):
+
+ def test_basic(self):
+
+        # array or list of dates
+ N = 50
+ rng = date_range('1/1/1990', periods=N, freq='53s')
+ ts = Series(np.random.randn(N), index=rng)
+ ts[15:30] = np.nan
+ dates = date_range('1/1/1990', periods=N * 3, freq='25s')
+
+ result = ts.asof(dates)
+ assert notna(result).all()
+ lb = ts.index[14]
+ ub = ts.index[30]
+
+ result = ts.asof(list(dates))
+ assert notna(result).all()
+ lb = ts.index[14]
+ ub = ts.index[30]
+
+ mask = (result.index >= lb) & (result.index < ub)
+ rs = result[mask]
+ assert (rs == ts[lb]).all()
+
+ val = result[result.index[result.index >= ub][0]]
+ assert ts[ub] == val
+
+ def test_scalar(self):
+
+ N = 30
+ rng = date_range('1/1/1990', periods=N, freq='53s')
+ ts = Series(np.arange(N), index=rng)
+ ts[5:10] = np.NaN
+ ts[15:20] = np.NaN
+
+ val1 = ts.asof(ts.index[7])
+ val2 = ts.asof(ts.index[19])
+
+ assert val1 == ts[4]
+ assert val2 == ts[14]
+
+ # accepts strings
+ val1 = ts.asof(str(ts.index[7]))
+ assert val1 == ts[4]
+
+ # in there
+ result = ts.asof(ts.index[3])
+ assert result == ts[3]
+
+ # no as of value
+ d = ts.index[0] - offsets.BDay()
+ assert np.isnan(ts.asof(d))
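+
+        # Sketch (not part of the original pandas suite): asof returns the
+        # last valid (non-NaN) value at or before the requested label, which
+        # is why positions 5:10 resolve to ts[4] above; labels before the
+        # first observation yield NaN.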
+
+ def test_with_nan(self):
+ # basic asof test
+ rng = date_range('1/1/2000', '1/2/2000', freq='4h')
+ s = Series(np.arange(len(rng)), index=rng)
+ r = s.resample('2h').mean()
+
+ result = r.asof(r.index)
+ expected = Series([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6.],
+ index=date_range('1/1/2000', '1/2/2000', freq='2h'))
+ tm.assert_series_equal(result, expected)
+
+ r.iloc[3:5] = np.nan
+ result = r.asof(r.index)
+ expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 5, 5, 6.],
+ index=date_range('1/1/2000', '1/2/2000', freq='2h'))
+ tm.assert_series_equal(result, expected)
+
+ r.iloc[-3:] = np.nan
+ result = r.asof(r.index)
+ expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4.],
+ index=date_range('1/1/2000', '1/2/2000', freq='2h'))
+ tm.assert_series_equal(result, expected)
+
+ def test_periodindex(self):
+ from pandas import period_range, PeriodIndex
+        # array or list of dates
+ N = 50
+ rng = period_range('1/1/1990', periods=N, freq='H')
+ ts = Series(np.random.randn(N), index=rng)
+ ts[15:30] = np.nan
+ dates = date_range('1/1/1990', periods=N * 3, freq='37min')
+
+ result = ts.asof(dates)
+ assert notna(result).all()
+ lb = ts.index[14]
+ ub = ts.index[30]
+
+ result = ts.asof(list(dates))
+ assert notna(result).all()
+ lb = ts.index[14]
+ ub = ts.index[30]
+
+ pix = PeriodIndex(result.index.values, freq='H')
+ mask = (pix >= lb) & (pix < ub)
+ rs = result[mask]
+ assert (rs == ts[lb]).all()
+
+ ts[5:10] = np.nan
+ ts[15:20] = np.nan
+
+ val1 = ts.asof(ts.index[7])
+ val2 = ts.asof(ts.index[19])
+
+ assert val1 == ts[4]
+ assert val2 == ts[14]
+
+ # accepts strings
+ val1 = ts.asof(str(ts.index[7]))
+ assert val1 == ts[4]
+
+ # in there
+ assert ts.asof(ts.index[3]) == ts[3]
+
+ # no as of value
+ d = ts.index[0].to_timestamp() - offsets.BDay()
+ assert isna(ts.asof(d))
+
+ def test_errors(self):
+
+ s = Series([1, 2, 3],
+ index=[Timestamp('20130101'),
+ Timestamp('20130103'),
+ Timestamp('20130102')])
+
+ # non-monotonic
+ assert not s.index.is_monotonic
+ with pytest.raises(ValueError):
+ s.asof(s.index[0])
+
+ # subset with Series
+ N = 10
+ rng = date_range('1/1/1990', periods=N, freq='53s')
+ s = Series(np.random.randn(N), index=rng)
+ with pytest.raises(ValueError):
+ s.asof(s.index[0], subset='foo')
+
+ def test_all_nans(self):
+ # GH 15713
+ # series is all nans
+ result = Series([np.nan]).asof([0])
+ expected = Series([np.nan])
+ tm.assert_series_equal(result, expected)
+
+ # testing non-default indexes
+ N = 50
+ rng = date_range('1/1/1990', periods=N, freq='53s')
+
+ dates = date_range('1/1/1990', periods=N * 3, freq='25s')
+ result = Series(np.nan, index=rng).asof(dates)
+ expected = Series(np.nan, index=dates)
+ tm.assert_series_equal(result, expected)
+
+ # testing scalar input
+ date = date_range('1/1/1990', periods=N * 3, freq='25s')[0]
+ result = Series(np.nan, index=rng).asof(date)
+ assert isna(result)
+
+ # test name is propagated
+ result = Series(np.nan, index=[1, 2, 3, 4], name='test').asof([4, 5])
+ expected = Series(np.nan, index=[4, 5], name='test')
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_block_internals.py b/contrib/python/pandas/py2/pandas/tests/series/test_block_internals.py
new file mode 100644
index 00000000000..e74b32181ce
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_block_internals.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+
+import pandas as pd
+
+# Segregated collection of methods that require the BlockManager internal data
+# structure
+
+
+class TestSeriesBlockInternals(object):
+
+ def test_setitem_invalidates_datetime_index_freq(self):
+ # GH#24096 altering a datetime64tz Series inplace invalidates the
+ # `freq` attribute on the underlying DatetimeIndex
+
+ dti = pd.date_range('20130101', periods=3, tz='US/Eastern')
+ ts = dti[1]
+ ser = pd.Series(dti)
+ assert ser._values is not dti
+ assert ser._values._data.base is not dti._data._data.base
+ assert dti.freq == 'D'
+ ser.iloc[1] = pd.NaT
+ assert ser._values.freq is None
+
+ # check that the DatetimeIndex was not altered in place
+ assert ser._values is not dti
+ assert ser._values._data.base is not dti._data._data.base
+ assert dti[1] == ts
+ assert dti.freq == 'D'
+
+ def test_dt64tz_setitem_does_not_mutate_dti(self):
+ # GH#21907, GH#24096
+ dti = pd.date_range('2016-01-01', periods=10, tz='US/Pacific')
+ ts = dti[0]
+ ser = pd.Series(dti)
+ assert ser._values is not dti
+ assert ser._values._data.base is not dti._data._data.base
+ assert ser._data.blocks[0].values is not dti
+ assert (ser._data.blocks[0].values._data.base
+ is not dti._data._data.base)
+
+ ser[::3] = pd.NaT
+ assert ser[0] is pd.NaT
+ assert dti[0] == ts
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_combine_concat.py b/contrib/python/pandas/py2/pandas/tests/series/test_combine_concat.py
new file mode 100644
index 00000000000..45e3dffde60
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_combine_concat.py
@@ -0,0 +1,373 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime
+
+import numpy as np
+from numpy import nan
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, DatetimeIndex, Series, compat, date_range
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestSeriesCombine(object):
+
+ def test_append(self, datetime_series, string_series, object_series):
+ appendedSeries = string_series.append(object_series)
+ for idx, value in compat.iteritems(appendedSeries):
+ if idx in string_series.index:
+ assert value == string_series[idx]
+ elif idx in object_series.index:
+ assert value == object_series[idx]
+ else:
+ raise AssertionError("orphaned index!")
+
+ msg = "Indexes have overlapping values:"
+ with pytest.raises(ValueError, match=msg):
+ datetime_series.append(datetime_series, verify_integrity=True)
+
+ def test_append_many(self, datetime_series):
+ pieces = [datetime_series[:5], datetime_series[5:10],
+ datetime_series[10:]]
+
+ result = pieces[0].append(pieces[1:])
+ assert_series_equal(result, datetime_series)
+
+ def test_append_duplicates(self):
+ # GH 13677
+ s1 = pd.Series([1, 2, 3])
+ s2 = pd.Series([4, 5, 6])
+ exp = pd.Series([1, 2, 3, 4, 5, 6], index=[0, 1, 2, 0, 1, 2])
+ tm.assert_series_equal(s1.append(s2), exp)
+ tm.assert_series_equal(pd.concat([s1, s2]), exp)
+
+ # the result must have RangeIndex
+ exp = pd.Series([1, 2, 3, 4, 5, 6])
+ tm.assert_series_equal(s1.append(s2, ignore_index=True),
+ exp, check_index_type=True)
+ tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True),
+ exp, check_index_type=True)
+
+ msg = 'Indexes have overlapping values:'
+ with pytest.raises(ValueError, match=msg):
+ s1.append(s2, verify_integrity=True)
+ with pytest.raises(ValueError, match=msg):
+ pd.concat([s1, s2], verify_integrity=True)
+
+ def test_combine_scalar(self):
+ # GH 21248
+ # Note - combine() with another Series is tested elsewhere because
+ # it is used when testing operators
+ s = pd.Series([i * 10 for i in range(5)])
+ result = s.combine(3, lambda x, y: x + y)
+ expected = pd.Series([i * 10 + 3 for i in range(5)])
+ tm.assert_series_equal(result, expected)
+
+ result = s.combine(22, lambda x, y: min(x, y))
+ expected = pd.Series([min(i * 10, 22) for i in range(5)])
+ tm.assert_series_equal(result, expected)
+
+ def test_combine_first(self):
+ values = tm.makeIntIndex(20).values.astype(float)
+ series = Series(values, index=tm.makeIntIndex(20))
+
+ series_copy = series * 2
+ series_copy[::2] = np.NaN
+
+ # nothing used from the input
+ combined = series.combine_first(series_copy)
+
+ tm.assert_series_equal(combined, series)
+
+ # Holes filled from input
+ combined = series_copy.combine_first(series)
+ assert np.isfinite(combined).all()
+
+ tm.assert_series_equal(combined[::2], series[::2])
+ tm.assert_series_equal(combined[1::2], series_copy[1::2])
+
+ # mixed types
+ index = tm.makeStringIndex(20)
+ floats = Series(tm.randn(20), index=index)
+ strings = Series(tm.makeStringIndex(10), index=index[::2])
+
+ combined = strings.combine_first(floats)
+
+ tm.assert_series_equal(strings, combined.loc[index[::2]])
+ tm.assert_series_equal(floats[1::2].astype(object),
+ combined.loc[index[1::2]])
+
+ # corner case
+ s = Series([1., 2, 3], index=[0, 1, 2])
+ result = s.combine_first(Series([], index=[]))
+ assert_series_equal(s, result)
+
+ def test_update(self):
+ s = Series([1.5, nan, 3., 4., nan])
+ s2 = Series([nan, 3.5, nan, 5.])
+ s.update(s2)
+
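+        # alignment is by index label: s2[1]=3.5 fills the NaN at label 1,
+        # s2[3]=5.0 overwrites 4.0, and label 4 (absent from s2) stays NaN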
+ expected = Series([1.5, 3.5, 3., 5., np.nan])
+ assert_series_equal(s, expected)
+
+ # GH 3217
+ df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
+ df['c'] = np.nan
+
+ df['c'].update(Series(['foo'], index=[0]))
+ expected = DataFrame([[1, np.nan, 'foo'], [3, 2., np.nan]],
+ columns=['a', 'b', 'c'])
+ assert_frame_equal(df, expected)
+
+ @pytest.mark.parametrize('other, dtype, expected', [
+ # other is int
+ ([61, 63], 'int32', pd.Series([10, 61, 12], dtype='int32')),
+ ([61, 63], 'int64', pd.Series([10, 61, 12])),
+ ([61, 63], float, pd.Series([10., 61., 12.])),
+ ([61, 63], object, pd.Series([10, 61, 12], dtype=object)),
+ # other is float, but can be cast to int
+ ([61., 63.], 'int32', pd.Series([10, 61, 12], dtype='int32')),
+ ([61., 63.], 'int64', pd.Series([10, 61, 12])),
+ ([61., 63.], float, pd.Series([10., 61., 12.])),
+ ([61., 63.], object, pd.Series([10, 61., 12], dtype=object)),
+        # other is float, cannot be cast to int
+ ([61.1, 63.1], 'int32', pd.Series([10., 61.1, 12.])),
+ ([61.1, 63.1], 'int64', pd.Series([10., 61.1, 12.])),
+ ([61.1, 63.1], float, pd.Series([10., 61.1, 12.])),
+ ([61.1, 63.1], object, pd.Series([10, 61.1, 12], dtype=object)),
+ # other is object, cannot be cast
+ ([(61,), (63,)], 'int32', pd.Series([10, (61,), 12])),
+ ([(61,), (63,)], 'int64', pd.Series([10, (61,), 12])),
+ ([(61,), (63,)], float, pd.Series([10., (61,), 12.])),
+ ([(61,), (63,)], object, pd.Series([10, (61,), 12]))
+ ])
+ def test_update_dtypes(self, other, dtype, expected):
+
+ s = Series([10, 11, 12], dtype=dtype)
+ other = Series(other, index=[1, 3])
+ s.update(other)
+
+ assert_series_equal(s, expected)
+
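+    def test_update_returns_none_sketch(self):
+        # hedged sketch (editor's addition, not upstream): Series.update
+        # mutates in place and returns None, unlike most Series methods
+        s = Series([1, 2, 3])
+        assert s.update(Series([9], index=[0])) is None
+        assert s.tolist() == [9, 2, 3]
+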
+ def test_concat_empty_series_dtypes_roundtrips(self):
+
+ # round-tripping with self & like self
+ dtypes = map(np.dtype, ['float64', 'int8', 'uint8', 'bool', 'm8[ns]',
+ 'M8[ns]'])
+
+ for dtype in dtypes:
+ assert pd.concat([Series(dtype=dtype)]).dtype == dtype
+ assert pd.concat([Series(dtype=dtype),
+ Series(dtype=dtype)]).dtype == dtype
+
+ def int_result_type(dtype, dtype2):
+ typs = {dtype.kind, dtype2.kind}
+ if not len(typs - {'i', 'u', 'b'}) and (dtype.kind == 'i' or
+ dtype2.kind == 'i'):
+ return 'i'
+ elif not len(typs - {'u', 'b'}) and (dtype.kind == 'u' or
+ dtype2.kind == 'u'):
+ return 'u'
+ return None
+
+ def float_result_type(dtype, dtype2):
+ typs = {dtype.kind, dtype2.kind}
+ if not len(typs - {'f', 'i', 'u'}) and (dtype.kind == 'f' or
+ dtype2.kind == 'f'):
+ return 'f'
+ return None
+
+ def get_result_type(dtype, dtype2):
+ result = float_result_type(dtype, dtype2)
+ if result is not None:
+ return result
+ result = int_result_type(dtype, dtype2)
+ if result is not None:
+ return result
+ return 'O'
+
+ for dtype in dtypes:
+ for dtype2 in dtypes:
+ if dtype == dtype2:
+ continue
+
+ expected = get_result_type(dtype, dtype2)
+ result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)
+ ]).dtype
+ assert result.kind == expected
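+
+        # hedged spot-checks of the promotion rules above (editor's
+        # addition): bool+int -> int, bool+uint -> uint, int+float -> float
+        assert pd.concat([Series(dtype='bool'),
+                          Series(dtype='int64')]).dtype.kind == 'i'
+        assert pd.concat([Series(dtype='bool'),
+                          Series(dtype='uint8')]).dtype.kind == 'u'
+        assert pd.concat([Series(dtype='int8'),
+                          Series(dtype='float64')]).dtype.kind == 'f'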
+
+ def test_combine_first_dt_tz_values(self, tz_naive_fixture):
+ ser1 = pd.Series(pd.DatetimeIndex(['20150101', '20150102', '20150103'],
+ tz=tz_naive_fixture),
+ name='ser1')
+ ser2 = pd.Series(pd.DatetimeIndex(['20160514', '20160515', '20160516'],
+ tz=tz_naive_fixture),
+ index=[2, 3, 4], name='ser2')
+ result = ser1.combine_first(ser2)
+ exp_vals = pd.DatetimeIndex(['20150101', '20150102', '20150103',
+ '20160515', '20160516'],
+ tz=tz_naive_fixture)
+ exp = pd.Series(exp_vals, name='ser1')
+ assert_series_equal(exp, result)
+
+ def test_concat_empty_series_dtypes(self):
+
+ # booleans
+ assert pd.concat([Series(dtype=np.bool_),
+ Series(dtype=np.int32)]).dtype == np.int32
+ assert pd.concat([Series(dtype=np.bool_),
+ Series(dtype=np.float32)]).dtype == np.object_
+
+ # datetime-like
+ assert pd.concat([Series(dtype='m8[ns]'),
+ Series(dtype=np.bool)]).dtype == np.object_
+ assert pd.concat([Series(dtype='m8[ns]'),
+ Series(dtype=np.int64)]).dtype == np.object_
+ assert pd.concat([Series(dtype='M8[ns]'),
+ Series(dtype=np.bool)]).dtype == np.object_
+ assert pd.concat([Series(dtype='M8[ns]'),
+ Series(dtype=np.int64)]).dtype == np.object_
+ assert pd.concat([Series(dtype='M8[ns]'),
+ Series(dtype=np.bool_),
+ Series(dtype=np.int64)]).dtype == np.object_
+
+ # categorical
+ assert pd.concat([Series(dtype='category'),
+ Series(dtype='category')]).dtype == 'category'
+ # GH 18515
+ assert pd.concat([Series(np.array([]), dtype='category'),
+ Series(dtype='float64')]).dtype == 'float64'
+ assert pd.concat([Series(dtype='category'),
+ Series(dtype='object')]).dtype == 'object'
+
+ # sparse
+ # TODO: move?
+ result = pd.concat([Series(dtype='float64').to_sparse(), Series(
+ dtype='float64').to_sparse()])
+ assert result.dtype == 'Sparse[float64]'
+ assert result.ftype == 'float64:sparse'
+
+ result = pd.concat([Series(dtype='float64').to_sparse(), Series(
+ dtype='float64')])
+ # TODO: release-note: concat sparse dtype
+ expected = pd.core.sparse.api.SparseDtype(np.float64)
+ assert result.dtype == expected
+ assert result.ftype == 'float64:sparse'
+
+ result = pd.concat([Series(dtype='float64').to_sparse(), Series(
+ dtype='object')])
+ # TODO: release-note: concat sparse dtype
+ expected = pd.core.sparse.api.SparseDtype('object')
+ assert result.dtype == expected
+ assert result.ftype == 'object:sparse'
+
+ def test_combine_first_dt64(self):
+ from pandas.core.tools.datetimes import to_datetime
+ s0 = to_datetime(Series(["2010", np.NaN]))
+ s1 = to_datetime(Series([np.NaN, "2011"]))
+ rs = s0.combine_first(s1)
+ xp = to_datetime(Series(['2010', '2011']))
+ assert_series_equal(rs, xp)
+
+ s0 = to_datetime(Series(["2010", np.NaN]))
+ s1 = Series([np.NaN, "2011"])
+ rs = s0.combine_first(s1)
+ xp = Series([datetime(2010, 1, 1), '2011'])
+ assert_series_equal(rs, xp)
+
+
+class TestTimeseries(object):
+
+ def test_append_concat(self):
+ rng = date_range('5/8/2012 1:45', periods=10, freq='5T')
+ ts = Series(np.random.randn(len(rng)), rng)
+ df = DataFrame(np.random.randn(len(rng), 4), index=rng)
+
+ result = ts.append(ts)
+ result_df = df.append(df)
+ ex_index = DatetimeIndex(np.tile(rng.values, 2))
+ tm.assert_index_equal(result.index, ex_index)
+ tm.assert_index_equal(result_df.index, ex_index)
+
+ appended = rng.append(rng)
+ tm.assert_index_equal(appended, ex_index)
+
+ appended = rng.append([rng, rng])
+ ex_index = DatetimeIndex(np.tile(rng.values, 3))
+ tm.assert_index_equal(appended, ex_index)
+
+ # different index names
+ rng1 = rng.copy()
+ rng2 = rng.copy()
+ rng1.name = 'foo'
+ rng2.name = 'bar'
+ assert rng1.append(rng1).name == 'foo'
+ assert rng1.append(rng2).name is None
+
+ def test_append_concat_tz(self):
+ # see gh-2938
+ rng = date_range('5/8/2012 1:45', periods=10, freq='5T',
+ tz='US/Eastern')
+ rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T',
+ tz='US/Eastern')
+ rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T',
+ tz='US/Eastern')
+ ts = Series(np.random.randn(len(rng)), rng)
+ df = DataFrame(np.random.randn(len(rng), 4), index=rng)
+ ts2 = Series(np.random.randn(len(rng2)), rng2)
+ df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2)
+
+ result = ts.append(ts2)
+ result_df = df.append(df2)
+ tm.assert_index_equal(result.index, rng3)
+ tm.assert_index_equal(result_df.index, rng3)
+
+ appended = rng.append(rng2)
+ tm.assert_index_equal(appended, rng3)
+
+ def test_append_concat_tz_explicit_pytz(self):
+ # see gh-2938
+ from pytz import timezone as timezone
+
+ rng = date_range('5/8/2012 1:45', periods=10, freq='5T',
+ tz=timezone('US/Eastern'))
+ rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T',
+ tz=timezone('US/Eastern'))
+ rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T',
+ tz=timezone('US/Eastern'))
+ ts = Series(np.random.randn(len(rng)), rng)
+ df = DataFrame(np.random.randn(len(rng), 4), index=rng)
+ ts2 = Series(np.random.randn(len(rng2)), rng2)
+ df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2)
+
+ result = ts.append(ts2)
+ result_df = df.append(df2)
+ tm.assert_index_equal(result.index, rng3)
+ tm.assert_index_equal(result_df.index, rng3)
+
+ appended = rng.append(rng2)
+ tm.assert_index_equal(appended, rng3)
+
+ def test_append_concat_tz_dateutil(self):
+ # see gh-2938
+ rng = date_range('5/8/2012 1:45', periods=10, freq='5T',
+ tz='dateutil/US/Eastern')
+ rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T',
+ tz='dateutil/US/Eastern')
+ rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T',
+ tz='dateutil/US/Eastern')
+ ts = Series(np.random.randn(len(rng)), rng)
+ df = DataFrame(np.random.randn(len(rng), 4), index=rng)
+ ts2 = Series(np.random.randn(len(rng2)), rng2)
+ df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2)
+
+ result = ts.append(ts2)
+ result_df = df.append(df2)
+ tm.assert_index_equal(result.index, rng3)
+ tm.assert_index_equal(result_df.index, rng3)
+
+ appended = rng.append(rng2)
+ tm.assert_index_equal(appended, rng3)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_constructors.py b/contrib/python/pandas/py2/pandas/tests/series/test_constructors.py
new file mode 100644
index 00000000000..d92ca48751d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_constructors.py
@@ -0,0 +1,1266 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from collections import OrderedDict
+from datetime import datetime, timedelta
+
+import numpy as np
+from numpy import nan
+import numpy.ma as ma
+import pytest
+
+from pandas._libs import lib
+from pandas._libs.tslib import iNaT
+from pandas.compat import PY36, long, lrange, range, zip
+
+from pandas.core.dtypes.common import (
+ is_categorical_dtype, is_datetime64tz_dtype)
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Index, IntervalIndex, MultiIndex, NaT, Series,
+ Timestamp, date_range, isna, period_range, timedelta_range)
+from pandas.api.types import CategoricalDtype
+from pandas.core.arrays import period_array
+import pandas.util.testing as tm
+from pandas.util.testing import assert_series_equal
+
+
+class TestSeriesConstructors(object):
+
+ def test_invalid_dtype(self):
+ # GH15520
+ msg = 'not understood'
+ invalid_list = [pd.Timestamp, 'pd.Timestamp', list]
+ for dtype in invalid_list:
+ with pytest.raises(TypeError, match=msg):
+ Series([], name='time', dtype=dtype)
+
+ def test_scalar_conversion(self):
+
+        # passing a scalar produces a Series, not a scalar
+ scalar = Series(0.5)
+ assert not isinstance(scalar, float)
+
+ # Coercion
+ assert float(Series([1.])) == 1.0
+ assert int(Series([1.])) == 1
+ assert long(Series([1.])) == 1
+
+ def test_constructor(self, datetime_series, empty_series):
+ assert datetime_series.index.is_all_dates
+
+ # Pass in Series
+ derived = Series(datetime_series)
+ assert derived.index.is_all_dates
+
+ assert tm.equalContents(derived.index, datetime_series.index)
+ # Ensure new index is not created
+ assert id(datetime_series.index) == id(derived.index)
+
+ # Mixed type Series
+ mixed = Series(['hello', np.NaN], index=[0, 1])
+ assert mixed.dtype == np.object_
+ assert mixed[1] is np.NaN
+
+ assert not empty_series.index.is_all_dates
+ assert not Series({}).index.is_all_dates
+
+ # exception raised is of type Exception
+ with pytest.raises(Exception, match="Data must be 1-dimensional"):
+ Series(np.random.randn(3, 3), index=np.arange(3))
+
+ mixed.name = 'Series'
+ rs = Series(mixed).name
+ xp = 'Series'
+ assert rs == xp
+
+ # raise on MultiIndex GH4187
+ m = MultiIndex.from_arrays([[1, 2], [3, 4]])
+ msg = "initializing a Series from a MultiIndex is not supported"
+ with pytest.raises(NotImplementedError, match=msg):
+ Series(m)
+
+ @pytest.mark.parametrize('input_class', [list, dict, OrderedDict])
+ def test_constructor_empty(self, input_class):
+ empty = Series()
+ empty2 = Series(input_class())
+
+        # these are Index() and RangeIndex(), which don't compare type
+        # equal but are equal via .equals
+ assert_series_equal(empty, empty2, check_index_type=False)
+
+ # With explicit dtype:
+ empty = Series(dtype='float64')
+ empty2 = Series(input_class(), dtype='float64')
+ assert_series_equal(empty, empty2, check_index_type=False)
+
+ # GH 18515 : with dtype=category:
+ empty = Series(dtype='category')
+ empty2 = Series(input_class(), dtype='category')
+ assert_series_equal(empty, empty2, check_index_type=False)
+
+ if input_class is not list:
+ # With index:
+ empty = Series(index=lrange(10))
+ empty2 = Series(input_class(), index=lrange(10))
+ assert_series_equal(empty, empty2)
+
+ # With index and dtype float64:
+ empty = Series(np.nan, index=lrange(10))
+ empty2 = Series(input_class(), index=lrange(10), dtype='float64')
+ assert_series_equal(empty, empty2)
+
+ # GH 19853 : with empty string, index and dtype str
+ empty = Series('', dtype=str, index=range(3))
+ empty2 = Series('', index=range(3))
+ assert_series_equal(empty, empty2)
+
+ @pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
+ def test_constructor_nan(self, input_arg):
+ empty = Series(dtype='float64', index=lrange(10))
+ empty2 = Series(input_arg, index=lrange(10))
+
+ assert_series_equal(empty, empty2, check_index_type=False)
+
+ @pytest.mark.parametrize('dtype', [
+ 'f8', 'i8', 'M8[ns]', 'm8[ns]', 'category', 'object',
+ 'datetime64[ns, UTC]',
+ ])
+ @pytest.mark.parametrize('index', [None, pd.Index([])])
+ def test_constructor_dtype_only(self, dtype, index):
+ # GH-20865
+ result = pd.Series(dtype=dtype, index=index)
+ assert result.dtype == dtype
+ assert len(result) == 0
+
+ def test_constructor_no_data_index_order(self):
+ result = pd.Series(index=['b', 'a', 'c'])
+ assert result.index.tolist() == ['b', 'a', 'c']
+
+ def test_constructor_no_data_string_type(self):
+ # GH 22477
+ result = pd.Series(index=[1], dtype=str)
+ assert np.isnan(result.iloc[0])
+
+ @pytest.mark.parametrize('item', ['entry', 'ѐ', 13])
+ def test_constructor_string_element_string_type(self, item):
+ # GH 22477
+ result = pd.Series(item, index=[1], dtype=str)
+ assert result.iloc[0] == str(item)
+
+ def test_constructor_dtype_str_na_values(self, string_dtype):
+ # https://github.com/pandas-dev/pandas/issues/21083
+ ser = Series(['x', None], dtype=string_dtype)
+ result = ser.isna()
+ expected = Series([False, True])
+ tm.assert_series_equal(result, expected)
+ assert ser.iloc[1] is None
+
+ ser = Series(['x', np.nan], dtype=string_dtype)
+ assert np.isnan(ser.iloc[1])
+
+ def test_constructor_series(self):
+ index1 = ['d', 'b', 'a', 'c']
+ index2 = sorted(index1)
+ s1 = Series([4, 7, -5, 3], index=index1)
+ s2 = Series(s1, index=index2)
+
+ assert_series_equal(s2, s1.sort_index())
+
+ def test_constructor_iterable(self):
+ # GH 21987
+ class Iter():
+ def __iter__(self):
+ for i in range(10):
+ yield i
+
+ expected = Series(list(range(10)), dtype='int64')
+ result = Series(Iter(), dtype='int64')
+ assert_series_equal(result, expected)
+
+ def test_constructor_sequence(self):
+ # GH 21987
+ expected = Series(list(range(10)), dtype='int64')
+ result = Series(range(10), dtype='int64')
+ assert_series_equal(result, expected)
+
+ def test_constructor_single_str(self):
+ # GH 21987
+ expected = Series(['abc'])
+ result = Series('abc')
+ assert_series_equal(result, expected)
+
+ def test_constructor_list_like(self):
+
+        # make sure that we are coercing different list-likes to
+        # standard dtypes and not platform-specific ones
+ expected = Series([1, 2, 3], dtype='int64')
+ for obj in [[1, 2, 3], (1, 2, 3),
+ np.array([1, 2, 3], dtype='int64')]:
+ result = Series(obj, index=[0, 1, 2])
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('input_vals', [
+ ([1, 2]),
+ (['1', '2']),
+ (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
+ (list(pd.date_range('1/1/2011', periods=2, freq='H',
+ tz='US/Eastern'))),
+ ([pd.Interval(left=0, right=5)]),
+ ])
+ def test_constructor_list_str(self, input_vals, string_dtype):
+ # GH 16605
+ # Ensure that data elements from a list are converted to strings
+ # when dtype is str, 'str', or 'U'
+ result = Series(input_vals, dtype=string_dtype)
+ expected = Series(input_vals).astype(string_dtype)
+ assert_series_equal(result, expected)
+
+ def test_constructor_list_str_na(self, string_dtype):
+ result = Series([1.0, 2.0, np.nan], dtype=string_dtype)
+ expected = Series(['1.0', '2.0', np.nan], dtype=object)
+ assert_series_equal(result, expected)
+ assert np.isnan(result[2])
+
+ def test_constructor_generator(self):
+ gen = (i for i in range(10))
+
+ result = Series(gen)
+ exp = Series(lrange(10))
+ assert_series_equal(result, exp)
+
+ gen = (i for i in range(10))
+ result = Series(gen, index=lrange(10, 20))
+ exp.index = lrange(10, 20)
+ assert_series_equal(result, exp)
+
+ def test_constructor_map(self):
+ # GH8909
+ m = map(lambda x: x, range(10))
+
+ result = Series(m)
+ exp = Series(lrange(10))
+ assert_series_equal(result, exp)
+
+ m = map(lambda x: x, range(10))
+ result = Series(m, index=lrange(10, 20))
+ exp.index = lrange(10, 20)
+ assert_series_equal(result, exp)
+
+ def test_constructor_categorical(self):
+ cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
+ fastpath=True)
+ res = Series(cat)
+ tm.assert_categorical_equal(res.values, cat)
+
+ # can cast to a new dtype
+ result = Series(pd.Categorical([1, 2, 3]),
+ dtype='int64')
+ expected = pd.Series([1, 2, 3], dtype='int64')
+ tm.assert_series_equal(result, expected)
+
+ # GH12574
+ cat = Series(pd.Categorical([1, 2, 3]), dtype='category')
+ assert is_categorical_dtype(cat)
+ assert is_categorical_dtype(cat.dtype)
+ s = Series([1, 2, 3], dtype='category')
+ assert is_categorical_dtype(s)
+ assert is_categorical_dtype(s.dtype)
+
+ def test_constructor_categorical_with_coercion(self):
+ factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
+ # test basic creation / coercion of categoricals
+ s = Series(factor, name='A')
+ assert s.dtype == 'category'
+ assert len(s) == len(factor)
+ str(s.values)
+ str(s)
+
+ # in a frame
+ df = DataFrame({'A': factor})
+ result = df['A']
+ tm.assert_series_equal(result, s)
+ result = df.iloc[:, 0]
+ tm.assert_series_equal(result, s)
+ assert len(df) == len(factor)
+ str(df.values)
+ str(df)
+
+ df = DataFrame({'A': s})
+ result = df['A']
+ tm.assert_series_equal(result, s)
+ assert len(df) == len(factor)
+ str(df.values)
+ str(df)
+
+ # multiples
+ df = DataFrame({'A': s, 'B': s, 'C': 1})
+ result1 = df['A']
+ result2 = df['B']
+ tm.assert_series_equal(result1, s)
+ tm.assert_series_equal(result2, s, check_names=False)
+ assert result2.name == 'B'
+ assert len(df) == len(factor)
+ str(df.values)
+ str(df)
+
+ # GH8623
+ x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
+ [1, 'John P. Doe']],
+ columns=['person_id', 'person_name'])
+        x['person_name'] = Categorical(x.person_name)  # breaks transform
+
+ expected = x.iloc[0].person_name
+ result = x.person_name.iloc[0]
+ assert result == expected
+
+ result = x.person_name[0]
+ assert result == expected
+
+ result = x.person_name.loc[0]
+ assert result == expected
+
+ def test_constructor_categorical_dtype(self):
+ result = pd.Series(['a', 'b'],
+ dtype=CategoricalDtype(['a', 'b', 'c'],
+ ordered=True))
+ assert is_categorical_dtype(result) is True
+ tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c']))
+ assert result.cat.ordered
+
+ result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a']))
+ assert is_categorical_dtype(result)
+ tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a']))
+ assert result.cat.ordered is False
+
+ # GH 19565 - Check broadcasting of scalar with Categorical dtype
+ result = Series('a', index=[0, 1],
+ dtype=CategoricalDtype(['a', 'b'], ordered=True))
+ expected = Series(['a', 'a'], index=[0, 1],
+ dtype=CategoricalDtype(['a', 'b'], ordered=True))
+ tm.assert_series_equal(result, expected, check_categorical=True)
+
+ def test_categorical_sideeffects_free(self):
+ # Passing a categorical to a Series and then changing values in either
+ # the series or the categorical should not change the values in the
+ # other one, IF you specify copy!
+ cat = Categorical(["a", "b", "c", "a"])
+ s = Series(cat, copy=True)
+ assert s.cat is not cat
+ s.cat.categories = [1, 2, 3]
+ exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
+ exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
+ tm.assert_numpy_array_equal(s.__array__(), exp_s)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
+
+ # setting
+ s[0] = 2
+ exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(s.__array__(), exp_s2)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
+
+ # however, copy is False by default
+ # so this WILL change values
+ cat = Categorical(["a", "b", "c", "a"])
+ s = Series(cat)
+ assert s.values is cat
+ s.cat.categories = [1, 2, 3]
+ exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(s.__array__(), exp_s)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_s)
+
+ s[0] = 2
+ exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(s.__array__(), exp_s2)
+ tm.assert_numpy_array_equal(cat.__array__(), exp_s2)
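+
+        # hedged recap (editor's addition): Series(cat) shares the
+        # categorical object by default, while copy=True severs the link
+        cat2 = Categorical(["x", "y"])
+        assert Series(cat2).values is cat2
+        assert Series(cat2, copy=True).values is not cat2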
+
+ def test_unordered_compare_equal(self):
+ left = pd.Series(['a', 'b', 'c'],
+ dtype=CategoricalDtype(['a', 'b']))
+ right = pd.Series(pd.Categorical(['a', 'b', np.nan],
+ categories=['a', 'b']))
+ tm.assert_series_equal(left, right)
+
+ def test_constructor_maskedarray(self):
+ data = ma.masked_all((3, ), dtype=float)
+ result = Series(data)
+ expected = Series([nan, nan, nan])
+ assert_series_equal(result, expected)
+
+ data[0] = 0.0
+ data[2] = 2.0
+ index = ['a', 'b', 'c']
+ result = Series(data, index=index)
+ expected = Series([0.0, nan, 2.0], index=index)
+ assert_series_equal(result, expected)
+
+ data[1] = 1.0
+ result = Series(data, index=index)
+ expected = Series([0.0, 1.0, 2.0], index=index)
+ assert_series_equal(result, expected)
+
+ data = ma.masked_all((3, ), dtype=int)
+ result = Series(data)
+ expected = Series([nan, nan, nan], dtype=float)
+ assert_series_equal(result, expected)
+
+ data[0] = 0
+ data[2] = 2
+ index = ['a', 'b', 'c']
+ result = Series(data, index=index)
+ expected = Series([0, nan, 2], index=index, dtype=float)
+ assert_series_equal(result, expected)
+
+ data[1] = 1
+ result = Series(data, index=index)
+ expected = Series([0, 1, 2], index=index, dtype=int)
+ assert_series_equal(result, expected)
+
+ data = ma.masked_all((3, ), dtype=bool)
+ result = Series(data)
+ expected = Series([nan, nan, nan], dtype=object)
+ assert_series_equal(result, expected)
+
+ data[0] = True
+ data[2] = False
+ index = ['a', 'b', 'c']
+ result = Series(data, index=index)
+ expected = Series([True, nan, False], index=index, dtype=object)
+ assert_series_equal(result, expected)
+
+ data[1] = True
+ result = Series(data, index=index)
+ expected = Series([True, True, False], index=index, dtype=bool)
+ assert_series_equal(result, expected)
+
+ data = ma.masked_all((3, ), dtype='M8[ns]')
+ result = Series(data)
+ expected = Series([iNaT, iNaT, iNaT], dtype='M8[ns]')
+ assert_series_equal(result, expected)
+
+ data[0] = datetime(2001, 1, 1)
+ data[2] = datetime(2001, 1, 3)
+ index = ['a', 'b', 'c']
+ result = Series(data, index=index)
+ expected = Series([datetime(2001, 1, 1), iNaT,
+ datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
+ assert_series_equal(result, expected)
+
+ data[1] = datetime(2001, 1, 2)
+ result = Series(data, index=index)
+ expected = Series([datetime(2001, 1, 1), datetime(2001, 1, 2),
+ datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
+ assert_series_equal(result, expected)
+
+ def test_constructor_maskedarray_hardened(self):
+ # Check numpy masked arrays with hard masks -- from GH24574
+ data = ma.masked_all((3, ), dtype=float).harden_mask()
+ result = pd.Series(data)
+ expected = pd.Series([nan, nan, nan])
+ tm.assert_series_equal(result, expected)
+
+ def test_series_ctor_plus_datetimeindex(self):
+ rng = date_range('20090415', '20090519', freq='B')
+ data = {k: 1 for k in rng}
+
+ result = Series(data, index=rng)
+ assert result.index is rng
+
+ def test_constructor_default_index(self):
+ s = Series([0, 1, 2])
+ tm.assert_index_equal(s.index, pd.Index(np.arange(3)))
+
+ @pytest.mark.parametrize('input', [[1, 2, 3],
+ (1, 2, 3),
+ list(range(3)),
+ pd.Categorical(['a', 'b', 'a']),
+ (i for i in range(3)),
+ map(lambda x: x, range(3))])
+ def test_constructor_index_mismatch(self, input):
+ # GH 19342
+ # test that construction of a Series with an index of different length
+ # raises an error
+ msg = 'Length of passed values is 3, index implies 4'
+ with pytest.raises(ValueError, match=msg):
+ Series(input, index=np.arange(4))
+
+ def test_constructor_numpy_scalar(self):
+ # GH 19342
+ # construction with a numpy scalar
+ # should not raise
+ result = Series(np.array(100), index=np.arange(4), dtype='int64')
+ expected = Series(100, index=np.arange(4), dtype='int64')
+ tm.assert_series_equal(result, expected)
+
+ def test_constructor_broadcast_list(self):
+ # GH 19342
+ # construction with single-element container and index
+ # should raise
+ msg = "Length of passed values is 1, index implies 3"
+ with pytest.raises(ValueError, match=msg):
+ Series(['foo'], index=['a', 'b', 'c'])
+
+ def test_constructor_corner(self):
+ df = tm.makeTimeDataFrame()
+ objs = [df, df]
+ s = Series(objs, index=[0, 1])
+ assert isinstance(s, Series)
+
+ def test_constructor_sanitize(self):
+ s = Series(np.array([1., 1., 8.]), dtype='i8')
+ assert s.dtype == np.dtype('i8')
+
+ s = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8')
+ assert s.dtype == np.dtype('f8')
+
+ def test_constructor_copy(self):
+ # GH15125
+ # test dtype parameter has no side effects on copy=True
+ for data in [[1.], np.array([1.])]:
+ x = Series(data)
+ y = pd.Series(x, copy=True, dtype=float)
+
+ # copy=True maintains original data in Series
+ tm.assert_series_equal(x, y)
+
+        # changes to the original do not affect the copy
+ x[0] = 2.
+ assert not x.equals(y)
+ assert x[0] == 2.
+ assert y[0] == 1.
+
+ @pytest.mark.parametrize(
+ "index",
+ [
+ pd.date_range('20170101', periods=3, tz='US/Eastern'),
+ pd.date_range('20170101', periods=3),
+ pd.timedelta_range('1 day', periods=3),
+ pd.period_range('2012Q1', periods=3, freq='Q'),
+ pd.Index(list('abc')),
+ pd.Int64Index([1, 2, 3]),
+ pd.RangeIndex(0, 3)],
+ ids=lambda x: type(x).__name__)
+ def test_constructor_limit_copies(self, index):
+ # GH 17449
+ # limit copies of input
+ s = pd.Series(index)
+
+ # we make 1 copy; this is just a smoke test here
+ assert s._data.blocks[0].values is not index
+
+ def test_constructor_pass_none(self):
+ s = Series(None, index=lrange(5))
+ assert s.dtype == np.float64
+
+ s = Series(None, index=lrange(5), dtype=object)
+ assert s.dtype == np.object_
+
+ # GH 7431
+ # inference on the index
+ s = Series(index=np.array([None]))
+ expected = Series(index=Index([None]))
+ assert_series_equal(s, expected)
+
+ def test_constructor_pass_nan_nat(self):
+ # GH 13467
+ exp = Series([np.nan, np.nan], dtype=np.float64)
+ assert exp.dtype == np.float64
+ tm.assert_series_equal(Series([np.nan, np.nan]), exp)
+ tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp)
+
+ exp = Series([pd.NaT, pd.NaT])
+ assert exp.dtype == 'datetime64[ns]'
+ tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp)
+ tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp)
+
+ tm.assert_series_equal(Series([pd.NaT, np.nan]), exp)
+ tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp)
+
+ tm.assert_series_equal(Series([np.nan, pd.NaT]), exp)
+ tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp)
+
+ def test_constructor_cast(self):
+ msg = "could not convert string to float"
+ with pytest.raises(ValueError, match=msg):
+ Series(["a", "b", "c"], dtype=float)
+
+ def test_constructor_unsigned_dtype_overflow(self, uint_dtype):
+ # see gh-15832
+ msg = 'Trying to coerce negative values to unsigned integers'
+ with pytest.raises(OverflowError, match=msg):
+ Series([-1], dtype=uint_dtype)
+
+ def test_constructor_coerce_float_fail(self, any_int_dtype):
+ # see gh-15832
+ msg = "Trying to coerce float values to integers"
+ with pytest.raises(ValueError, match=msg):
+ Series([1, 2, 3.5], dtype=any_int_dtype)
+
+ def test_constructor_coerce_float_valid(self, float_dtype):
+ s = Series([1, 2, 3.5], dtype=float_dtype)
+ expected = Series([1, 2, 3.5]).astype(float_dtype)
+ assert_series_equal(s, expected)
+
+ def test_constructor_dtype_no_cast(self):
+ # see gh-1572
+ s = Series([1, 2, 3])
+ s2 = Series(s, dtype=np.int64)
+
+ s2[1] = 5
+ assert s[1] == 5
+
+ def test_constructor_datelike_coercion(self):
+
+ # GH 9477
+        # incorrect inference on datetimelike-looking data when object
+        # dtype is specified
+ s = Series([Timestamp('20130101'), 'NOV'], dtype=object)
+ assert s.iloc[0] == Timestamp('20130101')
+ assert s.iloc[1] == 'NOV'
+ assert s.dtype == object
+
+        # the dtype was being reset on the slicing and re-inferred to
+        # datetime even though the blocks are mixed
+ belly = '216 3T19'.split()
+ wing1 = '2T15 4H19'.split()
+ wing2 = '416 4T20'.split()
+ mat = pd.to_datetime('2016-01-22 2019-09-07'.split())
+ df = pd.DataFrame(
+ {'wing1': wing1,
+ 'wing2': wing2,
+ 'mat': mat}, index=belly)
+
+ result = df.loc['3T19']
+ assert result.dtype == object
+ result = df.loc['216']
+ assert result.dtype == object
+
+ def test_constructor_datetimes_with_nulls(self):
+ # gh-15869
+ for arr in [np.array([None, None, None, None,
+ datetime.now(), None]),
+ np.array([None, None, datetime.now(), None])]:
+ result = Series(arr)
+ assert result.dtype == 'M8[ns]'
+
+ def test_constructor_dtype_datetime64(self):
+
+ s = Series(iNaT, dtype='M8[ns]', index=lrange(5))
+ assert isna(s).all()
+
+        # in theory this should be all nulls, but since we are not
+        # specifying a dtype, the result is ambiguous
+ s = Series(iNaT, index=lrange(5))
+ assert not isna(s).all()
+
+ s = Series(nan, dtype='M8[ns]', index=lrange(5))
+ assert isna(s).all()
+
+ s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]')
+ assert isna(s[1])
+ assert s.dtype == 'M8[ns]'
+
+ s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]')
+ assert isna(s[1])
+ assert s.dtype == 'M8[ns]'
+
+ # GH3416
+ dates = [
+ np.datetime64(datetime(2013, 1, 1)),
+ np.datetime64(datetime(2013, 1, 2)),
+ np.datetime64(datetime(2013, 1, 3)),
+ ]
+
+ s = Series(dates)
+ assert s.dtype == 'M8[ns]'
+
+ s.iloc[0] = np.nan
+ assert s.dtype == 'M8[ns]'
+
+ # GH3414 related
+ # msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to"
+ # r" \[int32\]")
+ # with pytest.raises(TypeError, match=msg):
+ # Series(Series(dates).astype('int') / 1000000, dtype='M8[ms]')
+        # note: the callable must take no arguments; with `lambda x: ...`
+        # the TypeError would come from the call itself, not the constructor
+        pytest.raises(TypeError, lambda: Series(
+            Series(dates).astype('int') / 1000000, dtype='M8[ms]'))
+
+ msg = (r"The 'datetime64' dtype has no unit\. Please pass in"
+ r" 'datetime64\[ns\]' instead\.")
+ with pytest.raises(ValueError, match=msg):
+ Series(dates, dtype='datetime64')
+
+        # invalid (out-of-bounds) dates can be held as object
+ result = Series([datetime(2, 1, 1)])
+ assert result[0] == datetime(2, 1, 1, 0, 0)
+
+ result = Series([datetime(3000, 1, 1)])
+ assert result[0] == datetime(3000, 1, 1, 0, 0)
+
+ # don't mix types
+ result = Series([Timestamp('20130101'), 1], index=['a', 'b'])
+ assert result['a'] == Timestamp('20130101')
+ assert result['b'] == 1
+
+ # GH6529
+ # coerce datetime64 non-ns properly
+ dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M')
+ values2 = dates.view(np.ndarray).astype('datetime64[ns]')
+ expected = Series(values2, index=dates)
+
+ for dtype in ['s', 'D', 'ms', 'us', 'ns']:
+ values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
+ result = Series(values1, dates)
+ assert_series_equal(result, expected)
+
+ # GH 13876
+        # coerce non-ns values to object properly
+ expected = Series(values2, index=dates, dtype=object)
+ for dtype in ['s', 'D', 'ms', 'us', 'ns']:
+ values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
+ result = Series(values1, index=dates, dtype=object)
+ assert_series_equal(result, expected)
+
+ # leave datetime.date alone
+ dates2 = np.array([d.date() for d in dates.to_pydatetime()],
+ dtype=object)
+ series1 = Series(dates2, dates)
+ tm.assert_numpy_array_equal(series1.values, dates2)
+ assert series1.dtype == object
+
+ # these will correctly infer a datetime
+ s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001'])
+ assert s.dtype == 'datetime64[ns]'
+ s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001'])
+ assert s.dtype == 'datetime64[ns]'
+ s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001'])
+ assert s.dtype == 'datetime64[ns]'
+ s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001'])
+ assert s.dtype == 'datetime64[ns]'
+
+ # tz-aware (UTC and other tz's)
+ # GH 8411
+ dr = date_range('20130101', periods=3)
+ assert Series(dr).iloc[0].tz is None
+ dr = date_range('20130101', periods=3, tz='UTC')
+ assert str(Series(dr).iloc[0].tz) == 'UTC'
+ dr = date_range('20130101', periods=3, tz='US/Eastern')
+ assert str(Series(dr).iloc[0].tz) == 'US/Eastern'
+
+ # non-convertible
+ s = Series([1479596223000, -1479590, pd.NaT])
+ assert s.dtype == 'object'
+ assert s[2] is pd.NaT
+ assert 'NaT' in str(s)
+
+ # if we passed a NaT it remains
+ s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT])
+ assert s.dtype == 'object'
+ assert s[2] is pd.NaT
+ assert 'NaT' in str(s)
+
+ # if we passed a nan it remains
+ s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan])
+ assert s.dtype == 'object'
+ assert s[2] is np.nan
+ assert 'NaN' in str(s)
+
+ def test_constructor_with_datetime_tz(self):
+
+ # 8260
+ # support datetime64 with tz
+
+ dr = date_range('20130101', periods=3, tz='US/Eastern')
+ s = Series(dr)
+ assert s.dtype.name == 'datetime64[ns, US/Eastern]'
+ assert s.dtype == 'datetime64[ns, US/Eastern]'
+ assert is_datetime64tz_dtype(s.dtype)
+ assert 'datetime64[ns, US/Eastern]' in str(s)
+
+ # export
+ result = s.values
+ assert isinstance(result, np.ndarray)
+ assert result.dtype == 'datetime64[ns]'
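+        # .values on a tz-aware Series yields tz-naive datetime64[ns]
+        # data in UTC, hence the localize/convert round-trip below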
+
+ exp = pd.DatetimeIndex(result)
+ exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz)
+ tm.assert_index_equal(dr, exp)
+
+ # indexing
+ result = s.iloc[0]
+ assert result == Timestamp('2013-01-01 00:00:00-0500',
+ tz='US/Eastern', freq='D')
+ result = s[0]
+ assert result == Timestamp('2013-01-01 00:00:00-0500',
+ tz='US/Eastern', freq='D')
+
+ result = s[Series([True, True, False], index=s.index)]
+ assert_series_equal(result, s[0:2])
+
+ result = s.iloc[0:1]
+ assert_series_equal(result, Series(dr[0:1]))
+
+ # concat
+ result = pd.concat([s.iloc[0:1], s.iloc[1:]])
+ assert_series_equal(result, s)
+
+ # short str
+ assert 'datetime64[ns, US/Eastern]' in str(s)
+
+ # formatting with NaT
+ result = s.shift()
+ assert 'datetime64[ns, US/Eastern]' in str(result)
+ assert 'NaT' in str(result)
+
+ # long str
+ t = Series(date_range('20130101', periods=1000, tz='US/Eastern'))
+ assert 'datetime64[ns, US/Eastern]' in str(t)
+
+ result = pd.DatetimeIndex(s, freq='infer')
+ tm.assert_index_equal(result, dr)
+
+ # inference
+ s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
+ pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')])
+ assert s.dtype == 'datetime64[ns, US/Pacific]'
+ assert lib.infer_dtype(s, skipna=True) == 'datetime64'
+
+ s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
+ pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')])
+ assert s.dtype == 'object'
+ assert lib.infer_dtype(s, skipna=True) == 'datetime'
+
+ # with all NaT
+ s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
+ expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern'))
+ assert_series_equal(s, expected)
+
+ @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
+ @pytest.mark.parametrize("dtype", ["M8", "m8"])
+ @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
+ def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit):
+ # tests all units
+ # gh-19223
+ dtype = "{}[{}]".format(dtype, unit)
+ arr = np.array([1, 2, 3], dtype=arr_dtype)
+ s = Series(arr)
+ result = s.astype(dtype)
+ expected = Series(arr.astype(dtype))
+
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('arg',
+ ['2013-01-01 00:00:00', pd.NaT, np.nan, None])
+ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg):
+ # GH 17415: With naive string
+ result = Series([arg], dtype='datetime64[ns, CET]')
+ expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET')
+ assert_series_equal(result, expected)
+
+ def test_construction_interval(self):
+ # construction from interval & array of intervals
+ index = IntervalIndex.from_breaks(np.arange(3), closed='right')
+ result = Series(index)
+ repr(result)
+ str(result)
+ tm.assert_index_equal(Index(result.values), index)
+
+ result = Series(index.values)
+ tm.assert_index_equal(Index(result.values), index)
+
+ def test_construction_consistency(self):
+
+ # make sure that we are not re-localizing upon construction
+ # GH 14928
+ s = Series(pd.date_range('20130101', periods=3, tz='US/Eastern'))
+
+ result = Series(s, dtype=s.dtype)
+ tm.assert_series_equal(result, s)
+
+ result = Series(s.dt.tz_convert('UTC'), dtype=s.dtype)
+ tm.assert_series_equal(result, s)
+
+ result = Series(s.values, dtype=s.dtype)
+ tm.assert_series_equal(result, s)
+
+ def test_constructor_infer_period(self):
+ data = [pd.Period('2000', 'D'), pd.Period('2001', 'D'), None]
+ result = pd.Series(data)
+ expected = pd.Series(period_array(data))
+ tm.assert_series_equal(result, expected)
+ assert result.dtype == 'Period[D]'
+
+ data = np.asarray(data, dtype=object)
+ tm.assert_series_equal(result, expected)
+ assert result.dtype == 'Period[D]'
+
+ def test_constructor_period_incompatible_frequency(self):
+ data = [pd.Period('2000', 'D'), pd.Period('2001', 'A')]
+ result = pd.Series(data)
+ assert result.dtype == object
+ assert result.tolist() == data
+
+ def test_constructor_periodindex(self):
+ # GH7932
+ # converting a PeriodIndex when put in a Series
+
+ pi = period_range('20130101', periods=5, freq='D')
+ s = Series(pi)
+ assert s.dtype == 'Period[D]'
+ expected = Series(pi.astype(object))
+ assert_series_equal(s, expected)
+
+ def test_constructor_dict(self):
+ d = {'a': 0., 'b': 1., 'c': 2.}
+ result = Series(d, index=['b', 'c', 'd', 'a'])
+ expected = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a'])
+ assert_series_equal(result, expected)
+
+ pidx = tm.makePeriodIndex(100)
+ d = {pidx[0]: 0, pidx[1]: 1}
+ result = Series(d, index=pidx)
+ expected = Series(np.nan, pidx)
+ expected.iloc[0] = 0
+ expected.iloc[1] = 1
+ assert_series_equal(result, expected)
+
+ def test_constructor_dict_order(self):
+ # GH19018
+        # initialization ordering: by insertion order if python >= 3.6,
+        # else by sorted key order
+ d = {'b': 1, 'a': 0, 'c': 2}
+ result = Series(d)
+ if PY36:
+ expected = Series([1, 0, 2], index=list('bac'))
+ else:
+ expected = Series([0, 1, 2], index=list('abc'))
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
+ def test_constructor_dict_nan_key(self, value):
+ # GH 18480
+ d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'}
+ result = Series(d).sort_values()
+ expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4])
+ assert_series_equal(result, expected)
+
+ # MultiIndex:
+ d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'}
+ result = Series(d).sort_values()
+ expected = Series(['a', 'b', 'c'],
+ index=Index([(1, 1), (2, np.nan), (3, value)]))
+ assert_series_equal(result, expected)
+
+ def test_constructor_dict_datetime64_index(self):
+ # GH 9456
+
+ dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15']
+ values = [42544017.198965244, 1234565, 40512335.181958228, -1]
+
+ def create_data(constructor):
+ return dict(zip((constructor(x) for x in dates_as_str), values))
+
+ data_datetime64 = create_data(np.datetime64)
+ data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d'))
+ data_Timestamp = create_data(Timestamp)
+
+ expected = Series(values, (Timestamp(x) for x in dates_as_str))
+
+ result_datetime64 = Series(data_datetime64)
+ result_datetime = Series(data_datetime)
+ result_Timestamp = Series(data_Timestamp)
+
+ assert_series_equal(result_datetime64, expected)
+ assert_series_equal(result_datetime, expected)
+ assert_series_equal(result_Timestamp, expected)
+
+ def test_constructor_list_of_tuples(self):
+ data = [(1, 1), (2, 2), (2, 3)]
+ s = Series(data)
+ assert list(s) == data
+
+ def test_constructor_tuple_of_tuples(self):
+ data = ((1, 1), (2, 2), (2, 3))
+ s = Series(data)
+ assert tuple(s) == data
+
+ def test_constructor_dict_of_tuples(self):
+ data = {(1, 2): 3,
+ (None, 5): 6}
+ result = Series(data).sort_values()
+ expected = Series([3, 6],
+ index=MultiIndex.from_tuples([(1, 2), (None, 5)]))
+ tm.assert_series_equal(result, expected)
+
+ def test_constructor_set(self):
+ values = {1, 2, 3, 4, 5}
+ with pytest.raises(TypeError, match="'set' type is unordered"):
+ Series(values)
+ values = frozenset(values)
+ with pytest.raises(TypeError, match="'frozenset' type is unordered"):
+ Series(values)
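+
+        # hedged usage note (editor's addition): materialize an explicit
+        # ordering first when a Series is needed from set contents
+        result = Series(sorted(values))
+        assert result.tolist() == [1, 2, 3, 4, 5]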
+
+ # https://github.com/pandas-dev/pandas/issues/22698
+ @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
+ def test_fromDict(self):
+ data = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
+
+ series = Series(data)
+ assert tm.is_sorted(series.index)
+
+ data = {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()}
+ series = Series(data)
+ assert series.dtype == np.object_
+
+ data = {'a': 0, 'b': '1', 'c': '2', 'd': '3'}
+ series = Series(data)
+ assert series.dtype == np.object_
+
+ data = {'a': '0', 'b': '1'}
+ series = Series(data, dtype=float)
+ assert series.dtype == np.float64
+
+ def test_fromValue(self, datetime_series):
+
+ nans = Series(np.NaN, index=datetime_series.index)
+ assert nans.dtype == np.float_
+ assert len(nans) == len(datetime_series)
+
+ strings = Series('foo', index=datetime_series.index)
+ assert strings.dtype == np.object_
+ assert len(strings) == len(datetime_series)
+
+ d = datetime.now()
+ dates = Series(d, index=datetime_series.index)
+ assert dates.dtype == 'M8[ns]'
+ assert len(dates) == len(datetime_series)
+
+ # GH12336
+ # Test construction of categorical series from value
+ categorical = Series(0, index=datetime_series.index, dtype="category")
+ expected = Series(0, index=datetime_series.index).astype("category")
+ assert categorical.dtype == 'category'
+ assert len(categorical) == len(datetime_series)
+ tm.assert_series_equal(categorical, expected)
+
+ def test_constructor_dtype_timedelta64(self):
+
+ # basic
+ td = Series([timedelta(days=i) for i in range(3)])
+ assert td.dtype == 'timedelta64[ns]'
+
+ td = Series([timedelta(days=1)])
+ assert td.dtype == 'timedelta64[ns]'
+
+ td = Series([timedelta(days=1), timedelta(days=2), np.timedelta64(
+ 1, 's')])
+
+ assert td.dtype == 'timedelta64[ns]'
+
+ # mixed with NaT
+ td = Series([timedelta(days=1), NaT], dtype='m8[ns]')
+ assert td.dtype == 'timedelta64[ns]'
+
+ td = Series([timedelta(days=1), np.nan], dtype='m8[ns]')
+ assert td.dtype == 'timedelta64[ns]'
+
+ td = Series([np.timedelta64(300000000), pd.NaT], dtype='m8[ns]')
+ assert td.dtype == 'timedelta64[ns]'
+
+ # improved inference
+ # GH5689
+ td = Series([np.timedelta64(300000000), NaT])
+ assert td.dtype == 'timedelta64[ns]'
+
+        # because iNaT is an int, it is not coerced to timedelta
+ td = Series([np.timedelta64(300000000), iNaT])
+ assert td.dtype == 'object'
+
+ td = Series([np.timedelta64(300000000), np.nan])
+ assert td.dtype == 'timedelta64[ns]'
+
+ td = Series([pd.NaT, np.timedelta64(300000000)])
+ assert td.dtype == 'timedelta64[ns]'
+
+ td = Series([np.timedelta64(1, 's')])
+ assert td.dtype == 'timedelta64[ns]'
+
+ # these are frequency conversion astypes
+ # for t in ['s', 'D', 'us', 'ms']:
+ # pytest.raises(TypeError, td.astype, 'm8[%s]' % t)
+
+ # valid astype
+ td.astype('int64')
+
+ # invalid casting
+ msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
+ r" \[int32\]")
+ with pytest.raises(TypeError, match=msg):
+ td.astype('int32')
+
+ # this is an invalid casting
+ msg = "Could not convert object to NumPy timedelta"
+ with pytest.raises(ValueError, match=msg):
+ Series([timedelta(days=1), 'foo'], dtype='m8[ns]')
+
+ # leave as object here
+ td = Series([timedelta(days=i) for i in range(3)] + ['foo'])
+ assert td.dtype == 'object'
+
+ # these will correctly infer a timedelta
+ s = Series([None, pd.NaT, '1 Day'])
+ assert s.dtype == 'timedelta64[ns]'
+ s = Series([np.nan, pd.NaT, '1 Day'])
+ assert s.dtype == 'timedelta64[ns]'
+ s = Series([pd.NaT, None, '1 Day'])
+ assert s.dtype == 'timedelta64[ns]'
+ s = Series([pd.NaT, np.nan, '1 Day'])
+ assert s.dtype == 'timedelta64[ns]'
+
+ # GH 16406
+ def test_constructor_mixed_tz(self):
+ s = Series([Timestamp('20130101'),
+ Timestamp('20130101', tz='US/Eastern')])
+ expected = Series([Timestamp('20130101'),
+ Timestamp('20130101', tz='US/Eastern')],
+ dtype='object')
+ assert_series_equal(s, expected)
+
+ def test_NaT_scalar(self):
+ series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')
+
+ val = series[3]
+ assert isna(val)
+
+ series[2] = val
+ assert isna(series[2])
+
+ def test_NaT_cast(self):
+ # GH10747
+ result = Series([np.nan]).astype('M8[ns]')
+ expected = Series([NaT])
+ assert_series_equal(result, expected)
+
+ def test_constructor_name_hashable(self):
+ for n in [777, 777., 'name', datetime(2001, 11, 11), (1, ), u"\u05D0"]:
+ for data in [[1, 2, 3], np.ones(3), {'a': 0, 'b': 1}]:
+ s = Series(data, name=n)
+ assert s.name == n
+
+ def test_constructor_name_unhashable(self):
+ msg = r"Series\.name must be a hashable type"
+ for n in [['name_list'], np.ones(2), {1: 2}]:
+ for data in [['name_list'], np.ones(2), {1: 2}]:
+ with pytest.raises(TypeError, match=msg):
+ Series(data, name=n)
+
+ def test_auto_conversion(self):
+ series = Series(list(date_range('1/1/2000', periods=10)))
+ assert series.dtype == 'M8[ns]'
+
+ def test_convert_non_ns(self):
+ # convert from a numpy array of non-ns timedelta64
+ arr = np.array([1, 2, 3], dtype='timedelta64[s]')
+ s = Series(arr)
+ expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s'))
+ assert_series_equal(s, expected)
+
+        # convert from a numpy array of non-ns datetime64
+        # note that numpy creates datetime64 values in LOCAL time;
+        # this seems to work for M8[D], but not for M8[s]
+
+ s = Series(np.array(['2013-01-01', '2013-01-02',
+ '2013-01-03'], dtype='datetime64[D]'))
+ assert_series_equal(s, Series(date_range('20130101', periods=3,
+ freq='D')))
+
+        # s = Series(np.array(['2013-01-01 00:00:01', '2013-01-01 00:00:02',
+        #                      '2013-01-01 00:00:03'], dtype='datetime64[s]'))
+        # assert_series_equal(s, Series(date_range('20130101 00:00:01',
+        #                                          periods=3, freq='s')))
+
+ @pytest.mark.parametrize(
+ "index",
+ [
+ date_range('1/1/2000', periods=10),
+ timedelta_range('1 day', periods=10),
+ period_range('2000-Q1', periods=10, freq='Q')],
+ ids=lambda x: type(x).__name__)
+ def test_constructor_cant_cast_datetimelike(self, index):
+
+ # floats are not ok
+ msg = "Cannot cast {}.*? to ".format(
+ # strip Index to convert PeriodIndex -> Period
+ # We don't care whether the error message says
+ # PeriodIndex or PeriodArray
+ type(index).__name__.rstrip("Index")
+ )
+ with pytest.raises(TypeError, match=msg):
+ Series(index, dtype=float)
+
+ # ints are ok
+ # we test with np.int64 to get similar results on
+ # windows / 32-bit platforms
+ result = Series(index, dtype=np.int64)
+ expected = Series(index.astype(np.int64))
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "index",
+ [
+ date_range('1/1/2000', periods=10),
+ timedelta_range('1 day', periods=10),
+ period_range('2000-Q1', periods=10, freq='Q')],
+ ids=lambda x: type(x).__name__)
+ def test_constructor_cast_object(self, index):
+ s = Series(index, dtype=object)
+ exp = Series(index).astype(object)
+ tm.assert_series_equal(s, exp)
+
+ s = Series(pd.Index(index, dtype=object), dtype=object)
+ exp = Series(index).astype(object)
+ tm.assert_series_equal(s, exp)
+
+ s = Series(index.astype(object), dtype=object)
+ exp = Series(index).astype(object)
+ tm.assert_series_equal(s, exp)
+
+ @pytest.mark.parametrize("dtype", [
+ np.datetime64,
+ np.timedelta64,
+ ])
+ def test_constructor_generic_timestamp_no_frequency(self, dtype):
+ # see gh-15524, gh-15987
+ msg = "dtype has no unit. Please pass in"
+
+ with pytest.raises(ValueError, match=msg):
+ Series([], dtype=dtype)
+
+ @pytest.mark.parametrize("dtype,msg", [
+ ("m8[ps]", "cannot convert timedeltalike"),
+ ("M8[ps]", "cannot convert datetimelike"),
+ ])
+ def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg):
+ # see gh-15524, gh-15987
+
+ with pytest.raises(TypeError, match=msg):
+ Series([], dtype=dtype)
+
+ @pytest.mark.parametrize('dtype', [None, 'uint8', 'category'])
+ def test_constructor_range_dtype(self, dtype):
+ # GH 16804
+ expected = Series([0, 1, 2, 3, 4], dtype=dtype or 'int64')
+ result = Series(range(5), dtype=dtype)
+ tm.assert_series_equal(result, expected)
+
+ def test_constructor_tz_mixed_data(self):
+ # GH 13051
+ dt_list = [Timestamp('2016-05-01 02:03:37'),
+ Timestamp('2016-04-30 19:03:37-0700', tz='US/Pacific')]
+ result = Series(dt_list)
+ expected = Series(dt_list, dtype=object)
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_datetime_values.py b/contrib/python/pandas/py2/pandas/tests/series/test_datetime_values.py
new file mode 100644
index 00000000000..a916cf30065
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_datetime_values.py
@@ -0,0 +1,556 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+import calendar
+from datetime import date, datetime, time
+import locale
+import unicodedata
+
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs.tslibs.timezones import maybe_get_tz
+
+from pandas.core.dtypes.common import is_integer_dtype, is_list_like
+
+import pandas as pd
+from pandas import (
+ DataFrame, DatetimeIndex, Index, PeriodIndex, Series, TimedeltaIndex,
+ bdate_range, compat, date_range, period_range, timedelta_range)
+from pandas.core.arrays import PeriodArray
+import pandas.core.common as com
+import pandas.util.testing as tm
+from pandas.util.testing import assert_series_equal
+
+
+class TestSeriesDatetimeValues(object):
+
+ def test_dt_namespace_accessor(self):
+
+ # GH 7207, 11128
+ # test .dt namespace accessor
+
+ ok_for_period = PeriodArray._datetimelike_ops
+ ok_for_period_methods = ['strftime', 'to_timestamp', 'asfreq']
+ ok_for_dt = DatetimeIndex._datetimelike_ops
+ ok_for_dt_methods = ['to_period', 'to_pydatetime', 'tz_localize',
+ 'tz_convert', 'normalize', 'strftime', 'round',
+ 'floor', 'ceil', 'day_name', 'month_name']
+ ok_for_td = TimedeltaIndex._datetimelike_ops
+ ok_for_td_methods = ['components', 'to_pytimedelta', 'total_seconds',
+ 'round', 'floor', 'ceil']
+
+ def get_expected(s, name):
+ result = getattr(Index(s._values), prop)
+ if isinstance(result, np.ndarray):
+ if is_integer_dtype(result):
+ result = result.astype('int64')
+ elif not is_list_like(result):
+ return result
+ return Series(result, index=s.index, name=s.name)
+
+ def compare(s, name):
+ a = getattr(s.dt, prop)
+ b = get_expected(s, prop)
+ if not (is_list_like(a) and is_list_like(b)):
+ assert a == b
+ else:
+ tm.assert_series_equal(a, b)
+
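+        # the two helpers above mirror each .dt attribute against the same
+        # attribute computed on a bare Index of the underlying values,
+        # wrapped back into a Series for comparison
+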
+ # datetimeindex
+ cases = [Series(date_range('20130101', periods=5), name='xxx'),
+ Series(date_range('20130101', periods=5, freq='s'),
+ name='xxx'),
+ Series(date_range('20130101 00:00:00', periods=5, freq='ms'),
+ name='xxx')]
+ for s in cases:
+ for prop in ok_for_dt:
+ # we test freq below
+ if prop != 'freq':
+ compare(s, prop)
+
+ for prop in ok_for_dt_methods:
+ getattr(s.dt, prop)
+
+ result = s.dt.to_pydatetime()
+ assert isinstance(result, np.ndarray)
+ assert result.dtype == object
+
+ result = s.dt.tz_localize('US/Eastern')
+ exp_values = DatetimeIndex(s.values).tz_localize('US/Eastern')
+ expected = Series(exp_values, index=s.index, name='xxx')
+ tm.assert_series_equal(result, expected)
+
+ tz_result = result.dt.tz
+ assert str(tz_result) == 'US/Eastern'
+ freq_result = s.dt.freq
+ assert freq_result == DatetimeIndex(s.values, freq='infer').freq
+
+ # let's localize, then convert
+ result = s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
+ exp_values = (DatetimeIndex(s.values).tz_localize('UTC')
+ .tz_convert('US/Eastern'))
+ expected = Series(exp_values, index=s.index, name='xxx')
+ tm.assert_series_equal(result, expected)
+
+ # datetimeindex with tz
+ s = Series(date_range('20130101', periods=5, tz='US/Eastern'),
+ name='xxx')
+ for prop in ok_for_dt:
+
+ # we test freq below
+ if prop != 'freq':
+ compare(s, prop)
+
+ for prop in ok_for_dt_methods:
+ getattr(s.dt, prop)
+
+ result = s.dt.to_pydatetime()
+ assert isinstance(result, np.ndarray)
+ assert result.dtype == object
+
+ result = s.dt.tz_convert('CET')
+ expected = Series(s._values.tz_convert('CET'),
+ index=s.index, name='xxx')
+ tm.assert_series_equal(result, expected)
+
+ tz_result = result.dt.tz
+ assert str(tz_result) == 'CET'
+ freq_result = s.dt.freq
+ assert freq_result == DatetimeIndex(s.values, freq='infer').freq
+
+ # timedelta index
+ cases = [Series(timedelta_range('1 day', periods=5),
+ index=list('abcde'), name='xxx'),
+ Series(timedelta_range('1 day 01:23:45', periods=5,
+ freq='s'), name='xxx'),
+ Series(timedelta_range('2 days 01:23:45.012345', periods=5,
+ freq='ms'), name='xxx')]
+ for s in cases:
+ for prop in ok_for_td:
+ # we test freq below
+ if prop != 'freq':
+ compare(s, prop)
+
+ for prop in ok_for_td_methods:
+ getattr(s.dt, prop)
+
+ result = s.dt.components
+ assert isinstance(result, DataFrame)
+ tm.assert_index_equal(result.index, s.index)
+
+ result = s.dt.to_pytimedelta()
+ assert isinstance(result, np.ndarray)
+ assert result.dtype == object
+
+ result = s.dt.total_seconds()
+ assert isinstance(result, pd.Series)
+ assert result.dtype == 'float64'
+
+ freq_result = s.dt.freq
+ assert freq_result == TimedeltaIndex(s.values, freq='infer').freq
+
+ # both
+ index = date_range('20130101', periods=3, freq='D')
+ s = Series(date_range('20140204', periods=3, freq='s'),
+ index=index, name='xxx')
+ exp = Series(np.array([2014, 2014, 2014], dtype='int64'),
+ index=index, name='xxx')
+ tm.assert_series_equal(s.dt.year, exp)
+
+ exp = Series(np.array([2, 2, 2], dtype='int64'),
+ index=index, name='xxx')
+ tm.assert_series_equal(s.dt.month, exp)
+
+ exp = Series(np.array([0, 1, 2], dtype='int64'),
+ index=index, name='xxx')
+ tm.assert_series_equal(s.dt.second, exp)
+
+ exp = pd.Series([s[0]] * 3, index=index, name='xxx')
+ tm.assert_series_equal(s.dt.normalize(), exp)
+
+ # periodindex
+ cases = [Series(period_range('20130101', periods=5, freq='D'),
+ name='xxx')]
+ for s in cases:
+ for prop in ok_for_period:
+ # we test freq below
+ if prop != 'freq':
+ compare(s, prop)
+
+ for prop in ok_for_period_methods:
+ getattr(s.dt, prop)
+
+ freq_result = s.dt.freq
+ assert freq_result == PeriodIndex(s.values).freq
+
+ # test limited display api
+ def get_dir(s):
+ results = [r for r in s.dt.__dir__() if not r.startswith('_')]
+ return list(sorted(set(results)))
+
+ s = Series(date_range('20130101', periods=5, freq='D'), name='xxx')
+ results = get_dir(s)
+ tm.assert_almost_equal(
+ results, list(sorted(set(ok_for_dt + ok_for_dt_methods))))
+
+ s = Series(period_range('20130101', periods=5,
+ freq='D', name='xxx').astype(object))
+ results = get_dir(s)
+ tm.assert_almost_equal(
+ results, list(sorted(set(ok_for_period + ok_for_period_methods))))
+
+ # 11295
+ # ambiguous time error on the conversions
+ s = Series(pd.date_range('2015-01-01', '2016-01-01',
+ freq='T'), name='xxx')
+ s = s.dt.tz_localize('UTC').dt.tz_convert('America/Chicago')
+ results = get_dir(s)
+ tm.assert_almost_equal(
+ results, list(sorted(set(ok_for_dt + ok_for_dt_methods))))
+ exp_values = pd.date_range('2015-01-01', '2016-01-01', freq='T',
+ tz='UTC').tz_convert('America/Chicago')
+ expected = Series(exp_values, name='xxx')
+ tm.assert_series_equal(s, expected)
+
+ # no setting allowed
+ s = Series(date_range('20130101', periods=5, freq='D'), name='xxx')
+ with pytest.raises(ValueError, match="modifications"):
+ s.dt.hour = 5
+
+ # trying to set a copy
+ with pd.option_context('chained_assignment', 'raise'):
+ with pytest.raises(com.SettingWithCopyError):
+ s.dt.hour[0] = 5
+
+ @pytest.mark.parametrize('method, dates', [
+ ['round', ['2012-01-02', '2012-01-02', '2012-01-01']],
+ ['floor', ['2012-01-01', '2012-01-01', '2012-01-01']],
+ ['ceil', ['2012-01-02', '2012-01-02', '2012-01-02']]
+ ])
+ def test_dt_round(self, method, dates):
+ # round
+ s = Series(pd.to_datetime(['2012-01-01 13:00:00',
+ '2012-01-01 12:01:00',
+ '2012-01-01 08:00:00']), name='xxx')
+ result = getattr(s.dt, method)('D')
+ expected = Series(pd.to_datetime(dates), name='xxx')
+ tm.assert_series_equal(result, expected)
+
+ def test_dt_round_tz(self):
+ s = Series(pd.to_datetime(['2012-01-01 13:00:00',
+ '2012-01-01 12:01:00',
+ '2012-01-01 08:00:00']), name='xxx')
+ result = (s.dt.tz_localize('UTC')
+ .dt.tz_convert('US/Eastern')
+ .dt.round('D'))
+
+ exp_values = pd.to_datetime(['2012-01-01', '2012-01-01',
+ '2012-01-01']).tz_localize('US/Eastern')
+ expected = Series(exp_values, name='xxx')
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('method', ['ceil', 'round', 'floor'])
+ def test_dt_round_tz_ambiguous(self, method):
+ # GH 18946 round near "fall back" DST
+ df1 = pd.DataFrame([
+ pd.to_datetime('2017-10-29 02:00:00+02:00', utc=True),
+ pd.to_datetime('2017-10-29 02:00:00+01:00', utc=True),
+ pd.to_datetime('2017-10-29 03:00:00+01:00', utc=True)
+ ],
+ columns=['date'])
+ df1['date'] = df1['date'].dt.tz_convert('Europe/Madrid')
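+ # 02:00 occurs twice in Europe/Madrid on 2017-10-29 (clocks fall
+ # back from 03:00 CEST to 02:00 CET), so these wall times are
+ # ambiguous and need the `ambiguous` keyword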
+ # infer
+ result = getattr(df1.date.dt, method)('H', ambiguous='infer')
+ expected = df1['date']
+ tm.assert_series_equal(result, expected)
+
+ # bool-array
+ result = getattr(df1.date.dt, method)(
+ 'H', ambiguous=[True, False, False]
+ )
+ tm.assert_series_equal(result, expected)
+
+ # NaT
+ result = getattr(df1.date.dt, method)('H', ambiguous='NaT')
+ expected = df1['date'].copy()
+ expected.iloc[0:2] = pd.NaT
+ tm.assert_series_equal(result, expected)
+
+ # raise
+ with pytest.raises(pytz.AmbiguousTimeError):
+ getattr(df1.date.dt, method)('H', ambiguous='raise')
+
+ @pytest.mark.parametrize('method, ts_str, freq', [
+ ['ceil', '2018-03-11 01:59:00-0600', '5min'],
+ ['round', '2018-03-11 01:59:00-0600', '5min'],
+ ['floor', '2018-03-11 03:01:00-0500', '2H']])
+ def test_dt_round_tz_nonexistent(self, method, ts_str, freq):
+ # GH 23324 round near "spring forward" DST
+ s = Series([pd.Timestamp(ts_str, tz='America/Chicago')])
+ result = getattr(s.dt, method)(freq, nonexistent='shift_forward')
+ expected = Series(
+ [pd.Timestamp('2018-03-11 03:00:00', tz='America/Chicago')]
+ )
+ tm.assert_series_equal(result, expected)
+
+ result = getattr(s.dt, method)(freq, nonexistent='NaT')
+ expected = Series([pd.NaT]).dt.tz_localize(result.dt.tz)
+ tm.assert_series_equal(result, expected)
+
+ with pytest.raises(pytz.NonExistentTimeError,
+ match='2018-03-11 02:00:00'):
+ getattr(s.dt, method)(freq, nonexistent='raise')
+
+ def test_dt_namespace_accessor_categorical(self):
+ # GH 19468
+ dti = DatetimeIndex(['20171111', '20181212']).repeat(2)
+ s = Series(pd.Categorical(dti), name='foo')
+ result = s.dt.year
+ expected = Series([2017, 2017, 2018, 2018], name='foo')
+ tm.assert_series_equal(result, expected)
+
+ def test_dt_accessor_no_new_attributes(self):
+ # https://github.com/pandas-dev/pandas/issues/10673
+ s = Series(date_range('20130101', periods=5, freq='D'))
+ with pytest.raises(AttributeError,
+ match="You cannot add any new attribute"):
+ s.dt.xlabel = "a"
+
+ @pytest.mark.parametrize('time_locale', [
+ None] if tm.get_locales() is None else [None] + tm.get_locales())
+ def test_dt_accessor_datetime_name_accessors(self, time_locale):
+ # Test Monday -> Sunday and January -> December, in that sequence
+ if time_locale is None:
+ # If time_locale is None, day_name and month_name should
+ # return the English names
+ expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
+ 'Friday', 'Saturday', 'Sunday']
+ expected_months = ['January', 'February', 'March', 'April', 'May',
+ 'June', 'July', 'August', 'September',
+ 'October', 'November', 'December']
+ else:
+ with tm.set_locale(time_locale, locale.LC_TIME):
+ expected_days = calendar.day_name[:]
+ expected_months = calendar.month_name[1:]
+
+ s = Series(date_range(freq='D', start=datetime(1998, 1, 1),
+ periods=365))
+ english_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
+ 'Friday', 'Saturday', 'Sunday']
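+ # 1998-01-01 is a Thursday, so positions 4-10 cover Monday (Jan 5)
+ # through Sunday (Jan 11)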
+ for day, name, eng_name in zip(range(4, 11),
+ expected_days,
+ english_days):
+ name = name.capitalize()
+ assert s.dt.weekday_name[day] == eng_name
+ assert s.dt.day_name(locale=time_locale)[day] == name
+ s = s.append(Series([pd.NaT]))
+ assert np.isnan(s.dt.day_name(locale=time_locale).iloc[-1])
+
+ s = Series(date_range(freq='M', start='2012', end='2013'))
+ result = s.dt.month_name(locale=time_locale)
+ expected = Series([month.capitalize() for month in expected_months])
+
+ # work around https://github.com/pandas-dev/pandas/issues/22342
+ if not compat.PY2:
+ result = result.str.normalize("NFD")
+ expected = expected.str.normalize("NFD")
+
+ tm.assert_series_equal(result, expected)
+
+ for s_date, expected in zip(s, expected_months):
+ result = s_date.month_name(locale=time_locale)
+ expected = expected.capitalize()
+
+ if not compat.PY2:
+ result = unicodedata.normalize("NFD", result)
+ expected = unicodedata.normalize("NFD", expected)
+
+ assert result == expected
+
+ s = s.append(Series([pd.NaT]))
+ assert np.isnan(s.dt.month_name(locale=time_locale).iloc[-1])
+
+ def test_strftime(self):
+ # GH 10086
+ s = Series(date_range('20130101', periods=5))
+ result = s.dt.strftime('%Y/%m/%d')
+ expected = Series(['2013/01/01', '2013/01/02', '2013/01/03',
+ '2013/01/04', '2013/01/05'])
+ tm.assert_series_equal(result, expected)
+
+ s = Series(date_range('2015-02-03 11:22:33.4567', periods=5))
+ result = s.dt.strftime('%Y/%m/%d %H-%M-%S')
+ expected = Series(['2015/02/03 11-22-33', '2015/02/04 11-22-33',
+ '2015/02/05 11-22-33', '2015/02/06 11-22-33',
+ '2015/02/07 11-22-33'])
+ tm.assert_series_equal(result, expected)
+
+ s = Series(period_range('20130101', periods=5))
+ result = s.dt.strftime('%Y/%m/%d')
+ expected = Series(['2013/01/01', '2013/01/02', '2013/01/03',
+ '2013/01/04', '2013/01/05'])
+ tm.assert_series_equal(result, expected)
+
+ s = Series(period_range(
+ '2015-02-03 11:22:33.4567', periods=5, freq='s'))
+ result = s.dt.strftime('%Y/%m/%d %H-%M-%S')
+ expected = Series(['2015/02/03 11-22-33', '2015/02/03 11-22-34',
+ '2015/02/03 11-22-35', '2015/02/03 11-22-36',
+ '2015/02/03 11-22-37'])
+ tm.assert_series_equal(result, expected)
+
+ s = Series(date_range('20130101', periods=5))
+ s.iloc[0] = pd.NaT
+ result = s.dt.strftime('%Y/%m/%d')
+ expected = Series(['NaT', '2013/01/02', '2013/01/03', '2013/01/04',
+ '2013/01/05'])
+ tm.assert_series_equal(result, expected)
+
+ datetime_index = date_range('20150301', periods=5)
+ result = datetime_index.strftime("%Y/%m/%d")
+
+ expected = Index(['2015/03/01', '2015/03/02', '2015/03/03',
+ '2015/03/04', '2015/03/05'], dtype=np.object_)
+ # dtype may be S10 or U10 depending on python version
+ tm.assert_index_equal(result, expected)
+
+ period_index = period_range('20150301', periods=5)
+ result = period_index.strftime("%Y/%m/%d")
+ expected = Index(['2015/03/01', '2015/03/02', '2015/03/03',
+ '2015/03/04', '2015/03/05'], dtype='=U10')
+ tm.assert_index_equal(result, expected)
+
+ s = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14,
+ 32, 1)])
+ result = s.dt.strftime('%Y-%m-%d %H:%M:%S')
+ expected = Series(["2013-01-01 02:32:59", "2013-01-02 14:32:01"])
+ tm.assert_series_equal(result, expected)
+
+ s = Series(period_range('20130101', periods=4, freq='H'))
+ result = s.dt.strftime('%Y/%m/%d %H:%M:%S')
+ expected = Series(["2013/01/01 00:00:00", "2013/01/01 01:00:00",
+ "2013/01/01 02:00:00", "2013/01/01 03:00:00"])
+
+ s = Series(period_range('20130101', periods=4, freq='L'))
+ result = s.dt.strftime('%Y/%m/%d %H:%M:%S.%l')
+ expected = Series(["2013/01/01 00:00:00.000",
+ "2013/01/01 00:00:00.001",
+ "2013/01/01 00:00:00.002",
+ "2013/01/01 00:00:00.003"])
+ tm.assert_series_equal(result, expected)
+
+ def test_valid_dt_with_missing_values(self):
+
+ from datetime import date, time
+
+ # GH 8689
+ s = Series(date_range('20130101', periods=5, freq='D'))
+ s.iloc[2] = pd.NaT
+
+ for attr in ['microsecond', 'nanosecond', 'second', 'minute', 'hour',
+ 'day']:
+ expected = getattr(s.dt, attr).copy()
+ expected.iloc[2] = np.nan
+ result = getattr(s.dt, attr)
+ tm.assert_series_equal(result, expected)
+
+ result = s.dt.date
+ expected = Series(
+ [date(2013, 1, 1), date(2013, 1, 2), np.nan, date(2013, 1, 4),
+ date(2013, 1, 5)], dtype='object')
+ tm.assert_series_equal(result, expected)
+
+ result = s.dt.time
+ expected = Series(
+ [time(0), time(0), np.nan, time(0), time(0)], dtype='object')
+ tm.assert_series_equal(result, expected)
+
+ def test_dt_accessor_api(self):
+ # GH 9322
+ from pandas.core.indexes.accessors import (
+ CombinedDatetimelikeProperties, DatetimeProperties)
+ assert Series.dt is CombinedDatetimelikeProperties
+
+ s = Series(date_range('2000-01-01', periods=3))
+ assert isinstance(s.dt, DatetimeProperties)
+
+ @pytest.mark.parametrize('ser', [Series(np.arange(5)),
+ Series(list('abcde')),
+ Series(np.random.randn(5))])
+ def test_dt_accessor_invalid(self, ser):
+ # GH#9322 check that series with incorrect dtypes don't have attr
+ with pytest.raises(AttributeError, match="only use .dt accessor"):
+ ser.dt
+ assert not hasattr(ser, 'dt')
+
+ def test_dt_accessor_updates_on_inplace(self):
+ s = Series(pd.date_range('2018-01-01', periods=10))
+ s[2] = None
+ s.fillna(pd.Timestamp('2018-01-01'), inplace=True)
+ result = s.dt.date
+ assert result[0] == result[2]
+
+ def test_between(self):
+ s = Series(bdate_range('1/1/2000', periods=20).astype(object))
+ s[::2] = np.nan
+
+ result = s[s.between(s[3], s[17])]
+ expected = s[3:18].dropna()
+ assert_series_equal(result, expected)
+
+ result = s[s.between(s[3], s[17], inclusive=False)]
+ expected = s[5:16].dropna()
+ assert_series_equal(result, expected)
+
+ def test_date_tz(self):
+ # GH11757
+ rng = pd.DatetimeIndex(['2014-04-04 23:56',
+ '2014-07-18 21:24',
+ '2015-11-22 22:14'], tz="US/Eastern")
+ s = Series(rng)
+ expected = Series([date(2014, 4, 4),
+ date(2014, 7, 18),
+ date(2015, 11, 22)])
+ assert_series_equal(s.dt.date, expected)
+ assert_series_equal(s.apply(lambda x: x.date()), expected)
+
+ def test_datetime_understood(self):
+ # Ensures it doesn't fail to create the right series
+ # reported in issue#16726
+ series = pd.Series(pd.date_range("2012-01-01", periods=3))
+ offset = pd.offsets.DateOffset(days=6)
+ result = series - offset
+ expected = pd.Series(pd.to_datetime([
+ '2011-12-26', '2011-12-27', '2011-12-28']))
+ tm.assert_series_equal(result, expected)
+
+ def test_dt_timetz_accessor(self, tz_naive_fixture):
+ # GH21358
+ tz = maybe_get_tz(tz_naive_fixture)
+
+ dtindex = pd.DatetimeIndex(['2014-04-04 23:56', '2014-07-18 21:24',
+ '2015-11-22 22:14'], tz=tz)
+ s = Series(dtindex)
+ expected = Series([time(23, 56, tzinfo=tz), time(21, 24, tzinfo=tz),
+ time(22, 14, tzinfo=tz)])
+ result = s.dt.timetz
+ tm.assert_series_equal(result, expected)
+
+ def test_setitem_with_string_index(self):
+ # GH 23451
+ x = pd.Series([1, 2, 3], index=['Date', 'b', 'other'])
+ x['Date'] = date.today()
+ assert x.Date == date.today()
+ assert x['Date'] == date.today()
+
+ def test_setitem_with_different_tz(self):
+ # GH#24024
+ ser = pd.Series(pd.date_range('2000', periods=2, tz="US/Central"))
+ ser[0] = pd.Timestamp("2000", tz='US/Eastern')
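+ # a tz different from the existing datetime64[ns, US/Central] dtype
+ # cannot be held in that dtype, so the Series is upcast to object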
+ expected = pd.Series([
+ pd.Timestamp("2000-01-01 00:00:00-05:00", tz="US/Eastern"),
+ pd.Timestamp("2000-01-02 00:00:00-06:00", tz="US/Central"),
+ ], dtype=object)
+ tm.assert_series_equal(ser, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_dtypes.py b/contrib/python/pandas/py2/pandas/tests/series/test_dtypes.py
new file mode 100644
index 00000000000..e29974f5696
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_dtypes.py
@@ -0,0 +1,522 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime, timedelta
+import string
+import sys
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import iNaT
+import pandas.compat as compat
+from pandas.compat import lrange, range, u
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Index, Series, Timedelta, Timestamp, date_range)
+from pandas.api.types import CategoricalDtype
+import pandas.util.testing as tm
+
+
+class TestSeriesDtypes(object):
+
+ def test_dt64_series_astype_object(self):
+ dt64ser = Series(date_range('20130101', periods=3))
+ result = dt64ser.astype(object)
+ assert isinstance(result.iloc[0], datetime)
+ assert result.dtype == np.object_
+
+ def test_td64_series_astype_object(self):
+ tdser = Series(['59 Days', '59 Days', 'NaT'], dtype='timedelta64[ns]')
+ result = tdser.astype(object)
+ assert isinstance(result.iloc[0], timedelta)
+ assert result.dtype == np.object_
+
+ @pytest.mark.parametrize("dtype", ["float32", "float64",
+ "int64", "int32"])
+ def test_astype(self, dtype):
+ s = Series(np.random.randn(5), name='foo')
+ as_typed = s.astype(dtype)
+
+ assert as_typed.dtype == dtype
+ assert as_typed.name == s.name
+
+ def test_asobject_deprecated(self):
+ s = Series(np.random.randn(5), name='foo')
+ with tm.assert_produces_warning(FutureWarning):
+ o = s.asobject
+ assert isinstance(o, np.ndarray)
+
+ def test_dtype(self, datetime_series):
+
+ assert datetime_series.dtype == np.dtype('float64')
+ assert datetime_series.dtypes == np.dtype('float64')
+ assert datetime_series.ftype == 'float64:dense'
+ assert datetime_series.ftypes == 'float64:dense'
+ tm.assert_series_equal(datetime_series.get_dtype_counts(),
+ Series(1, ['float64']))
+ # GH18243 - Assert .get_ftype_counts is deprecated
+ with tm.assert_produces_warning(FutureWarning):
+ tm.assert_series_equal(datetime_series.get_ftype_counts(),
+ Series(1, ['float64:dense']))
+
+ @pytest.mark.parametrize("value", [np.nan, np.inf])
+ @pytest.mark.parametrize("dtype", [np.int32, np.int64])
+ def test_astype_cast_nan_inf_int(self, dtype, value):
+ # gh-14265: check NaN and inf raise error when converting to int
+ msg = 'Cannot convert non-finite values \\(NA or inf\\) to integer'
+ s = Series([value])
+
+ with pytest.raises(ValueError, match=msg):
+ s.astype(dtype)
+
+ @pytest.mark.parametrize("dtype", [int, np.int8, np.int64])
+ def test_astype_cast_object_int_fail(self, dtype):
+ arr = Series(["car", "house", "tree", "1"])
+ msg = r"invalid literal for (int|long)\(\) with base 10: 'car'"
+ with pytest.raises(ValueError, match=msg):
+ arr.astype(dtype)
+
+ def test_astype_cast_object_int(self):
+ arr = Series(['1', '2', '3', '4'], dtype=object)
+ result = arr.astype(int)
+
+ tm.assert_series_equal(result, Series(np.arange(1, 5)))
+
+ def test_astype_datetime(self):
+ s = Series(iNaT, dtype='M8[ns]', index=lrange(5))
+
+ s = s.astype('O')
+ assert s.dtype == np.object_
+
+ s = Series([datetime(2001, 1, 2, 0, 0)])
+
+ s = s.astype('O')
+ assert s.dtype == np.object_
+
+ s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)])
+
+ s[1] = np.nan
+ assert s.dtype == 'M8[ns]'
+
+ s = s.astype('O')
+ assert s.dtype == np.object_
+
+ def test_astype_datetime64tz(self):
+ s = Series(date_range('20130101', periods=3, tz='US/Eastern'))
+
+ # astype
+ result = s.astype(object)
+ expected = Series(s.astype(object), dtype=object)
+ tm.assert_series_equal(result, expected)
+
+ result = Series(s.values).dt.tz_localize('UTC').dt.tz_convert(s.dt.tz)
+ tm.assert_series_equal(result, s)
+
+ # astype - object, preserves on construction
+ result = Series(s.astype(object))
+ expected = s.astype(object)
+ tm.assert_series_equal(result, expected)
+
+ # astype - datetime64[ns, tz]
+ result = Series(s.values).astype('datetime64[ns, US/Eastern]')
+ tm.assert_series_equal(result, s)
+
+ result = Series(s.values).astype(s.dtype)
+ tm.assert_series_equal(result, s)
+
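+ # midnight US/Eastern (UTC-5 in January) is 06:00 CET (UTC+1),
+ # hence the six-hour shift in the expected values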
+ result = s.astype('datetime64[ns, CET]')
+ expected = Series(date_range('20130101 06:00:00', periods=3, tz='CET'))
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("dtype", [compat.text_type, np.str_])
+ @pytest.mark.parametrize("series", [Series([string.digits * 10,
+ tm.rands(63),
+ tm.rands(64),
+ tm.rands(1000)]),
+ Series([string.digits * 10,
+ tm.rands(63),
+ tm.rands(64), np.nan, 1.0])])
+ def test_astype_str_map(self, dtype, series):
+ # see gh-4405
+ result = series.astype(dtype)
+ expected = series.map(compat.text_type)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("dtype", [str, compat.text_type])
+ def test_astype_str_cast(self, dtype):
+ # see gh-9757: test str and unicode on python 2.x
+ # and just str on python 3.x
+ ts = Series([Timestamp('2010-01-04 00:00:00')])
+ s = ts.astype(dtype)
+
+ expected = Series([dtype('2010-01-04')])
+ tm.assert_series_equal(s, expected)
+
+ ts = Series([Timestamp('2010-01-04 00:00:00', tz='US/Eastern')])
+ s = ts.astype(dtype)
+
+ expected = Series([dtype('2010-01-04 00:00:00-05:00')])
+ tm.assert_series_equal(s, expected)
+
+ td = Series([Timedelta(1, unit='d')])
+ s = td.astype(dtype)
+
+ expected = Series([dtype('1 days 00:00:00.000000000')])
+ tm.assert_series_equal(s, expected)
+
+ def test_astype_unicode(self):
+ # see gh-7758: A bit of magic is required to set
+ # default encoding to utf-8
+ digits = string.digits
+ test_series = [
+ Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
+ Series([u('データーサイエンス、お前はもう死んでいる')]),
+ ]
+
+ former_encoding = None
+
+ if not compat.PY3:
+ # In Python 2, we can force the default encoding for this test
+ former_encoding = sys.getdefaultencoding()
+ reload(sys) # noqa
+
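+ # site.py removes sys.setdefaultencoding at startup in Python 2;
+ # the reload(sys) above restores it so it can be changed here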
+ sys.setdefaultencoding("utf-8")
+ if sys.getdefaultencoding() == "utf-8":
+ test_series.append(Series([u('野菜食べないとやばい')
+ .encode("utf-8")]))
+
+ for s in test_series:
+ res = s.astype("unicode")
+ expec = s.map(compat.text_type)
+ tm.assert_series_equal(res, expec)
+
+ # Restore the former encoding
+ if former_encoding is not None and former_encoding != "utf-8":
+ reload(sys) # noqa
+ sys.setdefaultencoding(former_encoding)
+
+ @pytest.mark.parametrize("dtype_class", [dict, Series])
+ def test_astype_dict_like(self, dtype_class):
+ # see gh-7271
+ s = Series(range(0, 10, 2), name='abc')
+
+ dt1 = dtype_class({'abc': str})
+ result = s.astype(dt1)
+ expected = Series(['0', '2', '4', '6', '8'], name='abc')
+ tm.assert_series_equal(result, expected)
+
+ dt2 = dtype_class({'abc': 'float64'})
+ result = s.astype(dt2)
+ expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype='float64',
+ name='abc')
+ tm.assert_series_equal(result, expected)
+
+ dt3 = dtype_class({'abc': str, 'def': str})
+ msg = ("Only the Series name can be used for the key in Series dtype"
+ r" mappings\.")
+ with pytest.raises(KeyError, match=msg):
+ s.astype(dt3)
+
+ dt4 = dtype_class({0: str})
+ with pytest.raises(KeyError, match=msg):
+ s.astype(dt4)
+
+ # GH16717
+ # if dtypes provided is empty, it should error
+ dt5 = dtype_class({})
+ with pytest.raises(KeyError, match=msg):
+ s.astype(dt5)
+
+ def test_astype_categories_deprecation(self):
+
+ # deprecated 17636
+ s = Series(['a', 'b', 'a'])
+ expected = s.astype(CategoricalDtype(['a', 'b'], ordered=True))
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = s.astype('category', categories=['a', 'b'], ordered=True)
+ tm.assert_series_equal(result, expected)
+
+ def test_astype_from_categorical(self):
+ items = ["a", "b", "c", "a"]
+ s = Series(items)
+ exp = Series(Categorical(items))
+ res = s.astype('category')
+ tm.assert_series_equal(res, exp)
+
+ items = [1, 2, 3, 1]
+ s = Series(items)
+ exp = Series(Categorical(items))
+ res = s.astype('category')
+ tm.assert_series_equal(res, exp)
+
+ df = DataFrame({"cats": [1, 2, 3, 4, 5, 6],
+ "vals": [1, 2, 3, 4, 5, 6]})
+ cats = Categorical([1, 2, 3, 4, 5, 6])
+ exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
+ df["cats"] = df["cats"].astype("category")
+ tm.assert_frame_equal(exp_df, df)
+
+ df = DataFrame({"cats": ['a', 'b', 'b', 'a', 'a', 'd'],
+ "vals": [1, 2, 3, 4, 5, 6]})
+ cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
+ exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
+ df["cats"] = df["cats"].astype("category")
+ tm.assert_frame_equal(exp_df, df)
+
+ # with keywords
+ lst = ["a", "b", "c", "a"]
+ s = Series(lst)
+ exp = Series(Categorical(lst, ordered=True))
+ res = s.astype(CategoricalDtype(None, ordered=True))
+ tm.assert_series_equal(res, exp)
+
+ exp = Series(Categorical(lst, categories=list('abcdef'), ordered=True))
+ res = s.astype(CategoricalDtype(list('abcdef'), ordered=True))
+ tm.assert_series_equal(res, exp)
+
+ def test_astype_categorical_to_other(self):
+
+ df = DataFrame({'value': np.random.randint(0, 10000, 100)})
+ labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
+ cat_labels = Categorical(labels, labels)
+
+ df = df.sort_values(by=['value'], ascending=True)
+ df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
+ right=False, labels=cat_labels)
+
+ s = df['value_group']
+ expected = s
+ tm.assert_series_equal(s.astype('category'), expected)
+ tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
+ msg = (r"could not convert string to float: '(0 - 499|9500 - 9999)'|"
+ r"invalid literal for float\(\): (0 - 499|9500 - 9999)")
+ with pytest.raises(ValueError, match=msg):
+ s.astype('float64')
+
+ cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
+ exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
+ tm.assert_series_equal(cat.astype('str'), exp)
+ s2 = Series(Categorical(['1', '2', '3', '4']))
+ exp2 = Series([1, 2, 3, 4]).astype(int)
+ tm.assert_series_equal(s2.astype('int'), exp2)
+
+ # objects don't sort correctly, so just compare that we have the same
+ # values
+ def cmp(a, b):
+ tm.assert_almost_equal(
+ np.sort(np.unique(a)), np.sort(np.unique(b)))
+
+ expected = Series(np.array(s.values), name='value_group')
+ cmp(s.astype('object'), expected)
+ cmp(s.astype(np.object_), expected)
+
+ # array conversion
+ tm.assert_almost_equal(np.array(s), np.array(s.values))
+
+ # valid conversion
+ for valid in [lambda x: x.astype('category'),
+ lambda x: x.astype(CategoricalDtype()),
+ lambda x: x.astype('object').astype('category'),
+ lambda x: x.astype('object').astype(
+ CategoricalDtype())
+ ]:
+
+ result = valid(s)
+ # compare series values
+ # internal .categories can't be compared because it is sorted
+ tm.assert_series_equal(result, s, check_categorical=False)
+
+ # invalid conversion (these are NOT a dtype)
+ msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\."
+ "Categorical'> for astype")
+ for invalid in [lambda x: x.astype(Categorical),
+ lambda x: x.astype('object').astype(Categorical)]:
+ with pytest.raises(TypeError, match=msg):
+ invalid(s)
+
+ @pytest.mark.parametrize('name', [None, 'foo'])
+ @pytest.mark.parametrize('dtype_ordered', [True, False])
+ @pytest.mark.parametrize('series_ordered', [True, False])
+ def test_astype_categorical_to_categorical(self, name, dtype_ordered,
+ series_ordered):
+ # GH 10696/18593
+ s_data = list('abcaacbab')
+ s_dtype = CategoricalDtype(list('bac'), ordered=series_ordered)
+ s = Series(s_data, dtype=s_dtype, name=name)
+
+ # unspecified categories
+ dtype = CategoricalDtype(ordered=dtype_ordered)
+ result = s.astype(dtype)
+ exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
+ expected = Series(s_data, name=name, dtype=exp_dtype)
+ tm.assert_series_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = s.astype('category', ordered=dtype_ordered)
+ tm.assert_series_equal(result, expected)
+
+ # different categories
+ dtype = CategoricalDtype(list('adc'), dtype_ordered)
+ result = s.astype(dtype)
+ expected = Series(s_data, name=name, dtype=dtype)
+ tm.assert_series_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = s.astype(
+ 'category', categories=list('adc'), ordered=dtype_ordered)
+ tm.assert_series_equal(result, expected)
+
+ if dtype_ordered is False:
+ # not specifying ordered, so only test once
+ expected = s
+ result = s.astype('category')
+ tm.assert_series_equal(result, expected)
+
+ def test_astype_categoricaldtype(self):
+ s = Series(['a', 'b', 'a'])
+ result = s.astype(CategoricalDtype(['a', 'b'], ordered=True))
+ expected = Series(Categorical(['a', 'b', 'a'], ordered=True))
+ tm.assert_series_equal(result, expected)
+
+ result = s.astype(CategoricalDtype(['a', 'b'], ordered=False))
+ expected = Series(Categorical(['a', 'b', 'a'], ordered=False))
+ tm.assert_series_equal(result, expected)
+
+ result = s.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False))
+ expected = Series(Categorical(['a', 'b', 'a'],
+ categories=['a', 'b', 'c'],
+ ordered=False))
+ tm.assert_series_equal(result, expected)
+ tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c']))
+
+ def test_astype_categoricaldtype_with_args(self):
+ s = Series(['a', 'b'])
+ type_ = CategoricalDtype(['a', 'b'])
+
+ msg = (r"Cannot specify a CategoricalDtype and also `categories` or"
+ r" `ordered`\. Use `dtype=CategoricalDtype\(categories,"
+ r" ordered\)` instead\.")
+ with pytest.raises(TypeError, match=msg):
+ s.astype(type_, ordered=True)
+ with pytest.raises(TypeError, match=msg):
+ s.astype(type_, categories=['a', 'b'])
+ with pytest.raises(TypeError, match=msg):
+ s.astype(type_, categories=['a', 'b'], ordered=False)
+
+ @pytest.mark.parametrize("dtype", [
+ np.datetime64,
+ np.timedelta64,
+ ])
+ def test_astype_generic_timestamp_no_frequency(self, dtype):
+ # see gh-15524, gh-15987
+ data = [1]
+ s = Series(data)
+
+ msg = "dtype has no unit. Please pass in"
+ with pytest.raises(ValueError, match=msg):
+ s.astype(dtype)
+
+ @pytest.mark.parametrize("dtype", np.typecodes['All'])
+ def test_astype_empty_constructor_equality(self, dtype):
+ # see gh-15524
+
+ if dtype not in (
+ "S", "V", # poor support (if any) currently
+ "M", "m" # Generic timestamps raise a ValueError. Already tested.
+ ):
+ init_empty = Series([], dtype=dtype)
+ as_type_empty = Series([]).astype(dtype)
+ tm.assert_series_equal(init_empty, as_type_empty)
+
+ def test_complex(self):
+ # see gh-4819: complex access for ndarray compat
+ a = np.arange(5, dtype=np.float64)
+ b = Series(a + 4j * a)
+
+ tm.assert_numpy_array_equal(a, b.real)
+ tm.assert_numpy_array_equal(4 * a, b.imag)
+
+ b.real = np.arange(5) + 5
+ tm.assert_numpy_array_equal(a + 5, b.real)
+ tm.assert_numpy_array_equal(4 * a, b.imag)
+
+ def test_arg_for_errors_in_astype(self):
+ # see gh-14878
+ s = Series([1, 2, 3])
+
+ msg = (r"Expected value of kwarg 'errors' to be one of \['raise',"
+ r" 'ignore'\]\. Supplied value is 'False'")
+ with pytest.raises(ValueError, match=msg):
+ s.astype(np.float64, errors=False)
+
+ s.astype(np.int8, errors='raise')
+
+ def test_intercept_astype_object(self):
+ series = Series(date_range('1/1/2000', periods=10))
+
+ # This test no longer makes sense, as
+ # Series is by default already M8[ns].
+ expected = series.astype('object')
+
+ df = DataFrame({'a': series,
+ 'b': np.random.randn(len(series))})
+ exp_dtypes = Series([np.dtype('datetime64[ns]'),
+ np.dtype('float64')], index=['a', 'b'])
+ tm.assert_series_equal(df.dtypes, exp_dtypes)
+
+ result = df.values.squeeze()
+ assert (result[:, 0] == expected.values).all()
+
+ df = DataFrame({'a': series, 'b': ['foo'] * len(series)})
+
+ result = df.values.squeeze()
+ assert (result[:, 0] == expected.values).all()
+
+ def test_series_to_categorical(self):
+ # see gh-16524: test conversion of Series to Categorical
+ series = Series(['a', 'b', 'c'])
+
+ result = Series(series, dtype='category')
+ expected = Series(['a', 'b', 'c'], dtype='category')
+
+ tm.assert_series_equal(result, expected)
+
+ def test_infer_objects_series(self):
+ # GH 11221
+ actual = Series(np.array([1, 2, 3], dtype='O')).infer_objects()
+ expected = Series([1, 2, 3])
+ tm.assert_series_equal(actual, expected)
+
+ actual = Series(np.array([1, 2, 3, None], dtype='O')).infer_objects()
+ expected = Series([1., 2., 3., np.nan])
+ tm.assert_series_equal(actual, expected)
+
+ # only soft conversions; unconvertible values pass through unchanged
+ actual = (Series(np.array([1, 2, 3, None, 'a'], dtype='O'))
+ .infer_objects())
+ expected = Series([1, 2, 3, None, 'a'])
+
+ assert actual.dtype == 'object'
+ tm.assert_series_equal(actual, expected)
+
+ def test_is_homogeneous_type(self):
+ assert Series()._is_homogeneous_type
+ assert Series([1, 2])._is_homogeneous_type
+ assert Series(pd.Categorical([1, 2]))._is_homogeneous_type
+
+ @pytest.mark.parametrize("data", [
+ pd.period_range("2000", periods=4),
+ pd.IntervalIndex.from_breaks([1, 2, 3, 4])
+ ])
+ def test_values_compatibility(self, data):
+ # https://github.com/pandas-dev/pandas/issues/23995
+ result = pd.Series(data).values
+ expected = np.array(data.astype(object))
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_duplicates.py b/contrib/python/pandas/py2/pandas/tests/series/test_duplicates.py
new file mode 100644
index 00000000000..a975edacc19
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_duplicates.py
@@ -0,0 +1,151 @@
+# coding=utf-8
+
+import numpy as np
+import pytest
+
+from pandas import Categorical, Series
+import pandas.util.testing as tm
+
+
+def test_value_counts_nunique():
+ # basics.rst doc example
+ series = Series(np.random.randn(500))
+ series[20:500] = np.nan
+ series[10:20] = 5000
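+ # positions 0-9 keep ten distinct random floats and 10-19 collapse
+ # to a single value (5000); NaNs are excluded, giving 11 uniques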
+ result = series.nunique()
+ assert result == 11
+
+ # GH 18051
+ s = Series(Categorical([]))
+ assert s.nunique() == 0
+ s = Series(Categorical([np.nan]))
+ assert s.nunique() == 0
+
+
+def test_unique():
+ # GH714 also, dtype=float
+ s = Series([1.2345] * 100)
+ s[::2] = np.nan
+ result = s.unique()
+ assert len(result) == 2
+
+ s = Series([1.2345] * 100, dtype='f4')
+ s[::2] = np.nan
+ result = s.unique()
+ assert len(result) == 2
+
+ # NAs in object arrays #714
+ s = Series(['foo'] * 100, dtype='O')
+ s[::2] = np.nan
+ result = s.unique()
+ assert len(result) == 2
+
+ # decision about None
+ s = Series([1, 2, 3, None, None, None], dtype=object)
+ result = s.unique()
+ expected = np.array([1, 2, 3, None], dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ # GH 18051
+ s = Series(Categorical([]))
+ tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False)
+ s = Series(Categorical([np.nan]))
+ tm.assert_categorical_equal(s.unique(), Categorical([np.nan]),
+ check_dtype=False)
+
+
+def test_unique_data_ownership():
+ # it works! #1807
+ Series(Series(["a", "c", "b"]).unique()).sort_values()
+
+
[email protected]('data, expected', [
+ (np.random.randint(0, 10, size=1000), False),
+ (np.arange(1000), True),
+ ([], True),
+ ([np.nan], True),
+ (['foo', 'bar', np.nan], True),
+ (['foo', 'foo', np.nan], False),
+ (['foo', 'bar', np.nan, np.nan], False)])
+def test_is_unique(data, expected):
+ # GH11946 / GH25180
+ s = Series(data)
+ assert s.is_unique is expected
+
+
+def test_is_unique_class_ne(capsys):
+ # GH 20661
+ class Foo(object):
+ def __init__(self, val):
+ self._value = val
+
+ def __ne__(self, other):
+ raise Exception("NEQ not supported")
+
+ with capsys.disabled():
+ li = [Foo(i) for i in range(5)]
+ s = Series(li, index=[i for i in range(5)])
+ s.is_unique
+ captured = capsys.readouterr()
+ assert len(captured.err) == 0
+
+
[email protected](
+ 'keep, expected',
+ [
+ ('first', Series([False, False, False, False, True, True, False])),
+ ('last', Series([False, True, True, False, False, False, False])),
+ (False, Series([False, True, True, False, True, True, False]))
+ ])
+def test_drop_duplicates(any_numpy_dtype, keep, expected):
+ tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype))
+
+ if tc.dtype == 'bool':
+ pytest.skip('tested separately in test_drop_duplicates_bool')
+
+ tm.assert_series_equal(tc.duplicated(keep=keep), expected)
+ tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
+ sc = tc.copy()
+ sc.drop_duplicates(keep=keep, inplace=True)
+ tm.assert_series_equal(sc, tc[~expected])
+
+
[email protected]('keep, expected',
+ [('first', Series([False, False, True, True])),
+ ('last', Series([True, True, False, False])),
+ (False, Series([True, True, True, True]))])
+def test_drop_duplicates_bool(keep, expected):
+ tc = Series([True, False, True, False])
+
+ tm.assert_series_equal(tc.duplicated(keep=keep), expected)
+ tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
+ sc = tc.copy()
+ sc.drop_duplicates(keep=keep, inplace=True)
+ tm.assert_series_equal(sc, tc[~expected])
+
+
[email protected]('keep, expected', [
+ ('first', Series([False, False, True, False, True], name='name')),
+ ('last', Series([True, True, False, False, False], name='name')),
+ (False, Series([True, True, True, False, True], name='name'))
+])
+def test_duplicated_keep(keep, expected):
+ s = Series(['a', 'b', 'b', 'c', 'a'], name='name')
+
+ result = s.duplicated(keep=keep)
+ tm.assert_series_equal(result, expected)
+
+
[email protected]('keep, expected', [
+ ('first', Series([False, False, True, False, True])),
+ ('last', Series([True, True, False, False, False])),
+ (False, Series([True, True, True, False, True]))
+])
+def test_duplicated_nan_none(keep, expected):
+ s = Series([np.nan, 3, 3, None, np.nan], dtype=object)
+
+ result = s.duplicated(keep=keep)
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_internals.py b/contrib/python/pandas/py2/pandas/tests/series/test_internals.py
new file mode 100644
index 00000000000..26b868872ee
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_internals.py
@@ -0,0 +1,345 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import NaT, Series, Timestamp
+from pandas.core.internals.blocks import IntBlock
+import pandas.util.testing as tm
+from pandas.util.testing import assert_series_equal
+
+
+class TestSeriesInternals(object):
+
+ def test_convert_objects(self):
+
+ s = Series([1., 2, 3], index=['a', 'b', 'c'])
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.convert_objects(convert_dates=False,
+ convert_numeric=True)
+ assert_series_equal(result, s)
+
+ # force numeric conversion
+ r = s.copy().astype('O')
+ r['a'] = '1'
+ with tm.assert_produces_warning(FutureWarning):
+ result = r.convert_objects(convert_dates=False,
+ convert_numeric=True)
+ assert_series_equal(result, s)
+
+ r = s.copy().astype('O')
+ r['a'] = '1.'
+ with tm.assert_produces_warning(FutureWarning):
+ result = r.convert_objects(convert_dates=False,
+ convert_numeric=True)
+ assert_series_equal(result, s)
+
+ r = s.copy().astype('O')
+ r['a'] = 'garbled'
+ expected = s.copy()
+ expected['a'] = np.nan
+ with tm.assert_produces_warning(FutureWarning):
+ result = r.convert_objects(convert_dates=False,
+ convert_numeric=True)
+ assert_series_equal(result, expected)
+
+ # GH 4119, not converting a mixed type (e.g. floats and object)
+ s = Series([1, 'na', 3, 4])
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.convert_objects(convert_numeric=True)
+ expected = Series([1, np.nan, 3, 4])
+ assert_series_equal(result, expected)
+
+ s = Series([1, '', 3, 4])
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.convert_objects(convert_numeric=True)
+ expected = Series([1, np.nan, 3, 4])
+ assert_series_equal(result, expected)
+
+ # dates
+ s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0),
+ datetime(2001, 1, 3, 0, 0)])
+ s2 = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0),
+ datetime(2001, 1, 3, 0, 0), 'foo', 1.0, 1,
+ Timestamp('20010104'), '20010105'],
+ dtype='O')
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.convert_objects(convert_dates=True,
+ convert_numeric=False)
+ expected = Series([Timestamp('20010101'), Timestamp('20010102'),
+ Timestamp('20010103')], dtype='M8[ns]')
+ assert_series_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.convert_objects(convert_dates='coerce',
+ convert_numeric=False)
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.convert_objects(convert_dates='coerce',
+ convert_numeric=True)
+ assert_series_equal(result, expected)
+
+ expected = Series([Timestamp('20010101'), Timestamp('20010102'),
+ Timestamp('20010103'),
+ NaT, NaT, NaT, Timestamp('20010104'),
+ Timestamp('20010105')], dtype='M8[ns]')
+ with tm.assert_produces_warning(FutureWarning):
+ result = s2.convert_objects(convert_dates='coerce',
+ convert_numeric=False)
+ assert_series_equal(result, expected)
+ with tm.assert_produces_warning(FutureWarning):
+ result = s2.convert_objects(convert_dates='coerce',
+ convert_numeric=True)
+ assert_series_equal(result, expected)
+
+ # preserve all-nans (if convert_dates='coerce')
+ s = Series(['foo', 'bar', 1, 1.0], dtype='O')
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.convert_objects(convert_dates='coerce',
+ convert_numeric=False)
+ expected = Series([NaT] * 2 + [Timestamp(1)] * 2)
+ assert_series_equal(result, expected)
+
+ # preserve if non-object
+ s = Series([1], dtype='float32')
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.convert_objects(convert_dates='coerce',
+ convert_numeric=False)
+ assert_series_equal(result, s)
+
+ # r = s.copy()
+ # r[0] = np.nan
+ # result = r.convert_objects(convert_dates=True,convert_numeric=False)
+ # assert result.dtype == 'M8[ns]'
+
+ # dateutil parses some single letters into today's value as a date
+ for x in 'abcdefghijklmnopqrstuvwxyz':
+ s = Series([x])
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.convert_objects(convert_dates='coerce')
+ assert_series_equal(result, s)
+ s = Series([x.upper()])
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.convert_objects(convert_dates='coerce')
+ assert_series_equal(result, s)
+
+ def test_convert_objects_preserve_bool(self):
+ s = Series([1, True, 3, 5], dtype=object)
+ with tm.assert_produces_warning(FutureWarning):
+ r = s.convert_objects(convert_numeric=True)
+ e = Series([1, 1, 3, 5], dtype='i8')
+ tm.assert_series_equal(r, e)
+
+ def test_convert_objects_preserve_all_bool(self):
+ s = Series([False, True, False, False], dtype=object)
+ with tm.assert_produces_warning(FutureWarning):
+ r = s.convert_objects(convert_numeric=True)
+ e = Series([False, True, False, False], dtype=bool)
+ tm.assert_series_equal(r, e)
+
+ # GH 10265
+ def test_convert(self):
+ # Tests: All to nans, coerce, true
+ # Test coercion returns correct type
+ s = Series(['a', 'b', 'c'])
+ results = s._convert(datetime=True, coerce=True)
+ expected = Series([NaT] * 3)
+ assert_series_equal(results, expected)
+
+ results = s._convert(numeric=True, coerce=True)
+ expected = Series([np.nan] * 3)
+ assert_series_equal(results, expected)
+
+ expected = Series([NaT] * 3, dtype=np.dtype('m8[ns]'))
+ results = s._convert(timedelta=True, coerce=True)
+ assert_series_equal(results, expected)
+
+ dt = datetime(2001, 1, 1, 0, 0)
+ td = dt - datetime(2000, 1, 1, 0, 0)
+
+ # Test coercion with mixed types
+ s = Series(['a', '3.1415', dt, td])
+ results = s._convert(datetime=True, coerce=True)
+ expected = Series([NaT, NaT, dt, NaT])
+ assert_series_equal(results, expected)
+
+ results = s._convert(numeric=True, coerce=True)
+ expected = Series([np.nan, 3.1415, np.nan, np.nan])
+ assert_series_equal(results, expected)
+
+ results = s._convert(timedelta=True, coerce=True)
+ expected = Series([NaT, NaT, NaT, td],
+ dtype=np.dtype('m8[ns]'))
+ assert_series_equal(results, expected)
+
+ # Test standard conversion returns original
+ results = s._convert(datetime=True)
+ assert_series_equal(results, s)
+ results = s._convert(numeric=True)
+ expected = Series([np.nan, 3.1415, np.nan, np.nan])
+ assert_series_equal(results, expected)
+ results = s._convert(timedelta=True)
+ assert_series_equal(results, s)
+
+ # test pass-through and non-conversion when other types selected
+ s = Series(['1.0', '2.0', '3.0'])
+ results = s._convert(datetime=True, numeric=True, timedelta=True)
+ expected = Series([1.0, 2.0, 3.0])
+ assert_series_equal(results, expected)
+ results = s._convert(True, False, True)
+ assert_series_equal(results, s)
+
+ s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)],
+ dtype='O')
+ results = s._convert(datetime=True, numeric=True, timedelta=True)
+ expected = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0,
+ 0)])
+ assert_series_equal(results, expected)
+ results = s._convert(datetime=False, numeric=True, timedelta=True)
+ assert_series_equal(results, s)
+
+ td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0)
+ s = Series([td, td], dtype='O')
+ results = s._convert(datetime=True, numeric=True, timedelta=True)
+ expected = Series([td, td])
+ assert_series_equal(results, expected)
+ results = s._convert(True, True, False)
+ assert_series_equal(results, s)
+
+ s = Series([1., 2, 3], index=['a', 'b', 'c'])
+ result = s._convert(numeric=True)
+ assert_series_equal(result, s)
+
+ # force numeric conversion
+ r = s.copy().astype('O')
+ r['a'] = '1'
+ result = r._convert(numeric=True)
+ assert_series_equal(result, s)
+
+ r = s.copy().astype('O')
+ r['a'] = '1.'
+ result = r._convert(numeric=True)
+ assert_series_equal(result, s)
+
+ r = s.copy().astype('O')
+ r['a'] = 'garbled'
+ result = r._convert(numeric=True)
+ expected = s.copy()
+ expected['a'] = np.nan
+ assert_series_equal(result, expected)
+
+ # GH 4119, not converting a mixed type (e.g. floats and object)
+ s = Series([1, 'na', 3, 4])
+ result = s._convert(datetime=True, numeric=True)
+ expected = Series([1, np.nan, 3, 4])
+ assert_series_equal(result, expected)
+
+ s = Series([1, '', 3, 4])
+ result = s._convert(datetime=True, numeric=True)
+ assert_series_equal(result, expected)
+
+ # dates
+ s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0),
+ datetime(2001, 1, 3, 0, 0)])
+ s2 = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0),
+ datetime(2001, 1, 3, 0, 0), 'foo', 1.0, 1,
+ Timestamp('20010104'), '20010105'], dtype='O')
+
+ result = s._convert(datetime=True)
+ expected = Series([Timestamp('20010101'), Timestamp('20010102'),
+ Timestamp('20010103')], dtype='M8[ns]')
+ assert_series_equal(result, expected)
+
+ result = s._convert(datetime=True, coerce=True)
+ assert_series_equal(result, expected)
+
+ expected = Series([Timestamp('20010101'), Timestamp('20010102'),
+ Timestamp('20010103'), NaT, NaT, NaT,
+ Timestamp('20010104'), Timestamp('20010105')],
+ dtype='M8[ns]')
+ result = s2._convert(datetime=True, numeric=False, timedelta=False,
+ coerce=True)
+ assert_series_equal(result, expected)
+ result = s2._convert(datetime=True, coerce=True)
+ assert_series_equal(result, expected)
+
+ s = Series(['foo', 'bar', 1, 1.0], dtype='O')
+ result = s._convert(datetime=True, coerce=True)
+ expected = Series([NaT] * 2 + [Timestamp(1)] * 2)
+ assert_series_equal(result, expected)
+
+ # preserve if non-object
+ s = Series([1], dtype='float32')
+ result = s._convert(datetime=True, coerce=True)
+ assert_series_equal(result, s)
+
+ # r = s.copy()
+ # r[0] = np.nan
+ # result = r._convert(convert_dates=True,convert_numeric=False)
+ # assert result.dtype == 'M8[ns]'
+
+ # dateutil parses some single letters into today's value as a date
+ expected = Series([NaT])
+ for x in 'abcdefghijklmnopqrstuvwxyz':
+ s = Series([x])
+ result = s._convert(datetime=True, coerce=True)
+ assert_series_equal(result, expected)
+ s = Series([x.upper()])
+ result = s._convert(datetime=True, coerce=True)
+ assert_series_equal(result, expected)
+
+ def test_convert_no_arg_error(self):
+ s = Series(['1.0', '2'])
+ msg = r"At least one of datetime, numeric or timedelta must be True\."
+ with pytest.raises(ValueError, match=msg):
+ s._convert()
+
+ def test_convert_preserve_bool(self):
+ s = Series([1, True, 3, 5], dtype=object)
+ r = s._convert(datetime=True, numeric=True)
+ e = Series([1, 1, 3, 5], dtype='i8')
+ tm.assert_series_equal(r, e)
+
+ def test_convert_preserve_all_bool(self):
+ s = Series([False, True, False, False], dtype=object)
+ r = s._convert(datetime=True, numeric=True)
+ e = Series([False, True, False, False], dtype=bool)
+ tm.assert_series_equal(r, e)
+
+ def test_constructor_no_pandas_array(self):
+ ser = pd.Series([1, 2, 3])
+ result = pd.Series(ser.array)
+ tm.assert_series_equal(ser, result)
+ assert isinstance(result._data.blocks[0], IntBlock)
+
+ def test_from_array(self):
+ result = pd.Series(pd.array(['1H', '2H'], dtype='timedelta64[ns]'))
+ assert result._data.blocks[0].is_extension is False
+
+ result = pd.Series(pd.array(['2015'], dtype='datetime64[ns]'))
+ assert result._data.blocks[0].is_extension is False
+
+ def test_from_list_dtype(self):
+ result = pd.Series(['1H', '2H'], dtype='timedelta64[ns]')
+ assert result._data.blocks[0].is_extension is False
+
+ result = pd.Series(['2015'], dtype='datetime64[ns]')
+ assert result._data.blocks[0].is_extension is False
+
+
+def test_hasnans_uncached_for_series():
+ # GH#19700
+ idx = pd.Index([0, 1])
+ assert idx.hasnans is False
+ assert 'hasnans' in idx._cache
+ ser = idx.to_series()
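+ # unlike Index, Series does not cache hasnans, so the NaN assigned
+ # below is reflected immediately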
+ assert ser.hasnans is False
+ assert not hasattr(ser, '_cache')
+ ser.iloc[-1] = np.nan
+ assert ser.hasnans is True
+ assert Series.hasnans.__doc__ == pd.Index.hasnans.__doc__
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_io.py b/contrib/python/pandas/py2/pandas/tests/series/test_io.py
new file mode 100644
index 00000000000..5749b0c6551
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_io.py
@@ -0,0 +1,269 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+import collections
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO, u
+
+import pandas as pd
+from pandas import DataFrame, Series
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal, ensure_clean)
+
+from pandas.io.common import _get_handle
+
+
+class TestSeriesToCSV(object):
+
+ def read_csv(self, path, **kwargs):
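+ # round-trip helper: with header=None the CSV written by to_csv has
+ # no header row, so clear the name/index name to match the original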
+ params = dict(squeeze=True, index_col=0,
+ header=None, parse_dates=True)
+ params.update(**kwargs)
+
+ header = params.get("header")
+ out = pd.read_csv(path, **params)
+
+ if header is None:
+ out.name = out.index.name = None
+
+ return out
+
+ def test_from_csv_deprecation(self, datetime_series):
+ # see gh-17812
+ with ensure_clean() as path:
+ datetime_series.to_csv(path, header=False)
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ ts = self.read_csv(path)
+ depr_ts = Series.from_csv(path)
+ assert_series_equal(depr_ts, ts)
+
+ @pytest.mark.parametrize("arg", ["path", "header", "both"])
+ def test_to_csv_deprecation(self, arg, datetime_series):
+ # see gh-19715
+ with ensure_clean() as path:
+ if arg == "path":
+ kwargs = dict(path=path, header=False)
+ elif arg == "header":
+ kwargs = dict(path_or_buf=path)
+ else: # Both discrepancies match.
+ kwargs = dict(path=path)
+
+ with tm.assert_produces_warning(FutureWarning):
+ datetime_series.to_csv(**kwargs)
+
+ # Make sure roundtrip still works.
+ ts = self.read_csv(path)
+ assert_series_equal(datetime_series, ts, check_names=False)
+
+ def test_from_csv(self, datetime_series, string_series):
+
+ with ensure_clean() as path:
+ datetime_series.to_csv(path, header=False)
+ ts = self.read_csv(path)
+ assert_series_equal(datetime_series, ts, check_names=False)
+
+ assert ts.name is None
+ assert ts.index.name is None
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ depr_ts = Series.from_csv(path)
+ assert_series_equal(depr_ts, ts)
+
+ # see gh-10483
+ datetime_series.to_csv(path, header=True)
+ ts_h = self.read_csv(path, header=0)
+ assert ts_h.name == "ts"
+
+ string_series.to_csv(path, header=False)
+ series = self.read_csv(path)
+ assert_series_equal(string_series, series, check_names=False)
+
+ assert series.name is None
+ assert series.index.name is None
+
+ string_series.to_csv(path, header=True)
+ series_h = self.read_csv(path, header=0)
+ assert series_h.name == "series"
+
+ with open(path, "w") as outfile:
+ outfile.write("1998-01-01|1.0\n1999-01-01|2.0")
+
+ series = self.read_csv(path, sep="|")
+ check_series = Series({datetime(1998, 1, 1): 1.0,
+ datetime(1999, 1, 1): 2.0})
+ assert_series_equal(check_series, series)
+
+ series = self.read_csv(path, sep="|", parse_dates=False)
+ check_series = Series({"1998-01-01": 1.0, "1999-01-01": 2.0})
+ assert_series_equal(check_series, series)
+
+ def test_to_csv(self, datetime_series):
+ import io
+
+ with ensure_clean() as path:
+ datetime_series.to_csv(path, header=False)
+
+ with io.open(path, newline=None) as f:
+ lines = f.readlines()
+ assert (lines[1] != '\n')
+
+ datetime_series.to_csv(path, index=False, header=False)
+ arr = np.loadtxt(path)
+ assert_almost_equal(arr, datetime_series.values)
+
+ def test_to_csv_unicode_index(self):
+ buf = StringIO()
+ s = Series([u("\u05d0"), "d2"], index=[u("\u05d0"), u("\u05d1")])
+
+ s.to_csv(buf, encoding="UTF-8", header=False)
+ buf.seek(0)
+
+ s2 = self.read_csv(buf, index_col=0, encoding="UTF-8")
+ assert_series_equal(s, s2)
+
+ def test_to_csv_float_format(self):
+
+ with ensure_clean() as filename:
+ ser = Series([0.123456, 0.234567, 0.567567])
+ ser.to_csv(filename, float_format="%.2f", header=False)
+
+ rs = self.read_csv(filename)
+ xp = Series([0.12, 0.23, 0.57])
+ assert_series_equal(rs, xp)
+
+ def test_to_csv_list_entries(self):
+ s = Series(['jack and jill', 'jesse and frank'])
+
+ split = s.str.split(r'\s+and\s+')
+
+ buf = StringIO()
+ split.to_csv(buf, header=False)
+
+ def test_to_csv_path_is_none(self):
+ # GH 8215
+ # Series.to_csv() was returning None, inconsistent with
+ # DataFrame.to_csv() which returned string
+ s = Series([1, 2, 3])
+ csv_str = s.to_csv(path_or_buf=None, header=False)
+ assert isinstance(csv_str, str)
+
+ @pytest.mark.parametrize('s,encoding', [
+ (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
+ name='X'), None),
+ # GH 21241, 21118
+ (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
+ (Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
+ (Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')
+ ])
+ def test_to_csv_compression(self, s, encoding, compression):
+
+ with ensure_clean() as filename:
+
+ s.to_csv(filename, compression=compression, encoding=encoding,
+ header=True)
+ # test the round trip - to_csv -> read_csv
+ result = pd.read_csv(filename, compression=compression,
+ encoding=encoding, index_col=0, squeeze=True)
+ assert_series_equal(s, result)
+
+ # test the round trip using file handle - to_csv -> read_csv
+ f, _handles = _get_handle(filename, 'w', compression=compression,
+ encoding=encoding)
+ with f:
+ s.to_csv(f, encoding=encoding, header=True)
+ result = pd.read_csv(filename, compression=compression,
+ encoding=encoding, index_col=0, squeeze=True)
+ assert_series_equal(s, result)
+
+ # explicitly ensure file was compressed
+ with tm.decompress_file(filename, compression) as fh:
+ text = fh.read().decode(encoding or 'utf8')
+ assert s.name in text
+
+ with tm.decompress_file(filename, compression) as fh:
+ assert_series_equal(s, pd.read_csv(fh,
+ index_col=0,
+ squeeze=True,
+ encoding=encoding))
+
+
+class TestSeriesIO(object):
+
+ def test_to_frame(self, datetime_series):
+ datetime_series.name = None
+ rs = datetime_series.to_frame()
+ xp = pd.DataFrame(datetime_series.values, index=datetime_series.index)
+ assert_frame_equal(rs, xp)
+
+ datetime_series.name = 'testname'
+ rs = datetime_series.to_frame()
+ xp = pd.DataFrame(dict(testname=datetime_series.values),
+ index=datetime_series.index)
+ assert_frame_equal(rs, xp)
+
+ rs = datetime_series.to_frame(name='testdifferent')
+ xp = pd.DataFrame(dict(testdifferent=datetime_series.values),
+ index=datetime_series.index)
+ assert_frame_equal(rs, xp)
+
+ def test_timeseries_periodindex(self):
+ # GH2891
+ from pandas import period_range
+ prng = period_range('1/1/2011', '1/1/2012', freq='M')
+ ts = Series(np.random.randn(len(prng)), prng)
+ new_ts = tm.round_trip_pickle(ts)
+ assert new_ts.index.freq == 'M'
+
+ def test_pickle_preserve_name(self):
+ for n in [777, 777., 'name', datetime(2001, 11, 11), (1, 2)]:
+ unpickled = self._pickle_roundtrip_name(tm.makeTimeSeries(name=n))
+ assert unpickled.name == n
+
+ def _pickle_roundtrip_name(self, obj):
+
+ with ensure_clean() as path:
+ obj.to_pickle(path)
+ unpickled = pd.read_pickle(path)
+ return unpickled
+
+ def test_to_frame_expanddim(self):
+ # GH 9762
+
+ class SubclassedSeries(Series):
+
+ @property
+ def _constructor_expanddim(self):
+ return SubclassedFrame
+
+ class SubclassedFrame(DataFrame):
+ pass
+
+ s = SubclassedSeries([1, 2, 3], name='X')
+ result = s.to_frame()
+ assert isinstance(result, SubclassedFrame)
+ expected = SubclassedFrame({'X': [1, 2, 3]})
+ assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('mapping', (
+ dict,
+ collections.defaultdict(list),
+ collections.OrderedDict))
+ def test_to_dict(self, mapping, datetime_series):
+ # GH16122
+ tm.assert_series_equal(
+ Series(datetime_series.to_dict(mapping), name='ts'),
+ datetime_series)
+ from_method = Series(datetime_series.to_dict(collections.Counter))
+ from_constructor = Series(
+ collections.Counter(datetime_series.iteritems()))
+ tm.assert_series_equal(from_method, from_constructor)
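+
+ # Note: to_dict(into=...) accepts either a Mapping subclass or an
+ # initialized instance, e.g.
+ # s.to_dict(collections.OrderedDict) # class
+ # s.to_dict(collections.defaultdict(list)) # instance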
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_missing.py b/contrib/python/pandas/py2/pandas/tests/series/test_missing.py
new file mode 100644
index 00000000000..985288c4399
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_missing.py
@@ -0,0 +1,1374 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime, timedelta
+from distutils.version import LooseVersion
+
+import numpy as np
+from numpy import nan
+import pytest
+import pytz
+
+from pandas._libs.tslib import iNaT
+from pandas.compat import range
+from pandas.errors import PerformanceWarning
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Index, IntervalIndex, MultiIndex, NaT, Series,
+ Timestamp, date_range, isna)
+from pandas.core.series import remove_na
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+try:
+ import scipy
+ _is_scipy_ge_0190 = (LooseVersion(scipy.__version__) >=
+ LooseVersion('0.19.0'))
+except ImportError:
+ _is_scipy_ge_0190 = False
+
+
+def _skip_if_no_pchip():
+ try:
+ from scipy.interpolate import pchip_interpolate # noqa
+ except ImportError:
+ pytest.skip('scipy.interpolate.pchip missing')
+
+
+def _skip_if_no_akima():
+ try:
+ from scipy.interpolate import Akima1DInterpolator # noqa
+ except ImportError:
+ pytest.skip('scipy.interpolate.Akima1DInterpolator missing')
+
+
+def _simple_ts(start, end, freq='D'):
+ rng = date_range(start, end, freq=freq)
+ return Series(np.random.randn(len(rng)), index=rng)
+
+
+class TestSeriesMissingData(object):
+
+ def test_remove_na_deprecation(self):
+ # see gh-16971
+ with tm.assert_produces_warning(FutureWarning):
+ remove_na(Series([]))
+
+ def test_timedelta_fillna(self):
+ # GH 3371
+ s = Series([Timestamp('20130101'), Timestamp('20130101'),
+ Timestamp('20130102'), Timestamp('20130103 9:01:01')])
+ td = s.diff()
+
+ # reg fillna
+ with tm.assert_produces_warning(FutureWarning):
+ result = td.fillna(0)
+ expected = Series([timedelta(0), timedelta(0), timedelta(1),
+ timedelta(days=1, seconds=9 * 3600 + 60 + 1)])
+ assert_series_equal(result, expected)
+
+ # interpreted as seconds, deprecated
+ with tm.assert_produces_warning(FutureWarning):
+ result = td.fillna(1)
+ expected = Series([timedelta(seconds=1),
+ timedelta(0), timedelta(1),
+ timedelta(days=1, seconds=9 * 3600 + 60 + 1)])
+ assert_series_equal(result, expected)
+
+ result = td.fillna(timedelta(days=1, seconds=1))
+ expected = Series([timedelta(days=1, seconds=1), timedelta(0),
+ timedelta(1),
+ timedelta(days=1, seconds=9 * 3600 + 60 + 1)])
+ assert_series_equal(result, expected)
+
+ result = td.fillna(np.timedelta64(int(1e9)))
+ expected = Series([timedelta(seconds=1), timedelta(0), timedelta(1),
+ timedelta(days=1, seconds=9 * 3600 + 60 + 1)])
+ assert_series_equal(result, expected)
+
+ result = td.fillna(NaT)
+ expected = Series([NaT, timedelta(0), timedelta(1),
+ timedelta(days=1, seconds=9 * 3600 + 60 + 1)],
+ dtype='m8[ns]')
+ assert_series_equal(result, expected)
+
+ # ffill
+ td[2] = np.nan
+ result = td.ffill()
+ with tm.assert_produces_warning(FutureWarning):
+ expected = td.fillna(0)
+ expected[0] = np.nan
+ assert_series_equal(result, expected)
+
+ # bfill
+ td[2] = np.nan
+ result = td.bfill()
+ with tm.assert_produces_warning(FutureWarning):
+ expected = td.fillna(0)
+ expected[2] = timedelta(days=1, seconds=9 * 3600 + 60 + 1)
+ assert_series_equal(result, expected)
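+
+ # Note: the FutureWarnings above come from passing bare integers to
+ # fillna on a timedelta64 Series; the explicit timedelta(...),
+ # np.timedelta64(...) and NaT spellings are the supported ones.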
+
+ def test_datetime64_fillna(self):
+
+ s = Series([Timestamp('20130101'), Timestamp('20130101'), Timestamp(
+ '20130102'), Timestamp('20130103 9:01:01')])
+ s[2] = np.nan
+
+ # reg fillna
+ result = s.fillna(Timestamp('20130104'))
+ expected = Series([Timestamp('20130101'), Timestamp(
+ '20130101'), Timestamp('20130104'), Timestamp('20130103 9:01:01')])
+ assert_series_equal(result, expected)
+
+ result = s.fillna(NaT)
+ expected = s
+ assert_series_equal(result, expected)
+
+ # ffill
+ result = s.ffill()
+ expected = Series([Timestamp('20130101'), Timestamp(
+ '20130101'), Timestamp('20130101'), Timestamp('20130103 9:01:01')])
+ assert_series_equal(result, expected)
+
+ # bfill
+ result = s.bfill()
+ expected = Series([Timestamp('20130101'), Timestamp('20130101'),
+ Timestamp('20130103 9:01:01'), Timestamp(
+ '20130103 9:01:01')])
+ assert_series_equal(result, expected)
+
+ # GH 6587
+ # make sure that we treat the values as integers when filling;
+ # this also tests inference of a datetime-like with NaT's
+ s = Series([pd.NaT, pd.NaT, '2013-08-05 15:30:00.000001'])
+ expected = Series(
+ ['2013-08-05 15:30:00.000001', '2013-08-05 15:30:00.000001',
+ '2013-08-05 15:30:00.000001'], dtype='M8[ns]')
+ result = s.fillna(method='backfill')
+ assert_series_equal(result, expected)
+
+ def test_datetime64_tz_fillna(self):
+
+ for tz in ['US/Eastern', 'Asia/Tokyo']:
+ # DatetimeBlock
+ s = Series([Timestamp('2011-01-01 10:00'), pd.NaT,
+ Timestamp('2011-01-03 10:00'), pd.NaT])
+ null_loc = pd.Series([False, True, False, True])
+
+ result = s.fillna(pd.Timestamp('2011-01-02 10:00'))
+ expected = Series([Timestamp('2011-01-01 10:00'),
+ Timestamp('2011-01-02 10:00'),
+ Timestamp('2011-01-03 10:00'),
+ Timestamp('2011-01-02 10:00')])
+ tm.assert_series_equal(expected, result)
+ # check s is not changed
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ result = s.fillna(pd.Timestamp('2011-01-02 10:00', tz=tz))
+ expected = Series([Timestamp('2011-01-01 10:00'),
+ Timestamp('2011-01-02 10:00', tz=tz),
+ Timestamp('2011-01-03 10:00'),
+ Timestamp('2011-01-02 10:00', tz=tz)])
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ result = s.fillna('AAA')
+ expected = Series([Timestamp('2011-01-01 10:00'), 'AAA',
+ Timestamp('2011-01-03 10:00'), 'AAA'],
+ dtype=object)
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ result = s.fillna({1: pd.Timestamp('2011-01-02 10:00', tz=tz),
+ 3: pd.Timestamp('2011-01-04 10:00')})
+ expected = Series([Timestamp('2011-01-01 10:00'),
+ Timestamp('2011-01-02 10:00', tz=tz),
+ Timestamp('2011-01-03 10:00'),
+ Timestamp('2011-01-04 10:00')])
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ result = s.fillna({1: pd.Timestamp('2011-01-02 10:00'),
+ 3: pd.Timestamp('2011-01-04 10:00')})
+ expected = Series([Timestamp('2011-01-01 10:00'),
+ Timestamp('2011-01-02 10:00'),
+ Timestamp('2011-01-03 10:00'),
+ Timestamp('2011-01-04 10:00')])
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ # DatetimeBlockTZ
+ idx = pd.DatetimeIndex(['2011-01-01 10:00', pd.NaT,
+ '2011-01-03 10:00', pd.NaT], tz=tz)
+ s = pd.Series(idx)
+ assert s.dtype == 'datetime64[ns, {0}]'.format(tz)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ result = s.fillna(pd.Timestamp('2011-01-02 10:00'))
+ expected = Series([Timestamp('2011-01-01 10:00', tz=tz),
+ Timestamp('2011-01-02 10:00'),
+ Timestamp('2011-01-03 10:00', tz=tz),
+ Timestamp('2011-01-02 10:00')])
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ result = s.fillna(pd.Timestamp('2011-01-02 10:00', tz=tz))
+ idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-02 10:00',
+ '2011-01-03 10:00', '2011-01-02 10:00'],
+ tz=tz)
+ expected = Series(idx)
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ result = s.fillna(pd.Timestamp('2011-01-02 10:00',
+ tz=tz).to_pydatetime())
+ idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-02 10:00',
+ '2011-01-03 10:00', '2011-01-02 10:00'],
+ tz=tz)
+ expected = Series(idx)
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ result = s.fillna('AAA')
+ expected = Series([Timestamp('2011-01-01 10:00', tz=tz), 'AAA',
+ Timestamp('2011-01-03 10:00', tz=tz), 'AAA'],
+ dtype=object)
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ result = s.fillna({1: pd.Timestamp('2011-01-02 10:00', tz=tz),
+ 3: pd.Timestamp('2011-01-04 10:00')})
+ expected = Series([Timestamp('2011-01-01 10:00', tz=tz),
+ Timestamp('2011-01-02 10:00', tz=tz),
+ Timestamp('2011-01-03 10:00', tz=tz),
+ Timestamp('2011-01-04 10:00')])
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ result = s.fillna({1: pd.Timestamp('2011-01-02 10:00', tz=tz),
+ 3: pd.Timestamp('2011-01-04 10:00', tz=tz)})
+ expected = Series([Timestamp('2011-01-01 10:00', tz=tz),
+ Timestamp('2011-01-02 10:00', tz=tz),
+ Timestamp('2011-01-03 10:00', tz=tz),
+ Timestamp('2011-01-04 10:00', tz=tz)])
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ # filling with a naive/other zone, coerce to object
+ result = s.fillna(Timestamp('20130101'))
+ expected = Series([Timestamp('2011-01-01 10:00', tz=tz),
+ Timestamp('2013-01-01'),
+ Timestamp('2011-01-03 10:00', tz=tz),
+ Timestamp('2013-01-01')])
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ result = s.fillna(Timestamp('20130101', tz='US/Pacific'))
+ expected = Series([Timestamp('2011-01-01 10:00', tz=tz),
+ Timestamp('2013-01-01', tz='US/Pacific'),
+ Timestamp('2011-01-03 10:00', tz=tz),
+ Timestamp('2013-01-01', tz='US/Pacific')])
+ tm.assert_series_equal(expected, result)
+ tm.assert_series_equal(pd.isna(s), null_loc)
+
+ # with timezone
+ # GH 15855
+ ser = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT])
+ exp = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'),
+ pd.Timestamp('2012-11-11 00:00:00+01:00')])
+ assert_series_equal(ser.fillna(method='pad'), exp)
+
+ ser = pd.Series([pd.NaT, pd.Timestamp('2012-11-11 00:00:00+01:00')])
+ exp = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'),
+ pd.Timestamp('2012-11-11 00:00:00+01:00')])
+ assert_series_equal(ser.fillna(method='bfill'), exp)
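+
+ # Note on the rule exercised above: filling a tz-aware block with a
+ # value in the same tz keeps the datetime64[ns, tz] dtype, while a
+ # naive Timestamp or a different tz coerces the result to object.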
+
+ def test_fillna_consistency(self):
+ # GH 16402
+ # fillna with a tz aware to a tz-naive, should result in object
+
+ s = Series([Timestamp('20130101'), pd.NaT])
+
+ result = s.fillna(Timestamp('20130101', tz='US/Eastern'))
+ expected = Series([Timestamp('20130101'),
+ Timestamp('2013-01-01', tz='US/Eastern')],
+ dtype='object')
+ assert_series_equal(result, expected)
+
+ # where (with and without the ignored errors= kwarg)
+ result = s.where([True, False],
+ Timestamp('20130101', tz='US/Eastern'))
+ assert_series_equal(result, expected)
+
+ result = s.where([True, False],
+ Timestamp('20130101', tz='US/Eastern'),
+ errors='ignore')
+ assert_series_equal(result, expected)
+
+ # with a non-datetime
+ result = s.fillna('foo')
+ expected = Series([Timestamp('20130101'),
+ 'foo'])
+ assert_series_equal(result, expected)
+
+ # assignment
+ s2 = s.copy()
+ s2[1] = 'foo'
+ assert_series_equal(s2, expected)
+
+ def test_datetime64tz_fillna_round_issue(self):
+ # GH 14872
+
+ data = pd.Series([pd.NaT, pd.NaT,
+ datetime(2016, 12, 12, 22, 24, 6, 100001,
+ tzinfo=pytz.utc)])
+
+ filled = data.fillna(method='bfill')
+
+ expected = pd.Series([datetime(2016, 12, 12, 22, 24, 6,
+ 100001, tzinfo=pytz.utc),
+ datetime(2016, 12, 12, 22, 24, 6,
+ 100001, tzinfo=pytz.utc),
+ datetime(2016, 12, 12, 22, 24, 6,
+ 100001, tzinfo=pytz.utc)])
+
+ assert_series_equal(filled, expected)
+
+ def test_fillna_downcast(self):
+ # GH 15277
+ # infer int64 from float64
+ s = pd.Series([1., np.nan])
+ result = s.fillna(0, downcast='infer')
+ expected = pd.Series([1, 0])
+ assert_series_equal(result, expected)
+
+ # infer int64 from float64 when fillna value is a dict
+ s = pd.Series([1., np.nan])
+ result = s.fillna({1: 0}, downcast='infer')
+ expected = pd.Series([1, 0])
+ assert_series_equal(result, expected)
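+
+ # Note: without downcast='infer' both results above would stay
+ # float64; 'infer' re-checks whether the filled values fit int64 and
+ # downcasts only when that is lossless.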
+
+ def test_fillna_int(self):
+ s = Series(np.random.randint(-100, 100, 50))
+ s.fillna(method='ffill', inplace=True)
+ assert_series_equal(s.fillna(method='ffill', inplace=False), s)
+
+ def test_fillna_raise(self):
+ s = Series(np.random.randint(-100, 100, 50))
+ msg = ('"value" parameter must be a scalar or dict, but you passed a'
+ ' "list"')
+ with pytest.raises(TypeError, match=msg):
+ s.fillna([1, 2])
+
+ msg = ('"value" parameter must be a scalar or dict, but you passed a'
+ ' "tuple"')
+ with pytest.raises(TypeError, match=msg):
+ s.fillna((1, 2))
+
+ # related GH 9217, make sure limit is an int and greater than 0
+ s = Series([1, 2, 3, None])
+ msg = (r"Cannot specify both 'value' and 'method'\.|"
+ r"Limit must be greater than 0|"
+ "Limit must be an integer")
+ for limit in [-1, 0, 1., 2.]:
+ for method in ['backfill', 'bfill', 'pad', 'ffill', None]:
+ with pytest.raises(ValueError, match=msg):
+ s.fillna(1, limit=limit, method=method)
+
+ def test_categorical_nan_equality(self):
+ cat = Series(Categorical(["a", "b", "c", np.nan]))
+ exp = Series([True, True, True, False])
+ res = (cat == cat)
+ tm.assert_series_equal(res, exp)
+
+ def test_categorical_nan_handling(self):
+
+ # NaNs are represented as -1 in labels
+ s = Series(Categorical(["a", "b", np.nan, "a"]))
+ tm.assert_index_equal(s.cat.categories, Index(["a", "b"]))
+ tm.assert_numpy_array_equal(s.values.codes,
+ np.array([0, 1, -1, 0], dtype=np.int8))
+
+ @pytest.mark.parametrize('fill_value, expected_output', [
+ ('a', ['a', 'a', 'b', 'a', 'a']),
+ ({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']),
+ ({1: 'a'}, ['a', 'a', 'b', np.nan, np.nan]),
+ ({1: 'a', 3: 'b'}, ['a', 'a', 'b', 'b', np.nan]),
+ (Series('a'), ['a', np.nan, 'b', np.nan, np.nan]),
+ (Series('a', index=[1]), ['a', 'a', 'b', np.nan, np.nan]),
+ (Series({1: 'a', 3: 'b'}), ['a', 'a', 'b', 'b', np.nan]),
+ (Series(['a', 'b'], index=[3, 4]), ['a', np.nan, 'b', 'a', 'b'])
+ ])
+ def test_fillna_categorical(self, fill_value, expected_output):
+ # GH 17033
+ # Test fillna for a Categorical series
+ data = ['a', np.nan, 'b', np.nan, np.nan]
+ s = Series(Categorical(data, categories=['a', 'b']))
+ exp = Series(Categorical(expected_output, categories=['a', 'b']))
+ tm.assert_series_equal(s.fillna(fill_value), exp)
+
+ def test_fillna_categorical_raise(self):
+ data = ['a', np.nan, 'b', np.nan, np.nan]
+ s = Series(Categorical(data, categories=['a', 'b']))
+
+ with pytest.raises(ValueError,
+ match="fill value must be in categories"):
+ s.fillna('d')
+
+ with pytest.raises(ValueError,
+ match="fill value must be in categories"):
+ s.fillna(Series('d'))
+
+ with pytest.raises(ValueError,
+ match="fill value must be in categories"):
+ s.fillna({1: 'd', 3: 'a'})
+
+ msg = ('"value" parameter must be a scalar or '
+ 'dict, but you passed a "list"')
+ with pytest.raises(TypeError, match=msg):
+ s.fillna(['a', 'b'])
+
+ msg = ('"value" parameter must be a scalar or '
+ 'dict, but you passed a "tuple"')
+ with pytest.raises(TypeError, match=msg):
+ s.fillna(('a', 'b'))
+
+ msg = ('"value" parameter must be a scalar, dict '
+ 'or Series, but you passed a "DataFrame"')
+ with pytest.raises(TypeError, match=msg):
+ s.fillna(DataFrame({1: ['a'], 3: ['b']}))
+
+ def test_fillna_nat(self):
+ series = Series([0, 1, 2, iNaT], dtype='M8[ns]')
+
+ filled = series.fillna(method='pad')
+ filled2 = series.fillna(value=series.values[2])
+
+ expected = series.copy()
+ expected.values[3] = expected.values[2]
+
+ assert_series_equal(filled, expected)
+ assert_series_equal(filled2, expected)
+
+ df = DataFrame({'A': series})
+ filled = df.fillna(method='pad')
+ filled2 = df.fillna(value=series.values[2])
+ expected = DataFrame({'A': expected})
+ assert_frame_equal(filled, expected)
+ assert_frame_equal(filled2, expected)
+
+ series = Series([iNaT, 0, 1, 2], dtype='M8[ns]')
+
+ filled = series.fillna(method='bfill')
+ filled2 = series.fillna(value=series[1])
+
+ expected = series.copy()
+ expected[0] = expected[1]
+
+ assert_series_equal(filled, expected)
+ assert_series_equal(filled2, expected)
+
+ df = DataFrame({'A': series})
+ filled = df.fillna(method='bfill')
+ filled2 = df.fillna(value=series[1])
+ expected = DataFrame({'A': expected})
+ assert_frame_equal(filled, expected)
+ assert_frame_equal(filled2, expected)
+
+ def test_isna_for_inf(self):
+ s = Series(['a', np.inf, np.nan, 1.0])
+ with pd.option_context('mode.use_inf_as_na', True):
+ r = s.isna()
+ dr = s.dropna()
+ e = Series([False, True, True, False])
+ de = Series(['a', 1.0], index=[0, 3])
+ tm.assert_series_equal(r, e)
+ tm.assert_series_equal(dr, de)
+
+ def test_isnull_for_inf_deprecated(self):
+ # gh-17115
+ s = Series(['a', np.inf, np.nan, 1.0])
+ with pd.option_context('mode.use_inf_as_null', True):
+ r = s.isna()
+ dr = s.dropna()
+
+ e = Series([False, True, True, False])
+ de = Series(['a', 1.0], index=[0, 3])
+ tm.assert_series_equal(r, e)
+ tm.assert_series_equal(dr, de)
+
+ def test_fillna(self, datetime_series):
+ ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5))
+
+ tm.assert_series_equal(ts, ts.fillna(method='ffill'))
+
+ ts[2] = np.NaN
+
+ exp = Series([0., 1., 1., 3., 4.], index=ts.index)
+ tm.assert_series_equal(ts.fillna(method='ffill'), exp)
+
+ exp = Series([0., 1., 3., 3., 4.], index=ts.index)
+ tm.assert_series_equal(ts.fillna(method='backfill'), exp)
+
+ exp = Series([0., 1., 5., 3., 4.], index=ts.index)
+ tm.assert_series_equal(ts.fillna(value=5), exp)
+
+ msg = "Must specify a fill 'value' or 'method'"
+ with pytest.raises(ValueError, match=msg):
+ ts.fillna()
+
+ msg = "Cannot specify both 'value' and 'method'"
+ with pytest.raises(ValueError, match=msg):
+ datetime_series.fillna(value=0, method='ffill')
+
+ # GH 5703
+ s1 = Series([np.nan])
+ s2 = Series([1])
+ result = s1.fillna(s2)
+ expected = Series([1.])
+ assert_series_equal(result, expected)
+ result = s1.fillna({})
+ assert_series_equal(result, s1)
+ result = s1.fillna(Series(()))
+ assert_series_equal(result, s1)
+ result = s2.fillna(s1)
+ assert_series_equal(result, s2)
+ result = s1.fillna({0: 1})
+ assert_series_equal(result, expected)
+ result = s1.fillna({1: 1})
+ assert_series_equal(result, Series([np.nan]))
+ result = s1.fillna({0: 1, 1: 1})
+ assert_series_equal(result, expected)
+ result = s1.fillna(Series({0: 1, 1: 1}))
+ assert_series_equal(result, expected)
+ result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5]))
+ assert_series_equal(result, s1)
+
+ s1 = Series([0, 1, 2], list('abc'))
+ s2 = Series([0, np.nan, 2], list('bac'))
+ result = s2.fillna(s1)
+ expected = Series([0, 0, 2.], list('bac'))
+ assert_series_equal(result, expected)
+
+ # limit
+ s = Series(np.nan, index=[0, 1, 2])
+ result = s.fillna(999, limit=1)
+ expected = Series([999, np.nan, np.nan], index=[0, 1, 2])
+ assert_series_equal(result, expected)
+
+ result = s.fillna(999, limit=2)
+ expected = Series([999, 999, np.nan], index=[0, 1, 2])
+ assert_series_equal(result, expected)
+
+ # GH 9043
+ # make sure a string representation of int/float values can be filled
+ # correctly without raising errors or being converted
+ vals = ['0', '1.5', '-0.3']
+ for val in vals:
+ s = Series([0, 1, np.nan, np.nan, 4], dtype='float64')
+ result = s.fillna(val)
+ expected = Series([0, 1, val, val, 4], dtype='object')
+ assert_series_equal(result, expected)
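+
+ # Note: filling with a Series or dict aligns on index labels, which
+ # is why s1.fillna({1: 1}) above leaves the NaN at label 0 untouched.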
+
+ def test_fillna_bug(self):
+ x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd'])
+ filled = x.fillna(method='ffill')
+ expected = Series([nan, 1., 1., 3., 3.], x.index)
+ assert_series_equal(filled, expected)
+
+ filled = x.fillna(method='bfill')
+ expected = Series([1., 1., 3., 3., nan], x.index)
+ assert_series_equal(filled, expected)
+
+ def test_fillna_inplace(self):
+ x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd'])
+ y = x.copy()
+
+ y.fillna(value=0, inplace=True)
+
+ expected = x.fillna(value=0)
+ assert_series_equal(y, expected)
+
+ def test_fillna_invalid_method(self, datetime_series):
+ try:
+ datetime_series.fillna(method='ffil')
+ except ValueError as inst:
+ assert 'ffil' in str(inst)
+
+ def test_ffill(self):
+ ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5))
+ ts[2] = np.NaN
+ assert_series_equal(ts.ffill(), ts.fillna(method='ffill'))
+
+ def test_ffill_mixed_dtypes_without_missing_data(self):
+ # GH14956
+ series = pd.Series([datetime(2015, 1, 1, tzinfo=pytz.utc), 1])
+ result = series.ffill()
+ assert_series_equal(series, result)
+
+ def test_bfill(self):
+ ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5))
+ ts[2] = np.NaN
+ assert_series_equal(ts.bfill(), ts.fillna(method='bfill'))
+
+ def test_timedelta64_nan(self):
+
+ td = Series([timedelta(days=i) for i in range(10)])
+
+ # nan ops on timedeltas
+ td1 = td.copy()
+ td1[0] = np.nan
+ assert isna(td1[0])
+ assert td1[0].value == iNaT
+ td1[0] = td[0]
+ assert not isna(td1[0])
+
+ td1[1] = iNaT
+ assert isna(td1[1])
+ assert td1[1].value == iNaT
+ td1[1] = td[1]
+ assert not isna(td1[1])
+
+ td1[2] = NaT
+ assert isna(td1[2])
+ assert td1[2].value == iNaT
+ td1[2] = td[2]
+ assert not isna(td1[2])
+
+ # boolean setting
+ # this doesn't work, not sure numpy even supports it
+ # result = td[(td>np.timedelta64(timedelta(days=3))) &
+ # td<np.timedelta64(timedelta(days=7)))] = np.nan
+ # assert isna(result).sum() == 7
+
+ # NumPy limitation =(
+
+ # def test_logical_range_select(self):
+ # np.random.seed(12345)
+ # selector = -0.5 <= datetime_series <= 0.5
+ # expected = (datetime_series >= -0.5) & (datetime_series <= 0.5)
+ # assert_series_equal(selector, expected)
+
+ def test_dropna_empty(self):
+ s = Series([])
+ assert len(s.dropna()) == 0
+ s.dropna(inplace=True)
+ assert len(s) == 0
+
+ # invalid axis
+ msg = r"No axis named 1 for object type <(class|type) 'type'>"
+ with pytest.raises(ValueError, match=msg):
+ s.dropna(axis=1)
+
+ def test_datetime64_tz_dropna(self):
+ # DatetimeBlock
+ s = Series([Timestamp('2011-01-01 10:00'), pd.NaT, Timestamp(
+ '2011-01-03 10:00'), pd.NaT])
+ result = s.dropna()
+ expected = Series([Timestamp('2011-01-01 10:00'),
+ Timestamp('2011-01-03 10:00')], index=[0, 2])
+ tm.assert_series_equal(result, expected)
+
+ # DatetimeBlockTZ
+ idx = pd.DatetimeIndex(['2011-01-01 10:00', pd.NaT,
+ '2011-01-03 10:00', pd.NaT],
+ tz='Asia/Tokyo')
+ s = pd.Series(idx)
+ assert s.dtype == 'datetime64[ns, Asia/Tokyo]'
+ result = s.dropna()
+ expected = Series([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
+ Timestamp('2011-01-03 10:00', tz='Asia/Tokyo')],
+ index=[0, 2])
+ assert result.dtype == 'datetime64[ns, Asia/Tokyo]'
+ tm.assert_series_equal(result, expected)
+
+ def test_dropna_no_nan(self):
+ for s in [Series([1, 2, 3], name='x'), Series(
+ [False, True, False], name='x')]:
+
+ result = s.dropna()
+ tm.assert_series_equal(result, s)
+ assert result is not s
+
+ s2 = s.copy()
+ s2.dropna(inplace=True)
+ tm.assert_series_equal(s2, s)
+
+ def test_dropna_intervals(self):
+ s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays(
+ [np.nan, 0, 1, 2],
+ [np.nan, 1, 2, 3]))
+
+ result = s.dropna()
+ expected = s.iloc[1:]
+ assert_series_equal(result, expected)
+
+ def test_valid(self, datetime_series):
+ ts = datetime_series.copy()
+ ts[::2] = np.NaN
+
+ result = ts.dropna()
+ assert len(result) == ts.count()
+ tm.assert_series_equal(result, ts[1::2])
+ tm.assert_series_equal(result, ts[pd.notna(ts)])
+
+ def test_isna(self):
+ ser = Series([0, 5.4, 3, nan, -0.001])
+ expected = Series([False, False, False, True, False])
+ tm.assert_series_equal(ser.isna(), expected)
+
+ ser = Series(["hi", "", nan])
+ expected = Series([False, False, True])
+ tm.assert_series_equal(ser.isna(), expected)
+
+ def test_notna(self):
+ ser = Series([0, 5.4, 3, nan, -0.001])
+ expected = Series([True, True, True, False, True])
+ tm.assert_series_equal(ser.notna(), expected)
+
+ ser = Series(["hi", "", nan])
+ expected = Series([True, True, False])
+ tm.assert_series_equal(ser.notna(), expected)
+
+ def test_pad_nan(self):
+ x = Series([np.nan, 1., np.nan, 3., np.nan], ['z', 'a', 'b', 'c', 'd'],
+ dtype=float)
+
+ x.fillna(method='pad', inplace=True)
+
+ expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0],
+ ['z', 'a', 'b', 'c', 'd'], dtype=float)
+ assert_series_equal(x[1:], expected[1:])
+ assert np.isnan(x[0]) and np.isnan(expected[0])
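+
+ # Note: x[0] stays NaN because 'pad' (ffill) has no earlier
+ # observation to propagate forward.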
+
+ def test_pad_require_monotonicity(self):
+ rng = date_range('1/1/2000', '3/1/2000', freq='B')
+
+ # neither monotonic increasing nor decreasing
+ rng2 = rng[[1, 0, 2]]
+
+ msg = "index must be monotonic increasing or decreasing"
+ with pytest.raises(ValueError, match=msg):
+ rng2.get_indexer(rng, method='pad')
+
+ def test_dropna_preserve_name(self, datetime_series):
+ datetime_series[:5] = np.nan
+ result = datetime_series.dropna()
+ assert result.name == datetime_series.name
+ name = datetime_series.name
+ ts = datetime_series.copy()
+ ts.dropna(inplace=True)
+ assert ts.name == name
+
+ def test_fill_value_when_combine_const(self):
+ # GH12723
+ s = Series([0, 1, np.nan, 3, 4, 5])
+
+ exp = s.fillna(0).add(2)
+ res = s.add(2, fill_value=0)
+ assert_series_equal(res, exp)
+
+ def test_series_fillna_limit(self):
+ index = np.arange(10)
+ s = Series(np.random.randn(10), index=index)
+
+ result = s[:2].reindex(index)
+ result = result.fillna(method='pad', limit=5)
+
+ expected = s[:2].reindex(index).fillna(method='pad')
+ expected[-3:] = np.nan
+ assert_series_equal(result, expected)
+
+ result = s[-2:].reindex(index)
+ result = result.fillna(method='bfill', limit=5)
+
+ expected = s[-2:].reindex(index).fillna(method='backfill')
+ expected[:3] = np.nan
+ assert_series_equal(result, expected)
+
+ def test_sparse_series_fillna_limit(self):
+ index = np.arange(10)
+ s = Series(np.random.randn(10), index=index)
+
+ ss = s[:2].reindex(index).to_sparse()
+ # TODO: what is this test doing? why are result and expected
+ # the same call to fillna?
+ with tm.assert_produces_warning(PerformanceWarning):
+ # TODO: release-note fillna performance warning
+ result = ss.fillna(method='pad', limit=5)
+ expected = ss.fillna(method='pad', limit=5)
+ expected = expected.to_dense()
+ expected[-3:] = np.nan
+ expected = expected.to_sparse()
+ assert_series_equal(result, expected)
+
+ ss = s[-2:].reindex(index).to_sparse()
+ with tm.assert_produces_warning(PerformanceWarning):
+ result = ss.fillna(method='backfill', limit=5)
+ expected = ss.fillna(method='backfill')
+ expected = expected.to_dense()
+ expected[:3] = np.nan
+ expected = expected.to_sparse()
+ assert_series_equal(result, expected)
+
+ def test_sparse_series_pad_backfill_limit(self):
+ index = np.arange(10)
+ s = Series(np.random.randn(10), index=index)
+ s = s.to_sparse()
+
+ result = s[:2].reindex(index, method='pad', limit=5)
+ with tm.assert_produces_warning(PerformanceWarning):
+ expected = s[:2].reindex(index).fillna(method='pad')
+ expected = expected.to_dense()
+ expected[-3:] = np.nan
+ expected = expected.to_sparse()
+ assert_series_equal(result, expected)
+
+ result = s[-2:].reindex(index, method='backfill', limit=5)
+ with tm.assert_produces_warning(PerformanceWarning):
+ expected = s[-2:].reindex(index).fillna(method='backfill')
+ expected = expected.to_dense()
+ expected[:3] = np.nan
+ expected = expected.to_sparse()
+ assert_series_equal(result, expected)
+
+ def test_series_pad_backfill_limit(self):
+ index = np.arange(10)
+ s = Series(np.random.randn(10), index=index)
+
+ result = s[:2].reindex(index, method='pad', limit=5)
+
+ expected = s[:2].reindex(index).fillna(method='pad')
+ expected[-3:] = np.nan
+ assert_series_equal(result, expected)
+
+ result = s[-2:].reindex(index, method='backfill', limit=5)
+
+ expected = s[-2:].reindex(index).fillna(method='backfill')
+ expected[:3] = np.nan
+ assert_series_equal(result, expected)
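+
+ # Note: reindex(index, method='pad', limit=n) is the fill-during-reindex
+ # form of reindex(index).fillna(method='pad', limit=n); the tests above
+ # pin down that both honour the limit.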
+
+
+class TestSeriesInterpolateData(object):
+
+ def test_interpolate(self, datetime_series, string_series):
+ ts = Series(np.arange(len(datetime_series), dtype=float),
+ datetime_series.index)
+
+ ts_copy = ts.copy()
+ ts_copy[5:10] = np.NaN
+
+ linear_interp = ts_copy.interpolate(method='linear')
+ tm.assert_series_equal(linear_interp, ts)
+
+ ord_ts = Series([d.toordinal() for d in datetime_series.index],
+ index=datetime_series.index).astype(float)
+
+ ord_ts_copy = ord_ts.copy()
+ ord_ts_copy[5:10] = np.NaN
+
+ time_interp = ord_ts_copy.interpolate(method='time')
+ tm.assert_series_equal(time_interp, ord_ts)
+
+ # try time interpolation on a non-TimeSeries
+ # Only raises ValueError if there are NaNs.
+ non_ts = string_series.copy()
+ non_ts[0] = np.NaN
+ msg = ("time-weighted interpolation only works on Series or DataFrames"
+ " with a DatetimeIndex")
+ with pytest.raises(ValueError, match=msg):
+ non_ts.interpolate(method='time')
+
+ @td.skip_if_no_scipy
+ def test_interpolate_pchip(self):
+ _skip_if_no_pchip()
+
+ ser = Series(np.sort(np.random.uniform(size=100)))
+
+ # interpolate at new_index
+ new_index = ser.index.union(Index([49.25, 49.5, 49.75, 50.25, 50.5,
+ 50.75]))
+ interp_s = ser.reindex(new_index).interpolate(method='pchip')
+ # does not blow up, GH5977
+ interp_s[49:51]
+
+ @td.skip_if_no_scipy
+ def test_interpolate_akima(self):
+ _skip_if_no_akima()
+
+ ser = Series([10, 11, 12, 13])
+
+ expected = Series([11.00, 11.25, 11.50, 11.75,
+ 12.00, 12.25, 12.50, 12.75, 13.00],
+ index=Index([1.0, 1.25, 1.5, 1.75,
+ 2.0, 2.25, 2.5, 2.75, 3.0]))
+ # interpolate at new_index
+ new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]))
+ interp_s = ser.reindex(new_index).interpolate(method='akima')
+ assert_series_equal(interp_s[1:3], expected)
+
+ @td.skip_if_no_scipy
+ def test_interpolate_piecewise_polynomial(self):
+ ser = Series([10, 11, 12, 13])
+
+ expected = Series([11.00, 11.25, 11.50, 11.75,
+ 12.00, 12.25, 12.50, 12.75, 13.00],
+ index=Index([1.0, 1.25, 1.5, 1.75,
+ 2.0, 2.25, 2.5, 2.75, 3.0]))
+ # interpolate at new_index
+ new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]))
+ interp_s = ser.reindex(new_index).interpolate(
+ method='piecewise_polynomial')
+ assert_series_equal(interp_s[1:3], expected)
+
+ @td.skip_if_no_scipy
+ def test_interpolate_from_derivatives(self):
+ ser = Series([10, 11, 12, 13])
+
+ expected = Series([11.00, 11.25, 11.50, 11.75,
+ 12.00, 12.25, 12.50, 12.75, 13.00],
+ index=Index([1.0, 1.25, 1.5, 1.75,
+ 2.0, 2.25, 2.5, 2.75, 3.0]))
+ # interpolate at new_index
+ new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]))
+ interp_s = ser.reindex(new_index).interpolate(
+ method='from_derivatives')
+ assert_series_equal(interp_s[1:3], expected)
+
+ @pytest.mark.parametrize("kwargs", [
+ {},
+ pytest.param({'method': 'polynomial', 'order': 1},
+ marks=td.skip_if_no_scipy)
+ ])
+ def test_interpolate_corners(self, kwargs):
+ s = Series([np.nan, np.nan])
+ assert_series_equal(s.interpolate(**kwargs), s)
+
+ s = Series([]).interpolate()
+ assert_series_equal(s.interpolate(**kwargs), s)
+
+ def test_interpolate_index_values(self):
+ s = Series(np.nan, index=np.sort(np.random.rand(30)))
+ s[::3] = np.random.randn(10)
+
+ vals = s.index.values.astype(float)
+
+ result = s.interpolate(method='index')
+
+ expected = s.copy()
+ bad = isna(expected.values)
+ good = ~bad
+ expected = Series(np.interp(vals[bad], vals[good],
+ s.values[good]),
+ index=s.index[bad])
+
+ assert_series_equal(result[bad], expected)
+
+ # 'values' is synonymous with 'index' for the method kwarg
+ other_result = s.interpolate(method='values')
+
+ assert_series_equal(other_result, result)
+ assert_series_equal(other_result[bad], expected)
+
+ def test_interpolate_non_ts(self):
+ s = Series([1, 3, np.nan, np.nan, np.nan, 11])
+ msg = ("time-weighted interpolation only works on Series or DataFrames"
+ " with a DatetimeIndex")
+ with pytest.raises(ValueError, match=msg):
+ s.interpolate(method='time')
+
+ @pytest.mark.parametrize("kwargs", [
+ {},
+ pytest.param({'method': 'polynomial', 'order': 1},
+ marks=td.skip_if_no_scipy)
+ ])
+ def test_nan_interpolate(self, kwargs):
+ s = Series([0, 1, np.nan, 3])
+ result = s.interpolate(**kwargs)
+ expected = Series([0., 1., 2., 3.])
+ assert_series_equal(result, expected)
+
+ def test_nan_irregular_index(self):
+ s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9])
+ result = s.interpolate()
+ expected = Series([1., 2., 3., 4.], index=[1, 3, 5, 9])
+ assert_series_equal(result, expected)
+
+ def test_nan_str_index(self):
+ s = Series([0, 1, 2, np.nan], index=list('abcd'))
+ result = s.interpolate()
+ expected = Series([0., 1., 2., 2.], index=list('abcd'))
+ assert_series_equal(result, expected)
+
+ @td.skip_if_no_scipy
+ def test_interp_quad(self):
+ sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4])
+ result = sq.interpolate(method='quadratic')
+ expected = Series([1., 4., 9., 16.], index=[1, 2, 3, 4])
+ assert_series_equal(result, expected)
+
+ @td.skip_if_no_scipy
+ def test_interp_scipy_basic(self):
+ s = Series([1, 3, np.nan, 12, np.nan, 25])
+ # slinear
+ expected = Series([1., 3., 7.5, 12., 18.5, 25.])
+ result = s.interpolate(method='slinear')
+ assert_series_equal(result, expected)
+
+ result = s.interpolate(method='slinear', downcast='infer')
+ assert_series_equal(result, expected)
+ # nearest
+ expected = Series([1, 3, 3, 12, 12, 25])
+ result = s.interpolate(method='nearest')
+ assert_series_equal(result, expected.astype('float'))
+
+ result = s.interpolate(method='nearest', downcast='infer')
+ assert_series_equal(result, expected)
+ # zero
+ expected = Series([1, 3, 3, 12, 12, 25])
+ result = s.interpolate(method='zero')
+ assert_series_equal(result, expected.astype('float'))
+
+ result = s.interpolate(method='zero', downcast='infer')
+ assert_series_equal(result, expected)
+ # quadratic
+ # GH #15662.
+ # new cubic and quadratic interpolation algorithms from scipy 0.19.0.
+ # previously `splmake` was used. See scipy/scipy#6710
+ if _is_scipy_ge_0190:
+ expected = Series([1, 3., 6.823529, 12., 18.058824, 25.])
+ else:
+ expected = Series([1, 3., 6.769231, 12., 18.230769, 25.])
+ result = s.interpolate(method='quadratic')
+ assert_series_equal(result, expected)
+
+ result = s.interpolate(method='quadratic', downcast='infer')
+ assert_series_equal(result, expected)
+ # cubic
+ expected = Series([1., 3., 6.8, 12., 18.2, 25.])
+ result = s.interpolate(method='cubic')
+ assert_series_equal(result, expected)
+
+ def test_interp_limit(self):
+ s = Series([1, 3, np.nan, np.nan, np.nan, 11])
+
+ expected = Series([1., 3., 5., 7., np.nan, 11.])
+ result = s.interpolate(method='linear', limit=2)
+ assert_series_equal(result, expected)
+
+ # GH 9217, make sure limit is an int and greater than 0
+ methods = ['linear', 'time', 'index', 'values', 'nearest', 'zero',
+ 'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh',
+ 'polynomial', 'spline', 'piecewise_polynomial', None,
+ 'from_derivatives', 'pchip', 'akima']
+ s = pd.Series([1, 2, np.nan, np.nan, 5])
+ msg = (r"Limit must be greater than 0|"
+ "time-weighted interpolation only works on Series or"
+ r" DataFrames with a DatetimeIndex|"
+ r"invalid method '(polynomial|spline|None)' to interpolate|"
+ "Limit must be an integer")
+ for limit in [-1, 0, 1., 2.]:
+ for method in methods:
+ with pytest.raises(ValueError, match=msg):
+ s.interpolate(limit=limit, method=method)
+
+ def test_interp_limit_forward(self):
+ s = Series([1, 3, np.nan, np.nan, np.nan, 11])
+
+ # Provide 'forward' (the default) explicitly here.
+ expected = Series([1., 3., 5., 7., np.nan, 11.])
+
+ result = s.interpolate(method='linear', limit=2,
+ limit_direction='forward')
+ assert_series_equal(result, expected)
+
+ result = s.interpolate(method='linear', limit=2,
+ limit_direction='FORWARD')
+ assert_series_equal(result, expected)
+
+ def test_interp_unlimited(self):
+ # These tests are for issue #16282: the default limit=None is unlimited.
+ s = Series([np.nan, 1., 3., np.nan, np.nan, np.nan, 11., np.nan])
+ expected = Series([1., 1., 3., 5., 7., 9., 11., 11.])
+ result = s.interpolate(method='linear',
+ limit_direction='both')
+ assert_series_equal(result, expected)
+
+ expected = Series([np.nan, 1., 3., 5., 7., 9., 11., 11.])
+ result = s.interpolate(method='linear',
+ limit_direction='forward')
+ assert_series_equal(result, expected)
+
+ expected = Series([1., 1., 3., 5., 7., 9., 11., np.nan])
+ result = s.interpolate(method='linear',
+ limit_direction='backward')
+ assert_series_equal(result, expected)
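+
+ # Note: with the default limit=None filling is unbounded, so only
+ # limit_direction decides which leading/trailing NaNs remain.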
+
+ def test_interp_limit_bad_direction(self):
+ s = Series([1, 3, np.nan, np.nan, np.nan, 11])
+
+ msg = (r"Invalid limit_direction: expecting one of \['forward',"
+ r" 'backward', 'both'\], got 'abc'")
+ with pytest.raises(ValueError, match=msg):
+ s.interpolate(method='linear', limit=2, limit_direction='abc')
+
+ # raises an error even if no limit is specified.
+ with pytest.raises(ValueError, match=msg):
+ s.interpolate(method='linear', limit_direction='abc')
+
+ # limit_area introduced GH #16284
+ def test_interp_limit_area(self):
+ # These tests are for issue #16284 -- limit filling to the
+ # inside/outside area.
+ s = Series([nan, nan, 3, nan, nan, nan, 7, nan, nan])
+
+ expected = Series([nan, nan, 3., 4., 5., 6., 7., nan, nan])
+ result = s.interpolate(method='linear', limit_area='inside')
+ assert_series_equal(result, expected)
+
+ expected = Series([nan, nan, 3., 4., nan, nan, 7., nan, nan])
+ result = s.interpolate(method='linear', limit_area='inside',
+ limit=1)
+ assert_series_equal(result, expected)
+
+ expected = Series([nan, nan, 3., 4., nan, 6., 7., nan, nan])
+ result = s.interpolate(method='linear', limit_area='inside',
+ limit_direction='both', limit=1)
+ assert_series_equal(result, expected)
+
+ expected = Series([nan, nan, 3., nan, nan, nan, 7., 7., 7.])
+ result = s.interpolate(method='linear', limit_area='outside')
+ assert_series_equal(result, expected)
+
+ expected = Series([nan, nan, 3., nan, nan, nan, 7., 7., nan])
+ result = s.interpolate(method='linear', limit_area='outside',
+ limit=1)
+ assert_series_equal(result, expected)
+
+ expected = Series([nan, 3., 3., nan, nan, nan, 7., 7., nan])
+ result = s.interpolate(method='linear', limit_area='outside',
+ limit_direction='both', limit=1)
+ assert_series_equal(result, expected)
+
+ expected = Series([3., 3., 3., nan, nan, nan, 7., nan, nan])
+ result = s.interpolate(method='linear', limit_area='outside',
+ limit_direction='backward')
+ assert_series_equal(result, expected)
+
+ # an invalid limit_area raises an error.
+ msg = (r"Invalid limit_area: expecting one of \['inside', 'outside'\],"
+ " got abc")
+ with pytest.raises(ValueError, match=msg):
+ s.interpolate(method='linear', limit_area='abc')
+
+ def test_interp_limit_direction(self):
+ # These tests are for issue #9218 -- fill NaNs in both directions.
+ s = Series([1, 3, np.nan, np.nan, np.nan, 11])
+
+ expected = Series([1., 3., np.nan, 7., 9., 11.])
+ result = s.interpolate(method='linear', limit=2,
+ limit_direction='backward')
+ assert_series_equal(result, expected)
+
+ expected = Series([1., 3., 5., np.nan, 9., 11.])
+ result = s.interpolate(method='linear', limit=1,
+ limit_direction='both')
+ assert_series_equal(result, expected)
+
+ # Check that this works on a longer series of nans.
+ s = Series([1, 3, np.nan, np.nan, np.nan, 7, 9, np.nan, np.nan, 12,
+ np.nan])
+
+ expected = Series([1., 3., 4., 5., 6., 7., 9., 10., 11., 12., 12.])
+ result = s.interpolate(method='linear', limit=2,
+ limit_direction='both')
+ assert_series_equal(result, expected)
+
+ expected = Series([1., 3., 4., np.nan, 6., 7., 9., 10., 11., 12., 12.])
+ result = s.interpolate(method='linear', limit=1,
+ limit_direction='both')
+ assert_series_equal(result, expected)
+
+ def test_interp_limit_to_ends(self):
+ # These tests are for issue #10420 -- fill flows back to the beginning.
+ s = Series([np.nan, np.nan, 5, 7, 9, np.nan])
+
+ expected = Series([5., 5., 5., 7., 9., np.nan])
+ result = s.interpolate(method='linear', limit=2,
+ limit_direction='backward')
+ assert_series_equal(result, expected)
+
+ expected = Series([5., 5., 5., 7., 9., 9.])
+ result = s.interpolate(method='linear', limit=2,
+ limit_direction='both')
+ assert_series_equal(result, expected)
+
+ def test_interp_limit_before_ends(self):
+ # These tests are for issue #11115 -- limit the ends properly.
+ s = Series([np.nan, np.nan, 5, 7, np.nan, np.nan])
+
+ expected = Series([np.nan, np.nan, 5., 7., 7., np.nan])
+ result = s.interpolate(method='linear', limit=1,
+ limit_direction='forward')
+ assert_series_equal(result, expected)
+
+ expected = Series([np.nan, 5., 5., 7., np.nan, np.nan])
+ result = s.interpolate(method='linear', limit=1,
+ limit_direction='backward')
+ assert_series_equal(result, expected)
+
+ expected = Series([np.nan, 5., 5., 7., 7., np.nan])
+ result = s.interpolate(method='linear', limit=1,
+ limit_direction='both')
+ assert_series_equal(result, expected)
+
+ @td.skip_if_no_scipy
+ def test_interp_all_good(self):
+ s = Series([1, 2, 3])
+ result = s.interpolate(method='polynomial', order=1)
+ assert_series_equal(result, s)
+
+ # non-scipy
+ result = s.interpolate()
+ assert_series_equal(result, s)
+
+ @pytest.mark.parametrize("check_scipy", [
+ False,
+ pytest.param(True, marks=td.skip_if_no_scipy)
+ ])
+ def test_interp_multiIndex(self, check_scipy):
+ idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
+ s = Series([1, 2, np.nan], index=idx)
+
+ expected = s.copy()
+ expected.loc[2] = 2
+ result = s.interpolate()
+ assert_series_equal(result, expected)
+
+ msg = "Only `method=linear` interpolation is supported on MultiIndexes"
+ if check_scipy:
+ with pytest.raises(ValueError, match=msg):
+ s.interpolate(method='polynomial', order=1)
+
+ @td.skip_if_no_scipy
+ def test_interp_nonmono_raise(self):
+ s = Series([1, np.nan, 3], index=[0, 2, 1])
+ msg = "krogh interpolation requires that the index be monotonic"
+ with pytest.raises(ValueError, match=msg):
+ s.interpolate(method='krogh')
+
+ @td.skip_if_no_scipy
+ def test_interp_datetime64(self):
+ ser = Series([1, np.nan, 3], index=date_range('1/1/2000', periods=3))
+ result = ser.interpolate(method='nearest')
+ expected = Series([1., 1., 3.],
+ index=date_range('1/1/2000', periods=3))
+ assert_series_equal(result, expected)
+
+ def test_interp_limit_no_nans(self):
+ # GH 7173
+ s = pd.Series([1., 2., 3.])
+ result = s.interpolate(limit=1)
+ expected = s
+ assert_series_equal(result, expected)
+
+ @td.skip_if_no_scipy
+ @pytest.mark.parametrize("method", ['polynomial', 'spline'])
+ def test_no_order(self, method):
+ s = Series([0, 1, np.nan, 3])
+ msg = "invalid method '{}' to interpolate".format(method)
+ with pytest.raises(ValueError, match=msg):
+ s.interpolate(method=method)
+
+ @td.skip_if_no_scipy
+ def test_spline(self):
+ s = Series([1, 2, np.nan, 4, 5, np.nan, 7])
+ result = s.interpolate(method='spline', order=1)
+ expected = Series([1., 2., 3., 4., 5., 6., 7.])
+ assert_series_equal(result, expected)
+
+ @td.skip_if_no('scipy', min_version='0.15')
+ def test_spline_extrapolate(self):
+ s = Series([1, 2, 3, 4, np.nan, 6, np.nan])
+ result3 = s.interpolate(method='spline', order=1, ext=3)
+ expected3 = Series([1., 2., 3., 4., 5., 6., 6.])
+ assert_series_equal(result3, expected3)
+
+ result1 = s.interpolate(method='spline', order=1, ext=0)
+ expected1 = Series([1., 2., 3., 4., 5., 6., 7.])
+ assert_series_equal(result1, expected1)
+
+ @td.skip_if_no_scipy
+ def test_spline_smooth(self):
+ s = Series([1, 2, np.nan, 4, 5.1, np.nan, 7])
+ assert (s.interpolate(method='spline', order=3, s=0)[5] !=
+ s.interpolate(method='spline', order=3)[5])
+
+ @td.skip_if_no_scipy
+ def test_spline_interpolation(self):
+ s = Series(np.arange(10) ** 2)
+ s[np.random.randint(0, 9, 3)] = np.nan
+ result1 = s.interpolate(method='spline', order=1)
+ expected1 = s.interpolate(method='spline', order=1)
+ assert_series_equal(result1, expected1)
+
+ @td.skip_if_no_scipy
+ def test_spline_error(self):
+ # see gh-10633
+ s = pd.Series(np.arange(10) ** 2)
+ s[np.random.randint(0, 9, 3)] = np.nan
+ msg = "invalid method 'spline' to interpolate"
+ with pytest.raises(ValueError, match=msg):
+ s.interpolate(method='spline')
+
+ msg = "order needs to be specified and greater than 0"
+ with pytest.raises(ValueError, match=msg):
+ s.interpolate(method='spline', order=0)
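+
+ # Note: unlike the default methods, 'spline' (and 'polynomial')
+ # require an explicit order; for 'spline' it must also be greater
+ # than 0.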
+
+ def test_interp_timedelta64(self):
+ # GH 6424
+ ser = Series([1, np.nan, 3],
+ index=pd.to_timedelta([1, 2, 3]))
+ result = ser.interpolate(method='time')
+ expected = Series([1., 2., 3.],
+ index=pd.to_timedelta([1, 2, 3]))
+ assert_series_equal(result, expected)
+
+ # test for non-uniform spacing
+ ser = Series([1, np.nan, 3],
+ index=pd.to_timedelta([1, 2, 4]))
+ result = ser.interpolate(method='time')
+ expected = Series([1., 1.666667, 3.],
+ index=pd.to_timedelta([1, 2, 4]))
+ assert_series_equal(result, expected)
+
+ def test_series_interpolate_method_values(self):
+ # #1646
+ ts = _simple_ts('1/1/2000', '1/20/2000')
+ ts[::2] = np.nan
+
+ result = ts.interpolate(method='values')
+ exp = ts.interpolate()
+ assert_series_equal(result, exp)
+
+ def test_series_interpolate_intraday(self):
+ # #1698
+ index = pd.date_range('1/1/2012', periods=4, freq='12D')
+ ts = pd.Series([0, 12, 24, 36], index)
+ new_index = index.append(index + pd.DateOffset(days=1)).sort_values()
+
+ exp = ts.reindex(new_index).interpolate(method='time')
+
+ index = pd.date_range('1/1/2012', periods=4, freq='12H')
+ ts = pd.Series([0, 12, 24, 36], index)
+ new_index = index.append(index + pd.DateOffset(hours=1)).sort_values()
+ result = ts.reindex(new_index).interpolate(method='time')
+
+ tm.assert_numpy_array_equal(result.values, exp.values)
+
+ def test_nonzero_warning(self):
+ # GH 24048
+ ser = pd.Series([1, 0, 3, 4])
+ with tm.assert_produces_warning(FutureWarning):
+ ser.nonzero()
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_operators.py b/contrib/python/pandas/py2/pandas/tests/series/test_operators.py
new file mode 100644
index 00000000000..b2aac441db1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_operators.py
@@ -0,0 +1,756 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime, timedelta
+import operator
+
+import numpy as np
+import pytest
+
+import pandas.compat as compat
+from pandas.compat import range
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Index, Series, bdate_range, date_range, isna)
+from pandas.core import ops
+import pandas.core.nanops as nanops
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal)
+
+from .common import TestData
+
+
+class TestSeriesLogicalOps(object):
+ @pytest.mark.parametrize('bool_op', [operator.and_,
+ operator.or_, operator.xor])
+ def test_bool_operators_with_nas(self, bool_op):
+ # boolean &, |, ^ should work with object arrays and propagate NAs
+ ser = Series(bdate_range('1/1/2000', periods=10), dtype=object)
+ ser[::2] = np.nan
+
+ mask = ser.isna()
+ filled = ser.fillna(ser[0])
+
+ result = bool_op(ser < ser[9], ser > ser[3])
+
+ expected = bool_op(filled < filled[9], filled > filled[3])
+ expected[mask] = False
+ assert_series_equal(result, expected)
+
+ def test_operators_bitwise(self):
+ # GH#9016: support bitwise op for integer types
+ index = list('bca')
+
+ s_tft = Series([True, False, True], index=index)
+ s_fff = Series([False, False, False], index=index)
+ s_tff = Series([True, False, False], index=index)
+ s_empty = Series([])
+
+ # TODO: unused
+ # s_0101 = Series([0, 1, 0, 1])
+
+ s_0123 = Series(range(4), dtype='int64')
+ s_3333 = Series([3] * 4)
+ s_4444 = Series([4] * 4)
+
+ res = s_tft & s_empty
+ expected = s_fff
+ assert_series_equal(res, expected)
+
+ res = s_tft | s_empty
+ expected = s_tft
+ assert_series_equal(res, expected)
+
+ res = s_0123 & s_3333
+ expected = Series(range(4), dtype='int64')
+ assert_series_equal(res, expected)
+
+ res = s_0123 | s_4444
+ expected = Series(range(4, 8), dtype='int64')
+ assert_series_equal(res, expected)
+
+ s_a0b1c0 = Series([1], list('b'))
+
+ res = s_tft & s_a0b1c0
+ expected = s_tff.reindex(list('abc'))
+ assert_series_equal(res, expected)
+
+ res = s_tft | s_a0b1c0
+ expected = s_tft.reindex(list('abc'))
+ assert_series_equal(res, expected)
+
+ n0 = 0
+ res = s_tft & n0
+ expected = s_fff
+ assert_series_equal(res, expected)
+
+ res = s_0123 & n0
+ expected = Series([0] * 4)
+ assert_series_equal(res, expected)
+
+ n1 = 1
+ res = s_tft & n1
+ expected = s_tft
+ assert_series_equal(res, expected)
+
+ res = s_0123 & n1
+ expected = Series([0, 1, 0, 1])
+ assert_series_equal(res, expected)
+
+ s_1111 = Series([1] * 4, dtype='int8')
+ res = s_0123 & s_1111
+ expected = Series([0, 1, 0, 1], dtype='int64')
+ assert_series_equal(res, expected)
+
+ res = s_0123.astype(np.int16) | s_1111.astype(np.int32)
+ expected = Series([1, 1, 3, 3], dtype='int32')
+ assert_series_equal(res, expected)
+
+ with pytest.raises(TypeError):
+ s_1111 & 'a'
+ with pytest.raises(TypeError):
+ s_1111 & ['a', 'b', 'c', 'd']
+ with pytest.raises(TypeError):
+ s_0123 & np.NaN
+ with pytest.raises(TypeError):
+ s_0123 & 3.14
+ with pytest.raises(TypeError):
+ s_0123 & [0.1, 4, 3.14, 2]
+
+ # s_0123 will be all false now because of reindexing like s_tft
+ exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c'])
+ assert_series_equal(s_tft & s_0123, exp)
+
+ # s_tft will be all false now because of reindexing like s_0123
+ exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c'])
+ assert_series_equal(s_0123 & s_tft, exp)
+
+ assert_series_equal(s_0123 & False, Series([False] * 4))
+ assert_series_equal(s_0123 ^ False, Series([False, True, True, True]))
+ assert_series_equal(s_0123 & [False], Series([False] * 4))
+ assert_series_equal(s_0123 & (False), Series([False] * 4))
+ assert_series_equal(s_0123 & Series([False, np.NaN, False, False]),
+ Series([False] * 4))
+
+ s_ftft = Series([False, True, False, True])
+ assert_series_equal(s_0123 & Series([0.1, 4, -3.14, 2]), s_ftft)
+
+ s_abNd = Series(['a', 'b', np.NaN, 'd'])
+ res = s_0123 & s_abNd
+ expected = s_ftft
+ assert_series_equal(res, expected)
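+
+ # Note: for integer dtypes &, |, ^ are element-wise bitwise ops with
+ # the usual numpy casting, e.g. Series([0, 1, 2, 3]) & 1 gives
+ # Series([0, 1, 0, 1]), while mixing in a float or string scalar
+ # (or a plain list of them) raises TypeError, as asserted above.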
+
+ def test_scalar_na_logical_ops_corners(self):
+ s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10])
+
+ with pytest.raises(TypeError):
+ s & datetime(2005, 1, 1)
+
+ s = Series([2, 3, 4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)])
+ s[::2] = np.nan
+
+ expected = Series(True, index=s.index)
+ expected[::2] = False
+ result = s & list(s)
+ assert_series_equal(result, expected)
+
+ d = DataFrame({'A': s})
+ # TODO: fix this exception (see GH5035); previously this was a
+ # TypeError because Series returned NotImplemented.
+
+ # this is an alignment issue; these are equivalent
+ # https://github.com/pandas-dev/pandas/issues/5284
+
+ with pytest.raises(TypeError):
+ d.__and__(s, axis='columns')
+
+ with pytest.raises(TypeError):
+ s & d
+
+ # this is wrong as its not a boolean result
+ # result = d.__and__(s,axis='index')
+
+ @pytest.mark.parametrize('op', [
+ operator.and_,
+ operator.or_,
+ operator.xor,
+ ])
+ def test_logical_ops_with_index(self, op):
+ # GH#22092, GH#19792
+ ser = Series([True, True, False, False])
+ idx1 = Index([True, False, True, False])
+ idx2 = Index([1, 0, 1, 0])
+
+ expected = Series([op(ser[n], idx1[n]) for n in range(len(ser))])
+
+ result = op(ser, idx1)
+ assert_series_equal(result, expected)
+
+ expected = Series([op(ser[n], idx2[n]) for n in range(len(ser))],
+ dtype=bool)
+
+ result = op(ser, idx2)
+ assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("op, expected", [
+ (ops.rand_, pd.Index([False, True])),
+ (ops.ror_, pd.Index([False, True])),
+ (ops.rxor, pd.Index([])),
+ ])
+ def test_reverse_ops_with_index(self, op, expected):
+ # https://github.com/pandas-dev/pandas/pull/23628
+ # multi-set Index ops are buggy, so let's avoid duplicates...
+ ser = Series([True, False])
+ idx = Index([False, True])
+ result = op(ser, idx)
+ tm.assert_index_equal(result, expected)
+
+ def test_logical_ops_label_based(self):
+ # GH#4947
+ # logical ops should be label based
+
+ a = Series([True, False, True], list('bca'))
+ b = Series([False, True, False], list('abc'))
+
+ expected = Series([False, True, False], list('abc'))
+ result = a & b
+ assert_series_equal(result, expected)
+
+ expected = Series([True, True, False], list('abc'))
+ result = a | b
+ assert_series_equal(result, expected)
+
+ expected = Series([True, False, False], list('abc'))
+ result = a ^ b
+ assert_series_equal(result, expected)
+
+ # rhs is bigger
+ a = Series([True, False, True], list('bca'))
+ b = Series([False, True, False, True], list('abcd'))
+
+ expected = Series([False, True, False, False], list('abcd'))
+ result = a & b
+ assert_series_equal(result, expected)
+
+ expected = Series([True, True, False, False], list('abcd'))
+ result = a | b
+ assert_series_equal(result, expected)
+
+ # filling
+
+ # vs empty
+ result = a & Series([])
+ expected = Series([False, False, False], list('bca'))
+ assert_series_equal(result, expected)
+
+ result = a | Series([])
+ expected = Series([True, False, True], list('bca'))
+ assert_series_equal(result, expected)
+
+ # vs non-matching
+ result = a & Series([1], ['z'])
+ expected = Series([False, False, False, False], list('abcz'))
+ assert_series_equal(result, expected)
+
+ result = a | Series([1], ['z'])
+ expected = Series([True, True, False, False], list('abcz'))
+ assert_series_equal(result, expected)
+
+ # identity
+ # we would like s[s|e] == s to hold for any e, whether empty or not
+ for e in [Series([]), Series([1], ['z']),
+ Series(np.nan, b.index), Series(np.nan, a.index)]:
+ result = a[a | e]
+ assert_series_equal(result, a[a])
+
+ for e in [Series(['z'])]:
+ result = a[a | e]
+ assert_series_equal(result, a[a])
+
+ # vs scalars
+ index = list('bca')
+ t = Series([True, False, True])
+
+ for v in [True, 1, 2]:
+ result = Series([True, False, True], index=index) | v
+ expected = Series([True, True, True], index=index)
+ assert_series_equal(result, expected)
+
+ for v in [np.nan, 'foo']:
+ with pytest.raises(TypeError):
+ t | v
+
+ for v in [False, 0]:
+ result = Series([True, False, True], index=index) | v
+ expected = Series([True, False, True], index=index)
+ assert_series_equal(result, expected)
+
+ for v in [True, 1]:
+ result = Series([True, False, True], index=index) & v
+ expected = Series([True, False, True], index=index)
+ assert_series_equal(result, expected)
+
+ for v in [False, 0]:
+ result = Series([True, False, True], index=index) & v
+ expected = Series([False, False, False], index=index)
+ assert_series_equal(result, expected)
+ for v in [np.nan]:
+ with pytest.raises(TypeError):
+ t & v
+
+ def test_logical_ops_df_compat(self):
+ # GH#1134
+ s1 = pd.Series([True, False, True], index=list('ABC'), name='x')
+ s2 = pd.Series([True, True, False], index=list('ABD'), name='x')
+
+ exp = pd.Series([True, False, False, False],
+ index=list('ABCD'), name='x')
+ assert_series_equal(s1 & s2, exp)
+ assert_series_equal(s2 & s1, exp)
+
+ # True | np.nan => True
+ exp = pd.Series([True, True, True, False],
+ index=list('ABCD'), name='x')
+ assert_series_equal(s1 | s2, exp)
+ # np.nan | True => np.nan, filled with False
+ exp = pd.Series([True, True, False, False],
+ index=list('ABCD'), name='x')
+ assert_series_equal(s2 | s1, exp)
+
+ # DataFrame doesn't fill nan with False
+ exp = pd.DataFrame({'x': [True, False, np.nan, np.nan]},
+ index=list('ABCD'))
+ assert_frame_equal(s1.to_frame() & s2.to_frame(), exp)
+ assert_frame_equal(s2.to_frame() & s1.to_frame(), exp)
+
+ exp = pd.DataFrame({'x': [True, True, np.nan, np.nan]},
+ index=list('ABCD'))
+ assert_frame_equal(s1.to_frame() | s2.to_frame(), exp)
+ assert_frame_equal(s2.to_frame() | s1.to_frame(), exp)
+
+ # different length
+ s3 = pd.Series([True, False, True], index=list('ABC'), name='x')
+ s4 = pd.Series([True, True, True, True], index=list('ABCD'), name='x')
+
+ exp = pd.Series([True, False, True, False],
+ index=list('ABCD'), name='x')
+ assert_series_equal(s3 & s4, exp)
+ assert_series_equal(s4 & s3, exp)
+
+ # np.nan | True => np.nan, filled with False
+ exp = pd.Series([True, True, True, False],
+ index=list('ABCD'), name='x')
+ assert_series_equal(s3 | s4, exp)
+ # True | np.nan => True
+ exp = pd.Series([True, True, True, True],
+ index=list('ABCD'), name='x')
+ assert_series_equal(s4 | s3, exp)
+
+ exp = pd.DataFrame({'x': [True, False, True, np.nan]},
+ index=list('ABCD'))
+ assert_frame_equal(s3.to_frame() & s4.to_frame(), exp)
+ assert_frame_equal(s4.to_frame() & s3.to_frame(), exp)
+
+ exp = pd.DataFrame({'x': [True, True, True, np.nan]},
+ index=list('ABCD'))
+ assert_frame_equal(s3.to_frame() | s4.to_frame(), exp)
+ assert_frame_equal(s4.to_frame() | s3.to_frame(), exp)
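+
+    def test_logical_ops_df_compat_unaligned(self):
+        # A minimal sketch of the contrast exercised above (a hedged
+        # summary, not an exhaustive check): Series logical ops fill
+        # unaligned labels with False, while DataFrame ops keep NaN.
+        s1 = pd.Series([True], index=['A'], name='x')
+        s2 = pd.Series([True], index=['B'], name='x')
+        assert not (s1 & s2).any()
+        assert (s1.to_frame() & s2.to_frame())['x'].isna().all()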
+
+
+class TestSeriesComparisons(object):
+ def test_comparisons(self):
+ left = np.random.randn(10)
+ right = np.random.randn(10)
+ left[:3] = np.nan
+
+ result = nanops.nangt(left, right)
+ with np.errstate(invalid='ignore'):
+ expected = (left > right).astype('O')
+ expected[:3] = np.nan
+
+ assert_almost_equal(result, expected)
+
+ s = Series(['a', 'b', 'c'])
+ s2 = Series([False, True, False])
+
+ # it works!
+ exp = Series([False, False, False])
+ assert_series_equal(s == s2, exp)
+ assert_series_equal(s2 == s, exp)
+
+ def test_categorical_comparisons(self):
+ # GH 8938
+ # allow equality comparisons
+ a = Series(list('abc'), dtype="category")
+ b = Series(list('abc'), dtype="object")
+ c = Series(['a', 'b', 'cc'], dtype="object")
+ d = Series(list('acb'), dtype="object")
+ e = Categorical(list('abc'))
+ f = Categorical(list('acb'))
+
+ # vs scalar
+ assert not (a == 'a').all()
+ assert ((a != 'a') == ~(a == 'a')).all()
+
+ assert not ('a' == a).all()
+ assert (a == 'a')[0]
+ assert ('a' == a)[0]
+ assert not ('a' != a)[0]
+
+ # vs list-like
+ assert (a == a).all()
+ assert not (a != a).all()
+
+ assert (a == list(a)).all()
+ assert (a == b).all()
+ assert (b == a).all()
+ assert ((~(a == b)) == (a != b)).all()
+ assert ((~(b == a)) == (b != a)).all()
+
+ assert not (a == c).all()
+ assert not (c == a).all()
+ assert not (a == d).all()
+ assert not (d == a).all()
+
+ # vs a cat-like
+ assert (a == e).all()
+ assert (e == a).all()
+ assert not (a == f).all()
+ assert not (f == a).all()
+
+ assert ((~(a == e) == (a != e)).all())
+ assert ((~(e == a) == (e != a)).all())
+ assert ((~(a == f) == (a != f)).all())
+ assert ((~(f == a) == (f != a)).all())
+
+ # non-equality is not comparable
+ with pytest.raises(TypeError):
+ a < b
+ with pytest.raises(TypeError):
+ b < a
+ with pytest.raises(TypeError):
+ a > b
+ with pytest.raises(TypeError):
+ b > a
+
+ def test_comparison_tuples(self):
+ # GH11339
+ # comparisons vs tuple
+ s = Series([(1, 1), (1, 2)])
+
+ result = s == (1, 2)
+ expected = Series([False, True])
+ assert_series_equal(result, expected)
+
+ result = s != (1, 2)
+ expected = Series([True, False])
+ assert_series_equal(result, expected)
+
+ result = s == (0, 0)
+ expected = Series([False, False])
+ assert_series_equal(result, expected)
+
+ result = s != (0, 0)
+ expected = Series([True, True])
+ assert_series_equal(result, expected)
+
+ s = Series([(1, 1), (1, 1)])
+
+ result = s == (1, 1)
+ expected = Series([True, True])
+ assert_series_equal(result, expected)
+
+ result = s != (1, 1)
+ expected = Series([False, False])
+ assert_series_equal(result, expected)
+
+ s = Series([frozenset([1]), frozenset([1, 2])])
+
+ result = s == frozenset([1])
+ expected = Series([True, False])
+ assert_series_equal(result, expected)
+
+ def test_comparison_operators_with_nas(self):
+ ser = Series(bdate_range('1/1/2000', periods=10), dtype=object)
+ ser[::2] = np.nan
+
+ # test that comparisons work
+ ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne']
+ for op in ops:
+ val = ser[5]
+
+ f = getattr(operator, op)
+ result = f(ser, val)
+
+ expected = f(ser.dropna(), val).reindex(ser.index)
+
+ if op == 'ne':
+ expected = expected.fillna(True).astype(bool)
+ else:
+ expected = expected.fillna(False).astype(bool)
+
+ assert_series_equal(result, expected)
+
+            # FIXME: comparisons in the reversed order (scalar vs Series)
+            # do not yet behave symmetrically, so they stay disabled:
+            # result = f(val, ser)
+            # expected = f(val, ser.dropna()).reindex(ser.index)
+            # assert_series_equal(result, expected)
+
+ def test_unequal_categorical_comparison_raises_type_error(self):
+        # ordering comparisons (<, >) should raise for unordered cats
+ cat = Series(Categorical(list("abc")))
+ with pytest.raises(TypeError):
+ cat > "b"
+
+ cat = Series(Categorical(list("abc"), ordered=False))
+ with pytest.raises(TypeError):
+ cat > "b"
+
+ # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
+        # per that comment thread, comparisons with scalars not in the
+        # categories should raise for ordering comps (<, >), but not for
+        # equality/not-equality (==, !=)
+ cat = Series(Categorical(list("abc"), ordered=True))
+
+ with pytest.raises(TypeError):
+ cat < "d"
+ with pytest.raises(TypeError):
+ cat > "d"
+ with pytest.raises(TypeError):
+ "d" < cat
+ with pytest.raises(TypeError):
+ "d" > cat
+
+ tm.assert_series_equal(cat == "d", Series([False, False, False]))
+ tm.assert_series_equal(cat != "d", Series([True, True, True]))
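+
+        # Hedged follow-up sketch: once the scalar is a known category,
+        # ordered comparisons work as expected.
+        cat = Series(Categorical(list("abc"), categories=list("abcd"),
+                                 ordered=True))
+        tm.assert_series_equal(cat < "d", Series([True, True, True]))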
+
+ def test_ne(self):
+        ts = Series([3, 4, 5, 6, 7], index=[3, 4, 5, 6, 7], dtype=float)
+ expected = [True, True, False, True, True]
+ assert tm.equalContents(ts.index != 5, expected)
+ assert tm.equalContents(~(ts.index == 5), expected)
+
+ def test_comp_ops_df_compat(self):
+ # GH 1134
+ s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x')
+ s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x')
+
+ s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x')
+ s4 = pd.Series([2, 2, 2, 2], index=list('ABCD'), name='x')
+
+ for left, right in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]:
+
+ msg = "Can only compare identically-labeled Series objects"
+ with pytest.raises(ValueError, match=msg):
+ left == right
+
+ with pytest.raises(ValueError, match=msg):
+ left != right
+
+ with pytest.raises(ValueError, match=msg):
+ left < right
+
+ msg = "Can only compare identically-labeled DataFrame objects"
+ with pytest.raises(ValueError, match=msg):
+ left.to_frame() == right.to_frame()
+
+ with pytest.raises(ValueError, match=msg):
+ left.to_frame() != right.to_frame()
+
+ with pytest.raises(ValueError, match=msg):
+ left.to_frame() < right.to_frame()
+
+ def test_compare_series_interval_keyword(self):
+ # GH 25338
+ s = Series(['IntervalA', 'IntervalB', 'IntervalC'])
+ result = s == 'IntervalA'
+ expected = Series([True, False, False])
+ assert_series_equal(result, expected)
+
+
+class TestSeriesFlexComparisonOps(object):
+
+ def test_comparison_flex_alignment(self):
+ left = Series([1, 3, 2], index=list('abc'))
+ right = Series([2, 2, 2], index=list('bcd'))
+
+ exp = pd.Series([False, False, True, False], index=list('abcd'))
+ assert_series_equal(left.eq(right), exp)
+
+ exp = pd.Series([True, True, False, True], index=list('abcd'))
+ assert_series_equal(left.ne(right), exp)
+
+ exp = pd.Series([False, False, True, False], index=list('abcd'))
+ assert_series_equal(left.le(right), exp)
+
+ exp = pd.Series([False, False, False, False], index=list('abcd'))
+ assert_series_equal(left.lt(right), exp)
+
+ exp = pd.Series([False, True, True, False], index=list('abcd'))
+ assert_series_equal(left.ge(right), exp)
+
+ exp = pd.Series([False, True, False, False], index=list('abcd'))
+ assert_series_equal(left.gt(right), exp)
+
+ def test_comparison_flex_alignment_fill(self):
+ left = Series([1, 3, 2], index=list('abc'))
+ right = Series([2, 2, 2], index=list('bcd'))
+
+ exp = pd.Series([False, False, True, True], index=list('abcd'))
+ assert_series_equal(left.eq(right, fill_value=2), exp)
+
+ exp = pd.Series([True, True, False, False], index=list('abcd'))
+ assert_series_equal(left.ne(right, fill_value=2), exp)
+
+ exp = pd.Series([False, False, True, True], index=list('abcd'))
+ assert_series_equal(left.le(right, fill_value=0), exp)
+
+ exp = pd.Series([False, False, False, True], index=list('abcd'))
+ assert_series_equal(left.lt(right, fill_value=0), exp)
+
+ exp = pd.Series([True, True, True, False], index=list('abcd'))
+ assert_series_equal(left.ge(right, fill_value=0), exp)
+
+ exp = pd.Series([True, True, False, False], index=list('abcd'))
+ assert_series_equal(left.gt(right, fill_value=0), exp)
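+
+        # A hedged equivalence sketch, assuming fill_value simply
+        # substitutes for missing labels on either side before comparing:
+        filled_left = left.reindex(exp.index, fill_value=0)
+        filled_right = right.reindex(exp.index, fill_value=0)
+        assert_series_equal(left.gt(right, fill_value=0),
+                            filled_left > filled_right)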
+
+
+class TestSeriesOperators(TestData):
+
+ def test_operators_empty_int_corner(self):
+ s1 = Series([], [], dtype=np.int32)
+ s2 = Series({'x': 0.})
+ assert_series_equal(s1 * s2, Series([np.nan], index=['x']))
+
+ def test_ops_datetimelike_align(self):
+ # GH 7500
+ # datetimelike ops need to align
+ dt = Series(date_range('2012-1-1', periods=3, freq='D'))
+ dt.iloc[2] = np.nan
+ dt2 = dt[::-1]
+
+ expected = Series([timedelta(0), timedelta(0), pd.NaT])
+ # name is reset
+ result = dt2 - dt
+ assert_series_equal(result, expected)
+
+ expected = Series(expected, name=0)
+ result = (dt2.to_frame() - dt.to_frame())[0]
+ assert_series_equal(result, expected)
+
+ def test_operators_corner(self):
+ series = self.ts
+
+ empty = Series([], index=Index([]))
+
+ result = series + empty
+ assert np.isnan(result).all()
+
+ result = empty + Series([], index=Index([]))
+ assert len(result) == 0
+
+ # TODO: this returned NotImplemented earlier, what to do?
+ # deltas = Series([timedelta(1)] * 5, index=np.arange(5))
+ # sub_deltas = deltas[::2]
+ # deltas5 = deltas * 5
+ # deltas = deltas + sub_deltas
+
+ # float + int
+ int_ts = self.ts.astype(int)[:-5]
+ added = self.ts + int_ts
+ expected = Series(self.ts.values[:-5] + int_ts.values,
+ index=self.ts.index[:-5], name='ts')
+ tm.assert_series_equal(added[:-5], expected)
+
+ pairings = []
+ for op in ['add', 'sub', 'mul', 'pow', 'truediv', 'floordiv']:
+ fv = 0
+ lop = getattr(Series, op)
+ lequiv = getattr(operator, op)
+ rop = getattr(Series, 'r' + op)
+ # bind op at definition time...
+ requiv = lambda x, y, op=op: getattr(operator, op)(y, x)
+ pairings.append((lop, lequiv, fv))
+ pairings.append((rop, requiv, fv))
+ if compat.PY3:
+ pairings.append((Series.div, operator.truediv, 1))
+ pairings.append((Series.rdiv, lambda x, y: operator.truediv(y, x), 1))
+ else:
+ pairings.append((Series.div, operator.div, 1))
+ pairings.append((Series.rdiv, lambda x, y: operator.div(y, x), 1))
+
+ @pytest.mark.parametrize('op, equiv_op, fv', pairings)
+ def test_operators_combine(self, op, equiv_op, fv):
+ def _check_fill(meth, op, a, b, fill_value=0):
+ exp_index = a.index.union(b.index)
+ a = a.reindex(exp_index)
+ b = b.reindex(exp_index)
+
+ amask = isna(a)
+ bmask = isna(b)
+
+            exp_values = []
+            for i in range(len(exp_index)):
+                with np.errstate(all='ignore'):
+                    if amask[i] and bmask[i]:
+                        exp_values.append(np.nan)
+                    elif amask[i]:
+                        exp_values.append(op(fill_value, b[i]))
+                    elif bmask[i]:
+                        exp_values.append(op(a[i], fill_value))
+                    else:
+                        exp_values.append(op(a[i], b[i]))
+
+ result = meth(a, b, fill_value=fill_value)
+ expected = Series(exp_values, exp_index)
+ assert_series_equal(result, expected)
+
+ a = Series([np.nan, 1., 2., 3., np.nan], index=np.arange(5))
+ b = Series([np.nan, 1, np.nan, 3, np.nan, 4.], index=np.arange(6))
+
+ result = op(a, b)
+ exp = equiv_op(a, b)
+ assert_series_equal(result, exp)
+ _check_fill(op, equiv_op, a, b, fill_value=fv)
+ # should accept axis=0 or axis='rows'
+ op(a, b, axis=0)
+
+ def test_operators_na_handling(self):
+ from decimal import Decimal
+ from datetime import date
+ s = Series([Decimal('1.3'), Decimal('2.3')],
+ index=[date(2012, 1, 1), date(2012, 1, 2)])
+
+ result = s + s.shift(1)
+ result2 = s.shift(1) + s
+ assert isna(result[0])
+ assert isna(result2[0])
+
+ def test_op_duplicate_index(self):
+ # GH14227
+ s1 = Series([1, 2], index=[1, 1])
+ s2 = Series([10, 10], index=[1, 2])
+ result = s1 + s2
+ expected = pd.Series([11, 12, np.nan], index=[1, 1, 2])
+ assert_series_equal(result, expected)
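+
+        # Hedged sketch of the alignment rule: a unique label on one side
+        # is broadcast against each duplicate of that label on the other.
+        s3 = Series([1, 2], index=[1, 1])
+        s4 = Series([10], index=[1])
+        assert_series_equal(s3 + s4, pd.Series([11, 12], index=[1, 1]))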
+
+
+class TestSeriesUnaryOps(object):
+ # __neg__, __pos__, __inv__
+
+ def test_neg(self):
+ ser = tm.makeStringSeries()
+ ser.name = 'series'
+ assert_series_equal(-ser, -1 * ser)
+
+ def test_invert(self):
+ ser = tm.makeStringSeries()
+ ser.name = 'series'
+ assert_series_equal(-(ser < 0), ~(ser < 0))
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_period.py b/contrib/python/pandas/py2/pandas/tests/series/test_period.py
new file mode 100644
index 00000000000..0a86bb0b677
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_period.py
@@ -0,0 +1,166 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Period, Series, period_range
+from pandas.core.arrays import PeriodArray
+import pandas.util.testing as tm
+
+
+class TestSeriesPeriod(object):
+
+ def setup_method(self, method):
+ self.series = Series(period_range('2000-01-01', periods=10, freq='D'))
+
+ def test_auto_conversion(self):
+ series = Series(list(period_range('2000-01-01', periods=10, freq='D')))
+ assert series.dtype == 'Period[D]'
+
+ series = pd.Series([pd.Period('2011-01-01', freq='D'),
+ pd.Period('2011-02-01', freq='D')])
+ assert series.dtype == 'Period[D]'
+
+ def test_getitem(self):
+ assert self.series[1] == pd.Period('2000-01-02', freq='D')
+
+ result = self.series[[2, 4]]
+ exp = pd.Series([pd.Period('2000-01-03', freq='D'),
+ pd.Period('2000-01-05', freq='D')],
+ index=[2, 4], dtype='Period[D]')
+ tm.assert_series_equal(result, exp)
+ assert result.dtype == 'Period[D]'
+
+ def test_isna(self):
+ # GH 13737
+ s = Series([pd.Period('2011-01', freq='M'),
+ pd.Period('NaT', freq='M')])
+ tm.assert_series_equal(s.isna(), Series([False, True]))
+ tm.assert_series_equal(s.notna(), Series([True, False]))
+
+ def test_fillna(self):
+ # GH 13737
+ s = Series([pd.Period('2011-01', freq='M'),
+ pd.Period('NaT', freq='M')])
+
+ res = s.fillna(pd.Period('2012-01', freq='M'))
+ exp = Series([pd.Period('2011-01', freq='M'),
+ pd.Period('2012-01', freq='M')])
+ tm.assert_series_equal(res, exp)
+ assert res.dtype == 'Period[M]'
+
+ def test_dropna(self):
+ # GH 13737
+ s = Series([pd.Period('2011-01', freq='M'),
+ pd.Period('NaT', freq='M')])
+ tm.assert_series_equal(s.dropna(),
+ Series([pd.Period('2011-01', freq='M')]))
+
+ def test_between(self):
+ left, right = self.series[[2, 7]]
+ result = self.series.between(left, right)
+ expected = (self.series >= left) & (self.series <= right)
+ tm.assert_series_equal(result, expected)
+
+ # ---------------------------------------------------------------------
+ # NaT support
+
+ @pytest.mark.xfail(reason="PeriodDtype Series not supported yet")
+ def test_NaT_scalar(self):
+ series = Series([0, 1000, 2000, pd._libs.iNaT], dtype='period[D]')
+
+ val = series[3]
+ assert pd.isna(val)
+
+ series[2] = val
+ assert pd.isna(series[2])
+
+ @pytest.mark.xfail(reason="PeriodDtype Series not supported yet")
+ def test_NaT_cast(self):
+ result = Series([np.nan]).astype('period[D]')
+ expected = Series([pd.NaT])
+ tm.assert_series_equal(result, expected)
+
+ def test_set_none(self):
+ self.series[3] = None
+ assert self.series[3] is pd.NaT
+
+ self.series[3:5] = None
+ assert self.series[4] is pd.NaT
+
+ def test_set_nan(self):
+        # np.nan is coerced to NaT here; do we want to allow this?
+ self.series[5] = np.nan
+ assert self.series[5] is pd.NaT
+
+ self.series[5:7] = np.nan
+ assert self.series[6] is pd.NaT
+
+ def test_intercept_astype_object(self):
+ expected = self.series.astype('object')
+
+ df = DataFrame({'a': self.series,
+ 'b': np.random.randn(len(self.series))})
+
+ result = df.values.squeeze()
+ assert (result[:, 0] == expected.values).all()
+
+ df = DataFrame({'a': self.series, 'b': ['foo'] * len(self.series)})
+
+ result = df.values.squeeze()
+ assert (result[:, 0] == expected.values).all()
+
+ def test_align_series(self, join_type):
+ rng = period_range('1/1/2000', '1/1/2010', freq='A')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ ts.align(ts[::2], join=join_type)
+
+ def test_truncate(self):
+ # GH 17717
+ idx1 = pd.PeriodIndex([
+ pd.Period('2017-09-02'),
+ pd.Period('2017-09-02'),
+ pd.Period('2017-09-03')
+ ])
+ series1 = pd.Series([1, 2, 3], index=idx1)
+ result1 = series1.truncate(after='2017-09-02')
+
+ expected_idx1 = pd.PeriodIndex([
+ pd.Period('2017-09-02'),
+ pd.Period('2017-09-02')
+ ])
+ tm.assert_series_equal(result1, pd.Series([1, 2], index=expected_idx1))
+
+ idx2 = pd.PeriodIndex([
+ pd.Period('2017-09-03'),
+ pd.Period('2017-09-02'),
+ pd.Period('2017-09-03')
+ ])
+ series2 = pd.Series([1, 2, 3], index=idx2)
+ result2 = series2.sort_index().truncate(after='2017-09-02')
+
+ expected_idx2 = pd.PeriodIndex([
+ pd.Period('2017-09-02')
+ ])
+ tm.assert_series_equal(result2, pd.Series([2], index=expected_idx2))
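+
+        # Hedged sketch (assumption): truncate requires a monotonic index,
+        # which is why sort_index() is needed above.
+        with pytest.raises(ValueError):
+            series2.truncate(after='2017-09-02')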
+
+ @pytest.mark.parametrize('input_vals', [
+ [Period('2016-01', freq='M'), Period('2016-02', freq='M')],
+ [Period('2016-01-01', freq='D'), Period('2016-01-02', freq='D')],
+ [Period('2016-01-01 00:00:00', freq='H'),
+ Period('2016-01-01 01:00:00', freq='H')],
+        [Period('2016-01-01 00:00:00', freq='T'),
+         Period('2016-01-01 00:01:00', freq='T')],
+ [Period('2016-01-01 00:00:00', freq='S'),
+ Period('2016-01-01 00:00:01', freq='S')]
+ ])
+ def test_end_time_timevalues(self, input_vals):
+ # GH 17157
+ # Check that the time part of the Period is adjusted by end_time
+ # when using the dt accessor on a Series
+ input_vals = PeriodArray._from_sequence(np.asarray(input_vals))
+
+ s = Series(input_vals)
+ result = s.dt.end_time
+ expected = s.apply(lambda x: x.end_time)
+ tm.assert_series_equal(result, expected)
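+
+        # Illustration (hedged, relying on Period.end_time semantics): a
+        # monthly period ends at the last nanosecond of its month.
+        p = Period('2016-01', freq='M')
+        assert p.end_time == pd.Timestamp('2016-01-31 23:59:59.999999999')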
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_quantile.py b/contrib/python/pandas/py2/pandas/tests/series/test_quantile.py
new file mode 100644
index 00000000000..4f462e11e9b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_quantile.py
@@ -0,0 +1,195 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.common import is_integer
+
+import pandas as pd
+from pandas import Index, Series
+from pandas.core.indexes.datetimes import Timestamp
+import pandas.util.testing as tm
+
+from .common import TestData
+
+
+class TestSeriesQuantile(TestData):
+
+ def test_quantile(self):
+
+ q = self.ts.quantile(0.1)
+ assert q == np.percentile(self.ts.dropna(), 10)
+
+ q = self.ts.quantile(0.9)
+ assert q == np.percentile(self.ts.dropna(), 90)
+
+ # object dtype
+ q = Series(self.ts, dtype=object).quantile(0.9)
+ assert q == np.percentile(self.ts.dropna(), 90)
+
+ # datetime64[ns] dtype
+ dts = self.ts.index.to_series()
+ q = dts.quantile(.2)
+ assert q == Timestamp('2000-01-10 19:12:00')
+
+ # timedelta64[ns] dtype
+ tds = dts.diff()
+ q = tds.quantile(.25)
+ assert q == pd.to_timedelta('24:00:00')
+
+ # GH7661
+ result = Series([np.timedelta64('NaT')]).sum()
+ assert result == pd.Timedelta(0)
+
+ msg = 'percentiles should all be in the interval \\[0, 1\\]'
+ for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
+ with pytest.raises(ValueError, match=msg):
+ self.ts.quantile(invalid)
+
+ def test_quantile_multi(self):
+
+ qs = [.1, .9]
+ result = self.ts.quantile(qs)
+ expected = pd.Series([np.percentile(self.ts.dropna(), 10),
+ np.percentile(self.ts.dropna(), 90)],
+ index=qs, name=self.ts.name)
+ tm.assert_series_equal(result, expected)
+
+ dts = self.ts.index.to_series()
+ dts.name = 'xxx'
+ result = dts.quantile((.2, .2))
+ expected = Series([Timestamp('2000-01-10 19:12:00'),
+ Timestamp('2000-01-10 19:12:00')],
+ index=[.2, .2], name='xxx')
+ tm.assert_series_equal(result, expected)
+
+ result = self.ts.quantile([])
+        expected = pd.Series([], name=self.ts.name,
+                             index=Index([], dtype=float))
+ tm.assert_series_equal(result, expected)
+
+ def test_quantile_interpolation(self):
+ # see gh-10174
+
+ # interpolation = linear (default case)
+ q = self.ts.quantile(0.1, interpolation='linear')
+ assert q == np.percentile(self.ts.dropna(), 10)
+ q1 = self.ts.quantile(0.1)
+ assert q1 == np.percentile(self.ts.dropna(), 10)
+
+ # test with and without interpolation keyword
+ assert q == q1
+
+ def test_quantile_interpolation_dtype(self):
+ # GH #10174
+
+ # interpolation = linear (default case)
+ q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='lower')
+ assert q == np.percentile(np.array([1, 3, 4]), 50)
+ assert is_integer(q)
+
+ q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='higher')
+ assert q == np.percentile(np.array([1, 3, 4]), 50)
+ assert is_integer(q)
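+
+        # Hedged aside (not among the cases above): 'midpoint' averages
+        # the two bracketing values, yielding a float.
+        q = pd.Series([1, 2, 3, 4]).quantile(0.5, interpolation='midpoint')
+        assert q == 2.5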
+
+ def test_quantile_nan(self):
+
+ # GH 13098
+ s = pd.Series([1, 2, 3, 4, np.nan])
+ result = s.quantile(0.5)
+ expected = 2.5
+ assert result == expected
+
+ # all nan/empty
+ cases = [Series([]), Series([np.nan, np.nan])]
+
+ for s in cases:
+ res = s.quantile(0.5)
+ assert np.isnan(res)
+
+ res = s.quantile([0.5])
+ tm.assert_series_equal(res, pd.Series([np.nan], index=[0.5]))
+
+ res = s.quantile([0.2, 0.3])
+ tm.assert_series_equal(res, pd.Series([np.nan, np.nan],
+ index=[0.2, 0.3]))
+
+ @pytest.mark.parametrize('case', [
+ [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-03')],
+ [pd.Timestamp('2011-01-01', tz='US/Eastern'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.Timestamp('2011-01-03', tz='US/Eastern')],
+ [pd.Timedelta('1 days'), pd.Timedelta('2 days'),
+ pd.Timedelta('3 days')],
+ # NaT
+ [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'),
+ pd.Timestamp('2011-01-03'), pd.NaT],
+ [pd.Timestamp('2011-01-01', tz='US/Eastern'),
+ pd.Timestamp('2011-01-02', tz='US/Eastern'),
+ pd.Timestamp('2011-01-03', tz='US/Eastern'), pd.NaT],
+ [pd.Timedelta('1 days'), pd.Timedelta('2 days'),
+ pd.Timedelta('3 days'), pd.NaT]])
+ def test_quantile_box(self, case):
+ s = pd.Series(case, name='XXX')
+ res = s.quantile(0.5)
+ assert res == case[1]
+
+ res = s.quantile([0.5])
+ exp = pd.Series([case[1]], index=[0.5], name='XXX')
+ tm.assert_series_equal(res, exp)
+
+ def test_datetime_timedelta_quantiles(self):
+ # covers #9694
+ assert pd.isna(Series([], dtype='M8[ns]').quantile(.5))
+ assert pd.isna(Series([], dtype='m8[ns]').quantile(.5))
+
+ def test_quantile_nat(self):
+ res = Series([pd.NaT, pd.NaT]).quantile(0.5)
+ assert res is pd.NaT
+
+ res = Series([pd.NaT, pd.NaT]).quantile([0.5])
+ tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5]))
+
+ @pytest.mark.parametrize('values, dtype', [
+ ([0, 0, 0, 1, 2, 3], 'Sparse[int]'),
+ ([0., None, 1., 2.], 'Sparse[float]'),
+ ])
+ def test_quantile_sparse(self, values, dtype):
+ ser = pd.Series(values, dtype=dtype)
+ result = ser.quantile([0.5])
+ expected = pd.Series(np.asarray(ser)).quantile([0.5])
+ tm.assert_series_equal(result, expected)
+
+ def test_quantile_empty(self):
+
+ # floats
+ s = Series([], dtype='float64')
+
+ res = s.quantile(0.5)
+ assert np.isnan(res)
+
+ res = s.quantile([0.5])
+ exp = Series([np.nan], index=[0.5])
+ tm.assert_series_equal(res, exp)
+
+ # int
+ s = Series([], dtype='int64')
+
+ res = s.quantile(0.5)
+ assert np.isnan(res)
+
+ res = s.quantile([0.5])
+ exp = Series([np.nan], index=[0.5])
+ tm.assert_series_equal(res, exp)
+
+ # datetime
+ s = Series([], dtype='datetime64[ns]')
+
+ res = s.quantile(0.5)
+ assert res is pd.NaT
+
+ res = s.quantile([0.5])
+ exp = Series([pd.NaT], index=[0.5])
+ tm.assert_series_equal(res, exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_rank.py b/contrib/python/pandas/py2/pandas/tests/series/test_rank.py
new file mode 100644
index 00000000000..510a51e0029
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_rank.py
@@ -0,0 +1,506 @@
+# -*- coding: utf-8 -*-
+from distutils.version import LooseVersion
+from itertools import chain
+
+import numpy as np
+from numpy import nan
+import pytest
+
+from pandas._libs.algos import Infinity, NegInfinity
+from pandas._libs.tslib import iNaT
+import pandas.compat as compat
+from pandas.compat import product
+import pandas.util._test_decorators as td
+
+from pandas import NaT, Series, Timestamp, date_range
+from pandas.api.types import CategoricalDtype
+from pandas.tests.series.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import assert_series_equal
+
+
+class TestSeriesRank(TestData):
+ s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])
+
+ results = {
+ 'average': np.array([1.5, 5.5, 7.0, 3.5, nan,
+ 3.5, 1.5, 8.0, nan, 5.5]),
+ 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
+ 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
+ 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]),
+ 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]),
+ }
+
+ def test_rank(self):
+        # importorskip takes a module name, so grab rankdata off the
+        # imported module (passing a function path would always skip)
+        pytest.importorskip('scipy.special')
+        rankdata = pytest.importorskip('scipy.stats').rankdata
+
+ self.ts[::2] = np.nan
+ self.ts[:10][::3] = 4.
+
+ ranks = self.ts.rank()
+ oranks = self.ts.astype('O').rank()
+
+ assert_series_equal(ranks, oranks)
+
+ mask = np.isnan(self.ts)
+ filled = self.ts.fillna(np.inf)
+
+ # rankdata returns a ndarray
+ exp = Series(rankdata(filled), index=filled.index, name='ts')
+ exp[mask] = np.nan
+
+ tm.assert_series_equal(ranks, exp)
+
+ iseries = Series(np.arange(5).repeat(2))
+
+ iranks = iseries.rank()
+ exp = iseries.astype(float).rank()
+ assert_series_equal(iranks, exp)
+ iseries = Series(np.arange(5)) + 1.0
+ exp = iseries / 5.0
+ iranks = iseries.rank(pct=True)
+
+ assert_series_equal(iranks, exp)
+
+ iseries = Series(np.repeat(1, 100))
+ exp = Series(np.repeat(0.505, 100))
+ iranks = iseries.rank(pct=True)
+ assert_series_equal(iranks, exp)
+
+ iseries[1] = np.nan
+ exp = Series(np.repeat(50.0 / 99.0, 100))
+ exp[1] = np.nan
+ iranks = iseries.rank(pct=True)
+ assert_series_equal(iranks, exp)
+
+ iseries = Series(np.arange(5)) + 1.0
+ iseries[4] = np.nan
+ exp = iseries / 4.0
+ iranks = iseries.rank(pct=True)
+ assert_series_equal(iranks, exp)
+
+ iseries = Series(np.repeat(np.nan, 100))
+ exp = iseries.copy()
+ iranks = iseries.rank(pct=True)
+ assert_series_equal(iranks, exp)
+
+ iseries = Series(np.arange(5)) + 1
+ iseries[4] = np.nan
+ exp = iseries / 4.0
+ iranks = iseries.rank(pct=True)
+ assert_series_equal(iranks, exp)
+
+ rng = date_range('1/1/1990', periods=5)
+ iseries = Series(np.arange(5), rng) + 1
+ iseries.iloc[4] = np.nan
+ exp = iseries / 4.0
+ iranks = iseries.rank(pct=True)
+ assert_series_equal(iranks, exp)
+
+ iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
+ exp = Series([2, 1, 3, 5, 4, 6.0])
+ iranks = iseries.rank()
+ assert_series_equal(iranks, exp)
+
+ # GH 5968
+ iseries = Series(['3 day', '1 day 10m', '-2 day', NaT],
+ dtype='m8[ns]')
+ exp = Series([3, 2, 1, np.nan])
+ iranks = iseries.rank()
+ assert_series_equal(iranks, exp)
+
+ values = np.array(
+ [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40
+ ], dtype='float64')
+ random_order = np.random.permutation(len(values))
+ iseries = Series(values[random_order])
+ exp = Series(random_order + 1.0, dtype='float64')
+ iranks = iseries.rank()
+ assert_series_equal(iranks, exp)
+
+ def test_rank_categorical(self):
+ # GH issue #15420 rank incorrectly orders ordered categories
+
+ # Test ascending/descending ranking for ordered categoricals
+ exp = Series([1., 2., 3., 4., 5., 6.])
+ exp_desc = Series([6., 5., 4., 3., 2., 1.])
+ ordered = Series(
+ ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
+ ).astype(CategoricalDtype(categories=['first', 'second', 'third',
+ 'fourth', 'fifth', 'sixth'],
+ ordered=True))
+ assert_series_equal(ordered.rank(), exp)
+ assert_series_equal(ordered.rank(ascending=False), exp_desc)
+
+ # Unordered categoricals should be ranked as objects
+ unordered = Series(['first', 'second', 'third', 'fourth',
+ 'fifth', 'sixth']).astype(
+ CategoricalDtype(categories=['first', 'second', 'third',
+ 'fourth', 'fifth', 'sixth'],
+ ordered=False))
+ exp_unordered = Series([2., 4., 6., 3., 1., 5.])
+ res = unordered.rank()
+ assert_series_equal(res, exp_unordered)
+
+ unordered1 = Series(
+ [1, 2, 3, 4, 5, 6],
+ ).astype(CategoricalDtype([1, 2, 3, 4, 5, 6], False))
+ exp_unordered1 = Series([1., 2., 3., 4., 5., 6.])
+ res1 = unordered1.rank()
+ assert_series_equal(res1, exp_unordered1)
+
+ # Test na_option for rank data
+ na_ser = Series(
+ ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
+ ).astype(CategoricalDtype(['first', 'second', 'third', 'fourth',
+ 'fifth', 'sixth', 'seventh'], True))
+
+ exp_top = Series([2., 3., 4., 5., 6., 7., 1.])
+ exp_bot = Series([1., 2., 3., 4., 5., 6., 7.])
+ exp_keep = Series([1., 2., 3., 4., 5., 6., np.NaN])
+
+ assert_series_equal(na_ser.rank(na_option='top'), exp_top)
+ assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot)
+ assert_series_equal(na_ser.rank(na_option='keep'), exp_keep)
+
+ # Test na_option for rank data with ascending False
+ exp_top = Series([7., 6., 5., 4., 3., 2., 1.])
+ exp_bot = Series([6., 5., 4., 3., 2., 1., 7.])
+ exp_keep = Series([6., 5., 4., 3., 2., 1., np.NaN])
+
+ assert_series_equal(
+ na_ser.rank(na_option='top', ascending=False),
+ exp_top
+ )
+ assert_series_equal(
+ na_ser.rank(na_option='bottom', ascending=False),
+ exp_bot
+ )
+ assert_series_equal(
+ na_ser.rank(na_option='keep', ascending=False),
+ exp_keep
+ )
+
+ # Test invalid values for na_option
+ msg = "na_option must be one of 'keep', 'top', or 'bottom'"
+
+ with pytest.raises(ValueError, match=msg):
+ na_ser.rank(na_option='bad', ascending=False)
+
+ # invalid type
+ with pytest.raises(ValueError, match=msg):
+ na_ser.rank(na_option=True, ascending=False)
+
+ # Test with pct=True
+ na_ser = Series(['first', 'second', 'third', 'fourth', np.NaN]).astype(
+ CategoricalDtype(['first', 'second', 'third', 'fourth'], True))
+ exp_top = Series([0.4, 0.6, 0.8, 1., 0.2])
+ exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.])
+ exp_keep = Series([0.25, 0.5, 0.75, 1., np.NaN])
+
+ assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top)
+ assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot)
+ assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep)
+
+ def test_rank_signature(self):
+ s = Series([0, 1])
+ s.rank(method='average')
+ msg = r"No axis named average for object type <(class|type) 'type'>"
+ with pytest.raises(ValueError, match=msg):
+ s.rank('average')
+
+ @pytest.mark.parametrize('contents,dtype', [
+ ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10,
+ 2, 40, np.inf],
+ 'float64'),
+ ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-45, 0, 1e-40, 1e-20, 1e-10,
+ 2, 40, np.inf],
+ 'float32'),
+ ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max],
+ 'uint8'),
+ pytest.param([np.iinfo(np.int64).min, -100, 0, 1, 9999, 100000,
+ 1e10, np.iinfo(np.int64).max],
+ 'int64',
+ marks=pytest.mark.xfail(
+                     reason="iNaT is equivalent to minimum value of dtype "
+                            "int64 pending issue GH#16674")),
+ ([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()],
+ 'object')
+ ])
+ def test_rank_inf(self, contents, dtype):
+ dtype_na_map = {
+ 'float64': np.nan,
+ 'float32': np.nan,
+ 'int64': iNaT,
+ 'object': None
+ }
+        # Insert nans at random positions if the underlying dtype has a
+        # missing-value sentinel, then adjust the expected order by adding
+        # nans accordingly. This tests whether the rank calculation is
+        # affected when values are intertwined with nan values.
+ values = np.array(contents, dtype=dtype)
+ exp_order = np.array(range(len(values)), dtype='float64') + 1.0
+ if dtype in dtype_na_map:
+ na_value = dtype_na_map[dtype]
+ nan_indices = np.random.choice(range(len(values)), 5)
+ values = np.insert(values, nan_indices, na_value)
+ exp_order = np.insert(exp_order, nan_indices, np.nan)
+ # shuffle the testing array and expected results in the same way
+ random_order = np.random.permutation(len(values))
+ iseries = Series(values[random_order])
+ exp = Series(exp_order[random_order], dtype='float64')
+ iranks = iseries.rank()
+ assert_series_equal(iranks, exp)
+
+ def test_rank_tie_methods(self):
+ s = self.s
+
+ def _check(s, expected, method='average'):
+ result = s.rank(method=method)
+ tm.assert_series_equal(result, Series(expected))
+
+ dtypes = [None, object]
+ disabled = {(object, 'first')}
+ results = self.results
+
+ for method, dtype in product(results, dtypes):
+ if (dtype, method) in disabled:
+ continue
+ series = s if dtype is None else s.astype(dtype)
+ _check(series, results[method], method=method)
+
+ @td.skip_if_no_scipy
+ @pytest.mark.parametrize('ascending', [True, False])
+ @pytest.mark.parametrize('method', ['average', 'min', 'max', 'first',
+ 'dense'])
+ @pytest.mark.parametrize('na_option', ['top', 'bottom', 'keep'])
+ def test_rank_tie_methods_on_infs_nans(self, method, na_option, ascending):
+ dtypes = [('object', None, Infinity(), NegInfinity()),
+ ('float64', np.nan, np.inf, -np.inf)]
+ chunk = 3
+ disabled = {('object', 'first')}
+
+ def _check(s, method, na_option, ascending):
+ exp_ranks = {
+ 'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
+ 'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
+ 'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
+ 'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
+ 'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3])
+ }
+ ranks = exp_ranks[method]
+ if na_option == 'top':
+ order = [ranks[1], ranks[0], ranks[2]]
+ elif na_option == 'bottom':
+ order = [ranks[0], ranks[2], ranks[1]]
+ else:
+ order = [ranks[0], [np.nan] * chunk, ranks[1]]
+ expected = order if ascending else order[::-1]
+ expected = list(chain.from_iterable(expected))
+ result = s.rank(method=method, na_option=na_option,
+ ascending=ascending)
+ tm.assert_series_equal(result, Series(expected, dtype='float64'))
+
+ for dtype, na_value, pos_inf, neg_inf in dtypes:
+ in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
+ iseries = Series(in_arr, dtype=dtype)
+ if (dtype, method) in disabled:
+ continue
+ _check(iseries, method, na_option, ascending)
+
+ def test_rank_desc_mix_nans_infs(self):
+ # GH 19538
+ # check descending ranking when mix nans and infs
+ iseries = Series([1, np.nan, np.inf, -np.inf, 25])
+ result = iseries.rank(ascending=False)
+ exp = Series([3, np.nan, 1, 4, 2], dtype='float64')
+ tm.assert_series_equal(result, exp)
+
+ def test_rank_methods_series(self):
+        # see test_rank above: importorskip needs a module name
+        pytest.importorskip('scipy.special')
+        rankdata = pytest.importorskip('scipy.stats').rankdata
+ import scipy
+
+ xs = np.random.randn(9)
+ xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates
+ np.random.shuffle(xs)
+
+ index = [chr(ord('a') + i) for i in range(len(xs))]
+
+ for vals in [xs, xs + 1e6, xs * 1e-6]:
+ ts = Series(vals, index=index)
+
+ for m in ['average', 'min', 'max', 'first', 'dense']:
+ result = ts.rank(method=m)
+ sprank = rankdata(vals, m if m != 'first' else 'ordinal')
+ expected = Series(sprank, index=index)
+
+ if LooseVersion(scipy.__version__) >= LooseVersion('0.17.0'):
+ expected = expected.astype('float64')
+ tm.assert_series_equal(result, expected)
+
+ def test_rank_dense_method(self):
+ dtypes = ['O', 'f8', 'i8']
+ in_out = [([1], [1]),
+ ([2], [1]),
+ ([0], [1]),
+ ([2, 2], [1, 1]),
+ ([1, 2, 3], [1, 2, 3]),
+                  ([4, 2, 1], [3, 2, 1]),
+ ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
+ ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])]
+
+ for ser, exp in in_out:
+ for dtype in dtypes:
+ s = Series(ser).astype(dtype)
+ result = s.rank(method='dense')
+ expected = Series(exp).astype(result.dtype)
+ assert_series_equal(result, expected)
+
+ def test_rank_descending(self):
+ dtypes = ['O', 'f8', 'i8']
+
+ for dtype, method in product(dtypes, self.results):
+ if 'i' in dtype:
+ s = self.s.dropna()
+ else:
+ s = self.s.astype(dtype)
+
+ res = s.rank(ascending=False)
+ expected = (s.max() - s).rank()
+ assert_series_equal(res, expected)
+
+ if method == 'first' and dtype == 'O':
+ continue
+
+ expected = (s.max() - s).rank(method=method)
+ res2 = s.rank(method=method, ascending=False)
+ assert_series_equal(res2, expected)
+
+ def test_rank_int(self):
+ s = self.s.dropna().astype('i8')
+
+ for method, res in compat.iteritems(self.results):
+ result = s.rank(method=method)
+ expected = Series(res).dropna()
+ expected.index = result.index
+ assert_series_equal(result, expected)
+
+ def test_rank_object_bug(self):
+ # GH 13445
+
+ # smoke tests
+ Series([np.nan] * 32).astype(object).rank(ascending=True)
+ Series([np.nan] * 32).astype(object).rank(ascending=False)
+
+ def test_rank_modify_inplace(self):
+ # GH 18521
+ # Check rank does not mutate series
+ s = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT])
+ expected = s.copy()
+
+ s.rank()
+ result = s
+ assert_series_equal(result, expected)
+
+
+# GH15630, pct should be on 100% basis when method='dense'
+
[email protected]('dtype', ['O', 'f8', 'i8'])
[email protected]('ser, exp', [
+ ([1], [1.]),
+ ([1, 2], [1. / 2, 2. / 2]),
+ ([2, 2], [1., 1.]),
+ ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
+ ([1, 2, 2], [1. / 2, 2. / 2, 2. / 2]),
+    ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3]),
+ ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
+ ([1, 1, 3, 3, 5, 5], [1. / 3, 1. / 3, 2. / 3, 2. / 3, 3. / 3, 3. / 3]),
+ ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
+def test_rank_dense_pct(dtype, ser, exp):
+ s = Series(ser).astype(dtype)
+ result = s.rank(method='dense', pct=True)
+ expected = Series(exp).astype(result.dtype)
+ assert_series_equal(result, expected)
+
+
[email protected]('dtype', ['O', 'f8', 'i8'])
[email protected]('ser, exp', [
+ ([1], [1.]),
+ ([1, 2], [1. / 2, 2. / 2]),
+ ([2, 2], [1. / 2, 1. / 2]),
+ ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
+ ([1, 2, 2], [1. / 3, 2. / 3, 2. / 3]),
+    ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3]),
+ ([1, 1, 5, 5, 3], [1. / 5, 1. / 5, 4. / 5, 4. / 5, 3. / 5]),
+ ([1, 1, 3, 3, 5, 5], [1. / 6, 1. / 6, 3. / 6, 3. / 6, 5. / 6, 5. / 6]),
+ ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
+def test_rank_min_pct(dtype, ser, exp):
+ s = Series(ser).astype(dtype)
+ result = s.rank(method='min', pct=True)
+ expected = Series(exp).astype(result.dtype)
+ assert_series_equal(result, expected)
+
+
[email protected]('dtype', ['O', 'f8', 'i8'])
[email protected]('ser, exp', [
+ ([1], [1.]),
+ ([1, 2], [1. / 2, 2. / 2]),
+ ([2, 2], [1., 1.]),
+ ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
+ ([1, 2, 2], [1. / 3, 3. / 3, 3. / 3]),
+    ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3]),
+ ([1, 1, 5, 5, 3], [2. / 5, 2. / 5, 5. / 5, 5. / 5, 3. / 5]),
+ ([1, 1, 3, 3, 5, 5], [2. / 6, 2. / 6, 4. / 6, 4. / 6, 6. / 6, 6. / 6]),
+ ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
+def test_rank_max_pct(dtype, ser, exp):
+ s = Series(ser).astype(dtype)
+ result = s.rank(method='max', pct=True)
+ expected = Series(exp).astype(result.dtype)
+ assert_series_equal(result, expected)
+
+
[email protected]('dtype', ['O', 'f8', 'i8'])
[email protected]('ser, exp', [
+ ([1], [1.]),
+ ([1, 2], [1. / 2, 2. / 2]),
+ ([2, 2], [1.5 / 2, 1.5 / 2]),
+ ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
+ ([1, 2, 2], [1. / 3, 2.5 / 3, 2.5 / 3]),
+    ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3]),
+ ([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3. / 5]),
+ ([1, 1, 3, 3, 5, 5],
+ [1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]),
+ ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
+def test_rank_average_pct(dtype, ser, exp):
+ s = Series(ser).astype(dtype)
+ result = s.rank(method='average', pct=True)
+ expected = Series(exp).astype(result.dtype)
+ assert_series_equal(result, expected)
+
+
[email protected]('dtype', ['f8', 'i8'])
[email protected]('ser, exp', [
+ ([1], [1.]),
+ ([1, 2], [1. / 2, 2. / 2]),
+ ([2, 2], [1. / 2, 2. / 2.]),
+ ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
+ ([1, 2, 2], [1. / 3, 2. / 3, 3. / 3]),
+    ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3]),
+ ([1, 1, 5, 5, 3], [1. / 5, 2. / 5, 4. / 5, 5. / 5, 3. / 5]),
+ ([1, 1, 3, 3, 5, 5], [1. / 6, 2. / 6, 3. / 6, 4. / 6, 5. / 6, 6. / 6]),
+ ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
+def test_rank_first_pct(dtype, ser, exp):
+ s = Series(ser).astype(dtype)
+ result = s.rank(method='first', pct=True)
+ expected = Series(exp).astype(result.dtype)
+ assert_series_equal(result, expected)
+
+
+def test_pct_max_many_rows():
+ # GH 18271
+ s = Series(np.arange(2**24 + 1))
+ result = s.rank(pct=True).max()
+ assert result == 1
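+
+
+def test_rank_dense_vs_min_pct_sketch():
+    # A hedged sketch tying the parametrized cases above together: with
+    # ties, method='dense' divides by the number of distinct groups while
+    # the other methods divide by the number of rows.
+    s = Series([1, 1, 2])
+    assert list(s.rank(method='dense', pct=True)) == [0.5, 0.5, 1.0]
+    assert list(s.rank(method='min', pct=True)) == [1. / 3, 1. / 3, 1.]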
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_replace.py b/contrib/python/pandas/py2/pandas/tests/series/test_replace.py
new file mode 100644
index 00000000000..2e7b746f6c9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_replace.py
@@ -0,0 +1,296 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+from .common import TestData
+
+
+class TestSeriesReplace(TestData):
+ def test_replace(self):
+ N = 100
+ ser = pd.Series(np.random.randn(N))
+ ser[0:4] = np.nan
+ ser[6:10] = 0
+
+ # replace list with a single value
+ ser.replace([np.nan], -1, inplace=True)
+
+ exp = ser.fillna(-1)
+ tm.assert_series_equal(ser, exp)
+
+ rs = ser.replace(0., np.nan)
+ ser[ser == 0.] = np.nan
+ tm.assert_series_equal(rs, ser)
+
+ ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N),
+ dtype=object)
+ ser[:5] = np.nan
+ ser[6:10] = 'foo'
+ ser[20:30] = 'bar'
+
+ # replace list with a single value
+ rs = ser.replace([np.nan, 'foo', 'bar'], -1)
+
+ assert (rs[:5] == -1).all()
+ assert (rs[6:10] == -1).all()
+ assert (rs[20:30] == -1).all()
+ assert (pd.isna(ser[:5])).all()
+
+ # replace with different values
+ rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3})
+
+ assert (rs[:5] == -1).all()
+ assert (rs[6:10] == -2).all()
+ assert (rs[20:30] == -3).all()
+ assert (pd.isna(ser[:5])).all()
+
+ # replace with different values with 2 lists
+ rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3])
+ tm.assert_series_equal(rs, rs2)
+
+ # replace inplace
+ ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True)
+
+ assert (ser[:5] == -1).all()
+ assert (ser[6:10] == -1).all()
+ assert (ser[20:30] == -1).all()
+
+ ser = pd.Series([np.nan, 0, np.inf])
+ tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))
+
+ ser = pd.Series([np.nan, 0, 'foo', 'bar', np.inf, None, pd.NaT])
+ tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))
+ filled = ser.copy()
+ filled[4] = 0
+ tm.assert_series_equal(ser.replace(np.inf, 0), filled)
+
+ ser = pd.Series(self.ts.index)
+ tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))
+
+ # malformed
+ msg = r"Replacement lists must match in length\. Expecting 3 got 2"
+ with pytest.raises(ValueError, match=msg):
+ ser.replace([1, 2, 3], [np.nan, 0])
+
+ # make sure that we aren't just masking a TypeError because bools don't
+ # implement indexing
+ with pytest.raises(TypeError, match='Cannot compare types .+'):
+ ser.replace([1, 2], [np.nan, 0])
+
+ ser = pd.Series([0, 1, 2, 3, 4])
+ result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])
+ tm.assert_series_equal(result, pd.Series([4, 3, 2, 1, 0]))
+
+ def test_replace_gh5319(self):
+ # API change from 0.12?
+ # GH 5319
+ ser = pd.Series([0, np.nan, 2, 3, 4])
+ expected = ser.ffill()
+ result = ser.replace([np.nan])
+ tm.assert_series_equal(result, expected)
+
+ ser = pd.Series([0, np.nan, 2, 3, 4])
+ expected = ser.ffill()
+ result = ser.replace(np.nan)
+ tm.assert_series_equal(result, expected)
+ # GH 5797
+ ser = pd.Series(pd.date_range('20130101', periods=5))
+ expected = ser.copy()
+ expected.loc[2] = pd.Timestamp('20120101')
+ result = ser.replace({pd.Timestamp('20130103'):
+ pd.Timestamp('20120101')})
+ tm.assert_series_equal(result, expected)
+ result = ser.replace(pd.Timestamp('20130103'),
+ pd.Timestamp('20120101'))
+ tm.assert_series_equal(result, expected)
+
+ # GH 11792: Test with replacing NaT in a list with tz data
+ ts = pd.Timestamp('2015/01/01', tz='UTC')
+ s = pd.Series([pd.NaT, pd.Timestamp('2015/01/01', tz='UTC')])
+ result = s.replace([np.nan, pd.NaT], pd.Timestamp.min)
+ expected = pd.Series([pd.Timestamp.min, ts], dtype=object)
+ tm.assert_series_equal(expected, result)
+
+ def test_replace_with_single_list(self):
+ ser = pd.Series([0, 1, 2, 3, 4])
+ result = ser.replace([1, 2, 3])
+ tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4]))
+
+ s = ser.copy()
+ s.replace([1, 2, 3], inplace=True)
+ tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4]))
+
+ # make sure things don't get corrupted when fillna call fails
+ s = ser.copy()
+ msg = (r"Invalid fill method\. Expecting pad \(ffill\) or backfill"
+ r" \(bfill\)\. Got crash_cymbal")
+ with pytest.raises(ValueError, match=msg):
+ s.replace([1, 2, 3], inplace=True, method='crash_cymbal')
+ tm.assert_series_equal(s, ser)
+
+ def test_replace_with_empty_list(self):
+ # GH 21977
+ s = pd.Series([[1], [2, 3], [], np.nan, [4]])
+ expected = s
+ result = s.replace([], np.nan)
+ tm.assert_series_equal(result, expected)
+
+ # GH 19266
+ with pytest.raises(ValueError, match="cannot assign mismatch"):
+ s.replace({np.nan: []})
+ with pytest.raises(ValueError, match="cannot assign mismatch"):
+ s.replace({np.nan: ['dummy', 'alt']})
+
+ def test_replace_mixed_types(self):
+ s = pd.Series(np.arange(5), dtype='int64')
+
+ def check_replace(to_rep, val, expected):
+ sc = s.copy()
+ r = s.replace(to_rep, val)
+ sc.replace(to_rep, val, inplace=True)
+ tm.assert_series_equal(expected, r)
+ tm.assert_series_equal(expected, sc)
+
+ # MUST upcast to float
+ e = pd.Series([0., 1., 2., 3., 4.])
+ tr, v = [3], [3.0]
+ check_replace(tr, v, e)
+
+ # MUST upcast to float
+ e = pd.Series([0, 1, 2, 3.5, 4])
+ tr, v = [3], [3.5]
+ check_replace(tr, v, e)
+
+ # casts to object
+ e = pd.Series([0, 1, 2, 3.5, 'a'])
+ tr, v = [3, 4], [3.5, 'a']
+ check_replace(tr, v, e)
+
+ # again casts to object
+ e = pd.Series([0, 1, 2, 3.5, pd.Timestamp('20130101')])
+ tr, v = [3, 4], [3.5, pd.Timestamp('20130101')]
+ check_replace(tr, v, e)
+
+ # casts to object
+ e = pd.Series([0, 1, 2, 3.5, True], dtype='object')
+ tr, v = [3, 4], [3.5, True]
+ check_replace(tr, v, e)
+
+ # test an object with dates + floats + integers + strings
+ dr = pd.date_range('1/1/2001', '1/10/2001',
+ freq='D').to_series().reset_index(drop=True)
+ result = dr.astype(object).replace(
+ [dr[0], dr[1], dr[2]], [1.0, 2, 'a'])
+ expected = pd.Series([1.0, 2, 'a'] + dr[3:].tolist(), dtype=object)
+ tm.assert_series_equal(result, expected)
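+
+        # Hedged recap of the upcasting rules above: a float replacement
+        # upcasts the int Series to float, and a non-numeric replacement
+        # falls back to object dtype.
+        assert s.replace(3, 3.5).dtype == np.dtype('float64')
+        assert s.replace(3, 'a').dtype == np.dtype('object')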
+
+ def test_replace_bool_with_string_no_op(self):
+ s = pd.Series([True, False, True])
+ result = s.replace('fun', 'in-the-sun')
+ tm.assert_series_equal(s, result)
+
+ def test_replace_bool_with_string(self):
+ # nonexistent elements
+ s = pd.Series([True, False, True])
+ result = s.replace(True, '2u')
+ expected = pd.Series(['2u', False, '2u'])
+ tm.assert_series_equal(expected, result)
+
+ def test_replace_bool_with_bool(self):
+ s = pd.Series([True, False, True])
+ result = s.replace(True, False)
+ expected = pd.Series([False] * len(s))
+ tm.assert_series_equal(expected, result)
+
+ def test_replace_with_dict_with_bool_keys(self):
+ s = pd.Series([True, False, True])
+ with pytest.raises(TypeError, match='Cannot compare types .+'):
+ s.replace({'asdf': 'asdb', True: 'yes'})
+
+ def test_replace2(self):
+ N = 100
+ ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N),
+ dtype=object)
+ ser[:5] = np.nan
+ ser[6:10] = 'foo'
+ ser[20:30] = 'bar'
+
+ # replace list with a single value
+ rs = ser.replace([np.nan, 'foo', 'bar'], -1)
+
+ assert (rs[:5] == -1).all()
+ assert (rs[6:10] == -1).all()
+ assert (rs[20:30] == -1).all()
+ assert (pd.isna(ser[:5])).all()
+
+ # replace with different values
+ rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3})
+
+ assert (rs[:5] == -1).all()
+ assert (rs[6:10] == -2).all()
+ assert (rs[20:30] == -3).all()
+ assert (pd.isna(ser[:5])).all()
+
+ # replace with different values with 2 lists
+ rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3])
+ tm.assert_series_equal(rs, rs2)
+
+ # replace inplace
+ ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True)
+ assert (ser[:5] == -1).all()
+ assert (ser[6:10] == -1).all()
+ assert (ser[20:30] == -1).all()
+
+ def test_replace_with_empty_dictlike(self):
+ # GH 15289
+ s = pd.Series(list('abcd'))
+ tm.assert_series_equal(s, s.replace(dict()))
+ tm.assert_series_equal(s, s.replace(pd.Series([])))
+
+ def test_replace_string_with_number(self):
+ # GH 15743
+ s = pd.Series([1, 2, 3])
+ result = s.replace('2', np.nan)
+ expected = pd.Series([1, 2, 3])
+ tm.assert_series_equal(expected, result)
+
+ def test_replace_replacer_equals_replacement(self):
+ # GH 20656
+ # make sure all replacers are matching against original values
+ s = pd.Series(['a', 'b'])
+ expected = pd.Series(['b', 'a'])
+ result = s.replace({'a': 'b', 'b': 'a'})
+ tm.assert_series_equal(expected, result)
+
+ def test_replace_unicode_with_number(self):
+ # GH 15743
+ s = pd.Series([1, 2, 3])
+ result = s.replace(u'2', np.nan)
+ expected = pd.Series([1, 2, 3])
+ tm.assert_series_equal(expected, result)
+
+ def test_replace_mixed_types_with_string(self):
+ # Testing mixed
+ s = pd.Series([1, 2, 3, '4', 4, 5])
+ result = s.replace([2, '4'], np.nan)
+ expected = pd.Series([1, np.nan, 3, np.nan, 4, 5])
+ tm.assert_series_equal(expected, result)
+
+ def test_replace_with_no_overflowerror(self):
+ # GH 25616
+ # casts to object without Exception from OverflowError
+ s = pd.Series([0, 1, 2, 3, 4])
+ result = s.replace([3], ['100000000000000000000'])
+ expected = pd.Series([0, 1, 2, '100000000000000000000', 4])
+ tm.assert_series_equal(result, expected)
+
+ s = pd.Series([0, '100000000000000000000',
+ '100000000000000000001'])
+ result = s.replace(['100000000000000000000'], [1])
+ expected = pd.Series([0, 1, '100000000000000000001'])
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_repr.py b/contrib/python/pandas/py2/pandas/tests/series/test_repr.py
new file mode 100644
index 00000000000..842207f2a57
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_repr.py
@@ -0,0 +1,484 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime, timedelta
+
+import numpy as np
+
+import pandas.compat as compat
+from pandas.compat import lrange, range, u
+
+import pandas as pd
+from pandas import (
+ Categorical, DataFrame, Index, Series, date_range, option_context,
+ period_range, timedelta_range)
+from pandas.core.base import StringMixin
+from pandas.core.index import MultiIndex
+import pandas.util.testing as tm
+
+from .common import TestData
+
+
+class TestSeriesRepr(TestData):
+
+ def test_multilevel_name_print(self):
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
+ 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ s = Series(lrange(0, len(index)), index=index, name='sth')
+ expected = ["first second", "foo one 0",
+ " two 1", " three 2",
+ "bar one 3", " two 4",
+ "baz two 5", " three 6",
+ "qux one 7", " two 8",
+ " three 9", "Name: sth, dtype: int64"]
+ expected = "\n".join(expected)
+ assert repr(s) == expected
+
+ def test_name_printing(self):
+ # Test small Series.
+ s = Series([0, 1, 2])
+
+ s.name = "test"
+ assert "Name: test" in repr(s)
+
+ s.name = None
+ assert "Name:" not in repr(s)
+
+ # Test big Series (diff code path).
+ s = Series(lrange(0, 1000))
+
+ s.name = "test"
+ assert "Name: test" in repr(s)
+
+ s.name = None
+ assert "Name:" not in repr(s)
+
+ s = Series(index=date_range('20010101', '20020101'), name='test')
+ assert "Name: test" in repr(s)
+
+ def test_repr(self):
+ str(self.ts)
+ str(self.series)
+ str(self.series.astype(int))
+ str(self.objSeries)
+
+ str(Series(tm.randn(1000), index=np.arange(1000)))
+ str(Series(tm.randn(1000), index=np.arange(1000, 0, step=-1)))
+
+ # empty
+ str(self.empty)
+
+ # with NaNs
+ self.series[5:7] = np.NaN
+ str(self.series)
+
+ # with Nones
+ ots = self.ts.astype('O')
+ ots[::2] = None
+ repr(ots)
+
+ # various names
+ for name in ['', 1, 1.2, 'foo', u('\u03B1\u03B2\u03B3'),
+ 'loooooooooooooooooooooooooooooooooooooooooooooooooooong',
+ ('foo', 'bar', 'baz'), (1, 2), ('foo', 1, 2.3),
+ (u('\u03B1'), u('\u03B2'), u('\u03B3')),
+ (u('\u03B1'), 'bar')]:
+ self.series.name = name
+ repr(self.series)
+
+ biggie = Series(tm.randn(1000), index=np.arange(1000),
+ name=('foo', 'bar', 'baz'))
+ repr(biggie)
+
+ # 0 as name
+ ser = Series(np.random.randn(100), name=0)
+ rep_str = repr(ser)
+ assert "Name: 0" in rep_str
+
+ # tidy repr
+ ser = Series(np.random.randn(1001), name=0)
+ rep_str = repr(ser)
+ assert "Name: 0" in rep_str
+
+ ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"])
+ assert "\t" not in repr(ser)
+ assert "\r" not in repr(ser)
+ assert "a\n" not in repr(ser)
+
+ # with empty series (#4651)
+ s = Series([], dtype=np.int64, name='foo')
+ assert repr(s) == 'Series([], Name: foo, dtype: int64)'
+
+ s = Series([], dtype=np.int64, name=None)
+ assert repr(s) == 'Series([], dtype: int64)'
+
+ def test_tidy_repr(self):
+ a = Series([u("\u05d0")] * 1000)
+ a.name = 'title1'
+ repr(a) # should not raise exception
+
+ def test_repr_bool_fails(self, capsys):
+ s = Series([DataFrame(np.random.randn(2, 2)) for i in range(5)])
+
+ # It works (with no Cython exception barf)!
+ repr(s)
+
+ captured = capsys.readouterr()
+ assert captured.err == ''
+
+ def test_repr_name_iterable_indexable(self):
+ s = Series([1, 2, 3], name=np.int64(3))
+
+ # it works!
+ repr(s)
+
+ s.name = (u("\u05d0"), ) * 2
+ repr(s)
+
+ def test_repr_should_return_str(self):
+ # https://docs.python.org/3/reference/datamodel.html#object.__repr__
+ # ...The return value must be a string object.
+
+ # (str on py2.x, str (unicode) on py3)
+
+ data = [8, 5, 3, 5]
+ index1 = [u("\u03c3"), u("\u03c4"), u("\u03c5"), u("\u03c6")]
+ df = Series(data, index=index1)
+        assert type(df.__repr__()) == str  # both py2 / 3
+
+ def test_repr_max_rows(self):
+ # GH 6863
+ with pd.option_context('max_rows', None):
+ str(Series(range(1001))) # should not raise exception
+
+ def test_unicode_string_with_unicode(self):
+ df = Series([u("\u05d0")], name=u("\u05d1"))
+ if compat.PY3:
+ str(df)
+ else:
+ compat.text_type(df)
+
+ def test_bytestring_with_unicode(self):
+ df = Series([u("\u05d0")], name=u("\u05d1"))
+ if compat.PY3:
+ bytes(df)
+ else:
+ str(df)
+
+ def test_timeseries_repr_object_dtype(self):
+ index = Index([datetime(2000, 1, 1) + timedelta(i)
+ for i in range(1000)], dtype=object)
+ ts = Series(np.random.randn(len(index)), index)
+ repr(ts)
+
+ ts = tm.makeTimeSeries(1000)
+ assert repr(ts).splitlines()[-1].startswith('Freq:')
+
+ ts2 = ts.iloc[np.random.randint(0, len(ts) - 1, 400)]
+ repr(ts2).splitlines()[-1]
+
+ def test_latex_repr(self):
+ result = r"""\begin{tabular}{ll}
+\toprule
+{} & 0 \\
+\midrule
+0 & $\alpha$ \\
+1 & b \\
+2 & c \\
+\bottomrule
+\end{tabular}
+"""
+ with option_context('display.latex.escape', False,
+ 'display.latex.repr', True):
+ s = Series([r'$\alpha$', 'b', 'c'])
+ assert result == s._repr_latex_()
+
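+        # outside the option context, display.latex.repr reverts to its
+        # default (False), so _repr_latex_ returns None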
+ assert s._repr_latex_() is None
+
+ def test_index_repr_in_frame_with_nan(self):
+ # see gh-25061
+ i = Index([1, np.nan])
+ s = Series([1, 2], index=i)
+ exp = """1.0 1\nNaN 2\ndtype: int64"""
+
+ assert repr(s) == exp
+
+
+class TestCategoricalRepr(object):
+
+ def test_categorical_repr_unicode(self):
+ # GH#21002 if len(index) > 60, sys.getdefaultencoding()=='ascii',
+ # and we are working in PY2, then rendering a Categorical could raise
+ # UnicodeDecodeError by trying to decode when it shouldn't
+
+ class County(StringMixin):
+ name = u'San Sebastián'
+ state = u'PR'
+
+ def __unicode__(self):
+ return self.name + u', ' + self.state
+
+ cat = pd.Categorical([County() for n in range(61)])
+ idx = pd.Index(cat)
+ ser = idx.to_series()
+
+ if compat.PY3:
+ # no reloading of sys, just check that the default (utf8) works
+ # as expected
+ repr(ser)
+ str(ser)
+
+ else:
+ # set sys.defaultencoding to ascii, then change it back after
+ # the test
+ with tm.set_defaultencoding('ascii'):
+ repr(ser)
+ str(ser)
+
+ def test_categorical_repr(self):
+ a = Series(Categorical([1, 2, 3, 4]))
+ exp = u("0 1\n1 2\n2 3\n3 4\n" +
+ "dtype: category\nCategories (4, int64): [1, 2, 3, 4]")
+
+ assert exp == a.__unicode__()
+
+ a = Series(Categorical(["a", "b"] * 25))
+ exp = u("0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" +
+ "Length: 50, dtype: category\nCategories (2, object): [a, b]")
+ with option_context("display.max_rows", 5):
+ assert exp == repr(a)
+
+ levs = list("abcdefghijklmnopqrstuvwxyz")
+ a = Series(Categorical(["a", "b"], categories=levs, ordered=True))
+ exp = u("0 a\n1 b\n" + "dtype: category\n"
+ "Categories (26, object): [a < b < c < d ... w < x < y < z]")
+ assert exp == a.__unicode__()
+
+ def test_categorical_series_repr(self):
+ s = Series(Categorical([1, 2, 3]))
+ exp = """0 1
+1 2
+2 3
+dtype: category
+Categories (3, int64): [1, 2, 3]"""
+
+ assert repr(s) == exp
+
+ s = Series(Categorical(np.arange(10)))
+ exp = """0 0
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+dtype: category
+Categories (10, int64): [0, 1, 2, 3, ..., 6, 7, 8, 9]"""
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_ordered(self):
+ s = Series(Categorical([1, 2, 3], ordered=True))
+ exp = """0 1
+1 2
+2 3
+dtype: category
+Categories (3, int64): [1 < 2 < 3]"""
+
+ assert repr(s) == exp
+
+ s = Series(Categorical(np.arange(10), ordered=True))
+ exp = """0 0
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+dtype: category
+Categories (10, int64): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]"""
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_datetime(self):
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ s = Series(Categorical(idx))
+ exp = """0 2011-01-01 09:00:00
+1 2011-01-01 10:00:00
+2 2011-01-01 11:00:00
+3 2011-01-01 12:00:00
+4 2011-01-01 13:00:00
+dtype: category
+Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00,
+ 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" # noqa
+
+ assert repr(s) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ s = Series(Categorical(idx))
+ exp = """0 2011-01-01 09:00:00-05:00
+1 2011-01-01 10:00:00-05:00
+2 2011-01-01 11:00:00-05:00
+3 2011-01-01 12:00:00-05:00
+4 2011-01-01 13:00:00-05:00
+dtype: category
+Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,
+ 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,
+ 2011-01-01 13:00:00-05:00]""" # noqa
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_datetime_ordered(self):
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5)
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 2011-01-01 09:00:00
+1 2011-01-01 10:00:00
+2 2011-01-01 11:00:00
+3 2011-01-01 12:00:00
+4 2011-01-01 13:00:00
+dtype: category
+Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
+ 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa
+
+ assert repr(s) == exp
+
+ idx = date_range('2011-01-01 09:00', freq='H', periods=5,
+ tz='US/Eastern')
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 2011-01-01 09:00:00-05:00
+1 2011-01-01 10:00:00-05:00
+2 2011-01-01 11:00:00-05:00
+3 2011-01-01 12:00:00-05:00
+4 2011-01-01 13:00:00-05:00
+dtype: category
+Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
+ 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
+ 2011-01-01 13:00:00-05:00]""" # noqa
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_period(self):
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ s = Series(Categorical(idx))
+ exp = """0 2011-01-01 09:00
+1 2011-01-01 10:00
+2 2011-01-01 11:00
+3 2011-01-01 12:00
+4 2011-01-01 13:00
+dtype: category
+Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(s) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ s = Series(Categorical(idx))
+ exp = """0 2011-01
+1 2011-02
+2 2011-03
+3 2011-04
+4 2011-05
+dtype: category
+Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_period_ordered(self):
+ idx = period_range('2011-01-01 09:00', freq='H', periods=5)
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 2011-01-01 09:00
+1 2011-01-01 10:00
+2 2011-01-01 11:00
+3 2011-01-01 12:00
+4 2011-01-01 13:00
+dtype: category
+Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
+ 2011-01-01 13:00]""" # noqa
+
+ assert repr(s) == exp
+
+ idx = period_range('2011-01', freq='M', periods=5)
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 2011-01
+1 2011-02
+2 2011-03
+3 2011-04
+4 2011-05
+dtype: category
+Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_timedelta(self):
+ idx = timedelta_range('1 days', periods=5)
+ s = Series(Categorical(idx))
+ exp = """0 1 days
+1 2 days
+2 3 days
+3 4 days
+4 5 days
+dtype: category
+Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
+
+ assert repr(s) == exp
+
+ idx = timedelta_range('1 hours', periods=10)
+ s = Series(Categorical(idx))
+ exp = """0 0 days 01:00:00
+1 1 days 01:00:00
+2 2 days 01:00:00
+3 3 days 01:00:00
+4 4 days 01:00:00
+5 5 days 01:00:00
+6 6 days 01:00:00
+7 7 days 01:00:00
+8 8 days 01:00:00
+9 9 days 01:00:00
+dtype: category
+Categories (10, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
+ 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00,
+ 8 days 01:00:00, 9 days 01:00:00]""" # noqa
+
+ assert repr(s) == exp
+
+ def test_categorical_series_repr_timedelta_ordered(self):
+ idx = timedelta_range('1 days', periods=5)
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 1 days
+1 2 days
+2 3 days
+3 4 days
+4 5 days
+dtype: category
+Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa
+
+ assert repr(s) == exp
+
+ idx = timedelta_range('1 hours', periods=10)
+ s = Series(Categorical(idx, ordered=True))
+ exp = """0 0 days 01:00:00
+1 1 days 01:00:00
+2 2 days 01:00:00
+3 3 days 01:00:00
+4 4 days 01:00:00
+5 5 days 01:00:00
+6 6 days 01:00:00
+7 7 days 01:00:00
+8 8 days 01:00:00
+9 9 days 01:00:00
+dtype: category
+Categories (10, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
+ 3 days 01:00:00 ... 6 days 01:00:00 < 7 days 01:00:00 <
+ 8 days 01:00:00 < 9 days 01:00:00]""" # noqa
+
+ assert repr(s) == exp
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_sorting.py b/contrib/python/pandas/py2/pandas/tests/series/test_sorting.py
new file mode 100644
index 00000000000..216f84c8f07
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_sorting.py
@@ -0,0 +1,266 @@
+# coding=utf-8
+
+import random
+
+import numpy as np
+import pytest
+
+from pandas import Categorical, DataFrame, IntervalIndex, MultiIndex, Series
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal, assert_series_equal
+
+from .common import TestData
+
+
+class TestSeriesSorting(TestData):
+
+ def test_sort_values(self):
+
+ # check indexes are reordered corresponding with the values
+ ser = Series([3, 2, 4, 1], ['A', 'B', 'C', 'D'])
+ expected = Series([1, 2, 3, 4], ['D', 'B', 'A', 'C'])
+ result = ser.sort_values()
+ tm.assert_series_equal(expected, result)
+
+ ts = self.ts.copy()
+ ts[:5] = np.NaN
+ vals = ts.values
+
+ result = ts.sort_values()
+ assert np.isnan(result[-5:]).all()
+ tm.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:]))
+
+ # na_position
+ result = ts.sort_values(na_position='first')
+ assert np.isnan(result[:5]).all()
+ tm.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:]))
+
+ # something object-type
+ ser = Series(['A', 'B'], [1, 2])
+ # no failure
+ ser.sort_values()
+
+ # ascending=False
+ ordered = ts.sort_values(ascending=False)
+ expected = np.sort(ts.dropna().values)[::-1]
+ assert_almost_equal(expected, ordered.dropna().values)
+ ordered = ts.sort_values(ascending=False, na_position='first')
+ assert_almost_equal(expected, ordered.dropna().values)
+
+ # ascending=[False] should behave the same as ascending=False
+ ordered = ts.sort_values(ascending=[False])
+ expected = ts.sort_values(ascending=False)
+ assert_series_equal(expected, ordered)
+ ordered = ts.sort_values(ascending=[False], na_position='first')
+ expected = ts.sort_values(ascending=False, na_position='first')
+ assert_series_equal(expected, ordered)
+
+ msg = "ascending must be boolean"
+ with pytest.raises(ValueError, match=msg):
+ ts.sort_values(ascending=None)
+ msg = r"Length of ascending \(0\) must be 1 for Series"
+ with pytest.raises(ValueError, match=msg):
+ ts.sort_values(ascending=[])
+ msg = r"Length of ascending \(3\) must be 1 for Series"
+ with pytest.raises(ValueError, match=msg):
+ ts.sort_values(ascending=[1, 2, 3])
+ msg = r"Length of ascending \(2\) must be 1 for Series"
+ with pytest.raises(ValueError, match=msg):
+ ts.sort_values(ascending=[False, False])
+ msg = "ascending must be boolean"
+ with pytest.raises(ValueError, match=msg):
+ ts.sort_values(ascending='foobar')
+
+ # inplace=True
+ ts = self.ts.copy()
+ ts.sort_values(ascending=False, inplace=True)
+ tm.assert_series_equal(ts, self.ts.sort_values(ascending=False))
+ tm.assert_index_equal(ts.index,
+ self.ts.sort_values(ascending=False).index)
+
+ # GH 5856/5853
+ # Series.sort_values operating on a view
+ df = DataFrame(np.random.randn(10, 4))
+ s = df.iloc[:, 0]
+
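+        # an in-place sort would reorder memory shared with the parent
+        # DataFrame, so pandas raises and asks for an explicit copy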
+ msg = ("This Series is a view of some other array, to sort in-place"
+ " you must create a copy")
+ with pytest.raises(ValueError, match=msg):
+ s.sort_values(inplace=True)
+
+ def test_sort_index(self):
+ rindex = list(self.ts.index)
+ random.shuffle(rindex)
+
+ random_order = self.ts.reindex(rindex)
+ sorted_series = random_order.sort_index()
+ assert_series_equal(sorted_series, self.ts)
+
+ # descending
+ sorted_series = random_order.sort_index(ascending=False)
+ assert_series_equal(sorted_series,
+ self.ts.reindex(self.ts.index[::-1]))
+
+ # compat on level
+ sorted_series = random_order.sort_index(level=0)
+ assert_series_equal(sorted_series, self.ts)
+
+ # compat on axis
+ sorted_series = random_order.sort_index(axis=0)
+ assert_series_equal(sorted_series, self.ts)
+
+ msg = r"No axis named 1 for object type <(class|type) 'type'>"
+ with pytest.raises(ValueError, match=msg):
+ random_order.sort_values(axis=1)
+
+ sorted_series = random_order.sort_index(level=0, axis=0)
+ assert_series_equal(sorted_series, self.ts)
+
+ with pytest.raises(ValueError, match=msg):
+ random_order.sort_index(level=0, axis=1)
+
+ def test_sort_index_inplace(self):
+
+ # For #11402
+ rindex = list(self.ts.index)
+ random.shuffle(rindex)
+
+ # descending
+ random_order = self.ts.reindex(rindex)
+ result = random_order.sort_index(ascending=False, inplace=True)
+
+ assert result is None
+ tm.assert_series_equal(random_order, self.ts.reindex(
+ self.ts.index[::-1]))
+
+ # ascending
+ random_order = self.ts.reindex(rindex)
+ result = random_order.sort_index(ascending=True, inplace=True)
+
+ assert result is None
+ tm.assert_series_equal(random_order, self.ts)
+
+ @pytest.mark.parametrize("level", ['A', 0]) # GH 21052
+ def test_sort_index_multiindex(self, level):
+
+ mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
+ s = Series([1, 2], mi)
+ backwards = s.iloc[[1, 0]]
+
+ # implicit sort_remaining=True
+ res = s.sort_index(level=level)
+ assert_series_equal(backwards, res)
+
+ # GH13496
+        # with sort_remaining=False, sorting on this level alone is a
+        # no-op, so the Series comes back unchanged
+ res = s.sort_index(level=level, sort_remaining=False)
+ assert_series_equal(s, res)
+
+ def test_sort_index_kind(self):
+ # GH #14444 & #13589: Add support for sort algo choosing
+ series = Series(index=[3, 2, 1, 4, 3])
+ expected_series = Series(index=[1, 2, 3, 3, 4])
+
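+        # all three supported sort algorithms must yield the same order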
+ index_sorted_series = series.sort_index(kind='mergesort')
+ assert_series_equal(expected_series, index_sorted_series)
+
+ index_sorted_series = series.sort_index(kind='quicksort')
+ assert_series_equal(expected_series, index_sorted_series)
+
+ index_sorted_series = series.sort_index(kind='heapsort')
+ assert_series_equal(expected_series, index_sorted_series)
+
+ def test_sort_index_na_position(self):
+ series = Series(index=[3, 2, 1, 4, 3, np.nan])
+
+ expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4])
+ index_sorted_series = series.sort_index(na_position='first')
+ assert_series_equal(expected_series_first, index_sorted_series)
+
+ expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan])
+ index_sorted_series = series.sort_index(na_position='last')
+ assert_series_equal(expected_series_last, index_sorted_series)
+
+ def test_sort_index_intervals(self):
+ s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays(
+ [0, 1, 2, 3],
+ [1, 2, 3, 4]))
+
+ result = s.sort_index()
+ expected = s
+ assert_series_equal(result, expected)
+
+ result = s.sort_index(ascending=False)
+ expected = Series([3, 2, 1, np.nan], IntervalIndex.from_arrays(
+ [3, 2, 1, 0],
+ [4, 3, 2, 1]))
+ assert_series_equal(result, expected)
+
+ def test_sort_values_categorical(self):
+
+ c = Categorical(["a", "b", "b", "a"], ordered=False)
+ cat = Series(c.copy())
+
+ # sort in the categories order
+ expected = Series(
+ Categorical(["a", "a", "b", "b"],
+ ordered=False), index=[0, 3, 1, 2])
+ result = cat.sort_values()
+ tm.assert_series_equal(result, expected)
+
+ cat = Series(Categorical(["a", "c", "b", "d"], ordered=True))
+ res = cat.sort_values()
+ exp = np.array(["a", "b", "c", "d"], dtype=np.object_)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+
+ cat = Series(Categorical(["a", "c", "b", "d"], categories=[
+ "a", "b", "c", "d"], ordered=True))
+ res = cat.sort_values()
+ exp = np.array(["a", "b", "c", "d"], dtype=np.object_)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+
+ res = cat.sort_values(ascending=False)
+ exp = np.array(["d", "c", "b", "a"], dtype=np.object_)
+ tm.assert_numpy_array_equal(res.__array__(), exp)
+
+ raw_cat1 = Categorical(["a", "b", "c", "d"],
+ categories=["a", "b", "c", "d"], ordered=False)
+ raw_cat2 = Categorical(["a", "b", "c", "d"],
+ categories=["d", "c", "b", "a"], ordered=True)
+ s = ["a", "b", "c", "d"]
+ df = DataFrame({"unsort": raw_cat1,
+ "sort": raw_cat2,
+ "string": s,
+ "values": [1, 2, 3, 4]})
+
+        # Categorical columns must sort correctly inside a DataFrame
+ res = df.sort_values(by=["string"], ascending=False)
+ exp = np.array(["d", "c", "b", "a"], dtype=np.object_)
+ tm.assert_numpy_array_equal(res["sort"].values.__array__(), exp)
+ assert res["sort"].dtype == "category"
+
+ res = df.sort_values(by=["sort"], ascending=False)
+ exp = df.sort_values(by=["string"], ascending=True)
+ tm.assert_series_equal(res["values"], exp["values"])
+ assert res["sort"].dtype == "category"
+ assert res["unsort"].dtype == "category"
+
+ # unordered cat, but we allow this
+ df.sort_values(by=["unsort"], ascending=False)
+
+ # multi-columns sort
+ # GH 7848
+ df = DataFrame({"id": [6, 5, 4, 3, 2, 1],
+ "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
+ df["grade"] = Categorical(df["raw_grade"], ordered=True)
+ df['grade'] = df['grade'].cat.set_categories(['b', 'e', 'a'])
+
+ # sorts 'grade' according to the order of the categories
+ result = df.sort_values(by=['grade'])
+ expected = df.iloc[[1, 2, 5, 0, 3, 4]]
+ tm.assert_frame_equal(result, expected)
+
+ # multi
+ result = df.sort_values(by=['grade', 'id'])
+ expected = df.iloc[[2, 1, 5, 4, 3, 0]]
+ tm.assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_subclass.py b/contrib/python/pandas/py2/pandas/tests/series/test_subclass.py
new file mode 100644
index 00000000000..68a162ee4c2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_subclass.py
@@ -0,0 +1,108 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+import numpy as np
+
+import pandas as pd
+from pandas import SparseDtype
+import pandas.util.testing as tm
+
+
+class TestSeriesSubclassing(object):
+
+ def test_indexing_sliced(self):
+ s = tm.SubclassedSeries([1, 2, 3, 4], index=list('abcd'))
+ res = s.loc[['a', 'b']]
+ exp = tm.SubclassedSeries([1, 2], index=list('ab'))
+ tm.assert_series_equal(res, exp)
+
+ res = s.iloc[[2, 3]]
+ exp = tm.SubclassedSeries([3, 4], index=list('cd'))
+ tm.assert_series_equal(res, exp)
+
+ res = s.loc[['a', 'b']]
+ exp = tm.SubclassedSeries([1, 2], index=list('ab'))
+ tm.assert_series_equal(res, exp)
+
+ def test_to_frame(self):
+ s = tm.SubclassedSeries([1, 2, 3, 4], index=list('abcd'), name='xxx')
+ res = s.to_frame()
+ exp = tm.SubclassedDataFrame({'xxx': [1, 2, 3, 4]}, index=list('abcd'))
+ tm.assert_frame_equal(res, exp)
+
+ def test_subclass_unstack(self):
+ # GH 15564
+ s = tm.SubclassedSeries(
+ [1, 2, 3, 4], index=[list('aabb'), list('xyxy')])
+
+ res = s.unstack()
+ exp = tm.SubclassedDataFrame(
+ {'x': [1, 3], 'y': [2, 4]}, index=['a', 'b'])
+
+ tm.assert_frame_equal(res, exp)
+
+
+class TestSparseSeriesSubclassing(object):
+
+ def test_subclass_sparse_slice(self):
+ # int64
+ s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5])
+ exp = tm.SubclassedSparseSeries([2, 3, 4], index=[1, 2, 3])
+ tm.assert_sp_series_equal(s.loc[1:3], exp)
+ assert s.loc[1:3].dtype == SparseDtype(np.int64)
+
+ exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2])
+ tm.assert_sp_series_equal(s.iloc[1:3], exp)
+ assert s.iloc[1:3].dtype == SparseDtype(np.int64)
+
+ exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2])
+ tm.assert_sp_series_equal(s[1:3], exp)
+ assert s[1:3].dtype == SparseDtype(np.int64)
+
+ # float64
+ s = tm.SubclassedSparseSeries([1., 2., 3., 4., 5.])
+ exp = tm.SubclassedSparseSeries([2., 3., 4.], index=[1, 2, 3])
+ tm.assert_sp_series_equal(s.loc[1:3], exp)
+ assert s.loc[1:3].dtype == SparseDtype(np.float64)
+
+ exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2])
+ tm.assert_sp_series_equal(s.iloc[1:3], exp)
+ assert s.iloc[1:3].dtype == SparseDtype(np.float64)
+
+ exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2])
+ tm.assert_sp_series_equal(s[1:3], exp)
+ assert s[1:3].dtype == SparseDtype(np.float64)
+
+ def test_subclass_sparse_addition(self):
+ s1 = tm.SubclassedSparseSeries([1, 3, 5])
+ s2 = tm.SubclassedSparseSeries([-2, 5, 12])
+ exp = tm.SubclassedSparseSeries([-1, 8, 17])
+ tm.assert_sp_series_equal(s1 + s2, exp)
+
+ s1 = tm.SubclassedSparseSeries([4.0, 5.0, 6.0])
+ s2 = tm.SubclassedSparseSeries([1.0, 2.0, 3.0])
+ exp = tm.SubclassedSparseSeries([5., 7., 9.])
+ tm.assert_sp_series_equal(s1 + s2, exp)
+
+ def test_subclass_sparse_to_frame(self):
+ s = tm.SubclassedSparseSeries([1, 2], index=list('ab'), name='xxx')
+ res = s.to_frame()
+
+ exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind='block',
+ fill_value=0)
+ exp = tm.SubclassedSparseDataFrame({'xxx': exp_arr},
+ index=list('ab'),
+ default_fill_value=0)
+ tm.assert_sp_frame_equal(res, exp)
+
+ # create from int dict
+ res = tm.SubclassedSparseDataFrame({'xxx': [1, 2]},
+ index=list('ab'),
+ default_fill_value=0)
+ tm.assert_sp_frame_equal(res, exp)
+
+ s = tm.SubclassedSparseSeries([1.1, 2.1], index=list('ab'),
+ name='xxx')
+ res = s.to_frame()
+ exp = tm.SubclassedSparseDataFrame({'xxx': [1.1, 2.1]},
+ index=list('ab'))
+ tm.assert_sp_frame_equal(res, exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_timeseries.py b/contrib/python/pandas/py2/pandas/tests/series/test_timeseries.py
new file mode 100644
index 00000000000..d082b023e1f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_timeseries.py
@@ -0,0 +1,1099 @@
+# coding=utf-8
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime, time, timedelta
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import iNaT
+from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
+from pandas.compat import StringIO, lrange, product
+from pandas.errors import NullFrequencyError
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+ DataFrame, Index, NaT, Series, Timestamp, concat, date_range, offsets,
+ timedelta_range, to_datetime)
+from pandas.core.indexes.datetimes import DatetimeIndex
+from pandas.core.indexes.timedeltas import TimedeltaIndex
+from pandas.tests.series.common import TestData
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_series_equal)
+
+from pandas.tseries.offsets import BDay, BMonthEnd
+
+
+def _simple_ts(start, end, freq='D'):
+ rng = date_range(start, end, freq=freq)
+ return Series(np.random.randn(len(rng)), index=rng)
+
+
+def assert_range_equal(left, right):
+ assert (left.equals(right))
+ assert (left.freq == right.freq)
+ assert (left.tz == right.tz)
+
+
+class TestTimeSeries(TestData):
+
+ def test_shift(self):
+ shifted = self.ts.shift(1)
+ unshifted = shifted.shift(-1)
+
+ tm.assert_index_equal(shifted.index, self.ts.index)
+ tm.assert_index_equal(unshifted.index, self.ts.index)
+ tm.assert_numpy_array_equal(unshifted.dropna().values,
+ self.ts.values[:-1])
+
+ offset = BDay()
+ shifted = self.ts.shift(1, freq=offset)
+ unshifted = shifted.shift(-1, freq=offset)
+
+ assert_series_equal(unshifted, self.ts)
+
+ unshifted = self.ts.shift(0, freq=offset)
+ assert_series_equal(unshifted, self.ts)
+
+ shifted = self.ts.shift(1, freq='B')
+ unshifted = shifted.shift(-1, freq='B')
+
+ assert_series_equal(unshifted, self.ts)
+
+ # corner case
+ unshifted = self.ts.shift(0)
+ assert_series_equal(unshifted, self.ts)
+
+ # Shifting with PeriodIndex
+ ps = tm.makePeriodSeries()
+ shifted = ps.shift(1)
+ unshifted = shifted.shift(-1)
+ tm.assert_index_equal(shifted.index, ps.index)
+ tm.assert_index_equal(unshifted.index, ps.index)
+ tm.assert_numpy_array_equal(unshifted.dropna().values, ps.values[:-1])
+
+ shifted2 = ps.shift(1, 'B')
+ shifted3 = ps.shift(1, BDay())
+ assert_series_equal(shifted2, shifted3)
+ assert_series_equal(ps, shifted2.shift(-1, 'B'))
+
+ msg = "Given freq D does not match PeriodIndex freq B"
+ with pytest.raises(ValueError, match=msg):
+ ps.shift(freq='D')
+
+ # legacy support
+ shifted4 = ps.shift(1, freq='B')
+ assert_series_equal(shifted2, shifted4)
+
+ shifted5 = ps.shift(1, freq=BDay())
+ assert_series_equal(shifted5, shifted4)
+
+        # shift accepts a NumPy integer scalar as periods (int32/int64)
+ # GH 8129
+ index = date_range('2000-01-01', periods=5)
+ for dtype in ['int32', 'int64']:
+ s1 = Series(np.arange(5, dtype=dtype), index=index)
+ p = s1.iloc[1]
+ result = s1.shift(periods=p)
+ expected = Series([np.nan, 0, 1, 2, 3], index=index)
+ assert_series_equal(result, expected)
+
+ # xref 8260
+ # with tz
+ s = Series(date_range('2000-01-01 09:00:00', periods=5,
+ tz='US/Eastern'), name='foo')
+ result = s - s.shift()
+
+ exp = Series(TimedeltaIndex(['NaT'] + ['1 days'] * 4), name='foo')
+ assert_series_equal(result, exp)
+
+ # incompat tz
+ s2 = Series(date_range('2000-01-01 09:00:00', periods=5,
+ tz='CET'), name='foo')
+ msg = ("DatetimeArray subtraction must have the same timezones or no"
+ " timezones")
+ with pytest.raises(TypeError, match=msg):
+ s - s2
+
+ def test_shift2(self):
+ ts = Series(np.random.randn(5),
+ index=date_range('1/1/2000', periods=5, freq='H'))
+
+ result = ts.shift(1, freq='5T')
+ exp_index = ts.index.shift(1, freq='5T')
+ tm.assert_index_equal(result.index, exp_index)
+
+ # GH #1063, multiple of same base
+ result = ts.shift(1, freq='4H')
+ exp_index = ts.index + offsets.Hour(4)
+ tm.assert_index_equal(result.index, exp_index)
+
+ idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04'])
+ msg = "Cannot shift with no freq"
+ with pytest.raises(NullFrequencyError, match=msg):
+ idx.shift(1)
+
+ def test_shift_fill_value(self):
+ # GH #24128
+ ts = Series([1.0, 2.0, 3.0, 4.0, 5.0],
+ index=date_range('1/1/2000', periods=5, freq='H'))
+
+ exp = Series([0.0, 1.0, 2.0, 3.0, 4.0],
+ index=date_range('1/1/2000', periods=5, freq='H'))
+ # check that fill value works
+ result = ts.shift(1, fill_value=0.0)
+ tm.assert_series_equal(result, exp)
+
+ exp = Series([0.0, 0.0, 1.0, 2.0, 3.0],
+ index=date_range('1/1/2000', periods=5, freq='H'))
+ result = ts.shift(2, fill_value=0.0)
+ tm.assert_series_equal(result, exp)
+
+ ts = pd.Series([1, 2, 3])
+ res = ts.shift(2, fill_value=0)
+ assert res.dtype == ts.dtype
+
+ def test_categorical_shift_fill_value(self):
+ ts = pd.Series(['a', 'b', 'c', 'd'], dtype="category")
+ res = ts.shift(1, fill_value='a')
+ expected = pd.Series(pd.Categorical(['a', 'a', 'b', 'c'],
+ categories=['a', 'b', 'c', 'd'],
+ ordered=False))
+ tm.assert_equal(res, expected)
+
+ # check for incorrect fill_value
+ msg = "'fill_value=f' is not present in this Categorical's categories"
+ with pytest.raises(ValueError, match=msg):
+ ts.shift(1, fill_value='f')
+
+ def test_shift_dst(self):
+ # GH 13926
+ dates = date_range('2016-11-06', freq='H', periods=10, tz='US/Eastern')
+ s = Series(dates)
+
+ res = s.shift(0)
+ tm.assert_series_equal(res, s)
+ assert res.dtype == 'datetime64[ns, US/Eastern]'
+
+ res = s.shift(1)
+ exp_vals = [NaT] + dates.astype(object).values.tolist()[:9]
+ exp = Series(exp_vals)
+ tm.assert_series_equal(res, exp)
+ assert res.dtype == 'datetime64[ns, US/Eastern]'
+
+ res = s.shift(-2)
+ exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT]
+ exp = Series(exp_vals)
+ tm.assert_series_equal(res, exp)
+ assert res.dtype == 'datetime64[ns, US/Eastern]'
+
+ for ex in [10, -10, 20, -20]:
+ res = s.shift(ex)
+ exp = Series([NaT] * 10, dtype='datetime64[ns, US/Eastern]')
+ tm.assert_series_equal(res, exp)
+ assert res.dtype == 'datetime64[ns, US/Eastern]'
+
+ def test_tshift(self):
+ # PeriodIndex
+ ps = tm.makePeriodSeries()
+ shifted = ps.tshift(1)
+ unshifted = shifted.tshift(-1)
+
+ assert_series_equal(unshifted, ps)
+
+ shifted2 = ps.tshift(freq='B')
+ assert_series_equal(shifted, shifted2)
+
+ shifted3 = ps.tshift(freq=BDay())
+ assert_series_equal(shifted, shifted3)
+
+ msg = "Given freq M does not match PeriodIndex freq B"
+ with pytest.raises(ValueError, match=msg):
+ ps.tshift(freq='M')
+
+ # DatetimeIndex
+ shifted = self.ts.tshift(1)
+ unshifted = shifted.tshift(-1)
+
+ assert_series_equal(self.ts, unshifted)
+
+ shifted2 = self.ts.tshift(freq=self.ts.index.freq)
+ assert_series_equal(shifted, shifted2)
+
+ inferred_ts = Series(self.ts.values, Index(np.asarray(self.ts.index)),
+ name='ts')
+ shifted = inferred_ts.tshift(1)
+ unshifted = shifted.tshift(-1)
+ assert_series_equal(shifted, self.ts.tshift(1))
+ assert_series_equal(unshifted, inferred_ts)
+
+ no_freq = self.ts[[0, 5, 7]]
+ msg = "Freq was not given and was not set in the index"
+ with pytest.raises(ValueError, match=msg):
+ no_freq.tshift()
+
+ def test_truncate(self):
+ offset = BDay()
+
+ ts = self.ts[::3]
+
+ start, end = self.ts.index[3], self.ts.index[6]
+ start_missing, end_missing = self.ts.index[2], self.ts.index[7]
+
+ # neither specified
+ truncated = ts.truncate()
+ assert_series_equal(truncated, ts)
+
+ # both specified
+ expected = ts[1:3]
+
+ truncated = ts.truncate(start, end)
+ assert_series_equal(truncated, expected)
+
+ truncated = ts.truncate(start_missing, end_missing)
+ assert_series_equal(truncated, expected)
+
+ # start specified
+ expected = ts[1:]
+
+ truncated = ts.truncate(before=start)
+ assert_series_equal(truncated, expected)
+
+ truncated = ts.truncate(before=start_missing)
+ assert_series_equal(truncated, expected)
+
+ # end specified
+ expected = ts[:3]
+
+ truncated = ts.truncate(after=end)
+ assert_series_equal(truncated, expected)
+
+ truncated = ts.truncate(after=end_missing)
+ assert_series_equal(truncated, expected)
+
+ # corner case, empty series returned
+ truncated = ts.truncate(after=self.ts.index[0] - offset)
+ assert (len(truncated) == 0)
+
+ truncated = ts.truncate(before=self.ts.index[-1] + offset)
+ assert (len(truncated) == 0)
+
+ msg = "Truncate: 1999-12-31 00:00:00 must be after 2000-02-14 00:00:00"
+ with pytest.raises(ValueError, match=msg):
+ ts.truncate(before=self.ts.index[-1] + offset,
+ after=self.ts.index[0] - offset)
+
+ def test_truncate_nonsortedindex(self):
+ # GH 17935
+
+ s = pd.Series(['a', 'b', 'c', 'd', 'e'],
+ index=[5, 3, 2, 9, 0])
+ msg = 'truncate requires a sorted index'
+
+ with pytest.raises(ValueError, match=msg):
+ s.truncate(before=3, after=9)
+
+ rng = pd.date_range('2011-01-01', '2012-01-01', freq='W')
+ ts = pd.Series(np.random.randn(len(rng)), index=rng)
+ msg = 'truncate requires a sorted index'
+
+ with pytest.raises(ValueError, match=msg):
+ ts.sort_values(ascending=False).truncate(before='2011-11',
+ after='2011-12')
+
+ def test_asfreq(self):
+ ts = Series([0., 1., 2.], index=[datetime(2009, 10, 30), datetime(
+ 2009, 11, 30), datetime(2009, 12, 31)])
+
+ daily_ts = ts.asfreq('B')
+ monthly_ts = daily_ts.asfreq('BM')
+ tm.assert_series_equal(monthly_ts, ts)
+
+ daily_ts = ts.asfreq('B', method='pad')
+ monthly_ts = daily_ts.asfreq('BM')
+ tm.assert_series_equal(monthly_ts, ts)
+
+ daily_ts = ts.asfreq(BDay())
+ monthly_ts = daily_ts.asfreq(BMonthEnd())
+ tm.assert_series_equal(monthly_ts, ts)
+
+ result = ts[:0].asfreq('M')
+ assert len(result) == 0
+ assert result is not ts
+
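+        # 2009-10-30 .. 2009-12-31 spans 63 daily slots: the 3 original
+        # points keep their values and the other 60 are filled with -1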
+ daily_ts = ts.asfreq('D', fill_value=-1)
+ result = daily_ts.value_counts().sort_index()
+ expected = Series([60, 1, 1, 1],
+ index=[-1.0, 2.0, 1.0, 0.0]).sort_index()
+ tm.assert_series_equal(result, expected)
+
+ def test_asfreq_datetimeindex_empty_series(self):
+ # GH 14320
+ expected = Series(index=pd.DatetimeIndex(
+ ["2016-09-29 11:00"])).asfreq('H')
+ result = Series(index=pd.DatetimeIndex(["2016-09-29 11:00"]),
+ data=[3]).asfreq('H')
+ tm.assert_index_equal(expected.index, result.index)
+
+ def test_diff(self):
+ # Just run the function
+ self.ts.diff()
+
+ # int dtype
+ a = 10000000000000000
+ b = a + 1
+ s = Series([a, b])
+
+ rs = s.diff()
+ assert rs[1] == 1
+
+ # neg n
+ rs = self.ts.diff(-1)
+ xp = self.ts - self.ts.shift(-1)
+ assert_series_equal(rs, xp)
+
+ # 0
+ rs = self.ts.diff(0)
+ xp = self.ts - self.ts
+ assert_series_equal(rs, xp)
+
+ # datetime diff (GH3100)
+ s = Series(date_range('20130102', periods=5))
+ rs = s - s.shift(1)
+ xp = s.diff()
+ assert_series_equal(rs, xp)
+
+ # timedelta diff
+ nrs = rs - rs.shift(1)
+ nxp = xp.diff()
+ assert_series_equal(nrs, nxp)
+
+ # with tz
+ s = Series(
+ date_range('2000-01-01 09:00:00', periods=5,
+ tz='US/Eastern'), name='foo')
+ result = s.diff()
+ assert_series_equal(result, Series(
+ TimedeltaIndex(['NaT'] + ['1 days'] * 4), name='foo'))
+
+ def test_pct_change(self):
+ rs = self.ts.pct_change(fill_method=None)
+ assert_series_equal(rs, self.ts / self.ts.shift(1) - 1)
+
+ rs = self.ts.pct_change(2)
+ filled = self.ts.fillna(method='pad')
+ assert_series_equal(rs, filled / filled.shift(2) - 1)
+
+ rs = self.ts.pct_change(fill_method='bfill', limit=1)
+ filled = self.ts.fillna(method='bfill', limit=1)
+ assert_series_equal(rs, filled / filled.shift(1) - 1)
+
+ rs = self.ts.pct_change(freq='5D')
+ filled = self.ts.fillna(method='pad')
+ assert_series_equal(rs,
+ (filled / filled.shift(freq='5D') - 1)
+ .reindex_like(filled))
+
+ def test_pct_change_shift_over_nas(self):
+ s = Series([1., 1.5, np.nan, 2.5, 3.])
+
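+        # the default fill_method='pad' carries 1.5 forward over the NaN,
+        # so the gap is a 0.0 change and the next entry uses 1.5 as base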
+ chg = s.pct_change()
+ expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2])
+ assert_series_equal(chg, expected)
+
+ @pytest.mark.parametrize("freq, periods, fill_method, limit",
+ [('5B', 5, None, None),
+ ('3B', 3, None, None),
+ ('3B', 3, 'bfill', None),
+ ('7B', 7, 'pad', 1),
+ ('7B', 7, 'bfill', 3),
+ ('14B', 14, None, None)])
+ def test_pct_change_periods_freq(self, freq, periods, fill_method, limit):
+ # GH 7292
+ rs_freq = self.ts.pct_change(freq=freq,
+ fill_method=fill_method,
+ limit=limit)
+ rs_periods = self.ts.pct_change(periods,
+ fill_method=fill_method,
+ limit=limit)
+ assert_series_equal(rs_freq, rs_periods)
+
+ empty_ts = Series(index=self.ts.index)
+ rs_freq = empty_ts.pct_change(freq=freq,
+ fill_method=fill_method,
+ limit=limit)
+ rs_periods = empty_ts.pct_change(periods,
+ fill_method=fill_method,
+ limit=limit)
+ assert_series_equal(rs_freq, rs_periods)
+
+ def test_autocorr(self):
+ # Just run the function
+ corr1 = self.ts.autocorr()
+
+ # Now run it with the lag parameter
+ corr2 = self.ts.autocorr(lag=1)
+
+ # corr() with lag needs Series of at least length 2
+ if len(self.ts) <= 2:
+ assert np.isnan(corr1)
+ assert np.isnan(corr2)
+ else:
+ assert corr1 == corr2
+
+ # Choose a random lag between 1 and length of Series - 2
+ # and compare the result with the Series corr() function
+ n = 1 + np.random.randint(max(1, len(self.ts) - 2))
+ corr1 = self.ts.corr(self.ts.shift(n))
+ corr2 = self.ts.autocorr(lag=n)
+
+ # corr() with lag needs Series of at least length 2
+ if len(self.ts) <= 2:
+ assert np.isnan(corr1)
+ assert np.isnan(corr2)
+ else:
+ assert corr1 == corr2
+
+ def test_first_last_valid(self):
+ ts = self.ts.copy()
+ ts[:5] = np.NaN
+
+ index = ts.first_valid_index()
+ assert index == ts.index[5]
+
+ ts[-5:] = np.NaN
+ index = ts.last_valid_index()
+ assert index == ts.index[-6]
+
+ ts[:] = np.nan
+ assert ts.last_valid_index() is None
+ assert ts.first_valid_index() is None
+
+ ser = Series([], index=[])
+ assert ser.last_valid_index() is None
+ assert ser.first_valid_index() is None
+
+ # GH12800
+ empty = Series()
+ assert empty.last_valid_index() is None
+ assert empty.first_valid_index() is None
+
+        # GH20499: it preserves freq with holes
+ ts.index = date_range("20110101", periods=len(ts), freq="B")
+ ts.iloc[1] = 1
+ ts.iloc[-2] = 1
+ assert ts.first_valid_index() == ts.index[1]
+ assert ts.last_valid_index() == ts.index[-2]
+ assert ts.first_valid_index().freq == ts.index.freq
+ assert ts.last_valid_index().freq == ts.index.freq
+
+ def test_mpl_compat_hack(self):
+ result = self.ts[:, np.newaxis]
+ expected = self.ts.values[:, np.newaxis]
+ assert_almost_equal(result, expected)
+
+ def test_timeseries_coercion(self):
+ idx = tm.makeDateIndex(10000)
+ ser = Series(np.random.randn(len(idx)), idx.astype(object))
+ assert ser.index.is_all_dates
+ assert isinstance(ser.index, DatetimeIndex)
+
+ def test_contiguous_boolean_preserve_freq(self):
+ rng = date_range('1/1/2000', '3/1/2000', freq='B')
+
+ mask = np.zeros(len(rng), dtype=bool)
+ mask[10:20] = True
+
+ masked = rng[mask]
+ expected = rng[10:20]
+ assert expected.freq is not None
+ assert_range_equal(masked, expected)
+
+ mask[22] = True
+ masked = rng[mask]
+ assert masked.freq is None
+
+ def test_to_datetime_unit(self):
+
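+        # 1370745748 seconds after the Unix epoch is 2013-06-09 02:42:28,
+        # matching the expected Timestamps below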
+ epoch = 1370745748
+ s = Series([epoch + t for t in range(20)])
+ result = to_datetime(s, unit='s')
+ expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
+ seconds=t) for t in range(20)])
+ assert_series_equal(result, expected)
+
+ s = Series([epoch + t for t in range(20)]).astype(float)
+ result = to_datetime(s, unit='s')
+ expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
+ seconds=t) for t in range(20)])
+ assert_series_equal(result, expected)
+
+ s = Series([epoch + t for t in range(20)] + [iNaT])
+ result = to_datetime(s, unit='s')
+ expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
+ seconds=t) for t in range(20)] + [NaT])
+ assert_series_equal(result, expected)
+
+ s = Series([epoch + t for t in range(20)] + [iNaT]).astype(float)
+ result = to_datetime(s, unit='s')
+ expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
+ seconds=t) for t in range(20)] + [NaT])
+ assert_series_equal(result, expected)
+
+ # GH13834
+ s = Series([epoch + t for t in np.arange(0, 2, .25)] +
+ [iNaT]).astype(float)
+ result = to_datetime(s, unit='s')
+ expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
+ seconds=t) for t in np.arange(0, 2, .25)] + [NaT])
+ assert_series_equal(result, expected)
+
+ s = concat([Series([epoch + t for t in range(20)]
+ ).astype(float), Series([np.nan])],
+ ignore_index=True)
+ result = to_datetime(s, unit='s')
+ expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta(
+ seconds=t) for t in range(20)] + [NaT])
+ assert_series_equal(result, expected)
+
+ result = to_datetime([1, 2, 'NaT', pd.NaT, np.nan], unit='D')
+ expected = DatetimeIndex([Timestamp('1970-01-02'),
+ Timestamp('1970-01-03')] + ['NaT'] * 3)
+ tm.assert_index_equal(result, expected)
+
+ msg = "non convertible value foo with the unit 'D'"
+ with pytest.raises(ValueError, match=msg):
+ to_datetime([1, 2, 'foo'], unit='D')
+ msg = "cannot convert input 111111111 with the unit 'D'"
+ with pytest.raises(OutOfBoundsDatetime, match=msg):
+ to_datetime([1, 2, 111111111], unit='D')
+
+        # with errors='coerce', unconvertible values become NaT
+ expected = DatetimeIndex([Timestamp('1970-01-02'),
+ Timestamp('1970-01-03')] + ['NaT'] * 1)
+ result = to_datetime([1, 2, 'foo'], unit='D', errors='coerce')
+ tm.assert_index_equal(result, expected)
+
+ result = to_datetime([1, 2, 111111111], unit='D', errors='coerce')
+ tm.assert_index_equal(result, expected)
+
+ def test_series_ctor_datetime64(self):
+ rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
+ dates = np.asarray(rng)
+
+ series = Series(dates)
+ assert np.issubdtype(series.dtype, np.dtype('M8[ns]'))
+
+ def test_series_repr_nat(self):
+ series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')
+
+ result = repr(series)
+ expected = ('0 1970-01-01 00:00:00.000000\n'
+ '1 1970-01-01 00:00:00.000001\n'
+ '2 1970-01-01 00:00:00.000002\n'
+ '3 NaT\n'
+ 'dtype: datetime64[ns]')
+ assert result == expected
+
+ def test_asfreq_keep_index_name(self):
+ # GH #9854
+ index_name = 'bar'
+ index = pd.date_range('20130101', periods=20, name=index_name)
+ df = pd.DataFrame([x for x in range(20)], columns=['foo'], index=index)
+
+ assert index_name == df.index.name
+ assert index_name == df.asfreq('10D').index.name
+
+ def test_promote_datetime_date(self):
+ rng = date_range('1/1/2000', periods=20)
+ ts = Series(np.random.randn(20), index=rng)
+
+ ts_slice = ts[5:]
+ ts2 = ts_slice.copy()
+ ts2.index = [x.date() for x in ts2.index]
+
+ result = ts + ts2
+ result2 = ts2 + ts
+ expected = ts + ts[5:]
+ assert_series_equal(result, expected)
+ assert_series_equal(result2, expected)
+
+ # test asfreq
+ result = ts2.asfreq('4H', method='ffill')
+ expected = ts[5:].asfreq('4H', method='ffill')
+ assert_series_equal(result, expected)
+
+ result = rng.get_indexer(ts2.index)
+ expected = rng.get_indexer(ts_slice.index)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_asfreq_normalize(self):
+ rng = date_range('1/1/2000 09:30', periods=20)
+ norm = date_range('1/1/2000', periods=20)
+ vals = np.random.randn(20)
+ ts = Series(vals, index=rng)
+
+ result = ts.asfreq('D', normalize=True)
+ norm = date_range('1/1/2000', periods=20)
+ expected = Series(vals, index=norm)
+
+ assert_series_equal(result, expected)
+
+ vals = np.random.randn(20, 3)
+ ts = DataFrame(vals, index=rng)
+
+ result = ts.asfreq('D', normalize=True)
+ expected = DataFrame(vals, index=norm)
+
+ assert_frame_equal(result, expected)
+
+ def test_first_subset(self):
+ ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h')
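+        # 10 days at 12-hour frequency contain 20 observations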
+ result = ts.first('10d')
+ assert len(result) == 20
+
+ ts = _simple_ts('1/1/2000', '1/1/2010')
+ result = ts.first('10d')
+ assert len(result) == 10
+
+ result = ts.first('3M')
+ expected = ts[:'3/31/2000']
+ assert_series_equal(result, expected)
+
+ result = ts.first('21D')
+ expected = ts[:21]
+ assert_series_equal(result, expected)
+
+ result = ts[:0].first('3M')
+ assert_series_equal(result, ts[:0])
+
+ def test_first_raises(self):
+ # GH20725
+ ser = pd.Series('a b c'.split())
+ msg = "'first' only supports a DatetimeIndex index"
+ with pytest.raises(TypeError, match=msg):
+ ser.first('1D')
+
+ def test_last_subset(self):
+ ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h')
+ result = ts.last('10d')
+ assert len(result) == 20
+
+ ts = _simple_ts('1/1/2000', '1/1/2010')
+ result = ts.last('10d')
+ assert len(result) == 10
+
+ result = ts.last('21D')
+ expected = ts['12/12/2009':]
+ assert_series_equal(result, expected)
+
+ result = ts.last('21D')
+ expected = ts[-21:]
+ assert_series_equal(result, expected)
+
+ result = ts[:0].last('3M')
+ assert_series_equal(result, ts[:0])
+
+ def test_last_raises(self):
+ # GH20725
+ ser = pd.Series('a b c'.split())
+ msg = "'last' only supports a DatetimeIndex index"
+ with pytest.raises(TypeError, match=msg):
+ ser.last('1D')
+
+ def test_format_pre_1900_dates(self):
+ rng = date_range('1/1/1850', '1/1/1950', freq='A-DEC')
+ rng.format()
+ ts = Series(1, index=rng)
+ repr(ts)
+
+ def test_at_time(self):
+ rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+ rs = ts.at_time(rng[1])
+ assert (rs.index.hour == rng[1].hour).all()
+ assert (rs.index.minute == rng[1].minute).all()
+ assert (rs.index.second == rng[1].second).all()
+
+ result = ts.at_time('9:30')
+ expected = ts.at_time(time(9, 30))
+ assert_series_equal(result, expected)
+
+ df = DataFrame(np.random.randn(len(rng), 3), index=rng)
+
+ result = ts[time(9, 30)]
+ result_df = df.loc[time(9, 30)]
+ expected = ts[(rng.hour == 9) & (rng.minute == 30)]
+ exp_df = df[(rng.hour == 9) & (rng.minute == 30)]
+
+ # expected.index = date_range('1/1/2000', '1/4/2000')
+
+ assert_series_equal(result, expected)
+ tm.assert_frame_equal(result_df, exp_df)
+
+ chunk = df.loc['1/4/2000':]
+ result = chunk.loc[time(9, 30)]
+ expected = result_df[-1:]
+ tm.assert_frame_equal(result, expected)
+
+        # every timestamp in a daily range falls at midnight, so
+        # at_time(time(0, 0)) returns the whole series
+ rng = date_range('1/1/2000', '1/31/2000')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ result = ts.at_time(time(0, 0))
+ assert_series_equal(result, ts)
+
+        # the requested time never occurs in this 23-minute index
+ rng = date_range('1/1/2012', freq='23Min', periods=384)
+ ts = Series(np.random.randn(len(rng)), rng)
+ rs = ts.at_time('16:00')
+ assert len(rs) == 0
+
+ def test_at_time_raises(self):
+ # GH20725
+ ser = pd.Series('a b c'.split())
+ msg = "Index must be DatetimeIndex"
+ with pytest.raises(TypeError, match=msg):
+ ser.at_time('00:00')
+
+ def test_between(self):
+ series = Series(date_range('1/1/2000', periods=10))
+ left, right = series[[2, 7]]
+
+ result = series.between(left, right)
+ expected = (series >= left) & (series <= right)
+ assert_series_equal(result, expected)
+
+ def test_between_time(self):
+ rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+ stime = time(0, 0)
+ etime = time(1, 0)
+
+ close_open = product([True, False], [True, False])
+ for inc_start, inc_end in close_open:
+ filtered = ts.between_time(stime, etime, inc_start, inc_end)
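+            # [00:00, 01:00] at 5-min freq holds 13 stamps per day; four
+            # full days plus the final midnight stamp give 13 * 4 + 1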
+ exp_len = 13 * 4 + 1
+ if not inc_start:
+ exp_len -= 5
+ if not inc_end:
+ exp_len -= 4
+
+ assert len(filtered) == exp_len
+ for rs in filtered.index:
+ t = rs.time()
+ if inc_start:
+ assert t >= stime
+ else:
+ assert t > stime
+
+ if inc_end:
+ assert t <= etime
+ else:
+ assert t < etime
+
+ result = ts.between_time('00:00', '01:00')
+ expected = ts.between_time(stime, etime)
+ assert_series_equal(result, expected)
+
+ # across midnight
+ rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+ stime = time(22, 0)
+ etime = time(9, 0)
+
+ close_open = product([True, False], [True, False])
+ for inc_start, inc_end in close_open:
+ filtered = ts.between_time(stime, etime, inc_start, inc_end)
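+            # across midnight the window covers 11 hours of each day,
+            # i.e. 12 * 11 + 1 stamps daily; four days plus the final
+            # midnight stamp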
+ exp_len = (12 * 11 + 1) * 4 + 1
+ if not inc_start:
+ exp_len -= 4
+ if not inc_end:
+ exp_len -= 4
+
+ assert len(filtered) == exp_len
+ for rs in filtered.index:
+ t = rs.time()
+ if inc_start:
+ assert (t >= stime) or (t <= etime)
+ else:
+ assert (t > stime) or (t <= etime)
+
+ if inc_end:
+ assert (t <= etime) or (t >= stime)
+ else:
+ assert (t < etime) or (t >= stime)
+
+ def test_between_time_raises(self):
+ # GH20725
+ ser = pd.Series('a b c'.split())
+ msg = "Index must be DatetimeIndex"
+ with pytest.raises(TypeError, match=msg):
+ ser.between_time(start_time='00:00', end_time='12:00')
+
+ def test_between_time_types(self):
+ # GH11818
+ rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+ msg = (r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\]"
+ " to a time")
+ with pytest.raises(ValueError, match=msg):
+ rng.indexer_between_time(datetime(2010, 1, 2, 1),
+ datetime(2010, 1, 2, 5))
+
+ frame = DataFrame({'A': 0}, index=rng)
+ with pytest.raises(ValueError, match=msg):
+ frame.between_time(datetime(2010, 1, 2, 1),
+ datetime(2010, 1, 2, 5))
+
+ series = Series(0, index=rng)
+ with pytest.raises(ValueError, match=msg):
+ series.between_time(datetime(2010, 1, 2, 1),
+ datetime(2010, 1, 2, 5))
+
+ @td.skip_if_has_locale
+ def test_between_time_formats(self):
+ # GH11818
+ rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+ ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
+
+ strings = [("2:00", "2:30"), ("0200", "0230"), ("2:00am", "2:30am"),
+ ("0200am", "0230am"), ("2:00:00", "2:30:00"),
+ ("020000", "023000"), ("2:00:00am", "2:30:00am"),
+ ("020000am", "023000am")]
+ expected_length = 28
+
+ for time_string in strings:
+ assert len(ts.between_time(*time_string)) == expected_length
+
+ def test_between_time_axis(self):
+ # issue 8839
+ rng = date_range('1/1/2000', periods=100, freq='10min')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+ stime, etime = ('08:00:00', '09:00:00')
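+        # at 10-minute frequency, [08:00, 09:00] inclusive holds 7 stamps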
+ expected_length = 7
+
+ assert len(ts.between_time(stime, etime)) == expected_length
+ assert len(ts.between_time(stime, etime, axis=0)) == expected_length
+ msg = r"No axis named 1 for object type <(class|type) 'type'>"
+ with pytest.raises(ValueError, match=msg):
+ ts.between_time(stime, etime, axis=1)
+
+ def test_to_period(self):
+ from pandas.core.indexes.period import period_range
+
+ ts = _simple_ts('1/1/2000', '1/1/2001')
+
+ pts = ts.to_period()
+ exp = ts.copy()
+ exp.index = period_range('1/1/2000', '1/1/2001')
+ assert_series_equal(pts, exp)
+
+ pts = ts.to_period('M')
+ exp.index = exp.index.asfreq('M')
+ tm.assert_index_equal(pts.index, exp.index.asfreq('M'))
+ assert_series_equal(pts, exp)
+
+ # GH 7606 without freq
+ idx = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03',
+ '2011-01-04'])
+ exp_idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03',
+ '2011-01-04'], freq='D')
+
+ s = Series(np.random.randn(4), index=idx)
+ expected = s.copy()
+ expected.index = exp_idx
+ assert_series_equal(s.to_period(), expected)
+
+ df = DataFrame(np.random.randn(4, 4), index=idx, columns=idx)
+ expected = df.copy()
+ expected.index = exp_idx
+ assert_frame_equal(df.to_period(), expected)
+
+ expected = df.copy()
+ expected.columns = exp_idx
+ assert_frame_equal(df.to_period(axis=1), expected)
+
+ def test_groupby_count_dateparseerror(self):
+ dr = date_range(start='1/1/2012', freq='5min', periods=10)
+
+        # the problematic layout: datetimes in the first MultiIndex level
+ s = Series(np.arange(10), index=[dr, lrange(10)])
+ grouped = s.groupby(lambda x: x[1] % 2 == 0)
+ result = grouped.count()
+
+ s = Series(np.arange(10), index=[lrange(10), dr])
+ grouped = s.groupby(lambda x: x[0] % 2 == 0)
+ expected = grouped.count()
+
+ assert_series_equal(result, expected)
+
+ def test_to_csv_numpy_16_bug(self):
+ frame = DataFrame({'a': date_range('1/1/2000', periods=10)})
+
+ buf = StringIO()
+ frame.to_csv(buf)
+
+ result = buf.getvalue()
+ assert '2000-01-01' in result
+
+ def test_series_map_box_timedelta(self):
+ # GH 11349
+ s = Series(timedelta_range('1 day 1 s', periods=5, freq='h'))
+
+ def f(x):
+ return x.total_seconds()
+
+ s.map(f)
+ s.apply(f)
+ DataFrame(s).applymap(f)
+
+ def test_asfreq_resample_set_correct_freq(self):
+ # GH5613
+ # we test if .asfreq() and .resample() set the correct value for .freq
+ df = pd.DataFrame({'date': ["2012-01-01", "2012-01-02", "2012-01-03"],
+ 'col': [1, 2, 3]})
+ df = df.set_index(pd.to_datetime(df.date))
+
+ # testing the settings before calling .asfreq() and .resample()
+ assert df.index.freq is None
+ assert df.index.inferred_freq == 'D'
+
+ # does .asfreq() set .freq correctly?
+ assert df.asfreq('D').index.freq == 'D'
+
+ # does .resample() set .freq correctly?
+ assert df.resample('D').asfreq().index.freq == 'D'
+
+ def test_pickle(self):
+
+ # GH4606
+ p = tm.round_trip_pickle(NaT)
+ assert p is NaT
+
+ idx = pd.to_datetime(['2013-01-01', NaT, '2014-01-06'])
+ idx_p = tm.round_trip_pickle(idx)
+ assert idx_p[0] == idx[0]
+ assert idx_p[1] is NaT
+ assert idx_p[2] == idx[2]
+
+ # GH11002
+ # don't infer freq
+ idx = date_range('1750-1-1', '2050-1-1', freq='7D')
+ idx_p = tm.round_trip_pickle(idx)
+ tm.assert_index_equal(idx, idx_p)
+
+ @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo', 'US/Eastern'])
+ def test_setops_preserve_freq(self, tz):
+ rng = date_range('1/1/2000', '1/1/2002', name='idx', tz=tz)
+
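+        # unions of overlapping or adjacent slices can keep the original
+        # freq; the disjoint union below (positions 50-59 missing) cannot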
+ result = rng[:50].union(rng[50:100])
+ assert result.name == rng.name
+ assert result.freq == rng.freq
+ assert result.tz == rng.tz
+
+ result = rng[:50].union(rng[30:100])
+ assert result.name == rng.name
+ assert result.freq == rng.freq
+ assert result.tz == rng.tz
+
+ result = rng[:50].union(rng[60:100])
+ assert result.name == rng.name
+ assert result.freq is None
+ assert result.tz == rng.tz
+
+ result = rng[:50].intersection(rng[25:75])
+ assert result.name == rng.name
+ assert result.freqstr == 'D'
+ assert result.tz == rng.tz
+
+ nofreq = DatetimeIndex(list(rng[25:75]), name='other')
+ result = rng[:50].union(nofreq)
+ assert result.name is None
+ assert result.freq == rng.freq
+ assert result.tz == rng.tz
+
+ result = rng[:50].intersection(nofreq)
+ assert result.name is None
+ assert result.freq == rng.freq
+ assert result.tz == rng.tz
+
+ def test_from_M8_structured(self):
+ dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))]
+ arr = np.array(dates,
+ dtype=[('Date', 'M8[us]'), ('Forecasting', 'M8[us]')])
+ df = DataFrame(arr)
+
+ assert df['Date'][0] == dates[0][0]
+ assert df['Forecasting'][0] == dates[0][1]
+
+ s = Series(arr['Date'])
+ assert isinstance(s[0], Timestamp)
+ assert s[0] == dates[0][0]
+
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ s = Series.from_array(arr['Date'], Index([0]))
+ assert s[0] == dates[0][0]
+
+ def test_get_level_values_box(self):
+ from pandas import MultiIndex
+
+ dates = date_range('1/1/2000', periods=4)
+ levels = [dates, [0, 1]]
+ codes = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]]
+
+ index = MultiIndex(levels=levels, codes=codes)
+
+ assert isinstance(index.get_level_values(0)[0], Timestamp)
+
+ def test_view_tz(self):
+ # GH#24024
+ ser = pd.Series(pd.date_range('2000', periods=4, tz='US/Central'))
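+        # view("i8") exposes the underlying UTC nanosecond epoch values;
+        # midnight US/Central (UTC-6) is 06:00 UTC, hence the values below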
+ result = ser.view("i8")
+ expected = pd.Series([946706400000000000,
+ 946792800000000000,
+ 946879200000000000,
+ 946965600000000000])
+ tm.assert_series_equal(result, expected)
+
+ def test_asarray_tz_naive(self):
+ # This shouldn't produce a warning.
+ ser = pd.Series(pd.date_range('2000', periods=2))
+ expected = np.array(['2000-01-01', '2000-01-02'], dtype='M8[ns]')
+ with tm.assert_produces_warning(None):
+ result = np.asarray(ser)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ # optionally, object
+ with tm.assert_produces_warning(None):
+ result = np.asarray(ser, dtype=object)
+
+ expected = np.array([pd.Timestamp('2000-01-01'),
+ pd.Timestamp('2000-01-02')])
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_asarray_tz_aware(self):
+ tz = 'US/Central'
+ ser = pd.Series(pd.date_range('2000', periods=2, tz=tz))
+ expected = np.array(['2000-01-01T06', '2000-01-02T06'], dtype='M8[ns]')
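+        # the naive M8 values are the UTC instants: US/Central is UTC-6
+        # in January, so local midnight becomes 06:00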
+ # We warn by default and return an ndarray[M8[ns]]
+ with tm.assert_produces_warning(FutureWarning):
+ result = np.asarray(ser)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ # Old behavior with no warning
+ with tm.assert_produces_warning(None):
+ result = np.asarray(ser, dtype="M8[ns]")
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ # Future behavior with no warning
+ expected = np.array([pd.Timestamp("2000-01-01", tz=tz),
+ pd.Timestamp("2000-01-02", tz=tz)])
+ with tm.assert_produces_warning(None):
+ result = np.asarray(ser, dtype=object)
+
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_timezones.py b/contrib/python/pandas/py2/pandas/tests/series/test_timezones.py
new file mode 100644
index 00000000000..ec644a8e93d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_timezones.py
@@ -0,0 +1,366 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for Series timezone-related methods
+"""
+from datetime import datetime
+
+from dateutil.tz import tzoffset
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs.tslibs import conversion, timezones
+from pandas.compat import lrange
+
+from pandas import DatetimeIndex, Index, NaT, Series, Timestamp
+from pandas.core.indexes.datetimes import date_range
+import pandas.util.testing as tm
+
+
+class TestSeriesTimezones(object):
+ # -----------------------------------------------------------------
+ # Series.tz_localize
+ def test_series_tz_localize(self):
+
+ rng = date_range('1/1/2011', periods=100, freq='H')
+ ts = Series(1, index=rng)
+
+ result = ts.tz_localize('utc')
+ assert result.index.tz.zone == 'UTC'
+
+ # Can't localize if already tz-aware
+ rng = date_range('1/1/2011', periods=100, freq='H', tz='utc')
+ ts = Series(1, index=rng)
+
+ with pytest.raises(TypeError, match='Already tz-aware'):
+ ts.tz_localize('US/Eastern')
+
+ @pytest.mark.filterwarnings('ignore::FutureWarning')
+ def test_tz_localize_errors_deprecation(self):
+ # GH 22644
+ tz = 'Europe/Warsaw'
+ n = 60
+ rng = date_range(start='2015-03-29 02:00:00', periods=n, freq='min')
+ ts = Series(rng)
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ with pytest.raises(ValueError):
+ ts.dt.tz_localize(tz, errors='foo')
+ # make sure errors='coerce' gets mapped correctly to nonexistent
+ result = ts.dt.tz_localize(tz, errors='coerce')
+ expected = ts.dt.tz_localize(tz, nonexistent='NaT')
+ tm.assert_series_equal(result, expected)
+
+ def test_series_tz_localize_ambiguous_bool(self):
+ # make sure that we are correctly accepting bool values as ambiguous
+
+ # GH#14402
+ ts = Timestamp('2015-11-01 01:00:03')
+ expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central')
+ expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central')
+
+ ser = Series([ts])
+ expected0 = Series([expected0])
+ expected1 = Series([expected1])
+
+ with pytest.raises(pytz.AmbiguousTimeError):
+ ser.dt.tz_localize('US/Central')
+
+ result = ser.dt.tz_localize('US/Central', ambiguous=True)
+ tm.assert_series_equal(result, expected0)
+
+ result = ser.dt.tz_localize('US/Central', ambiguous=[True])
+ tm.assert_series_equal(result, expected0)
+
+ result = ser.dt.tz_localize('US/Central', ambiguous=False)
+ tm.assert_series_equal(result, expected1)
+
+ result = ser.dt.tz_localize('US/Central', ambiguous=[False])
+ tm.assert_series_equal(result, expected1)
+
+ @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw'])
+ @pytest.mark.parametrize('method, exp', [
+ ['shift_forward', '2015-03-29 03:00:00'],
+ ['NaT', NaT],
+ ['raise', None],
+ ['foo', 'invalid']
+ ])
+ def test_series_tz_localize_nonexistent(self, tz, method, exp):
+ # GH 8917
+ n = 60
+ dti = date_range(start='2015-03-29 02:00:00', periods=n, freq='min')
+ s = Series(1, dti)
+ if method == 'raise':
+ with pytest.raises(pytz.NonExistentTimeError):
+ s.tz_localize(tz, nonexistent=method)
+ elif exp == 'invalid':
+ with pytest.raises(ValueError):
+ dti.tz_localize(tz, nonexistent=method)
+ else:
+ result = s.tz_localize(tz, nonexistent=method)
+ expected = Series(1, index=DatetimeIndex([exp] * n, tz=tz))
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_series_tz_localize_empty(self, tzstr):
+ # GH#2248
+ ser = Series()
+
+ ser2 = ser.tz_localize('utc')
+ assert ser2.index.tz == pytz.utc
+
+ ser2 = ser.tz_localize(tzstr)
+ timezones.tz_compare(ser2.index.tz, timezones.maybe_get_tz(tzstr))
+
+ # -----------------------------------------------------------------
+ # Series.tz_convert
+
+ def test_series_tz_convert(self):
+ rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern')
+ ts = Series(1, index=rng)
+
+ result = ts.tz_convert('Europe/Berlin')
+ assert result.index.tz.zone == 'Europe/Berlin'
+
+ # can't convert tz-naive
+ rng = date_range('1/1/2011', periods=200, freq='D')
+ ts = Series(1, index=rng)
+
+ with pytest.raises(TypeError, match="Cannot convert tz-naive"):
+ ts.tz_convert('US/Eastern')
+
+ def test_series_tz_convert_to_utc(self):
+ base = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'],
+ tz='UTC')
+ idx1 = base.tz_convert('Asia/Tokyo')[:2]
+ idx2 = base.tz_convert('US/Eastern')[1:]
+
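+        # as UTC instants, idx1 and idx2 overlap only at 2011-01-02, so
+        # only that position has both operands after alignment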
+ res = Series([1, 2], index=idx1) + Series([1, 1], index=idx2)
+ tm.assert_series_equal(res, Series([np.nan, 3, np.nan], index=base))
+
+ # -----------------------------------------------------------------
+ # Series.append
+
+ def test_series_append_aware(self):
+ rng1 = date_range('1/1/2011 01:00', periods=1, freq='H',
+ tz='US/Eastern')
+ rng2 = date_range('1/1/2011 02:00', periods=1, freq='H',
+ tz='US/Eastern')
+ ser1 = Series([1], index=rng1)
+ ser2 = Series([2], index=rng2)
+ ts_result = ser1.append(ser2)
+
+ exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'],
+ tz='US/Eastern')
+ exp = Series([1, 2], index=exp_index)
+ tm.assert_series_equal(ts_result, exp)
+ assert ts_result.index.tz == rng1.tz
+
+ rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC')
+ rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC')
+ ser1 = Series([1], index=rng1)
+ ser2 = Series([2], index=rng2)
+ ts_result = ser1.append(ser2)
+
+ exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'],
+ tz='UTC')
+ exp = Series([1, 2], index=exp_index)
+ tm.assert_series_equal(ts_result, exp)
+ utc = rng1.tz
+ assert utc == ts_result.index.tz
+
+ # GH#7795
+ # different tz coerces to object dtype, not UTC
+ rng1 = date_range('1/1/2011 01:00', periods=1, freq='H',
+ tz='US/Eastern')
+ rng2 = date_range('1/1/2011 02:00', periods=1, freq='H',
+ tz='US/Central')
+ ser1 = Series([1], index=rng1)
+ ser2 = Series([2], index=rng2)
+ ts_result = ser1.append(ser2)
+ exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'),
+ Timestamp('1/1/2011 02:00', tz='US/Central')])
+ exp = Series([1, 2], index=exp_index)
+ tm.assert_series_equal(ts_result, exp)
+
+ def test_series_append_aware_naive(self):
+ rng1 = date_range('1/1/2011 01:00', periods=1, freq='H')
+ rng2 = date_range('1/1/2011 02:00', periods=1, freq='H',
+ tz='US/Eastern')
+ ser1 = Series(np.random.randn(len(rng1)), index=rng1)
+ ser2 = Series(np.random.randn(len(rng2)), index=rng2)
+ ts_result = ser1.append(ser2)
+
+ expected = ser1.index.astype(object).append(ser2.index.astype(object))
+ assert ts_result.index.equals(expected)
+
+ # mixed
+ rng1 = date_range('1/1/2011 01:00', periods=1, freq='H')
+ rng2 = lrange(100)
+ ser1 = Series(np.random.randn(len(rng1)), index=rng1)
+ ser2 = Series(np.random.randn(len(rng2)), index=rng2)
+ ts_result = ser1.append(ser2)
+
+ expected = ser1.index.astype(object).append(ser2.index)
+ assert ts_result.index.equals(expected)
+
+ def test_series_append_dst(self):
+ rng1 = date_range('1/1/2016 01:00', periods=3, freq='H',
+ tz='US/Eastern')
+ rng2 = date_range('8/1/2016 01:00', periods=3, freq='H',
+ tz='US/Eastern')
+ ser1 = Series([1, 2, 3], index=rng1)
+ ser2 = Series([10, 11, 12], index=rng2)
+ ts_result = ser1.append(ser2)
+
+ exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00',
+ '2016-01-01 03:00', '2016-08-01 01:00',
+ '2016-08-01 02:00', '2016-08-01 03:00'],
+ tz='US/Eastern')
+ exp = Series([1, 2, 3, 10, 11, 12], index=exp_index)
+ tm.assert_series_equal(ts_result, exp)
+ assert ts_result.index.tz == rng1.tz
+
+ # -----------------------------------------------------------------
+
+ def test_dateutil_tzoffset_support(self):
+ values = [188.5, 328.25]
+ tzinfo = tzoffset(None, 7200)
+ index = [datetime(2012, 5, 11, 11, tzinfo=tzinfo),
+ datetime(2012, 5, 11, 12, tzinfo=tzinfo)]
+ series = Series(data=values, index=index)
+
+ assert series.index.tz == tzinfo
+
+ # it works! #2443
+ repr(series.index[0])
+
+ @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_tz_aware_asfreq(self, tz):
+ dr = date_range('2011-12-01', '2012-07-20', freq='D', tz=tz)
+
+ ser = Series(np.random.randn(len(dr)), index=dr)
+
+ # it works!
+ ser.asfreq('T')
+
+ @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_string_index_alias_tz_aware(self, tz):
+ rng = date_range('1/1/2000', periods=10, tz=tz)
+ ser = Series(np.random.randn(len(rng)), index=rng)
+
+ result = ser['1/3/2000']
+ tm.assert_almost_equal(result, ser[2])
+
+ # TODO: De-duplicate with test below
+ def test_series_add_tz_mismatch_converts_to_utc_duplicate(self):
+ rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern')
+ ser = Series(np.random.randn(len(rng)), index=rng)
+
+ ts_moscow = ser.tz_convert('Europe/Moscow')
+
+ result = ser + ts_moscow
+ assert result.index.tz is pytz.utc
+
+ result = ts_moscow + ser
+ assert result.index.tz is pytz.utc
+
+ def test_series_add_tz_mismatch_converts_to_utc(self):
+ rng = date_range('1/1/2011', periods=100, freq='H', tz='utc')
+
+ perm = np.random.permutation(100)[:90]
+ ser1 = Series(np.random.randn(90),
+ index=rng.take(perm).tz_convert('US/Eastern'))
+
+ perm = np.random.permutation(100)[:90]
+ ser2 = Series(np.random.randn(90),
+ index=rng.take(perm).tz_convert('Europe/Berlin'))
+
+ result = ser1 + ser2
+
+ uts1 = ser1.tz_convert('utc')
+ uts2 = ser2.tz_convert('utc')
+ expected = uts1 + uts2
+
+ assert result.index.tz == pytz.UTC
+ tm.assert_series_equal(result, expected)
+
+ def test_series_add_aware_naive_raises(self):
+ rng = date_range('1/1/2011', periods=10, freq='H')
+ ser = Series(np.random.randn(len(rng)), index=rng)
+
+ ser_utc = ser.tz_localize('utc')
+
+ with pytest.raises(Exception):
+ ser + ser_utc
+
+ with pytest.raises(Exception):
+ ser_utc + ser
+
+ def test_series_align_aware(self):
+ idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern')
+ ser = Series(np.random.randn(len(idx1)), index=idx1)
+ ser_central = ser.tz_convert('US/Central')
+        # different timezones convert to UTC
+
+ new1, new2 = ser.align(ser_central)
+ assert new1.index.tz == pytz.UTC
+ assert new2.index.tz == pytz.UTC
+
+ @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+ def test_localized_at_time_between_time(self, tzstr):
+ from datetime import time
+ tz = timezones.maybe_get_tz(tzstr)
+
+ rng = date_range('4/16/2012', '5/1/2012', freq='H')
+ ts = Series(np.random.randn(len(rng)), index=rng)
+
+ ts_local = ts.tz_localize(tzstr)
+
+ result = ts_local.at_time(time(10, 0))
+ expected = ts.at_time(time(10, 0)).tz_localize(tzstr)
+ tm.assert_series_equal(result, expected)
+ assert timezones.tz_compare(result.index.tz, tz)
+
+ t1, t2 = time(10, 0), time(11, 0)
+ result = ts_local.between_time(t1, t2)
+ expected = ts.between_time(t1, t2).tz_localize(tzstr)
+ tm.assert_series_equal(result, expected)
+ assert timezones.tz_compare(result.index.tz, tz)
+
+ @pytest.mark.parametrize('tzstr', ['Europe/Berlin',
+ 'dateutil/Europe/Berlin'])
+ def test_getitem_pydatetime_tz(self, tzstr):
+ tz = timezones.maybe_get_tz(tzstr)
+
+ index = date_range(start='2012-12-24 16:00', end='2012-12-24 18:00',
+ freq='H', tz=tzstr)
+ ts = Series(index=index, data=index.hour)
+ time_pandas = Timestamp('2012-12-24 17:00', tz=tzstr)
+
+ dt = datetime(2012, 12, 24, 17, 0)
+ time_datetime = conversion.localize_pydatetime(dt, tz)
+ assert ts[time_pandas] == ts[time_datetime]
+
+ def test_series_truncate_datetimeindex_tz(self):
+ # GH 9243
+ idx = date_range('4/1/2005', '4/30/2005', freq='D', tz='US/Pacific')
+ s = Series(range(len(idx)), index=idx)
+ result = s.truncate(datetime(2005, 4, 2), datetime(2005, 4, 4))
+ expected = Series([1, 2, 3], index=idx[1:4])
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('copy', [True, False])
+ @pytest.mark.parametrize('method, tz', [
+ ['tz_localize', None],
+ ['tz_convert', 'Europe/Berlin']
+ ])
+ def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz):
+ # GH 6326
+ result = Series(np.arange(0, 5),
+ index=date_range('20131027', periods=5, freq='1H',
+ tz=tz))
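+        # the call must return a new object and leave `result` unmutated,
+        # regardless of the copy flag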
+ getattr(result, method)('UTC', copy=copy)
+ expected = Series(np.arange(0, 5),
+ index=date_range('20131027', periods=5, freq='1H',
+ tz=tz))
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/series/test_validate.py b/contrib/python/pandas/py2/pandas/tests/series/test_validate.py
new file mode 100644
index 00000000000..8f7c16f2c31
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/series/test_validate.py
@@ -0,0 +1,19 @@
+import pytest
+
+
+class TestSeriesValidate(object):
+ """Tests for error handling related to data types of method arguments."""
+
+ @pytest.mark.parametrize("func", ["reset_index", "_set_name",
+ "sort_values", "sort_index",
+ "rename", "dropna"])
+ @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0])
+ def test_validate_bool_args(self, string_series, func, inplace):
+ msg = "For argument \"inplace\" expected type bool"
+ kwargs = dict(inplace=inplace)
+
+ if func == "_set_name":
+ kwargs["name"] = "hello"
+
+ with pytest.raises(ValueError, match=msg):
+ getattr(string_series, func)(**kwargs)
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/__init__.py b/contrib/python/pandas/py2/pandas/tests/sparse/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/common.py b/contrib/python/pandas/py2/pandas/tests/sparse/common.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/common.py
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/frame/__init__.py b/contrib/python/pandas/py2/pandas/tests/sparse/frame/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/frame/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/frame/conftest.py b/contrib/python/pandas/py2/pandas/tests/sparse/frame/conftest.py
new file mode 100644
index 00000000000..3423260c172
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/frame/conftest.py
@@ -0,0 +1,115 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, SparseArray, SparseDataFrame, bdate_range
+
+data = {'A': [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6],
+ 'B': [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6],
+ 'C': np.arange(10, dtype=np.float64),
+ 'D': [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan]}
+dates = bdate_range('1/1/2011', periods=10)
+
+
+# fixture names must be compatible with the tests in
+# tests/frame/test_api.SharedWithSparse
+
[email protected]
+def float_frame_dense():
+ """
+ Fixture for dense DataFrame of floats with DatetimeIndex
+
+ Columns are ['A', 'B', 'C', 'D']; some entries are missing
+ """
+ return DataFrame(data, index=dates)
+
+
[email protected]
+def float_frame():
+ """
+ Fixture for sparse DataFrame of floats with DatetimeIndex
+
+ Columns are ['A', 'B', 'C', 'D']; some entries are missing
+ """
+ # default_kind='block' is the default
+ return SparseDataFrame(data, index=dates, default_kind='block')
+
+
[email protected]
+def float_frame_int_kind():
+ """
+ Fixture for sparse DataFrame of floats with DatetimeIndex
+
+ Columns are ['A', 'B', 'C', 'D'] and default_kind='integer'.
+ Some entries are missing.
+ """
+ return SparseDataFrame(data, index=dates, default_kind='integer')
+
+
[email protected]
+def float_string_frame():
+ """
+ Fixture for sparse DataFrame of floats and strings with DatetimeIndex
+
+ Columns are ['A', 'B', 'C', 'D', 'foo']; some entries are missing
+ """
+ sdf = SparseDataFrame(data, index=dates)
+ sdf['foo'] = SparseArray(['bar'] * len(dates))
+ return sdf
+
+
[email protected]
+def float_frame_fill0_dense():
+ """
+ Fixture for dense DataFrame of floats with DatetimeIndex
+
+ Columns are ['A', 'B', 'C', 'D']; missing entries have been filled with 0
+ """
+ values = SparseDataFrame(data).values
+ values[np.isnan(values)] = 0
+ return DataFrame(values, columns=['A', 'B', 'C', 'D'], index=dates)
+
+
[email protected]
+def float_frame_fill0():
+ """
+ Fixture for sparse DataFrame of floats with DatetimeIndex
+
+ Columns are ['A', 'B', 'C', 'D']; missing entries have been filled with 0
+ """
+ values = SparseDataFrame(data).values
+ values[np.isnan(values)] = 0
+ return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'],
+ default_fill_value=0, index=dates)
+
+
[email protected]
+def float_frame_fill2_dense():
+ """
+ Fixture for dense DataFrame of floats with DatetimeIndex
+
+ Columns are ['A', 'B', 'C', 'D']; missing entries have been filled with 2
+ """
+ values = SparseDataFrame(data).values
+ values[np.isnan(values)] = 2
+ return DataFrame(values, columns=['A', 'B', 'C', 'D'], index=dates)
+
+
[email protected]
+def float_frame_fill2():
+ """
+ Fixture for sparse DataFrame of floats with DatetimeIndex
+
+ Columns are ['A', 'B', 'C', 'D']; missing entries have been filled with 2
+ """
+ values = SparseDataFrame(data).values
+ values[np.isnan(values)] = 2
+ return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'],
+ default_fill_value=2, index=dates)
+
+
[email protected]
+def empty_frame():
+ """
+ Fixture for empty SparseDataFrame
+ """
+ return SparseDataFrame()
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_analytics.py b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_analytics.py
new file mode 100644
index 00000000000..95c1c8c453d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_analytics.py
@@ -0,0 +1,39 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, SparseDataFrame, SparseSeries
+from pandas.util import testing as tm
+
+
[email protected](reason='Wrong SparseBlock initialization (GH#17386)')
+def test_quantile():
+ # GH 17386
+ data = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]]
+ q = 0.1
+
+ sparse_df = SparseDataFrame(data)
+ result = sparse_df.quantile(q)
+
+ dense_df = DataFrame(data)
+ dense_expected = dense_df.quantile(q)
+ sparse_expected = SparseSeries(dense_expected)
+
+ tm.assert_series_equal(result, dense_expected)
+ tm.assert_sp_series_equal(result, sparse_expected)
+
+
[email protected](reason='Wrong SparseBlock initialization (GH#17386)')
+def test_quantile_multi():
+ # GH 17386
+ data = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]]
+ q = [0.1, 0.5]
+
+ sparse_df = SparseDataFrame(data)
+ result = sparse_df.quantile(q)
+
+ dense_df = DataFrame(data)
+ dense_expected = dense_df.quantile(q)
+ sparse_expected = SparseDataFrame(dense_expected)
+
+ tm.assert_frame_equal(result, dense_expected)
+ tm.assert_sp_frame_equal(result, sparse_expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_apply.py b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_apply.py
new file mode 100644
index 00000000000..b5ea0a5c90e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_apply.py
@@ -0,0 +1,105 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Series, SparseDataFrame, bdate_range
+from pandas.core import nanops
+from pandas.core.sparse.api import SparseDtype
+from pandas.util import testing as tm
+
+
[email protected]
+def dates():
+ return bdate_range('1/1/2011', periods=10)
+
+
[email protected]
+def empty():
+ return SparseDataFrame()
+
+
[email protected]
+def frame(dates):
+ data = {'A': [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6],
+ 'B': [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6],
+ 'C': np.arange(10, dtype=np.float64),
+ 'D': [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan]}
+
+ return SparseDataFrame(data, index=dates)
+
+
[email protected]
+def fill_frame(frame):
+ values = frame.values.copy()
+ values[np.isnan(values)] = 2
+
+ return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'],
+ default_fill_value=2,
+ index=frame.index)
+
+
+def test_apply(frame):
+ applied = frame.apply(np.sqrt)
+ assert isinstance(applied, SparseDataFrame)
+ tm.assert_almost_equal(applied.values, np.sqrt(frame.values))
+
+ # agg / broadcast
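+    # broadcast=True is deprecated, hence the FutureWarning below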
+ with tm.assert_produces_warning(FutureWarning):
+ broadcasted = frame.apply(np.sum, broadcast=True)
+ assert isinstance(broadcasted, SparseDataFrame)
+
+ with tm.assert_produces_warning(FutureWarning):
+ exp = frame.to_dense().apply(np.sum, broadcast=True)
+ tm.assert_frame_equal(broadcasted.to_dense(), exp)
+
+ applied = frame.apply(np.sum)
+ tm.assert_series_equal(applied,
+ frame.to_dense().apply(nanops.nansum).to_sparse())
+
+
+def test_apply_fill(fill_frame):
+ applied = fill_frame.apply(np.sqrt)
+ assert applied['A'].fill_value == np.sqrt(2)
+
+
+def test_apply_empty(empty):
+ assert empty.apply(np.sqrt) is empty
+
+
+def test_apply_nonuq():
+ orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ index=['a', 'a', 'c'])
+ sparse = orig.to_sparse()
+ res = sparse.apply(lambda s: s[0], axis=1)
+ exp = orig.apply(lambda s: s[0], axis=1)
+
+ # dtype must be kept
+ assert res.dtype == SparseDtype(np.int64)
+
+    # TODO: apply must return subclassed dtype
+ assert isinstance(res, Series)
+ tm.assert_series_equal(res.to_dense(), exp)
+
+ # df.T breaks
+ sparse = orig.T.to_sparse()
+ res = sparse.apply(lambda s: s[0], axis=0) # noqa
+ exp = orig.T.apply(lambda s: s[0], axis=0)
+
+ # TODO: no non-unique columns supported in sparse yet
+ # tm.assert_series_equal(res.to_dense(), exp)
+
+
+def test_applymap(frame):
+ # just test that it works
+ result = frame.applymap(lambda x: x * 2)
+ assert isinstance(result, SparseDataFrame)
+
+
+def test_apply_keep_sparse_dtype():
+ # GH 23744
+ sdf = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]),
+ columns=['b', 'a', 'c'], default_fill_value=1)
+ df = DataFrame(sdf)
+
+ expected = sdf.apply(np.exp)
+ result = df.apply(np.exp)
+ tm.assert_frame_equal(expected, result)
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_frame.py b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_frame.py
new file mode 100644
index 00000000000..bfb5103c97a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_frame.py
@@ -0,0 +1,1369 @@
+# pylint: disable-msg=E1101,W0612
+
+import operator
+
+import numpy as np
+from numpy import nan
+import pytest
+
+from pandas._libs.sparse import BlockIndex, IntIndex
+from pandas.compat import lrange
+from pandas.errors import PerformanceWarning
+
+import pandas as pd
+from pandas import DataFrame, Panel, Series, bdate_range, compat
+from pandas.core.indexes.datetimes import DatetimeIndex
+from pandas.core.sparse import frame as spf
+from pandas.core.sparse.api import (
+ SparseArray, SparseDataFrame, SparseDtype, SparseSeries)
+from pandas.tests.frame.test_api import SharedWithSparse
+from pandas.util import testing as tm
+
+from pandas.tseries.offsets import BDay
+
+
+class TestSparseDataFrame(SharedWithSparse):
+ klass = SparseDataFrame
+
+ # SharedWithSparse tests use generic, klass-agnostic assertion
+ _assert_frame_equal = staticmethod(tm.assert_sp_frame_equal)
+ _assert_series_equal = staticmethod(tm.assert_sp_series_equal)
+
+ def test_iterrows(self, float_frame, float_string_frame):
+ # Same as parent, but we don't ensure the sparse kind is the same.
+ for k, v in float_frame.iterrows():
+ exp = float_frame.loc[k]
+ tm.assert_sp_series_equal(v, exp, check_kind=False)
+
+ for k, v in float_string_frame.iterrows():
+ exp = float_string_frame.loc[k]
+ tm.assert_sp_series_equal(v, exp, check_kind=False)
+
+ def test_itertuples(self, float_frame):
+ for i, tup in enumerate(float_frame.itertuples()):
+ s = self.klass._constructor_sliced(tup[1:])
+ s.name = tup[0]
+ expected = float_frame.iloc[i, :].reset_index(drop=True)
+ tm.assert_sp_series_equal(s, expected, check_kind=False)
+
+ def test_fill_value_when_combine_const(self):
+ # GH12723
+ dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
+ df = SparseDataFrame({'foo': dat}, index=range(6))
+
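+        # add with fill_value=0 treats missing entries as 0, so it must
+        # match filling with 0 first and then adding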
+ exp = df.fillna(0).add(2)
+ res = df.add(2, fill_value=0)
+ tm.assert_sp_frame_equal(res, exp)
+
+ def test_values(self, empty_frame, float_frame):
+ empty = empty_frame.values
+ assert empty.shape == (0, 0)
+
+ no_cols = SparseDataFrame(index=np.arange(10))
+ mat = no_cols.values
+ assert mat.shape == (10, 0)
+
+ no_index = SparseDataFrame(columns=np.arange(10))
+ mat = no_index.values
+ assert mat.shape == (0, 10)
+
+ def test_copy(self, float_frame):
+ cp = float_frame.copy()
+ assert isinstance(cp, SparseDataFrame)
+ tm.assert_sp_frame_equal(cp, float_frame)
+
+ # as of v0.15.0
+        # this is now identical (but not is_a)
+ assert cp.index.identical(float_frame.index)
+
+ def test_constructor(self, float_frame, float_frame_int_kind,
+ float_frame_fill0):
+ for col, series in compat.iteritems(float_frame):
+ assert isinstance(series, SparseSeries)
+
+ assert isinstance(float_frame_int_kind['A'].sp_index, IntIndex)
+
+ # constructed zframe from matrix above
+ assert float_frame_fill0['A'].fill_value == 0
+ # XXX: changed asarray
+ expected = pd.SparseArray([0, 0, 0, 0, 1., 2., 3., 4., 5., 6.],
+ fill_value=0, kind='block')
+ tm.assert_sp_array_equal(expected,
+ float_frame_fill0['A'].values)
+ tm.assert_numpy_array_equal(np.array([0., 0., 0., 0., 1., 2.,
+ 3., 4., 5., 6.]),
+ float_frame_fill0['A'].to_dense().values)
+
+ # construct no data
+ sdf = SparseDataFrame(columns=np.arange(10), index=np.arange(10))
+ for col, series in compat.iteritems(sdf):
+ assert isinstance(series, SparseSeries)
+
+ # construct from nested dict
+ data = {c: s.to_dict() for c, s in compat.iteritems(float_frame)}
+
+ sdf = SparseDataFrame(data)
+ tm.assert_sp_frame_equal(sdf, float_frame)
+
+ # TODO: test data is copied from inputs
+
+ # init dict with different index
+ idx = float_frame.index[:5]
+ cons = SparseDataFrame(
+ float_frame, index=idx, columns=float_frame.columns,
+ default_fill_value=float_frame.default_fill_value,
+ default_kind=float_frame.default_kind, copy=True)
+ reindexed = float_frame.reindex(idx)
+
+ tm.assert_sp_frame_equal(cons, reindexed, exact_indices=False)
+
+        # reindex with a level parameter is not supported and raises
+ with pytest.raises(TypeError):
+ float_frame.reindex(idx, level=0)
+
+ repr(float_frame)
+
+ def test_constructor_dict_order(self):
+ # GH19018
+        # initialization ordering: by insertion order if python >= 3.6,
+        # else sorted by column name
+ d = {'b': [2, 3], 'a': [0, 1]}
+ frame = SparseDataFrame(data=d)
+ if compat.PY36:
+ expected = SparseDataFrame(data=d, columns=list('ba'))
+ else:
+ expected = SparseDataFrame(data=d, columns=list('ab'))
+ tm.assert_sp_frame_equal(frame, expected)
+
+ def test_constructor_ndarray(self, float_frame):
+ # no index or columns
+ sp = SparseDataFrame(float_frame.values)
+
+ # 1d
+ sp = SparseDataFrame(float_frame['A'].values, index=float_frame.index,
+ columns=['A'])
+ tm.assert_sp_frame_equal(sp, float_frame.reindex(columns=['A']))
+
+ # raise on level argument
+ pytest.raises(TypeError, float_frame.reindex, columns=['A'],
+ level=1)
+
+ # wrong length index / columns
+ with pytest.raises(ValueError, match="^Index length"):
+ SparseDataFrame(float_frame.values, index=float_frame.index[:-1])
+
+ with pytest.raises(ValueError, match="^Column length"):
+ SparseDataFrame(float_frame.values,
+ columns=float_frame.columns[:-1])
+
+ # GH 9272
+ def test_constructor_empty(self):
+ sp = SparseDataFrame()
+ assert len(sp.index) == 0
+ assert len(sp.columns) == 0
+
+ def test_constructor_dataframe(self, float_frame):
+ dense = float_frame.to_dense()
+ sp = SparseDataFrame(dense)
+ tm.assert_sp_frame_equal(sp, float_frame)
+
+ def test_constructor_convert_index_once(self):
+ arr = np.array([1.5, 2.5, 3.5])
+ sdf = SparseDataFrame(columns=lrange(4), index=arr)
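+        # all columns should share one Index object: the ndarray index is
+        # converted only once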
+ assert sdf[0].index is sdf[1].index
+
+ def test_constructor_from_series(self):
+
+ # GH 2873
+ x = Series(np.random.randn(10000), name='a')
+ x = x.to_sparse(fill_value=0)
+ assert isinstance(x, SparseSeries)
+ df = SparseDataFrame(x)
+ assert isinstance(df, SparseDataFrame)
+
+ x = Series(np.random.randn(10000), name='a')
+ y = Series(np.random.randn(10000), name='b')
+ x2 = x.astype(float)
+ x2.loc[:9998] = np.NaN
+ # TODO: x_sparse is unused...fix
+ x_sparse = x2.to_sparse(fill_value=np.NaN) # noqa
+
+ # Currently fails too with weird ufunc error
+ # df1 = SparseDataFrame([x_sparse, y])
+
+ y.loc[:9998] = 0
+        # TODO: y_sparse is unused...fix
+ y_sparse = y.to_sparse(fill_value=0) # noqa
+ # without sparse value raises error
+ # df2 = SparseDataFrame([x2_sparse, y])
+
+ def test_constructor_from_dense_series(self):
+ # GH 19393
+ # series with name
+ x = Series(np.random.randn(10000), name='a')
+ result = SparseDataFrame(x)
+ expected = x.to_frame().to_sparse()
+ tm.assert_sp_frame_equal(result, expected)
+
+ # series with no name
+ x = Series(np.random.randn(10000))
+ result = SparseDataFrame(x)
+ expected = x.to_frame().to_sparse()
+ tm.assert_sp_frame_equal(result, expected)
+
+ def test_constructor_from_unknown_type(self):
+ # GH 19393
+ class Unknown(object):
+ pass
+ with pytest.raises(TypeError,
+ match=('SparseDataFrame called with unknown type '
+ '"Unknown" for data argument')):
+ SparseDataFrame(Unknown())
+
+ def test_constructor_preserve_attr(self):
+ # GH 13866
+ arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
+ assert arr.dtype == SparseDtype(np.int64)
+ assert arr.fill_value == 0
+
+ df = pd.SparseDataFrame({'x': arr})
+ assert df['x'].dtype == SparseDtype(np.int64)
+ assert df['x'].fill_value == 0
+
+ s = pd.SparseSeries(arr, name='x')
+ assert s.dtype == SparseDtype(np.int64)
+ assert s.fill_value == 0
+
+ df = pd.SparseDataFrame(s)
+ assert df['x'].dtype == SparseDtype(np.int64)
+ assert df['x'].fill_value == 0
+
+ df = pd.SparseDataFrame({'x': s})
+ assert df['x'].dtype == SparseDtype(np.int64)
+ assert df['x'].fill_value == 0
+
+ def test_constructor_nan_dataframe(self):
+ # GH 10079
+ trains = np.arange(100)
+ thresholds = [10, 20, 30, 40, 50, 60]
+ tuples = [(i, j) for i in trains for j in thresholds]
+ index = pd.MultiIndex.from_tuples(tuples,
+ names=['trains', 'thresholds'])
+ matrix = np.empty((len(index), len(trains)))
+ matrix.fill(np.nan)
+ df = pd.DataFrame(matrix, index=index, columns=trains, dtype=float)
+ result = df.to_sparse()
+ expected = pd.SparseDataFrame(matrix, index=index, columns=trains,
+ dtype=float)
+ tm.assert_sp_frame_equal(result, expected)
+
+ def test_type_coercion_at_construction(self):
+ # GH 15682
+ result = pd.SparseDataFrame(
+ {'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype='uint8',
+ default_fill_value=0)
+ expected = pd.SparseDataFrame(
+ {'a': pd.SparseSeries([1, 0, 0], dtype='uint8'),
+ 'b': pd.SparseSeries([0, 1, 0], dtype='uint8'),
+ 'c': pd.SparseSeries([0, 0, 1], dtype='uint8')},
+ default_fill_value=0)
+ tm.assert_sp_frame_equal(result, expected)
+
+ def test_dtypes(self):
+ df = DataFrame(np.random.randn(10000, 4))
+ df.loc[:9998] = np.nan
+ sdf = df.to_sparse()
+
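+        # get_dtype_counts keys by the dtype's string form, which for a
+        # sparse column includes the fill value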
+ result = sdf.get_dtype_counts()
+ expected = Series({'Sparse[float64, nan]': 4})
+ tm.assert_series_equal(result, expected)
+
+ def test_shape(self, float_frame, float_frame_int_kind,
+ float_frame_fill0, float_frame_fill2):
+ # see gh-10452
+ assert float_frame.shape == (10, 4)
+ assert float_frame_int_kind.shape == (10, 4)
+ assert float_frame_fill0.shape == (10, 4)
+ assert float_frame_fill2.shape == (10, 4)
+
+ def test_str(self):
+ df = DataFrame(np.random.randn(10000, 4))
+ df.loc[:9998] = np.nan
+
+ sdf = df.to_sparse()
+ str(sdf)
+
+ def test_array_interface(self, float_frame):
+ res = np.sqrt(float_frame)
+ dres = np.sqrt(float_frame.to_dense())
+ tm.assert_frame_equal(res.to_dense(), dres)
+
+ def test_pickle(self, float_frame, float_frame_int_kind, float_frame_dense,
+ float_frame_fill0, float_frame_fill0_dense,
+ float_frame_fill2, float_frame_fill2_dense):
+
+ def _test_roundtrip(frame, orig):
+ result = tm.round_trip_pickle(frame)
+ tm.assert_sp_frame_equal(frame, result)
+ tm.assert_frame_equal(result.to_dense(), orig, check_dtype=False)
+
+ _test_roundtrip(SparseDataFrame(), DataFrame())
+ _test_roundtrip(float_frame, float_frame_dense)
+ _test_roundtrip(float_frame_int_kind, float_frame_dense)
+ _test_roundtrip(float_frame_fill0, float_frame_fill0_dense)
+ _test_roundtrip(float_frame_fill2, float_frame_fill2_dense)
+
+ def test_dense_to_sparse(self):
+ df = DataFrame({'A': [nan, nan, nan, 1, 2],
+ 'B': [1, 2, nan, nan, nan]})
+ sdf = df.to_sparse()
+ assert isinstance(sdf, SparseDataFrame)
+ assert np.isnan(sdf.default_fill_value)
+ assert isinstance(sdf['A'].sp_index, BlockIndex)
+ tm.assert_frame_equal(sdf.to_dense(), df)
+
+ sdf = df.to_sparse(kind='integer')
+ assert isinstance(sdf['A'].sp_index, IntIndex)
+
+ df = DataFrame({'A': [0, 0, 0, 1, 2],
+ 'B': [1, 2, 0, 0, 0]}, dtype=float)
+ sdf = df.to_sparse(fill_value=0)
+ assert sdf.default_fill_value == 0
+ tm.assert_frame_equal(sdf.to_dense(), df)
+
+ def test_density(self):
+ df = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6])
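+        # 7 of the 10 entries are non-NaN, so density is 0.7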
+ assert df.density == 0.7
+
+ df = SparseDataFrame({'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
+ 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
+ 'C': np.arange(10),
+ 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]})
+
+ assert df.density == 0.75
+
+ def test_sparse_to_dense(self):
+ pass
+
+ def test_sparse_series_ops(self, float_frame):
+ self._check_frame_ops(float_frame)
+
+ def test_sparse_series_ops_i(self, float_frame_int_kind):
+ self._check_frame_ops(float_frame_int_kind)
+
+ def test_sparse_series_ops_z(self, float_frame_fill0):
+ self._check_frame_ops(float_frame_fill0)
+
+ def test_sparse_series_ops_fill(self, float_frame_fill2):
+ self._check_frame_ops(float_frame_fill2)
+
+ def _check_frame_ops(self, frame):
+
+ def _compare_to_dense(a, b, da, db, op):
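+            # the sparse result of op(a, b) must match the dense result
+            # of op(da, db) once the latter is converted back to sparse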
+ sparse_result = op(a, b)
+ dense_result = op(da, db)
+
+ fill = sparse_result.default_fill_value
+ dense_result = dense_result.to_sparse(fill_value=fill)
+ tm.assert_sp_frame_equal(sparse_result, dense_result,
+ exact_indices=False)
+
+ if isinstance(a, DataFrame) and isinstance(db, DataFrame):
+ mixed_result = op(a, db)
+ assert isinstance(mixed_result, SparseDataFrame)
+ tm.assert_sp_frame_equal(mixed_result, sparse_result,
+ exact_indices=False)
+
+ opnames = ['add', 'sub', 'mul', 'truediv', 'floordiv']
+ ops = [getattr(operator, name) for name in opnames]
+
+ fidx = frame.index
+
+ # time series operations
+
+ series = [frame['A'], frame['B'], frame['C'], frame['D'],
+ frame['A'].reindex(fidx[:7]), frame['A'].reindex(fidx[::2]),
+ SparseSeries(
+ [], index=[])]
+
+ for op in opnames:
+ _compare_to_dense(frame, frame[::2], frame.to_dense(),
+ frame[::2].to_dense(), getattr(operator, op))
+
+            # GH 2304: no auto-broadcasting
+ for i, s in enumerate(series):
+ f = lambda a, b: getattr(a, op)(b, axis='index')
+ _compare_to_dense(frame, s, frame.to_dense(), s.to_dense(), f)
+
+ # rops are not implemented
+ # _compare_to_dense(s, frame, s.to_dense(),
+ # frame.to_dense(), f)
+
+ # cross-sectional operations
+ series = [frame.xs(fidx[0]), frame.xs(fidx[3]), frame.xs(fidx[5]),
+ frame.xs(fidx[7]), frame.xs(fidx[5])[:2]]
+
+ for op in ops:
+ for s in series:
+ _compare_to_dense(frame, s, frame.to_dense(), s, op)
+ _compare_to_dense(s, frame, s, frame.to_dense(), op)
+
+ # it works!
+ result = frame + frame.loc[:, ['A', 'B']] # noqa
+
+ def test_op_corners(self, float_frame, empty_frame):
+ empty = empty_frame + empty_frame
+ assert empty.empty
+
+ foo = float_frame + empty_frame
+ assert isinstance(foo.index, DatetimeIndex)
+ tm.assert_frame_equal(foo, float_frame * np.nan)
+
+ foo = empty_frame + float_frame
+ tm.assert_frame_equal(foo, float_frame * np.nan)
+
+ def test_scalar_ops(self):
+ pass
+
+ def test_getitem(self):
+        # GH 1585: select multiple columns
+ sdf = SparseDataFrame(index=[0, 1, 2], columns=['a', 'b', 'c'])
+
+ result = sdf[['a', 'b']]
+ exp = sdf.reindex(columns=['a', 'b'])
+ tm.assert_sp_frame_equal(result, exp)
+
+ pytest.raises(Exception, sdf.__getitem__, ['a', 'd'])
+
+ def test_iloc(self, float_frame):
+
+ # GH 2227
+ result = float_frame.iloc[:, 0]
+ assert isinstance(result, SparseSeries)
+ tm.assert_sp_series_equal(result, float_frame['A'])
+
+ # preserve sparse index type. #2251
+ data = {'A': [0, 1]}
+ iframe = SparseDataFrame(data, default_kind='integer')
+ tm.assert_class_equal(iframe['A'].sp_index,
+ iframe.iloc[:, 0].sp_index)
+
+ def test_set_value(self, float_frame):
+
+ # ok, as the index gets converted to object
+ frame = float_frame.copy()
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ res = frame.set_value('foobar', 'B', 1.5)
+ assert res.index.dtype == 'object'
+
+ res = float_frame
+ res.index = res.index.astype(object)
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ res = float_frame.set_value('foobar', 'B', 1.5)
+ assert res is not float_frame
+ assert res.index[-1] == 'foobar'
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ assert res.get_value('foobar', 'B') == 1.5
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ res2 = res.set_value('foobar', 'qux', 1.5)
+ assert res2 is not res
+ tm.assert_index_equal(res2.columns,
+ pd.Index(list(float_frame.columns) + ['qux']))
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ assert res2.get_value('foobar', 'qux') == 1.5
+
+ def test_fancy_index_misc(self, float_frame):
+ # axis = 0
+ sliced = float_frame.iloc[-2:, :]
+ expected = float_frame.reindex(index=float_frame.index[-2:])
+ tm.assert_sp_frame_equal(sliced, expected)
+
+ # axis = 1
+ sliced = float_frame.iloc[:, -2:]
+ expected = float_frame.reindex(columns=float_frame.columns[-2:])
+ tm.assert_sp_frame_equal(sliced, expected)
+
+ def test_getitem_overload(self, float_frame):
+ # slicing
+ sl = float_frame[:20]
+ tm.assert_sp_frame_equal(sl,
+ float_frame.reindex(float_frame.index[:20]))
+
+ # boolean indexing
+ d = float_frame.index[5]
+ indexer = float_frame.index > d
+
+ subindex = float_frame.index[indexer]
+ subframe = float_frame[indexer]
+
+ tm.assert_index_equal(subindex, subframe.index)
+ pytest.raises(Exception, float_frame.__getitem__, indexer[:-1])
+
+ def test_setitem(self, float_frame, float_frame_int_kind,
+ float_frame_dense,
+ float_frame_fill0, float_frame_fill0_dense,
+ float_frame_fill2, float_frame_fill2_dense):
+
+ def _check_frame(frame, orig):
+ N = len(frame)
+
+ # insert SparseSeries
+ frame['E'] = frame['A']
+ assert isinstance(frame['E'], SparseSeries)
+ tm.assert_sp_series_equal(frame['E'], frame['A'],
+ check_names=False)
+
+ # insert SparseSeries differently-indexed
+ to_insert = frame['A'][::2]
+ frame['E'] = to_insert
+ expected = to_insert.to_dense().reindex(frame.index)
+ result = frame['E'].to_dense()
+ tm.assert_series_equal(result, expected, check_names=False)
+ assert result.name == 'E'
+
+ # insert Series
+ frame['F'] = frame['A'].to_dense()
+ assert isinstance(frame['F'], SparseSeries)
+ tm.assert_sp_series_equal(frame['F'], frame['A'],
+ check_names=False)
+
+ # insert Series differently-indexed
+ to_insert = frame['A'].to_dense()[::2]
+ frame['G'] = to_insert
+ expected = to_insert.reindex(frame.index)
+ expected.name = 'G'
+ tm.assert_series_equal(frame['G'].to_dense(), expected)
+
+ # insert ndarray
+ frame['H'] = np.random.randn(N)
+ assert isinstance(frame['H'], SparseSeries)
+
+ to_sparsify = np.random.randn(N)
+ to_sparsify[N // 2:] = frame.default_fill_value
+ frame['I'] = to_sparsify
+ assert len(frame['I'].sp_values) == N // 2
+
+ # insert ndarray wrong size
+ pytest.raises(Exception, frame.__setitem__, 'foo',
+ np.random.randn(N - 1))
+
+ # scalar value
+ frame['J'] = 5
+ assert len(frame['J'].sp_values) == N
+ assert (frame['J'].sp_values == 5).all()
+
+ frame['K'] = frame.default_fill_value
+ assert len(frame['K'].sp_values) == 0
+
+ _check_frame(float_frame, float_frame_dense)
+ _check_frame(float_frame_int_kind, float_frame_dense)
+ _check_frame(float_frame_fill0, float_frame_fill0_dense)
+ _check_frame(float_frame_fill2, float_frame_fill2_dense)
+
+ @pytest.mark.parametrize('values', [
+ [True, False],
+ [0, 1],
+ [1, None],
+ ['a', 'b'],
+ [pd.Timestamp('2017'), pd.NaT],
+ [pd.Timedelta('10s'), pd.NaT],
+ ])
+ def test_setitem_more(self, values):
+ df = pd.DataFrame({"A": values})
+ df['A'] = pd.SparseArray(values)
+ expected = pd.DataFrame({'A': pd.SparseArray(values)})
+ tm.assert_frame_equal(df, expected)
+
+ def test_setitem_corner(self, float_frame):
+ float_frame['a'] = float_frame['B']
+ tm.assert_sp_series_equal(float_frame['a'], float_frame['B'],
+ check_names=False)
+
+ def test_setitem_array(self, float_frame):
+ arr = float_frame['B']
+
+ float_frame['E'] = arr
+ tm.assert_sp_series_equal(float_frame['E'], float_frame['B'],
+ check_names=False)
+
+ float_frame['F'] = arr[:-1]
+ index = float_frame.index[:-1]
+ tm.assert_sp_series_equal(float_frame['E'].reindex(index),
+ float_frame['F'].reindex(index),
+ check_names=False)
+
+ def test_setitem_chained_no_consolidate(self):
+ # https://github.com/pandas-dev/pandas/pull/19268
+ # issuecomment-361696418
+ # chained setitem used to cause consolidation
+ sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]])
+ with pd.option_context('mode.chained_assignment', None):
+ sdf[0][1] = 2
+ assert len(sdf._data.blocks) == 2
+
+ def test_delitem(self, float_frame):
+ A = float_frame['A']
+ C = float_frame['C']
+
+ del float_frame['B']
+ assert 'B' not in float_frame
+ tm.assert_sp_series_equal(float_frame['A'], A)
+ tm.assert_sp_series_equal(float_frame['C'], C)
+
+ del float_frame['D']
+ assert 'D' not in float_frame
+
+ del float_frame['A']
+ assert 'A' not in float_frame
+
+ def test_set_columns(self, float_frame):
+ float_frame.columns = float_frame.columns
+ pytest.raises(Exception, setattr, float_frame, 'columns',
+ float_frame.columns[:-1])
+
+ def test_set_index(self, float_frame):
+ float_frame.index = float_frame.index
+ pytest.raises(Exception, setattr, float_frame, 'index',
+ float_frame.index[:-1])
+
+ def test_ctor_reindex(self):
+ idx = pd.Index([0, 1, 2, 3])
+ with pytest.raises(ValueError, match=''):
+ pd.SparseDataFrame({"A": [1, 2]}, index=idx)
+
+ def test_append(self, float_frame):
+ a = float_frame[:5]
+ b = float_frame[5:]
+
+ appended = a.append(b)
+ tm.assert_sp_frame_equal(appended, float_frame, exact_indices=False)
+
+ a = float_frame.iloc[:5, :3]
+ b = float_frame.iloc[5:]
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ # Stacklevel is set for pd.concat, not append
+ appended = a.append(b)
+ tm.assert_sp_frame_equal(appended.iloc[:, :3], float_frame.iloc[:, :3],
+ exact_indices=False)
+
+ a = a[['B', 'C', 'A']].head(2)
+ b = b.head(2)
+
+ expected = pd.SparseDataFrame({
+ "B": [0., 1, None, 3],
+ "C": [0., 1, 5, 6],
+ "A": [None, None, 2, 3],
+ "D": [None, None, 5, None],
+ }, index=a.index | b.index, columns=['B', 'C', 'A', 'D'])
+ with tm.assert_produces_warning(None):
+ appended = a.append(b, sort=False)
+
+ tm.assert_frame_equal(appended, expected)
+
+ with tm.assert_produces_warning(None):
+ appended = a.append(b, sort=True)
+
+ tm.assert_sp_frame_equal(appended, expected[['A', 'B', 'C', 'D']],
+ consolidate_block_indices=True,
+ check_kind=False)
+
+ def test_astype(self):
+ sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4],
+ dtype=np.int64),
+ 'B': SparseArray([4, 5, 6, 7],
+ dtype=np.int64)})
+ assert sparse['A'].dtype == SparseDtype(np.int64)
+ assert sparse['B'].dtype == SparseDtype(np.int64)
+
+ # retain fill_value
+ res = sparse.astype(np.float64)
+ exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.],
+ fill_value=0,
+ kind='integer'),
+ 'B': SparseArray([4., 5., 6., 7.],
+ fill_value=0,
+ kind='integer')},
+ default_fill_value=np.nan)
+ tm.assert_sp_frame_equal(res, exp)
+ assert res['A'].dtype == SparseDtype(np.float64, 0)
+ assert res['B'].dtype == SparseDtype(np.float64, 0)
+
+ # update fill_value
+ res = sparse.astype(SparseDtype(np.float64, np.nan))
+ exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.],
+ fill_value=np.nan,
+ kind='integer'),
+ 'B': SparseArray([4., 5., 6., 7.],
+ fill_value=np.nan,
+ kind='integer')},
+ default_fill_value=np.nan)
+ tm.assert_sp_frame_equal(res, exp)
+ assert res['A'].dtype == SparseDtype(np.float64, np.nan)
+ assert res['B'].dtype == SparseDtype(np.float64, np.nan)
+
+ def test_astype_bool(self):
+ sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4],
+ fill_value=0,
+ dtype=np.int64),
+ 'B': SparseArray([0, 5, 0, 7],
+ fill_value=0,
+ dtype=np.int64)},
+ default_fill_value=0)
+ assert sparse['A'].dtype == SparseDtype(np.int64)
+ assert sparse['B'].dtype == SparseDtype(np.int64)
+
+ res = sparse.astype(SparseDtype(bool, False))
+ exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True],
+ dtype=np.bool,
+ fill_value=False,
+ kind='integer'),
+ 'B': SparseArray([False, True, False, True],
+ dtype=np.bool,
+ fill_value=False,
+ kind='integer')},
+ default_fill_value=False)
+ tm.assert_sp_frame_equal(res, exp)
+ assert res['A'].dtype == SparseDtype(np.bool)
+ assert res['B'].dtype == SparseDtype(np.bool)
+
+ def test_astype_object(self):
+ # This may change in GH-23125
+ df = pd.DataFrame({"A": SparseArray([0, 1]),
+ "B": SparseArray([0, 1])})
+ result = df.astype(object)
+ dtype = SparseDtype(object, 0)
+ expected = pd.DataFrame({"A": SparseArray([0, 1], dtype=dtype),
+ "B": SparseArray([0, 1], dtype=dtype)})
+ tm.assert_frame_equal(result, expected)
+
+ def test_fillna(self, float_frame_fill0, float_frame_fill0_dense):
+ df = float_frame_fill0.reindex(lrange(5))
+ dense = float_frame_fill0_dense.reindex(lrange(5))
+
+ result = df.fillna(0)
+ expected = dense.fillna(0)
+ tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0),
+ exact_indices=False)
+ tm.assert_frame_equal(result.to_dense(), expected)
+
+ result = df.copy()
+ result.fillna(0, inplace=True)
+ expected = dense.fillna(0)
+
+ tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0),
+ exact_indices=False)
+ tm.assert_frame_equal(result.to_dense(), expected)
+
+ result = df.copy()
+ result = df['A']
+ result.fillna(0, inplace=True)
+
+ expected = dense['A'].fillna(0)
+ # this changes internal SparseArray repr
+ # tm.assert_sp_series_equal(result, expected.to_sparse(fill_value=0))
+ tm.assert_series_equal(result.to_dense(), expected)
+
+ def test_fillna_fill_value(self):
+ df = pd.DataFrame({'A': [1, 0, 0], 'B': [np.nan, np.nan, 4]})
+
+ sparse = pd.SparseDataFrame(df)
+ tm.assert_frame_equal(sparse.fillna(-1).to_dense(),
+ df.fillna(-1), check_dtype=False)
+
+ sparse = pd.SparseDataFrame(df, default_fill_value=0)
+ tm.assert_frame_equal(sparse.fillna(-1).to_dense(),
+ df.fillna(-1), check_dtype=False)
+
+ def test_sparse_frame_pad_backfill_limit(self):
+ index = np.arange(10)
+ df = DataFrame(np.random.randn(10, 4), index=index)
+ sdf = df.to_sparse()
+
+ result = sdf[:2].reindex(index, method='pad', limit=5)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ expected = sdf[:2].reindex(index).fillna(method='pad')
+ expected = expected.to_dense()
+ expected.values[-3:] = np.nan
+ expected = expected.to_sparse()
+ tm.assert_frame_equal(result, expected)
+
+ result = sdf[-2:].reindex(index, method='backfill', limit=5)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ expected = sdf[-2:].reindex(index).fillna(method='backfill')
+ expected = expected.to_dense()
+ expected.values[:3] = np.nan
+ expected = expected.to_sparse()
+ tm.assert_frame_equal(result, expected)
+
+ def test_sparse_frame_fillna_limit(self):
+ index = np.arange(10)
+ df = DataFrame(np.random.randn(10, 4), index=index)
+ sdf = df.to_sparse()
+
+ result = sdf[:2].reindex(index)
+ with tm.assert_produces_warning(PerformanceWarning):
+ result = result.fillna(method='pad', limit=5)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ expected = sdf[:2].reindex(index).fillna(method='pad')
+ expected = expected.to_dense()
+ expected.values[-3:] = np.nan
+ expected = expected.to_sparse()
+ tm.assert_frame_equal(result, expected)
+
+ result = sdf[-2:].reindex(index)
+ with tm.assert_produces_warning(PerformanceWarning):
+ result = result.fillna(method='backfill', limit=5)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ expected = sdf[-2:].reindex(index).fillna(method='backfill')
+ expected = expected.to_dense()
+ expected.values[:3] = np.nan
+ expected = expected.to_sparse()
+ tm.assert_frame_equal(result, expected)
+
+ def test_rename(self, float_frame):
+ result = float_frame.rename(index=str)
+ expected = SparseDataFrame(float_frame.values,
+ index=float_frame.index.strftime(
+ "%Y-%m-%d %H:%M:%S"),
+ columns=list('ABCD'))
+ tm.assert_sp_frame_equal(result, expected)
+
+ result = float_frame.rename(columns=lambda x: '%s%d' % (x, 1))
+ data = {'A1': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
+ 'B1': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
+ 'C1': np.arange(10, dtype=np.float64),
+ 'D1': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}
+ expected = SparseDataFrame(data, index=float_frame.index)
+ tm.assert_sp_frame_equal(result, expected)
+
+ def test_corr(self, float_frame):
+ res = float_frame.corr()
+ # XXX: this stays sparse
+ tm.assert_frame_equal(res, float_frame.to_dense().corr().to_sparse())
+
+ def test_describe(self, float_frame):
+ float_frame['foo'] = np.nan
+ float_frame.get_dtype_counts()
+ str(float_frame)
+ desc = float_frame.describe() # noqa
+
+ def test_join(self, float_frame):
+ left = float_frame.loc[:, ['A', 'B']]
+ right = float_frame.loc[:, ['C', 'D']]
+ joined = left.join(right)
+ tm.assert_sp_frame_equal(joined, float_frame, exact_indices=False)
+
+ right = float_frame.loc[:, ['B', 'D']]
+ pytest.raises(Exception, left.join, right)
+
+ with pytest.raises(ValueError, match='Other Series must have a name'):
+ float_frame.join(Series(
+ np.random.randn(len(float_frame)), index=float_frame.index))
+
+ def test_reindex(self, float_frame, float_frame_int_kind,
+ float_frame_fill0, float_frame_fill2):
+
+ def _check_frame(frame):
+ index = frame.index
+ sidx = index[::2]
+ sidx2 = index[:5] # noqa
+
+ sparse_result = frame.reindex(sidx)
+ dense_result = frame.to_dense().reindex(sidx)
+ tm.assert_frame_equal(sparse_result.to_dense(), dense_result)
+
+ tm.assert_frame_equal(frame.reindex(list(sidx)).to_dense(),
+ dense_result)
+
+ sparse_result2 = sparse_result.reindex(index)
+ dense_result2 = dense_result.reindex(index)
+ tm.assert_frame_equal(sparse_result2.to_dense(), dense_result2)
+
+ # propagate CORRECT fill value
+ tm.assert_almost_equal(sparse_result.default_fill_value,
+ frame.default_fill_value)
+ tm.assert_almost_equal(sparse_result['A'].fill_value,
+ frame['A'].fill_value)
+
+ # length zero
+ length_zero = frame.reindex([])
+ assert len(length_zero) == 0
+ assert len(length_zero.columns) == len(frame.columns)
+ assert len(length_zero['A']) == 0
+
+ # frame being reindexed has length zero
+ length_n = length_zero.reindex(index)
+ assert len(length_n) == len(frame)
+ assert len(length_n.columns) == len(frame.columns)
+ assert len(length_n['A']) == len(frame)
+
+ # reindex columns
+ reindexed = frame.reindex(columns=['A', 'B', 'Z'])
+ assert len(reindexed.columns) == 3
+ tm.assert_almost_equal(reindexed['Z'].fill_value,
+ frame.default_fill_value)
+ assert np.isnan(reindexed['Z'].sp_values).all()
+
+ _check_frame(float_frame)
+ _check_frame(float_frame_int_kind)
+ _check_frame(float_frame_fill0)
+ _check_frame(float_frame_fill2)
+
+ # with copy=False
+ reindexed = float_frame.reindex(float_frame.index, copy=False)
+ reindexed['F'] = reindexed['A']
+ assert 'F' in float_frame
+
+ reindexed = float_frame.reindex(float_frame.index)
+ reindexed['G'] = reindexed['A']
+ assert 'G' not in float_frame
+
+ def test_reindex_fill_value(self, float_frame_fill0,
+ float_frame_fill0_dense):
+ rng = bdate_range('20110110', periods=20)
+
+ result = float_frame_fill0.reindex(rng, fill_value=0)
+ exp = float_frame_fill0_dense.reindex(rng, fill_value=0)
+ exp = exp.to_sparse(float_frame_fill0.default_fill_value)
+ tm.assert_sp_frame_equal(result, exp)
+
+ def test_reindex_method(self):
+
+ sparse = SparseDataFrame(data=[[11., 12., 14.],
+ [21., 22., 24.],
+ [41., 42., 44.]],
+ index=[1, 2, 4],
+ columns=[1, 2, 4],
+ dtype=float)
+
+ # Over indices
+
+ # default method
+ result = sparse.reindex(index=range(6))
+ expected = SparseDataFrame(data=[[nan, nan, nan],
+ [11., 12., 14.],
+ [21., 22., 24.],
+ [nan, nan, nan],
+ [41., 42., 44.],
+ [nan, nan, nan]],
+ index=range(6),
+ columns=[1, 2, 4],
+ dtype=float)
+ tm.assert_sp_frame_equal(result, expected)
+
+ # method='bfill'
+ result = sparse.reindex(index=range(6), method='bfill')
+ expected = SparseDataFrame(data=[[11., 12., 14.],
+ [11., 12., 14.],
+ [21., 22., 24.],
+ [41., 42., 44.],
+ [41., 42., 44.],
+ [nan, nan, nan]],
+ index=range(6),
+ columns=[1, 2, 4],
+ dtype=float)
+ tm.assert_sp_frame_equal(result, expected)
+
+ # method='ffill'
+ result = sparse.reindex(index=range(6), method='ffill')
+ expected = SparseDataFrame(data=[[nan, nan, nan],
+ [11., 12., 14.],
+ [21., 22., 24.],
+ [21., 22., 24.],
+ [41., 42., 44.],
+ [41., 42., 44.]],
+ index=range(6),
+ columns=[1, 2, 4],
+ dtype=float)
+ tm.assert_sp_frame_equal(result, expected)
+
+ # Over columns
+
+ # default method
+ result = sparse.reindex(columns=range(6))
+ expected = SparseDataFrame(data=[[nan, 11., 12., nan, 14., nan],
+ [nan, 21., 22., nan, 24., nan],
+ [nan, 41., 42., nan, 44., nan]],
+ index=[1, 2, 4],
+ columns=range(6),
+ dtype=float)
+ tm.assert_sp_frame_equal(result, expected)
+
+ # method='bfill'
+ with pytest.raises(NotImplementedError):
+ sparse.reindex(columns=range(6), method='bfill')
+
+ # method='ffill'
+ with pytest.raises(NotImplementedError):
+ sparse.reindex(columns=range(6), method='ffill')
+
+ def test_take(self, float_frame):
+ result = float_frame.take([1, 0, 2], axis=1)
+ expected = float_frame.reindex(columns=['B', 'A', 'C'])
+ tm.assert_sp_frame_equal(result, expected)
+
+ def test_to_dense(self, float_frame, float_frame_int_kind,
+ float_frame_dense,
+ float_frame_fill0, float_frame_fill0_dense,
+ float_frame_fill2, float_frame_fill2_dense):
+ def _check(frame, orig):
+ dense_dm = frame.to_dense()
+ # Sparse[float] != float
+ tm.assert_frame_equal(frame, dense_dm, check_dtype=False)
+ tm.assert_frame_equal(dense_dm, orig, check_dtype=False)
+
+ _check(float_frame, float_frame_dense)
+ _check(float_frame_int_kind, float_frame_dense)
+ _check(float_frame_fill0, float_frame_fill0_dense)
+ _check(float_frame_fill2, float_frame_fill2_dense)
+
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_stack_sparse_frame(self, float_frame, float_frame_int_kind,
+ float_frame_fill0, float_frame_fill2):
+ def _check(frame):
+ dense_frame = frame.to_dense() # noqa
+
+ wp = Panel.from_dict({'foo': frame})
+ from_dense_lp = wp.to_frame()
+
+ from_sparse_lp = spf.stack_sparse_frame(frame)
+
+ tm.assert_numpy_array_equal(from_dense_lp.values,
+ from_sparse_lp.values)
+
+ _check(float_frame)
+ _check(float_frame_int_kind)
+
+ # for now
+ pytest.raises(Exception, _check, float_frame_fill0)
+ pytest.raises(Exception, _check, float_frame_fill2)
+
+ def test_transpose(self, float_frame, float_frame_int_kind,
+ float_frame_dense,
+ float_frame_fill0, float_frame_fill0_dense,
+ float_frame_fill2, float_frame_fill2_dense):
+
+ def _check(frame, orig):
+ transposed = frame.T
+ untransposed = transposed.T
+ tm.assert_sp_frame_equal(frame, untransposed)
+
+ tm.assert_frame_equal(frame.T.to_dense(), orig.T)
+ tm.assert_frame_equal(frame.T.T.to_dense(), orig.T.T)
+ tm.assert_sp_frame_equal(frame, frame.T.T, exact_indices=False)
+
+ _check(float_frame, float_frame_dense)
+ _check(float_frame_int_kind, float_frame_dense)
+ _check(float_frame_fill0, float_frame_fill0_dense)
+ _check(float_frame_fill2, float_frame_fill2_dense)
+
+ def test_shift(self, float_frame, float_frame_int_kind, float_frame_dense,
+ float_frame_fill0, float_frame_fill0_dense,
+ float_frame_fill2, float_frame_fill2_dense):
+
+ def _check(frame, orig):
+ shifted = frame.shift(0)
+ exp = orig.shift(0)
+ tm.assert_frame_equal(shifted.to_dense(), exp)
+
+ shifted = frame.shift(1)
+ exp = orig.shift(1)
+ tm.assert_frame_equal(shifted.to_dense(), exp)
+
+ shifted = frame.shift(-2)
+ exp = orig.shift(-2)
+ tm.assert_frame_equal(shifted.to_dense(), exp)
+
+ shifted = frame.shift(2, freq='B')
+ exp = orig.shift(2, freq='B')
+ exp = exp.to_sparse(frame.default_fill_value,
+ kind=frame.default_kind)
+ tm.assert_frame_equal(shifted, exp)
+
+ shifted = frame.shift(2, freq=BDay())
+ exp = orig.shift(2, freq=BDay())
+ exp = exp.to_sparse(frame.default_fill_value,
+ kind=frame.default_kind)
+ tm.assert_frame_equal(shifted, exp)
+
+ _check(float_frame, float_frame_dense)
+ _check(float_frame_int_kind, float_frame_dense)
+ _check(float_frame_fill0, float_frame_fill0_dense)
+ _check(float_frame_fill2, float_frame_fill2_dense)
+
+ def test_count(self, float_frame):
+ dense_result = float_frame.to_dense().count()
+
+ result = float_frame.count()
+ tm.assert_series_equal(result.to_dense(), dense_result)
+
+ result = float_frame.count(axis=None)
+ tm.assert_series_equal(result.to_dense(), dense_result)
+
+ result = float_frame.count(axis=0)
+ tm.assert_series_equal(result.to_dense(), dense_result)
+
+ result = float_frame.count(axis=1)
+ dense_result = float_frame.to_dense().count(axis=1)
+
+        # the dtype differs on win32, so don't check it
+ tm.assert_series_equal(result, dense_result, check_dtype=False)
+
+ def test_numpy_transpose(self):
+ sdf = SparseDataFrame([1, 2, 3], index=[1, 2, 3], columns=['a'])
+ result = np.transpose(np.transpose(sdf))
+ tm.assert_sp_frame_equal(result, sdf)
+
+ msg = "the 'axes' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.transpose(sdf, axes=1)
+
+ def test_combine_first(self, float_frame):
+ df = float_frame
+
+ result = df[::2].combine_first(df)
+
+ expected = df[::2].to_dense().combine_first(df.to_dense())
+ expected = expected.to_sparse(fill_value=df.default_fill_value)
+
+ tm.assert_sp_frame_equal(result, expected)
+
+ @pytest.mark.xfail(reason="No longer supported.")
+    def test_combine_first_with_dense(self, float_frame):
+ # We could support this if we allow
+ # pd.core.dtypes.cast.find_common_type to special case SparseDtype
+ # but I don't think that's worth it.
+        df = float_frame
+
+ result = df[::2].combine_first(df.to_dense())
+ expected = df[::2].to_dense().combine_first(df.to_dense())
+ expected = expected.to_sparse(fill_value=df.default_fill_value)
+
+ tm.assert_sp_frame_equal(result, expected)
+
+ def test_combine_add(self, float_frame):
+ df = float_frame.to_dense()
+ df2 = df.copy()
+ df2['C'][:3] = np.nan
+ df['A'][:3] = 5.7
+
+ result = df.to_sparse().add(df2.to_sparse(), fill_value=0)
+ expected = df.add(df2, fill_value=0).to_sparse()
+ tm.assert_sp_frame_equal(result, expected)
+
+ def test_isin(self):
+ sparse_df = DataFrame({'flag': [1., 0., 1.]}).to_sparse(fill_value=0.)
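+        # filtering with isin should select the same rows as equality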
+ xp = sparse_df[sparse_df.flag == 1.]
+ rs = sparse_df[sparse_df.flag.isin([1.])]
+ tm.assert_frame_equal(xp, rs)
+
+ def test_sparse_pow_issue(self):
+        # GH 2220
+ df = SparseDataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]})
+
+ # note : no error without nan
+ df = SparseDataFrame({'A': [nan, 0, 1]})
+
+ # note that 2 ** df works fine, also df ** 1
+ result = 1 ** df
+
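+        # taking the column and indexing it directly must agree on the
+        # number of stored sparse values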
+ r1 = result.take([0], 1)['A']
+ r2 = result['A']
+
+ assert len(r2.sp_values) == len(r1.sp_values)
+
+ def test_as_blocks(self):
+ df = SparseDataFrame({'A': [1.1, 3.3], 'B': [nan, -3.9]},
+ dtype='float64')
+
+ # deprecated 0.21.0
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ df_blocks = df.blocks
+ assert list(df_blocks.keys()) == ['Sparse[float64, nan]']
+ tm.assert_frame_equal(df_blocks['Sparse[float64, nan]'], df)
+
+ @pytest.mark.xfail(reason='nan column names in _init_dict problematic '
+ '(GH#16894)')
+ def test_nan_columnname(self):
+ # GH 8822
+ nan_colname = DataFrame(Series(1.0, index=[0]), columns=[nan])
+ nan_colname_sparse = nan_colname.to_sparse()
+ assert np.isnan(nan_colname_sparse.columns[0])
+
+ def test_isna(self):
+ # GH 8276
+ df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan],
+ 'B': [0, np.nan, np.nan, 2, np.nan]})
+
+ res = df.isna()
+ exp = pd.SparseDataFrame({'A': [True, True, False, False, True],
+ 'B': [False, True, True, False, True]},
+ default_fill_value=True)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp)
+
+ # if fill_value is not nan, True can be included in sp_values
+ df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan],
+ 'B': [0, np.nan, 0, 2, np.nan]},
+ default_fill_value=0.)
+ res = df.isna()
+ assert isinstance(res, pd.SparseDataFrame)
+ exp = pd.DataFrame({'A': [False, False, False, False, True],
+ 'B': [False, True, False, False, True]})
+ tm.assert_frame_equal(res.to_dense(), exp)
+
+ def test_notna(self):
+ # GH 8276
+ df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan],
+ 'B': [0, np.nan, np.nan, 2, np.nan]})
+
+ res = df.notna()
+ exp = pd.SparseDataFrame({'A': [False, False, True, True, False],
+ 'B': [True, False, False, True, False]},
+ default_fill_value=False)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp)
+
+ # if fill_value is not nan, True can be included in sp_values
+ df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan],
+ 'B': [0, np.nan, 0, 2, np.nan]},
+ default_fill_value=0.)
+ res = df.notna()
+ assert isinstance(res, pd.SparseDataFrame)
+ exp = pd.DataFrame({'A': [True, True, True, True, False],
+ 'B': [True, False, True, True, False]})
+ tm.assert_frame_equal(res.to_dense(), exp)
+
+
+class TestSparseDataFrameArithmetic(object):
+
+ def test_numeric_op_scalar(self):
+ df = pd.DataFrame({'A': [nan, nan, 0, 1, ],
+ 'B': [0, 1, 2, nan],
+ 'C': [1., 2., 3., 4.],
+ 'D': [nan, nan, nan, nan]})
+ sparse = df.to_sparse()
+
+ tm.assert_sp_frame_equal(sparse + 1, (df + 1).to_sparse())
+
+ def test_comparison_op_scalar(self):
+ # GH 13001
+ df = pd.DataFrame({'A': [nan, nan, 0, 1, ],
+ 'B': [0, 1, 2, nan],
+ 'C': [1., 2., 3., 4.],
+ 'D': [nan, nan, nan, nan]})
+ sparse = df.to_sparse()
+
+ # comparison changes internal repr, compare with dense
+ res = sparse > 1
+ assert isinstance(res, pd.SparseDataFrame)
+ tm.assert_frame_equal(res.to_dense(), df > 1)
+
+ res = sparse != 0
+ assert isinstance(res, pd.SparseDataFrame)
+ tm.assert_frame_equal(res.to_dense(), df != 0)
+
+
+class TestSparseDataFrameAnalytics(object):
+
+ def test_cumsum(self, float_frame):
+ expected = SparseDataFrame(float_frame.to_dense().cumsum())
+
+ result = float_frame.cumsum()
+ tm.assert_sp_frame_equal(result, expected)
+
+ result = float_frame.cumsum(axis=None)
+ tm.assert_sp_frame_equal(result, expected)
+
+ result = float_frame.cumsum(axis=0)
+ tm.assert_sp_frame_equal(result, expected)
+
+ def test_numpy_cumsum(self, float_frame):
+ result = np.cumsum(float_frame)
+ expected = SparseDataFrame(float_frame.to_dense().cumsum())
+ tm.assert_sp_frame_equal(result, expected)
+
+ msg = "the 'dtype' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.cumsum(float_frame, dtype=np.int64)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.cumsum(float_frame, out=result)
+
+ def test_numpy_func_call(self, float_frame):
+ # no exception should be raised even though
+        # numpy passes in 'axis=None' or 'axis=-1'
+ funcs = ['sum', 'cumsum', 'var',
+ 'mean', 'prod', 'cumprod',
+ 'std', 'min', 'max']
+ for func in funcs:
+ getattr(np, func)(float_frame)
+
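test_numpy_func_call passes because np.sum(frame), np.cumsum(frame) and the rest dispatch to the pandas methods of the same name, forwarding an axis keyword; pandas tolerates the axis spellings but, as test_numpy_cumsum shows, raises ValueError for forwarded keywords it cannot honour. Roughly:

    import numpy as np
    import pandas as pd

    sdf = pd.DataFrame({'A': [np.nan, 1.0, 2.0]}).to_sparse()

    np.cumsum(sdf)  # delegates to sdf.cumsum(); axis=None is tolerated

    try:
        np.cumsum(sdf, dtype=np.int64)  # forwarded kwarg pandas rejects
    except ValueError as err:
        print(err)  # the 'dtype' parameter is not supported ...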
+ @pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH 17386)')
+ def test_quantile(self):
+ # GH 17386
+ data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
+ q = 0.1
+
+ sparse_df = SparseDataFrame(data)
+ result = sparse_df.quantile(q)
+
+ dense_df = DataFrame(data)
+ dense_expected = dense_df.quantile(q)
+ sparse_expected = SparseSeries(dense_expected)
+
+ tm.assert_series_equal(result, dense_expected)
+ tm.assert_sp_series_equal(result, sparse_expected)
+
+ @pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH 17386)')
+ def test_quantile_multi(self):
+ # GH 17386
+ data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
+ q = [0.1, 0.5]
+
+ sparse_df = SparseDataFrame(data)
+ result = sparse_df.quantile(q)
+
+ dense_df = DataFrame(data)
+ dense_expected = dense_df.quantile(q)
+ sparse_expected = SparseDataFrame(dense_expected)
+
+ tm.assert_frame_equal(result, dense_expected)
+ tm.assert_sp_frame_equal(result, sparse_expected)
+
+ def test_assign_with_sparse_frame(self):
+ # GH 19163
+ df = pd.DataFrame({"a": [1, 2, 3]})
+ res = df.to_sparse(fill_value=False).assign(newcol=False)
+ exp = df.assign(newcol=False).to_sparse(fill_value=False)
+
+ tm.assert_sp_frame_equal(res, exp)
+
+ for column in res.columns:
+ assert type(res[column]) is SparseSeries
+
+ @pytest.mark.parametrize("inplace", [True, False])
+ @pytest.mark.parametrize("how", ["all", "any"])
+ def test_dropna(self, inplace, how):
+ # Tests regression #21172.
+ expected = pd.SparseDataFrame({"F2": [0, 1]})
+ input_df = pd.SparseDataFrame(
+ {"F1": [float('nan'), float('nan')], "F2": [0, 1]}
+ )
+ result_df = input_df.dropna(axis=1, inplace=inplace, how=how)
+ if inplace:
+ result_df = input_df
+ tm.assert_sp_frame_equal(expected, result_df)
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_indexing.py b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_indexing.py
new file mode 100644
index 00000000000..2d2a7ac278d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_indexing.py
@@ -0,0 +1,109 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, SparseDataFrame
+from pandas.util import testing as tm
+
+pytestmark = pytest.mark.skip("Wrong SparseBlock initialization (GH 17386)")
+
+
+@pytest.mark.parametrize('data', [
+ [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]],
+ [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]],
+ [
+ [1.0, 1.0 + 1.0j],
+ [2.0 + 2.0j, 2.0],
+ [3.0, 3.0 + 3.0j],
+ [4.0 + 4.0j, 4.0],
+ [np.nan, np.nan]
+ ]
+])
+@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
+def test_where_with_numeric_data(data):
+ # GH 17386
+ lower_bound = 1.5
+
+ sparse = SparseDataFrame(data)
+ result = sparse.where(sparse > lower_bound)
+
+ dense = DataFrame(data)
+ dense_expected = dense.where(dense > lower_bound)
+ sparse_expected = SparseDataFrame(dense_expected)
+
+ tm.assert_frame_equal(result, dense_expected)
+ tm.assert_sp_frame_equal(result, sparse_expected)
+
+
+@pytest.mark.parametrize('data', [
+ [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]],
+ [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]],
+ [
+ [1.0, 1.0 + 1.0j],
+ [2.0 + 2.0j, 2.0],
+ [3.0, 3.0 + 3.0j],
+ [4.0 + 4.0j, 4.0],
+ [np.nan, np.nan]
+ ]
+])
+@pytest.mark.parametrize('other', [
+ True,
+ -100,
+ 0.1,
+ 100.0 + 100.0j
+])
+@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
+def test_where_with_numeric_data_and_other(data, other):
+ # GH 17386
+ lower_bound = 1.5
+
+ sparse = SparseDataFrame(data)
+ result = sparse.where(sparse > lower_bound, other)
+
+ dense = DataFrame(data)
+ dense_expected = dense.where(dense > lower_bound, other)
+ sparse_expected = SparseDataFrame(dense_expected,
+ default_fill_value=other)
+
+ tm.assert_frame_equal(result, dense_expected)
+ tm.assert_sp_frame_equal(result, sparse_expected)
+
+
+@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
+def test_where_with_bool_data():
+ # GH 17386
+ data = [[False, False], [True, True], [False, False]]
+ cond = True
+
+ sparse = SparseDataFrame(data)
+ result = sparse.where(sparse == cond)
+
+ dense = DataFrame(data)
+ dense_expected = dense.where(dense == cond)
+ sparse_expected = SparseDataFrame(dense_expected)
+
+ tm.assert_frame_equal(result, dense_expected)
+ tm.assert_sp_frame_equal(result, sparse_expected)
+
+
+@pytest.mark.parametrize('other', [
+ True,
+ 0,
+ 0.1,
+ 100.0 + 100.0j
+])
+@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
+def test_where_with_bool_data_and_other(other):
+ # GH 17386
+ data = [[False, False], [True, True], [False, False]]
+ cond = True
+
+ sparse = SparseDataFrame(data)
+ result = sparse.where(sparse == cond, other)
+
+ dense = DataFrame(data)
+ dense_expected = dense.where(dense == cond, other)
+ sparse_expected = SparseDataFrame(dense_expected,
+ default_fill_value=other)
+
+ tm.assert_frame_equal(result, dense_expected)
+ tm.assert_sp_frame_equal(result, sparse_expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_to_csv.py b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_to_csv.py
new file mode 100644
index 00000000000..ed19872f8a7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_to_csv.py
@@ -0,0 +1,21 @@
+import numpy as np
+import pytest
+
+from pandas import SparseDataFrame, read_csv
+from pandas.util import testing as tm
+
+
+class TestSparseDataFrameToCsv(object):
+ fill_values = [np.nan, 0, None, 1]
+
+ @pytest.mark.parametrize('fill_value', fill_values)
+ def test_to_csv_sparse_dataframe(self, fill_value):
+ # GH19384
+ sdf = SparseDataFrame({'a': type(self).fill_values},
+ default_fill_value=fill_value)
+
+ with tm.ensure_clean('sparse_df.csv') as path:
+ sdf.to_csv(path, index=False)
+ df = read_csv(path, skip_blank_lines=False)
+
+ tm.assert_sp_frame_equal(df.to_sparse(fill_value=fill_value), sdf)
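The round trip in test_to_csv_sparse_dataframe densifies on the way out: to_csv writes plain CSV with missing values as empty fields (hence skip_blank_lines=False when reading back, so all-missing rows survive), and equality is only checked after re-sparsifying with the same fill_value. A sketch, with 'sparse_df.csv' a throwaway path:

    import numpy as np
    import pandas as pd

    sdf = pd.SparseDataFrame({'a': [np.nan, 0.0, 1.0]},
                             default_fill_value=np.nan)
    sdf.to_csv('sparse_df.csv', index=False)       # densifies on write

    df = pd.read_csv('sparse_df.csv', skip_blank_lines=False)
    roundtrip = df.to_sparse(fill_value=np.nan)    # re-sparsify to compare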
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_to_from_scipy.py b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_to_from_scipy.py
new file mode 100644
index 00000000000..bdb2cd022b4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/frame/test_to_from_scipy.py
@@ -0,0 +1,185 @@
+from distutils.version import LooseVersion
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.common import is_bool_dtype
+
+import pandas as pd
+from pandas import SparseDataFrame, SparseSeries
+from pandas.core.sparse.api import SparseDtype
+from pandas.util import testing as tm
+
+scipy = pytest.importorskip('scipy')
+ignore_matrix_warning = pytest.mark.filterwarnings(
+ "ignore:the matrix subclass:PendingDeprecationWarning"
+)
+
+
+@pytest.mark.parametrize('index', [None, list('abc')]) # noqa: F811
+@pytest.mark.parametrize('columns', [None, list('def')])
+@pytest.mark.parametrize('fill_value', [None, 0, np.nan])
+@pytest.mark.parametrize('dtype', [bool, int, float, np.uint16])
+@ignore_matrix_warning
+def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype):
+ # GH 4343
+ # Make one ndarray and from it one sparse matrix, both to be used for
+ # constructing frames and comparing results
+ arr = np.eye(3, dtype=dtype)
+ # GH 16179
+ arr[0, 1] = dtype(2)
+ try:
+ spm = spmatrix(arr)
+ assert spm.dtype == arr.dtype
+ except (TypeError, AssertionError):
+ # If conversion to sparse fails for this spmatrix type and arr.dtype,
+ # then the combination is not currently supported in SciPy, so we
+ # can just skip testing it thoroughly
+ return
+
+ sdf = SparseDataFrame(spm, index=index, columns=columns,
+ default_fill_value=fill_value)
+
+ # Expected result construction is kind of tricky for all
+ # dtype-fill_value combinations; easiest to cast to something generic
+ # and except later on
+ rarr = arr.astype(object)
+ rarr[arr == 0] = np.nan
+ expected = SparseDataFrame(rarr, index=index, columns=columns).fillna(
+ fill_value if fill_value is not None else np.nan)
+
+ # Assert frame is as expected
+ sdf_obj = sdf.astype(object)
+ tm.assert_sp_frame_equal(sdf_obj, expected)
+ tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense())
+
+ # Assert spmatrices equal
+ assert dict(sdf.to_coo().todok()) == dict(spm.todok())
+
+ # Ensure dtype is preserved if possible
+ # XXX: verify this
+ res_dtype = bool if is_bool_dtype(dtype) else dtype
+ tm.assert_contains_all(sdf.dtypes.apply(lambda dtype: dtype.subtype),
+ {np.dtype(res_dtype)})
+ assert sdf.to_coo().dtype == res_dtype
+
+ # However, adding a str column results in an upcast to object
+ sdf['strings'] = np.arange(len(sdf)).astype(str)
+ assert sdf.to_coo().dtype == np.object_
+
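The spmatrix argument here is a pytest fixture, parametrized elsewhere in the suite over the scipy.sparse matrix classes (hence the noqa markers). The core round trip is SparseDataFrame(spmatrix) one way and .to_coo() the other; comparing the DOK dictionaries checks coordinates and values in one shot. A minimal sketch, assuming scipy is installed:

    import numpy as np
    import scipy.sparse
    import pandas as pd

    arr = np.eye(3)
    spm = scipy.sparse.coo_matrix(arr)

    sdf = pd.SparseDataFrame(spm)      # scipy -> pandas
    back = sdf.to_coo()                # pandas -> scipy COO

    # DOK dicts map (row, col) -> value, so one comparison covers both
    assert dict(back.todok()) == dict(spm.todok())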
+
+@pytest.mark.parametrize('fill_value', [None, 0, np.nan]) # noqa: F811
+@ignore_matrix_warning
+@pytest.mark.filterwarnings("ignore:object dtype is not supp:UserWarning")
+def test_from_to_scipy_object(spmatrix, fill_value):
+ # GH 4343
+ dtype = object
+ columns = list('cd')
+ index = list('ab')
+
+ if (spmatrix is scipy.sparse.dok_matrix and LooseVersion(
+ scipy.__version__) >= LooseVersion('0.19.0')):
+ pytest.skip("dok_matrix from object does not work in SciPy >= 0.19")
+
+ # Make one ndarray and from it one sparse matrix, both to be used for
+ # constructing frames and comparing results
+ arr = np.eye(2, dtype=dtype)
+ try:
+ spm = spmatrix(arr)
+ assert spm.dtype == arr.dtype
+ except (TypeError, AssertionError):
+ # If conversion to sparse fails for this spmatrix type and arr.dtype,
+ # then the combination is not currently supported in SciPy, so we
+ # can just skip testing it thoroughly
+ return
+
+ sdf = SparseDataFrame(spm, index=index, columns=columns,
+ default_fill_value=fill_value)
+
+ # Expected result construction is kind of tricky for all
+ # dtype-fill_value combinations; easiest to cast to something generic
+ # and except later on
+ rarr = arr.astype(object)
+ rarr[arr == 0] = np.nan
+ expected = SparseDataFrame(rarr, index=index, columns=columns).fillna(
+ fill_value if fill_value is not None else np.nan)
+
+ # Assert frame is as expected
+ sdf_obj = sdf.astype(SparseDtype(object, fill_value))
+ tm.assert_sp_frame_equal(sdf_obj, expected)
+ tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense())
+
+ # Assert spmatrices equal
+ assert dict(sdf.to_coo().todok()) == dict(spm.todok())
+
+ # Ensure dtype is preserved if possible
+ res_dtype = object
+ tm.assert_contains_all(sdf.dtypes.apply(lambda dtype: dtype.subtype),
+ {np.dtype(res_dtype)})
+ assert sdf.to_coo().dtype == res_dtype
+
+
+@ignore_matrix_warning
+def test_from_scipy_correct_ordering(spmatrix):
+ # GH 16179
+ arr = np.arange(1, 5).reshape(2, 2)
+ try:
+ spm = spmatrix(arr)
+ assert spm.dtype == arr.dtype
+ except (TypeError, AssertionError):
+ # If conversion to sparse fails for this spmatrix type and arr.dtype,
+ # then the combination is not currently supported in SciPy, so we
+ # can just skip testing it thoroughly
+ return
+
+ sdf = SparseDataFrame(spm)
+ expected = SparseDataFrame(arr)
+ tm.assert_sp_frame_equal(sdf, expected)
+ tm.assert_frame_equal(sdf.to_dense(), expected.to_dense())
+
+
+@ignore_matrix_warning
+def test_from_scipy_fillna(spmatrix):
+ # GH 16112
+ arr = np.eye(3)
+ arr[1:, 0] = np.nan
+
+ try:
+ spm = spmatrix(arr)
+ assert spm.dtype == arr.dtype
+ except (TypeError, AssertionError):
+ # If conversion to sparse fails for this spmatrix type and arr.dtype,
+ # then the combination is not currently supported in SciPy, so we
+ # can just skip testing it thoroughly
+ return
+
+ sdf = SparseDataFrame(spm).fillna(-1.0)
+
+ # Returning frame should fill all nan values with -1.0
+ expected = SparseDataFrame({
+ 0: SparseSeries([1., -1, -1]),
+ 1: SparseSeries([np.nan, 1, np.nan]),
+ 2: SparseSeries([np.nan, np.nan, 1]),
+ }, default_fill_value=-1)
+
+ # fill_value is expected to be what .fillna() above was called with
+ # We don't use -1 as initial fill_value in expected SparseSeries
+ # construction because this way we obtain "compressed" SparseArrays,
+ # avoiding having to construct them ourselves
+ for col in expected:
+ expected[col].fill_value = -1
+
+ tm.assert_sp_frame_equal(sdf, expected)
+
+
+def test_index_names_multiple_nones():
+ # https://github.com/pandas-dev/pandas/pull/24092
+ sparse = pytest.importorskip("scipy.sparse")
+
+ s = (pd.Series(1, index=pd.MultiIndex.from_product([['A', 'B'], [0, 1]]))
+ .to_sparse())
+ result, _, _ = s.to_coo()
+ assert isinstance(result, sparse.coo_matrix)
+ result = result.toarray()
+ expected = np.ones((2, 2), dtype="int64")
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/series/__init__.py b/contrib/python/pandas/py2/pandas/tests/sparse/series/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/series/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/series/test_indexing.py b/contrib/python/pandas/py2/pandas/tests/sparse/series/test_indexing.py
new file mode 100644
index 00000000000..0f4235d7cc3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/series/test_indexing.py
@@ -0,0 +1,111 @@
+import numpy as np
+import pytest
+
+from pandas import Series, SparseSeries
+from pandas.util import testing as tm
+
+pytestmark = pytest.mark.skip("Wrong SparseBlock initialization (GH 17386)")
+
+
+@pytest.mark.parametrize('data', [
+ [1, 1, 2, 2, 3, 3, 4, 4, 0, 0],
+ [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, np.nan, np.nan],
+ [
+ 1.0, 1.0 + 1.0j,
+ 2.0 + 2.0j, 2.0,
+ 3.0, 3.0 + 3.0j,
+ 4.0 + 4.0j, 4.0,
+ np.nan, np.nan
+ ]
+])
+@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
+def test_where_with_numeric_data(data):
+ # GH 17386
+ lower_bound = 1.5
+
+ sparse = SparseSeries(data)
+ result = sparse.where(sparse > lower_bound)
+
+ dense = Series(data)
+ dense_expected = dense.where(dense > lower_bound)
+ sparse_expected = SparseSeries(dense_expected)
+
+ tm.assert_series_equal(result, dense_expected)
+ tm.assert_sp_series_equal(result, sparse_expected)
+
+
+@pytest.mark.parametrize('data', [
+ [1, 1, 2, 2, 3, 3, 4, 4, 0, 0],
+ [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, np.nan, np.nan],
+ [
+ 1.0, 1.0 + 1.0j,
+ 2.0 + 2.0j, 2.0,
+ 3.0, 3.0 + 3.0j,
+ 4.0 + 4.0j, 4.0,
+ np.nan, np.nan
+ ]
+])
+@pytest.mark.parametrize('other', [
+ True,
+ -100,
+ 0.1,
+ 100.0 + 100.0j
+])
+@pytest.mark.xfail(reason='Wrong SparseBlock initialization '
+ '(Segfault) '
+ '(GH 17386)')
+def test_where_with_numeric_data_and_other(data, other):
+ # GH 17386
+ lower_bound = 1.5
+
+ sparse = SparseSeries(data)
+ result = sparse.where(sparse > lower_bound, other)
+
+ dense = Series(data)
+ dense_expected = dense.where(dense > lower_bound, other)
+ sparse_expected = SparseSeries(dense_expected, fill_value=other)
+
+ tm.assert_series_equal(result, dense_expected)
+ tm.assert_sp_series_equal(result, sparse_expected)
+
+
+@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
+def test_where_with_bool_data():
+ # GH 17386
+ data = [False, False, True, True, False, False]
+ cond = True
+
+ sparse = SparseSeries(data)
+ result = sparse.where(sparse == cond)
+
+ dense = Series(data)
+ dense_expected = dense.where(dense == cond)
+ sparse_expected = SparseSeries(dense_expected)
+
+ tm.assert_series_equal(result, dense_expected)
+ tm.assert_sp_series_equal(result, sparse_expected)
+
+
+@pytest.mark.parametrize('other', [
+ True,
+ 0,
+ 0.1,
+ 100.0 + 100.0j
+])
+@pytest.mark.xfail(reason='Wrong SparseBlock initialization '
+ '(Segfault) '
+ '(GH 17386)')
+def test_where_with_bool_data_and_other(other):
+ # GH 17386
+ data = [False, False, True, True, False, False]
+ cond = True
+
+ sparse = SparseSeries(data)
+ result = sparse.where(sparse == cond, other)
+
+ dense = Series(data)
+ dense_expected = dense.where(dense == cond, other)
+ sparse_expected = SparseSeries(dense_expected, fill_value=other)
+
+ tm.assert_series_equal(result, dense_expected)
+ tm.assert_sp_series_equal(result, sparse_expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/series/test_series.py b/contrib/python/pandas/py2/pandas/tests/sparse/series/test_series.py
new file mode 100644
index 00000000000..7eed47d0de8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/series/test_series.py
@@ -0,0 +1,1523 @@
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime
+import operator
+
+import numpy as np
+from numpy import nan
+import pytest
+
+from pandas._libs.sparse import BlockIndex, IntIndex
+from pandas.compat import PY36, range
+from pandas.errors import PerformanceWarning
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+ DataFrame, Series, SparseDtype, SparseSeries, bdate_range, compat, isna)
+from pandas.core.reshape.util import cartesian_product
+import pandas.core.sparse.frame as spf
+from pandas.tests.series.test_api import SharedWithSparse
+import pandas.util.testing as tm
+
+from pandas.tseries.offsets import BDay
+
+
+def _test_data1():
+ # nan-based
+ arr = np.arange(20, dtype=float)
+ index = np.arange(20)
+ arr[:2] = nan
+ arr[5:10] = nan
+ arr[-3:] = nan
+
+ return arr, index
+
+
+def _test_data2():
+ # nan-based
+ arr = np.arange(15, dtype=float)
+ index = np.arange(15)
+ arr[7:12] = nan
+ arr[-1:] = nan
+ return arr, index
+
+
+def _test_data1_zero():
+ # zero-based
+ arr, index = _test_data1()
+ arr[np.isnan(arr)] = 0
+ return arr, index
+
+
+def _test_data2_zero():
+ # zero-based
+ arr, index = _test_data2()
+ arr[np.isnan(arr)] = 0
+ return arr, index
+
+
+class TestSparseSeries(SharedWithSparse):
+
+ series_klass = SparseSeries
+ # SharedWithSparse tests use generic, series_klass-agnostic assertion
+ _assert_series_equal = staticmethod(tm.assert_sp_series_equal)
+
+ def setup_method(self, method):
+ arr, index = _test_data1()
+
+ date_index = bdate_range('1/1/2011', periods=len(index))
+
+ self.bseries = SparseSeries(arr, index=index, kind='block',
+ name='bseries')
+ self.ts = self.bseries
+
+ self.btseries = SparseSeries(arr, index=date_index, kind='block')
+
+ self.iseries = SparseSeries(arr, index=index, kind='integer',
+ name='iseries')
+
+ arr, index = _test_data2()
+ self.bseries2 = SparseSeries(arr, index=index, kind='block')
+ self.iseries2 = SparseSeries(arr, index=index, kind='integer')
+
+ arr, index = _test_data1_zero()
+ self.zbseries = SparseSeries(arr, index=index, kind='block',
+ fill_value=0, name='zbseries')
+ self.ziseries = SparseSeries(arr, index=index, kind='integer',
+ fill_value=0)
+
+ arr, index = _test_data2_zero()
+ self.zbseries2 = SparseSeries(arr, index=index, kind='block',
+ fill_value=0)
+ self.ziseries2 = SparseSeries(arr, index=index, kind='integer',
+ fill_value=0)
+
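These fixtures pin down the two sparse index kinds the rest of the module leans on: kind='block' records runs of stored points as (location, length) pairs, kind='integer' records each stored position individually, and the z-prefixed variants swap the NaN fill for 0. The distinction in miniature:

    import numpy as np
    import pandas as pd

    arr = np.array([np.nan, np.nan, 1.0, 2.0, 3.0, np.nan])

    bs = pd.SparseSeries(arr, kind='block')     # one run -> BlockIndex
    its = pd.SparseSeries(arr, kind='integer')  # positions -> IntIndex

    print(type(bs.sp_index).__name__)   # BlockIndex
    print(type(its.sp_index).__name__)  # IntIndex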
+ def test_constructor_dict_input(self):
+ # gh-16905
+ constructor_dict = {1: 1.}
+ index = [0, 1, 2]
+
+ # Series with index passed in
+ series = pd.Series(constructor_dict)
+ expected = SparseSeries(series, index=index)
+
+ result = SparseSeries(constructor_dict, index=index)
+ tm.assert_sp_series_equal(result, expected)
+
+ # Series with index and dictionary with no index
+ expected = SparseSeries(series)
+
+ result = SparseSeries(constructor_dict)
+ tm.assert_sp_series_equal(result, expected)
+
+ def test_constructor_dict_order(self):
+ # GH19018
+ # initialization ordering: by insertion order if Python >= 3.6, else
+ # order by value
+ d = {'b': 1, 'a': 0, 'c': 2}
+ result = SparseSeries(d)
+ if PY36:
+ expected = SparseSeries([1, 0, 2], index=list('bac'))
+ else:
+ expected = SparseSeries([0, 1, 2], index=list('abc'))
+ tm.assert_sp_series_equal(result, expected)
+
+ def test_constructor_dtype(self):
+ arr = SparseSeries([np.nan, 1, 2, np.nan])
+ assert arr.dtype == SparseDtype(np.float64)
+ assert np.isnan(arr.fill_value)
+
+ arr = SparseSeries([np.nan, 1, 2, np.nan], fill_value=0)
+ assert arr.dtype == SparseDtype(np.float64, 0)
+ assert arr.fill_value == 0
+
+ arr = SparseSeries([0, 1, 2, 4], dtype=np.int64, fill_value=np.nan)
+ assert arr.dtype == SparseDtype(np.int64, np.nan)
+ assert np.isnan(arr.fill_value)
+
+ arr = SparseSeries([0, 1, 2, 4], dtype=np.int64)
+ assert arr.dtype == SparseDtype(np.int64, 0)
+ assert arr.fill_value == 0
+
+ arr = SparseSeries([0, 1, 2, 4], fill_value=0, dtype=np.int64)
+ assert arr.dtype == SparseDtype(np.int64, 0)
+ assert arr.fill_value == 0
+
+ def test_iteration_and_str(self):
+ [x for x in self.bseries]
+ str(self.bseries)
+
+ def test_construct_DataFrame_with_sp_series(self):
+ # it works!
+ df = DataFrame({'col': self.bseries})
+
+ # printing & access
+ df.iloc[:1]
+ df['col']
+ df.dtypes
+ str(df)
+
+ # blocking
+ expected = Series({'col': 'float64:sparse'})
+ result = df.ftypes
+ tm.assert_series_equal(expected, result)
+
+ def test_constructor_preserve_attr(self):
+ arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
+ assert arr.dtype == SparseDtype(np.int64)
+ assert arr.fill_value == 0
+
+ s = pd.SparseSeries(arr, name='x')
+ assert s.dtype == SparseDtype(np.int64)
+ assert s.fill_value == 0
+
+ def test_series_density(self):
+ # GH2803
+ ts = Series(np.random.randn(10))
+ ts[2:-2] = nan
+ sts = ts.to_sparse()
+ density = sts.density # don't die
+ assert density == 4 / 10.0
+
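density is the stored-point count divided by the total length, which is why the four survivors in the ten-element series above report exactly 0.4:

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(10.0))
    s[2:-2] = np.nan              # leaves 4 stored points out of 10

    print(s.to_sparse().density)  # 0.4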
+ def test_sparse_to_dense(self):
+ arr, index = _test_data1()
+ series = self.bseries.to_dense()
+ tm.assert_series_equal(series, Series(arr, name='bseries'))
+
+ series = self.iseries.to_dense()
+ tm.assert_series_equal(series, Series(arr, name='iseries'))
+
+ arr, index = _test_data1_zero()
+ series = self.zbseries.to_dense()
+ tm.assert_series_equal(series, Series(arr, name='zbseries'))
+
+ series = self.ziseries.to_dense()
+ tm.assert_series_equal(series, Series(arr))
+
+ def test_to_dense_fill_value(self):
+ s = pd.Series([1, np.nan, np.nan, 3, np.nan])
+ res = SparseSeries(s).to_dense()
+ tm.assert_series_equal(res, s)
+
+ res = SparseSeries(s, fill_value=0).to_dense()
+ tm.assert_series_equal(res, s)
+
+ s = pd.Series([1, np.nan, 0, 3, 0])
+ res = SparseSeries(s, fill_value=0).to_dense()
+ tm.assert_series_equal(res, s)
+
+ res = SparseSeries(s, fill_value=0).to_dense()
+ tm.assert_series_equal(res, s)
+
+ s = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan])
+ res = SparseSeries(s).to_dense()
+ tm.assert_series_equal(res, s)
+
+ s = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan])
+ res = SparseSeries(s, fill_value=0).to_dense()
+ tm.assert_series_equal(res, s)
+
+ def test_dense_to_sparse(self):
+ series = self.bseries.to_dense()
+ bseries = series.to_sparse(kind='block')
+ iseries = series.to_sparse(kind='integer')
+ tm.assert_sp_series_equal(bseries, self.bseries)
+ tm.assert_sp_series_equal(iseries, self.iseries, check_names=False)
+ assert iseries.name == self.bseries.name
+
+ assert len(series) == len(bseries)
+ assert len(series) == len(iseries)
+ assert series.shape == bseries.shape
+ assert series.shape == iseries.shape
+
+ # non-NaN fill value
+ series = self.zbseries.to_dense()
+ zbseries = series.to_sparse(kind='block', fill_value=0)
+ ziseries = series.to_sparse(kind='integer', fill_value=0)
+ tm.assert_sp_series_equal(zbseries, self.zbseries)
+ tm.assert_sp_series_equal(ziseries, self.ziseries, check_names=False)
+ assert ziseries.name == self.zbseries.name
+
+ assert len(series) == len(zbseries)
+ assert len(series) == len(ziseries)
+ assert series.shape == zbseries.shape
+ assert series.shape == ziseries.shape
+
+ def test_to_dense_preserve_name(self):
+ assert (self.bseries.name is not None)
+ result = self.bseries.to_dense()
+ assert result.name == self.bseries.name
+
+ def test_constructor(self):
+ # sanity checks on the fixtures built in setup_method
+ assert np.isnan(self.bseries.fill_value)
+ assert isinstance(self.bseries.sp_index, BlockIndex)
+ assert np.isnan(self.iseries.fill_value)
+ assert isinstance(self.iseries.sp_index, IntIndex)
+
+ assert self.zbseries.fill_value == 0
+ tm.assert_numpy_array_equal(self.zbseries.values.values,
+ self.bseries.to_dense().fillna(0).values)
+
+ # pass SparseSeries
+ def _check_const(sparse, name):
+ # use passed series name
+ result = SparseSeries(sparse)
+ tm.assert_sp_series_equal(result, sparse)
+ assert sparse.name == name
+ assert result.name == name
+
+ # use passed name
+ result = SparseSeries(sparse, name='x')
+ tm.assert_sp_series_equal(result, sparse, check_names=False)
+ assert result.name == 'x'
+
+ _check_const(self.bseries, 'bseries')
+ _check_const(self.iseries, 'iseries')
+ _check_const(self.zbseries, 'zbseries')
+
+ # Sparse time series works
+ date_index = bdate_range('1/1/2000', periods=len(self.bseries))
+ s5 = SparseSeries(self.bseries, index=date_index)
+ assert isinstance(s5, SparseSeries)
+
+ # pass Series
+ bseries2 = SparseSeries(self.bseries.to_dense())
+ tm.assert_numpy_array_equal(self.bseries.sp_values, bseries2.sp_values)
+
+ # pass dict?
+
+ # don't copy the data by default
+ values = np.ones(self.bseries.npoints)
+ sp = SparseSeries(values, sparse_index=self.bseries.sp_index)
+ sp.sp_values[:5] = 97
+ assert values[0] == 97
+
+ assert len(sp) == 20
+ assert sp.shape == (20, )
+
+ # but can make it copy!
+ sp = SparseSeries(values, sparse_index=self.bseries.sp_index,
+ copy=True)
+ sp.sp_values[:5] = 100
+ assert values[0] == 97
+
+ assert len(sp) == 20
+ assert sp.shape == (20, )
+
+ def test_constructor_scalar(self):
+ data = 5
+ sp = SparseSeries(data, np.arange(100))
+ sp = sp.reindex(np.arange(200))
+ assert (sp.loc[:99] == data).all()
+ assert isna(sp.loc[100:]).all()
+
+ data = np.nan
+ sp = SparseSeries(data, np.arange(100))
+ assert len(sp) == 100
+ assert sp.shape == (100, )
+
+ def test_constructor_ndarray(self):
+ pass
+
+ def test_constructor_nonnan(self):
+ arr = [0, 0, 0, nan, nan]
+ sp_series = SparseSeries(arr, fill_value=0)
+ tm.assert_numpy_array_equal(sp_series.values.values, np.array(arr))
+ assert len(sp_series) == 5
+ assert sp_series.shape == (5, )
+
+ def test_constructor_empty(self):
+ # see gh-9272
+ sp = SparseSeries()
+ assert len(sp.index) == 0
+ assert sp.shape == (0, )
+
+ def test_copy_astype(self):
+ cop = self.bseries.astype(np.float64)
+ assert cop is not self.bseries
+ assert cop.sp_index is self.bseries.sp_index
+ assert cop.dtype == SparseDtype(np.float64)
+
+ cop2 = self.iseries.copy()
+
+ tm.assert_sp_series_equal(cop, self.bseries)
+ tm.assert_sp_series_equal(cop2, self.iseries)
+
+ # test that data is copied
+ cop[:5] = 97
+ assert cop.sp_values[0] == 97
+ assert self.bseries.sp_values[0] != 97
+
+ # correct fill value
+ zbcop = self.zbseries.copy()
+ zicop = self.ziseries.copy()
+
+ tm.assert_sp_series_equal(zbcop, self.zbseries)
+ tm.assert_sp_series_equal(zicop, self.ziseries)
+
+ # no deep copy
+ view = self.bseries.copy(deep=False)
+ view.sp_values[:5] = 5
+ assert (self.bseries.sp_values[:5] == 5).all()
+
+ def test_shape(self):
+ # see gh-10452
+ assert self.bseries.shape == (20, )
+ assert self.btseries.shape == (20, )
+ assert self.iseries.shape == (20, )
+
+ assert self.bseries2.shape == (15, )
+ assert self.iseries2.shape == (15, )
+
+ assert self.zbseries2.shape == (15, )
+ assert self.ziseries2.shape == (15, )
+
+ def test_astype(self):
+ result = self.bseries.astype(SparseDtype(np.int64, 0))
+ expected = (self.bseries.to_dense()
+ .fillna(0)
+ .astype(np.int64)
+ .to_sparse(fill_value=0))
+ tm.assert_sp_series_equal(result, expected)
+
+ def test_astype_all(self):
+ orig = pd.Series(np.array([1, 2, 3]))
+ s = SparseSeries(orig)
+
+ types = [np.float64, np.float32, np.int64,
+ np.int32, np.int16, np.int8]
+ for typ in types:
+ dtype = SparseDtype(typ)
+ res = s.astype(dtype)
+ assert res.dtype == dtype
+ tm.assert_series_equal(res.to_dense(), orig.astype(typ))
+
+ def test_kind(self):
+ assert self.bseries.kind == 'block'
+ assert self.iseries.kind == 'integer'
+
+ def test_to_frame(self):
+ # GH 9850
+ s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name='x')
+ exp = pd.SparseDataFrame({'x': [1, 2, 0, nan, 4, nan, 0]})
+ tm.assert_sp_frame_equal(s.to_frame(), exp)
+
+ exp = pd.SparseDataFrame({'y': [1, 2, 0, nan, 4, nan, 0]})
+ tm.assert_sp_frame_equal(s.to_frame(name='y'), exp)
+
+ s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name='x', fill_value=0)
+ exp = pd.SparseDataFrame({'x': [1, 2, 0, nan, 4, nan, 0]},
+ default_fill_value=0)
+
+ tm.assert_sp_frame_equal(s.to_frame(), exp)
+ exp = pd.DataFrame({'y': [1, 2, 0, nan, 4, nan, 0]})
+ tm.assert_frame_equal(s.to_frame(name='y').to_dense(), exp)
+
+ def test_pickle(self):
+ def _test_roundtrip(series):
+ unpickled = tm.round_trip_pickle(series)
+ tm.assert_sp_series_equal(series, unpickled)
+ tm.assert_series_equal(series.to_dense(), unpickled.to_dense())
+
+ self._check_all(_test_roundtrip)
+
+ def _check_all(self, check_func):
+ check_func(self.bseries)
+ check_func(self.iseries)
+ check_func(self.zbseries)
+ check_func(self.ziseries)
+
+ def test_getitem(self):
+ def _check_getitem(sp, dense):
+ for idx, val in compat.iteritems(dense):
+ tm.assert_almost_equal(val, sp[idx])
+
+ for i in range(len(dense)):
+ tm.assert_almost_equal(sp[i], dense[i])
+ # j = np.float64(i)
+ # assert_almost_equal(sp[j], dense[j])
+
+ # API change 1/6/2012
+ # negative getitem works
+ # for i in xrange(len(dense)):
+ # assert_almost_equal(sp[-i], dense[-i])
+
+ _check_getitem(self.bseries, self.bseries.to_dense())
+ _check_getitem(self.btseries, self.btseries.to_dense())
+
+ _check_getitem(self.zbseries, self.zbseries.to_dense())
+ _check_getitem(self.iseries, self.iseries.to_dense())
+ _check_getitem(self.ziseries, self.ziseries.to_dense())
+
+ # exception handling
+ pytest.raises(Exception, self.bseries.__getitem__,
+ len(self.bseries) + 1)
+
+ # index not contained
+ pytest.raises(Exception, self.btseries.__getitem__,
+ self.btseries.index[-1] + BDay())
+
+ def test_get_get_value(self):
+ tm.assert_almost_equal(self.bseries.get(10), self.bseries[10])
+ assert self.bseries.get(len(self.bseries) + 1) is None
+
+ dt = self.btseries.index[10]
+ result = self.btseries.get(dt)
+ expected = self.btseries.to_dense()[dt]
+ tm.assert_almost_equal(result, expected)
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ tm.assert_almost_equal(
+ self.bseries.get_value(10), self.bseries[10])
+
+ def test_set_value(self):
+
+ idx = self.btseries.index[7]
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ self.btseries.set_value(idx, 0)
+ assert self.btseries[idx] == 0
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ self.iseries.set_value('foobar', 0)
+ assert self.iseries.index[-1] == 'foobar'
+ assert self.iseries['foobar'] == 0
+
+ def test_getitem_slice(self):
+ idx = self.bseries.index
+ res = self.bseries[::2]
+ assert isinstance(res, SparseSeries)
+
+ expected = self.bseries.reindex(idx[::2])
+ tm.assert_sp_series_equal(res, expected)
+
+ res = self.bseries[:5]
+ assert isinstance(res, SparseSeries)
+ tm.assert_sp_series_equal(res, self.bseries.reindex(idx[:5]))
+
+ res = self.bseries[5:]
+ tm.assert_sp_series_equal(res, self.bseries.reindex(idx[5:]))
+
+ # negative indices
+ res = self.bseries[:-3]
+ tm.assert_sp_series_equal(res, self.bseries.reindex(idx[:-3]))
+
+ def test_take(self):
+ def _compare_with_dense(sp):
+ dense = sp.to_dense()
+
+ def _compare(idx):
+ dense_result = dense.take(idx).values
+ sparse_result = sp.take(idx)
+ assert isinstance(sparse_result, SparseSeries)
+ tm.assert_almost_equal(dense_result,
+ sparse_result.values.values)
+
+ _compare([1., 2., 3., 4., 5., 0.])
+ _compare([7, 2, 9, 0, 4])
+ _compare([3, 6, 3, 4, 7])
+
+ self._check_all(_compare_with_dense)
+
+ pytest.raises(Exception, self.bseries.take,
+ [0, len(self.bseries) + 1])
+
+ # Corner case
+ # XXX: changed test. Why was this considered a corner case?
+ sp = SparseSeries(np.ones(10) * nan)
+ exp = pd.Series(np.repeat(nan, 5))
+ tm.assert_series_equal(sp.take([0, 1, 2, 3, 4]), exp.to_sparse())
+
+ with tm.assert_produces_warning(FutureWarning):
+ sp.take([1, 5], convert=True)
+
+ with tm.assert_produces_warning(FutureWarning):
+ sp.take([1, 5], convert=False)
+
+ def test_numpy_take(self):
+ sp = SparseSeries([1.0, 2.0, 3.0])
+ indices = [1, 2]
+
+ tm.assert_series_equal(np.take(sp, indices, axis=0).to_dense(),
+ np.take(sp.to_dense(), indices, axis=0))
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.take(sp, indices, out=np.empty(sp.shape))
+
+ msg = "the 'mode' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.take(sp, indices, out=None, mode='clip')
+
+ def test_setitem(self):
+ self.bseries[5] = 7.
+ assert self.bseries[5] == 7.
+
+ def test_setslice(self):
+ self.bseries[5:10] = 7.
+ tm.assert_series_equal(self.bseries[5:10].to_dense(),
+ Series(7., index=range(5, 10),
+ name=self.bseries.name))
+
+ def test_operators(self):
+
+ def _check_op(a, b, op):
+ sp_result = op(a, b)
+ adense = a.to_dense() if isinstance(a, SparseSeries) else a
+ bdense = b.to_dense() if isinstance(b, SparseSeries) else b
+ dense_result = op(adense, bdense)
+ tm.assert_almost_equal(sp_result.to_dense(), dense_result)
+
+ def check(a, b):
+ _check_op(a, b, operator.add)
+ _check_op(a, b, operator.sub)
+ _check_op(a, b, operator.truediv)
+ _check_op(a, b, operator.floordiv)
+ _check_op(a, b, operator.mul)
+
+ _check_op(a, b, lambda x, y: operator.add(y, x))
+ _check_op(a, b, lambda x, y: operator.sub(y, x))
+ _check_op(a, b, lambda x, y: operator.truediv(y, x))
+ _check_op(a, b, lambda x, y: operator.floordiv(y, x))
+ _check_op(a, b, lambda x, y: operator.mul(y, x))
+
+ # NaN ** 0 = 1 in C?
+ # _check_op(a, b, operator.pow)
+ # _check_op(a, b, lambda x, y: operator.pow(y, x))
+
+ check(self.bseries, self.bseries)
+ check(self.iseries, self.iseries)
+ check(self.bseries, self.iseries)
+
+ check(self.bseries, self.bseries2)
+ check(self.bseries, self.iseries2)
+ check(self.iseries, self.iseries2)
+
+ # scalar value
+ check(self.bseries, 5)
+
+ # zero-based
+ check(self.zbseries, self.zbseries * 2)
+ check(self.zbseries, self.zbseries2)
+ check(self.ziseries, self.ziseries2)
+
+ # with dense
+ result = self.bseries + self.bseries.to_dense()
+ tm.assert_sp_series_equal(result, self.bseries + self.bseries)
+
+ def test_binary_operators(self):
+
+ # skipping for now
+ pytest.skip("skipping sparse binary operators test")
+
+ def _check_inplace_op(iop, op):
+ tmp = self.bseries.copy()
+
+ expected = op(tmp, self.bseries)
+ iop(tmp, self.bseries)
+ tm.assert_sp_series_equal(tmp, expected)
+
+ inplace_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'pow']
+ for op in inplace_ops:
+ _check_inplace_op(getattr(operator, "i%s" % op),
+ getattr(operator, op))
+
+ @pytest.mark.parametrize("values, op, fill_value", [
+ ([True, False, False, True], operator.invert, True),
+ ([True, False, False, True], operator.invert, False),
+ ([0, 1, 2, 3], operator.pos, 0),
+ ([0, 1, 2, 3], operator.neg, 0),
+ ([0, np.nan, 2, 3], operator.pos, np.nan),
+ ([0, np.nan, 2, 3], operator.neg, np.nan),
+ ])
+ def test_unary_operators(self, values, op, fill_value):
+ # https://github.com/pandas-dev/pandas/issues/22835
+ values = np.asarray(values)
+ if op is operator.invert:
+ new_fill_value = not fill_value
+ else:
+ new_fill_value = op(fill_value)
+ s = SparseSeries(values,
+ fill_value=fill_value,
+ index=['a', 'b', 'c', 'd'],
+ name='name')
+ result = op(s)
+ expected = SparseSeries(op(values),
+ fill_value=new_fill_value,
+ index=['a', 'b', 'c', 'd'],
+ name='name')
+ tm.assert_sp_series_equal(result, expected)
+
+ def test_abs(self):
+ s = SparseSeries([1, 2, -3], name='x')
+ expected = SparseSeries([1, 2, 3], name='x')
+ result = s.abs()
+ tm.assert_sp_series_equal(result, expected)
+ assert result.name == 'x'
+
+ result = abs(s)
+ tm.assert_sp_series_equal(result, expected)
+ assert result.name == 'x'
+
+ result = np.abs(s)
+ tm.assert_sp_series_equal(result, expected)
+ assert result.name == 'x'
+
+ s = SparseSeries([1, -2, 2, -3], fill_value=-2, name='x')
+ expected = SparseSeries([1, 2, 3], sparse_index=s.sp_index,
+ fill_value=2, name='x')
+ result = s.abs()
+ tm.assert_sp_series_equal(result, expected)
+ assert result.name == 'x'
+
+ result = abs(s)
+ tm.assert_sp_series_equal(result, expected)
+ assert result.name == 'x'
+
+ result = np.abs(s)
+ tm.assert_sp_series_equal(result, expected)
+ assert result.name == 'x'
+
+ def test_reindex(self):
+ def _compare_with_series(sps, new_index):
+ spsre = sps.reindex(new_index)
+
+ series = sps.to_dense()
+ seriesre = series.reindex(new_index)
+ seriesre = seriesre.to_sparse(fill_value=sps.fill_value)
+
+ tm.assert_sp_series_equal(spsre, seriesre)
+ tm.assert_series_equal(spsre.to_dense(), seriesre.to_dense())
+
+ _compare_with_series(self.bseries, self.bseries.index[::2])
+ _compare_with_series(self.bseries, list(self.bseries.index[::2]))
+ _compare_with_series(self.bseries, self.bseries.index[:10])
+ _compare_with_series(self.bseries, self.bseries.index[5:])
+
+ _compare_with_series(self.zbseries, self.zbseries.index[::2])
+ _compare_with_series(self.zbseries, self.zbseries.index[:10])
+ _compare_with_series(self.zbseries, self.zbseries.index[5:])
+
+ # special cases
+ same_index = self.bseries.reindex(self.bseries.index)
+ tm.assert_sp_series_equal(self.bseries, same_index)
+ assert same_index is not self.bseries
+
+ # corner cases
+ sp = SparseSeries([], index=[])
+ # TODO: sp_zero is not used anywhere...remove?
+ sp_zero = SparseSeries([], index=[], fill_value=0) # noqa
+ _compare_with_series(sp, np.arange(10))
+
+ # with copy=False
+ reindexed = self.bseries.reindex(self.bseries.index, copy=True)
+ reindexed.sp_values[:] = 1.
+ assert (self.bseries.sp_values != 1.).all()
+
+ reindexed = self.bseries.reindex(self.bseries.index, copy=False)
+ reindexed.sp_values[:] = 1.
+ tm.assert_numpy_array_equal(self.bseries.sp_values, np.repeat(1., 10))
+
+ def test_sparse_reindex(self):
+ length = 10
+
+ def _check(values, index1, index2, fill_value):
+ first_series = SparseSeries(values, sparse_index=index1,
+ fill_value=fill_value)
+ reindexed = first_series.sparse_reindex(index2)
+ assert reindexed.sp_index is index2
+
+ int_indices1 = index1.to_int_index().indices
+ int_indices2 = index2.to_int_index().indices
+
+ expected = Series(values, index=int_indices1)
+ expected = expected.reindex(int_indices2).fillna(fill_value)
+ tm.assert_almost_equal(expected.values, reindexed.sp_values)
+
+ # make sure level argument asserts
+ # TODO: expected is not used anywhere...remove?
+ expected = expected.reindex(int_indices2).fillna(fill_value) # noqa
+
+ def _check_with_fill_value(values, first, second, fill_value=nan):
+ i_index1 = IntIndex(length, first)
+ i_index2 = IntIndex(length, second)
+
+ b_index1 = i_index1.to_block_index()
+ b_index2 = i_index2.to_block_index()
+
+ _check(values, i_index1, i_index2, fill_value)
+ _check(values, b_index1, b_index2, fill_value)
+
+ def _check_all(values, first, second):
+ _check_with_fill_value(values, first, second, fill_value=nan)
+ _check_with_fill_value(values, first, second, fill_value=0)
+
+ index1 = [2, 4, 5, 6, 8, 9]
+ values1 = np.arange(6.)
+
+ _check_all(values1, index1, [2, 4, 5])
+ _check_all(values1, index1, [2, 3, 4, 5, 6, 7, 8, 9])
+ _check_all(values1, index1, [0, 1])
+ _check_all(values1, index1, [0, 1, 7, 8, 9])
+ _check_all(values1, index1, [])
+
+ first_series = SparseSeries(values1,
+ sparse_index=IntIndex(length, index1),
+ fill_value=nan)
+ with pytest.raises(TypeError,
+ match='new index must be a SparseIndex'):
+ first_series.sparse_reindex(0)
+
+ def test_repr(self):
+ # TODO: These aren't used
+ bsrepr = repr(self.bseries) # noqa
+ isrepr = repr(self.iseries) # noqa
+
+ def test_iter(self):
+ pass
+
+ def test_truncate(self):
+ pass
+
+ def test_fillna(self):
+ pass
+
+ def test_groupby(self):
+ pass
+
+ def test_reductions(self):
+ def _compare_with_dense(obj, op):
+ sparse_result = getattr(obj, op)()
+ series = obj.to_dense()
+ dense_result = getattr(series, op)()
+ assert sparse_result == dense_result
+
+ to_compare = ['count', 'sum', 'mean', 'std', 'var', 'skew']
+
+ def _compare_all(obj):
+ for op in to_compare:
+ _compare_with_dense(obj, op)
+
+ _compare_all(self.bseries)
+
+ self.bseries.sp_values[5:10] = np.NaN
+ _compare_all(self.bseries)
+
+ _compare_all(self.zbseries)
+ self.zbseries.sp_values[5:10] = np.NaN
+ _compare_all(self.zbseries)
+
+ series = self.zbseries.copy()
+ series.fill_value = 2
+ _compare_all(series)
+
+ nonna = Series(np.random.randn(20)).to_sparse()
+ _compare_all(nonna)
+
+ nonna2 = Series(np.random.randn(20)).to_sparse(fill_value=0)
+ _compare_all(nonna2)
+
+ def test_dropna(self):
+ sp = SparseSeries([0, 0, 0, nan, nan, 5, 6], fill_value=0)
+
+ sp_valid = sp.dropna()
+
+ expected = sp.to_dense().dropna()
+ expected = expected[expected != 0]
+ exp_arr = pd.SparseArray(expected.values, fill_value=0, kind='block')
+ tm.assert_sp_array_equal(sp_valid.values, exp_arr)
+ tm.assert_index_equal(sp_valid.index, expected.index)
+ assert len(sp_valid.sp_values) == 2
+
+ result = self.bseries.dropna()
+ expected = self.bseries.to_dense().dropna()
+ assert not isinstance(result, SparseSeries)
+ tm.assert_series_equal(result, expected)
+
+ def test_homogenize(self):
+ def _check_matches(indices, expected):
+ data = {i: SparseSeries(idx.to_int_index().indices,
+ sparse_index=idx, fill_value=np.nan)
+ for i, idx in enumerate(indices)}
+
+ # homogenized is only valid with NaN fill values
+ homogenized = spf.homogenize(data)
+
+ for k, v in compat.iteritems(homogenized):
+ assert (v.sp_index.equals(expected))
+
+ indices1 = [BlockIndex(10, [2], [7]), BlockIndex(10, [1, 6], [3, 4]),
+ BlockIndex(10, [0], [10])]
+ expected1 = BlockIndex(10, [2, 6], [2, 3])
+ _check_matches(indices1, expected1)
+
+ indices2 = [BlockIndex(10, [2], [7]), BlockIndex(10, [2], [7])]
+ expected2 = indices2[0]
+ _check_matches(indices2, expected2)
+
+ # must have NaN fill value
+ data = {'a': SparseSeries(np.arange(7), sparse_index=expected2,
+ fill_value=0)}
+ with pytest.raises(TypeError, match="NaN fill value"):
+ spf.homogenize(data)
+
+ def test_fill_value_corner(self):
+ cop = self.zbseries.copy()
+ cop.fill_value = 0
+ result = self.bseries / cop
+
+ assert np.isnan(result.fill_value)
+
+ cop2 = self.zbseries.copy()
+ cop2.fill_value = 1
+ result = cop2 / cop
+ # 1 / 0 is inf
+ assert np.isinf(result.fill_value)
+
+ def test_fill_value_when_combine_const(self):
+ # GH12723
+ s = SparseSeries([0, 1, np.nan, 3, 4, 5], index=np.arange(6))
+
+ exp = s.fillna(0).add(2)
+ res = s.add(2, fill_value=0)
+ tm.assert_series_equal(res, exp)
+
+ def test_shift(self):
+ series = SparseSeries([nan, 1., 2., 3., nan, nan], index=np.arange(6))
+
+ shifted = series.shift(0)
+ # assert shifted is not series
+ tm.assert_sp_series_equal(shifted, series)
+
+ f = lambda s: s.shift(1)
+ _dense_series_compare(series, f)
+
+ f = lambda s: s.shift(-2)
+ _dense_series_compare(series, f)
+
+ series = SparseSeries([nan, 1., 2., 3., nan, nan],
+ index=bdate_range('1/1/2000', periods=6))
+ f = lambda s: s.shift(2, freq='B')
+ _dense_series_compare(series, f)
+
+ f = lambda s: s.shift(2, freq=BDay())
+ _dense_series_compare(series, f)
+
+ def test_shift_nan(self):
+ # GH 12908
+ orig = pd.Series([np.nan, 2, np.nan, 4, 0, np.nan, 0])
+ sparse = orig.to_sparse()
+
+ tm.assert_sp_series_equal(sparse.shift(0), orig.shift(0).to_sparse(),
+ check_kind=False)
+ tm.assert_sp_series_equal(sparse.shift(1), orig.shift(1).to_sparse(),
+ check_kind=False)
+ tm.assert_sp_series_equal(sparse.shift(2), orig.shift(2).to_sparse(),
+ check_kind=False)
+ tm.assert_sp_series_equal(sparse.shift(3), orig.shift(3).to_sparse(),
+ check_kind=False)
+
+ tm.assert_sp_series_equal(sparse.shift(-1), orig.shift(-1).to_sparse())
+ tm.assert_sp_series_equal(sparse.shift(-2), orig.shift(-2).to_sparse())
+ tm.assert_sp_series_equal(sparse.shift(-3), orig.shift(-3).to_sparse())
+ tm.assert_sp_series_equal(sparse.shift(-4), orig.shift(-4).to_sparse())
+
+ sparse = orig.to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(
+ sparse.shift(0),
+ orig.shift(0).to_sparse(fill_value=sparse.fill_value)
+ )
+ tm.assert_sp_series_equal(sparse.shift(1),
+ orig.shift(1).to_sparse(fill_value=0),
+ check_kind=False)
+ tm.assert_sp_series_equal(sparse.shift(2),
+ orig.shift(2).to_sparse(fill_value=0),
+ check_kind=False)
+ tm.assert_sp_series_equal(sparse.shift(3),
+ orig.shift(3).to_sparse(fill_value=0),
+ check_kind=False)
+
+ tm.assert_sp_series_equal(sparse.shift(-1),
+ orig.shift(-1).to_sparse(fill_value=0),
+ check_kind=False)
+ tm.assert_sp_series_equal(sparse.shift(-2),
+ orig.shift(-2).to_sparse(fill_value=0),
+ check_kind=False)
+ tm.assert_sp_series_equal(sparse.shift(-3),
+ orig.shift(-3).to_sparse(fill_value=0),
+ check_kind=False)
+ tm.assert_sp_series_equal(sparse.shift(-4),
+ orig.shift(-4).to_sparse(fill_value=0),
+ check_kind=False)
+
+ def test_shift_dtype(self):
+ # GH 12908
+ orig = pd.Series([1, 2, 3, 4], dtype=np.int64)
+
+ sparse = orig.to_sparse()
+ tm.assert_sp_series_equal(sparse.shift(0), orig.shift(0).to_sparse())
+
+ sparse = orig.to_sparse(fill_value=np.nan)
+ tm.assert_sp_series_equal(sparse.shift(0),
+ orig.shift(0).to_sparse(fill_value=np.nan))
+ # shift(1) or more span changes dtype to float64
+ # XXX: SparseSeries doesn't need to shift dtype here.
+ # Do we want to astype in shift, for backwards compat?
+ # If not, document it.
+ tm.assert_sp_series_equal(sparse.shift(1).astype('f8'),
+ orig.shift(1).to_sparse(kind='integer'))
+ tm.assert_sp_series_equal(sparse.shift(2).astype('f8'),
+ orig.shift(2).to_sparse(kind='integer'))
+ tm.assert_sp_series_equal(sparse.shift(3).astype('f8'),
+ orig.shift(3).to_sparse(kind='integer'))
+
+ tm.assert_sp_series_equal(sparse.shift(-1).astype('f8'),
+ orig.shift(-1).to_sparse(),
+ check_kind=False)
+ tm.assert_sp_series_equal(sparse.shift(-2).astype('f8'),
+ orig.shift(-2).to_sparse(),
+ check_kind=False)
+ tm.assert_sp_series_equal(sparse.shift(-3).astype('f8'),
+ orig.shift(-3).to_sparse(),
+ check_kind=False)
+ tm.assert_sp_series_equal(sparse.shift(-4).astype('f8'),
+ orig.shift(-4).to_sparse(),
+ check_kind=False)
+
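The .astype('f8') calls above compensate for the asymmetry the XXX comment flags: a dense int64 Series upcasts to float64 as soon as shift introduces NaN, while the sparse shift can keep the integer subtype because the new NaNs live in the fill. A sketch of the dense half, which is the settled behaviour:

    import numpy as np
    import pandas as pd

    orig = pd.Series([1, 2, 3, 4], dtype=np.int64)

    # Dense shift introduces NaN and therefore lands on float64 ...
    assert orig.shift(1).dtype == np.float64

    # ... so the sparse results above are cast with .astype('f8')
    # before being compared against orig.shift(n).to_sparse().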
+ @pytest.mark.parametrize("fill_value", [
+ 0,
+ 1,
+ np.nan
+ ])
+ @pytest.mark.parametrize("periods", [0, 1, 2, 3, -1, -2, -3, -4])
+ def test_shift_dtype_fill_value(self, fill_value, periods):
+ # GH 12908
+ orig = pd.Series([1, 0, 0, 4], dtype=np.dtype('int64'))
+
+ sparse = orig.to_sparse(fill_value=fill_value)
+
+ result = sparse.shift(periods)
+ expected = orig.shift(periods).to_sparse(fill_value=fill_value)
+
+ tm.assert_sp_series_equal(result, expected,
+ check_kind=False,
+ consolidate_block_indices=True)
+
+ def test_combine_first(self):
+ s = self.bseries
+
+ result = s[::2].combine_first(s)
+ result2 = s[::2].combine_first(s.to_dense())
+
+ expected = s[::2].to_dense().combine_first(s.to_dense())
+ expected = expected.to_sparse(fill_value=s.fill_value)
+
+ tm.assert_sp_series_equal(result, result2)
+ tm.assert_sp_series_equal(result, expected)
+
+ @pytest.mark.parametrize('deep', [True, False])
+ @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None])
+ def test_memory_usage_deep(self, deep, fill_value):
+ values = [1.0] + [fill_value] * 20
+ sparse_series = SparseSeries(values, fill_value=fill_value)
+ dense_series = Series(values)
+ sparse_usage = sparse_series.memory_usage(deep=deep)
+ dense_usage = dense_series.memory_usage(deep=deep)
+
+ assert sparse_usage < dense_usage
+
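test_memory_usage_deep states the payoff of the whole module: only sp_values and the sparse index are kept, so a series that is mostly fill undercuts its dense twin for any fill_value. Roughly:

    import numpy as np
    import pandas as pd

    values = [1.0] + [np.nan] * 20
    dense = pd.Series(values)
    sparse = pd.SparseSeries(values)   # NaN fill -> one stored point

    assert sparse.memory_usage(deep=True) < dense.memory_usage(deep=True)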
+
+class TestSparseHandlingMultiIndexes(object):
+
+ def setup_method(self, method):
+ miindex = pd.MultiIndex.from_product(
+ [["x", "y"], ["10", "20"]], names=['row-foo', 'row-bar'])
+ micol = pd.MultiIndex.from_product(
+ [['a', 'b', 'c'], ["1", "2"]], names=['col-foo', 'col-bar'])
+ dense_multiindex_frame = pd.DataFrame(
+ index=miindex, columns=micol).sort_index().sort_index(axis=1)
+ self.dense_multiindex_frame = dense_multiindex_frame.fillna(value=3.14)
+
+ def test_to_sparse_preserve_multiindex_names_columns(self):
+ sparse_multiindex_frame = self.dense_multiindex_frame.to_sparse()
+ sparse_multiindex_frame = sparse_multiindex_frame.copy()
+ tm.assert_index_equal(sparse_multiindex_frame.columns,
+ self.dense_multiindex_frame.columns)
+
+ def test_round_trip_preserve_multiindex_names(self):
+ sparse_multiindex_frame = self.dense_multiindex_frame.to_sparse()
+ round_trip_multiindex_frame = sparse_multiindex_frame.to_dense()
+ tm.assert_frame_equal(self.dense_multiindex_frame,
+ round_trip_multiindex_frame,
+ check_column_type=True,
+ check_names=True)
+
+
+@td.skip_if_no_scipy
+@pytest.mark.filterwarnings(
+ "ignore:the matrix subclass:PendingDeprecationWarning"
+)
+class TestSparseSeriesScipyInteraction(object):
+ # Issue 8048: add SparseSeries coo methods
+
+ def setup_method(self, method):
+ import scipy.sparse
+ # SparseSeries inputs used in tests, the tests rely on the order
+ self.sparse_series = []
+ s = pd.Series([3.0, nan, 1.0, 2.0, nan, nan])
+ s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
+ (1, 2, 'a', 1),
+ (1, 1, 'b', 0),
+ (1, 1, 'b', 1),
+ (2, 1, 'b', 0),
+ (2, 1, 'b', 1)],
+ names=['A', 'B', 'C', 'D'])
+ self.sparse_series.append(s.to_sparse())
+
+ ss = self.sparse_series[0].copy()
+ ss.index.names = [3, 0, 1, 2]
+ self.sparse_series.append(ss)
+
+ ss = pd.Series([
+ nan
+ ] * 12, index=cartesian_product((range(3), range(4)))).to_sparse()
+ for k, v in zip([(0, 0), (1, 2), (1, 3)], [3.0, 1.0, 2.0]):
+ ss[k] = v
+ self.sparse_series.append(ss)
+
+ # results used in tests
+ self.coo_matrices = []
+ self.coo_matrices.append(scipy.sparse.coo_matrix(
+ ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 2, 3])), shape=(3, 4)))
+ self.coo_matrices.append(scipy.sparse.coo_matrix(
+ ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)))
+ self.coo_matrices.append(scipy.sparse.coo_matrix(
+ ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 0, 1])), shape=(3, 2)))
+ self.ils = [[(1, 2), (1, 1), (2, 1)], [(1, 1), (1, 2), (2, 1)],
+ [(1, 2, 'a'), (1, 1, 'b'), (2, 1, 'b')]]
+ self.jls = [[('a', 0), ('a', 1), ('b', 0), ('b', 1)], [0, 1]]
+
+ def test_to_coo_text_names_integer_row_levels_nosort(self):
+ ss = self.sparse_series[0]
+ kwargs = {'row_levels': [0, 1], 'column_levels': [2, 3]}
+ result = (self.coo_matrices[0], self.ils[0], self.jls[0])
+ self._run_test(ss, kwargs, result)
+
+ def test_to_coo_text_names_integer_row_levels_sort(self):
+ ss = self.sparse_series[0]
+ kwargs = {'row_levels': [0, 1],
+ 'column_levels': [2, 3],
+ 'sort_labels': True}
+ result = (self.coo_matrices[1], self.ils[1], self.jls[0])
+ self._run_test(ss, kwargs, result)
+
+ def test_to_coo_text_names_text_row_levels_nosort_col_level_single(self):
+ ss = self.sparse_series[0]
+ kwargs = {'row_levels': ['A', 'B', 'C'],
+ 'column_levels': ['D'],
+ 'sort_labels': False}
+ result = (self.coo_matrices[2], self.ils[2], self.jls[1])
+ self._run_test(ss, kwargs, result)
+
+ def test_to_coo_integer_names_integer_row_levels_nosort(self):
+ ss = self.sparse_series[1]
+ kwargs = {'row_levels': [3, 0], 'column_levels': [1, 2]}
+ result = (self.coo_matrices[0], self.ils[0], self.jls[0])
+ self._run_test(ss, kwargs, result)
+
+ def test_to_coo_text_names_text_row_levels_nosort(self):
+ ss = self.sparse_series[0]
+ kwargs = {'row_levels': ['A', 'B'], 'column_levels': ['C', 'D']}
+ result = (self.coo_matrices[0], self.ils[0], self.jls[0])
+ self._run_test(ss, kwargs, result)
+
+ def test_to_coo_bad_partition_nonnull_intersection(self):
+ ss = self.sparse_series[0]
+ pytest.raises(ValueError, ss.to_coo, ['A', 'B', 'C'], ['C', 'D'])
+
+ def test_to_coo_bad_partition_small_union(self):
+ ss = self.sparse_series[0]
+ pytest.raises(ValueError, ss.to_coo, ['A'], ['C', 'D'])
+
+ def test_to_coo_nlevels_less_than_two(self):
+ ss = self.sparse_series[0]
+ ss.index = np.arange(len(ss.index))
+ pytest.raises(ValueError, ss.to_coo)
+
+ def test_to_coo_bad_ilevel(self):
+ ss = self.sparse_series[0]
+ pytest.raises(KeyError, ss.to_coo, ['A', 'B'], ['C', 'D', 'E'])
+
+ def test_to_coo_duplicate_index_entries(self):
+ ss = pd.concat([self.sparse_series[0],
+ self.sparse_series[0]]).to_sparse()
+ pytest.raises(ValueError, ss.to_coo, ['A', 'B'], ['C', 'D'])
+
+ def test_from_coo_dense_index(self):
+ ss = SparseSeries.from_coo(self.coo_matrices[0], dense_index=True)
+ check = self.sparse_series[2]
+ tm.assert_sp_series_equal(ss, check)
+
+ def test_from_coo_nodense_index(self):
+ ss = SparseSeries.from_coo(self.coo_matrices[0], dense_index=False)
+ check = self.sparse_series[2]
+ check = check.dropna().to_sparse()
+ tm.assert_sp_series_equal(ss, check)
+
+ def test_from_coo_long_repr(self):
+ # GH 13114
+ # test it doesn't raise error. Formatting is tested in test_format
+ import scipy.sparse
+
+ sparse = SparseSeries.from_coo(scipy.sparse.rand(350, 18))
+ repr(sparse)
+
+ def _run_test(self, ss, kwargs, check):
+ results = ss.to_coo(**kwargs)
+ self._check_results_to_coo(results, check)
+ # for every test, also test symmetry property (transpose), switch
+ # row_levels and column_levels
+ d = kwargs.copy()
+ d['row_levels'] = kwargs['column_levels']
+ d['column_levels'] = kwargs['row_levels']
+ results = ss.to_coo(**d)
+ results = (results[0].T, results[2], results[1])
+ self._check_results_to_coo(results, check)
+
+ def _check_results_to_coo(self, results, check):
+ (A, il, jl) = results
+ (A_result, il_result, jl_result) = check
+ # convert to dense and compare
+ tm.assert_numpy_array_equal(A.todense(), A_result.todense())
+ # or compare directly as difference of sparse
+ # assert(abs(A - A_result).max() < 1e-12) # max is failing in python
+ # 2.6
+ assert il == il_result
+ assert jl == jl_result
+
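The transpose step in _run_test works because to_coo is symmetric in its level split: swapping row_levels and column_levels transposes the matrix and exchanges the two label lists. For orientation, a sketch of the call the fixtures drive (scipy required; the level names 'R' and 'C' are illustrative):

    import numpy as np
    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [(1, 'a'), (1, 'b'), (2, 'a'), (2, 'b')], names=['R', 'C'])
    ss = pd.Series([3.0, np.nan, 1.0, np.nan], index=idx).to_sparse()

    # Split the index levels between matrix rows and columns
    A, rows, cols = ss.to_coo(row_levels=['R'], column_levels=['C'])
    print(A.shape)      # (2, 2) scipy COO matrix of the stored values
    print(rows, cols)   # labels retained for each axis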
+ def test_concat(self):
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ for kind in ['integer', 'block']:
+ sparse1 = pd.SparseSeries(val1, name='x', kind=kind)
+ sparse2 = pd.SparseSeries(val2, name='y', kind=kind)
+
+ res = pd.concat([sparse1, sparse2])
+ exp = pd.concat([pd.Series(val1), pd.Series(val2)])
+ exp = pd.SparseSeries(exp, kind=kind)
+ tm.assert_sp_series_equal(res, exp)
+
+ sparse1 = pd.SparseSeries(val1, fill_value=0, name='x', kind=kind)
+ sparse2 = pd.SparseSeries(val2, fill_value=0, name='y', kind=kind)
+
+ res = pd.concat([sparse1, sparse2])
+ exp = pd.concat([pd.Series(val1), pd.Series(val2)])
+ exp = pd.SparseSeries(exp, fill_value=0, kind=kind)
+ tm.assert_sp_series_equal(res, exp,
+ consolidate_block_indices=True)
+
+ def test_concat_axis1(self):
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ sparse1 = pd.SparseSeries(val1, name='x')
+ sparse2 = pd.SparseSeries(val2, name='y')
+
+ res = pd.concat([sparse1, sparse2], axis=1)
+ exp = pd.concat([pd.Series(val1, name='x'),
+ pd.Series(val2, name='y')], axis=1)
+ exp = pd.SparseDataFrame(exp)
+ tm.assert_sp_frame_equal(res, exp)
+
+ def test_concat_different_fill(self):
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ for kind in ['integer', 'block']:
+ sparse1 = pd.SparseSeries(val1, name='x', kind=kind)
+ sparse2 = pd.SparseSeries(val2, name='y', kind=kind, fill_value=0)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res = pd.concat([sparse1, sparse2])
+ exp = pd.concat([pd.Series(val1), pd.Series(val2)])
+ exp = pd.SparseSeries(exp, kind=kind)
+ tm.assert_sp_series_equal(res, exp)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res = pd.concat([sparse2, sparse1])
+ exp = pd.concat([pd.Series(val2), pd.Series(val1)])
+ exp = pd.SparseSeries(exp, kind=kind, fill_value=0)
+ tm.assert_sp_series_equal(res, exp)
+
+ def test_concat_axis1_different_fill(self):
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ sparse1 = pd.SparseSeries(val1, name='x')
+ sparse2 = pd.SparseSeries(val2, name='y', fill_value=0)
+
+ res = pd.concat([sparse1, sparse2], axis=1)
+ exp = pd.concat([pd.Series(val1, name='x'),
+ pd.Series(val2, name='y')], axis=1)
+ assert isinstance(res, pd.SparseDataFrame)
+ tm.assert_frame_equal(res.to_dense(), exp)
+
+ def test_concat_different_kind(self):
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ sparse1 = pd.SparseSeries(val1, name='x', kind='integer')
+ sparse2 = pd.SparseSeries(val2, name='y', kind='block', fill_value=0)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res = pd.concat([sparse1, sparse2])
+ exp = pd.concat([pd.Series(val1), pd.Series(val2)])
+ exp = pd.SparseSeries(exp, kind='integer')
+ tm.assert_sp_series_equal(res, exp)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res = pd.concat([sparse2, sparse1])
+ exp = pd.concat([pd.Series(val2), pd.Series(val1)])
+ exp = pd.SparseSeries(exp, kind='block', fill_value=0)
+ tm.assert_sp_series_equal(res, exp)
+
+ def test_concat_sparse_dense(self):
+ # use first input's fill_value
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ for kind in ['integer', 'block']:
+ sparse = pd.SparseSeries(val1, name='x', kind=kind)
+ dense = pd.Series(val2, name='y')
+
+ res = pd.concat([sparse, dense])
+ exp = pd.concat([pd.Series(val1), dense])
+ exp = pd.SparseSeries(exp, kind=kind)
+ tm.assert_sp_series_equal(res, exp)
+
+ res = pd.concat([dense, sparse, dense])
+ exp = pd.concat([dense, pd.Series(val1), dense])
+ exp = exp.astype("Sparse")
+ tm.assert_series_equal(res, exp)
+
+ sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0)
+ dense = pd.Series(val2, name='y')
+
+ res = pd.concat([sparse, dense])
+ exp = pd.concat([pd.Series(val1), dense])
+ exp = exp.astype(SparseDtype(exp.dtype, 0))
+ tm.assert_series_equal(res, exp)
+
+ res = pd.concat([dense, sparse, dense])
+ exp = pd.concat([dense, pd.Series(val1), dense])
+ exp = exp.astype(SparseDtype(exp.dtype, 0))
+ tm.assert_series_equal(res, exp)
+
+ def test_value_counts(self):
+ vals = [1, 2, nan, 0, nan, 1, 2, nan, nan, 1, 2, 0, 1, 1]
+ dense = pd.Series(vals, name='xx')
+
+ sparse = pd.SparseSeries(vals, name='xx')
+ tm.assert_series_equal(sparse.value_counts(),
+ dense.value_counts())
+ tm.assert_series_equal(sparse.value_counts(dropna=False),
+ dense.value_counts(dropna=False))
+
+ sparse = pd.SparseSeries(vals, name='xx', fill_value=0)
+ tm.assert_series_equal(sparse.value_counts(),
+ dense.value_counts())
+ tm.assert_series_equal(sparse.value_counts(dropna=False),
+ dense.value_counts(dropna=False))
+
+ def test_value_counts_dup(self):
+ vals = [1, 2, nan, 0, nan, 1, 2, nan, nan, 1, 2, 0, 1, 1]
+
+ # numeric op may cause sp_values to include the same value as
+ # fill_value
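+ # e.g. 1 / 0. -> inf while nan / 0. and 0 / 0. -> nan, so the divided
+ # sp_values now contain the nan fill_value itself; the counts must
+ # still match the dense result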
+ dense = pd.Series(vals, name='xx') / 0.
+ sparse = pd.SparseSeries(vals, name='xx') / 0.
+ tm.assert_series_equal(sparse.value_counts(),
+ dense.value_counts())
+ tm.assert_series_equal(sparse.value_counts(dropna=False),
+ dense.value_counts(dropna=False))
+
+ vals = [1, 2, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 1]
+
+ dense = pd.Series(vals, name='xx') * 0.
+ sparse = pd.SparseSeries(vals, name='xx') * 0.
+ tm.assert_series_equal(sparse.value_counts(),
+ dense.value_counts())
+ tm.assert_series_equal(sparse.value_counts(dropna=False),
+ dense.value_counts(dropna=False))
+
+ def test_value_counts_int(self):
+ vals = [1, 2, 0, 1, 2, 1, 2, 0, 1, 1]
+ dense = pd.Series(vals, name='xx')
+
+ # fill_value is np.nan, but should not be included in the result
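+ # e.g. value_counts() should report only 1, 2 and 0 here, with no row
+ # for the nan fill_value even when dropna=False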
+ sparse = pd.SparseSeries(vals, name='xx')
+ tm.assert_series_equal(sparse.value_counts(),
+ dense.value_counts())
+ tm.assert_series_equal(sparse.value_counts(dropna=False),
+ dense.value_counts(dropna=False))
+
+ sparse = pd.SparseSeries(vals, name='xx', fill_value=0)
+ tm.assert_series_equal(sparse.value_counts(),
+ dense.value_counts())
+ tm.assert_series_equal(sparse.value_counts(dropna=False),
+ dense.value_counts(dropna=False))
+
+ def test_isna(self):
+ # GH 8276
+ s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx')
+
+ res = s.isna()
+ exp = pd.SparseSeries([True, True, False, False, True], name='xxx',
+ fill_value=True)
+ tm.assert_sp_series_equal(res, exp)
+
+ # if fill_value is not nan, True can be included in sp_values
+ s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx',
+ fill_value=0.)
+ res = s.isna()
+ assert isinstance(res, pd.SparseSeries)
+ exp = pd.Series([True, False, False, False, False], name='xxx')
+ tm.assert_series_equal(res.to_dense(), exp)
+
+ def test_notna(self):
+ # GH 8276
+ s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx')
+
+ res = s.notna()
+ exp = pd.SparseSeries([False, False, True, True, False], name='xxx',
+ fill_value=False)
+ tm.assert_sp_series_equal(res, exp)
+
+ # if fill_value is not nan, True can be included in sp_values
+ s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx',
+ fill_value=0.)
+ res = s.notna()
+ assert isinstance(res, pd.SparseSeries)
+ exp = pd.Series([False, True, True, True, True], name='xxx')
+ tm.assert_series_equal(res.to_dense(), exp)
+
+
+def _dense_series_compare(s, f):
+ result = f(s)
+ assert isinstance(result, SparseSeries)
+ dense_result = f(s.to_dense())
+ tm.assert_series_equal(result.to_dense(), dense_result)
+
+
+class TestSparseSeriesAnalytics(object):
+
+ def setup_method(self, method):
+ arr, index = _test_data1()
+ self.bseries = SparseSeries(arr, index=index, kind='block',
+ name='bseries')
+
+ arr, index = _test_data1_zero()
+ self.zbseries = SparseSeries(arr, index=index, kind='block',
+ fill_value=0, name='zbseries')
+
+ def test_cumsum(self):
+ result = self.bseries.cumsum()
+ expected = SparseSeries(self.bseries.to_dense().cumsum())
+ tm.assert_sp_series_equal(result, expected)
+
+ result = self.zbseries.cumsum()
+ expected = self.zbseries.to_dense().cumsum().to_sparse()
+ tm.assert_series_equal(result, expected)
+
+ axis = 1 # Series is 1-D, so only axis = 0 is valid.
+ msg = "No axis named {axis}".format(axis=axis)
+ with pytest.raises(ValueError, match=msg):
+ self.bseries.cumsum(axis=axis)
+
+ def test_numpy_cumsum(self):
+ result = np.cumsum(self.bseries)
+ expected = SparseSeries(self.bseries.to_dense().cumsum())
+ tm.assert_sp_series_equal(result, expected)
+
+ result = np.cumsum(self.zbseries)
+ expected = self.zbseries.to_dense().cumsum().to_sparse()
+ tm.assert_series_equal(result, expected)
+
+ msg = "the 'dtype' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.cumsum(self.bseries, dtype=np.int64)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.cumsum(self.zbseries, out=result)
+
+ def test_numpy_func_call(self):
+ # no exception should be raised even though
+ # numpy passes in 'axis=None' or 'axis=-1'
+ funcs = ['sum', 'cumsum', 'var', 'mean',
+ 'prod', 'cumprod', 'std', 'argsort',
+ 'min', 'max']
+ for func in funcs:
+ for series in ('bseries', 'zbseries'):
+ getattr(np, func)(getattr(self, series))
+
+ def test_deprecated_numpy_func_call(self):
+ # NOTE: These should be added to the 'test_numpy_func_call' test above
+ # once the behavior of argmin/argmax is corrected.
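+ # (Series.argmin/argmax currently behave like idxmin/idxmax and emit a
+ # FutureWarning, which is what both call styles below assert)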
+ funcs = ['argmin', 'argmax']
+ for func in funcs:
+ for series in ('bseries', 'zbseries'):
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ getattr(np, func)(getattr(self, series))
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ getattr(getattr(self, series), func)()
+
+ def test_deprecated_reindex_axis(self):
+ # https://github.com/pandas-dev/pandas/issues/17833
+ with tm.assert_produces_warning(FutureWarning) as m:
+ self.bseries.reindex_axis([0, 1, 2])
+ assert 'reindex' in str(m[0].message)
+
+
[email protected](
+ 'datetime_type', (np.datetime64,
+ pd.Timestamp,
+ lambda x: datetime.strptime(x, '%Y-%m-%d')))
+def test_constructor_dict_datetime64_index(datetime_type):
+ # GH 9456
+ dates = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15']
+ values = [42544017.198965244, 1234565, 40512335.181958228, -1]
+
+ result = SparseSeries(dict(zip(map(datetime_type, dates), values)))
+ expected = SparseSeries(values, map(pd.Timestamp, dates))
+
+ tm.assert_sp_series_equal(result, expected)
+
+
+def test_to_sparse():
+ # https://github.com/pandas-dev/pandas/issues/22389
+ arr = pd.SparseArray([1, 2, None, 3])
+ result = pd.Series(arr).to_sparse()
+ assert len(result) == 4
+ tm.assert_sp_array_equal(result.values, arr, check_kind=False)
+
+
+def test_constructor_mismatched_raises():
+ msg = "Length of passed values is 2, index implies 3"
+ with pytest.raises(ValueError, match=msg):
+ SparseSeries([1, 2], index=[1, 2, 3])
+
+
+def test_block_deprecated():
+ s = SparseSeries([1])
+ with tm.assert_produces_warning(FutureWarning):
+ s.block
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/test_combine_concat.py b/contrib/python/pandas/py2/pandas/tests/sparse/test_combine_concat.py
new file mode 100644
index 00000000000..97d5aaca827
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/test_combine_concat.py
@@ -0,0 +1,462 @@
+# pylint: disable-msg=E1101,W0612
+import itertools
+
+import numpy as np
+import pytest
+
+from pandas.errors import PerformanceWarning
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+class TestSparseArrayConcat(object):
+ @pytest.mark.parametrize('kind', ['integer', 'block'])
+ def test_basic(self, kind):
+ a = pd.SparseArray([1, 0, 0, 2], kind=kind)
+ b = pd.SparseArray([1, 0, 2, 2], kind=kind)
+
+ result = pd.SparseArray._concat_same_type([a, b])
+ # Can't make any assertions about the sparse index itself
+ # since we don't merge sparse blocks across arrays
+ # in to_concat
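+ # e.g. with the default fill_value of 0 the result's sp_values are just
+ # np.concatenate([[1, 2], [1, 2, 2]]) -> [1, 2, 1, 2, 2]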
+ expected = np.array([1, 2, 1, 2, 2], dtype='int64')
+ tm.assert_numpy_array_equal(result.sp_values, expected)
+ assert result.kind == kind
+
+ @pytest.mark.parametrize('kind', ['integer', 'block'])
+ def test_uses_first_kind(self, kind):
+ other = 'integer' if kind == 'block' else 'block'
+ a = pd.SparseArray([1, 0, 0, 2], kind=kind)
+ b = pd.SparseArray([1, 0, 2, 2], kind=other)
+
+ result = pd.SparseArray._concat_same_type([a, b])
+ expected = np.array([1, 2, 1, 2, 2], dtype='int64')
+ tm.assert_numpy_array_equal(result.sp_values, expected)
+ assert result.kind == kind
+
+
+class TestSparseSeriesConcat(object):
+
+ @pytest.mark.parametrize('kind', [
+ 'integer',
+ 'block',
+ ])
+ def test_concat(self, kind):
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ sparse1 = pd.SparseSeries(val1, name='x', kind=kind)
+ sparse2 = pd.SparseSeries(val2, name='y', kind=kind)
+
+ res = pd.concat([sparse1, sparse2])
+ exp = pd.concat([pd.Series(val1), pd.Series(val2)])
+ exp = pd.SparseSeries(exp, kind=kind)
+ tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
+
+ sparse1 = pd.SparseSeries(val1, fill_value=0, name='x', kind=kind)
+ sparse2 = pd.SparseSeries(val2, fill_value=0, name='y', kind=kind)
+
+ res = pd.concat([sparse1, sparse2])
+ exp = pd.concat([pd.Series(val1), pd.Series(val2)])
+ exp = pd.SparseSeries(exp, fill_value=0, kind=kind)
+ tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
+
+ def test_concat_axis1(self):
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ sparse1 = pd.SparseSeries(val1, name='x')
+ sparse2 = pd.SparseSeries(val2, name='y')
+
+ res = pd.concat([sparse1, sparse2], axis=1)
+ exp = pd.concat([pd.Series(val1, name='x'),
+ pd.Series(val2, name='y')], axis=1)
+ exp = pd.SparseDataFrame(exp)
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
+
+ def test_concat_different_fill(self):
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ for kind in ['integer', 'block']:
+ sparse1 = pd.SparseSeries(val1, name='x', kind=kind)
+ sparse2 = pd.SparseSeries(val2, name='y', kind=kind, fill_value=0)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res = pd.concat([sparse1, sparse2])
+
+ exp = pd.concat([pd.Series(val1), pd.Series(val2)])
+ exp = pd.SparseSeries(exp, kind=kind)
+ tm.assert_sp_series_equal(res, exp)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res = pd.concat([sparse2, sparse1])
+
+ exp = pd.concat([pd.Series(val2), pd.Series(val1)])
+ exp = pd.SparseSeries(exp, kind=kind, fill_value=0)
+ tm.assert_sp_series_equal(res, exp)
+
+ def test_concat_axis1_different_fill(self):
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ sparse1 = pd.SparseSeries(val1, name='x')
+ sparse2 = pd.SparseSeries(val2, name='y', fill_value=0)
+
+ res = pd.concat([sparse1, sparse2], axis=1)
+ exp = pd.concat([pd.Series(val1, name='x'),
+ pd.Series(val2, name='y')], axis=1)
+ assert isinstance(res, pd.SparseDataFrame)
+ tm.assert_frame_equal(res.to_dense(), exp)
+
+ def test_concat_different_kind(self):
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ sparse1 = pd.SparseSeries(val1, name='x', kind='integer')
+ sparse2 = pd.SparseSeries(val2, name='y', kind='block')
+
+ res = pd.concat([sparse1, sparse2])
+ exp = pd.concat([pd.Series(val1), pd.Series(val2)])
+ exp = pd.SparseSeries(exp, kind=sparse1.kind)
+ tm.assert_sp_series_equal(res, exp)
+
+ res = pd.concat([sparse2, sparse1])
+ exp = pd.concat([pd.Series(val2), pd.Series(val1)])
+ exp = pd.SparseSeries(exp, kind=sparse2.kind)
+ tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True)
+
+ @pytest.mark.parametrize('kind', [
+ 'integer',
+ 'block',
+ ])
+ def test_concat_sparse_dense(self, kind):
+ # use first input's fill_value
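+ # (as the expectations below assume, the concatenated result takes its
+ # kind and fill_value from the sparse operand)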
+ val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
+ val2 = np.array([3, np.nan, 4, 0, 0])
+
+ sparse = pd.SparseSeries(val1, name='x', kind=kind)
+ dense = pd.Series(val2, name='y')
+
+ res = pd.concat([sparse, dense])
+ exp = pd.SparseSeries(pd.concat([pd.Series(val1), dense]), kind=kind)
+ tm.assert_sp_series_equal(res, exp)
+
+ res = pd.concat([dense, sparse, dense])
+ exp = pd.concat([dense, pd.Series(val1), dense])
+ # XXX: changed from SparseSeries to Series[sparse]
+ exp = pd.Series(
+ pd.SparseArray(exp, kind=kind),
+ index=exp.index,
+ name=exp.name,
+ )
+ tm.assert_series_equal(res, exp)
+
+ sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0)
+ dense = pd.Series(val2, name='y')
+
+ res = pd.concat([sparse, dense])
+ # XXX: changed from SparseSeries to Series[sparse]
+ exp = pd.concat([pd.Series(val1), dense])
+ exp = pd.Series(
+ pd.SparseArray(exp, kind=kind, fill_value=0),
+ index=exp.index,
+ name=exp.name,
+ )
+ tm.assert_series_equal(res, exp)
+
+ res = pd.concat([dense, sparse, dense])
+ exp = pd.concat([dense, pd.Series(val1), dense])
+ # XXX: changed from SparseSeries to Series[sparse]
+ exp = pd.Series(
+ pd.SparseArray(exp, kind=kind, fill_value=0),
+ index=exp.index,
+ name=exp.name,
+ )
+ tm.assert_series_equal(res, exp)
+
+
+class TestSparseDataFrameConcat(object):
+
+ def setup_method(self, method):
+
+ self.dense1 = pd.DataFrame({'A': [0., 1., 2., np.nan],
+ 'B': [0., 0., 0., 0.],
+ 'C': [np.nan, np.nan, np.nan, np.nan],
+ 'D': [1., 2., 3., 4.]})
+
+ self.dense2 = pd.DataFrame({'A': [5., 6., 7., 8.],
+ 'B': [np.nan, 0., 7., 8.],
+ 'C': [5., 6., np.nan, np.nan],
+ 'D': [np.nan, np.nan, np.nan, np.nan]})
+
+ self.dense3 = pd.DataFrame({'E': [5., 6., 7., 8.],
+ 'F': [np.nan, 0., 7., 8.],
+ 'G': [5., 6., np.nan, np.nan],
+ 'H': [np.nan, np.nan, np.nan, np.nan]})
+
+ def test_concat(self):
+ # fill_value = np.nan
+ sparse = self.dense1.to_sparse()
+ sparse2 = self.dense2.to_sparse()
+
+ res = pd.concat([sparse, sparse])
+ exp = pd.concat([self.dense1, self.dense1]).to_sparse()
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
+
+ res = pd.concat([sparse2, sparse2])
+ exp = pd.concat([self.dense2, self.dense2]).to_sparse()
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
+
+ res = pd.concat([sparse, sparse2])
+ exp = pd.concat([self.dense1, self.dense2]).to_sparse()
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
+
+ res = pd.concat([sparse2, sparse])
+ exp = pd.concat([self.dense2, self.dense1]).to_sparse()
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
+
+ # fill_value = 0
+ sparse = self.dense1.to_sparse(fill_value=0)
+ sparse2 = self.dense2.to_sparse(fill_value=0)
+
+ res = pd.concat([sparse, sparse])
+ exp = pd.concat([self.dense1, self.dense1]).to_sparse(fill_value=0)
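+ # to_sparse(fill_value=0) also sets the frame-level default fill_value,
+ # while pd.concat leaves it at nan; align it so only the per-column
+ # fill values are compared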
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
+
+ res = pd.concat([sparse2, sparse2])
+ exp = pd.concat([self.dense2, self.dense2]).to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
+
+ res = pd.concat([sparse, sparse2])
+ exp = pd.concat([self.dense1, self.dense2]).to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
+
+ res = pd.concat([sparse2, sparse])
+ exp = pd.concat([self.dense2, self.dense1]).to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
+
+ def test_concat_different_fill_value(self):
+ # 1st fill_value will be used
+ sparse = self.dense1.to_sparse()
+ sparse2 = self.dense2.to_sparse(fill_value=0)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res = pd.concat([sparse, sparse2])
+ exp = pd.concat([self.dense1, self.dense2]).to_sparse()
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
+
+ with tm.assert_produces_warning(PerformanceWarning):
+ res = pd.concat([sparse2, sparse])
+ exp = pd.concat([self.dense2, self.dense1]).to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True)
+
+ def test_concat_different_columns_sort_warns(self):
+ sparse = self.dense1.to_sparse()
+ sparse3 = self.dense3.to_sparse()
+
+ with tm.assert_produces_warning(FutureWarning):
+ res = pd.concat([sparse, sparse3])
+ with tm.assert_produces_warning(FutureWarning):
+ exp = pd.concat([self.dense1, self.dense3])
+
+ exp = exp.to_sparse()
+ tm.assert_sp_frame_equal(res, exp, check_kind=False)
+
+ def test_concat_different_columns(self):
+ # fill_value = np.nan
+ sparse = self.dense1.to_sparse()
+ sparse3 = self.dense3.to_sparse()
+
+ res = pd.concat([sparse, sparse3], sort=True)
+ exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse()
+ tm.assert_sp_frame_equal(res, exp, check_kind=False)
+
+ res = pd.concat([sparse3, sparse], sort=True)
+ exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse()
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp, check_kind=False)
+
+ def test_concat_bug(self):
+ from pandas.core.sparse.api import SparseDtype
+ x = pd.SparseDataFrame({"A": pd.SparseArray([np.nan, np.nan],
+ fill_value=0)})
+ y = pd.SparseDataFrame({"B": []})
+ res = pd.concat([x, y], sort=False)[['A']]
+ exp = pd.DataFrame({"A": pd.SparseArray([np.nan, np.nan],
+ dtype=SparseDtype(float, 0))})
+ tm.assert_frame_equal(res, exp)
+
+ def test_concat_different_columns_buggy(self):
+ sparse = self.dense1.to_sparse(fill_value=0)
+ sparse3 = self.dense3.to_sparse(fill_value=0)
+
+ res = pd.concat([sparse, sparse3], sort=True)
+ exp = (pd.concat([self.dense1, self.dense3], sort=True)
+ .to_sparse(fill_value=0))
+ exp._default_fill_value = np.nan
+
+ tm.assert_sp_frame_equal(res, exp, check_kind=False,
+ consolidate_block_indices=True)
+
+ res = pd.concat([sparse3, sparse], sort=True)
+ exp = (pd.concat([self.dense3, self.dense1], sort=True)
+ .to_sparse(fill_value=0))
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp, check_kind=False,
+ consolidate_block_indices=True)
+
+ # different fill values
+ sparse = self.dense1.to_sparse()
+ sparse3 = self.dense3.to_sparse(fill_value=0)
+ # each column keeps its own fill_value, so compare in dense form
+ res = pd.concat([sparse, sparse3], sort=True)
+ exp = pd.concat([self.dense1, self.dense3], sort=True)
+ assert isinstance(res, pd.SparseDataFrame)
+ tm.assert_frame_equal(res.to_dense(), exp)
+
+ res = pd.concat([sparse3, sparse], sort=True)
+ exp = pd.concat([self.dense3, self.dense1], sort=True)
+ assert isinstance(res, pd.SparseDataFrame)
+ tm.assert_frame_equal(res.to_dense(), exp)
+
+ def test_concat_series(self):
+ # fill_value = np.nan
+ sparse = self.dense1.to_sparse()
+ sparse2 = self.dense2.to_sparse()
+
+ for col in ['A', 'D']:
+ res = pd.concat([sparse, sparse2[col]])
+ exp = pd.concat([self.dense1, self.dense2[col]]).to_sparse()
+ tm.assert_sp_frame_equal(res, exp, check_kind=False)
+
+ res = pd.concat([sparse2[col], sparse])
+ exp = pd.concat([self.dense2[col], self.dense1]).to_sparse()
+ tm.assert_sp_frame_equal(res, exp, check_kind=False)
+
+ # fill_value = 0
+ sparse = self.dense1.to_sparse(fill_value=0)
+ sparse2 = self.dense2.to_sparse(fill_value=0)
+
+ for col in ['C', 'D']:
+ res = pd.concat([sparse, sparse2[col]])
+ exp = pd.concat([self.dense1,
+ self.dense2[col]]).to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp, check_kind=False,
+ consolidate_block_indices=True)
+
+ res = pd.concat([sparse2[col], sparse])
+ exp = pd.concat([self.dense2[col],
+ self.dense1]).to_sparse(fill_value=0)
+ exp['C'] = res['C']
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True,
+ check_kind=False)
+
+ def test_concat_axis1(self):
+ # fill_value = np.nan
+ sparse = self.dense1.to_sparse()
+ sparse3 = self.dense3.to_sparse()
+
+ res = pd.concat([sparse, sparse3], axis=1)
+ exp = pd.concat([self.dense1, self.dense3], axis=1).to_sparse()
+ tm.assert_sp_frame_equal(res, exp)
+
+ res = pd.concat([sparse3, sparse], axis=1)
+ exp = pd.concat([self.dense3, self.dense1], axis=1).to_sparse()
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp)
+
+ # fill_value = 0
+ sparse = self.dense1.to_sparse(fill_value=0)
+ sparse3 = self.dense3.to_sparse(fill_value=0)
+
+ res = pd.concat([sparse, sparse3], axis=1)
+ exp = pd.concat([self.dense1, self.dense3],
+ axis=1).to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp)
+
+ res = pd.concat([sparse3, sparse], axis=1)
+ exp = pd.concat([self.dense3, self.dense1],
+ axis=1).to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(res, exp)
+
+ # different fill values
+ sparse = self.dense1.to_sparse()
+ sparse3 = self.dense3.to_sparse(fill_value=0)
+ # each column keeps its own fill_value, so compare in dense form
+ res = pd.concat([sparse, sparse3], axis=1)
+ exp = pd.concat([self.dense1, self.dense3], axis=1)
+ assert isinstance(res, pd.SparseDataFrame)
+ tm.assert_frame_equal(res.to_dense(), exp)
+
+ res = pd.concat([sparse3, sparse], axis=1)
+ exp = pd.concat([self.dense3, self.dense1], axis=1)
+ assert isinstance(res, pd.SparseDataFrame)
+ tm.assert_frame_equal(res.to_dense(), exp)
+
+ @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
+ itertools.product([None, 0, 1, np.nan],
+ [0, 1],
+ [1, 0]))
+ def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx):
+ frames = [self.dense1, self.dense2]
+ sparse_frame = [frames[dense_idx],
+ frames[sparse_idx].to_sparse(fill_value=fill_value)]
+ dense_frame = [frames[dense_idx], frames[sparse_idx]]
+
+ # This will try both directions sparse + dense and dense + sparse
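+ # (the lists are reversed at the end of each pass, so the second pass
+ # exercises the dense-first ordering)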
+ for _ in range(2):
+ res = pd.concat(sparse_frame)
+ exp = pd.concat(dense_frame)
+
+ assert isinstance(res, pd.SparseDataFrame)
+ tm.assert_frame_equal(res.to_dense(), exp)
+
+ sparse_frame = sparse_frame[::-1]
+ dense_frame = dense_frame[::-1]
+
+ @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
+ itertools.product([None, 0, 1, np.nan],
+ [0, 1],
+ [1, 0]))
+@pytest.mark.xfail(reason="iloc fails here, so expected cannot be built",
+ strict=False)
+ def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
+ # See GH16874, GH18914 and #18686 for why this should be a DataFrame
+ from pandas.core.dtypes.common import is_sparse
+
+ frames = [self.dense1, self.dense3]
+
+ sparse_frame = [frames[dense_idx],
+ frames[sparse_idx].to_sparse(fill_value=fill_value)]
+ dense_frame = [frames[dense_idx], frames[sparse_idx]]
+
+ # This will try both directions sparse + dense and dense + sparse
+ for _ in range(2):
+ res = pd.concat(sparse_frame, axis=1)
+ exp = pd.concat(dense_frame, axis=1)
+ cols = [i for (i, x) in enumerate(res.dtypes) if is_sparse(x)]
+
+ for col in cols:
+ exp.iloc[:, col] = exp.iloc[:, col].astype("Sparse")
+
+ for column in frames[dense_idx].columns:
+ if dense_idx == sparse_idx:
+ tm.assert_frame_equal(res[column], exp[column])
+ else:
+ tm.assert_series_equal(res[column], exp[column])
+
+ tm.assert_frame_equal(res, exp)
+
+ sparse_frame = sparse_frame[::-1]
+ dense_frame = dense_frame[::-1]
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/test_format.py b/contrib/python/pandas/py2/pandas/tests/sparse/test_format.py
new file mode 100644
index 00000000000..63018f9525b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/test_format.py
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+
+import numpy as np
+
+from pandas.compat import is_platform_32bit, is_platform_windows
+
+import pandas as pd
+from pandas.core.config import option_context
+import pandas.util.testing as tm
+
+use_32bit_repr = is_platform_windows() or is_platform_32bit()
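+# On Windows and 32-bit builds the block-index arrays use the platform
+# default int32, so their repr omits the ', dtype=int32' suffix that the
+# expected strings below otherwise include.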
+
+
+class TestSparseSeriesFormatting(object):
+
+ @property
+ def dtype_format_for_platform(self):
+ return '' if use_32bit_repr else ', dtype=int32'
+
+ def test_sparse_max_row(self):
+ s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
+ result = repr(s)
+ dfm = self.dtype_format_for_platform
+ exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n"
+ "4 NaN\ndtype: Sparse[float64, nan]\nBlockIndex\n"
+ "Block locations: array([0, 3]{0})\n"
+ "Block lengths: array([1, 1]{0})".format(dfm))
+ assert result == exp
+
+ def test_sparse_max_row_truncated(self):
+ s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
+ dfm = self.dtype_format_for_platform
+
+ with option_context("display.max_rows", 3):
+ # GH 10560
+ result = repr(s)
+ exp = ("0 1.0\n ... \n4 NaN\n"
+ "Length: 5, dtype: Sparse[float64, nan]\nBlockIndex\n"
+ "Block locations: array([0, 3]{0})\n"
+ "Block lengths: array([1, 1]{0})".format(dfm))
+ assert result == exp
+
+ def test_sparse_mi_max_row(self):
+ idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
+ ('C', 0), ('C', 1), ('C', 2)])
+ s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan],
+ index=idx).to_sparse()
+ result = repr(s)
+ dfm = self.dtype_format_for_platform
+ exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n"
+ "C 0 3.0\n 1 NaN\n 2 NaN\n"
+ "dtype: Sparse[float64, nan]\nBlockIndex\n"
+ "Block locations: array([0, 3]{0})\n"
+ "Block lengths: array([1, 1]{0})".format(dfm))
+ assert result == exp
+
+ with option_context("display.max_rows", 3,
+ "display.show_dimensions", False):
+ # GH 13144
+ result = repr(s)
+ exp = ("A 0 1.0\n ... \nC 2 NaN\n"
+ "dtype: Sparse[float64, nan]\nBlockIndex\n"
+ "Block locations: array([0, 3]{0})\n"
+ "Block lengths: array([1, 1]{0})".format(dfm))
+ assert result == exp
+
+ def test_sparse_bool(self):
+ # GH 13110
+ s = pd.SparseSeries([True, False, False, True, False, False],
+ fill_value=False)
+ result = repr(s)
+ dtype = '' if use_32bit_repr else ', dtype=int32'
+ exp = ("0 True\n1 False\n2 False\n"
+ "3 True\n4 False\n5 False\n"
+ "dtype: Sparse[bool, False]\nBlockIndex\n"
+ "Block locations: array([0, 3]{0})\n"
+ "Block lengths: array([1, 1]{0})".format(dtype))
+ assert result == exp
+
+ with option_context("display.max_rows", 3):
+ result = repr(s)
+ exp = ("0 True\n ... \n5 False\n"
+ "Length: 6, dtype: Sparse[bool, False]\nBlockIndex\n"
+ "Block locations: array([0, 3]{0})\n"
+ "Block lengths: array([1, 1]{0})".format(dtype))
+ assert result == exp
+
+ def test_sparse_int(self):
+ # GH 13110
+ s = pd.SparseSeries([0, 1, 0, 0, 1, 0], fill_value=False)
+
+ result = repr(s)
+ dtype = '' if use_32bit_repr else ', dtype=int32'
+ exp = ("0 0\n1 1\n2 0\n3 0\n4 1\n"
+ "5 0\ndtype: Sparse[int64, False]\nBlockIndex\n"
+ "Block locations: array([1, 4]{0})\n"
+ "Block lengths: array([1, 1]{0})".format(dtype))
+ assert result == exp
+
+ with option_context("display.max_rows", 3,
+ "display.show_dimensions", False):
+ result = repr(s)
+ exp = ("0 0\n ..\n5 0\n"
+ "dtype: Sparse[int64, False]\nBlockIndex\n"
+ "Block locations: array([1, 4]{0})\n"
+ "Block lengths: array([1, 1]{0})".format(dtype))
+ assert result == exp
+
+
+class TestSparseDataFrameFormatting(object):
+
+ def test_sparse_frame(self):
+ # GH 13110
+ df = pd.DataFrame({'A': [True, False, True, False, True],
+ 'B': [True, False, True, False, True],
+ 'C': [0, 0, 3, 0, 5],
+ 'D': [np.nan, np.nan, np.nan, 1, 2]})
+ sparse = df.to_sparse()
+ assert repr(sparse) == repr(df)
+
+ with option_context("display.max_rows", 3):
+ assert repr(sparse) == repr(df)
+
+ def test_sparse_repr_after_set(self):
+ # GH 15488
+ sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]])
+ res = sdf.copy()
+
+ # Ignore the warning
+ with pd.option_context('mode.chained_assignment', None):
+ sdf[0][1] = 2 # This line triggers the bug
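+ # (per GH 15488 the repr below used to raise after this assignment;
+ # the final assert also checks the data was not corrupted)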
+
+ repr(sdf)
+ tm.assert_sp_frame_equal(sdf, res)
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/test_groupby.py b/contrib/python/pandas/py2/pandas/tests/sparse/test_groupby.py
new file mode 100644
index 00000000000..d0ff2a02c40
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/test_groupby.py
@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+class TestSparseGroupBy(object):
+
+ def setup_method(self, method):
+ self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'three',
+ 'two', 'two', 'one', 'three'],
+ 'C': np.random.randn(8),
+ 'D': np.random.randn(8),
+ 'E': [np.nan, np.nan, 1, 2,
+ np.nan, 1, np.nan, np.nan]})
+ self.sparse = self.dense.to_sparse()
+
+ def test_first_last_nth(self):
+ # tests for first / last / nth
+ sparse_grouped = self.sparse.groupby('A')
+ dense_grouped = self.dense.groupby('A')
+
+ sparse_grouped_first = sparse_grouped.first()
+ sparse_grouped_last = sparse_grouped.last()
+ sparse_grouped_nth = sparse_grouped.nth(1)
+
+ dense_grouped_first = dense_grouped.first().to_sparse()
+ dense_grouped_last = dense_grouped.last().to_sparse()
+ dense_grouped_nth = dense_grouped.nth(1).to_sparse()
+
+ # TODO: shouldn't these all be sparse, or all dense?
+ tm.assert_frame_equal(sparse_grouped_first,
+ dense_grouped_first)
+ tm.assert_frame_equal(sparse_grouped_last,
+ dense_grouped_last)
+ tm.assert_frame_equal(sparse_grouped_nth,
+ dense_grouped_nth)
+
+ def test_aggfuncs(self):
+ sparse_grouped = self.sparse.groupby('A')
+ dense_grouped = self.dense.groupby('A')
+
+ result = sparse_grouped.mean().to_sparse()
+ expected = dense_grouped.mean().to_sparse()
+
+ tm.assert_frame_equal(result, expected)
+
+ # TODO: sparse sum includes str column
+ # tm.assert_frame_equal(sparse_grouped.sum(),
+ # dense_grouped.sum())
+
+ result = sparse_grouped.count().to_sparse()
+ expected = dense_grouped.count().to_sparse()
+
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("fill_value", [0, np.nan])
+def test_groupby_includes_fill_value(fill_value):
+ # https://github.com/pandas-dev/pandas/issues/5078
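+ # rows whose group key equals the fill_value must not be dropped; the
+ # sparse result should match the dense groupby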
+ df = pd.DataFrame({'a': [fill_value, 1, fill_value, fill_value],
+ 'b': [fill_value, 1, fill_value, fill_value]})
+ sdf = df.to_sparse(fill_value=fill_value)
+ result = sdf.groupby('a').sum()
+ expected = df.groupby('a').sum().to_sparse(fill_value=fill_value)
+ tm.assert_frame_equal(result, expected, check_index_type=False)
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/test_indexing.py b/contrib/python/pandas/py2/pandas/tests/sparse/test_indexing.py
new file mode 100644
index 00000000000..6d8c6f13cd3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/test_indexing.py
@@ -0,0 +1,1058 @@
+# pylint: disable-msg=E1101,W0612
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.core.sparse.api import SparseDtype
+import pandas.util.testing as tm
+
+
+class TestSparseSeriesIndexing(object):
+
+ def setup_method(self, method):
+ self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
+ self.sparse = self.orig.to_sparse()
+
+ def test_getitem(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ assert sparse[0] == 1
+ assert np.isnan(sparse[1])
+ assert sparse[3] == 3
+
+ result = sparse[[1, 3, 4]]
+ exp = orig[[1, 3, 4]].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # dense array
+ result = sparse[orig % 2 == 1]
+ exp = orig[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array (actually it coerces to a normal Series)
+ result = sparse[sparse % 2 == 1]
+ exp = orig[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array
+ result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
+ tm.assert_sp_series_equal(result, exp)
+
+ def test_getitem_slice(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse())
+ tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse())
+ tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse())
+ tm.assert_sp_series_equal(sparse[-5:], orig[-5:].to_sparse())
+
+ def test_getitem_int_dtype(self):
+ # GH 8292
+ s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], name='xxx')
+ res = s[::2]
+ exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], name='xxx')
+ tm.assert_sp_series_equal(res, exp)
+ assert res.dtype == SparseDtype(np.int64)
+
+ s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], fill_value=0, name='xxx')
+ res = s[::2]
+ exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6],
+ fill_value=0, name='xxx')
+ tm.assert_sp_series_equal(res, exp)
+ assert res.dtype == SparseDtype(np.int64)
+
+ def test_getitem_fill_value(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0])
+ sparse = orig.to_sparse(fill_value=0)
+
+ assert sparse[0] == 1
+ assert np.isnan(sparse[1])
+ assert sparse[2] == 0
+ assert sparse[3] == 3
+
+ result = sparse[[1, 3, 4]]
+ exp = orig[[1, 3, 4]].to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(result, exp)
+
+ # dense array
+ result = sparse[orig % 2 == 1]
+ exp = orig[orig % 2 == 1].to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array (actually it coerces to a normal Series)
+ result = sparse[sparse % 2 == 1]
+ exp = orig[orig % 2 == 1].to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array
+ result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
+ tm.assert_sp_series_equal(result, exp)
+
+ def test_getitem_ellipsis(self):
+ # GH 9467
+ s = pd.SparseSeries([1, np.nan, 2, 0, np.nan])
+ tm.assert_sp_series_equal(s[...], s)
+
+ s = pd.SparseSeries([1, np.nan, 2, 0, np.nan], fill_value=0)
+ tm.assert_sp_series_equal(s[...], s)
+
+ def test_getitem_slice_fill_value(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0])
+ sparse = orig.to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(sparse[:2],
+ orig[:2].to_sparse(fill_value=0))
+ tm.assert_sp_series_equal(sparse[4:2],
+ orig[4:2].to_sparse(fill_value=0))
+ tm.assert_sp_series_equal(sparse[::2],
+ orig[::2].to_sparse(fill_value=0))
+ tm.assert_sp_series_equal(sparse[-5:],
+ orig[-5:].to_sparse(fill_value=0))
+
+ def test_loc(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ assert sparse.loc[0] == 1
+ assert np.isnan(sparse.loc[1])
+
+ result = sparse.loc[[1, 3, 4]]
+ exp = orig.loc[[1, 3, 4]].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # exceeds the bounds
+ result = sparse.reindex([1, 3, 4, 5])
+ exp = orig.reindex([1, 3, 4, 5]).to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+ # padded with NaN
+ assert np.isnan(result[-1])
+
+ # dense array
+ result = sparse.loc[orig % 2 == 1]
+ exp = orig.loc[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array (actually it coerces to a normal Series)
+ result = sparse.loc[sparse % 2 == 1]
+ exp = orig.loc[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array
+ result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
+ tm.assert_sp_series_equal(result, exp)
+
+ def test_loc_index(self):
+ orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list('ABCDE'))
+ sparse = orig.to_sparse()
+
+ assert sparse.loc['A'] == 1
+ assert np.isnan(sparse.loc['B'])
+
+ result = sparse.loc[['A', 'C', 'D']]
+ exp = orig.loc[['A', 'C', 'D']].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # dense array
+ result = sparse.loc[orig % 2 == 1]
+ exp = orig.loc[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array (actually it coerces to a normal Series)
+ result = sparse.loc[sparse % 2 == 1]
+ exp = orig.loc[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array
+ result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
+ tm.assert_sp_series_equal(result, exp)
+
+ def test_loc_index_fill_value(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE'))
+ sparse = orig.to_sparse(fill_value=0)
+
+ assert sparse.loc['A'] == 1
+ assert np.isnan(sparse.loc['B'])
+
+ result = sparse.loc[['A', 'C', 'D']]
+ exp = orig.loc[['A', 'C', 'D']].to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(result, exp)
+
+ # dense array
+ result = sparse.loc[orig % 2 == 1]
+ exp = orig.loc[orig % 2 == 1].to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array (actually it coerces to a normal Series)
+ result = sparse.loc[sparse % 2 == 1]
+ exp = orig.loc[orig % 2 == 1].to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(result, exp)
+
+ def test_loc_slice(self):
+ orig = self.orig
+ sparse = self.sparse
+ tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse())
+
+ def test_loc_slice_index_fill_value(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE'))
+ sparse = orig.to_sparse(fill_value=0)
+
+ tm.assert_sp_series_equal(sparse.loc['C':],
+ orig.loc['C':].to_sparse(fill_value=0))
+
+ def test_loc_slice_fill_value(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0])
+ sparse = orig.to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(sparse.loc[2:],
+ orig.loc[2:].to_sparse(fill_value=0))
+
+ def test_iloc(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ assert sparse.iloc[3] == 3
+ assert np.isnan(sparse.iloc[2])
+
+ result = sparse.iloc[[1, 3, 4]]
+ exp = orig.iloc[[1, 3, 4]].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ result = sparse.iloc[[1, -2, -4]]
+ exp = orig.iloc[[1, -2, -4]].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ with pytest.raises(IndexError):
+ sparse.iloc[[1, 3, 5]]
+
+ def test_iloc_fill_value(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0])
+ sparse = orig.to_sparse(fill_value=0)
+
+ assert sparse.iloc[3] == 3
+ assert np.isnan(sparse.iloc[1])
+ assert sparse.iloc[4] == 0
+
+ result = sparse.iloc[[1, 3, 4]]
+ exp = orig.iloc[[1, 3, 4]].to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(result, exp)
+
+ def test_iloc_slice(self):
+ orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
+ sparse = orig.to_sparse()
+ tm.assert_sp_series_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse())
+
+ def test_iloc_slice_fill_value(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0])
+ sparse = orig.to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(sparse.iloc[2:],
+ orig.iloc[2:].to_sparse(fill_value=0))
+
+ def test_at(self):
+ orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
+ sparse = orig.to_sparse()
+ assert sparse.at[0] == orig.at[0]
+ assert np.isnan(sparse.at[1])
+ assert np.isnan(sparse.at[2])
+ assert sparse.at[3] == orig.at[3]
+ assert np.isnan(sparse.at[4])
+
+ orig = pd.Series([1, np.nan, np.nan, 3, np.nan],
+ index=list('abcde'))
+ sparse = orig.to_sparse()
+ assert sparse.at['a'] == orig.at['a']
+ assert np.isnan(sparse.at['b'])
+ assert np.isnan(sparse.at['c'])
+ assert sparse.at['d'] == orig.at['d']
+ assert np.isnan(sparse.at['e'])
+
+ def test_at_fill_value(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0],
+ index=list('abcde'))
+ sparse = orig.to_sparse(fill_value=0)
+ assert sparse.at['a'] == orig.at['a']
+ assert np.isnan(sparse.at['b'])
+ assert sparse.at['c'] == orig.at['c']
+ assert sparse.at['d'] == orig.at['d']
+ assert sparse.at['e'] == orig.at['e']
+
+ def test_iat(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ assert sparse.iat[0] == orig.iat[0]
+ assert np.isnan(sparse.iat[1])
+ assert np.isnan(sparse.iat[2])
+ assert sparse.iat[3] == orig.iat[3]
+ assert np.isnan(sparse.iat[4])
+
+ assert np.isnan(sparse.iat[-1])
+ assert sparse.iat[-5] == orig.iat[-5]
+
+ def test_iat_fill_value(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0])
+ sparse = orig.to_sparse()
+ assert sparse.iat[0] == orig.iat[0]
+ assert np.isnan(sparse.iat[1])
+ assert sparse.iat[2] == orig.iat[2]
+ assert sparse.iat[3] == orig.iat[3]
+ assert sparse.iat[4] == orig.iat[4]
+
+ assert sparse.iat[-1] == orig.iat[-1]
+ assert sparse.iat[-5] == orig.iat[-5]
+
+ def test_get(self):
+ s = pd.SparseSeries([1, np.nan, np.nan, 3, np.nan])
+ assert s.get(0) == 1
+ assert np.isnan(s.get(1))
+ assert s.get(5) is None
+
+ s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list('ABCDE'))
+ assert s.get('A') == 1
+ assert np.isnan(s.get('B'))
+ assert s.get('C') == 0
+ assert s.get('XX') is None
+
+ s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list('ABCDE'),
+ fill_value=0)
+ assert s.get('A') == 1
+ assert np.isnan(s.get('B'))
+ assert s.get('C') == 0
+ assert s.get('XX') is None
+
+ def test_take(self):
+ orig = pd.Series([1, np.nan, np.nan, 3, np.nan],
+ index=list('ABCDE'))
+ sparse = orig.to_sparse()
+
+ tm.assert_sp_series_equal(sparse.take([0]),
+ orig.take([0]).to_sparse())
+ tm.assert_sp_series_equal(sparse.take([0, 1, 3]),
+ orig.take([0, 1, 3]).to_sparse())
+ tm.assert_sp_series_equal(sparse.take([-1, -2]),
+ orig.take([-1, -2]).to_sparse())
+
+ def test_take_fill_value(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0],
+ index=list('ABCDE'))
+ sparse = orig.to_sparse(fill_value=0)
+
+ tm.assert_sp_series_equal(sparse.take([0]),
+ orig.take([0]).to_sparse(fill_value=0))
+
+ exp = orig.take([0, 1, 3]).to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(sparse.take([0, 1, 3]), exp)
+
+ exp = orig.take([-1, -2]).to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(sparse.take([-1, -2]), exp)
+
+ def test_reindex(self):
+ orig = pd.Series([1, np.nan, np.nan, 3, np.nan],
+ index=list('ABCDE'))
+ sparse = orig.to_sparse()
+
+ res = sparse.reindex(['A', 'E', 'C', 'D'])
+ exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse()
+ tm.assert_sp_series_equal(res, exp)
+
+ # all missing & fill_value
+ res = sparse.reindex(['B', 'E', 'C'])
+ exp = orig.reindex(['B', 'E', 'C']).to_sparse()
+ tm.assert_sp_series_equal(res, exp)
+
+ orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan],
+ index=list('ABCDE'))
+ sparse = orig.to_sparse()
+
+ res = sparse.reindex(['A', 'E', 'C', 'D'])
+ exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse()
+ tm.assert_sp_series_equal(res, exp)
+
+ def test_fill_value_reindex(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE'))
+ sparse = orig.to_sparse(fill_value=0)
+
+ res = sparse.reindex(['A', 'E', 'C', 'D'])
+ exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(res, exp)
+
+ # includes missing and fill_value
+ res = sparse.reindex(['A', 'B', 'C'])
+ exp = orig.reindex(['A', 'B', 'C']).to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(res, exp)
+
+ # all missing
+ orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan],
+ index=list('ABCDE'))
+ sparse = orig.to_sparse(fill_value=0)
+
+ res = sparse.reindex(['A', 'E', 'C', 'D'])
+ exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(res, exp)
+
+ # all fill_value: construction alone should not raise
+ orig = pd.Series([0., 0., 0., 0., 0.],
+ index=list('ABCDE'))
+ sparse = orig.to_sparse(fill_value=0)
+
+ def test_fill_value_reindex_coerces_float_int(self):
+ orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE'))
+ sparse = orig.to_sparse(fill_value=0)
+
+ res = sparse.reindex(['A', 'E', 'C', 'D'])
+ exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0)
+ tm.assert_sp_series_equal(res, exp)
+
+ def test_reindex_fill_value(self):
+ floats = pd.Series([1., 2., 3.]).to_sparse()
+ result = floats.reindex([1, 2, 3], fill_value=0)
+ expected = pd.Series([2., 3., 0], index=[1, 2, 3]).to_sparse()
+ tm.assert_sp_series_equal(result, expected)
+
+ def test_reindex_nearest(self):
+ s = pd.Series(np.arange(10, dtype='float64')).to_sparse()
+ target = [0.1, 0.9, 1.5, 2.0]
+ actual = s.reindex(target, method='nearest')
+ expected = pd.Series(np.around(target), target).to_sparse()
+ tm.assert_sp_series_equal(expected, actual)
+
+ actual = s.reindex(target, method='nearest', tolerance=0.2)
+ expected = pd.Series([0, 1, np.nan, 2], target).to_sparse()
+ tm.assert_sp_series_equal(expected, actual)
+
+ actual = s.reindex(target, method='nearest',
+ tolerance=[0.3, 0.01, 0.4, 3])
+ expected = pd.Series([0, np.nan, np.nan, 2], target).to_sparse()
+ tm.assert_sp_series_equal(expected, actual)
+
+ @pytest.mark.parametrize("kind", ["integer", "block"])
+ @pytest.mark.parametrize("fill", [True, False, np.nan])
+ def test_indexing_with_sparse(self, kind, fill):
+ # see gh-13985
+ arr = pd.SparseArray([1, 2, 3], kind=kind)
+ indexer = pd.SparseArray([True, False, True],
+ fill_value=fill,
+ dtype=bool)
+
+ expected = arr[indexer]
+ result = pd.SparseArray([1, 3], kind=kind)
+ tm.assert_sp_array_equal(result, expected)
+
+ s = pd.SparseSeries(arr, index=["a", "b", "c"], dtype=np.float64)
+ expected = pd.SparseSeries([1, 3], index=["a", "c"], kind=kind,
+ dtype=SparseDtype(np.float64, s.fill_value))
+
+ tm.assert_sp_series_equal(s[indexer], expected)
+ tm.assert_sp_series_equal(s.loc[indexer], expected)
+ tm.assert_sp_series_equal(s.iloc[indexer], expected)
+
+ indexer = pd.SparseSeries(indexer, index=["a", "b", "c"])
+ tm.assert_sp_series_equal(s[indexer], expected)
+ tm.assert_sp_series_equal(s.loc[indexer], expected)
+
+ msg = ("iLocation based boolean indexing cannot "
+ "use an indexable as a mask")
+ with pytest.raises(ValueError, match=msg):
+ s.iloc[indexer]
+
+
+class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing):
+
+ def setup_method(self, method):
+ # MultiIndex with duplicated values
+ idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
+ ('C', 0), ('C', 1)])
+ self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx)
+ self.sparse = self.orig.to_sparse()
+
+ def test_getitem_multi(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ assert sparse[0] == orig[0]
+ assert np.isnan(sparse[1])
+ assert sparse[3] == orig[3]
+
+ tm.assert_sp_series_equal(sparse['A'], orig['A'].to_sparse())
+ tm.assert_sp_series_equal(sparse['B'], orig['B'].to_sparse())
+
+ result = sparse[[1, 3, 4]]
+ exp = orig[[1, 3, 4]].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # dense array
+ result = sparse[orig % 2 == 1]
+ exp = orig[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array (actually it coerces to a normal Series)
+ result = sparse[sparse % 2 == 1]
+ exp = orig[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array
+ result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
+ tm.assert_sp_series_equal(result, exp)
+
+ def test_getitem_multi_tuple(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ assert sparse['C', 0] == orig['C', 0]
+ assert np.isnan(sparse['A', 1])
+ assert np.isnan(sparse['B', 0])
+
+ def test_getitems_slice_multi(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ tm.assert_sp_series_equal(sparse[2:], orig[2:].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse())
+
+ tm.assert_sp_series_equal(sparse.loc['A':'B'],
+ orig.loc['A':'B'].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse())
+
+ def test_loc(self):
+ # needs to be overridden to use different labels
+ orig = self.orig
+ sparse = self.sparse
+
+ tm.assert_sp_series_equal(sparse.loc['A'],
+ orig.loc['A'].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc['B'],
+ orig.loc['B'].to_sparse())
+
+ result = sparse.loc[[1, 3, 4]]
+ exp = orig.loc[[1, 3, 4]].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # exceeds the bounds
+ result = sparse.loc[[1, 3, 4, 5]]
+ exp = orig.loc[[1, 3, 4, 5]].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # single element list (GH 15447)
+ result = sparse.loc[['A']]
+ exp = orig.loc[['A']].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # dense array
+ result = sparse.loc[orig % 2 == 1]
+ exp = orig.loc[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array (actually it coerces to a normal Series)
+ result = sparse.loc[sparse % 2 == 1]
+ exp = orig.loc[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array
+ result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)]
+ tm.assert_sp_series_equal(result, exp)
+
+ def test_loc_multi_tuple(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ assert sparse.loc['C', 0] == orig.loc['C', 0]
+ assert np.isnan(sparse.loc['A', 1])
+ assert np.isnan(sparse.loc['B', 0])
+
+ def test_loc_slice(self):
+ orig = self.orig
+ sparse = self.sparse
+ tm.assert_sp_series_equal(sparse.loc['A':], orig.loc['A':].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse())
+
+ tm.assert_sp_series_equal(sparse.loc['A':'B'],
+ orig.loc['A':'B'].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse())
+
+ def test_reindex(self):
+ # GH 15447
+ orig = self.orig
+ sparse = self.sparse
+
+ res = sparse.reindex([('A', 0), ('C', 1)])
+ exp = orig.reindex([('A', 0), ('C', 1)]).to_sparse()
+ tm.assert_sp_series_equal(res, exp)
+
+ # On specific level:
+ res = sparse.reindex(['A', 'C', 'B'], level=0)
+ exp = orig.reindex(['A', 'C', 'B'], level=0).to_sparse()
+ tm.assert_sp_series_equal(res, exp)
+
+ # single element list (GH 15447)
+ res = sparse.reindex(['A'], level=0)
+ exp = orig.reindex(['A'], level=0).to_sparse()
+ tm.assert_sp_series_equal(res, exp)
+
+ with pytest.raises(TypeError):
+ # Incomplete keys are not accepted for reindexing:
+ sparse.reindex(['A', 'C'])
+
+ # "copy" argument:
+ res = sparse.reindex(sparse.index, copy=True)
+ exp = orig.reindex(orig.index, copy=True).to_sparse()
+ tm.assert_sp_series_equal(res, exp)
+ assert sparse is not res
+
+
+class TestSparseDataFrameIndexing(object):
+
+ def test_getitem(self):
+ orig = pd.DataFrame([[1, np.nan, np.nan],
+ [2, 3, np.nan],
+ [np.nan, np.nan, 4],
+ [0, np.nan, 5]],
+ columns=list('xyz'))
+ sparse = orig.to_sparse()
+
+ tm.assert_sp_series_equal(sparse['x'], orig['x'].to_sparse())
+ tm.assert_sp_frame_equal(sparse[['x']], orig[['x']].to_sparse())
+ tm.assert_sp_frame_equal(sparse[['z', 'x']],
+ orig[['z', 'x']].to_sparse())
+
+ tm.assert_sp_frame_equal(sparse[[True, False, True, True]],
+ orig[[True, False, True, True]].to_sparse())
+
+ tm.assert_sp_frame_equal(sparse.iloc[[1, 2]],
+ orig.iloc[[1, 2]].to_sparse())
+
+ def test_getitem_fill_value(self):
+ orig = pd.DataFrame([[1, np.nan, 0],
+ [2, 3, np.nan],
+ [0, np.nan, 4],
+ [0, np.nan, 5]],
+ columns=list('xyz'))
+ sparse = orig.to_sparse(fill_value=0)
+
+ result = sparse[['z']]
+ expected = orig[['z']].to_sparse(fill_value=0)
+ tm.assert_sp_frame_equal(result, expected, check_fill_value=False)
+
+ tm.assert_sp_series_equal(sparse['y'],
+ orig['y'].to_sparse(fill_value=0))
+
+ exp = orig[['x']].to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(sparse[['x']], exp)
+
+ exp = orig[['z', 'x']].to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(sparse[['z', 'x']], exp)
+
+ indexer = [True, False, True, True]
+ exp = orig[indexer].to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(sparse[indexer], exp)
+
+ exp = orig.iloc[[1, 2]].to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], exp)
+
+ def test_loc(self):
+ orig = pd.DataFrame([[1, np.nan, np.nan],
+ [2, 3, np.nan],
+ [np.nan, np.nan, 4]],
+ columns=list('xyz'))
+ sparse = orig.to_sparse()
+
+ assert sparse.loc[0, 'x'] == 1
+ assert np.isnan(sparse.loc[1, 'z'])
+ assert sparse.loc[2, 'z'] == 4
+
+ # have to specify `kind='integer'`, since we construct a
+ # new SparseArray here, and the default sparse type is
+ # integer there, but block in SparseSeries
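+ # (orig.loc[0].to_sparse() without kind= would default to 'block' and
+ # the comparisons below would fail)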
+ tm.assert_sp_series_equal(sparse.loc[0],
+ orig.loc[0].to_sparse(kind='integer'))
+ tm.assert_sp_series_equal(sparse.loc[1],
+ orig.loc[1].to_sparse(kind='integer'))
+ tm.assert_sp_series_equal(sparse.loc[2, :],
+ orig.loc[2, :].to_sparse(kind='integer'))
+ tm.assert_sp_series_equal(sparse.loc[:, 'y'],
+ orig.loc[:, 'y'].to_sparse())
+
+ result = sparse.loc[[1, 2]]
+ exp = orig.loc[[1, 2]].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ result = sparse.loc[[1, 2], :]
+ exp = orig.loc[[1, 2], :].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ result = sparse.loc[:, ['x', 'z']]
+ exp = orig.loc[:, ['x', 'z']].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ result = sparse.loc[[0, 2], ['x', 'z']]
+ exp = orig.loc[[0, 2], ['x', 'z']].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ # exceeds the bounds
+ result = sparse.reindex([1, 3, 4, 5])
+ exp = orig.reindex([1, 3, 4, 5]).to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ # dense array
+ result = sparse.loc[orig.x % 2 == 1]
+ exp = orig.loc[orig.x % 2 == 1].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ # sparse array (actually it coerces to a normal Series)
+ result = sparse.loc[sparse.x % 2 == 1]
+ exp = orig.loc[orig.x % 2 == 1].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ # sparse array
+ result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)]
+ tm.assert_sp_frame_equal(result, exp)
+
+ def test_loc_index(self):
+ orig = pd.DataFrame([[1, np.nan, np.nan],
+ [2, 3, np.nan],
+ [np.nan, np.nan, 4]],
+ index=list('abc'), columns=list('xyz'))
+ sparse = orig.to_sparse()
+
+ assert sparse.loc['a', 'x'] == 1
+ assert np.isnan(sparse.loc['b', 'z'])
+ assert sparse.loc['c', 'z'] == 4
+
+ tm.assert_sp_series_equal(sparse.loc['a'],
+ orig.loc['a'].to_sparse(kind='integer'))
+ tm.assert_sp_series_equal(sparse.loc['b'],
+ orig.loc['b'].to_sparse(kind='integer'))
+ tm.assert_sp_series_equal(sparse.loc['b', :],
+ orig.loc['b', :].to_sparse(kind='integer'))
+
+ tm.assert_sp_series_equal(sparse.loc[:, 'z'],
+ orig.loc[:, 'z'].to_sparse())
+
+ result = sparse.loc[['a', 'b']]
+ exp = orig.loc[['a', 'b']].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ result = sparse.loc[['a', 'b'], :]
+ exp = orig.loc[['a', 'b'], :].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ result = sparse.loc[:, ['x', 'z']]
+ exp = orig.loc[:, ['x', 'z']].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ result = sparse.loc[['c', 'a'], ['x', 'z']]
+ exp = orig.loc[['c', 'a'], ['x', 'z']].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ # dense array
+ result = sparse.loc[orig.x % 2 == 1]
+ exp = orig.loc[orig.x % 2 == 1].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ # sparse array (actually it coerces to a normal Series)
+ result = sparse.loc[sparse.x % 2 == 1]
+ exp = orig.loc[orig.x % 2 == 1].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ # sparse array
+ result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)]
+ tm.assert_sp_frame_equal(result, exp)
+
+ def test_loc_slice(self):
+ orig = pd.DataFrame([[1, np.nan, np.nan],
+ [2, 3, np.nan],
+ [np.nan, np.nan, 4]],
+ columns=list('xyz'))
+ sparse = orig.to_sparse()
+ tm.assert_sp_frame_equal(sparse.loc[2:], orig.loc[2:].to_sparse())
+
+ def test_iloc(self):
+ orig = pd.DataFrame([[1, np.nan, np.nan],
+ [2, 3, np.nan],
+ [np.nan, np.nan, 4]])
+ sparse = orig.to_sparse()
+
+ assert sparse.iloc[1, 1] == 3
+ assert np.isnan(sparse.iloc[2, 0])
+
+ tm.assert_sp_series_equal(sparse.iloc[0],
+ orig.loc[0].to_sparse(kind='integer'))
+ tm.assert_sp_series_equal(sparse.iloc[1],
+ orig.loc[1].to_sparse(kind='integer'))
+ tm.assert_sp_series_equal(sparse.iloc[2, :],
+ orig.iloc[2, :].to_sparse(kind='integer'))
+ tm.assert_sp_series_equal(sparse.iloc[:, 1],
+ orig.iloc[:, 1].to_sparse())
+
+ result = sparse.iloc[[1, 2]]
+ exp = orig.iloc[[1, 2]].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ result = sparse.iloc[[1, 2], :]
+ exp = orig.iloc[[1, 2], :].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ result = sparse.iloc[:, [1, 0]]
+ exp = orig.iloc[:, [1, 0]].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ result = sparse.iloc[[2], [1, 0]]
+ exp = orig.iloc[[2], [1, 0]].to_sparse()
+ tm.assert_sp_frame_equal(result, exp)
+
+ with pytest.raises(IndexError):
+ sparse.iloc[[1, 3, 5]]
+
+ def test_iloc_slice(self):
+ orig = pd.DataFrame([[1, np.nan, np.nan],
+ [2, 3, np.nan],
+ [np.nan, np.nan, 4]],
+ columns=list('xyz'))
+ sparse = orig.to_sparse()
+ tm.assert_sp_frame_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse())
+
+ def test_at(self):
+ orig = pd.DataFrame([[1, np.nan, 0],
+ [2, 3, np.nan],
+ [0, np.nan, 4],
+ [0, np.nan, 5]],
+ index=list('ABCD'), columns=list('xyz'))
+ sparse = orig.to_sparse()
+ assert sparse.at['A', 'x'] == orig.at['A', 'x']
+ assert np.isnan(sparse.at['B', 'z'])
+ assert np.isnan(sparse.at['C', 'y'])
+ assert sparse.at['D', 'x'] == orig.at['D', 'x']
+
+ def test_at_fill_value(self):
+ orig = pd.DataFrame([[1, np.nan, 0],
+ [2, 3, np.nan],
+ [0, np.nan, 4],
+ [0, np.nan, 5]],
+ index=list('ABCD'), columns=list('xyz'))
+ sparse = orig.to_sparse(fill_value=0)
+ assert sparse.at['A', 'x'] == orig.at['A', 'x']
+ assert np.isnan(sparse.at['B', 'z'])
+ assert np.isnan(sparse.at['C', 'y'])
+ assert sparse.at['D', 'x'] == orig.at['D', 'x']
+
+ def test_iat(self):
+ orig = pd.DataFrame([[1, np.nan, 0],
+ [2, 3, np.nan],
+ [0, np.nan, 4],
+ [0, np.nan, 5]],
+ index=list('ABCD'), columns=list('xyz'))
+ sparse = orig.to_sparse()
+ assert sparse.iat[0, 0] == orig.iat[0, 0]
+ assert np.isnan(sparse.iat[1, 2])
+ assert np.isnan(sparse.iat[2, 1])
+ assert sparse.iat[2, 0] == orig.iat[2, 0]
+
+ assert np.isnan(sparse.iat[-1, -2])
+ assert sparse.iat[-1, -1] == orig.iat[-1, -1]
+
+ def test_iat_fill_value(self):
+ orig = pd.DataFrame([[1, np.nan, 0],
+ [2, 3, np.nan],
+ [0, np.nan, 4],
+ [0, np.nan, 5]],
+ index=list('ABCD'), columns=list('xyz'))
+ sparse = orig.to_sparse(fill_value=0)
+ assert sparse.iat[0, 0] == orig.iat[0, 0]
+ assert np.isnan(sparse.iat[1, 2])
+ assert np.isnan(sparse.iat[2, 1])
+ assert sparse.iat[2, 0] == orig.iat[2, 0]
+
+ assert np.isnan(sparse.iat[-1, -2])
+ assert sparse.iat[-1, -1] == orig.iat[-1, -1]
+
+ def test_take(self):
+ orig = pd.DataFrame([[1, np.nan, 0],
+ [2, 3, np.nan],
+ [0, np.nan, 4],
+ [0, np.nan, 5]],
+ columns=list('xyz'))
+ sparse = orig.to_sparse()
+
+ tm.assert_sp_frame_equal(sparse.take([0]),
+ orig.take([0]).to_sparse())
+ tm.assert_sp_frame_equal(sparse.take([0, 1]),
+ orig.take([0, 1]).to_sparse())
+ tm.assert_sp_frame_equal(sparse.take([-1, -2]),
+ orig.take([-1, -2]).to_sparse())
+
+ def test_take_fill_value(self):
+ orig = pd.DataFrame([[1, np.nan, 0],
+ [2, 3, np.nan],
+ [0, np.nan, 4],
+ [0, np.nan, 5]],
+ columns=list('xyz'))
+ sparse = orig.to_sparse(fill_value=0)
+
+ exp = orig.take([0]).to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(sparse.take([0]), exp)
+
+ exp = orig.take([0, 1]).to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(sparse.take([0, 1]), exp)
+
+ exp = orig.take([-1, -2]).to_sparse(fill_value=0)
+ exp._default_fill_value = np.nan
+ tm.assert_sp_frame_equal(sparse.take([-1, -2]), exp)
+
+ def test_reindex(self):
+ orig = pd.DataFrame([[1, np.nan, 0],
+ [2, 3, np.nan],
+ [0, np.nan, 4],
+ [0, np.nan, 5]],
+ index=list('ABCD'), columns=list('xyz'))
+ sparse = orig.to_sparse()
+
+ res = sparse.reindex(['A', 'C', 'B'])
+ exp = orig.reindex(['A', 'C', 'B']).to_sparse()
+ tm.assert_sp_frame_equal(res, exp)
+
+ orig = pd.DataFrame([[np.nan, np.nan, np.nan],
+ [np.nan, np.nan, np.nan],
+ [np.nan, np.nan, np.nan],
+ [np.nan, np.nan, np.nan]],
+ index=list('ABCD'), columns=list('xyz'))
+ sparse = orig.to_sparse()
+
+ res = sparse.reindex(['A', 'C', 'B'])
+ exp = orig.reindex(['A', 'C', 'B']).to_sparse()
+ tm.assert_sp_frame_equal(res, exp)
+
+ def test_reindex_fill_value(self):
+ orig = pd.DataFrame([[1, np.nan, 0],
+ [2, 3, np.nan],
+ [0, np.nan, 4],
+ [0, np.nan, 5]],
+ index=list('ABCD'), columns=list('xyz'))
+ sparse = orig.to_sparse(fill_value=0)
+
+ res = sparse.reindex(['A', 'C', 'B'])
+ exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0)
+ tm.assert_sp_frame_equal(res, exp)
+
+ # all missing
+ orig = pd.DataFrame([[np.nan, np.nan, np.nan],
+ [np.nan, np.nan, np.nan],
+ [np.nan, np.nan, np.nan],
+ [np.nan, np.nan, np.nan]],
+ index=list('ABCD'), columns=list('xyz'))
+ sparse = orig.to_sparse(fill_value=0)
+
+ res = sparse.reindex(['A', 'C', 'B'])
+ exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0)
+ tm.assert_sp_frame_equal(res, exp)
+
+ # all fill_value
+ orig = pd.DataFrame([[0, 0, 0],
+ [0, 0, 0],
+ [0, 0, 0],
+ [0, 0, 0]],
+ index=list('ABCD'), columns=list('xyz'),
+ dtype=np.int)
+ sparse = orig.to_sparse(fill_value=0)
+
+ res = sparse.reindex(['A', 'C', 'B'])
+ exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0)
+ tm.assert_sp_frame_equal(res, exp)
+
+
+class TestMultitype(object):
+
+ def setup_method(self, method):
+ self.cols = ['string', 'int', 'float', 'object']
+
+ self.string_series = pd.SparseSeries(['a', 'b', 'c'])
+ self.int_series = pd.SparseSeries([1, 2, 3])
+ self.float_series = pd.SparseSeries([1.1, 1.2, 1.3])
+ self.object_series = pd.SparseSeries([[], {}, set()])
+ self.sdf = pd.SparseDataFrame({
+ 'string': self.string_series,
+ 'int': self.int_series,
+ 'float': self.float_series,
+ 'object': self.object_series,
+ })
+ self.sdf = self.sdf[self.cols]
+ self.ss = pd.SparseSeries(['a', 1, 1.1, []], index=self.cols)
+
+ def test_frame_basic_dtypes(self):
+ for _, row in self.sdf.iterrows():
+ assert row.dtype == SparseDtype(object)
+ tm.assert_sp_series_equal(self.sdf['string'], self.string_series,
+ check_names=False)
+ tm.assert_sp_series_equal(self.sdf['int'], self.int_series,
+ check_names=False)
+ tm.assert_sp_series_equal(self.sdf['float'], self.float_series,
+ check_names=False)
+ tm.assert_sp_series_equal(self.sdf['object'], self.object_series,
+ check_names=False)
+
+ def test_frame_indexing_single(self):
+ tm.assert_sp_series_equal(self.sdf.iloc[0],
+ pd.SparseSeries(['a', 1, 1.1, []],
+ index=self.cols),
+ check_names=False)
+ tm.assert_sp_series_equal(self.sdf.iloc[1],
+ pd.SparseSeries(['b', 2, 1.2, {}],
+ index=self.cols),
+ check_names=False)
+ tm.assert_sp_series_equal(self.sdf.iloc[2],
+ pd.SparseSeries(['c', 3, 1.3, set()],
+ index=self.cols),
+ check_names=False)
+
+ def test_frame_indexing_multiple(self):
+ tm.assert_sp_frame_equal(self.sdf, self.sdf[:])
+ tm.assert_sp_frame_equal(self.sdf, self.sdf.loc[:])
+ tm.assert_sp_frame_equal(self.sdf.iloc[[1, 2]],
+ pd.SparseDataFrame({
+ 'string': self.string_series.iloc[[1, 2]],
+ 'int': self.int_series.iloc[[1, 2]],
+ 'float': self.float_series.iloc[[1, 2]],
+ 'object': self.object_series.iloc[[1, 2]]
+ }, index=[1, 2])[self.cols])
+ tm.assert_sp_frame_equal(self.sdf[['int', 'string']],
+ pd.SparseDataFrame({
+ 'int': self.int_series,
+ 'string': self.string_series,
+ }))
+
+ def test_series_indexing_single(self):
+ for i, idx in enumerate(self.cols):
+ assert self.ss.iloc[i] == self.ss[idx]
+ tm.assert_class_equal(self.ss.iloc[i], self.ss[idx],
+ obj="series index")
+
+ assert self.ss['string'] == 'a'
+ assert self.ss['int'] == 1
+ assert self.ss['float'] == 1.1
+ assert self.ss['object'] == []
+
+ def test_series_indexing_multiple(self):
+ tm.assert_sp_series_equal(self.ss.loc[['string', 'int']],
+ pd.SparseSeries(['a', 1],
+ index=['string', 'int']))
+ tm.assert_sp_series_equal(self.ss.loc[['string', 'object']],
+ pd.SparseSeries(['a', []],
+ index=['string', 'object']))
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/test_pivot.py b/contrib/python/pandas/py2/pandas/tests/sparse/test_pivot.py
new file mode 100644
index 00000000000..af7de43ec0f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/test_pivot.py
@@ -0,0 +1,52 @@
+import numpy as np
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+class TestPivotTable(object):
+
+ def setup_method(self, method):
+ self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'three',
+ 'two', 'two', 'one', 'three'],
+ 'C': np.random.randn(8),
+ 'D': np.random.randn(8),
+ 'E': [np.nan, np.nan, 1, 2,
+ np.nan, 1, np.nan, np.nan]})
+ self.sparse = self.dense.to_sparse()
+
+ def test_pivot_table(self):
+ res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
+ values='C')
+ res_dense = pd.pivot_table(self.dense, index='A', columns='B',
+ values='C')
+ tm.assert_frame_equal(res_sparse, res_dense)
+
+ res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
+ values='E')
+ res_dense = pd.pivot_table(self.dense, index='A', columns='B',
+ values='E')
+ tm.assert_frame_equal(res_sparse, res_dense)
+
+ res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
+ values='E', aggfunc='mean')
+ res_dense = pd.pivot_table(self.dense, index='A', columns='B',
+ values='E', aggfunc='mean')
+ tm.assert_frame_equal(res_sparse, res_dense)
+
+ # TODO: sum doesn't handle nan properly
+ # res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
+ # values='E', aggfunc='sum')
+ # res_dense = pd.pivot_table(self.dense, index='A', columns='B',
+ # values='E', aggfunc='sum')
+ # tm.assert_frame_equal(res_sparse, res_dense)
+
+ def test_pivot_table_multi(self):
+ res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
+ values=['D', 'E'])
+ res_dense = pd.pivot_table(self.dense, index='A', columns='B',
+ values=['D', 'E'])
+ res_dense = res_dense.apply(lambda x: x.astype("Sparse[float64]"))
+ tm.assert_frame_equal(res_sparse, res_dense)
diff --git a/contrib/python/pandas/py2/pandas/tests/sparse/test_reshape.py b/contrib/python/pandas/py2/pandas/tests/sparse/test_reshape.py
new file mode 100644
index 00000000000..6830e40ce65
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/sparse/test_reshape.py
@@ -0,0 +1,42 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+@pytest.fixture
+def sparse_df():
+ return pd.SparseDataFrame({0: {0: 1}, 1: {1: 1}, 2: {2: 1}}) # eye
+
+
+@pytest.fixture
+def multi_index3():
+ return pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)])
+
+
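+# pytest resolves sparse_df and multi_index3 by parameter name, so each
+# test below receives a fresh copy of the fixture's return value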
+def test_sparse_frame_stack(sparse_df, multi_index3):
+ ss = sparse_df.stack()
+ expected = pd.SparseSeries(np.ones(3), index=multi_index3)
+ tm.assert_sp_series_equal(ss, expected)
+
+
+def test_sparse_frame_unstack(sparse_df):
+ mi = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)])
+ sparse_df.index = mi
+ arr = np.array([[1, np.nan, np.nan],
+ [np.nan, 1, np.nan],
+ [np.nan, np.nan, 1]])
+ unstacked_df = pd.DataFrame(arr, index=mi).unstack()
+ unstacked_sdf = sparse_df.unstack()
+
+ tm.assert_numpy_array_equal(unstacked_df.values, unstacked_sdf.values)
+
+
+def test_sparse_series_unstack(sparse_df, multi_index3):
+ frame = pd.SparseSeries(np.ones(3), index=multi_index3).unstack()
+
+ arr = np.array([1, np.nan, np.nan])
+ arrays = {i: pd.SparseArray(np.roll(arr, i)) for i in range(3)}
+ expected = pd.DataFrame(arrays)
+ tm.assert_frame_equal(frame, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/test_algos.py b/contrib/python/pandas/py2/pandas/tests/test_algos.py
new file mode 100644
index 00000000000..919c4702b12
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_algos.py
@@ -0,0 +1,1881 @@
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+from itertools import permutations
+import struct
+
+import numpy as np
+from numpy import nan
+from numpy.random import RandomState
+import pytest
+
+from pandas._libs import (
+ algos as libalgos, groupby as libgroupby, hashtable as ht)
+from pandas.compat import lrange, range
+from pandas.compat.numpy import np_array_datetime64_compat
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.dtypes import CategoricalDtype as CDT
+
+import pandas as pd
+from pandas import (
+ Categorical, CategoricalIndex, DatetimeIndex, Index, IntervalIndex, Series,
+ Timestamp, compat)
+import pandas.core.algorithms as algos
+from pandas.core.arrays import DatetimeArray
+import pandas.core.common as com
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal
+
+
+class TestMatch(object):
+
+ def test_ints(self):
+ values = np.array([0, 2, 1])
+ to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0])
+
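+ # match() returns, for each element of to_match, its position in
+ # values, or -1 when absent (3 never occurs in values, hence the -1)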
+ result = algos.match(to_match, values)
+ expected = np.array([0, 2, 1, 1, 0, 2, -1, 0], dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = Series(algos.match(to_match, values, np.nan))
+ expected = Series(np.array([0, 2, 1, 1, 0, 2, np.nan, 0]))
+ tm.assert_series_equal(result, expected)
+
+ s = Series(np.arange(5), dtype=np.float32)
+ result = algos.match(s, [2, 4])
+ expected = np.array([-1, -1, 0, -1, 1], dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = Series(algos.match(s, [2, 4], np.nan))
+ expected = Series(np.array([np.nan, np.nan, 0, np.nan, 1]))
+ tm.assert_series_equal(result, expected)
+
+ def test_strings(self):
+ values = ['foo', 'bar', 'baz']
+ to_match = ['bar', 'foo', 'qux', 'foo', 'bar', 'baz', 'qux']
+
+ result = algos.match(to_match, values)
+ expected = np.array([1, 0, -1, 0, 1, 2, -1], dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = Series(algos.match(to_match, values, np.nan))
+ expected = Series(np.array([1, 0, np.nan, 0, 1, 2, np.nan]))
+ tm.assert_series_equal(result, expected)
+
+
+class TestFactorize(object):
+
+ def test_basic(self):
+
+ labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c',
+ 'c'])
+ tm.assert_numpy_array_equal(
+ uniques, np.array(['a', 'b', 'c'], dtype=object))
+
+ labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
+ 'a', 'c', 'c', 'c'], sort=True)
+ exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ exp = np.array(['a', 'b', 'c'], dtype=object)
+ tm.assert_numpy_array_equal(uniques, exp)
+
+ labels, uniques = algos.factorize(list(reversed(range(5))))
+ exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
+ tm.assert_numpy_array_equal(uniques, exp)
+
+ labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
+
+ exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
+ tm.assert_numpy_array_equal(uniques, exp)
+
+ labels, uniques = algos.factorize(list(reversed(np.arange(5.))))
+ exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ exp = np.array([4., 3., 2., 1., 0.], dtype=np.float64)
+ tm.assert_numpy_array_equal(uniques, exp)
+
+ labels, uniques = algos.factorize(list(reversed(np.arange(5.))),
+ sort=True)
+ exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ exp = np.array([0., 1., 2., 3., 4.], dtype=np.float64)
+ tm.assert_numpy_array_equal(uniques, exp)
+
+ def test_mixed(self):
+
+ # doc example from reshaping.rst
+ x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
+ labels, uniques = algos.factorize(x)
+
+ exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ exp = Index(['A', 'B', 3.14, np.inf])
+ tm.assert_index_equal(uniques, exp)
+
+ labels, uniques = algos.factorize(x, sort=True)
+ exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ exp = Index([3.14, np.inf, 'A', 'B'])
+ tm.assert_index_equal(uniques, exp)
+
+ def test_datelike(self):
+
+ # M8
+ v1 = Timestamp('20130101 09:00:00.00004')
+ v2 = Timestamp('20130101')
+ x = Series([v1, v1, v1, v2, v2, v1])
+ labels, uniques = algos.factorize(x)
+
+ exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ exp = DatetimeIndex([v1, v2])
+ tm.assert_index_equal(uniques, exp)
+
+ labels, uniques = algos.factorize(x, sort=True)
+ exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ exp = DatetimeIndex([v2, v1])
+ tm.assert_index_equal(uniques, exp)
+
+ # period
+ v1 = pd.Period('201302', freq='M')
+ v2 = pd.Period('201303', freq='M')
+ x = Series([v1, v1, v1, v2, v2, v1])
+
+ # periods are not 'sorted' as they are converted back into an index
+ labels, uniques = algos.factorize(x)
+ exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
+
+ labels, uniques = algos.factorize(x, sort=True)
+ exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
+
+ # GH 5986
+ v1 = pd.to_timedelta('1 day 1 min')
+ v2 = pd.to_timedelta('1 day')
+ x = Series([v1, v2, v1, v1, v2, v2, v1])
+ labels, uniques = algos.factorize(x)
+ exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2]))
+
+ labels, uniques = algos.factorize(x, sort=True)
+ exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
+ tm.assert_numpy_array_equal(labels, exp)
+ tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1]))
+
+ def test_factorize_nan(self):
+ # nan should map to na_sentinel, not to reverse_indexer[na_sentinel];
+ # rizer.factorize should not raise an exception even when na_sentinel
+ # indexes outside the bounds of reverse_indexer
+ key = np.array([1, 2, 1, np.nan], dtype='O')
+ rizer = ht.Factorizer(len(key))
+ for na_sentinel in (-1, 20):
+ ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel)
+ expected = np.array([0, 1, 0, na_sentinel], dtype='int32')
+ assert len(set(key)) == len(set(expected))
+ tm.assert_numpy_array_equal(pd.isna(key),
+ expected == na_sentinel)
+
+ # nan still maps to na_sentinel when sort=False
+ key = np.array([0, np.nan, 1], dtype='O')
+ na_sentinel = -1
+
+ # TODO(wesm): unused?
+ ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel) # noqa
+
+ expected = np.array([2, -1, 0], dtype='int32')
+ assert len(set(key)) == len(set(expected))
+ tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)
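+
+ # a minimal illustrative sketch of the mapping asserted above (not
+ # executed here): with the default sentinel of -1,
+ # pd.factorize(np.array(['b', np.nan, 'b'], dtype=object)) gives
+ # labels [0, -1, 0] and uniques ['b'] -- the missing value maps to
+ # the sentinel and never appears in the uniques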
+
+ @pytest.mark.parametrize("data,expected_label,expected_level", [
+ (
+ [(1, 1), (1, 2), (0, 0), (1, 2), 'nonsense'],
+ [0, 1, 2, 1, 3],
+ [(1, 1), (1, 2), (0, 0), 'nonsense']
+ ),
+ (
+ [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)],
+ [0, 1, 2, 1, 3],
+ [(1, 1), (1, 2), (0, 0), (1, 2, 3)]
+ ),
+ (
+ [(1, 1), (1, 2), (0, 0), (1, 2)],
+ [0, 1, 2, 1],
+ [(1, 1), (1, 2), (0, 0)]
+ )
+ ])
+ def test_factorize_tuple_list(self, data, expected_label, expected_level):
+ # GH9454
+ result = pd.factorize(data)
+
+ tm.assert_numpy_array_equal(result[0],
+ np.array(expected_label, dtype=np.intp))
+
+ expected_level_array = com.asarray_tuplesafe(expected_level,
+ dtype=object)
+ tm.assert_numpy_array_equal(result[1], expected_level_array)
+
+ def test_complex_sorting(self):
+ # gh 12666 - check no segfault
+ x17 = np.array([complex(i) for i in range(17)], dtype=object)
+
+ pytest.raises(TypeError, algos.factorize, x17[::-1], sort=True)
+
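+ # note: the "writable" argument used by several tests below is a
+ # pytest fixture (presumably defined in the suite's conftest) that
+ # parametrizes the array's WRITEABLE flag, covering the read-only
+ # views of GH 21688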
+ def test_float64_factorize(self, writable):
+ data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
+ data.setflags(write=writable)
+ exp_labels = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
+ exp_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)
+
+ labels, uniques = algos.factorize(data)
+ tm.assert_numpy_array_equal(labels, exp_labels)
+ tm.assert_numpy_array_equal(uniques, exp_uniques)
+
+ def test_uint64_factorize(self, writable):
+ data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64)
+ data.setflags(write=writable)
+ exp_labels = np.array([0, 1, 0], dtype=np.intp)
+ exp_uniques = np.array([2**64 - 1, 1], dtype=np.uint64)
+
+ labels, uniques = algos.factorize(data)
+ tm.assert_numpy_array_equal(labels, exp_labels)
+ tm.assert_numpy_array_equal(uniques, exp_uniques)
+
+ def test_int64_factorize(self, writable):
+ data = np.array([2**63 - 1, -2**63, 2**63 - 1], dtype=np.int64)
+ data.setflags(write=writable)
+ exp_labels = np.array([0, 1, 0], dtype=np.intp)
+ exp_uniques = np.array([2**63 - 1, -2**63], dtype=np.int64)
+
+ labels, uniques = algos.factorize(data)
+ tm.assert_numpy_array_equal(labels, exp_labels)
+ tm.assert_numpy_array_equal(uniques, exp_uniques)
+
+ def test_string_factorize(self, writable):
+ data = np.array(['a', 'c', 'a', 'b', 'c'],
+ dtype=object)
+ data.setflags(write=writable)
+ exp_labels = np.array([0, 1, 0, 2, 1], dtype=np.intp)
+ exp_uniques = np.array(['a', 'c', 'b'], dtype=object)
+
+ labels, uniques = algos.factorize(data)
+ tm.assert_numpy_array_equal(labels, exp_labels)
+ tm.assert_numpy_array_equal(uniques, exp_uniques)
+
+ def test_object_factorize(self, writable):
+ data = np.array(['a', 'c', None, np.nan, 'a', 'b', pd.NaT, 'c'],
+ dtype=object)
+ data.setflags(write=writable)
+ exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
+ exp_uniques = np.array(['a', 'c', 'b'], dtype=object)
+
+ labels, uniques = algos.factorize(data)
+ tm.assert_numpy_array_equal(labels, exp_labels)
+ tm.assert_numpy_array_equal(uniques, exp_uniques)
+
+ def test_deprecate_order(self):
+ # gh 19727 - check warning is raised for deprecated keyword, order.
+ # Test not valid once order keyword is removed.
+ data = np.array([2**63, 1, 2**63], dtype=np.uint64)
+ with tm.assert_produces_warning(expected_warning=FutureWarning):
+ algos.factorize(data, order=True)
+ with tm.assert_produces_warning(False):
+ algos.factorize(data)
+
+ @pytest.mark.parametrize('data', [
+ np.array([0, 1, 0], dtype='u8'),
+ np.array([-2**63, 1, -2**63], dtype='i8'),
+ np.array(['__nan__', 'foo', '__nan__'], dtype='object'),
+ ])
+ def test_parametrized_factorize_na_value_default(self, data):
+ # arrays that include the NA default for that type, but it is not used as NA.
+ l, u = algos.factorize(data)
+ expected_uniques = data[[0, 1]]
+ expected_labels = np.array([0, 1, 0], dtype=np.intp)
+ tm.assert_numpy_array_equal(l, expected_labels)
+ tm.assert_numpy_array_equal(u, expected_uniques)
+
+ @pytest.mark.parametrize('data, na_value', [
+ (np.array([0, 1, 0, 2], dtype='u8'), 0),
+ (np.array([1, 0, 1, 2], dtype='u8'), 1),
+ (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63),
+ (np.array([1, -2**63, 1, 0], dtype='i8'), 1),
+ (np.array(['a', '', 'a', 'b'], dtype=object), 'a'),
+ (np.array([(), ('a', 1), (), ('a', 2)], dtype=object), ()),
+ (np.array([('a', 1), (), ('a', 1), ('a', 2)], dtype=object),
+ ('a', 1)),
+ ])
+ def test_parametrized_factorize_na_value(self, data, na_value):
+ l, u = algos._factorize_array(data, na_value=na_value)
+ expected_uniques = data[[1, 3]]
+ expected_labels = np.array([-1, 0, -1, 1], dtype=np.intp)
+ tm.assert_numpy_array_equal(l, expected_labels)
+ tm.assert_numpy_array_equal(u, expected_uniques)
+
+ @pytest.mark.parametrize('sort', [True, False])
+ @pytest.mark.parametrize('na_sentinel', [-1, -10, 100])
+ def test_factorize_na_sentinel(self, sort, na_sentinel):
+ data = np.array(['b', 'a', None, 'b'], dtype=object)
+ labels, uniques = algos.factorize(data, sort=sort,
+ na_sentinel=na_sentinel)
+ if sort:
+ expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
+ expected_uniques = np.array(['a', 'b'], dtype=object)
+ else:
+ expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
+ expected_uniques = np.array(['b', 'a'], dtype=object)
+ tm.assert_numpy_array_equal(labels, expected_labels)
+ tm.assert_numpy_array_equal(uniques, expected_uniques)
+
+
+class TestUnique(object):
+
+ def test_ints(self):
+ arr = np.random.randint(0, 100, size=50)
+
+ result = algos.unique(arr)
+ assert isinstance(result, np.ndarray)
+
+ def test_objects(self):
+ arr = np.random.randint(0, 100, size=50).astype('O')
+
+ result = algos.unique(arr)
+ assert isinstance(result, np.ndarray)
+
+ def test_object_refcount_bug(self):
+ lst = ['A', 'B', 'C', 'D', 'E']
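+ # hammering unique() on the same list is a smoke test for
+ # reference-count handling of the boxed values; a refcounting bug
+ # would crash or leak here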
+ for i in range(1000):
+ len(algos.unique(lst))
+
+ def test_on_index_object(self):
+
+ mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile(
+ np.arange(5), 5)])
+ expected = mindex.values
+ expected.sort()
+
+ mindex = mindex.repeat(2)
+
+ result = pd.unique(mindex)
+ result.sort()
+
+ tm.assert_almost_equal(result, expected)
+
+ def test_datetime64_dtype_array_returned(self):
+ # GH 9431
+ expected = np_array_datetime64_compat(
+ ['2015-01-03T00:00:00.000000000+0000',
+ '2015-01-01T00:00:00.000000000+0000'],
+ dtype='M8[ns]')
+
+ dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000',
+ '2015-01-01T00:00:00.000000000',
+ '2015-01-01T00:00:00.000000000'])
+ result = algos.unique(dt_index)
+ tm.assert_numpy_array_equal(result, expected)
+ assert result.dtype == expected.dtype
+
+ s = Series(dt_index)
+ result = algos.unique(s)
+ tm.assert_numpy_array_equal(result, expected)
+ assert result.dtype == expected.dtype
+
+ arr = s.values
+ result = algos.unique(arr)
+ tm.assert_numpy_array_equal(result, expected)
+ assert result.dtype == expected.dtype
+
+ def test_timedelta64_dtype_array_returned(self):
+ # GH 9431
+ expected = np.array([31200, 45678, 10000], dtype='m8[ns]')
+
+ td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
+ result = algos.unique(td_index)
+ tm.assert_numpy_array_equal(result, expected)
+ assert result.dtype == expected.dtype
+
+ s = Series(td_index)
+ result = algos.unique(s)
+ tm.assert_numpy_array_equal(result, expected)
+ assert result.dtype == expected.dtype
+
+ arr = s.values
+ result = algos.unique(arr)
+ tm.assert_numpy_array_equal(result, expected)
+ assert result.dtype == expected.dtype
+
+ def test_uint64_overflow(self):
+ s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
+ exp = np.array([1, 2, 2**63], dtype=np.uint64)
+ tm.assert_numpy_array_equal(algos.unique(s), exp)
+
+ def test_nan_in_object_array(self):
+ duplicated_items = ['a', np.nan, 'c', 'c']
+ result = pd.unique(duplicated_items)
+ expected = np.array(['a', np.nan, 'c'], dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_categorical(self):
+
+ # we are expecting to return in the order
+ # of appearance
+ expected = Categorical(list('bac'), categories=list('bac'))
+
+ # we are expecting to return in the order
+ # of the categories
+ expected_o = Categorical(
+ list('bac'), categories=list('abc'), ordered=True)
+
+ # GH 15939
+ c = Categorical(list('baabc'))
+ result = c.unique()
+ tm.assert_categorical_equal(result, expected)
+
+ result = algos.unique(c)
+ tm.assert_categorical_equal(result, expected)
+
+ c = Categorical(list('baabc'), ordered=True)
+ result = c.unique()
+ tm.assert_categorical_equal(result, expected_o)
+
+ result = algos.unique(c)
+ tm.assert_categorical_equal(result, expected_o)
+
+ # Series of categorical dtype
+ s = Series(Categorical(list('baabc')), name='foo')
+ result = s.unique()
+ tm.assert_categorical_equal(result, expected)
+
+ result = pd.unique(s)
+ tm.assert_categorical_equal(result, expected)
+
+ # CI -> return CI
+ ci = CategoricalIndex(Categorical(list('baabc'),
+ categories=list('bac')))
+ expected = CategoricalIndex(expected)
+ result = ci.unique()
+ tm.assert_index_equal(result, expected)
+
+ result = pd.unique(ci)
+ tm.assert_index_equal(result, expected)
+
+ def test_datetime64tz_aware(self):
+ # GH 15939
+
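+ # Series.unique on tz-aware data returns a DatetimeArray, while
+ # Index.unique returns a DatetimeIndex; both paths are exercised below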
+ result = Series(
+ Index([Timestamp('20160101', tz='US/Eastern'),
+ Timestamp('20160101', tz='US/Eastern')])).unique()
+ expected = DatetimeArray._from_sequence(np.array([
+ Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern")
+ ]))
+ tm.assert_extension_array_equal(result, expected)
+
+ result = Index([Timestamp('20160101', tz='US/Eastern'),
+ Timestamp('20160101', tz='US/Eastern')]).unique()
+ expected = DatetimeIndex(['2016-01-01 00:00:00'],
+ dtype='datetime64[ns, US/Eastern]', freq=None)
+ tm.assert_index_equal(result, expected)
+
+ result = pd.unique(
+ Series(Index([Timestamp('20160101', tz='US/Eastern'),
+ Timestamp('20160101', tz='US/Eastern')])))
+ expected = DatetimeArray._from_sequence(np.array([
+ Timestamp('2016-01-01', tz="US/Eastern"),
+ ]))
+ tm.assert_extension_array_equal(result, expected)
+
+ result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'),
+ Timestamp('20160101', tz='US/Eastern')]))
+ expected = DatetimeIndex(['2016-01-01 00:00:00'],
+ dtype='datetime64[ns, US/Eastern]', freq=None)
+ tm.assert_index_equal(result, expected)
+
+ def test_order_of_appearance(self):
+ # 9346
+ # light testing of guarantee of order of appearance
+ # these are also the doc examples
+ result = pd.unique(Series([2, 1, 3, 3]))
+ tm.assert_numpy_array_equal(result,
+ np.array([2, 1, 3], dtype='int64'))
+
+ result = pd.unique(Series([2] + [1] * 5))
+ tm.assert_numpy_array_equal(result,
+ np.array([2, 1], dtype='int64'))
+
+ result = pd.unique(Series([Timestamp('20160101'),
+ Timestamp('20160101')]))
+ expected = np.array(['2016-01-01T00:00:00.000000000'],
+ dtype='datetime64[ns]')
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = pd.unique(Index(
+ [Timestamp('20160101', tz='US/Eastern'),
+ Timestamp('20160101', tz='US/Eastern')]))
+ expected = DatetimeIndex(['2016-01-01 00:00:00'],
+ dtype='datetime64[ns, US/Eastern]',
+ freq=None)
+ tm.assert_index_equal(result, expected)
+
+ result = pd.unique(list('aabc'))
+ expected = np.array(['a', 'b', 'c'], dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = pd.unique(Series(Categorical(list('aabc'))))
+ expected = Categorical(list('abc'))
+ tm.assert_categorical_equal(result, expected)
+
+ @pytest.mark.parametrize("arg ,expected", [
+ (('1', '1', '2'), np.array(['1', '2'], dtype=object)),
+ (('foo',), np.array(['foo'], dtype=object))
+ ])
+ def test_tuple_with_strings(self, arg, expected):
+ # see GH 17108
+ result = pd.unique(arg)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_obj_none_preservation(self):
+ # GH 20866
+ arr = np.array(['foo', None], dtype=object)
+ result = pd.unique(arr)
+ expected = np.array(['foo', None], dtype=object)
+
+ tm.assert_numpy_array_equal(result, expected, strict_nan=True)
+
+ def test_signed_zero(self):
+ # GH 21866
+ a = np.array([-0.0, 0.0])
+ result = pd.unique(a)
+ expected = np.array([-0.0]) # 0.0 and -0.0 are equivalent
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_different_nans(self):
+ # GH 21866
+ # create different nans from bit-patterns:
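+ # (0x7ff8... is the quiet-NaN prefix; varying the low payload bits
+ # yields distinct bit patterns that are still NaN and still compare
+ # unequal to themselves)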
+ NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
+ NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
+ assert NAN1 != NAN1
+ assert NAN2 != NAN2
+ a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent
+ result = pd.unique(a)
+ expected = np.array([np.nan])
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_first_nan_kept(self):
+ # GH 22295
+ # create different nans from bit-patterns:
+ bits_for_nan1 = 0xfff8000000000001
+ bits_for_nan2 = 0x7ff8000000000001
+ NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0]
+ NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0]
+ assert NAN1 != NAN1
+ assert NAN2 != NAN2
+ for el_type in [np.float64, np.object]:
+ a = np.array([NAN1, NAN2], dtype=el_type)
+ result = pd.unique(a)
+ assert result.size == 1
+ # use bit patterns to identify which nan was kept:
+ result_nan_bits = struct.unpack("=Q",
+ struct.pack("d", result[0]))[0]
+ assert result_nan_bits == bits_for_nan1
+
+ def test_do_not_mangle_na_values(self, unique_nulls_fixture,
+ unique_nulls_fixture2):
+ # GH 22295
+ if unique_nulls_fixture is unique_nulls_fixture2:
+ return # skip it, values not unique
+ a = np.array([unique_nulls_fixture,
+ unique_nulls_fixture2], dtype=np.object)
+ result = pd.unique(a)
+ assert result.size == 2
+ assert a[0] is unique_nulls_fixture
+ assert a[1] is unique_nulls_fixture2
+
+
+class TestIsin(object):
+
+ def test_invalid(self):
+
+ pytest.raises(TypeError, lambda: algos.isin(1, 1))
+ pytest.raises(TypeError, lambda: algos.isin(1, [1]))
+ pytest.raises(TypeError, lambda: algos.isin([1], 1))
+
+ def test_basic(self):
+
+ result = algos.isin([1, 2], [1])
+ expected = np.array([True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(np.array([1, 2]), [1])
+ expected = np.array([True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(Series([1, 2]), [1])
+ expected = np.array([True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(Series([1, 2]), Series([1]))
+ expected = np.array([True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(Series([1, 2]), {1})
+ expected = np.array([True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(['a', 'b'], ['a'])
+ expected = np.array([True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(Series(['a', 'b']), Series(['a']))
+ expected = np.array([True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(Series(['a', 'b']), {'a'})
+ expected = np.array([True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(['a', 'b'], [1])
+ expected = np.array([False, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_i8(self):
+
+ arr = pd.date_range('20130101', periods=3).values
+ result = algos.isin(arr, [arr[0]])
+ expected = np.array([True, False, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(arr, arr[0:2])
+ expected = np.array([True, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(arr, set(arr[0:2]))
+ expected = np.array([True, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ arr = pd.timedelta_range('1 day', periods=3).values
+ result = algos.isin(arr, [arr[0]])
+ expected = np.array([True, False, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(arr, arr[0:2])
+ expected = np.array([True, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.isin(arr, set(arr[0:2]))
+ expected = np.array([True, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_large(self):
+
+ s = pd.date_range('20000101', periods=2000000, freq='s').values
+ result = algos.isin(s, s[0:2])
+ expected = np.zeros(len(s), dtype=bool)
+ expected[0] = True
+ expected[1] = True
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_categorical_from_codes(self):
+ # GH 16639
+ vals = np.array([0, 1, 2, 0])
+ cats = ['a', 'b', 'c']
+ Sd = Series(Categorical(1).from_codes(vals, cats))
+ St = Series(Categorical(1).from_codes(np.array([0, 1]), cats))
+ expected = np.array([True, True, False, True])
+ result = algos.isin(Sd, St)
+ tm.assert_numpy_array_equal(expected, result)
+
+ def test_same_nan_is_in(self):
+ # GH 22160
+ # nan is special: "a is b" does not imply "a == b", and vice versa.
+ # At a minimum, isin() should follow Python's "np.nan in [nan] == True"
+ # (the "in" operator checks identity before falling back to equality).
+ # Casting to np.float64 and back to a fresh float object somewhere
+ # along the way could jeopardize this behavior.
+ comps = [np.nan] # could be casted to float64
+ values = [np.nan]
+ expected = np.array([True])
+ result = algos.isin(comps, values)
+ tm.assert_numpy_array_equal(expected, result)
+
+ def test_same_object_is_in(self):
+ # GH 22160
+ # there could be special treatment for nans
+ # the user however could define a custom class
+ # with similar behavior, then we at least should
+ # fall back to usual python's behavior: "a in [a] == True"
+ class LikeNan(object):
+ def __eq__(self, other):
+ return False
+
+ def __hash__(self):
+ return 0
+
+ a, b = LikeNan(), LikeNan()
+ # same object -> True
+ tm.assert_numpy_array_equal(algos.isin([a], [a]), np.array([True]))
+ # different objects -> False
+ tm.assert_numpy_array_equal(algos.isin([a], [b]), np.array([False]))
+
+ def test_different_nans(self):
+ # GH 22160
+ # all nans are handled as equivalent
+
+ comps = [float('nan')]
+ values = [float('nan')]
+ assert comps[0] is not values[0] # different nan-objects
+
+ # as list of python-objects:
+ result = algos.isin(comps, values)
+ tm.assert_numpy_array_equal(np.array([True]), result)
+
+ # as object-array:
+ result = algos.isin(np.asarray(comps, dtype=np.object),
+ np.asarray(values, dtype=np.object))
+ tm.assert_numpy_array_equal(np.array([True]), result)
+
+ # as float64-array:
+ result = algos.isin(np.asarray(comps, dtype=np.float64),
+ np.asarray(values, dtype=np.float64))
+ tm.assert_numpy_array_equal(np.array([True]), result)
+
+ def test_no_cast(self):
+ # GH 22160
+ # ensure 42 is not casted to a string
+ comps = ['ss', 42]
+ values = ['42']
+ expected = np.array([False, False])
+ result = algos.isin(comps, values)
+ tm.assert_numpy_array_equal(expected, result)
+
+ @pytest.mark.parametrize("empty", [[], Series(), np.array([])])
+ def test_empty(self, empty):
+ # see gh-16991
+ vals = Index(["a", "b"])
+ expected = np.array([False, False])
+
+ result = algos.isin(vals, empty)
+ tm.assert_numpy_array_equal(expected, result)
+
+ def test_different_nan_objects(self):
+ # GH 22119
+ comps = np.array(['nan', np.nan * 1j, float('nan')], dtype=np.object)
+ vals = np.array([float('nan')], dtype=np.object)
+ expected = np.array([False, False, True])
+ result = algos.isin(comps, vals)
+ tm.assert_numpy_array_equal(expected, result)
+
+ def test_different_nans_as_float64(self):
+ # GH 21866
+ # create different nans from bit-patterns,
+ # these nans will land in different buckets in the hash-table
+ # if no special care is taken
+ NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
+ NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
+ assert NAN1 != NAN1
+ assert NAN2 != NAN2
+
+ # check that NAN1 and NAN2 are equivalent:
+ arr = np.array([NAN1, NAN2], dtype=np.float64)
+ lookup1 = np.array([NAN1], dtype=np.float64)
+ result = algos.isin(arr, lookup1)
+ expected = np.array([True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ lookup2 = np.array([NAN2], dtype=np.float64)
+ result = algos.isin(arr, lookup2)
+ expected = np.array([True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+
+class TestValueCounts(object):
+
+ def test_value_counts(self):
+ np.random.seed(1234)
+ from pandas.core.reshape.tile import cut
+
+ arr = np.random.randn(4)
+ factor = cut(arr, 4)
+
+ result = algos.value_counts(factor)
+ breaks = [-1.194, -0.535, 0.121, 0.777, 1.433]
+ index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True))
+ expected = Series([1, 1, 1, 1], index=index)
+ tm.assert_series_equal(result.sort_index(), expected.sort_index())
+
+ def test_value_counts_bins(self):
+ s = [1, 2, 3, 4]
+ result = algos.value_counts(s, bins=1)
+ expected = Series([4],
+ index=IntervalIndex.from_tuples([(0.996, 4.0)]))
+ tm.assert_series_equal(result, expected)
+
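+ # the lowest bin edge is nudged slightly below the data minimum so
+ # that the smallest value lands inside the first, right-closed
+ # interval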
+ result = algos.value_counts(s, bins=2, sort=False)
+ expected = Series([2, 2],
+ index=IntervalIndex.from_tuples([(0.996, 2.5),
+ (2.5, 4.0)]))
+ tm.assert_series_equal(result, expected)
+
+ def test_value_counts_dtypes(self):
+ result = algos.value_counts([1, 1.])
+ assert len(result) == 1
+
+ result = algos.value_counts([1, 1.], bins=1)
+ assert len(result) == 1
+
+ result = algos.value_counts(Series([1, 1., '1'])) # object
+ assert len(result) == 2
+
+ pytest.raises(TypeError, lambda s: algos.value_counts(s, bins=1),
+ ['1', 1])
+
+ def test_value_counts_nat(self):
+ td = Series([np.timedelta64(10000), pd.NaT], dtype='timedelta64[ns]')
+ dt = pd.to_datetime(['NaT', '2014-01-01'])
+
+ for s in [td, dt]:
+ vc = algos.value_counts(s)
+ vc_with_na = algos.value_counts(s, dropna=False)
+ assert len(vc) == 1
+ assert len(vc_with_na) == 2
+
+ exp_dt = Series({Timestamp('2014-01-01 00:00:00'): 1})
+ tm.assert_series_equal(algos.value_counts(dt), exp_dt)
+ # TODO: do the same check for timedelta
+
+ def test_value_counts_datetime_outofbounds(self):
+ # GH 13663
+ s = Series([datetime(3000, 1, 1), datetime(5000, 1, 1),
+ datetime(5000, 1, 1), datetime(6000, 1, 1),
+ datetime(3000, 1, 1), datetime(3000, 1, 1)])
+ res = s.value_counts()
+
+ exp_index = Index([datetime(3000, 1, 1), datetime(5000, 1, 1),
+ datetime(6000, 1, 1)], dtype=object)
+ exp = Series([3, 2, 1], index=exp_index)
+ tm.assert_series_equal(res, exp)
+
+ # GH 12424
+ res = pd.to_datetime(Series(['2362-01-01', np.nan]),
+ errors='ignore')
+ exp = Series(['2362-01-01', np.nan], dtype=object)
+ tm.assert_series_equal(res, exp)
+
+ def test_categorical(self):
+ s = Series(Categorical(list('aaabbc')))
+ result = s.value_counts()
+ expected = Series([3, 2, 1], index=CategoricalIndex(['a', 'b', 'c']))
+
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ # preserve order?
+ s = s.cat.as_ordered()
+ result = s.value_counts()
+ expected.index = expected.index.as_ordered()
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ def test_categorical_nans(self):
+ s = Series(Categorical(list('aaaaabbbcc'))) # a=4, b=3, c=2, nan=1 once iloc[1] is set to nan
+ s.iloc[1] = np.nan
+ result = s.value_counts()
+ expected = Series([4, 3, 2], index=CategoricalIndex(
+ ['a', 'b', 'c'], categories=['a', 'b', 'c']))
+ tm.assert_series_equal(result, expected, check_index_type=True)
+ result = s.value_counts(dropna=False)
+ expected = Series([
+ 4, 3, 2, 1
+ ], index=CategoricalIndex(['a', 'b', 'c', np.nan]))
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ # out of order
+ s = Series(Categorical(
+ list('aaaaabbbcc'), ordered=True, categories=['b', 'a', 'c']))
+ s.iloc[1] = np.nan
+ result = s.value_counts()
+ expected = Series([4, 3, 2], index=CategoricalIndex(
+ ['a', 'b', 'c'], categories=['b', 'a', 'c'], ordered=True))
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ result = s.value_counts(dropna=False)
+ expected = Series([4, 3, 2, 1], index=CategoricalIndex(
+ ['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True))
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ def test_categorical_zeroes(self):
+ # keep the `d` category with 0
+ s = Series(Categorical(
+ list('bbbaac'), categories=list('abcd'), ordered=True))
+ result = s.value_counts()
+ expected = Series([3, 2, 1, 0], index=Categorical(
+ ['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True))
+ tm.assert_series_equal(result, expected, check_index_type=True)
+
+ def test_dropna(self):
+ # https://github.com/pandas-dev/pandas/issues/9443#issuecomment-73719328
+
+ tm.assert_series_equal(
+ Series([True, True, False]).value_counts(dropna=True),
+ Series([2, 1], index=[True, False]))
+ tm.assert_series_equal(
+ Series([True, True, False]).value_counts(dropna=False),
+ Series([2, 1], index=[True, False]))
+
+ tm.assert_series_equal(
+ Series([True, True, False, None]).value_counts(dropna=True),
+ Series([2, 1], index=[True, False]))
+ tm.assert_series_equal(
+ Series([True, True, False, None]).value_counts(dropna=False),
+ Series([2, 1, 1], index=[True, False, np.nan]))
+ tm.assert_series_equal(
+ Series([10.3, 5., 5.]).value_counts(dropna=True),
+ Series([2, 1], index=[5., 10.3]))
+ tm.assert_series_equal(
+ Series([10.3, 5., 5.]).value_counts(dropna=False),
+ Series([2, 1], index=[5., 10.3]))
+
+ tm.assert_series_equal(
+ Series([10.3, 5., 5., None]).value_counts(dropna=True),
+ Series([2, 1], index=[5., 10.3]))
+
+ # 32-bit linux has a different ordering
+ if not compat.is_platform_32bit():
+ result = Series([10.3, 5., 5., None]).value_counts(dropna=False)
+ expected = Series([2, 1, 1], index=[5., 10.3, np.nan])
+ tm.assert_series_equal(result, expected)
+
+ def test_value_counts_normalized(self):
+ # GH12558
+ s = Series([1, 2, np.nan, np.nan, np.nan])
+ dtypes = (np.float64, np.object, 'M8[ns]')
+ for t in dtypes:
+ s_typed = s.astype(t)
+ result = s_typed.value_counts(normalize=True, dropna=False)
+ expected = Series([0.6, 0.2, 0.2],
+ index=Series([np.nan, 2.0, 1.0], dtype=t))
+ tm.assert_series_equal(result, expected)
+
+ result = s_typed.value_counts(normalize=True, dropna=True)
+ expected = Series([0.5, 0.5],
+ index=Series([2.0, 1.0], dtype=t))
+ tm.assert_series_equal(result, expected)
+
+ def test_value_counts_uint64(self):
+ arr = np.array([2**63], dtype=np.uint64)
+ expected = Series([1], index=[2**63])
+ result = algos.value_counts(arr)
+
+ tm.assert_series_equal(result, expected)
+
+ arr = np.array([-1, 2**63], dtype=object)
+ expected = Series([1, 1], index=[-1, 2**63])
+ result = algos.value_counts(arr)
+
+ # 32-bit linux has a different ordering
+ if not compat.is_platform_32bit():
+ tm.assert_series_equal(result, expected)
+
+
+class TestDuplicated(object):
+
+ def test_duplicated_with_nas(self):
+ keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)
+
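+ # the nan keys are treated as equal to each other, so the second
+ # occurrence of nan is flagged as a duplicate just like the second 0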
+ result = algos.duplicated(keys)
+ expected = np.array([False, False, False, True, False, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.duplicated(keys, keep='first')
+ expected = np.array([False, False, False, True, False, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.duplicated(keys, keep='last')
+ expected = np.array([True, False, True, False, False, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.duplicated(keys, keep=False)
+ expected = np.array([True, False, True, True, False, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ keys = np.empty(8, dtype=object)
+ for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2,
+ [0, np.nan, 0, np.nan] * 2)):
+ keys[i] = t
+
+ result = algos.duplicated(keys)
+ falses = [False] * 4
+ trues = [True] * 4
+ expected = np.array(falses + trues)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.duplicated(keys, keep='last')
+ expected = np.array(trues + falses)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.duplicated(keys, keep=False)
+ expected = np.array(trues + trues)
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('case', [
+ np.array([1, 2, 1, 5, 3,
+ 2, 4, 1, 5, 6]),
+ np.array([1.1, 2.2, 1.1, np.nan, 3.3,
+ 2.2, 4.4, 1.1, np.nan, 6.6]),
+ np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j,
+ 2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]),
+ np.array(['a', 'b', 'a', 'e', 'c',
+ 'b', 'd', 'a', 'e', 'f'], dtype=object),
+ np.array([1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7],
+ dtype=np.uint64),
+ ])
+ def test_numeric_object_likes(self, case):
+ exp_first = np.array([False, False, True, False, False,
+ True, False, True, True, False])
+ exp_last = np.array([True, True, True, True, False,
+ False, False, False, False, False])
+ exp_false = exp_first | exp_last
+
+ res_first = algos.duplicated(case, keep='first')
+ tm.assert_numpy_array_equal(res_first, exp_first)
+
+ res_last = algos.duplicated(case, keep='last')
+ tm.assert_numpy_array_equal(res_last, exp_last)
+
+ res_false = algos.duplicated(case, keep=False)
+ tm.assert_numpy_array_equal(res_false, exp_false)
+
+ # index
+ for idx in [Index(case), Index(case, dtype='category')]:
+ res_first = idx.duplicated(keep='first')
+ tm.assert_numpy_array_equal(res_first, exp_first)
+
+ res_last = idx.duplicated(keep='last')
+ tm.assert_numpy_array_equal(res_last, exp_last)
+
+ res_false = idx.duplicated(keep=False)
+ tm.assert_numpy_array_equal(res_false, exp_false)
+
+ # series
+ for s in [Series(case), Series(case, dtype='category')]:
+ res_first = s.duplicated(keep='first')
+ tm.assert_series_equal(res_first, Series(exp_first))
+
+ res_last = s.duplicated(keep='last')
+ tm.assert_series_equal(res_last, Series(exp_last))
+
+ res_false = s.duplicated(keep=False)
+ tm.assert_series_equal(res_false, Series(exp_false))
+
+ def test_datetime_likes(self):
+
+ dt = ['2011-01-01', '2011-01-02', '2011-01-01', 'NaT', '2011-01-03',
+ '2011-01-02', '2011-01-04', '2011-01-01', 'NaT', '2011-01-06']
+ td = ['1 days', '2 days', '1 days', 'NaT', '3 days',
+ '2 days', '4 days', '1 days', 'NaT', '6 days']
+
+ cases = [np.array([Timestamp(d) for d in dt]),
+ np.array([Timestamp(d, tz='US/Eastern') for d in dt]),
+ np.array([pd.Period(d, freq='D') for d in dt]),
+ np.array([np.datetime64(d) for d in dt]),
+ np.array([pd.Timedelta(d) for d in td])]
+
+ exp_first = np.array([False, False, True, False, False,
+ True, False, True, True, False])
+ exp_last = np.array([True, True, True, True, False,
+ False, False, False, False, False])
+ exp_false = exp_first | exp_last
+
+ for case in cases:
+ res_first = algos.duplicated(case, keep='first')
+ tm.assert_numpy_array_equal(res_first, exp_first)
+
+ res_last = algos.duplicated(case, keep='last')
+ tm.assert_numpy_array_equal(res_last, exp_last)
+
+ res_false = algos.duplicated(case, keep=False)
+ tm.assert_numpy_array_equal(res_false, exp_false)
+
+ # index
+ for idx in [Index(case), Index(case, dtype='category'),
+ Index(case, dtype=object)]:
+ res_first = idx.duplicated(keep='first')
+ tm.assert_numpy_array_equal(res_first, exp_first)
+
+ res_last = idx.duplicated(keep='last')
+ tm.assert_numpy_array_equal(res_last, exp_last)
+
+ res_false = idx.duplicated(keep=False)
+ tm.assert_numpy_array_equal(res_false, exp_false)
+
+ # series
+ for s in [Series(case), Series(case, dtype='category'),
+ Series(case, dtype=object)]:
+ res_first = s.duplicated(keep='first')
+ tm.assert_series_equal(res_first, Series(exp_first))
+
+ res_last = s.duplicated(keep='last')
+ tm.assert_series_equal(res_last, Series(exp_last))
+
+ res_false = s.duplicated(keep=False)
+ tm.assert_series_equal(res_false, Series(exp_false))
+
+ def test_unique_index(self):
+ cases = [Index([1, 2, 3]), pd.RangeIndex(0, 3)]
+ for case in cases:
+ assert case.is_unique is True
+ tm.assert_numpy_array_equal(case.duplicated(),
+ np.array([False, False, False]))
+
+ @pytest.mark.parametrize('arr, unique', [
+ ([(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)],
+ [(0, 0), (0, 1), (1, 0), (1, 1)]),
+ ([('b', 'c'), ('a', 'b'), ('a', 'b'), ('b', 'c')],
+ [('b', 'c'), ('a', 'b')]),
+ ([('a', 1), ('b', 2), ('a', 3), ('a', 1)],
+ [('a', 1), ('b', 2), ('a', 3)]),
+ ])
+ def test_unique_tuples(self, arr, unique):
+ # https://github.com/pandas-dev/pandas/issues/16519
+ expected = np.empty(len(unique), dtype=object)
+ expected[:] = unique
+
+ result = pd.unique(arr)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+class GroupVarTestMixin(object):
+
+ def test_group_var_generic_1d(self):
+ prng = RandomState(1234)
+
+ out = (np.nan * np.ones((5, 1))).astype(self.dtype)
+ counts = np.zeros(5, dtype='int64')
+ values = 10 * prng.rand(15, 1).astype(self.dtype)
+ labels = np.tile(np.arange(5), (3, )).astype('int64')
+
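+ # group i collects the values at positions i, i+5 and i+10, which is
+ # exactly what reshape((5, 3), order='F') gathers into row i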
+ expected_out = (np.squeeze(values)
+ .reshape((5, 3), order='F')
+ .std(axis=1, ddof=1) ** 2)[:, np.newaxis]
+ expected_counts = counts + 3
+
+ self.algo(out, counts, values, labels)
+ assert np.allclose(out, expected_out, self.rtol)
+ tm.assert_numpy_array_equal(counts, expected_counts)
+
+ def test_group_var_generic_1d_flat_labels(self):
+ prng = RandomState(1234)
+
+ out = (np.nan * np.ones((1, 1))).astype(self.dtype)
+ counts = np.zeros(1, dtype='int64')
+ values = 10 * prng.rand(5, 1).astype(self.dtype)
+ labels = np.zeros(5, dtype='int64')
+
+ expected_out = np.array([[values.std(ddof=1) ** 2]])
+ expected_counts = counts + 5
+
+ self.algo(out, counts, values, labels)
+
+ assert np.allclose(out, expected_out, self.rtol)
+ tm.assert_numpy_array_equal(counts, expected_counts)
+
+ def test_group_var_generic_2d_all_finite(self):
+ prng = RandomState(1234)
+
+ out = (np.nan * np.ones((5, 2))).astype(self.dtype)
+ counts = np.zeros(5, dtype='int64')
+ values = 10 * prng.rand(10, 2).astype(self.dtype)
+ labels = np.tile(np.arange(5), (2, )).astype('int64')
+
+ expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
+ expected_counts = counts + 2
+
+ self.algo(out, counts, values, labels)
+ assert np.allclose(out, expected_out, self.rtol)
+ tm.assert_numpy_array_equal(counts, expected_counts)
+
+ def test_group_var_generic_2d_some_nan(self):
+ prng = RandomState(1234)
+
+ out = (np.nan * np.ones((5, 2))).astype(self.dtype)
+ counts = np.zeros(5, dtype='int64')
+ values = 10 * prng.rand(10, 2).astype(self.dtype)
+ values[:, 1] = np.nan
+ labels = np.tile(np.arange(5), (2, )).astype('int64')
+
+ expected_out = np.vstack([values[:, 0]
+ .reshape(5, 2, order='F')
+ .std(ddof=1, axis=1) ** 2,
+ np.nan * np.ones(5)]).T.astype(self.dtype)
+ expected_counts = counts + 2
+
+ self.algo(out, counts, values, labels)
+ tm.assert_almost_equal(out, expected_out, check_less_precise=6)
+ tm.assert_numpy_array_equal(counts, expected_counts)
+
+ def test_group_var_constant(self):
+ # Regression test from GH 10448.
+
+ out = np.array([[np.nan]], dtype=self.dtype)
+ counts = np.array([0], dtype='int64')
+ values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
+ labels = np.zeros(3, dtype='int64')
+
+ self.algo(out, counts, values, labels)
+
+ assert counts[0] == 3
+ assert out[0, 0] >= 0
+ tm.assert_almost_equal(out[0, 0], 0.0)
+
+
+class TestGroupVarFloat64(GroupVarTestMixin):
+ __test__ = True
+
+ algo = libgroupby.group_var_float64
+ dtype = np.float64
+ rtol = 1e-5
+
+ def test_group_var_large_inputs(self):
+
+ prng = RandomState(1234)
+
+ out = np.array([[np.nan]], dtype=self.dtype)
+ counts = np.array([0], dtype='int64')
+ values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype)
+ values.shape = (10 ** 6, 1)
+ labels = np.zeros(10 ** 6, dtype='int64')
+
+ self.algo(out, counts, values, labels)
+
+ assert counts[0] == 10 ** 6
+ tm.assert_almost_equal(out[0, 0], 1.0 / 12, check_less_precise=True)
+
+
+class TestGroupVarFloat32(GroupVarTestMixin):
+ __test__ = True
+
+ algo = libgroupby.group_var_float32
+ dtype = np.float32
+ rtol = 1e-2
+
+
+class TestHashTable(object):
+
+ def test_lookup_nan(self, writable):
+ xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
+ # GH 21688 ensure we can deal with readonly memory views
+ xs.setflags(write=writable)
+ m = ht.Float64HashTable()
+ m.map_locations(xs)
+ tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
+ dtype=np.int64))
+
+ def test_add_signed_zeros(self):
+ # GH 21866 inconsistent hash-function for float64
+ # default hash-function would lead to different hash-buckets
+ # for 0.0 and -0.0 if there are more than 2^30 hash-buckets
+ # but this would mean 16GB
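+ # (0.0 and -0.0 differ only in the IEEE-754 sign bit -- raw bit
+ # patterns 0x0000000000000000 vs 0x8000000000000000 -- so a hash
+ # built on the raw bits must canonicalize the sign of zero)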
+ N = 4 # 12 * 10**8 would trigger the error if you had enough memory
+ m = ht.Float64HashTable(N)
+ m.set_item(0.0, 0)
+ m.set_item(-0.0, 0)
+ assert len(m) == 1 # 0.0 and -0.0 are equivalent
+
+ def test_add_different_nans(self):
+ # GH 21866 inconsistent hash-function for float64
+ # create different nans from bit-patterns:
+ NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
+ NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
+ assert NAN1 != NAN1
+ assert NAN2 != NAN2
+ # default hash function would lead to different hash-buckets
+ # for NAN1 and NAN2 even if there are only 4 buckets:
+ m = ht.Float64HashTable()
+ m.set_item(NAN1, 0)
+ m.set_item(NAN2, 0)
+ assert len(m) == 1 # NAN1 and NAN2 are equivalent
+
+ def test_lookup_overflow(self, writable):
+ xs = np.array([1, 2, 2**63], dtype=np.uint64)
+ # GH 21688 ensure we can deal with readonly memory views
+ xs.setflags(write=writable)
+ m = ht.UInt64HashTable()
+ m.map_locations(xs)
+ tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
+ dtype=np.int64))
+
+ def test_get_unique(self):
+ s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
+ exp = np.array([1, 2, 2**63], dtype=np.uint64)
+ tm.assert_numpy_array_equal(s.unique(), exp)
+
+ @pytest.mark.parametrize('nvals', [0, 10]) # resizing to 0 is special case
+ @pytest.mark.parametrize('htable, uniques, dtype, safely_resizes', [
+ (ht.PyObjectHashTable, ht.ObjectVector, 'object', False),
+ (ht.StringHashTable, ht.ObjectVector, 'object', True),
+ (ht.Float64HashTable, ht.Float64Vector, 'float64', False),
+ (ht.Int64HashTable, ht.Int64Vector, 'int64', False),
+ (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)])
+ def test_vector_resize(self, writable, htable, uniques, dtype,
+ safely_resizes, nvals):
+ # Test for memory errors after internal vector
+ # reallocations (GH 7157)
+ vals = np.array(np.random.randn(1000), dtype=dtype)
+
+ # GH 21688 ensures we can deal with read-only memory views
+ vals.setflags(write=writable)
+
+ # initialise instances; cannot initialise in parametrization,
+ # as otherwise external views would be held on the array (which is
+ # one of the things this test is checking)
+ htable = htable()
+ uniques = uniques()
+
+ # get_labels may append to uniques
+ htable.get_labels(vals[:nvals], uniques, 0, -1)
+ # to_array() sets an external_view_exists flag on uniques.
+ tmp = uniques.to_array()
+ oldshape = tmp.shape
+
+ # subsequent get_labels() calls can no longer append to it
+ # (except for StringHashTables + ObjectVector)
+ if safely_resizes:
+ htable.get_labels(vals, uniques, 0, -1)
+ else:
+ with pytest.raises(ValueError, match='external reference.*'):
+ htable.get_labels(vals, uniques, 0, -1)
+
+ uniques.to_array() # should not raise here
+ assert tmp.shape == oldshape
+
+ @pytest.mark.parametrize('htable, tm_dtype', [
+ (ht.PyObjectHashTable, 'String'),
+ (ht.StringHashTable, 'String'),
+ (ht.Float64HashTable, 'Float'),
+ (ht.Int64HashTable, 'Int'),
+ (ht.UInt64HashTable, 'UInt')])
+ def test_hashtable_unique(self, htable, tm_dtype, writable):
+ # output of maker has guaranteed unique elements
+ maker = getattr(tm, 'make' + tm_dtype + 'Index')
+ s = Series(maker(1000))
+ if htable == ht.Float64HashTable:
+ # add NaN for float column
+ s.loc[500] = np.nan
+ elif htable == ht.PyObjectHashTable:
+ # use different NaN types for object column
+ s.loc[500:502] = [np.nan, None, pd.NaT]
+
+ # create duplicated selection
+ s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
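+ # frac=3 with replacement draws 3000 rows from the 1000 originals,
+ # so duplicates are guaranteed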
+ s_duplicated.values.setflags(write=writable)
+
+ # drop_duplicates has its own cython code (hashtable_func_helper.pxi)
+ # and is tested separately; it keeps the first occurrence, like
+ # ht.unique()
+ expected_unique = s_duplicated.drop_duplicates(keep='first').values
+ result_unique = htable().unique(s_duplicated.values)
+ tm.assert_numpy_array_equal(result_unique, expected_unique)
+
+ # test return_inverse=True
+ # reconstruction can only succeed if the inverse is correct
+ result_unique, result_inverse = htable().unique(s_duplicated.values,
+ return_inverse=True)
+ tm.assert_numpy_array_equal(result_unique, expected_unique)
+ reconstr = result_unique[result_inverse]
+ tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
+
+ @pytest.mark.parametrize('htable, tm_dtype', [
+ (ht.PyObjectHashTable, 'String'),
+ (ht.StringHashTable, 'String'),
+ (ht.Float64HashTable, 'Float'),
+ (ht.Int64HashTable, 'Int'),
+ (ht.UInt64HashTable, 'UInt')])
+ def test_hashtable_factorize(self, htable, tm_dtype, writable):
+ # output of maker has guaranteed unique elements
+ maker = getattr(tm, 'make' + tm_dtype + 'Index')
+ s = Series(maker(1000))
+ if htable == ht.Float64HashTable:
+ # add NaN for float column
+ s.loc[500] = np.nan
+ elif htable == ht.PyObjectHashTable:
+ # use different NaN types for object column
+ s.loc[500:502] = [np.nan, None, pd.NaT]
+
+ # create duplicated selection
+ s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
+ s_duplicated.values.setflags(write=writable)
+ na_mask = s_duplicated.isna().values
+
+ result_unique, result_inverse = htable().factorize(s_duplicated.values)
+
+ # drop_duplicates has its own cython code (hashtable_func_helper.pxi)
+ # and is tested separately; it keeps the first occurrence, like
+ # ht.factorize(). Since factorize removes all NaNs, we do the same here
+ expected_unique = s_duplicated.dropna().drop_duplicates().values
+ tm.assert_numpy_array_equal(result_unique, expected_unique)
+
+ # reconstruction can only succeed if the inverse is correct. Since
+ # factorize removes the NaNs, those have to be excluded here as well
+ result_reconstruct = result_unique[result_inverse[~na_mask]]
+ expected_reconstruct = s_duplicated.dropna().values
+ tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct)
+
+ @pytest.mark.parametrize('hashtable', [
+ ht.PyObjectHashTable, ht.StringHashTable,
+ ht.Float64HashTable, ht.Int64HashTable, ht.UInt64HashTable])
+ def test_hashtable_large_sizehint(self, hashtable):
+ # GH 22729
+ size_hint = np.iinfo(np.uint32).max + 1
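+ # a hint just past the uint32 maximum; presumably GH 22729 was an
+ # overflow when the hint was narrowed to 32 bits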
+ tbl = hashtable(size_hint=size_hint) # noqa
+
+
+def test_quantile():
+ s = Series(np.random.randn(100))
+
+ result = algos.quantile(s, [0, .25, .5, .75, 1.])
+ expected = algos.quantile(s.values, [0, .25, .5, .75, 1.])
+ tm.assert_almost_equal(result, expected)
+
+
+def test_unique_label_indices():
+
+ a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')
+
+ left = ht.unique_label_indices(a)
+ right = np.unique(a, return_index=True)[1]
+
+ tm.assert_numpy_array_equal(left, right,
+ check_dtype=False)
+
+ a[np.random.choice(len(a), 10)] = -1
+ left = ht.unique_label_indices(a)
+ right = np.unique(a, return_index=True)[1][1:]
+ tm.assert_numpy_array_equal(left, right,
+ check_dtype=False)
+
+
+class TestRank(object):
+
+ @td.skip_if_no_scipy
+ def test_scipy_compat(self):
+ from scipy.stats import rankdata
+
+ def _check(arr):
+ mask = ~np.isfinite(arr)
+ arr = arr.copy()
+ result = libalgos.rank_1d_float64(arr)
+ arr[mask] = np.inf
+ exp = rankdata(arr)
+ exp[mask] = nan
+ assert_almost_equal(result, exp)
+
+ _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan]))
+ _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan]))
+
+ def test_basic(self):
+ exp = np.array([1, 2], dtype=np.float64)
+
+ for dtype in np.typecodes['AllInteger']:
+ s = Series([1, 100], dtype=dtype)
+ tm.assert_numpy_array_equal(algos.rank(s), exp)
+
+ def test_uint64_overflow(self):
+ exp = np.array([1, 2], dtype=np.float64)
+
+ for dtype in [np.float64, np.uint64]:
+ s = Series([1, 2**63], dtype=dtype)
+ tm.assert_numpy_array_equal(algos.rank(s), exp)
+
+ def test_too_many_ndims(self):
+ arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
+ msg = "Array with ndim > 2 are not supported"
+
+ with pytest.raises(TypeError, match=msg):
+ algos.rank(arr)
+
+ @pytest.mark.single
+ @pytest.mark.parametrize('values', [
+ np.arange(2**24 + 1),
+ np.arange(2**25 + 2).reshape(2**24 + 1, 2)],
+ ids=['1d', '2d'])
+ def test_pct_max_many_rows(self, values):
+ # GH 18271
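+ # (2**24 is where float32 stops representing every integer exactly,
+ # presumably the root cause of the wrong pct ranks)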
+ result = algos.rank(values, pct=True).max()
+ assert result == 1
+
+
+def test_pad_backfill_object_segfault():
+
+ old = np.array([], dtype='O')
+ new = np.array([datetime(2010, 12, 31)], dtype='O')
+
+ result = libalgos.pad["object"](old, new)
+ expected = np.array([-1], dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = libalgos.pad["object"](new, old)
+ expected = np.array([], dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = libalgos.backfill["object"](old, new)
+ expected = np.array([-1], dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = libalgos.backfill["object"](new, old)
+ expected = np.array([], dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_arrmap():
+ values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O')
+ result = libalgos.arrmap_object(values, lambda x: x in ['foo', 'bar'])
+ assert (result.dtype == np.bool_)
+
+
+class TestTseriesUtil(object):
+
+ def test_combineFunc(self):
+ pass
+
+ def test_reindex(self):
+ pass
+
+ def test_isna(self):
+ pass
+
+ def test_groupby(self):
+ pass
+
+ def test_groupby_withnull(self):
+ pass
+
+ def test_backfill(self):
+ old = Index([1, 5, 10])
+ new = Index(lrange(12))
+
+ filler = libalgos.backfill["int64_t"](old.values, new.values)
+
+ expect_filler = np.array([0, 0, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, -1], dtype=np.int64)
+ tm.assert_numpy_array_equal(filler, expect_filler)
+
+ # corner case
+ old = Index([1, 4])
+ new = Index(lrange(5, 10))
+ filler = libalgos.backfill["int64_t"](old.values, new.values)
+
+ expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64)
+ tm.assert_numpy_array_equal(filler, expect_filler)
+
+ def test_pad(self):
+ old = Index([1, 5, 10])
+ new = Index(lrange(12))
+
+ filler = libalgos.pad["int64_t"](old.values, new.values)
+
+ expect_filler = np.array([-1, 0, 0, 0, 0, 1,
+ 1, 1, 1, 1, 2, 2], dtype=np.int64)
+ tm.assert_numpy_array_equal(filler, expect_filler)
+
+ # corner case
+ old = Index([5, 10])
+ new = Index(lrange(5))
+ filler = libalgos.pad["int64_t"](old.values, new.values)
+ expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64)
+ tm.assert_numpy_array_equal(filler, expect_filler)
+
+
+def test_is_lexsorted():
+ failure = [
+ np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3,
+ 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1,
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int64'),
+ np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14,
+ 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28,
+ 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13,
+ 12, 11,
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25,
+ 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,
+ 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 24, 23, 22,
+ 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
+ 6, 5,
+ 4, 3, 2, 1, 0], dtype='int64')]
+
+ assert (not libalgos.is_lexsorted(failure))
+
+
+def test_groupsort_indexer():
+ a = np.random.randint(0, 1000, 100).astype(np.int64)
+ b = np.random.randint(0, 1000, 100).astype(np.int64)
+
+ result = libalgos.groupsort_indexer(a, 1000)[0]
+
+ # need to use a stable sort
+ # np.argsort returns int, groupsort_indexer
+ # always returns int64
+ expected = np.argsort(a, kind='mergesort')
+ expected = expected.astype(np.int64)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ # compare with lexsort
+ # np.lexsort returns int, groupsort_indexer
+ # always returns int64
+ key = a * 1000 + b
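+ # collision-free encoding: 0 <= b < 1000, so a * 1000 + b sorts pairs
+ # first by a, then by b, exactly like np.lexsort((b, a))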
+ result = libalgos.groupsort_indexer(key, 1000000)[0]
+ expected = np.lexsort((b, a))
+ expected = expected.astype(np.int64)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_infinity_sort():
+ # GH 13445
+ # numpy's argsort can be unhappy if something is less than
+ # itself. Instead, let's give our infinities a self-consistent
+ # ordering, but outside the float extended real line.
+
+ Inf = libalgos.Infinity()
+ NegInf = libalgos.NegInfinity()
+
+ ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf]
+
+ assert all(Inf >= x for x in ref_nums)
+ assert all(Inf > x or x is Inf for x in ref_nums)
+ assert Inf >= Inf and Inf == Inf
+ assert not Inf < Inf and not Inf > Inf
+ assert libalgos.Infinity() == libalgos.Infinity()
+ assert not libalgos.Infinity() != libalgos.Infinity()
+
+ assert all(NegInf <= x for x in ref_nums)
+ assert all(NegInf < x or x is NegInf for x in ref_nums)
+ assert NegInf <= NegInf and NegInf == NegInf
+ assert not NegInf < NegInf and not NegInf > NegInf
+ assert libalgos.NegInfinity() == libalgos.NegInfinity()
+ assert not libalgos.NegInfinity() != libalgos.NegInfinity()
+
+ for perm in permutations(ref_nums):
+ assert sorted(perm) == ref_nums
+
+ # smoke tests
+ np.array([libalgos.Infinity()] * 32).argsort()
+ np.array([libalgos.NegInfinity()] * 32).argsort()
+
+
+def test_infinity_against_nan():
+ Inf = libalgos.Infinity()
+ NegInf = libalgos.NegInfinity()
+
+ assert not Inf > np.nan
+ assert not Inf >= np.nan
+ assert not Inf < np.nan
+ assert not Inf <= np.nan
+ assert not Inf == np.nan
+ assert Inf != np.nan
+
+ assert not NegInf > np.nan
+ assert not NegInf >= np.nan
+ assert not NegInf < np.nan
+ assert not NegInf <= np.nan
+ assert not NegInf == np.nan
+ assert NegInf != np.nan
+
+
+def test_ensure_platform_int():
+ arr = np.arange(100, dtype=np.intp)
+
+ result = libalgos.ensure_platform_int(arr)
+ assert (result is arr)
+
+
+def test_int64_add_overflow():
+ # see gh-14068
+ msg = "Overflow in int64 addition"
+ m = np.iinfo(np.int64).max
+ n = np.iinfo(np.int64).min
+
+ with pytest.raises(OverflowError, match=msg):
+ algos.checked_add_with_arr(np.array([m, m]), m)
+ with pytest.raises(OverflowError, match=msg):
+ algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]))
+ with pytest.raises(OverflowError, match=msg):
+ algos.checked_add_with_arr(np.array([n, n]), n)
+ with pytest.raises(OverflowError, match=msg):
+ algos.checked_add_with_arr(np.array([n, n]), np.array([n, n]))
+ with pytest.raises(OverflowError, match=msg):
+ algos.checked_add_with_arr(np.array([m, n]), np.array([n, n]))
+ with pytest.raises(OverflowError, match=msg):
+ algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+ arr_mask=np.array([False, True]))
+ with pytest.raises(OverflowError, match=msg):
+ algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+ b_mask=np.array([False, True]))
+ with pytest.raises(OverflowError, match=msg):
+ algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+ arr_mask=np.array([False, True]),
+ b_mask=np.array([False, True]))
+ with pytest.raises(OverflowError, match=msg):
+ with tm.assert_produces_warning(RuntimeWarning):
+ algos.checked_add_with_arr(np.array([m, m]),
+ np.array([np.nan, m]))
+
+ # Check that the NaN-mask boolean arrays override whether or not
+ # the addition overflows. We don't check the result, just the
+ # fact that an OverflowError is not raised.
+ algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+ arr_mask=np.array([True, True]))
+ algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+ b_mask=np.array([True, True]))
+ algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+ arr_mask=np.array([True, False]),
+ b_mask=np.array([False, True]))
+
+
+class TestMode(object):
+
+ def test_no_mode(self):
+ exp = Series([], dtype=np.float64)
+ tm.assert_series_equal(algos.mode([]), exp)
+
+ def test_mode_single(self):
+ # GH 15714
+ exp_single = [1]
+ data_single = [1]
+
+ exp_multi = [1]
+ data_multi = [1, 1]
+
+ for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
+ s = Series(data_single, dtype=dt)
+ exp = Series(exp_single, dtype=dt)
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ s = Series(data_multi, dtype=dt)
+ exp = Series(exp_multi, dtype=dt)
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ exp = Series([1], dtype=np.int)
+ tm.assert_series_equal(algos.mode([1]), exp)
+
+ exp = Series(['a', 'b', 'c'], dtype=np.object)
+ tm.assert_series_equal(algos.mode(['a', 'b', 'c']), exp)
+
+ def test_number_mode(self):
+ exp_single = [1]
+ data_single = [1] * 5 + [2] * 3
+
+ exp_multi = [1, 3]
+ data_multi = [1] * 5 + [2] * 3 + [3] * 5
+
+ for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
+ s = Series(data_single, dtype=dt)
+ exp = Series(exp_single, dtype=dt)
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ s = Series(data_multi, dtype=dt)
+ exp = Series(exp_multi, dtype=dt)
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ def test_strobj_mode(self):
+ exp = ['b']
+ data = ['a'] * 2 + ['b'] * 3
+
+ s = Series(data, dtype='c')
+ exp = Series(exp, dtype='c')
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ exp = ['bar']
+ data = ['foo'] * 2 + ['bar'] * 3
+
+ for dt in [str, object]:
+ s = Series(data, dtype=dt)
+ exp = Series(exp, dtype=dt)
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ def test_datelike_mode(self):
+ exp = Series(['1900-05-03', '2011-01-03',
+ '2013-01-02'], dtype="M8[ns]")
+ s = Series(['2011-01-03', '2013-01-02',
+ '1900-05-03'], dtype='M8[ns]')
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ exp = Series(['2011-01-03', '2013-01-02'], dtype='M8[ns]')
+ s = Series(['2011-01-03', '2013-01-02', '1900-05-03',
+ '2011-01-03', '2013-01-02'], dtype='M8[ns]')
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ def test_timedelta_mode(self):
+ exp = Series(['-1 days', '0 days', '1 days'],
+ dtype='timedelta64[ns]')
+ s = Series(['1 days', '-1 days', '0 days'],
+ dtype='timedelta64[ns]')
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]')
+ s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min',
+ '2 min', '2 min'], dtype='timedelta64[ns]')
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ def test_mixed_dtype(self):
+ exp = Series(['foo'])
+ s = Series([1, 'foo', 'foo'])
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ def test_uint64_overflow(self):
+ exp = Series([2**63], dtype=np.uint64)
+ s = Series([1, 2**63, 2**63], dtype=np.uint64)
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ exp = Series([1, 2**63], dtype=np.uint64)
+ s = Series([1, 2**63], dtype=np.uint64)
+ tm.assert_series_equal(algos.mode(s), exp)
+
+ def test_categorical(self):
+ c = Categorical([1, 2])
+ exp = c
+ tm.assert_categorical_equal(algos.mode(c), exp)
+ tm.assert_categorical_equal(c.mode(), exp)
+
+ c = Categorical([1, 'a', 'a'])
+ exp = Categorical(['a'], categories=[1, 'a'])
+ tm.assert_categorical_equal(algos.mode(c), exp)
+ tm.assert_categorical_equal(c.mode(), exp)
+
+ c = Categorical([1, 1, 2, 3, 3])
+ exp = Categorical([1, 3], categories=[1, 2, 3])
+ tm.assert_categorical_equal(algos.mode(c), exp)
+ tm.assert_categorical_equal(c.mode(), exp)
+
+ def test_index(self):
+ idx = Index([1, 2, 3])
+ exp = Series([1, 2, 3], dtype=np.int64)
+ tm.assert_series_equal(algos.mode(idx), exp)
+
+ idx = Index([1, 'a', 'a'])
+ exp = Series(['a'], dtype=object)
+ tm.assert_series_equal(algos.mode(idx), exp)
+
+ idx = Index([1, 1, 2, 3, 3])
+ exp = Series([1, 3], dtype=np.int64)
+ tm.assert_series_equal(algos.mode(idx), exp)
+
+ exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]')
+ idx = Index(['1 day', '1 day', '-1 day', '-1 day 2 min',
+ '2 min', '2 min'], dtype='timedelta64[ns]')
+ tm.assert_series_equal(algos.mode(idx), exp)
diff --git a/contrib/python/pandas/py2/pandas/tests/test_base.py b/contrib/python/pandas/py2/pandas/tests/test_base.py
new file mode 100644
index 00000000000..ac365eb87d1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_base.py
@@ -0,0 +1,1351 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+
+from datetime import datetime, timedelta
+import re
+import sys
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import iNaT
+import pandas.compat as compat
+from pandas.compat import PYPY, StringIO, long
+from pandas.compat.numpy import np_array_datetime64_compat
+
+from pandas.core.dtypes.common import (
+ is_datetime64_dtype, is_datetime64tz_dtype, is_object_dtype,
+ is_timedelta64_dtype, needs_i8_conversion)
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+
+import pandas as pd
+from pandas import (
+ CategoricalIndex, DataFrame, DatetimeIndex, Index, Interval, IntervalIndex,
+ Panel, PeriodIndex, Series, Timedelta, TimedeltaIndex, Timestamp)
+from pandas.core.accessor import PandasDelegate
+from pandas.core.arrays import DatetimeArray, PandasArray, TimedeltaArray
+from pandas.core.base import NoNewAttributesMixin, PandasObject
+from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
+import pandas.util.testing as tm
+
+
+class CheckStringMixin(object):
+
+ def test_string_methods_dont_fail(self):
+ repr(self.container)
+ str(self.container)
+ bytes(self.container)
+ if not compat.PY3:
+ unicode(self.container) # noqa
+
+ def test_tricky_container(self):
+ if not hasattr(self, 'unicode_container'):
+ pytest.skip('Need unicode_container to test with this')
+ repr(self.unicode_container)
+ str(self.unicode_container)
+ bytes(self.unicode_container)
+ if not compat.PY3:
+ unicode(self.unicode_container) # noqa
+
+
+class CheckImmutable(object):
+ mutable_regex = re.compile('does not support mutable operations')
+
+ def check_mutable_error(self, *args, **kwargs):
+ # Pass whatever function you normally would to pytest.raises
+ # (after the Exception kind); it is args[0] here
+ with pytest.raises(TypeError, match=self.mutable_regex.pattern):
+ args[0](*args[1:], **kwargs)
+
+ def test_no_mutable_funcs(self):
+ def setitem():
+ self.container[0] = 5
+
+ self.check_mutable_error(setitem)
+
+ def setslice():
+ self.container[1:2] = 3
+
+ self.check_mutable_error(setslice)
+
+ def delitem():
+ del self.container[0]
+
+ self.check_mutable_error(delitem)
+
+ def delslice():
+ del self.container[0:3]
+
+ self.check_mutable_error(delslice)
+
+ mutable_methods = getattr(self, "mutable_methods", [])
+
+ for meth in mutable_methods:
+ self.check_mutable_error(getattr(self.container, meth))
+
+ def test_slicing_maintains_type(self):
+ result = self.container[1:2]
+ expected = self.lst[1:2]
+ self.check_result(result, expected)
+
+ def check_result(self, result, expected, klass=None):
+ klass = klass or self.klass
+ assert isinstance(result, klass)
+ assert result == expected
+
+
+class TestPandasDelegate(object):
+
+ class Delegator(object):
+ _properties = ['foo']
+ _methods = ['bar']
+
+ def _set_foo(self, value):
+ self.foo = value
+
+ def _get_foo(self):
+ return self.foo
+
+ foo = property(_get_foo, _set_foo, doc="foo property")
+
+ def bar(self, *args, **kwargs):
+ """ a test bar method """
+ pass
+
+ class Delegate(PandasDelegate, PandasObject):
+
+ def __init__(self, obj):
+ self.obj = obj
+
+ def setup_method(self, method):
+ pass
+
+ def test_invalid_delegation(self):
+ # these show that in order for the delegation to work
+ # the _delegate_* methods need to be overridden to not raise
+ # a TypeError
+
+ self.Delegate._add_delegate_accessors(
+ delegate=self.Delegator,
+ accessors=self.Delegator._properties,
+ typ='property'
+ )
+ self.Delegate._add_delegate_accessors(
+ delegate=self.Delegator,
+ accessors=self.Delegator._methods,
+ typ='method'
+ )
+
+ delegate = self.Delegate(self.Delegator())
+
+ with pytest.raises(TypeError):
+ delegate.foo
+
+ with pytest.raises(TypeError):
+ delegate.foo = 5
+
+ with pytest.raises(TypeError):
+ delegate.foo()
+
+ @pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
+ def test_memory_usage(self):
+ # Delegate does not implement memory_usage.
+ # Check that we fall back to in-built `__sizeof__`
+ # GH 12924
+ delegate = self.Delegate(self.Delegator())
+ sys.getsizeof(delegate)
+
+
+class Ops(object):
+
+ def _allow_na_ops(self, obj):
+ """Whether to skip test cases including NaN"""
+ if (isinstance(obj, Index) and
+ (obj.is_boolean() or not obj._can_hold_na)):
+ # don't test boolean / int64 index
+ return False
+ return True
+
+ def setup_method(self, method):
+ self.bool_index = tm.makeBoolIndex(10, name='a')
+ self.int_index = tm.makeIntIndex(10, name='a')
+ self.float_index = tm.makeFloatIndex(10, name='a')
+ self.dt_index = tm.makeDateIndex(10, name='a')
+ self.dt_tz_index = tm.makeDateIndex(10, name='a').tz_localize(
+ tz='US/Eastern')
+ self.period_index = tm.makePeriodIndex(10, name='a')
+ self.string_index = tm.makeStringIndex(10, name='a')
+ self.unicode_index = tm.makeUnicodeIndex(10, name='a')
+
+ arr = np.random.randn(10)
+ self.bool_series = Series(arr, index=self.bool_index, name='a')
+ self.int_series = Series(arr, index=self.int_index, name='a')
+ self.float_series = Series(arr, index=self.float_index, name='a')
+ self.dt_series = Series(arr, index=self.dt_index, name='a')
+ self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True)
+ self.period_series = Series(arr, index=self.period_index, name='a')
+ self.string_series = Series(arr, index=self.string_index, name='a')
+ self.unicode_series = Series(arr, index=self.unicode_index, name='a')
+
+ types = ['bool', 'int', 'float', 'dt', 'dt_tz', 'period', 'string',
+ 'unicode']
+ self.indexes = [getattr(self, '{}_index'.format(t)) for t in types]
+ self.series = [getattr(self, '{}_series'.format(t)) for t in types]
+ self.objs = self.indexes + self.series
+
+ def check_ops_properties(self, props, filter=None, ignore_failures=False):
+ for op in props:
+ for o in self.is_valid_objs:
+
+ # if a filter, skip if it doesn't match
+ if filter is not None:
+ filt = o.index if isinstance(o, Series) else o
+ if not filter(filt):
+ continue
+
+ try:
+ if isinstance(o, Series):
+ expected = Series(
+ getattr(o.index, op), index=o.index, name='a')
+ else:
+ expected = getattr(o, op)
+ except AttributeError:
+ if ignore_failures:
+ continue
+
+ result = getattr(o, op)
+
+ # these could be series, arrays or scalars
+ if isinstance(result, Series) and isinstance(expected, Series):
+ tm.assert_series_equal(result, expected)
+ elif isinstance(result, Index) and isinstance(expected, Index):
+ tm.assert_index_equal(result, expected)
+ elif isinstance(result, np.ndarray) and isinstance(expected,
+ np.ndarray):
+ tm.assert_numpy_array_equal(result, expected)
+ else:
+ assert result == expected
+
+ # freq raises AttributeError on an Int64Index because it's not
+ # defined; we mostly care about Series here anyhow
+ if not ignore_failures:
+ for o in self.not_valid_objs:
+
+ # an object that is datetimelike will raise a TypeError,
+ # otherwise an AttributeError
+ err = AttributeError
+ if issubclass(type(o), DatetimeIndexOpsMixin):
+ err = TypeError
+
+ with pytest.raises(err):
+ getattr(o, op)
+
+ @pytest.mark.parametrize('klass', [Series, DataFrame, Panel])
+ def test_binary_ops_docs(self, klass):
+ op_map = {'add': '+',
+ 'sub': '-',
+ 'mul': '*',
+ 'mod': '%',
+ 'pow': '**',
+ 'truediv': '/',
+ 'floordiv': '//'}
+ for op_name in op_map:
+ operand1 = klass.__name__.lower()
+ operand2 = 'other'
+ op = op_map[op_name]
+ expected_str = ' '.join([operand1, op, operand2])
+ assert expected_str in getattr(klass, op_name).__doc__
+
+ # reverse version of the binary ops
+ expected_str = ' '.join([operand2, op, operand1])
+ assert expected_str in getattr(klass, 'r' + op_name).__doc__
+
+
+class TestIndexOps(Ops):
+
+ def setup_method(self, method):
+ super(TestIndexOps, self).setup_method(method)
+ self.is_valid_objs = self.objs
+ self.not_valid_objs = []
+
+ def test_none_comparison(self):
+
+ # bug brought up by #1079
+ # changed from TypeError in 0.17.0
+ for o in self.is_valid_objs:
+ if isinstance(o, Series):
+
+ o[0] = np.nan
+
+ # noinspection PyComparisonWithNone
+ result = o == None # noqa
+ assert not result.iat[0]
+ assert not result.iat[1]
+
+ # noinspection PyComparisonWithNone
+ result = o != None # noqa
+ assert result.iat[0]
+ assert result.iat[1]
+
+ result = None == o # noqa
+ assert not result.iat[0]
+ assert not result.iat[1]
+
+ result = None != o # noqa
+ assert result.iat[0]
+ assert result.iat[1]
+
+ if (is_datetime64_dtype(o) or is_datetime64tz_dtype(o)):
+ # Following DatetimeIndex (and Timestamp) convention,
+ # inequality comparisons with Series[datetime64] raise
+ with pytest.raises(TypeError):
+ None > o
+ with pytest.raises(TypeError):
+ o > None
+ else:
+ result = None > o
+ assert not result.iat[0]
+ assert not result.iat[1]
+
+ result = o < None
+ assert not result.iat[0]
+ assert not result.iat[1]
+
+ def test_ndarray_compat_properties(self):
+
+ for o in self.objs:
+ # Check that we work.
+ for p in ['shape', 'dtype', 'T', 'nbytes']:
+ assert getattr(o, p, None) is not None
+
+ # deprecated properties
+ for p in ['flags', 'strides', 'itemsize']:
+ with tm.assert_produces_warning(FutureWarning):
+ assert getattr(o, p, None) is not None
+
+ with tm.assert_produces_warning(FutureWarning):
+ assert hasattr(o, 'base')
+
+ # If we have a datetime-like dtype then accessing .data needs a
+ # view to work, but the user is responsible for that
+ try:
+ with tm.assert_produces_warning(FutureWarning):
+ assert o.data is not None
+ except ValueError:
+ pass
+
+ with pytest.raises(ValueError):
+ o.item() # len > 1
+
+ assert o.ndim == 1
+ assert o.size == len(o)
+
+ assert Index([1]).item() == 1
+ assert Series([1]).item() == 1
+
+ def test_value_counts_unique_nunique(self):
+ for orig in self.objs:
+ o = orig.copy()
+ klass = type(o)
+ values = o._values
+
+ if isinstance(values, Index):
+ # reset name not to affect latter process
+ values.name = None
+
+ # create repeated values: the 'n'th element is repeated n+1 times
+ # skip boolean, because it only has 2 values at most
+ if isinstance(o, Index) and o.is_boolean():
+ continue
+ elif isinstance(o, Index):
+ expected_index = Index(o[::-1])
+ expected_index.name = None
+ o = o.repeat(range(1, len(o) + 1))
+ o.name = 'a'
+ else:
+ expected_index = Index(values[::-1])
+ idx = o.index.repeat(range(1, len(o) + 1))
+ # take-based repeat
+ indices = np.repeat(np.arange(len(o)), range(1, len(o) + 1))
+ rep = values.take(indices)
+ o = klass(rep, index=idx, name='a')
+
+ # check values has the same dtype as the original
+ assert o.dtype == orig.dtype
+
+ expected_s = Series(range(10, 0, -1), index=expected_index,
+ dtype='int64', name='a')
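+ # the i-th original value now occurs (i + 1) times, so the counts,
+ # sorted descending, are exactly 10, 9, ..., 1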
+
+ result = o.value_counts()
+ tm.assert_series_equal(result, expected_s)
+ assert result.index.name is None
+ assert result.name == 'a'
+
+ result = o.unique()
+ if isinstance(o, Index):
+ assert isinstance(result, o.__class__)
+ tm.assert_index_equal(result, orig)
+ elif is_datetime64tz_dtype(o):
+ # datetimetz Series returns array of Timestamp
+ assert result[0] == orig[0]
+ for r in result:
+ assert isinstance(r, Timestamp)
+
+ tm.assert_numpy_array_equal(
+ result.astype(object),
+ orig._values.astype(object))
+ else:
+ tm.assert_numpy_array_equal(result, orig.values)
+
+ assert o.nunique() == len(np.unique(o.values))
+
+ @pytest.mark.parametrize('null_obj', [np.nan, None])
+ def test_value_counts_unique_nunique_null(self, null_obj):
+
+ for orig in self.objs:
+ o = orig.copy()
+ klass = type(o)
+ values = o._ndarray_values
+
+ if not self._allow_na_ops(o):
+ continue
+
+ # special assign to the numpy array
+ if is_datetime64tz_dtype(o):
+ if isinstance(o, DatetimeIndex):
+ v = o.asi8
+ v[0:2] = iNaT
+ values = o._shallow_copy(v)
+ else:
+ o = o.copy()
+ o[0:2] = iNaT
+ values = o._values
+
+ elif needs_i8_conversion(o):
+ values[0:2] = iNaT
+ values = o._shallow_copy(values)
+ else:
+ values[0:2] = null_obj
+ # check values has the same dtype as the original
+
+ assert values.dtype == o.dtype
+
+ # create repeated values: the 'n'th element is repeated n+1
+ # times
+ if isinstance(o, (DatetimeIndex, PeriodIndex)):
+ expected_index = o.copy()
+ expected_index.name = None
+
+ # attach name to klass
+ o = klass(values.repeat(range(1, len(o) + 1)))
+ o.name = 'a'
+ else:
+ if isinstance(o, DatetimeIndex):
+ expected_index = orig._values._shallow_copy(values)
+ else:
+ expected_index = Index(values)
+ expected_index.name = None
+ o = o.repeat(range(1, len(o) + 1))
+ o.name = 'a'
+
+ # check values has the same dtype as the original
+ assert o.dtype == orig.dtype
+ # check values correctly have NaN
+ nanloc = np.zeros(len(o), dtype=np.bool)
+ nanloc[:3] = True
+ if isinstance(o, Index):
+ tm.assert_numpy_array_equal(pd.isna(o), nanloc)
+ else:
+ exp = Series(nanloc, o.index, name='a')
+ tm.assert_series_equal(pd.isna(o), exp)
+
+ expected_s_na = Series(list(range(10, 2, -1)) + [3],
+ index=expected_index[9:0:-1],
+ dtype='int64', name='a')
+ expected_s = Series(list(range(10, 2, -1)),
+ index=expected_index[9:1:-1],
+ dtype='int64', name='a')
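+ # the nulls replaced positions 0 and 1, repeated 1 and 2 times, so
+ # dropna=False adds a count of 3 for NaN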
+
+ result_s_na = o.value_counts(dropna=False)
+ tm.assert_series_equal(result_s_na, expected_s_na)
+ assert result_s_na.index.name is None
+ assert result_s_na.name == 'a'
+ result_s = o.value_counts()
+ tm.assert_series_equal(o.value_counts(), expected_s)
+ assert result_s.index.name is None
+ assert result_s.name == 'a'
+
+ result = o.unique()
+ if isinstance(o, Index):
+ tm.assert_index_equal(result,
+ Index(values[1:], name='a'))
+ elif is_datetime64tz_dtype(o):
+ # unable to compare NaT / nan
+ tm.assert_extension_array_equal(result[1:], values[2:])
+ assert result[0] is pd.NaT
+ else:
+ tm.assert_numpy_array_equal(result[1:], values[2:])
+
+ assert pd.isna(result[0])
+ assert result.dtype == orig.dtype
+
+ assert o.nunique() == 8
+ assert o.nunique(dropna=False) == 9
+
+ @pytest.mark.parametrize('klass', [Index, Series])
+ def test_value_counts_inferred(self, klass):
+ s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
+ s = klass(s_values)
+ expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
+ tm.assert_series_equal(s.value_counts(), expected)
+
+ if isinstance(s, Index):
+ exp = Index(np.unique(np.array(s_values, dtype=np.object_)))
+ tm.assert_index_equal(s.unique(), exp)
+ else:
+ exp = np.unique(np.array(s_values, dtype=np.object_))
+ tm.assert_numpy_array_equal(s.unique(), exp)
+
+ assert s.nunique() == 4
+ # don't sort, have to sort after the fact as not sorting is
+ # platform-dependent
+ hist = s.value_counts(sort=False).sort_values()
+ expected = Series([3, 1, 4, 2], index=list('acbd')).sort_values()
+ tm.assert_series_equal(hist, expected)
+
+ # sort ascending
+ hist = s.value_counts(ascending=True)
+ expected = Series([1, 2, 3, 4], index=list('cdab'))
+ tm.assert_series_equal(hist, expected)
+
+ # relative histogram.
+ hist = s.value_counts(normalize=True)
+ expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
+ tm.assert_series_equal(hist, expected)
+
+ @pytest.mark.parametrize('klass', [Index, Series])
+ def test_value_counts_bins(self, klass):
+ s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
+ s = klass(s_values)
+
+ # bins
+ with pytest.raises(TypeError):
+ s.value_counts(bins=1)
+
+ s1 = Series([1, 1, 2, 3])
+ res1 = s1.value_counts(bins=1)
+ exp1 = Series({Interval(0.997, 3.0): 4})
+ tm.assert_series_equal(res1, exp1)
+ res1n = s1.value_counts(bins=1, normalize=True)
+ exp1n = Series({Interval(0.997, 3.0): 1.0})
+ tm.assert_series_equal(res1n, exp1n)
+
+ if isinstance(s1, Index):
+ tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
+ else:
+ exp = np.array([1, 2, 3], dtype=np.int64)
+ tm.assert_numpy_array_equal(s1.unique(), exp)
+
+ assert s1.nunique() == 3
+
+ # these return the same
+ res4 = s1.value_counts(bins=4, dropna=True)
+ intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
+ exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
+ tm.assert_series_equal(res4, exp4)
+
+ res4 = s1.value_counts(bins=4, dropna=False)
+ intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
+ exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
+ tm.assert_series_equal(res4, exp4)
+
+ res4n = s1.value_counts(bins=4, normalize=True)
+ exp4n = Series([0.5, 0.25, 0.25, 0],
+ index=intervals.take([0, 3, 1, 2]))
+ tm.assert_series_equal(res4n, exp4n)
+
+ # handle NA's properly
+ s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
+ 'd', 'd', 'a', 'a', 'b']
+ s = klass(s_values)
+ expected = Series([4, 3, 2], index=['b', 'a', 'd'])
+ tm.assert_series_equal(s.value_counts(), expected)
+
+ if isinstance(s, Index):
+ exp = Index(['a', 'b', np.nan, 'd'])
+ tm.assert_index_equal(s.unique(), exp)
+ else:
+ exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
+ tm.assert_numpy_array_equal(s.unique(), exp)
+ assert s.nunique() == 3
+
+ s = klass({})
+ expected = Series([], dtype=np.int64)
+ tm.assert_series_equal(s.value_counts(), expected,
+ check_index_type=False)
+ # returned dtype differs depending on original
+ if isinstance(s, Index):
+ tm.assert_index_equal(s.unique(), Index([]), exact=False)
+ else:
+ tm.assert_numpy_array_equal(s.unique(), np.array([]),
+ check_dtype=False)
+
+ assert s.nunique() == 0
+
+ @pytest.mark.parametrize('klass', [Index, Series])
+ def test_value_counts_datetime64(self, klass):
+
+ # GH 3002, datetime64[ns]
+ # don't test names though
+ txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM',
+ 'xxyyzz20100101EGG', 'xxyyww20090101EGG',
+ 'foofoo20080909PIE', 'foofoo20080909GUM'])
+ f = StringIO(txt)
+ df = pd.read_fwf(f, widths=[6, 8, 3],
+ names=["person_id", "dt", "food"],
+ parse_dates=["dt"])
+
+ s = klass(df['dt'].copy())
+ s.name = None
+ idx = pd.to_datetime(['2010-01-01 00:00:00',
+ '2008-09-09 00:00:00',
+ '2009-01-01 00:00:00'])
+ expected_s = Series([3, 2, 1], index=idx)
+ tm.assert_series_equal(s.value_counts(), expected_s)
+
+ expected = np_array_datetime64_compat(['2010-01-01 00:00:00',
+ '2009-01-01 00:00:00',
+ '2008-09-09 00:00:00'],
+ dtype='datetime64[ns]')
+ if isinstance(s, Index):
+ tm.assert_index_equal(s.unique(), DatetimeIndex(expected))
+ else:
+ tm.assert_numpy_array_equal(s.unique(), expected)
+
+ assert s.nunique() == 3
+
+ # with NaT
+ s = df['dt'].copy()
+ s = klass([v for v in s.values] + [pd.NaT])
+
+ result = s.value_counts()
+ assert result.index.dtype == 'datetime64[ns]'
+ tm.assert_series_equal(result, expected_s)
+
+ result = s.value_counts(dropna=False)
+ expected_s[pd.NaT] = 1
+ tm.assert_series_equal(result, expected_s)
+
+ unique = s.unique()
+ assert unique.dtype == 'datetime64[ns]'
+
+ # numpy_array_equal cannot compare pd.NaT
+ if isinstance(s, Index):
+ exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT])
+ tm.assert_index_equal(unique, exp_idx)
+ else:
+ tm.assert_numpy_array_equal(unique[:3], expected)
+ assert pd.isna(unique[3])
+
+ assert s.nunique() == 3
+ assert s.nunique(dropna=False) == 4
+
+ # timedelta64[ns]
+ td = df.dt - df.dt + timedelta(1)
+ td = klass(td, name='dt')
+
+ result = td.value_counts()
+ expected_s = Series([6], index=[Timedelta('1day')], name='dt')
+ tm.assert_series_equal(result, expected_s)
+
+ expected = TimedeltaIndex(['1 days'], name='dt')
+ if isinstance(td, Index):
+ tm.assert_index_equal(td.unique(), expected)
+ else:
+ tm.assert_numpy_array_equal(td.unique(), expected.values)
+
+ td2 = timedelta(1) + (df.dt - df.dt)
+ td2 = klass(td2, name='dt')
+ result2 = td2.value_counts()
+ tm.assert_series_equal(result2, expected_s)
+
+ def test_factorize(self):
+ for orig in self.objs:
+ o = orig.copy()
+
+ if isinstance(o, Index) and o.is_boolean():
+ exp_arr = np.array([0, 1] + [0] * 8, dtype=np.intp)
+ exp_uniques = Index([False, True])
+ else:
+ exp_arr = np.array(range(len(o)), dtype=np.intp)
+ exp_uniques = o
+ labels, uniques = o.factorize()
+
+ tm.assert_numpy_array_equal(labels, exp_arr)
+ if isinstance(o, Series):
+ tm.assert_index_equal(uniques, Index(orig),
+ check_names=False)
+ else:
+ # factorize explicitly resets name
+ tm.assert_index_equal(uniques, exp_uniques,
+ check_names=False)
+
+ def test_factorize_repeated(self):
+ for orig in self.objs:
+ o = orig.copy()
+
+ # don't test boolean
+ if isinstance(o, Index) and o.is_boolean():
+ continue
+
+ # sort by value, and create duplicates
+ if isinstance(o, Series):
+ o = o.sort_values()
+ n = o.iloc[5:].append(o)
+ else:
+ indexer = o.argsort()
+ o = o.take(indexer)
+ n = o[5:].append(o)
+
+ exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+ dtype=np.intp)
+ labels, uniques = n.factorize(sort=True)
+
+ tm.assert_numpy_array_equal(labels, exp_arr)
+ if isinstance(o, Series):
+ tm.assert_index_equal(uniques, Index(orig).sort_values(),
+ check_names=False)
+ else:
+ tm.assert_index_equal(uniques, o, check_names=False)
+
+ exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4],
+ np.intp)
+ labels, uniques = n.factorize(sort=False)
+ tm.assert_numpy_array_equal(labels, exp_arr)
+
+ if isinstance(o, Series):
+ expected = Index(o.iloc[5:10].append(o.iloc[:5]))
+ tm.assert_index_equal(uniques, expected, check_names=False)
+ else:
+ expected = o[5:10].append(o[:5])
+ tm.assert_index_equal(uniques, expected, check_names=False)
+
+ def test_duplicated_drop_duplicates_index(self):
+ # GH 4060
+ for original in self.objs:
+ if isinstance(original, Index):
+
+ # special case
+ if original.is_boolean():
+ result = original.drop_duplicates()
+ expected = Index([False, True], name='a')
+ tm.assert_index_equal(result, expected)
+ continue
+
+ # original doesn't have duplicates
+ expected = np.array([False] * len(original), dtype=bool)
+ duplicated = original.duplicated()
+ tm.assert_numpy_array_equal(duplicated, expected)
+ assert duplicated.dtype == bool
+ result = original.drop_duplicates()
+ tm.assert_index_equal(result, original)
+ assert result is not original
+
+ # has_duplicates
+ assert not original.has_duplicates
+
+ # create repeated values, 3rd and 5th values are duplicated
+ idx = original[list(range(len(original))) + [5, 3]]
+ expected = np.array([False] * len(original) + [True, True],
+ dtype=bool)
+ duplicated = idx.duplicated()
+ tm.assert_numpy_array_equal(duplicated, expected)
+ assert duplicated.dtype == bool
+ tm.assert_index_equal(idx.drop_duplicates(), original)
+
+ base = [False] * len(idx)
+ base[3] = True
+ base[5] = True
+ expected = np.array(base)
+
+ duplicated = idx.duplicated(keep='last')
+ tm.assert_numpy_array_equal(duplicated, expected)
+ assert duplicated.dtype == bool
+ result = idx.drop_duplicates(keep='last')
+ tm.assert_index_equal(result, idx[~expected])
+
+ base = [False] * len(original) + [True, True]
+ base[3] = True
+ base[5] = True
+ expected = np.array(base)
+
+ duplicated = idx.duplicated(keep=False)
+ tm.assert_numpy_array_equal(duplicated, expected)
+ assert duplicated.dtype == bool
+ result = idx.drop_duplicates(keep=False)
+ tm.assert_index_equal(result, idx[~expected])
+
+ with pytest.raises(TypeError,
+ match=(r"drop_duplicates\(\) got an "
+ r"unexpected keyword argument")):
+ idx.drop_duplicates(inplace=True)
+
+ else:
+ expected = Series([False] * len(original),
+ index=original.index, name='a')
+ tm.assert_series_equal(original.duplicated(), expected)
+ result = original.drop_duplicates()
+ tm.assert_series_equal(result, original)
+ assert result is not original
+
+ idx = original.index[list(range(len(original))) + [5, 3]]
+ values = original._values[list(range(len(original))) + [5, 3]]
+ s = Series(values, index=idx, name='a')
+
+ expected = Series([False] * len(original) + [True, True],
+ index=idx, name='a')
+ tm.assert_series_equal(s.duplicated(), expected)
+ tm.assert_series_equal(s.drop_duplicates(), original)
+
+ base = [False] * len(idx)
+ base[3] = True
+ base[5] = True
+ expected = Series(base, index=idx, name='a')
+
+ tm.assert_series_equal(s.duplicated(keep='last'), expected)
+ tm.assert_series_equal(s.drop_duplicates(keep='last'),
+ s[~np.array(base)])
+
+ base = [False] * len(original) + [True, True]
+ base[3] = True
+ base[5] = True
+ expected = Series(base, index=idx, name='a')
+
+ tm.assert_series_equal(s.duplicated(keep=False), expected)
+ tm.assert_series_equal(s.drop_duplicates(keep=False),
+ s[~np.array(base)])
+
+ s.drop_duplicates(inplace=True)
+ tm.assert_series_equal(s, original)
+
+ def test_drop_duplicates_series_vs_dataframe(self):
+ # GH 14192
+ df = pd.DataFrame({'a': [1, 1, 1, 'one', 'one'],
+ 'b': [2, 2, np.nan, np.nan, np.nan],
+ 'c': [3, 3, np.nan, np.nan, 'three'],
+ 'd': [1, 2, 3, 4, 4],
+ 'e': [datetime(2015, 1, 1), datetime(2015, 1, 1),
+ datetime(2015, 2, 1), pd.NaT, pd.NaT]
+ })
+ for column in df.columns:
+ for keep in ['first', 'last', False]:
+ dropped_frame = df[[column]].drop_duplicates(keep=keep)
+ dropped_series = df[column].drop_duplicates(keep=keep)
+ tm.assert_frame_equal(dropped_frame, dropped_series.to_frame())
+
+ def test_fillna(self):
+ # GH 11343
+ # though Index.fillna and Series.fillna have separate implementations,
+ # test here to confirm they work the same
+
+ for orig in self.objs:
+
+ o = orig.copy()
+ values = o.values
+
+ # values will not be changed
+ result = o.fillna(o.astype(object).values[0])
+ if isinstance(o, Index):
+ tm.assert_index_equal(o, result)
+ else:
+ tm.assert_series_equal(o, result)
+ # check shallow_copied
+ assert o is not result
+
+ for null_obj in [np.nan, None]:
+ for orig in self.objs:
+ o = orig.copy()
+ klass = type(o)
+
+ if not self._allow_na_ops(o):
+ continue
+
+ if needs_i8_conversion(o):
+
+ values = o.astype(object).values
+ fill_value = values[0]
+ values[0:2] = pd.NaT
+ else:
+ values = o.values.copy()
+ fill_value = o.values[0]
+ values[0:2] = null_obj
+
+ expected = [fill_value] * 2 + list(values[2:])
+
+ expected = klass(expected)
+ o = klass(values)
+
+ # check values has the same dtype as the original
+ assert o.dtype == orig.dtype
+
+ result = o.fillna(fill_value)
+ if isinstance(o, Index):
+ tm.assert_index_equal(result, expected)
+ else:
+ tm.assert_series_equal(result, expected)
+ # check shallow_copied
+ assert o is not result
+
+ @pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
+ def test_memory_usage(self):
+ for o in self.objs:
+ res = o.memory_usage()
+ res_deep = o.memory_usage(deep=True)
+
+ if (is_object_dtype(o) or (isinstance(o, Series) and
+ is_object_dtype(o.index))):
+ # if there are objects, only deep will pick them up
+ assert res_deep > res
+ else:
+ assert res == res_deep
+
+ if isinstance(o, Series):
+ assert ((o.memory_usage(index=False) +
+ o.index.memory_usage()) ==
+ o.memory_usage(index=True))
+
+ # sys.getsizeof will call the .memory_usage with
+ # deep=True, and add on some GC overhead
+ diff = res_deep - sys.getsizeof(o)
+ assert abs(diff) < 100
+
+ def test_searchsorted(self):
+ # See gh-12238
+ for o in self.objs:
+ index = np.searchsorted(o, max(o))
+ assert 0 <= index <= len(o)
+
+ index = np.searchsorted(o, max(o), sorter=range(len(o)))
+ assert 0 <= index <= len(o)
+
+ def test_validate_bool_args(self):
+ invalid_values = [1, "True", [1, 2, 3], 5.0]
+
+ for value in invalid_values:
+ with pytest.raises(ValueError):
+ self.int_series.drop_duplicates(inplace=value)
+
+ def test_getitem(self):
+ for i in self.indexes:
+ s = pd.Series(i)
+
+ assert i[0] == s.iloc[0]
+ assert i[5] == s.iloc[5]
+ assert i[-1] == s.iloc[-1]
+
+ assert i[-1] == i[9]
+
+ with pytest.raises(IndexError):
+ i[20]
+ with pytest.raises(IndexError):
+ s.iloc[20]
+
+ @pytest.mark.parametrize('indexer_klass', [list, pd.Index])
+ @pytest.mark.parametrize('indexer', [[True] * 10, [False] * 10,
+ [True, False, True, True, False,
+ False, True, True, False, True]])
+ def test_bool_indexing(self, indexer_klass, indexer):
+ # GH 22533
+ for idx in self.indexes:
+ exp_idx = [i for i in range(len(indexer)) if indexer[i]]
+ tm.assert_index_equal(idx[indexer_klass(indexer)], idx[exp_idx])
+ s = pd.Series(idx)
+ tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx])
+
+
+class TestTranspose(Ops):
+ errmsg = "the 'axes' parameter is not supported"
+
+ def test_transpose(self):
+ for obj in self.objs:
+ tm.assert_equal(obj.transpose(), obj)
+
+ def test_transpose_non_default_axes(self):
+ for obj in self.objs:
+ with pytest.raises(ValueError, match=self.errmsg):
+ obj.transpose(1)
+ with pytest.raises(ValueError, match=self.errmsg):
+ obj.transpose(axes=1)
+
+ def test_numpy_transpose(self):
+ for obj in self.objs:
+ tm.assert_equal(np.transpose(obj), obj)
+
+ with pytest.raises(ValueError, match=self.errmsg):
+ np.transpose(obj, axes=1)
+
+
+class TestNoNewAttributesMixin(object):
+
+ def test_mixin(self):
+ class T(NoNewAttributesMixin):
+ pass
+
+ t = T()
+ assert not hasattr(t, "__frozen")
+
+ t.a = "test"
+ assert t.a == "test"
+
+ t._freeze()
+ assert "__frozen" in dir(t)
+ assert getattr(t, "__frozen")
+
+ with pytest.raises(AttributeError):
+ t.b = "test"
+
+ assert not hasattr(t, "b")
+
+
+class TestToIterable(object):
+ # test that we convert an iterable to python types
+
+ dtypes = [
+ ('int8', (int, long)),
+ ('int16', (int, long)),
+ ('int32', (int, long)),
+ ('int64', (int, long)),
+ ('uint8', (int, long)),
+ ('uint16', (int, long)),
+ ('uint32', (int, long)),
+ ('uint64', (int, long)),
+ ('float16', float),
+ ('float32', float),
+ ('float64', float),
+ ('datetime64[ns]', Timestamp),
+ ('datetime64[ns, US/Eastern]', Timestamp),
+ ('timedelta64[ns]', Timedelta)]
+
+ @pytest.mark.parametrize(
+ 'dtype, rdtype', dtypes)
+ @pytest.mark.parametrize(
+ 'method',
+ [
+ lambda x: x.tolist(),
+ lambda x: x.to_list(),
+ lambda x: list(x),
+ lambda x: list(x.__iter__()),
+ ], ids=['tolist', 'to_list', 'list', 'iter'])
+ @pytest.mark.parametrize('typ', [Series, Index])
+ @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning")
+ # TODO(GH-24559): Remove the filterwarnings
+ def test_iterable(self, typ, method, dtype, rdtype):
+ # gh-10904
+ # gh-13258
+ # coerce iteration to underlying python / pandas types
+ s = typ([1], dtype=dtype)
+ result = method(s)[0]
+ assert isinstance(result, rdtype)
+
+ @pytest.mark.parametrize(
+ 'dtype, rdtype, obj',
+ [
+ ('object', object, 'a'),
+ ('object', (int, long), 1),
+ ('category', object, 'a'),
+ ('category', (int, long), 1)])
+ @pytest.mark.parametrize(
+ 'method',
+ [
+ lambda x: x.tolist(),
+ lambda x: x.to_list(),
+ lambda x: list(x),
+ lambda x: list(x.__iter__()),
+ ], ids=['tolist', 'to_list', 'list', 'iter'])
+ @pytest.mark.parametrize('typ', [Series, Index])
+ def test_iterable_object_and_category(self, typ, method,
+ dtype, rdtype, obj):
+ # gh-10904
+ # gh-13258
+ # coerce iteration to underlying python / pandas types
+ s = typ([obj], dtype=dtype)
+ result = method(s)[0]
+ assert isinstance(result, rdtype)
+
+ @pytest.mark.parametrize(
+ 'dtype, rdtype', dtypes)
+ def test_iterable_items(self, dtype, rdtype):
+ # gh-13258
+ # test items / iteritems yields the correct boxed scalars
+ # this only applies to series
+ s = Series([1], dtype=dtype)
+ _, result = list(s.items())[0]
+ assert isinstance(result, rdtype)
+
+ _, result = list(s.iteritems())[0]
+ assert isinstance(result, rdtype)
+
+ @pytest.mark.parametrize(
+ 'dtype, rdtype',
+ dtypes + [
+ ('object', (int, long)),
+ ('category', (int, long))])
+ @pytest.mark.parametrize('typ', [Series, Index])
+ @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning")
+ # TODO(GH-24559): Remove the filterwarnings
+ def test_iterable_map(self, typ, dtype, rdtype):
+ # gh-13236
+ # coerce iteration to underlying python / pandas types
+ s = typ([1], dtype=dtype)
+ result = s.map(type)[0]
+ if not isinstance(rdtype, tuple):
+ rdtype = tuple([rdtype])
+ assert result in rdtype
+
+ @pytest.mark.parametrize(
+ 'method',
+ [
+ lambda x: x.tolist(),
+ lambda x: x.to_list(),
+ lambda x: list(x),
+ lambda x: list(x.__iter__()),
+ ], ids=['tolist', 'to_list', 'list', 'iter'])
+ def test_categorial_datetimelike(self, method):
+ i = CategoricalIndex([Timestamp('1999-12-31'),
+ Timestamp('2000-12-31')])
+
+ result = method(i)[0]
+ assert isinstance(result, Timestamp)
+
+ def test_iter_box(self):
+ vals = [Timestamp('2011-01-01'), Timestamp('2011-01-02')]
+ s = Series(vals)
+ assert s.dtype == 'datetime64[ns]'
+ for res, exp in zip(s, vals):
+ assert isinstance(res, Timestamp)
+ assert res.tz is None
+ assert res == exp
+
+ vals = [Timestamp('2011-01-01', tz='US/Eastern'),
+ Timestamp('2011-01-02', tz='US/Eastern')]
+ s = Series(vals)
+
+ assert s.dtype == 'datetime64[ns, US/Eastern]'
+ for res, exp in zip(s, vals):
+ assert isinstance(res, Timestamp)
+ assert res.tz == exp.tz
+ assert res == exp
+
+ # timedelta
+ vals = [Timedelta('1 days'), Timedelta('2 days')]
+ s = Series(vals)
+ assert s.dtype == 'timedelta64[ns]'
+ for res, exp in zip(s, vals):
+ assert isinstance(res, Timedelta)
+ assert res == exp
+
+ # period
+ vals = [pd.Period('2011-01-01', freq='M'),
+ pd.Period('2011-01-02', freq='M')]
+ s = Series(vals)
+ assert s.dtype == 'Period[M]'
+ for res, exp in zip(s, vals):
+ assert isinstance(res, pd.Period)
+ assert res.freq == 'M'
+ assert res == exp
+
+
[email protected]('array, expected_type, dtype', [
+ (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'),
+ (np.array(['a', 'b']), np.ndarray, 'object'),
+ (pd.Categorical(['a', 'b']), pd.Categorical, 'category'),
+ (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), DatetimeArray,
+ 'datetime64[ns, US/Central]'),
+
+ (pd.PeriodIndex([2018, 2019], freq='A'), pd.core.arrays.PeriodArray,
+ pd.core.dtypes.dtypes.PeriodDtype("A-DEC")),
+ (pd.IntervalIndex.from_breaks([0, 1, 2]), pd.core.arrays.IntervalArray,
+ 'interval'),
+
+ # This test is currently failing for datetime64[ns] and timedelta64[ns].
+ # The NumPy type system is sufficient for representing these types, so
+ # we just use NumPy for Series / DataFrame columns of these types (so
+ # we get consolidation and so on).
+ # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray
+ # abstraction for code reuse.
+ # At the moment, we've judged that allowing this test to fail is more
+ # practical than overriding Series._values to special-case
+ # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray.
+ pytest.param(
+ pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]',
+ marks=[pytest.mark.xfail(reason="datetime _values", strict=True)]
+ ),
+ pytest.param(
+ pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]',
+ marks=[pytest.mark.xfail(reason="timedelta _values", strict=True)]
+ ),
+
+])
+def test_values_consistent(array, expected_type, dtype):
+ l_values = pd.Series(array)._values
+ r_values = pd.Index(array)._values
+ assert type(l_values) is expected_type
+ assert type(l_values) is type(r_values)
+
+ tm.assert_equal(l_values, r_values)
+
+
[email protected]('array, expected', [
+ (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)),
+ (np.array(['0', '1']), np.array(['0', '1'], dtype=object)),
+ (pd.Categorical(['a', 'a']), np.array([0, 0], dtype='int8')),
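+ # a Categorical's _ndarray_values are its int8 codes, not the labels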
+ (pd.DatetimeIndex(['2017-01-01T00:00:00']),
+ np.array(['2017-01-01T00:00:00'], dtype='M8[ns]')),
+ (pd.DatetimeIndex(['2017-01-01T00:00:00'], tz="US/Eastern"),
+ np.array(['2017-01-01T05:00:00'], dtype='M8[ns]')),
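+ # tz-aware values are stored as UTC, hence the +5h shift for US/Eastern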
+ (pd.TimedeltaIndex([10**10]), np.array([10**10], dtype='m8[ns]')),
+ (pd.PeriodIndex(['2017', '2018'], freq='D'),
+ np.array([17167, 17532], dtype=np.int64)),
+])
+def test_ndarray_values(array, expected):
+ l_values = pd.Series(array)._ndarray_values
+ r_values = pd.Index(array)._ndarray_values
+ tm.assert_numpy_array_equal(l_values, r_values)
+ tm.assert_numpy_array_equal(l_values, expected)
+
+
[email protected]("arr", [
+ np.array([1, 2, 3]),
+])
+def test_numpy_array(arr):
+ ser = pd.Series(arr)
+ result = ser.array
+ expected = PandasArray(arr)
+ tm.assert_extension_array_equal(result, expected)
+
+
+def test_numpy_array_all_dtypes(any_numpy_dtype):
+ ser = pd.Series(dtype=any_numpy_dtype)
+ result = ser.array
+ if is_datetime64_dtype(any_numpy_dtype):
+ assert isinstance(result, DatetimeArray)
+ elif is_timedelta64_dtype(any_numpy_dtype):
+ assert isinstance(result, TimedeltaArray)
+ else:
+ assert isinstance(result, PandasArray)
+
+
[email protected]("array, attr", [
+ (pd.Categorical(['a', 'b']), '_codes'),
+ (pd.core.arrays.period_array(['2000', '2001'], freq='D'), '_data'),
+ (pd.core.arrays.integer_array([0, np.nan]), '_data'),
+ (pd.core.arrays.IntervalArray.from_breaks([0, 1]), '_left'),
+ (pd.SparseArray([0, 1]), '_sparse_values'),
+ (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"),
+ # tz-aware Datetime
+ (DatetimeArray(np.array(['2000-01-01T12:00:00',
+ '2000-01-02T12:00:00'],
+ dtype='M8[ns]'),
+ dtype=DatetimeTZDtype(tz="US/Central")),
+ '_data'),
+])
[email protected]('box', [pd.Series, pd.Index])
+def test_array(array, attr, box):
+ if array.dtype.name in ('Int64', 'Sparse[int64, 0]') and box is pd.Index:
+ pytest.skip("No index type for {}".format(array.dtype))
+ result = box(array, copy=False).array
+
+ if attr:
+ array = getattr(array, attr)
+ result = getattr(result, attr)
+
+ assert result is array
+
+
+def test_array_multiindex_raises():
+ idx = pd.MultiIndex.from_product([['A'], ['a', 'b']])
+ with pytest.raises(ValueError, match='MultiIndex'):
+ idx.array
+
+
[email protected]('array, expected', [
+ (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)),
+ (pd.Categorical(['a', 'b']), np.array(['a', 'b'], dtype=object)),
+ (pd.core.arrays.period_array(['2000', '2001'], freq='D'),
+ np.array([pd.Period('2000', freq="D"), pd.Period('2001', freq='D')])),
+ (pd.core.arrays.integer_array([0, np.nan]),
+ np.array([0, np.nan], dtype=object)),
+ (pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]),
+ np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object)),
+ (pd.SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)),
+
+ # tz-naive datetime
+ (DatetimeArray(np.array(['2000', '2001'], dtype='M8[ns]')),
+ np.array(['2000', '2001'], dtype='M8[ns]')),
+
+ # tz-aware stays tz-aware
+ (DatetimeArray(np.array(['2000-01-01T06:00:00',
+ '2000-01-02T06:00:00'],
+ dtype='M8[ns]'),
+ dtype=DatetimeTZDtype(tz='US/Central')),
+ np.array([pd.Timestamp('2000-01-01', tz='US/Central'),
+ pd.Timestamp('2000-01-02', tz='US/Central')])),
+
+ # Timedelta
+ (TimedeltaArray(np.array([0, 3600000000000], dtype='i8'), freq='H'),
+ np.array([0, 3600000000000], dtype='m8[ns]')),
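+ # (3600000000000 ns is exactly 1 hour, matching freq='H')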
+])
[email protected]('box', [pd.Series, pd.Index])
+def test_to_numpy(array, expected, box):
+ thing = box(array)
+
+ if array.dtype.name in ('Int64', 'Sparse[int64, 0]') and box is pd.Index:
+ pytest.skip("No index type for {}".format(array.dtype))
+
+ result = thing.to_numpy()
+ tm.assert_numpy_array_equal(result, expected)
+
+
[email protected]("as_series", [True, False])
+ np.array([1, 2, 3], dtype="int64"),
+ np.array(['a', 'b', 'c'], dtype=object),
+])
+def test_to_numpy_copy(arr, as_series):
+ obj = pd.Index(arr, copy=False)
+ if as_series:
+ obj = pd.Series(obj.values, copy=False)
+
+ # no copy by default
+ result = obj.to_numpy()
+ assert np.shares_memory(arr, result) is True
+
+ result = obj.to_numpy(copy=False)
+ assert np.shares_memory(arr, result) is True
+
+ # copy=True
+ result = obj.to_numpy(copy=True)
+ assert np.shares_memory(arr, result) is False
+
+
[email protected]("as_series", [True, False])
+def test_to_numpy_dtype(as_series):
+ tz = "US/Eastern"
+ obj = pd.DatetimeIndex(['2000', '2001'], tz=tz)
+ if as_series:
+ obj = pd.Series(obj)
+
+ # preserve tz by default
+ result = obj.to_numpy()
+ expected = np.array([pd.Timestamp('2000', tz=tz),
+ pd.Timestamp('2001', tz=tz)],
+ dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = obj.to_numpy(dtype="object")
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = obj.to_numpy(dtype="M8[ns]")
+ expected = np.array(['2000-01-01T05', '2001-01-01T05'],
+ dtype='M8[ns]')
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/test_common.py b/contrib/python/pandas/py2/pandas/tests/test_common.py
new file mode 100644
index 00000000000..18eb760e31d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_common.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+
+import collections
+from functools import partial
+import string
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Series, Timestamp
+from pandas.core import common as com, ops
+
+
+def test_get_callable_name():
+ getname = com.get_callable_name
+
+ def fn(x):
+ return x
+
+ lambda_ = lambda x: x # noqa: E731
+ part1 = partial(fn)
+ part2 = partial(part1)
+
+ class somecall(object):
+
+ def __call__(self):
+ return x # noqa
+
+ assert getname(fn) == 'fn'
+ assert getname(lambda_)
+ assert getname(part1) == 'fn'
+ assert getname(part2) == 'fn'
+ assert getname(somecall()) == 'somecall'
+ assert getname(1) is None
+
+
+def test_any_none():
+ assert (com._any_none(1, 2, 3, None))
+ assert (not com._any_none(1, 2, 3, 4))
+
+
+def test_all_not_none():
+ assert (com._all_not_none(1, 2, 3, 4))
+ assert (not com._all_not_none(1, 2, 3, None))
+ assert (not com._all_not_none(None, None, None, None))
+
+
+def test_random_state():
+ import numpy.random as npr
+ # Check with seed
+ state = com.random_state(5)
+ assert state.uniform() == npr.RandomState(5).uniform()
+
+ # Check with random state object
+ state2 = npr.RandomState(10)
+ assert com.random_state(state2).uniform() == npr.RandomState(10).uniform()
+
+ # check with no arg random state
+ assert com.random_state() is np.random
+
+ # Error for floats or strings
+ with pytest.raises(ValueError):
+ com.random_state('test')
+
+ with pytest.raises(ValueError):
+ com.random_state(5.5)
+
+
[email protected]('left, right, expected', [
+ (Series([1], name='x'), Series([2], name='x'), 'x'),
+ (Series([1], name='x'), Series([2], name='y'), None),
+ (Series([1]), Series([2], name='x'), None),
+ (Series([1], name='x'), Series([2]), None),
+ (Series([1], name='x'), [2], 'x'),
+ ([1], Series([2], name='y'), 'y')])
+def test_maybe_match_name(left, right, expected):
+ assert ops._maybe_match_name(left, right) == expected
+
+
+def test_dict_compat():
+ data_datetime64 = {np.datetime64('1990-03-15'): 1,
+ np.datetime64('2015-03-15'): 2}
+ data_unchanged = {1: 2, 3: 4, 5: 6}
+ expected = {Timestamp('1990-3-15'): 1, Timestamp('2015-03-15'): 2}
+ assert (com.dict_compat(data_datetime64) == expected)
+ assert (com.dict_compat(expected) == expected)
+ assert (com.dict_compat(data_unchanged) == data_unchanged)
+
+
+def test_standardize_mapping():
+ # No uninitialized defaultdicts
+ with pytest.raises(TypeError):
+ com.standardize_mapping(collections.defaultdict)
+
+ # No non-mapping subtypes, instance
+ with pytest.raises(TypeError):
+ com.standardize_mapping([])
+
+ # No non-mapping subtypes, class
+ with pytest.raises(TypeError):
+ com.standardize_mapping(list)
+
+ fill = {'bad': 'data'}
+ assert (com.standardize_mapping(fill) == dict)
+
+ # Convert instance to type
+ assert (com.standardize_mapping({}) == dict)
+
+ dd = collections.defaultdict(list)
+ assert isinstance(com.standardize_mapping(dd), partial)
+
+
+def test_git_version():
+ # GH 21295
+ git_version = pd.__git_version__
+ assert len(git_version) == 40
+ assert all(c in string.hexdigits for c in git_version)
diff --git a/contrib/python/pandas/py2/pandas/tests/test_compat.py b/contrib/python/pandas/py2/pandas/tests/test_compat.py
new file mode 100644
index 00000000000..d1a3ee43a46
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_compat.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+"""
+Testing that functions from compat work as expected
+"""
+
+import re
+
+import pytest
+
+from pandas.compat import (
+ PY2, builtins, filter, get_range_parameters, iteritems, iterkeys,
+ itervalues, lfilter, lmap, lrange, lzip, map, next, range, re_type, zip)
+
+
+class TestBuiltinIterators(object):
+
+ @classmethod
+ def check_result(cls, actual, expected, lengths):
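+ # each entry of 'actual' pairs a lazy iterator result with its eager list counterpart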
+ for (iter_res, list_res), exp, length in zip(actual, expected,
+ lengths):
+ assert not isinstance(iter_res, list)
+ assert isinstance(list_res, list)
+
+ iter_res = list(iter_res)
+
+ assert len(list_res) == length
+ assert len(iter_res) == length
+ assert iter_res == exp
+ assert list_res == exp
+
+ def test_range(self):
+ actual1 = range(10)
+ actual2 = lrange(10)
+ actual = [actual1, actual2],
+ expected = list(builtins.range(10)),
+ lengths = 10,
+
+ actual1 = range(1, 10, 2)
+ actual2 = lrange(1, 10, 2)
+ actual += [actual1, actual2],
+ lengths += 5,
+ expected += list(builtins.range(1, 10, 2)),
+ self.check_result(actual, expected, lengths)
+
+ def test_map(self):
+ func = lambda x, y, z: x + y + z
+ lst = [builtins.range(10), builtins.range(10), builtins.range(10)]
+ actual1 = map(func, *lst)
+ actual2 = lmap(func, *lst)
+ actual = [actual1, actual2],
+ expected = list(builtins.map(func, *lst)),
+ lengths = 10,
+ self.check_result(actual, expected, lengths)
+
+ def test_filter(self):
+ func = lambda x: x
+ lst = list(builtins.range(10))
+ actual1 = filter(func, lst)
+ actual2 = lfilter(func, lst)
+ actual = [actual1, actual2],
+ lengths = 9,
+ expected = list(builtins.filter(func, lst)),
+ self.check_result(actual, expected, lengths)
+
+ def test_zip(self):
+ lst = [builtins.range(10), builtins.range(10), builtins.range(10)]
+ actual = [zip(*lst), lzip(*lst)],
+ expected = list(builtins.zip(*lst)),
+ lengths = 10,
+ self.check_result(actual, expected, lengths)
+
+ def test_dict_iterators(self):
+ assert next(itervalues({1: 2})) == 2
+ assert next(iterkeys({1: 2})) == 1
+ assert next(iteritems({1: 2})) == (1, 2)
+
+
+class TestCompatFunctions(object):
+
+ @pytest.mark.parametrize(
+ 'start,stop,step', [(0, 10, 2), (11, -2, -1), (0, -5, 1), (2, 4, 8)])
+ def test_get_range_parameters(self, start, stop, step):
+ rng = range(start, stop, step)
+ if PY2 and len(rng) == 0:
+ start_expected, stop_expected, step_expected = 0, 0, 1
+ elif PY2 and len(rng) == 1:
+ start_expected, stop_expected, step_expected = start, start + 1, 1
+ else:
+ start_expected, stop_expected, step_expected = start, stop, step
+
+ start_result, stop_result, step_result = get_range_parameters(rng)
+ assert start_result == start_expected
+ assert stop_result == stop_expected
+ assert step_result == step_expected
+
+
+def test_re_type():
+ assert isinstance(re.compile(''), re_type)
diff --git a/contrib/python/pandas/py2/pandas/tests/test_config.py b/contrib/python/pandas/py2/pandas/tests/test_config.py
new file mode 100644
index 00000000000..54db3887850
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_config.py
@@ -0,0 +1,433 @@
+# -*- coding: utf-8 -*-
+import warnings
+
+import pytest
+
+import pandas as pd
+
+
+class TestConfig(object):
+
+ @classmethod
+ def setup_class(cls):
+ from copy import deepcopy
+
+ cls.cf = pd.core.config
+ cls.gc = deepcopy(getattr(cls.cf, '_global_config'))
+ cls.do = deepcopy(getattr(cls.cf, '_deprecated_options'))
+ cls.ro = deepcopy(getattr(cls.cf, '_registered_options'))
+
+ def setup_method(self, method):
+ setattr(self.cf, '_global_config', {})
+ setattr(self.cf, 'options', self.cf.DictWrapper(
+ self.cf._global_config))
+ setattr(self.cf, '_deprecated_options', {})
+ setattr(self.cf, '_registered_options', {})
+
+ # Our test fixture in conftest.py sets "chained_assignment"
+ # to "raise" only after all test methods have been setup.
+ # However, after this setup, there is no longer any
+ # "chained_assignment" option, so re-register it.
+ self.cf.register_option('chained_assignment', 'raise')
+
+ def teardown_method(self, method):
+ setattr(self.cf, '_global_config', self.gc)
+ setattr(self.cf, '_deprecated_options', self.do)
+ setattr(self.cf, '_registered_options', self.ro)
+
+ def test_api(self):
+
+ # the pandas object exposes the user API
+ assert hasattr(pd, 'get_option')
+ assert hasattr(pd, 'set_option')
+ assert hasattr(pd, 'reset_option')
+ assert hasattr(pd, 'describe_option')
+
+ def test_is_one_of_factory(self):
+ v = self.cf.is_one_of_factory([None, 12])
+
+ v(12)
+ v(None)
+ pytest.raises(ValueError, v, 1.1)
+
+ def test_register_option(self):
+ self.cf.register_option('a', 1, 'doc')
+
+ # can't register an already registered option
+ pytest.raises(KeyError, self.cf.register_option, 'a', 1, 'doc')
+
+ # can't register an already registered option
+ pytest.raises(KeyError, self.cf.register_option, 'a.b.c.d1', 1,
+ 'doc')
+ pytest.raises(KeyError, self.cf.register_option, 'a.b.c.d2', 1,
+ 'doc')
+
+ # no python keywords
+ pytest.raises(ValueError, self.cf.register_option, 'for', 0)
+ pytest.raises(ValueError, self.cf.register_option, 'a.for.b', 0)
+ # must be valid identifier (ensure attribute access works)
+ pytest.raises(ValueError, self.cf.register_option,
+ 'Oh my Goddess!', 0)
+
+ # we can register options several levels deep
+ # without predefining the intermediate steps
+ # and we can define differently named options
+ # in the same namespace
+ self.cf.register_option('k.b.c.d1', 1, 'doc')
+ self.cf.register_option('k.b.c.d2', 1, 'doc')
+
+ def test_describe_option(self):
+ self.cf.register_option('a', 1, 'doc')
+ self.cf.register_option('b', 1, 'doc2')
+ self.cf.deprecate_option('b')
+
+ self.cf.register_option('c.d.e1', 1, 'doc3')
+ self.cf.register_option('c.d.e2', 1, 'doc4')
+ self.cf.register_option('f', 1)
+ self.cf.register_option('g.h', 1)
+ self.cf.register_option('k', 2)
+ self.cf.deprecate_option('g.h', rkey="k")
+ self.cf.register_option('l', "foo")
+
+ # non-existent keys raise KeyError
+ pytest.raises(KeyError, self.cf.describe_option, 'no.such.key')
+
+ # we can get the description for any key we registered
+ assert 'doc' in self.cf.describe_option('a', _print_desc=False)
+ assert 'doc2' in self.cf.describe_option('b', _print_desc=False)
+ assert 'precated' in self.cf.describe_option('b', _print_desc=False)
+ assert 'doc3' in self.cf.describe_option('c.d.e1', _print_desc=False)
+ assert 'doc4' in self.cf.describe_option('c.d.e2', _print_desc=False)
+
+ # if no doc is specified we get a default message
+ # saying "description not available"
+ assert 'vailable' in self.cf.describe_option('f', _print_desc=False)
+ assert 'vailable' in self.cf.describe_option('g.h', _print_desc=False)
+ assert 'precated' in self.cf.describe_option('g.h', _print_desc=False)
+ assert 'k' in self.cf.describe_option('g.h', _print_desc=False)
+
+ # default is reported
+ assert 'foo' in self.cf.describe_option('l', _print_desc=False)
+ # current value is reported
+ assert 'bar' not in self.cf.describe_option('l', _print_desc=False)
+ self.cf.set_option("l", "bar")
+ assert 'bar' in self.cf.describe_option('l', _print_desc=False)
+
+ def test_case_insensitive(self):
+ self.cf.register_option('KanBAN', 1, 'doc')
+
+ assert 'doc' in self.cf.describe_option('kanbaN', _print_desc=False)
+ assert self.cf.get_option('kanBaN') == 1
+ self.cf.set_option('KanBan', 2)
+ assert self.cf.get_option('kAnBaN') == 2
+
+ # gets of non-existent keys fail
+ pytest.raises(KeyError, self.cf.get_option, 'no_such_option')
+ self.cf.deprecate_option('KanBan')
+
+ assert self.cf._is_deprecated('kAnBaN')
+
+ def test_get_option(self):
+ self.cf.register_option('a', 1, 'doc')
+ self.cf.register_option('b.c', 'hullo', 'doc2')
+ self.cf.register_option('b.b', None, 'doc2')
+
+ # gets of existing keys succeed
+ assert self.cf.get_option('a') == 1
+ assert self.cf.get_option('b.c') == 'hullo'
+ assert self.cf.get_option('b.b') is None
+
+ # gets of non-existent keys fail
+ pytest.raises(KeyError, self.cf.get_option, 'no_such_option')
+
+ def test_set_option(self):
+ self.cf.register_option('a', 1, 'doc')
+ self.cf.register_option('b.c', 'hullo', 'doc2')
+ self.cf.register_option('b.b', None, 'doc2')
+
+ assert self.cf.get_option('a') == 1
+ assert self.cf.get_option('b.c') == 'hullo'
+ assert self.cf.get_option('b.b') is None
+
+ self.cf.set_option('a', 2)
+ self.cf.set_option('b.c', 'wurld')
+ self.cf.set_option('b.b', 1.1)
+
+ assert self.cf.get_option('a') == 2
+ assert self.cf.get_option('b.c') == 'wurld'
+ assert self.cf.get_option('b.b') == 1.1
+
+ pytest.raises(KeyError, self.cf.set_option, 'no.such.key', None)
+
+ def test_set_option_empty_args(self):
+ pytest.raises(ValueError, self.cf.set_option)
+
+ def test_set_option_uneven_args(self):
+ pytest.raises(ValueError, self.cf.set_option, 'a.b', 2, 'b.c')
+
+ def test_set_option_invalid_single_argument_type(self):
+ pytest.raises(ValueError, self.cf.set_option, 2)
+
+ def test_set_option_multiple(self):
+ self.cf.register_option('a', 1, 'doc')
+ self.cf.register_option('b.c', 'hullo', 'doc2')
+ self.cf.register_option('b.b', None, 'doc2')
+
+ assert self.cf.get_option('a') == 1
+ assert self.cf.get_option('b.c') == 'hullo'
+ assert self.cf.get_option('b.b') is None
+
+ self.cf.set_option('a', '2', 'b.c', None, 'b.b', 10.0)
+
+ assert self.cf.get_option('a') == '2'
+ assert self.cf.get_option('b.c') is None
+ assert self.cf.get_option('b.b') == 10.0
+
+ def test_validation(self):
+ self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int)
+ self.cf.register_option('b.c', 'hullo', 'doc2',
+ validator=self.cf.is_text)
+ pytest.raises(ValueError, self.cf.register_option, 'a.b.c.d2',
+ 'NO', 'doc', validator=self.cf.is_int)
+
+ self.cf.set_option('a', 2) # int is_int
+ self.cf.set_option('b.c', 'wurld') # str is_str
+
+ pytest.raises(
+ ValueError, self.cf.set_option, 'a', None) # None not is_int
+ pytest.raises(ValueError, self.cf.set_option, 'a', 'ab')
+ pytest.raises(ValueError, self.cf.set_option, 'b.c', 1)
+
+ validator = self.cf.is_one_of_factory([None, self.cf.is_callable])
+ self.cf.register_option('b', lambda: None, 'doc',
+ validator=validator)
+ self.cf.set_option('b', '%.1f'.format) # Formatter is callable
+ self.cf.set_option('b', None) # Formatter is none (default)
+ pytest.raises(ValueError, self.cf.set_option, 'b', '%.1f')
+
+ def test_reset_option(self):
+ self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int)
+ self.cf.register_option('b.c', 'hullo', 'doc2',
+ validator=self.cf.is_str)
+ assert self.cf.get_option('a') == 1
+ assert self.cf.get_option('b.c') == 'hullo'
+
+ self.cf.set_option('a', 2)
+ self.cf.set_option('b.c', 'wurld')
+ assert self.cf.get_option('a') == 2
+ assert self.cf.get_option('b.c') == 'wurld'
+
+ self.cf.reset_option('a')
+ assert self.cf.get_option('a') == 1
+ assert self.cf.get_option('b.c') == 'wurld'
+ self.cf.reset_option('b.c')
+ assert self.cf.get_option('a') == 1
+ assert self.cf.get_option('b.c') == 'hullo'
+
+ def test_reset_option_all(self):
+ self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int)
+ self.cf.register_option('b.c', 'hullo', 'doc2',
+ validator=self.cf.is_str)
+ assert self.cf.get_option('a') == 1
+ assert self.cf.get_option('b.c') == 'hullo'
+
+ self.cf.set_option('a', 2)
+ self.cf.set_option('b.c', 'wurld')
+ assert self.cf.get_option('a') == 2
+ assert self.cf.get_option('b.c') == 'wurld'
+
+ self.cf.reset_option("all")
+ assert self.cf.get_option('a') == 1
+ assert self.cf.get_option('b.c') == 'hullo'
+
+ def test_deprecate_option(self):
+ # we can deprecate non-existent options
+ self.cf.deprecate_option('foo')
+
+ assert self.cf._is_deprecated('foo')
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ with pytest.raises(
+ KeyError,
+ match="No such keys.s.: 'foo'"):
+ self.cf.get_option('foo')
+ assert len(w) == 1 # should have raised one warning
+ assert 'deprecated' in str(w[-1]) # we get the default message
+
+ self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int)
+ self.cf.register_option('b.c', 'hullo', 'doc2')
+ self.cf.register_option('foo', 'hullo', 'doc2')
+
+ self.cf.deprecate_option('a', removal_ver='nifty_ver')
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ self.cf.get_option('a')
+
+ assert len(w) == 1 # should have raised one warning
+ assert 'eprecated' in str(w[-1]) # we get the default message
+ assert 'nifty_ver' in str(w[-1]) # with the removal_ver quoted
+
+ pytest.raises(
+ KeyError, self.cf.deprecate_option, 'a') # can't depr. twice
+
+ self.cf.deprecate_option('b.c', 'zounds!')
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ self.cf.get_option('b.c')
+
+ assert len(w) == 1 # should have raised one warning
+ assert 'zounds!' in str(w[-1]) # we get the custom message
+
+ # test rerouting keys
+ self.cf.register_option('d.a', 'foo', 'doc2')
+ self.cf.register_option('d.dep', 'bar', 'doc2')
+ assert self.cf.get_option('d.a') == 'foo'
+ assert self.cf.get_option('d.dep') == 'bar'
+
+ self.cf.deprecate_option('d.dep', rkey='d.a') # reroute d.dep to d.a
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert self.cf.get_option('d.dep') == 'foo'
+
+ assert len(w) == 1 # should have raised one warning
+ assert 'eprecated' in str(w[-1]) # we get the custom message
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ self.cf.set_option('d.dep', 'baz') # should overwrite "d.a"
+
+ assert len(w) == 1 # should have raised one warning
+ assert 'eprecated' in str(w[-1]) # we get the custom message
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ assert self.cf.get_option('d.dep') == 'baz'
+
+ assert len(w) == 1 # should have raised one warning
+ assert 'eprecated' in str(w[-1]) # we get the custom message
+
+ def test_config_prefix(self):
+ with self.cf.config_prefix("base"):
+ self.cf.register_option('a', 1, "doc1")
+ self.cf.register_option('b', 2, "doc2")
+ assert self.cf.get_option('a') == 1
+ assert self.cf.get_option('b') == 2
+
+ self.cf.set_option('a', 3)
+ self.cf.set_option('b', 4)
+ assert self.cf.get_option('a') == 3
+ assert self.cf.get_option('b') == 4
+
+ assert self.cf.get_option('base.a') == 3
+ assert self.cf.get_option('base.b') == 4
+ assert 'doc1' in self.cf.describe_option('base.a', _print_desc=False)
+ assert 'doc2' in self.cf.describe_option('base.b', _print_desc=False)
+
+ self.cf.reset_option('base.a')
+ self.cf.reset_option('base.b')
+
+ with self.cf.config_prefix("base"):
+ assert self.cf.get_option('a') == 1
+ assert self.cf.get_option('b') == 2
+
+ def test_callback(self):
+ k = [None]
+ v = [None]
+
+ def callback(key):
+ k.append(key)
+ v.append(self.cf.get_option(key))
+
+ self.cf.register_option('d.a', 'foo', cb=callback)
+ self.cf.register_option('d.b', 'foo', cb=callback)
+
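+ # discard the previous entry so k[-1] and v[-1] hold only what the next call records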
+ del k[-1], v[-1]
+ self.cf.set_option("d.a", "fooz")
+ assert k[-1] == "d.a"
+ assert v[-1] == "fooz"
+
+ del k[-1], v[-1]
+ self.cf.set_option("d.b", "boo")
+ assert k[-1] == "d.b"
+ assert v[-1] == "boo"
+
+ del k[-1], v[-1]
+ self.cf.reset_option("d.b")
+ assert k[-1] == "d.b"
+
+ def test_set_ContextManager(self):
+ def eq(val):
+ assert self.cf.get_option("a") == val
+
+ self.cf.register_option('a', 0)
+ eq(0)
+ with self.cf.option_context("a", 15):
+ eq(15)
+ with self.cf.option_context("a", 25):
+ eq(25)
+ eq(15)
+ eq(0)
+
+ self.cf.set_option("a", 17)
+ eq(17)
+
+ def test_attribute_access(self):
+ holder = []
+
+ def f():
+ options.b = 1
+
+ def f2():
+ options.display = 1
+
+ def f3(key):
+ holder.append(True)
+
+ self.cf.register_option('a', 0)
+ self.cf.register_option('c', 0, cb=f3)
+ options = self.cf.options
+
+ assert options.a == 0
+ with self.cf.option_context("a", 15):
+ assert options.a == 15
+
+ options.a = 500
+ assert self.cf.get_option("a") == 500
+
+ self.cf.reset_option("a")
+ assert options.a == self.cf.get_option("a", 0)
+
+ pytest.raises(KeyError, f)
+ pytest.raises(KeyError, f2)
+
+ # make sure callback kicks when using this form of setting
+ options.c = 1
+ assert len(holder) == 1
+
+ def test_option_context_scope(self):
+ # Ensure that creating a context does not affect the existing
+ # environment as it is supposed to be used with the `with` statement.
+ # See https://github.com/pandas-dev/pandas/issues/8514
+
+ original_value = 60
+ context_value = 10
+ option_name = 'a'
+
+ self.cf.register_option(option_name, original_value)
+
+ # Ensure creating contexts didn't affect the current context.
+ ctx = self.cf.option_context(option_name, context_value)
+ assert self.cf.get_option(option_name) == original_value
+
+ # Ensure the correct value is available inside the context.
+ with ctx:
+ assert self.cf.get_option(option_name) == context_value
+
+ # Ensure the current context is reset
+ assert self.cf.get_option(option_name) == original_value
+
+ def test_dictwrapper_getattr(self):
+ options = self.cf.options
+ # GH 19789
+ pytest.raises(self.cf.OptionError, getattr, options, 'bananas')
+ assert not hasattr(options, 'bananas')
diff --git a/contrib/python/pandas/py2/pandas/tests/test_downstream.py b/contrib/python/pandas/py2/pandas/tests/test_downstream.py
new file mode 100644
index 00000000000..92b4e5a9904
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_downstream.py
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+"""
+Testing that we work in the downstream packages
+"""
+import importlib
+import subprocess
+import sys
+
+import numpy as np # noqa
+import pytest
+
+from pandas.compat import PY2, PY36, is_platform_windows
+
+from pandas import DataFrame
+from pandas.util import testing as tm
+
+
+def import_module(name):
+ # we *only* want to skip if the module is truly not available
+ # and NOT just an actual import error because of pandas changes
+
+ if PY36:
+ try:
+ return importlib.import_module(name)
+ except ModuleNotFoundError: # noqa
+ pytest.skip("skipping as {} not available".format(name))
+
+ else:
+ try:
+ return importlib.import_module(name)
+ except ImportError as e:
+ if "No module named" in str(e) and name in str(e):
+ pytest.skip("skipping as {} not available".format(name))
+ raise
+
+
+@pytest.fixture
+def df():
+ return DataFrame({'A': [1, 2, 3]})
+
+
+def test_dask(df):
+
+ toolz = import_module('toolz') # noqa
+ dask = import_module('dask') # noqa
+
+ import dask.dataframe as dd
+
+ ddf = dd.from_pandas(df, npartitions=3)
+ assert ddf.A is not None
+ assert ddf.compute() is not None
+
+
+def test_xarray(df):
+
+ xarray = import_module('xarray') # noqa
+
+ assert df.to_xarray() is not None
+
+
[email protected](is_platform_windows() and PY2,
+ reason="Broken on Windows / Py2")
+def test_oo_optimizable():
+ # GH 21071
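+ # -OO strips docstrings, which previously broke "import pandas"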
+ subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"])
+
+
+# Cython import warning
[email protected]("ignore:can't:ImportWarning")
+def test_statsmodels():
+
+ statsmodels = import_module('statsmodels') # noqa
+ import statsmodels.api as sm
+ import statsmodels.formula.api as smf
+ df = sm.datasets.get_rdataset("Guerry", "HistData").data
+ smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=df).fit()
+
+
+# Cython import warning
[email protected]("ignore:can't:ImportWarning")
+def test_scikit_learn(df):
+
+ sklearn = import_module('sklearn') # noqa
+ from sklearn import svm, datasets
+
+ digits = datasets.load_digits()
+ clf = svm.SVC(gamma=0.001, C=100.)
+ clf.fit(digits.data[:-1], digits.target[:-1])
+ clf.predict(digits.data[-1:])
+
+
+# Cython import warning and traitlets
+def test_seaborn():
+
+ seaborn = import_module('seaborn')
+ tips = seaborn.load_dataset("tips")
+ seaborn.stripplot(x="day", y="total_bill", data=tips)
+
+
+def test_pandas_gbq(df):
+
+ pandas_gbq = import_module('pandas_gbq') # noqa
+
+
[email protected](reason="0.7.0 pending")
+def test_pandas_datareader():
+
+ pandas_datareader = import_module('pandas_datareader') # noqa
+ pandas_datareader.DataReader(
+ 'F', 'quandl', '2017-01-01', '2017-02-01')
+
+
+# importing from pandas, Cython import warning
[email protected]("ignore:The 'warn':DeprecationWarning")
[email protected]("ignore:pandas.util:DeprecationWarning")
[email protected]("ignore:can't resolve:ImportWarning")
+def test_geopandas():
+
+ geopandas = import_module('geopandas') # noqa
+ fp = geopandas.datasets.get_path('naturalearth_lowres')
+ assert geopandas.read_file(fp) is not None
+
+
+# Cython import warning
[email protected]("ignore:can't resolve:ImportWarning")
+def test_pyarrow(df):
+
+ pyarrow = import_module('pyarrow') # noqa
+ table = pyarrow.Table.from_pandas(df)
+ result = table.to_pandas()
+ tm.assert_frame_equal(result, df)
diff --git a/contrib/python/pandas/py2/pandas/tests/test_errors.py b/contrib/python/pandas/py2/pandas/tests/test_errors.py
new file mode 100644
index 00000000000..d3b6a237a97
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_errors.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+from pandas.errors import AbstractMethodError
+
+import pandas as pd # noqa
+
+
+ "exc", ['UnsupportedFunctionCall', 'UnsortedIndexError',
+ 'OutOfBoundsDatetime',
+ 'ParserError', 'PerformanceWarning', 'DtypeWarning',
+ 'EmptyDataError', 'ParserWarning', 'MergeError'])
+def test_exception_importable(exc):
+ from pandas import errors
+ e = getattr(errors, exc)
+ assert e is not None
+
+ # check that we can raise on them
+ with pytest.raises(e):
+ raise e()
+
+
+def test_catch_oob():
+ from pandas import errors
+
+ try:
+ pd.Timestamp('15000101')
+ except errors.OutOfBoundsDatetime:
+ pass
+
+
+def test_error_rename():
+ # see gh-12665
+ from pandas.errors import ParserError
+ from pandas.io.common import CParserError
+
+ try:
+ raise CParserError()
+ except ParserError:
+ pass
+
+ try:
+ raise ParserError()
+ except CParserError:
+ pass
+
+
+class Foo(object):
+ @classmethod
+ def classmethod(cls):
+ raise AbstractMethodError(cls, methodtype='classmethod')
+
+ @property
+ def property(self):
+ raise AbstractMethodError(self, methodtype='property')
+
+ def method(self):
+ raise AbstractMethodError(self)
+
+
+def test_AbstractMethodError_classmethod():
+ xpr = "This classmethod must be defined in the concrete class Foo"
+ with pytest.raises(AbstractMethodError, match=xpr):
+ Foo.classmethod()
+
+ xpr = "This property must be defined in the concrete class Foo"
+ with pytest.raises(AbstractMethodError, match=xpr):
+ Foo().property
+
+ xpr = "This method must be defined in the concrete class Foo"
+ with pytest.raises(AbstractMethodError, match=xpr):
+ Foo().method()
diff --git a/contrib/python/pandas/py2/pandas/tests/test_expressions.py b/contrib/python/pandas/py2/pandas/tests/test_expressions.py
new file mode 100644
index 00000000000..f5aa0b0b3c9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_expressions.py
@@ -0,0 +1,463 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+
+import operator
+import re
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+from numpy.random import randn
+import pytest
+
+from pandas import _np_version_under1p13, compat
+from pandas.core.api import DataFrame, Panel
+from pandas.core.computation import expressions as expr
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_panel_equal,
+ assert_series_equal)
+
+from pandas.io.formats.printing import pprint_thing
+
+# pylint: disable-msg=W0612,E1101
+
+
+_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64')
+_frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64')
+_mixed = DataFrame({'A': _frame['A'].copy(),
+ 'B': _frame['B'].astype('float32'),
+ 'C': _frame['C'].astype('int64'),
+ 'D': _frame['D'].astype('int32')})
+_mixed2 = DataFrame({'A': _frame2['A'].copy(),
+ 'B': _frame2['B'].astype('float32'),
+ 'C': _frame2['C'].astype('int64'),
+ 'D': _frame2['D'].astype('int32')})
+_integer = DataFrame(
+ np.random.randint(1, 100,
+ size=(10001, 4)),
+ columns=list('ABCD'), dtype='int64')
+_integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)),
+ columns=list('ABCD'), dtype='int64')
+
+with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ _frame_panel = Panel(dict(ItemA=_frame.copy(),
+ ItemB=(_frame.copy() + 3),
+ ItemC=_frame.copy(),
+ ItemD=_frame.copy()))
+ _frame2_panel = Panel(dict(ItemA=_frame2.copy(),
+ ItemB=(_frame2.copy() + 3),
+ ItemC=_frame2.copy(),
+ ItemD=_frame2.copy()))
+ _integer_panel = Panel(dict(ItemA=_integer,
+ ItemB=(_integer + 34).astype('int64')))
+ _integer2_panel = Panel(dict(ItemA=_integer2,
+ ItemB=(_integer2 + 34).astype('int64')))
+ _mixed_panel = Panel(dict(ItemA=_mixed, ItemB=(_mixed + 3)))
+ _mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3)))
+
+
[email protected](not expr._USE_NUMEXPR, reason='not using numexpr')
+class TestExpressions(object):
+
+ def setup_method(self, method):
+
+ self.frame = _frame.copy()
+ self.frame2 = _frame2.copy()
+ self.mixed = _mixed.copy()
+ self.mixed2 = _mixed2.copy()
+ self.integer = _integer.copy()
+ self._MIN_ELEMENTS = expr._MIN_ELEMENTS
+
+ def teardown_method(self, method):
+ expr._MIN_ELEMENTS = self._MIN_ELEMENTS
+
+ def run_arithmetic(self, df, other, assert_func, check_dtype=False,
+ test_flex=True):
+ expr._MIN_ELEMENTS = 0
+ operations = ['add', 'sub', 'mul', 'mod', 'truediv', 'floordiv']
+ if not compat.PY3:
+ operations.append('div')
+ for arith in operations:
+
+ operator_name = arith
+ if arith == 'div':
+ operator_name = 'truediv'
+
+ if test_flex:
+ op = lambda x, y: getattr(df, arith)(y)
+ op.__name__ = arith
+ else:
+ op = getattr(operator, operator_name)
+ expr.set_use_numexpr(False)
+ expected = op(df, other)
+ expr.set_use_numexpr(True)
+
+ result = op(df, other)
+ try:
+ if check_dtype:
+ if arith == 'truediv':
+ assert expected.dtype.kind == 'f'
+ assert_func(expected, result)
+ except Exception:
+ pprint_thing("Failed test with operator %r" % op.__name__)
+ raise
+
+ def test_integer_arithmetic(self):
+ self.run_arithmetic(self.integer, self.integer,
+ assert_frame_equal)
+ self.run_arithmetic(self.integer.iloc[:, 0],
+ self.integer.iloc[:, 0], assert_series_equal,
+ check_dtype=True)
+
+ def run_binary(self, df, other, assert_func, test_flex=False,
+ numexpr_ops={'gt', 'lt', 'ge', 'le', 'eq', 'ne'}):
+ """
+ Tests solely that the result is the same whether or not numexpr is
+ enabled. Whether the function itself does the correct thing must be
+ tested elsewhere.
+ """
+ expr._MIN_ELEMENTS = 0
+ expr.set_test_mode(True)
+ operations = ['gt', 'lt', 'ge', 'le', 'eq', 'ne']
+
+ for arith in operations:
+ if test_flex:
+ op = lambda x, y: getattr(df, arith)(y)
+ op.__name__ = arith
+ else:
+ op = getattr(operator, arith)
+ expr.set_use_numexpr(False)
+ expected = op(df, other)
+ expr.set_use_numexpr(True)
+ expr.get_test_result()
+ result = op(df, other)
+ used_numexpr = expr.get_test_result()
+ try:
+ if arith in numexpr_ops:
+ assert used_numexpr, "Did not use numexpr as expected."
+ else:
+ assert not used_numexpr, "Used numexpr unexpectedly."
+ assert_func(expected, result)
+ except Exception:
+ pprint_thing("Failed test with operation %r" % arith)
+ pprint_thing("test_flex was %r" % test_flex)
+ raise
+
+ def run_frame(self, df, other, binary_comp=None, run_binary=True,
+ **kwargs):
+ self.run_arithmetic(df, other, assert_frame_equal,
+ test_flex=False, **kwargs)
+ self.run_arithmetic(df, other, assert_frame_equal, test_flex=True,
+ **kwargs)
+ if run_binary:
+ if binary_comp is None:
+ expr.set_use_numexpr(False)
+ binary_comp = other + 1
+ expr.set_use_numexpr(True)
+ self.run_binary(df, binary_comp, assert_frame_equal,
+ test_flex=False, **kwargs)
+ self.run_binary(df, binary_comp, assert_frame_equal,
+ test_flex=True, **kwargs)
+
+ def run_series(self, ser, other, binary_comp=None, **kwargs):
+ self.run_arithmetic(ser, other, assert_series_equal,
+ test_flex=False, **kwargs)
+ self.run_arithmetic(ser, other, assert_almost_equal,
+ test_flex=True, **kwargs)
+ # Series doesn't use numexpr; it uses vec_compare instead...
+ # if binary_comp is None:
+ # binary_comp = other + 1
+ # self.run_binary(ser, binary_comp, assert_frame_equal,
+ # test_flex=False, **kwargs)
+ # self.run_binary(ser, binary_comp, assert_frame_equal,
+ # test_flex=True, **kwargs)
+
+ def run_panel(self, panel, other, binary_comp=None, run_binary=True,
+ assert_func=assert_panel_equal, **kwargs):
+ self.run_arithmetic(panel, other, assert_func, test_flex=False,
+ **kwargs)
+ self.run_arithmetic(panel, other, assert_func, test_flex=True,
+ **kwargs)
+ if run_binary:
+ if binary_comp is None:
+ binary_comp = other + 1
+ self.run_binary(panel, binary_comp, assert_func,
+ test_flex=False, **kwargs)
+ self.run_binary(panel, binary_comp, assert_func,
+ test_flex=True, **kwargs)
+
+ def test_integer_arithmetic_frame(self):
+ self.run_frame(self.integer, self.integer)
+
+ def test_integer_arithmetic_series(self):
+ self.run_series(self.integer.iloc[:, 0], self.integer.iloc[:, 0])
+
+ @pytest.mark.slow
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_integer_panel(self):
+ self.run_panel(_integer2_panel, np.random.randint(1, 100))
+
+ def test_float_arithmetic_frame(self):
+ self.run_frame(self.frame2, self.frame2)
+
+ def test_float_arithmetic_series(self):
+ self.run_series(self.frame2.iloc[:, 0], self.frame2.iloc[:, 0])
+
+ @pytest.mark.slow
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_float_panel(self):
+ self.run_panel(_frame2_panel, np.random.randn() + 0.1, binary_comp=0.8)
+
+ def test_mixed_arithmetic_frame(self):
+ # TODO: FIGURE OUT HOW TO GET IT TO WORK...
+ # can't do arithmetic because comparison methods try to do *entire*
+ # frame instead of by-column
+ self.run_frame(self.mixed2, self.mixed2, run_binary=False)
+
+ def test_mixed_arithmetic_series(self):
+ for col in self.mixed2.columns:
+ self.run_series(self.mixed2[col], self.mixed2[col], binary_comp=4)
+
+ @pytest.mark.slow
+ @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
+ def test_mixed_panel(self):
+ self.run_panel(_mixed2_panel, np.random.randint(1, 100),
+ binary_comp=-2)
+
+ def test_float_arithmetic(self):
+ self.run_arithmetic(self.frame, self.frame, assert_frame_equal)
+ self.run_arithmetic(self.frame.iloc[:, 0], self.frame.iloc[:, 0],
+ assert_series_equal, check_dtype=True)
+
+ def test_mixed_arithmetic(self):
+ self.run_arithmetic(self.mixed, self.mixed, assert_frame_equal)
+ for col in self.mixed.columns:
+ self.run_arithmetic(self.mixed[col], self.mixed[col],
+ assert_series_equal)
+
+ def test_integer_with_zeros(self):
+ self.integer *= np.random.randint(0, 2, size=np.shape(self.integer))
+ self.run_arithmetic(self.integer, self.integer,
+ assert_frame_equal)
+ self.run_arithmetic(self.integer.iloc[:, 0],
+ self.integer.iloc[:, 0], assert_series_equal)
+
+ def test_invalid(self):
+
+ # no op
+ result = expr._can_use_numexpr(operator.add, None, self.frame,
+ self.frame, 'evaluate')
+ assert not result
+
+ # mixed
+ result = expr._can_use_numexpr(operator.add, '+', self.mixed,
+ self.frame, 'evaluate')
+ assert not result
+
+ # min elements
+ result = expr._can_use_numexpr(operator.add, '+', self.frame2,
+ self.frame2, 'evaluate')
+ assert not result
+
+ # ok, we only check on first part of expression
+ result = expr._can_use_numexpr(operator.add, '+', self.frame,
+ self.frame2, 'evaluate')
+ assert result
+
+ def test_binary_ops(self):
+ def testit():
+
+ for f, f2 in [(self.frame, self.frame2),
+ (self.mixed, self.mixed2)]:
+
+ for op, op_str in [('add', '+'), ('sub', '-'), ('mul', '*'),
+ ('div', '/'), ('pow', '**')]:
+
+ if op == 'pow':
+ continue
+
+ if op == 'div':
+ op = getattr(operator, 'truediv', None)
+ else:
+ op = getattr(operator, op, None)
+ if op is not None:
+ result = expr._can_use_numexpr(op, op_str, f, f,
+ 'evaluate')
+ assert result != f._is_mixed_type
+
+ result = expr.evaluate(op, op_str, f, f,
+ use_numexpr=True)
+ expected = expr.evaluate(op, op_str, f, f,
+ use_numexpr=False)
+
+ if isinstance(result, DataFrame):
+ tm.assert_frame_equal(result, expected)
+ else:
+ tm.assert_numpy_array_equal(result,
+ expected.values)
+
+ result = expr._can_use_numexpr(op, op_str, f2, f2,
+ 'evaluate')
+ assert not result
+
+ expr.set_use_numexpr(False)
+ testit()
+ expr.set_use_numexpr(True)
+ expr.set_numexpr_threads(1)
+ testit()
+ expr.set_numexpr_threads()
+ testit()
+
+ def test_boolean_ops(self):
+ def testit():
+ for f, f2 in [(self.frame, self.frame2),
+ (self.mixed, self.mixed2)]:
+
+ f11 = f
+ f12 = f + 1
+
+ f21 = f2
+ f22 = f2 + 1
+
+ for op, op_str in [('gt', '>'), ('lt', '<'), ('ge', '>='),
+ ('le', '<='), ('eq', '=='), ('ne', '!=')]:
+
+ op = getattr(operator, op)
+
+ result = expr._can_use_numexpr(op, op_str, f11, f12,
+ 'evaluate')
+ assert result != f11._is_mixed_type
+
+ result = expr.evaluate(op, op_str, f11, f12,
+ use_numexpr=True)
+ expected = expr.evaluate(op, op_str, f11, f12,
+ use_numexpr=False)
+ if isinstance(result, DataFrame):
+ tm.assert_frame_equal(result, expected)
+ else:
+ tm.assert_numpy_array_equal(result, expected.values)
+
+ result = expr._can_use_numexpr(op, op_str, f21, f22,
+ 'evaluate')
+ assert not result
+
+ expr.set_use_numexpr(False)
+ testit()
+ expr.set_use_numexpr(True)
+ expr.set_numexpr_threads(1)
+ testit()
+ expr.set_numexpr_threads()
+ testit()
+
+ def test_where(self):
+ def testit():
+ for f in [self.frame, self.frame2, self.mixed, self.mixed2]:
+
+ for cond in [True, False]:
+
+ c = np.empty(f.shape, dtype=np.bool_)
+ c.fill(cond)
+ result = expr.where(c, f.values, f.values + 1)
+ expected = np.where(c, f.values, f.values + 1)
+ tm.assert_numpy_array_equal(result, expected)
+
+ expr.set_use_numexpr(False)
+ testit()
+ expr.set_use_numexpr(True)
+ expr.set_numexpr_threads(1)
+ testit()
+ expr.set_numexpr_threads()
+ testit()
+
+ def test_bool_ops_raise_on_arithmetic(self):
+ df = DataFrame({'a': np.random.rand(10) > 0.5,
+ 'b': np.random.rand(10) > 0.5})
+ names = 'div', 'truediv', 'floordiv', 'pow'
+ ops = '/', '/', '//', '**'
+ msg = 'operator %r not implemented for bool dtypes'
+ for op, name in zip(ops, names):
+ if not compat.PY3 or name != 'div':
+ f = getattr(operator, name)
+ err_msg = re.escape(msg % op)
+
+ with pytest.raises(NotImplementedError, match=err_msg):
+ f(df, df)
+
+ with pytest.raises(NotImplementedError, match=err_msg):
+ f(df.a, df.b)
+
+ with pytest.raises(NotImplementedError, match=err_msg):
+ f(df.a, True)
+
+ with pytest.raises(NotImplementedError, match=err_msg):
+ f(False, df.a)
+
+ with pytest.raises(NotImplementedError, match=err_msg):
+ f(False, df)
+
+ with pytest.raises(NotImplementedError, match=err_msg):
+ f(df, True)
+
+ def test_bool_ops_warn_on_arithmetic(self):
+ n = 10
+ df = DataFrame({'a': np.random.rand(n) > 0.5,
+ 'b': np.random.rand(n) > 0.5})
+ names = 'add', 'mul', 'sub'
+ ops = '+', '*', '-'
+ subs = {'+': '|', '*': '&', '-': '^'}
+ sub_funcs = {'|': 'or_', '&': 'and_', '^': 'xor'}
+ for op, name in zip(ops, names):
+ f = getattr(operator, name)
+ fe = getattr(operator, sub_funcs[subs[op]])
+
+ # >= 1.13.0 these are now TypeErrors
+ if op == '-' and not _np_version_under1p13:
+ continue
+
+ with tm.use_numexpr(True, min_elements=5):
+ with tm.assert_produces_warning(check_stacklevel=False):
+ r = f(df, df)
+ e = fe(df, df)
+ tm.assert_frame_equal(r, e)
+
+ with tm.assert_produces_warning(check_stacklevel=False):
+ r = f(df.a, df.b)
+ e = fe(df.a, df.b)
+ tm.assert_series_equal(r, e)
+
+ with tm.assert_produces_warning(check_stacklevel=False):
+ r = f(df.a, True)
+ e = fe(df.a, True)
+ tm.assert_series_equal(r, e)
+
+ with tm.assert_produces_warning(check_stacklevel=False):
+ r = f(False, df.a)
+ e = fe(False, df.a)
+ tm.assert_series_equal(r, e)
+
+ with tm.assert_produces_warning(check_stacklevel=False):
+ r = f(False, df)
+ e = fe(False, df)
+ tm.assert_frame_equal(r, e)
+
+ with tm.assert_produces_warning(check_stacklevel=False):
+ r = f(df, True)
+ e = fe(df, True)
+ tm.assert_frame_equal(r, e)
+
+ @pytest.mark.parametrize("test_input,expected", [
+ (DataFrame([[0, 1, 2, 'aa'], [0, 1, 2, 'aa']],
+ columns=['a', 'b', 'c', 'dtype']),
+ DataFrame([[False, False], [False, False]],
+ columns=['a', 'dtype'])),
+ (DataFrame([[0, 3, 2, 'aa'], [0, 4, 2, 'aa'], [0, 1, 1, 'bb']],
+ columns=['a', 'b', 'c', 'dtype']),
+ DataFrame([[False, False], [False, False],
+ [False, False]], columns=['a', 'dtype'])),
+ ])
+ def test_bool_ops_column_name_dtype(self, test_input, expected):
+ # GH 22383 - .ne fails if columns containing column name 'dtype'
+ result = test_input.loc[:, ['a', 'dtype']].ne(
+ test_input.loc[:, ['a', 'dtype']])
+ assert_frame_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/test_join.py b/contrib/python/pandas/py2/pandas/tests/test_join.py
new file mode 100644
index 00000000000..5b6656de157
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_join.py
@@ -0,0 +1,236 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+from pandas._libs import join as _join
+
+from pandas import Categorical, DataFrame, Index, merge
+import pandas.util.testing as tm
+from pandas.util.testing import assert_almost_equal, assert_frame_equal
+
+
+class TestIndexer(object):
+
+ def test_outer_join_indexer(self):
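+ # exercise each dtype-specialized indexer generated from the pxi template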
+ typemap = [('int32', _join.outer_join_indexer_int32),
+ ('int64', _join.outer_join_indexer_int64),
+ ('float32', _join.outer_join_indexer_float32),
+ ('float64', _join.outer_join_indexer_float64),
+ ('object', _join.outer_join_indexer_object)]
+
+ for dtype, indexer in typemap:
+ left = np.arange(3, dtype=dtype)
+ right = np.arange(2, 5, dtype=dtype)
+ empty = np.array([], dtype=dtype)
+
+ result, lindexer, rindexer = indexer(left, right)
+ assert isinstance(result, np.ndarray)
+ assert isinstance(lindexer, np.ndarray)
+ assert isinstance(rindexer, np.ndarray)
+ tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype))
+ exp = np.array([0, 1, 2, -1, -1], dtype=np.int64)
+ tm.assert_numpy_array_equal(lindexer, exp)
+ exp = np.array([-1, -1, 0, 1, 2], dtype=np.int64)
+ tm.assert_numpy_array_equal(rindexer, exp)
+
+ result, lindexer, rindexer = indexer(empty, right)
+ tm.assert_numpy_array_equal(result, right)
+ exp = np.array([-1, -1, -1], dtype=np.int64)
+ tm.assert_numpy_array_equal(lindexer, exp)
+ exp = np.array([0, 1, 2], dtype=np.int64)
+ tm.assert_numpy_array_equal(rindexer, exp)
+
+ result, lindexer, rindexer = indexer(left, empty)
+ tm.assert_numpy_array_equal(result, left)
+ exp = np.array([0, 1, 2], dtype=np.int64)
+ tm.assert_numpy_array_equal(lindexer, exp)
+ exp = np.array([-1, -1, -1], dtype=np.int64)
+ tm.assert_numpy_array_equal(rindexer, exp)
+
+
+def test_left_join_indexer_unique():
+ a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
+ b = np.array([2, 2, 3, 4, 4], dtype=np.int64)
+
+ result = _join.left_join_indexer_unique_int64(b, a)
+ expected = np.array([1, 1, 2, 3, 3], dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_left_outer_join_bug():
+ left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3,
+ 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1,
+ 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0,
+ 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3,
+ 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0,
+ 3, 1, 2, 0, 2], dtype=np.int64)
+
+ right = np.array([3, 1], dtype=np.int64)
+ max_groups = 4
+
+ lidx, ridx = _join.left_outer_join(left, right, max_groups, sort=False)
+
+ exp_lidx = np.arange(len(left), dtype=np.int64)
+ exp_ridx = -np.ones(len(left), dtype=np.int64)
+
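+ # only left values 1 and 3 have matches in right; everything else stays -1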
+ exp_ridx[left == 1] = 1
+ exp_ridx[left == 3] = 0
+
+ tm.assert_numpy_array_equal(lidx, exp_lidx)
+ tm.assert_numpy_array_equal(ridx, exp_ridx)
+
+
+def test_inner_join_indexer():
+ a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
+ b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
+
+ index, ares, bres = _join.inner_join_indexer_int64(a, b)
+
+ index_exp = np.array([3, 5], dtype=np.int64)
+ assert_almost_equal(index, index_exp)
+
+ aexp = np.array([2, 4], dtype=np.int64)
+ bexp = np.array([1, 2], dtype=np.int64)
+ assert_almost_equal(ares, aexp)
+ assert_almost_equal(bres, bexp)
+
+ a = np.array([5], dtype=np.int64)
+ b = np.array([5], dtype=np.int64)
+
+ index, ares, bres = _join.inner_join_indexer_int64(a, b)
+ tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
+ tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64))
+ tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64))
+
+
+def test_outer_join_indexer():
+ a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
+ b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
+
+ index, ares, bres = _join.outer_join_indexer_int64(a, b)
+
+ index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64)
+ assert_almost_equal(index, index_exp)
+
+ aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64)
+ bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int64)
+ assert_almost_equal(ares, aexp)
+ assert_almost_equal(bres, bexp)
+
+ a = np.array([5], dtype=np.int64)
+ b = np.array([5], dtype=np.int64)
+
+ index, ares, bres = _join.outer_join_indexer_int64(a, b)
+ tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
+ tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64))
+ tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64))
+
+
+def test_left_join_indexer():
+ a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
+ b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
+
+ index, ares, bres = _join.left_join_indexer_int64(a, b)
+
+ assert_almost_equal(index, a)
+
+ aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
+ bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64)
+ assert_almost_equal(ares, aexp)
+ assert_almost_equal(bres, bexp)
+
+ a = np.array([5], dtype=np.int64)
+ b = np.array([5], dtype=np.int64)
+
+ index, ares, bres = _join.left_join_indexer_int64(a, b)
+ tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
+ tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64))
+ tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64))
+
+
+def test_left_join_indexer2():
+ idx = Index([1, 1, 2, 5])
+ idx2 = Index([1, 2, 5, 7, 9])
+
+ res, lidx, ridx = _join.left_join_indexer_int64(idx2.values, idx.values)
+
+ exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)
+ assert_almost_equal(res, exp_res)
+
+ exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64)
+ assert_almost_equal(lidx, exp_lidx)
+
+ exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64)
+ assert_almost_equal(ridx, exp_ridx)
+
+
+def test_outer_join_indexer2():
+ idx = Index([1, 1, 2, 5])
+ idx2 = Index([1, 2, 5, 7, 9])
+
+ res, lidx, ridx = _join.outer_join_indexer_int64(idx2.values, idx.values)
+
+ exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)
+ assert_almost_equal(res, exp_res)
+
+ exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64)
+ assert_almost_equal(lidx, exp_lidx)
+
+ exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64)
+ assert_almost_equal(ridx, exp_ridx)
+
+
+def test_inner_join_indexer2():
+ idx = Index([1, 1, 2, 5])
+ idx2 = Index([1, 2, 5, 7, 9])
+
+ res, lidx, ridx = _join.inner_join_indexer_int64(idx2.values, idx.values)
+
+ exp_res = np.array([1, 1, 2, 5], dtype=np.int64)
+ assert_almost_equal(res, exp_res)
+
+ exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64)
+ assert_almost_equal(lidx, exp_lidx)
+
+ exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64)
+ assert_almost_equal(ridx, exp_ridx)
+
+
+def test_merge_join_categorical_multiindex():
+ # From issue 16627
+ a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'],
+ ['a', 'b', 'c']),
+ 'Int1': [0, 1, 0, 1, 0, 0]}
+ a = DataFrame(a)
+
+ b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
+ ['a', 'b', 'c']),
+ 'Int': [0, 0, 0, 1, 1, 1],
+ 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]}
+ b = DataFrame(b).set_index(['Cat', 'Int'])['Factor']
+
+ expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
+ right_on=['Cat', 'Int'], how='left')
+ result = a.join(b, on=['Cat1', 'Int1'])
+ expected = expected.drop(['Cat', 'Int'], axis=1)
+ assert_frame_equal(expected, result)
+
+ # Same test, but with ordered categorical
+ a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'],
+ ['b', 'a', 'c'],
+ ordered=True),
+ 'Int1': [0, 1, 0, 1, 0, 0]}
+ a = DataFrame(a)
+
+ b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
+ ['b', 'a', 'c'],
+ ordered=True),
+ 'Int': [0, 0, 0, 1, 1, 1],
+ 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]}
+ b = DataFrame(b).set_index(['Cat', 'Int'])['Factor']
+
+ expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
+ right_on=['Cat', 'Int'], how='left')
+ result = a.join(b, on=['Cat1', 'Int1'])
+ expected = expected.drop(['Cat', 'Int'], axis=1)
+ assert_frame_equal(expected, result)
diff --git a/contrib/python/pandas/py2/pandas/tests/test_lib.py b/contrib/python/pandas/py2/pandas/tests/test_lib.py
new file mode 100644
index 00000000000..c5dcfc89faa
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_lib.py
@@ -0,0 +1,207 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas._libs import lib, writers as libwriters
+
+from pandas import Index
+import pandas.util.testing as tm
+
+
+class TestMisc(object):
+
+ def test_max_len_string_array(self):
+
+ arr = a = np.array(['foo', 'b', np.nan], dtype='object')
+ assert libwriters.max_len_string_array(arr) == 3
+
+ # unicode
+ arr = a.astype('U').astype(object)
+ assert libwriters.max_len_string_array(arr) == 3
+
+ # bytes for python3
+ arr = a.astype('S').astype(object)
+ assert libwriters.max_len_string_array(arr) == 3
+
+ # raises
+ with pytest.raises(TypeError):
+ libwriters.max_len_string_array(arr.astype('U'))
+
+ def test_fast_unique_multiple_list_gen_sort(self):
+ keys = [['p', 'a'], ['n', 'd'], ['a', 's']]
+
+ gen = (key for key in keys)
+ expected = np.array(['a', 'd', 'n', 'p', 's'])
+ out = lib.fast_unique_multiple_list_gen(gen, sort=True)
+ tm.assert_numpy_array_equal(np.array(out), expected)
+
+ gen = (key for key in keys)
+ expected = np.array(['p', 'a', 'n', 'd', 's'])
+ out = lib.fast_unique_multiple_list_gen(gen, sort=False)
+ tm.assert_numpy_array_equal(np.array(out), expected)
+
+
+class TestIndexing(object):
+
+ def test_maybe_indices_to_slice_left_edge(self):
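+ # a monotonic, evenly spaced indexer should collapse to an equivalent slice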
+ target = np.arange(100)
+
+ # slice
+ indices = np.array([], dtype=np.int64)
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+ for end in [1, 2, 5, 20, 99]:
+ for step in [1, 2, 4]:
+ indices = np.arange(0, end, step, dtype=np.int64)
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(target[indices],
+ target[maybe_slice])
+
+ # reverse
+ indices = indices[::-1]
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(target[indices],
+ target[maybe_slice])
+
+ # not slice
+ for case in [[2, 1, 2, 0], [2, 2, 1, 0], [0, 1, 2, 1], [-2, 0, 2],
+ [2, 0, -2]]:
+ indices = np.array(case, dtype=np.int64)
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert not isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(maybe_slice, indices)
+ tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+ def test_maybe_indices_to_slice_right_edge(self):
+ target = np.arange(100)
+
+ # slice
+ for start in [0, 2, 5, 20, 97, 98]:
+ for step in [1, 2, 4]:
+ indices = np.arange(start, 99, step, dtype=np.int64)
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(target[indices],
+ target[maybe_slice])
+
+ # reverse
+ indices = indices[::-1]
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(target[indices],
+ target[maybe_slice])
+
+ # not slice
+ indices = np.array([97, 98, 99, 100], dtype=np.int64)
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert not isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(maybe_slice, indices)
+
+ with pytest.raises(IndexError):
+ target[indices]
+ with pytest.raises(IndexError):
+ target[maybe_slice]
+
+ indices = np.array([100, 99, 98, 97], dtype=np.int64)
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert not isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(maybe_slice, indices)
+
+ with pytest.raises(IndexError):
+ target[indices]
+ with pytest.raises(IndexError):
+ target[maybe_slice]
+
+ for case in [[99, 97, 99, 96], [99, 99, 98, 97], [98, 98, 97, 96]]:
+ indices = np.array(case, dtype=np.int64)
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert not isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(maybe_slice, indices)
+ tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+ def test_maybe_indices_to_slice_both_edges(self):
+ target = np.arange(10)
+
+ # slice
+ for step in [1, 2, 4, 5, 8, 9]:
+ indices = np.arange(0, 9, step, dtype=np.int64)
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+ assert isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+ # reverse
+ indices = indices[::-1]
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+ assert isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+ # not slice
+ for case in [[4, 2, 0, -2], [2, 2, 1, 0], [0, 1, 2, 1]]:
+ indices = np.array(case, dtype=np.int64)
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+ assert not isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(maybe_slice, indices)
+ tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+ def test_maybe_indices_to_slice_middle(self):
+ target = np.arange(100)
+
+ # slice
+ for start, end in [(2, 10), (5, 25), (65, 97)]:
+ for step in [1, 2, 4, 20]:
+ indices = np.arange(start, end, step, dtype=np.int64)
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(target[indices],
+ target[maybe_slice])
+
+ # reverse
+ indices = indices[::-1]
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(target[indices],
+ target[maybe_slice])
+
+ # not slice
+ for case in [[14, 12, 10, 12], [12, 12, 11, 10], [10, 11, 12, 11]]:
+ indices = np.array(case, dtype=np.int64)
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+ assert not isinstance(maybe_slice, slice)
+ tm.assert_numpy_array_equal(maybe_slice, indices)
+ tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+ def test_maybe_booleans_to_slice(self):
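+ # a non-contiguous mask stays a boolean array; an empty mask becomes slice(0, 0)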
+ arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8)
+ result = lib.maybe_booleans_to_slice(arr)
+ assert result.dtype == np.bool_
+
+ result = lib.maybe_booleans_to_slice(arr[:0])
+ assert result == slice(0, 0)
+
+ def test_get_reverse_indexer(self):
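+ # result[i] should be the position at which value i appears in indexer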
+ indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64)
+ result = lib.get_reverse_indexer(indexer, 5)
+ expected = np.array([4, 2, 3, 6, 7], dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_cache_readonly_preserve_docstrings():
+ # GH18197
+ assert Index.hasnans.__doc__ is not None
diff --git a/contrib/python/pandas/py2/pandas/tests/test_multilevel.py b/contrib/python/pandas/py2/pandas/tests/test_multilevel.py
new file mode 100644
index 00000000000..a7bbbbb5033
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_multilevel.py
@@ -0,0 +1,2063 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=W0612,E1101,W0141
+import datetime
+import itertools
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+from numpy.random import randn
+import pytest
+import pytz
+
+from pandas.compat import (
+ StringIO, lrange, lzip, product as cart_product, range, u, zip)
+
+from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
+
+import pandas as pd
+from pandas import DataFrame, Panel, Series, Timestamp, isna
+from pandas.core.index import Index, MultiIndex
+import pandas.util.testing as tm
+
+AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad',
+ 'std', 'var', 'sem']
+
+
+class Base(object):
+
+ def setup_method(self, method):
+
+ index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
+ 'three']],
+ codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+ [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+ names=['first', 'second'])
+ self.frame = DataFrame(np.random.randn(10, 3), index=index,
+ columns=Index(['A', 'B', 'C'], name='exp'))
+
+ self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
+ codes=[[0, 1, 2, 3]], names=['first'])
+
+ # create test series object
+ arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
+ ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+ tuples = lzip(*arrays)
+ index = MultiIndex.from_tuples(tuples)
+ s = Series(randn(8), index=index)
+ s[3] = np.NaN
+ self.series = s
+
+ self.tdf = tm.makeTimeDataFrame(100)
+ self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
+ lambda x: x.day]).sum()
+
+ # use Int64Index, to make sure things work
+ self.ymd.index.set_levels([lev.astype('i8')
+ for lev in self.ymd.index.levels],
+ inplace=True)
+ self.ymd.index.set_names(['year', 'month', 'day'], inplace=True)
+
+
+class TestMultiLevel(Base):
+
+ def test_append(self):
+ a, b = self.frame[:5], self.frame[5:]
+
+ result = a.append(b)
+ tm.assert_frame_equal(result, self.frame)
+
+ result = a['A'].append(b['A'])
+ tm.assert_series_equal(result, self.frame['A'])
+
+ def test_append_index(self):
+ idx1 = Index([1.1, 1.2, 1.3])
+ idx2 = pd.date_range('2011-01-01', freq='D', periods=3,
+ tz='Asia/Tokyo')
+ idx3 = Index(['A', 'B', 'C'])
+
+ midx_lv2 = MultiIndex.from_arrays([idx1, idx2])
+ midx_lv3 = MultiIndex.from_arrays([idx1, idx2, idx3])
+
+ result = idx1.append(midx_lv2)
+
+ # see gh-7112
+ tz = pytz.timezone('Asia/Tokyo')
+ expected_tuples = [(1.1, tz.localize(datetime.datetime(2011, 1, 1))),
+ (1.2, tz.localize(datetime.datetime(2011, 1, 2))),
+ (1.3, tz.localize(datetime.datetime(2011, 1, 3)))]
+ expected = Index([1.1, 1.2, 1.3] + expected_tuples)
+ tm.assert_index_equal(result, expected)
+
+ result = midx_lv2.append(idx1)
+ expected = Index(expected_tuples + [1.1, 1.2, 1.3])
+ tm.assert_index_equal(result, expected)
+
+ result = midx_lv2.append(midx_lv2)
+ expected = MultiIndex.from_arrays([idx1.append(idx1),
+ idx2.append(idx2)])
+ tm.assert_index_equal(result, expected)
+
+ result = midx_lv2.append(midx_lv3)
+ tm.assert_index_equal(result, expected)
+
+ result = midx_lv3.append(midx_lv2)
+ expected = Index._simple_new(
+ np.array([(1.1, tz.localize(datetime.datetime(2011, 1, 1)), 'A'),
+ (1.2, tz.localize(datetime.datetime(2011, 1, 2)), 'B'),
+ (1.3, tz.localize(datetime.datetime(2011, 1, 3)), 'C')] +
+ expected_tuples), None)
+ tm.assert_index_equal(result, expected)
+
+ def test_dataframe_constructor(self):
+ multi = DataFrame(np.random.randn(4, 4),
+ index=[np.array(['a', 'a', 'b', 'b']),
+ np.array(['x', 'y', 'x', 'y'])])
+ assert isinstance(multi.index, MultiIndex)
+ assert not isinstance(multi.columns, MultiIndex)
+
+ multi = DataFrame(np.random.randn(4, 4),
+ columns=[['a', 'a', 'b', 'b'],
+ ['x', 'y', 'x', 'y']])
+ assert isinstance(multi.columns, MultiIndex)
+
+ def test_series_constructor(self):
+ multi = Series(1., index=[np.array(['a', 'a', 'b', 'b']), np.array(
+ ['x', 'y', 'x', 'y'])])
+ assert isinstance(multi.index, MultiIndex)
+
+ multi = Series(1., index=[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']])
+ assert isinstance(multi.index, MultiIndex)
+
+ multi = Series(lrange(4), index=[['a', 'a', 'b', 'b'],
+ ['x', 'y', 'x', 'y']])
+ assert isinstance(multi.index, MultiIndex)
+
+ def test_reindex_level(self):
+ # axis=0
+ month_sums = self.ymd.sum(level='month')
+ result = month_sums.reindex(self.ymd.index, level=1)
+ expected = self.ymd.groupby(level='month').transform(np.sum)
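+        # reindexing against level=1 repeats each month's aggregate for
+        # every row with that month, mirroring a groupby-transform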
+
+ tm.assert_frame_equal(result, expected)
+
+ # Series
+ result = month_sums['A'].reindex(self.ymd.index, level=1)
+ expected = self.ymd['A'].groupby(level='month').transform(np.sum)
+ tm.assert_series_equal(result, expected, check_names=False)
+
+ # axis=1
+ month_sums = self.ymd.T.sum(axis=1, level='month')
+ result = month_sums.reindex(columns=self.ymd.index, level=1)
+ expected = self.ymd.groupby(level='month').transform(np.sum).T
+ tm.assert_frame_equal(result, expected)
+
+ def test_binops_level(self):
+ def _check_op(opname):
+ op = getattr(DataFrame, opname)
+ month_sums = self.ymd.sum(level='month')
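+            # with level='month', the reduced operand is broadcast back
+            # across every row sharing the same month value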
+ result = op(self.ymd, month_sums, level='month')
+
+ broadcasted = self.ymd.groupby(level='month').transform(np.sum)
+ expected = op(self.ymd, broadcasted)
+ tm.assert_frame_equal(result, expected)
+
+ # Series
+ op = getattr(Series, opname)
+ result = op(self.ymd['A'], month_sums['A'], level='month')
+ broadcasted = self.ymd['A'].groupby(level='month').transform(
+ np.sum)
+ expected = op(self.ymd['A'], broadcasted)
+ expected.name = 'A'
+ tm.assert_series_equal(result, expected)
+
+ _check_op('sub')
+ _check_op('add')
+ _check_op('mul')
+ _check_op('div')
+
+ def test_pickle(self):
+ def _test_roundtrip(frame):
+ unpickled = tm.round_trip_pickle(frame)
+ tm.assert_frame_equal(frame, unpickled)
+
+ _test_roundtrip(self.frame)
+ _test_roundtrip(self.frame.T)
+ _test_roundtrip(self.ymd)
+ _test_roundtrip(self.ymd.T)
+
+ def test_reindex(self):
+ expected = self.frame.iloc[[0, 3]]
+ reindexed = self.frame.loc[[('foo', 'one'), ('bar', 'one')]]
+ tm.assert_frame_equal(reindexed, expected)
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
+ tm.assert_frame_equal(reindexed, expected)
+
+ def test_reindex_preserve_levels(self):
+ new_index = self.ymd.index[::10]
+ chunk = self.ymd.reindex(new_index)
+ assert chunk.index is new_index
+
+ chunk = self.ymd.loc[new_index]
+ assert chunk.index is new_index
+
+ with catch_warnings(record=True):
+ simplefilter("ignore", DeprecationWarning)
+ chunk = self.ymd.ix[new_index]
+ assert chunk.index is new_index
+
+ ymdT = self.ymd.T
+ chunk = ymdT.reindex(columns=new_index)
+ assert chunk.columns is new_index
+
+ chunk = ymdT.loc[:, new_index]
+ assert chunk.columns is new_index
+
+ def test_repr_to_string(self):
+ repr(self.frame)
+ repr(self.ymd)
+ repr(self.frame.T)
+ repr(self.ymd.T)
+
+ buf = StringIO()
+ self.frame.to_string(buf=buf)
+ self.ymd.to_string(buf=buf)
+ self.frame.T.to_string(buf=buf)
+ self.ymd.T.to_string(buf=buf)
+
+ def test_repr_name_coincide(self):
+ index = MultiIndex.from_tuples([('a', 0, 'foo'), ('b', 1, 'bar')],
+ names=['a', 'b', 'c'])
+
+ df = DataFrame({'value': [0, 1]}, index=index)
+
+ lines = repr(df).split('\n')
+ assert lines[2].startswith('a 0 foo')
+
+ def test_delevel_infer_dtype(self):
+        tuples = list(cart_product(['foo', 'bar'], [10, 20], [1.0, 1.1]))
+ index = MultiIndex.from_tuples(tuples, names=['prm0', 'prm1', 'prm2'])
+ df = DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'],
+ index=index)
+ deleveled = df.reset_index()
+ assert is_integer_dtype(deleveled['prm1'])
+ assert is_float_dtype(deleveled['prm2'])
+
+ def test_reset_index_with_drop(self):
+ deleveled = self.ymd.reset_index(drop=True)
+ assert len(deleveled.columns) == len(self.ymd.columns)
+ assert deleveled.index.name == self.ymd.index.name
+
+ deleveled = self.series.reset_index()
+ assert isinstance(deleveled, DataFrame)
+ assert len(deleveled.columns) == len(self.series.index.levels) + 1
+ assert deleveled.index.name == self.series.index.name
+
+ deleveled = self.series.reset_index(drop=True)
+ assert isinstance(deleveled, Series)
+ assert deleveled.index.name == self.series.index.name
+
+ def test_count_level(self):
+ def _check_counts(frame, axis=0):
+ index = frame._get_axis(axis)
+ for i in range(index.nlevels):
+ result = frame.count(axis=axis, level=i)
+ expected = frame.groupby(axis=axis, level=i).count()
+ expected = expected.reindex_like(result).astype('i8')
+ tm.assert_frame_equal(result, expected)
+
+ self.frame.iloc[1, [1, 2]] = np.nan
+ self.frame.iloc[7, [0, 1]] = np.nan
+ self.ymd.iloc[1, [1, 2]] = np.nan
+ self.ymd.iloc[7, [0, 1]] = np.nan
+
+ _check_counts(self.frame)
+ _check_counts(self.ymd)
+ _check_counts(self.frame.T, axis=1)
+ _check_counts(self.ymd.T, axis=1)
+
+ # can't call with level on regular DataFrame
+ df = tm.makeTimeDataFrame()
+ with pytest.raises(TypeError, match='hierarchical'):
+ df.count(level=0)
+
+ self.frame['D'] = 'foo'
+ result = self.frame.count(level=0, numeric_only=True)
+ tm.assert_index_equal(result.columns, Index(list('ABC'), name='exp'))
+
+ def test_count_level_series(self):
+ index = MultiIndex(levels=[['foo', 'bar', 'baz'], ['one', 'two',
+ 'three', 'four']],
+ codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]])
+
+ s = Series(np.random.randn(len(index)), index=index)
+
+ result = s.count(level=0)
+ expected = s.groupby(level=0).count()
+ tm.assert_series_equal(
+ result.astype('f8'), expected.reindex(result.index).fillna(0))
+
+ result = s.count(level=1)
+ expected = s.groupby(level=1).count()
+ tm.assert_series_equal(
+ result.astype('f8'), expected.reindex(result.index).fillna(0))
+
+ def test_count_level_corner(self):
+ s = self.frame['A'][:0]
+ result = s.count(level=0)
+ expected = Series(0, index=s.index.levels[0], name='A')
+ tm.assert_series_equal(result, expected)
+
+ df = self.frame[:0]
+ result = df.count(level=0)
+ expected = DataFrame({}, index=s.index.levels[0],
+ columns=df.columns).fillna(0).astype(np.int64)
+ tm.assert_frame_equal(result, expected)
+
+ def test_get_level_number_out_of_bounds(self):
+ with pytest.raises(IndexError, match="Too many levels"):
+ self.frame.index._get_level_number(2)
+ with pytest.raises(IndexError, match="not a valid level number"):
+ self.frame.index._get_level_number(-3)
+
+ def test_unstack(self):
+ # just check that it works for now
+ unstacked = self.ymd.unstack()
+ unstacked.unstack()
+
+ # test that ints work
+ self.ymd.astype(int).unstack()
+
+        # test that int32 works
+ self.ymd.astype(np.int32).unstack()
+
+ def test_unstack_multiple_no_empty_columns(self):
+        index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0),
+                                        (1, 'baz', 1), (1, 'qux', 1)])
+
+ s = Series(np.random.randn(4), index=index)
+
+ unstacked = s.unstack([1, 2])
+ expected = unstacked.dropna(axis=1, how='all')
+ tm.assert_frame_equal(unstacked, expected)
+
+ def test_stack(self):
+ # regular roundtrip
+ unstacked = self.ymd.unstack()
+ restacked = unstacked.stack()
+ tm.assert_frame_equal(restacked, self.ymd)
+
+ unlexsorted = self.ymd.sort_index(level=2)
+
+ unstacked = unlexsorted.unstack(2)
+ restacked = unstacked.stack()
+ tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)
+
+ unlexsorted = unlexsorted[::-1]
+ unstacked = unlexsorted.unstack(1)
+ restacked = unstacked.stack().swaplevel(1, 2)
+ tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)
+
+ unlexsorted = unlexsorted.swaplevel(0, 1)
+ unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
+ restacked = unstacked.stack(0).swaplevel(1, 2)
+ tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd)
+
+ # columns unsorted
+ unstacked = self.ymd.unstack()
+ unstacked = unstacked.sort_index(axis=1, ascending=False)
+ restacked = unstacked.stack()
+ tm.assert_frame_equal(restacked, self.ymd)
+
+ # more than 2 levels in the columns
+ unstacked = self.ymd.unstack(1).unstack(1)
+
+ result = unstacked.stack(1)
+ expected = self.ymd.unstack()
+ tm.assert_frame_equal(result, expected)
+
+ result = unstacked.stack(2)
+ expected = self.ymd.unstack(1)
+ tm.assert_frame_equal(result, expected)
+
+ result = unstacked.stack(0)
+ expected = self.ymd.stack().unstack(1).unstack(1)
+ tm.assert_frame_equal(result, expected)
+
+        # not all level values present in every column chunk
+ unstacked = self.ymd.unstack(2).loc[:, ::3]
+ stacked = unstacked.stack().stack()
+ ymd_stacked = self.ymd.stack()
+ tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))
+
+        # stack with negative number; -2 resolves to level 0 of the
+        # two-level columns, so both stacks must agree
+        result = self.ymd.unstack(0).stack(-2)
+        expected = self.ymd.unstack(0).stack(0)
+        tm.assert_frame_equal(result, expected)
+
+ # GH10417
+ def check(left, right):
+ tm.assert_series_equal(left, right)
+ assert left.index.is_unique is False
+ li, ri = left.index, right.index
+ tm.assert_index_equal(li, ri)
+
+ df = DataFrame(np.arange(12).reshape(4, 3),
+ index=list('abab'),
+ columns=['1st', '2nd', '3rd'])
+
+        mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd', '3rd']],
+                        codes=[np.tile(np.arange(2).repeat(3), 2),
+                               np.tile(np.arange(3), 4)])
+
+ left, right = df.stack(), Series(np.arange(12), index=mi)
+ check(left, right)
+
+ df.columns = ['1st', '2nd', '1st']
+        mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd']],
+                        codes=[np.tile(np.arange(2).repeat(3), 2),
+                               np.tile([0, 1, 0], 4)])
+
+ left, right = df.stack(), Series(np.arange(12), index=mi)
+ check(left, right)
+
+ tpls = ('a', 2), ('b', 1), ('a', 1), ('b', 2)
+ df.index = MultiIndex.from_tuples(tpls)
+        mi = MultiIndex(levels=[['a', 'b'], [1, 2], ['1st', '2nd']],
+                        codes=[np.tile(np.arange(2).repeat(3), 2),
+                               np.repeat([1, 0, 1], [3, 6, 3]),
+                               np.tile([0, 1, 0], 4)])
+
+ left, right = df.stack(), Series(np.arange(12), index=mi)
+ check(left, right)
+
+ def test_unstack_odd_failure(self):
+ data = """day,time,smoker,sum,len
+Fri,Dinner,No,8.25,3.
+Fri,Dinner,Yes,27.03,9
+Fri,Lunch,No,3.0,1
+Fri,Lunch,Yes,13.68,6
+Sat,Dinner,No,139.63,45
+Sat,Dinner,Yes,120.77,42
+Sun,Dinner,No,180.57,57
+Sun,Dinner,Yes,66.82,19
+Thur,Dinner,No,3.0,1
+Thur,Lunch,No,117.32,44
+Thur,Lunch,Yes,51.51,17"""
+
+ df = pd.read_csv(StringIO(data)).set_index(['day', 'time', 'smoker'])
+
+ # it works, #2100
+ result = df.unstack(2)
+
+ recons = result.stack()
+ tm.assert_frame_equal(recons, df)
+
+ def test_stack_mixed_dtype(self):
+ df = self.frame.T
+ df['foo', 'four'] = 'foo'
+ df = df.sort_index(level=1, axis=1)
+
+ stacked = df.stack()
+ result = df['foo'].stack().sort_index()
+ tm.assert_series_equal(stacked['foo'], result, check_names=False)
+ assert result.name is None
+ assert stacked['bar'].dtype == np.float_
+
+ def test_unstack_bug(self):
+ df = DataFrame({'state': ['naive', 'naive', 'naive', 'activ', 'activ',
+ 'activ'],
+ 'exp': ['a', 'b', 'b', 'b', 'a', 'a'],
+ 'barcode': [1, 2, 3, 4, 1, 3],
+ 'v': ['hi', 'hi', 'bye', 'bye', 'bye', 'peace'],
+ 'extra': np.arange(6.)})
+
+ result = df.groupby(['state', 'exp', 'barcode', 'v']).apply(len)
+
+ unstacked = result.unstack()
+ restacked = unstacked.stack()
+ tm.assert_series_equal(
+ restacked, result.reindex(restacked.index).astype(float))
+
+ def test_stack_unstack_preserve_names(self):
+ unstacked = self.frame.unstack()
+ assert unstacked.index.name == 'first'
+ assert unstacked.columns.names == ['exp', 'second']
+
+ restacked = unstacked.stack()
+ assert restacked.index.names == self.frame.index.names
+
+ def test_unstack_level_name(self):
+ result = self.frame.unstack('second')
+ expected = self.frame.unstack(level=1)
+ tm.assert_frame_equal(result, expected)
+
+ def test_stack_level_name(self):
+ unstacked = self.frame.unstack('second')
+ result = unstacked.stack('exp')
+ expected = self.frame.unstack().stack(0)
+ tm.assert_frame_equal(result, expected)
+
+ result = self.frame.stack('exp')
+ expected = self.frame.stack()
+ tm.assert_series_equal(result, expected)
+
+ def test_stack_unstack_multiple(self):
+ unstacked = self.ymd.unstack(['year', 'month'])
+ expected = self.ymd.unstack('year').unstack('month')
+ tm.assert_frame_equal(unstacked, expected)
+ assert unstacked.columns.names == expected.columns.names
+
+ # series
+ s = self.ymd['A']
+ s_unstacked = s.unstack(['year', 'month'])
+ tm.assert_frame_equal(s_unstacked, expected['A'])
+
+ restacked = unstacked.stack(['year', 'month'])
+ restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
+ restacked = restacked.sort_index(level=0)
+
+ tm.assert_frame_equal(restacked, self.ymd)
+ assert restacked.index.names == self.ymd.index.names
+
+ # GH #451
+ unstacked = self.ymd.unstack([1, 2])
+ expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how='all')
+ tm.assert_frame_equal(unstacked, expected)
+
+ unstacked = self.ymd.unstack([2, 1])
+ expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how='all')
+ tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns])
+
+ def test_stack_names_and_numbers(self):
+ unstacked = self.ymd.unstack(['year', 'month'])
+
+ # Can't use mixture of names and numbers to stack
+ with pytest.raises(ValueError, match="level should contain"):
+ unstacked.stack([0, 'month'])
+
+ def test_stack_multiple_out_of_bounds(self):
+ # nlevels == 3
+ unstacked = self.ymd.unstack(['year', 'month'])
+
+ with pytest.raises(IndexError, match="Too many levels"):
+ unstacked.stack([2, 3])
+ with pytest.raises(IndexError, match="not a valid level number"):
+ unstacked.stack([-4, -3])
+
+ def test_unstack_period_series(self):
+ # GH 4342
+ idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02',
+ '2013-03', '2013-03'], freq='M', name='period')
+ idx2 = Index(['A', 'B'] * 3, name='str')
+ value = [1, 2, 3, 4, 5, 6]
+
+ idx = MultiIndex.from_arrays([idx1, idx2])
+ s = Series(value, index=idx)
+
+ result1 = s.unstack()
+ result2 = s.unstack(level=1)
+ result3 = s.unstack(level=0)
+
+ e_idx = pd.PeriodIndex(
+ ['2013-01', '2013-02', '2013-03'], freq='M', name='period')
+ expected = DataFrame({'A': [1, 3, 5], 'B': [2, 4, 6]}, index=e_idx,
+ columns=['A', 'B'])
+ expected.columns.name = 'str'
+
+ tm.assert_frame_equal(result1, expected)
+ tm.assert_frame_equal(result2, expected)
+ tm.assert_frame_equal(result3, expected.T)
+
+ idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02',
+ '2013-03', '2013-03'], freq='M', name='period1')
+
+ idx2 = pd.PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09',
+ '2013-08', '2013-07'], freq='M', name='period2')
+ idx = MultiIndex.from_arrays([idx1, idx2])
+ s = Series(value, index=idx)
+
+ result1 = s.unstack()
+ result2 = s.unstack(level=1)
+ result3 = s.unstack(level=0)
+
+ e_idx = pd.PeriodIndex(
+ ['2013-01', '2013-02', '2013-03'], freq='M', name='period1')
+ e_cols = pd.PeriodIndex(['2013-07', '2013-08', '2013-09', '2013-10',
+ '2013-11', '2013-12'],
+ freq='M', name='period2')
+ expected = DataFrame([[np.nan, np.nan, np.nan, np.nan, 2, 1],
+ [np.nan, np.nan, 4, 3, np.nan, np.nan],
+ [6, 5, np.nan, np.nan, np.nan, np.nan]],
+ index=e_idx, columns=e_cols)
+
+ tm.assert_frame_equal(result1, expected)
+ tm.assert_frame_equal(result2, expected)
+ tm.assert_frame_equal(result3, expected.T)
+
+ def test_unstack_period_frame(self):
+ # GH 4342
+ idx1 = pd.PeriodIndex(['2014-01', '2014-02', '2014-02', '2014-02',
+ '2014-01', '2014-01'],
+ freq='M', name='period1')
+ idx2 = pd.PeriodIndex(['2013-12', '2013-12', '2014-02', '2013-10',
+ '2013-10', '2014-02'],
+ freq='M', name='period2')
+ value = {'A': [1, 2, 3, 4, 5, 6], 'B': [6, 5, 4, 3, 2, 1]}
+ idx = MultiIndex.from_arrays([idx1, idx2])
+ df = DataFrame(value, index=idx)
+
+ result1 = df.unstack()
+ result2 = df.unstack(level=1)
+ result3 = df.unstack(level=0)
+
+ e_1 = pd.PeriodIndex(['2014-01', '2014-02'], freq='M', name='period1')
+ e_2 = pd.PeriodIndex(['2013-10', '2013-12', '2014-02', '2013-10',
+ '2013-12', '2014-02'], freq='M', name='period2')
+ e_cols = MultiIndex.from_arrays(['A A A B B B'.split(), e_2])
+ expected = DataFrame([[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]],
+ index=e_1, columns=e_cols)
+
+ tm.assert_frame_equal(result1, expected)
+ tm.assert_frame_equal(result2, expected)
+
+ e_1 = pd.PeriodIndex(['2014-01', '2014-02', '2014-01',
+ '2014-02'], freq='M', name='period1')
+ e_2 = pd.PeriodIndex(
+ ['2013-10', '2013-12', '2014-02'], freq='M', name='period2')
+ e_cols = MultiIndex.from_arrays(['A A B B'.split(), e_1])
+ expected = DataFrame([[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]],
+ index=e_2, columns=e_cols)
+
+ tm.assert_frame_equal(result3, expected)
+
+ def test_stack_multiple_bug(self):
+ """ bug when some uniques are not present in the data #3170"""
+ id_col = ([1] * 3) + ([2] * 3)
+ name = (['a'] * 3) + (['b'] * 3)
+ date = pd.to_datetime(['2013-01-03', '2013-01-04', '2013-01-05'] * 2)
+ var1 = np.random.randint(0, 100, 6)
+ df = DataFrame(dict(ID=id_col, NAME=name, DATE=date, VAR1=var1))
+
+ multi = df.set_index(['DATE', 'ID'])
+ multi.columns.name = 'Params'
+ unst = multi.unstack('ID')
+ down = unst.resample('W-THU').mean()
+
+ rs = down.stack('ID')
+ xp = unst.loc[:, ['VAR1']].resample('W-THU').mean().stack('ID')
+ xp.columns.name = 'Params'
+ tm.assert_frame_equal(rs, xp)
+
+ def test_stack_dropna(self):
+ # GH #3997
+ df = DataFrame({'A': ['a1', 'a2'], 'B': ['b1', 'b2'], 'C': [1, 1]})
+ df = df.set_index(['A', 'B'])
+
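+        # unstacking creates NaN cells for the unobserved (A, B) pairs;
+        # dropna=False keeps those rows when stacking back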
+ stacked = df.unstack().stack(dropna=False)
+ assert len(stacked) > len(stacked.dropna())
+
+ stacked = df.unstack().stack(dropna=True)
+ tm.assert_frame_equal(stacked, stacked.dropna())
+
+ def test_unstack_multiple_hierarchical(self):
+        df = DataFrame(index=[[0, 0, 0, 0, 1, 1, 1, 1],
+                              [0, 0, 1, 1, 0, 0, 1, 1],
+                              [0, 1, 0, 1, 0, 1, 0, 1]],
+ columns=[[0, 0, 1, 1], [0, 1, 0, 1]])
+
+ df.index.names = ['a', 'b', 'c']
+ df.columns.names = ['d', 'e']
+
+ # it works!
+ df.unstack(['b', 'c'])
+
+ def test_groupby_transform(self):
+ s = self.frame['A']
+ grouper = s.index.get_level_values(0)
+
+ grouped = s.groupby(grouper)
+
+ applied = grouped.apply(lambda x: x * 2)
+ expected = grouped.transform(lambda x: x * 2)
+ result = applied.reindex(expected.index)
+ tm.assert_series_equal(result, expected, check_names=False)
+
+ def test_unstack_sparse_keyspace(self):
+ # memory problems with naive impl #2278
+        # generate a long frame and test the pivot
+ NUM_ROWS = 1000
+
+ df = DataFrame({'A': np.random.randint(100, size=NUM_ROWS),
+ 'B': np.random.randint(300, size=NUM_ROWS),
+ 'C': np.random.randint(-7, 7, size=NUM_ROWS),
+ 'D': np.random.randint(-19, 19, size=NUM_ROWS),
+ 'E': np.random.randint(3000, size=NUM_ROWS),
+ 'F': np.random.randn(NUM_ROWS)})
+
+ idf = df.set_index(['A', 'B', 'C', 'D', 'E'])
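+        # the full keyspace is 100 * 300 * 14 * 38 * 3000 ~ 4.8e10
+        # combinations; a naive dense unstack over it would exhaust
+        # memory, so only the observed keys may be materialized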
+
+        # completing without error is sufficient here
+ idf.unstack('E')
+
+ def test_unstack_unobserved_keys(self):
+ # related to #2278 refactoring
+ levels = [[0, 1], [0, 1, 2, 3]]
+ codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
+
+ index = MultiIndex(levels, codes)
+
+ df = DataFrame(np.random.randn(4, 2), index=index)
+
+ result = df.unstack()
+ assert len(result.columns) == 4
+
+ recons = result.stack()
+ tm.assert_frame_equal(recons, df)
+
+ @pytest.mark.slow
+ def test_unstack_number_of_levels_larger_than_int32(self):
+ # GH 20601
+ df = DataFrame(np.random.randn(2 ** 16, 2),
+ index=[np.arange(2 ** 16), np.arange(2 ** 16)])
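+        # the unstacked result would need 2 ** 16 * 2 ** 16 = 2 ** 32 cells,
+        # past the signed int32 maximum of 2 ** 31 - 1, hence the error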
+ with pytest.raises(ValueError, match='int32 overflow'):
+ df.unstack()
+
+ def test_stack_order_with_unsorted_levels(self):
+ # GH 16323
+
+ def manual_compare_stacked(df, df_stacked, lev0, lev1):
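+            # every cell of df must reappear in the stacked frame at row
+            # key (row, col[lev0]) and column key col[lev1]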
+ assert all(df.loc[row, col] ==
+ df_stacked.loc[(row, col[lev0]), col[lev1]]
+ for row in df.index for col in df.columns)
+
+ # deep check for 1-row case
+ for width in [2, 3]:
+ levels_poss = itertools.product(
+ itertools.permutations([0, 1, 2], width),
+ repeat=2)
+
+ for levels in levels_poss:
+ columns = MultiIndex(levels=levels,
+ codes=[[0, 0, 1, 1],
+ [0, 1, 0, 1]])
+ df = DataFrame(columns=columns, data=[range(4)])
+ for stack_lev in range(2):
+ df_stacked = df.stack(stack_lev)
+ manual_compare_stacked(df, df_stacked,
+ stack_lev, 1 - stack_lev)
+
+ # check multi-row case
+ mi = MultiIndex(levels=[["A", "C", "B"], ["B", "A", "C"]],
+ codes=[np.repeat(range(3), 3), np.tile(range(3), 3)])
+ df = DataFrame(columns=mi, index=range(5),
+ data=np.arange(5 * len(mi)).reshape(5, -1))
+ manual_compare_stacked(df, df.stack(0), 0, 1)
+
+ def test_groupby_corner(self):
+ midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']],
+ codes=[[0], [0], [0]],
+ names=['one', 'two', 'three'])
+ df = DataFrame([np.random.rand(4)], columns=['a', 'b', 'c', 'd'],
+ index=midx)
+ # should work
+ df.groupby(level='three')
+
+ def test_groupby_level_no_obs(self):
+ # #1697
+        midx = MultiIndex.from_tuples([('f1', 's1'), ('f1', 's2'),
+                                       ('f2', 's1'), ('f2', 's2'),
+                                       ('f3', 's1'), ('f3', 's2')])
+ df = DataFrame(
+ [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx)
+ df1 = df.loc(axis=1)[df.columns.map(
+ lambda u: u[0] in ['f2', 'f3'])]
+
+ grouped = df1.groupby(axis=1, level=0)
+ result = grouped.sum()
+ assert (result.columns == ['f2', 'f3']).all()
+
+ def test_join(self):
+ a = self.frame.loc[self.frame.index[:5], ['A']]
+ b = self.frame.loc[self.frame.index[2:], ['B', 'C']]
+
+ joined = a.join(b, how='outer').reindex(self.frame.index)
+ expected = self.frame.copy()
+ expected.values[np.isnan(joined.values)] = np.nan
+
+ assert not np.isnan(joined.values).all()
+
+ # TODO what should join do with names ?
+ tm.assert_frame_equal(joined, expected, check_names=False)
+
+ def test_swaplevel(self):
+ swapped = self.frame['A'].swaplevel()
+ swapped2 = self.frame['A'].swaplevel(0)
+ swapped3 = self.frame['A'].swaplevel(0, 1)
+ swapped4 = self.frame['A'].swaplevel('first', 'second')
+ assert not swapped.index.equals(self.frame.index)
+ tm.assert_series_equal(swapped, swapped2)
+ tm.assert_series_equal(swapped, swapped3)
+ tm.assert_series_equal(swapped, swapped4)
+
+ back = swapped.swaplevel()
+ back2 = swapped.swaplevel(0)
+ back3 = swapped.swaplevel(0, 1)
+ back4 = swapped.swaplevel('second', 'first')
+ assert back.index.equals(self.frame.index)
+ tm.assert_series_equal(back, back2)
+ tm.assert_series_equal(back, back3)
+ tm.assert_series_equal(back, back4)
+
+ ft = self.frame.T
+ swapped = ft.swaplevel('first', 'second', axis=1)
+ exp = self.frame.swaplevel('first', 'second').T
+ tm.assert_frame_equal(swapped, exp)
+
+ def test_swaplevel_panel(self):
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ panel = Panel({'ItemA': self.frame, 'ItemB': self.frame * 2})
+ expected = panel.copy()
+ expected.major_axis = expected.major_axis.swaplevel(0, 1)
+
+ for result in (panel.swaplevel(axis='major'),
+ panel.swaplevel(0, axis='major'),
+ panel.swaplevel(0, 1, axis='major')):
+ tm.assert_panel_equal(result, expected)
+
+ def test_reorder_levels(self):
+ result = self.ymd.reorder_levels(['month', 'day', 'year'])
+ expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
+ tm.assert_frame_equal(result, expected)
+
+ result = self.ymd['A'].reorder_levels(['month', 'day', 'year'])
+ expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2)
+ tm.assert_series_equal(result, expected)
+
+ result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1)
+ expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
+ tm.assert_frame_equal(result, expected)
+
+ with pytest.raises(TypeError, match='hierarchical axis'):
+ self.ymd.reorder_levels([1, 2], axis=1)
+
+ with pytest.raises(IndexError, match='Too many levels'):
+ self.ymd.index.reorder_levels([1, 2, 3])
+
+ def test_insert_index(self):
+ df = self.ymd[:5].T
+ df[2000, 1, 10] = df[2000, 1, 7]
+ assert isinstance(df.columns, MultiIndex)
+ assert (df[2000, 1, 10] == df[2000, 1, 7]).all()
+
+ def test_alignment(self):
+ x = Series(data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), (
+ "A", 2), ("B", 3)]))
+
+ y = Series(data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), (
+ "Z", 2), ("B", 3)]))
+
+ res = x - y
+ exp_index = x.index.union(y.index)
+ exp = x.reindex(exp_index) - y.reindex(exp_index)
+ tm.assert_series_equal(res, exp)
+
+ # hit non-monotonic code path
+ res = x[::-1] - y[::-1]
+ exp_index = x.index.union(y.index)
+ exp = x.reindex(exp_index) - y.reindex(exp_index)
+ tm.assert_series_equal(res, exp)
+
+ def test_count(self):
+ frame = self.frame.copy()
+ frame.index.names = ['a', 'b']
+
+ result = frame.count(level='b')
+ expect = self.frame.count(level=1)
+ tm.assert_frame_equal(result, expect, check_names=False)
+
+ result = frame.count(level='a')
+ expect = self.frame.count(level=0)
+ tm.assert_frame_equal(result, expect, check_names=False)
+
+ series = self.series.copy()
+ series.index.names = ['a', 'b']
+
+ result = series.count(level='b')
+ expect = self.series.count(level=1)
+ tm.assert_series_equal(result, expect, check_names=False)
+ assert result.index.name == 'b'
+
+ result = series.count(level='a')
+ expect = self.series.count(level=0)
+ tm.assert_series_equal(result, expect, check_names=False)
+ assert result.index.name == 'a'
+
+        with pytest.raises(KeyError):
+            series.count('x')
+        with pytest.raises(KeyError):
+            frame.count(level='x')
+
+ @pytest.mark.parametrize('op', AGG_FUNCTIONS)
+ @pytest.mark.parametrize('level', [0, 1])
+ @pytest.mark.parametrize('skipna', [True, False])
+ @pytest.mark.parametrize('sort', [True, False])
+ def test_series_group_min_max(self, op, level, skipna, sort):
+ # GH 17537
+ grouped = self.series.groupby(level=level, sort=sort)
+        # compare groupby aggregation against the level= reduction
+ leftside = grouped.agg(lambda x: getattr(x, op)(skipna=skipna))
+ rightside = getattr(self.series, op)(level=level, skipna=skipna)
+ if sort:
+ rightside = rightside.sort_index(level=level)
+ tm.assert_series_equal(leftside, rightside)
+
+ @pytest.mark.parametrize('op', AGG_FUNCTIONS)
+ @pytest.mark.parametrize('level', [0, 1])
+ @pytest.mark.parametrize('axis', [0, 1])
+ @pytest.mark.parametrize('skipna', [True, False])
+ @pytest.mark.parametrize('sort', [True, False])
+ def test_frame_group_ops(self, op, level, axis, skipna, sort):
+ # GH 17537
+ self.frame.iloc[1, [1, 2]] = np.nan
+ self.frame.iloc[7, [0, 1]] = np.nan
+
+ if axis == 0:
+ frame = self.frame
+ else:
+ frame = self.frame.T
+
+ grouped = frame.groupby(level=level, axis=axis, sort=sort)
+
+ pieces = []
+
+ def aggf(x):
+ pieces.append(x)
+ return getattr(x, op)(skipna=skipna, axis=axis)
+
+ leftside = grouped.agg(aggf)
+ rightside = getattr(frame, op)(level=level, axis=axis,
+ skipna=skipna)
+ if sort:
+ rightside = rightside.sort_index(level=level, axis=axis)
+ frame = frame.sort_index(level=level, axis=axis)
+
+        # for good measure, check the grouped axis labels in detail
+ level_index = frame._get_axis(axis).levels[level]
+
+ tm.assert_index_equal(leftside._get_axis(axis), level_index)
+ tm.assert_index_equal(rightside._get_axis(axis), level_index)
+
+ tm.assert_frame_equal(leftside, rightside)
+
+ def test_stat_op_corner(self):
+ obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)]))
+
+ result = obj.sum(level=0)
+ expected = Series([10.0], index=[2])
+ tm.assert_series_equal(result, expected)
+
+ def test_frame_any_all_group(self):
+ df = DataFrame(
+ {'data': [False, False, True, False, True, False, True]},
+ index=[
+ ['one', 'one', 'two', 'one', 'two', 'two', 'two'],
+ [0, 1, 0, 2, 1, 2, 3]])
+
+ result = df.any(level=0)
+ ex = DataFrame({'data': [False, True]}, index=['one', 'two'])
+ tm.assert_frame_equal(result, ex)
+
+ result = df.all(level=0)
+ ex = DataFrame({'data': [False, False]}, index=['one', 'two'])
+ tm.assert_frame_equal(result, ex)
+
+ def test_std_var_pass_ddof(self):
+        index = MultiIndex.from_arrays([np.arange(5).repeat(10),
+                                        np.tile(np.arange(10), 5)])
+ df = DataFrame(np.random.randn(len(index), 5), index=index)
+
+ for meth in ['var', 'std']:
+ ddof = 4
+ alt = lambda x: getattr(x, meth)(ddof=ddof)
+
+ result = getattr(df[0], meth)(level=0, ddof=ddof)
+ expected = df[0].groupby(level=0).agg(alt)
+ tm.assert_series_equal(result, expected)
+
+ result = getattr(df, meth)(level=0, ddof=ddof)
+ expected = df.groupby(level=0).agg(alt)
+ tm.assert_frame_equal(result, expected)
+
+ def test_frame_series_agg_multiple_levels(self):
+ result = self.ymd.sum(level=['year', 'month'])
+ expected = self.ymd.groupby(level=['year', 'month']).sum()
+ tm.assert_frame_equal(result, expected)
+
+ result = self.ymd['A'].sum(level=['year', 'month'])
+ expected = self.ymd['A'].groupby(level=['year', 'month']).sum()
+ tm.assert_series_equal(result, expected)
+
+ def test_groupby_multilevel(self):
+ result = self.ymd.groupby(level=[0, 1]).mean()
+
+ k1 = self.ymd.index.get_level_values(0)
+ k2 = self.ymd.index.get_level_values(1)
+
+ expected = self.ymd.groupby([k1, k2]).mean()
+
+ # TODO groupby with level_values drops names
+ tm.assert_frame_equal(result, expected, check_names=False)
+ assert result.index.names == self.ymd.index.names[:2]
+
+ result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
+ tm.assert_frame_equal(result, result2)
+
+    def test_groupby_multilevel_with_transform(self):
+        # intentionally empty upstream placeholder
+        pass
+
+ def test_multilevel_consolidate(self):
+        index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
+                                        ('bar', 'one'), ('bar', 'two')])
+ df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
+ df['Totals', ''] = df.sum(1)
+ df = df._consolidate()
+
+ def test_ix_preserve_names(self):
+ result = self.ymd.loc[2000]
+ result2 = self.ymd['A'].loc[2000]
+ assert result.index.names == self.ymd.index.names[1:]
+ assert result2.index.names == self.ymd.index.names[1:]
+
+ result = self.ymd.loc[2000, 2]
+ result2 = self.ymd['A'].loc[2000, 2]
+ assert result.index.name == self.ymd.index.names[2]
+ assert result2.index.name == self.ymd.index.names[2]
+
+ def test_unstack_preserve_types(self):
+ # GH #403
+ self.ymd['E'] = 'foo'
+ self.ymd['F'] = 2
+
+ unstacked = self.ymd.unstack('month')
+ assert unstacked['A', 1].dtype == np.float64
+ assert unstacked['E', 1].dtype == np.object_
+ assert unstacked['F', 1].dtype == np.float64
+
+ def test_unstack_group_index_overflow(self):
+ codes = np.tile(np.arange(500), 2)
+ level = np.arange(500)
+
+ index = MultiIndex(levels=[level] * 8 + [[0, 1]],
+ codes=[codes] * 8 + [np.arange(2).repeat(500)])
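+        # 8 levels of cardinality 500 give 500 ** 8 ~ 3.9e21 possible keys,
+        # far past the int64 maximum of ~9.2e18, so the flat group index
+        # cannot be built as a single 64-bit integer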
+
+ s = Series(np.arange(1000), index=index)
+ result = s.unstack()
+ assert result.shape == (500, 2)
+
+ # test roundtrip
+ stacked = result.stack()
+ tm.assert_series_equal(s, stacked.reindex(s.index))
+
+        # put it at the beginning
+ index = MultiIndex(levels=[[0, 1]] + [level] * 8,
+ codes=[np.arange(2).repeat(500)] + [codes] * 8)
+
+ s = Series(np.arange(1000), index=index)
+ result = s.unstack(0)
+ assert result.shape == (500, 2)
+
+        # put it in the middle
+ index = MultiIndex(levels=[level] * 4 + [[0, 1]] + [level] * 4,
+ codes=([codes] * 4 + [np.arange(2).repeat(500)] +
+ [codes] * 4))
+
+ s = Series(np.arange(1000), index=index)
+ result = s.unstack(4)
+ assert result.shape == (500, 2)
+
+ def test_pyint_engine(self):
+ # GH 18519 : when combinations of codes cannot be represented in 64
+ # bits, the index underlying the MultiIndex engine works with Python
+ # integers, rather than uint64.
+ N = 5
+ keys = [tuple(l) for l in [[0] * 10 * N,
+ [1] * 10 * N,
+ [2] * 10 * N,
+ [np.nan] * N + [2] * 9 * N,
+ [0] * N + [2] * 9 * N,
+ [np.nan] * N + [2] * 8 * N + [0] * N]]
+ # Each level contains 4 elements (including NaN), so it is represented
+ # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a
+ # 64 bit engine and truncating the first levels, the fourth and fifth
+ # keys would collide; if truncating the last levels, the fifth and
+ # sixth; if rotating bits rather than shifting, the third and fifth.
+
+ for idx in range(len(keys)):
+ index = MultiIndex.from_tuples(keys)
+ assert index.get_loc(keys[idx]) == idx
+
+ expected = np.arange(idx + 1, dtype=np.intp)
+ result = index.get_indexer([keys[i] for i in expected])
+ tm.assert_numpy_array_equal(result, expected)
+
+ # With missing key:
+ idces = range(len(keys))
+ expected = np.array([-1] + list(idces), dtype=np.intp)
+ missing = tuple([0, 1] * 5 * N)
+ result = index.get_indexer([missing] + [keys[i] for i in idces])
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_to_html(self):
+ self.ymd.columns.name = 'foo'
+ self.ymd.to_html()
+ self.ymd.T.to_html()
+
+ def test_level_with_tuples(self):
+        index = MultiIndex(levels=[[('foo', 'bar', 0), ('foo', 'baz', 0),
+                                    ('foo', 'qux', 0)], [0, 1]],
+                           codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])
+
+ series = Series(np.random.randn(6), index=index)
+ frame = DataFrame(np.random.randn(6, 4), index=index)
+
+ result = series[('foo', 'bar', 0)]
+ result2 = series.loc[('foo', 'bar', 0)]
+ expected = series[:2]
+ expected.index = expected.index.droplevel(0)
+ tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result2, expected)
+
+ pytest.raises(KeyError, series.__getitem__, (('foo', 'bar', 0), 2))
+
+ result = frame.loc[('foo', 'bar', 0)]
+ result2 = frame.xs(('foo', 'bar', 0))
+ expected = frame[:2]
+ expected.index = expected.index.droplevel(0)
+ tm.assert_frame_equal(result, expected)
+ tm.assert_frame_equal(result2, expected)
+
+        index = MultiIndex(levels=[[('foo', 'bar'), ('foo', 'baz'),
+                                    ('foo', 'qux')], [0, 1]],
+                           codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])
+
+ series = Series(np.random.randn(6), index=index)
+ frame = DataFrame(np.random.randn(6, 4), index=index)
+
+ result = series[('foo', 'bar')]
+ result2 = series.loc[('foo', 'bar')]
+ expected = series[:2]
+ expected.index = expected.index.droplevel(0)
+ tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result2, expected)
+
+ result = frame.loc[('foo', 'bar')]
+ result2 = frame.xs(('foo', 'bar'))
+ expected = frame[:2]
+ expected.index = expected.index.droplevel(0)
+ tm.assert_frame_equal(result, expected)
+ tm.assert_frame_equal(result2, expected)
+
+ def test_mixed_depth_drop(self):
+ arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
+ ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
+ ['', 'wx', 'wy', '', '', '']]
+
+ tuples = sorted(zip(*arrays))
+ index = MultiIndex.from_tuples(tuples)
+ df = DataFrame(randn(4, 6), columns=index)
+
+ result = df.drop('a', axis=1)
+ expected = df.drop([('a', '', '')], axis=1)
+ tm.assert_frame_equal(expected, result)
+
+ result = df.drop(['top'], axis=1)
+ expected = df.drop([('top', 'OD', 'wx')], axis=1)
+ expected = expected.drop([('top', 'OD', 'wy')], axis=1)
+ tm.assert_frame_equal(expected, result)
+
+ result = df.drop(('top', 'OD', 'wx'), axis=1)
+ expected = df.drop([('top', 'OD', 'wx')], axis=1)
+ tm.assert_frame_equal(expected, result)
+
+ result = df.drop('result1', level=1, axis=1)
+ expected = df.drop([('routine1', 'result1', ''),
+ ('routine2', 'result1', '')], axis=1)
+ tm.assert_frame_equal(expected, result)
+
+ def test_drop_nonunique(self):
+ df = DataFrame([["x-a", "x", "a", 1.5], ["x-a", "x", "a", 1.2],
+ ["z-c", "z", "c", 3.1], ["x-a", "x", "a", 4.1],
+ ["x-b", "x", "b", 5.1], ["x-b", "x", "b", 4.1],
+ ["x-b", "x", "b", 2.2],
+ ["y-a", "y", "a", 1.2], ["z-b", "z", "b", 2.1]],
+ columns=["var1", "var2", "var3", "var4"])
+
+ grp_size = df.groupby("var1").size()
+ drop_idx = grp_size.loc[grp_size == 1]
+
+ idf = df.set_index(["var1", "var2", "var3"])
+
+ # it works! #2101
+ result = idf.drop(drop_idx.index, level=0).reset_index()
+ expected = df[-df.var1.isin(drop_idx.index)]
+
+ result.index = expected.index
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_mixed_depth_pop(self):
+ arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
+ ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
+ ['', 'wx', 'wy', '', '', '']]
+
+ tuples = sorted(zip(*arrays))
+ index = MultiIndex.from_tuples(tuples)
+ df = DataFrame(randn(4, 6), columns=index)
+
+ df1 = df.copy()
+ df2 = df.copy()
+ result = df1.pop('a')
+ expected = df2.pop(('a', '', ''))
+ tm.assert_series_equal(expected, result, check_names=False)
+ tm.assert_frame_equal(df1, df2)
+ assert result.name == 'a'
+
+ expected = df1['top']
+ df1 = df1.drop(['top'], axis=1)
+ result = df2.pop('top')
+ tm.assert_frame_equal(expected, result)
+ tm.assert_frame_equal(df1, df2)
+
+ def test_reindex_level_partial_selection(self):
+ result = self.frame.reindex(['foo', 'qux'], level=0)
+ expected = self.frame.iloc[[0, 1, 2, 7, 8, 9]]
+ tm.assert_frame_equal(result, expected)
+
+ result = self.frame.T.reindex(['foo', 'qux'], axis=1, level=0)
+ tm.assert_frame_equal(result, expected.T)
+
+ result = self.frame.loc[['foo', 'qux']]
+ tm.assert_frame_equal(result, expected)
+
+ result = self.frame['A'].loc[['foo', 'qux']]
+ tm.assert_series_equal(result, expected['A'])
+
+ result = self.frame.T.loc[:, ['foo', 'qux']]
+ tm.assert_frame_equal(result, expected.T)
+
+ def test_drop_level(self):
+ result = self.frame.drop(['bar', 'qux'], level='first')
+ expected = self.frame.iloc[[0, 1, 2, 5, 6]]
+ tm.assert_frame_equal(result, expected)
+
+ result = self.frame.drop(['two'], level='second')
+ expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]]
+ tm.assert_frame_equal(result, expected)
+
+ result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first')
+ expected = self.frame.iloc[[0, 1, 2, 5, 6]].T
+ tm.assert_frame_equal(result, expected)
+
+ result = self.frame.T.drop(['two'], axis=1, level='second')
+ expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]].T
+ tm.assert_frame_equal(result, expected)
+
+ def test_drop_level_nonunique_datetime(self):
+ # GH 12701
+ idx = Index([2, 3, 4, 4, 5], name='id')
+ idxdt = pd.to_datetime(['201603231400',
+ '201603231500',
+ '201603231600',
+ '201603231600',
+ '201603231700'])
+ df = DataFrame(np.arange(10).reshape(5, 2),
+ columns=list('ab'), index=idx)
+ df['tstamp'] = idxdt
+ df = df.set_index('tstamp', append=True)
+ ts = Timestamp('201603231600')
+ assert df.index.is_unique is False
+
+ result = df.drop(ts, level='tstamp')
+ expected = df.loc[idx != 4]
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize('box', [Series, DataFrame])
+ def test_drop_tz_aware_timestamp_across_dst(self, box):
+ # GH 21761
+ start = Timestamp('2017-10-29', tz='Europe/Berlin')
+ end = Timestamp('2017-10-29 04:00:00', tz='Europe/Berlin')
+ index = pd.date_range(start, end, freq='15min')
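+        # 2017-10-29 is the DST fall-back date in Europe/Berlin, so the
+        # 02:00 wall hour occurs twice in this range; dropping the first
+        # label must not be confused by the repeated wall times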
+ data = box(data=[1] * len(index), index=index)
+ result = data.drop(start)
+ expected_start = Timestamp('2017-10-29 00:15:00', tz='Europe/Berlin')
+ expected_idx = pd.date_range(expected_start, end, freq='15min')
+ expected = box(data=[1] * len(expected_idx), index=expected_idx)
+ tm.assert_equal(result, expected)
+
+ def test_drop_preserve_names(self):
+ index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
+ [1, 2, 3, 1, 2, 3]],
+ names=['one', 'two'])
+
+ df = DataFrame(np.random.randn(6, 3), index=index)
+
+ result = df.drop([(0, 2)])
+ assert result.index.names == ('one', 'two')
+
+ def test_unicode_repr_issues(self):
+ levels = [Index([u('a/\u03c3'), u('b/\u03c3'), u('c/\u03c3')]),
+ Index([0, 1])]
+ codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)]
+ index = MultiIndex(levels=levels, codes=codes)
+
+ repr(index.levels)
+
+ # NumPy bug
+ # repr(index.get_level_values(1))
+
+ def test_unicode_repr_level_names(self):
+ index = MultiIndex.from_tuples([(0, 0), (1, 1)],
+ names=[u('\u0394'), 'i1'])
+
+ s = Series(lrange(2), index=index)
+ df = DataFrame(np.random.randn(2, 4), index=index)
+ repr(s)
+ repr(df)
+
+ def test_join_segfault(self):
+ # 1532
+ df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]})
+ df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]})
+ df1 = df1.set_index(['a', 'b'])
+ df2 = df2.set_index(['a', 'b'])
+ # it works!
+ for how in ['left', 'right', 'outer']:
+ df1.join(df2, how=how)
+
+ def test_frame_dict_constructor_empty_series(self):
+        s1 = Series([1, 2, 3, 4],
+                    index=MultiIndex.from_tuples([(1, 2), (1, 3),
+                                                  (2, 2), (2, 4)]))
+        s2 = Series([1, 2, 3, 4],
+                    index=MultiIndex.from_tuples([(1, 2), (1, 3),
+                                                  (3, 2), (3, 4)]))
+ s3 = Series()
+
+ # it works!
+ DataFrame({'foo': s1, 'bar': s2, 'baz': s3})
+ DataFrame.from_dict({'foo': s1, 'baz': s3, 'bar': s2})
+
+ def test_multiindex_na_repr(self):
+ # only an issue with long columns
+
+        df3 = DataFrame({
+            'A' * 30: {('A', 'A0006000', 'nuit'): 'A0006000'},
+            'B' * 30: {('A', 'A0006000', 'nuit'): np.nan},
+            'C' * 30: {('A', 'A0006000', 'nuit'): np.nan},
+            'D' * 30: {('A', 'A0006000', 'nuit'): np.nan},
+            'E' * 30: {('A', 'A0006000', 'nuit'): 'A'},
+            'F' * 30: {('A', 'A0006000', 'nuit'): np.nan},
+        })
+
+ idf = df3.set_index(['A' * 30, 'C' * 30])
+ repr(idf)
+
+ def test_assign_index_sequences(self):
+ # #2200
+ df = DataFrame({"a": [1, 2, 3],
+ "b": [4, 5, 6],
+ "c": [7, 8, 9]}).set_index(["a", "b"])
+ index = list(df.index)
+ index[0] = ("faz", "boo")
+ df.index = index
+ repr(df)
+
+        # a list entry (instead of a tuple) exercises a separate code path
+ index[0] = ["faz", "boo"]
+ df.index = index
+ repr(df)
+
+ def test_tuples_have_na(self):
+ index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
+ codes=[[1, 1, 1, 1, -1, 0, 0, 0],
+ [0, 1, 2, 3, 0, 1, 2, 3]])
+
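+        # a code of -1 marks a missing value, so the first component of
+        # the fifth tuple decodes to NaN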
+ assert isna(index[4][0])
+ assert isna(index.values[4][0])
+
+ def test_duplicate_groupby_issues(self):
+ idx_tp = [('600809', '20061231'), ('600809', '20070331'),
+ ('600809', '20070630'), ('600809', '20070331')]
+ dt = ['demo', 'demo', 'demo', 'demo']
+
+ idx = MultiIndex.from_tuples(idx_tp, names=['STK_ID', 'RPT_Date'])
+ s = Series(dt, index=idx)
+
+ result = s.groupby(s.index).first()
+ assert len(result) == 3
+
+ def test_duplicate_mi(self):
+ # GH 4516
+ df = DataFrame([['foo', 'bar', 1.0, 1], ['foo', 'bar', 2.0, 2],
+ ['bah', 'bam', 3.0, 3],
+ ['bah', 'bam', 4.0, 4], ['foo', 'bar', 5.0, 5],
+ ['bah', 'bam', 6.0, 6]],
+ columns=list('ABCD'))
+ df = df.set_index(['A', 'B'])
+ df = df.sort_index(level=0)
+ expected = DataFrame([['foo', 'bar', 1.0, 1], ['foo', 'bar', 2.0, 2],
+ ['foo', 'bar', 5.0, 5]],
+ columns=list('ABCD')).set_index(['A', 'B'])
+ result = df.loc[('foo', 'bar')]
+ tm.assert_frame_equal(result, expected)
+
+ def test_duplicated_drop_duplicates(self):
+ # GH 4060
+ idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2]))
+
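+        # the pair (1, 1) occurs at positions 0 and 3; `keep` decides which
+        # occurrence is flagged as the duplicate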
+ expected = np.array(
+ [False, False, False, True, False, False], dtype=bool)
+ duplicated = idx.duplicated()
+ tm.assert_numpy_array_equal(duplicated, expected)
+ assert duplicated.dtype == bool
+ expected = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2]))
+ tm.assert_index_equal(idx.drop_duplicates(), expected)
+
+ expected = np.array([True, False, False, False, False, False])
+ duplicated = idx.duplicated(keep='last')
+ tm.assert_numpy_array_equal(duplicated, expected)
+ assert duplicated.dtype == bool
+ expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2]))
+ tm.assert_index_equal(idx.drop_duplicates(keep='last'), expected)
+
+ expected = np.array([True, False, False, True, False, False])
+ duplicated = idx.duplicated(keep=False)
+ tm.assert_numpy_array_equal(duplicated, expected)
+ assert duplicated.dtype == bool
+ expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2]))
+ tm.assert_index_equal(idx.drop_duplicates(keep=False), expected)
+
+ def test_multiindex_set_index(self):
+ # segfault in #3308
+ d = {'t1': [2, 2.5, 3], 't2': [4, 5, 6]}
+ df = DataFrame(d)
+ tuples = [(0, 1), (0, 2), (1, 2)]
+ df['tuples'] = tuples
+
+ index = MultiIndex.from_tuples(df['tuples'])
+ # it works!
+ df.set_index(index)
+
+ def test_datetimeindex(self):
+ idx1 = pd.DatetimeIndex(
+ ['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00'
+ ] * 2, tz='Asia/Tokyo')
+ idx2 = pd.date_range('2010/01/01', periods=6, freq='M',
+ tz='US/Eastern')
+ idx = MultiIndex.from_arrays([idx1, idx2])
+
+ expected1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00',
+ '2013-04-03 9:00'], tz='Asia/Tokyo')
+
+ tm.assert_index_equal(idx.levels[0], expected1)
+ tm.assert_index_equal(idx.levels[1], idx2)
+
+ # from datetime combos
+ # GH 7888
+ date1 = datetime.date.today()
+ date2 = datetime.datetime.today()
+ date3 = Timestamp.today()
+
+ for d1, d2 in itertools.product(
+ [date1, date2, date3], [date1, date2, date3]):
+ index = MultiIndex.from_product([[d1], [d2]])
+ assert isinstance(index.levels[0], pd.DatetimeIndex)
+ assert isinstance(index.levels[1], pd.DatetimeIndex)
+
+ def test_constructor_with_tz(self):
+
+ index = pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'],
+ name='dt1', tz='US/Pacific')
+ columns = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'],
+ name='dt2', tz='Asia/Tokyo')
+
+ result = MultiIndex.from_arrays([index, columns])
+ tm.assert_index_equal(result.levels[0], index)
+ tm.assert_index_equal(result.levels[1], columns)
+
+ result = MultiIndex.from_arrays([Series(index), Series(columns)])
+ tm.assert_index_equal(result.levels[0], index)
+ tm.assert_index_equal(result.levels[1], columns)
+
+ def test_set_index_datetime(self):
+ # GH 3950
+ df = DataFrame(
+ {'label': ['a', 'a', 'a', 'b', 'b', 'b'],
+ 'datetime': ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00', '2011-07-19 07:00:00',
+ '2011-07-19 08:00:00', '2011-07-19 09:00:00'],
+ 'value': range(6)})
+ df.index = pd.to_datetime(df.pop('datetime'), utc=True)
+ df.index = df.index.tz_convert('US/Pacific')
+
+ expected = pd.DatetimeIndex(['2011-07-19 07:00:00',
+ '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00'], name='datetime')
+ expected = expected.tz_localize('UTC').tz_convert('US/Pacific')
+
+ df = df.set_index('label', append=True)
+ tm.assert_index_equal(df.index.levels[0], expected)
+ tm.assert_index_equal(df.index.levels[1],
+ Index(['a', 'b'], name='label'))
+
+ df = df.swaplevel(0, 1)
+ tm.assert_index_equal(df.index.levels[0],
+ Index(['a', 'b'], name='label'))
+ tm.assert_index_equal(df.index.levels[1], expected)
+
+ df = DataFrame(np.random.random(6))
+ idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00', '2011-07-19 07:00:00',
+ '2011-07-19 08:00:00', '2011-07-19 09:00:00'],
+ tz='US/Eastern')
+ idx2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-01 09:00',
+ '2012-04-01 09:00', '2012-04-02 09:00',
+ '2012-04-02 09:00', '2012-04-02 09:00'],
+ tz='US/Eastern')
+ idx3 = pd.date_range('2011-01-01 09:00', periods=6, tz='Asia/Tokyo')
+
+ df = df.set_index(idx1)
+ df = df.set_index(idx2, append=True)
+ df = df.set_index(idx3, append=True)
+
+ expected1 = pd.DatetimeIndex(['2011-07-19 07:00:00',
+ '2011-07-19 08:00:00',
+ '2011-07-19 09:00:00'], tz='US/Eastern')
+ expected2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-02 09:00'],
+ tz='US/Eastern')
+
+ tm.assert_index_equal(df.index.levels[0], expected1)
+ tm.assert_index_equal(df.index.levels[1], expected2)
+ tm.assert_index_equal(df.index.levels[2], idx3)
+
+ # GH 7092
+ tm.assert_index_equal(df.index.get_level_values(0), idx1)
+ tm.assert_index_equal(df.index.get_level_values(1), idx2)
+ tm.assert_index_equal(df.index.get_level_values(2), idx3)
+
+ def test_reset_index_datetime(self):
+ # GH 3950
+ for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
+ idx1 = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz,
+ name='idx1')
+ idx2 = Index(range(5), name='idx2', dtype='int64')
+ idx = MultiIndex.from_arrays([idx1, idx2])
+ df = DataFrame(
+ {'a': np.arange(5, dtype='int64'),
+ 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)
+
+ expected = DataFrame({'idx1': [datetime.datetime(2011, 1, 1),
+ datetime.datetime(2011, 1, 2),
+ datetime.datetime(2011, 1, 3),
+ datetime.datetime(2011, 1, 4),
+ datetime.datetime(2011, 1, 5)],
+ 'idx2': np.arange(5, dtype='int64'),
+ 'a': np.arange(5, dtype='int64'),
+ 'b': ['A', 'B', 'C', 'D', 'E']},
+ columns=['idx1', 'idx2', 'a', 'b'])
+ expected['idx1'] = expected['idx1'].apply(
+ lambda d: Timestamp(d, tz=tz))
+
+ tm.assert_frame_equal(df.reset_index(), expected)
+
+ idx3 = pd.date_range('1/1/2012', periods=5, freq='MS',
+ tz='Europe/Paris', name='idx3')
+ idx = MultiIndex.from_arrays([idx1, idx2, idx3])
+ df = DataFrame(
+ {'a': np.arange(5, dtype='int64'),
+ 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)
+
+ expected = DataFrame({'idx1': [datetime.datetime(2011, 1, 1),
+ datetime.datetime(2011, 1, 2),
+ datetime.datetime(2011, 1, 3),
+ datetime.datetime(2011, 1, 4),
+ datetime.datetime(2011, 1, 5)],
+ 'idx2': np.arange(5, dtype='int64'),
+ 'idx3': [datetime.datetime(2012, 1, 1),
+ datetime.datetime(2012, 2, 1),
+ datetime.datetime(2012, 3, 1),
+ datetime.datetime(2012, 4, 1),
+ datetime.datetime(2012, 5, 1)],
+ 'a': np.arange(5, dtype='int64'),
+ 'b': ['A', 'B', 'C', 'D', 'E']},
+ columns=['idx1', 'idx2', 'idx3', 'a', 'b'])
+ expected['idx1'] = expected['idx1'].apply(
+ lambda d: Timestamp(d, tz=tz))
+ expected['idx3'] = expected['idx3'].apply(
+ lambda d: Timestamp(d, tz='Europe/Paris'))
+ tm.assert_frame_equal(df.reset_index(), expected)
+
+ # GH 7793
+ idx = MultiIndex.from_product([['a', 'b'], pd.date_range(
+ '20130101', periods=3, tz=tz)])
+ df = DataFrame(
+ np.arange(6, dtype='int64').reshape(
+ 6, 1), columns=['a'], index=idx)
+
+ expected = DataFrame({'level_0': 'a a a b b b'.split(),
+ 'level_1': [
+ datetime.datetime(2013, 1, 1),
+ datetime.datetime(2013, 1, 2),
+ datetime.datetime(2013, 1, 3)] * 2,
+ 'a': np.arange(6, dtype='int64')},
+ columns=['level_0', 'level_1', 'a'])
+ expected['level_1'] = expected['level_1'].apply(
+ lambda d: Timestamp(d, freq='D', tz=tz))
+ tm.assert_frame_equal(df.reset_index(), expected)
+
+ def test_reset_index_period(self):
+ # GH 7746
+ idx = MultiIndex.from_product(
+ [pd.period_range('20130101', periods=3, freq='M'), list('abc')],
+ names=['month', 'feature'])
+
+ df = DataFrame(np.arange(9, dtype='int64').reshape(-1, 1),
+ index=idx, columns=['a'])
+ expected = DataFrame({
+ 'month': ([pd.Period('2013-01', freq='M')] * 3 +
+ [pd.Period('2013-02', freq='M')] * 3 +
+ [pd.Period('2013-03', freq='M')] * 3),
+ 'feature': ['a', 'b', 'c'] * 3,
+ 'a': np.arange(9, dtype='int64')
+ }, columns=['month', 'feature', 'a'])
+ tm.assert_frame_equal(df.reset_index(), expected)
+
+ def test_reset_index_multiindex_columns(self):
+ levels = [['A', ''], ['B', 'b']]
+ df = DataFrame([[0, 2], [1, 3]],
+ columns=MultiIndex.from_tuples(levels))
+ result = df[['B']].rename_axis('A').reset_index()
+ tm.assert_frame_equal(result, df)
+
+ # gh-16120: already existing column
+ with pytest.raises(ValueError,
+ match=(r"cannot insert \('A', ''\), "
+ "already exists")):
+ df.rename_axis('A').reset_index()
+
+ # gh-16164: multiindex (tuple) full key
+ result = df.set_index([('A', '')]).reset_index()
+ tm.assert_frame_equal(result, df)
+
+ # with additional (unnamed) index level
+ idx_col = DataFrame([[0], [1]],
+ columns=MultiIndex.from_tuples([('level_0', '')]))
+ expected = pd.concat([idx_col, df[[('B', 'b'), ('A', '')]]], axis=1)
+ result = df.set_index([('B', 'b')], append=True).reset_index()
+ tm.assert_frame_equal(result, expected)
+
+ # with index name which is a too long tuple...
+ with pytest.raises(ValueError,
+ match=("Item must have length equal "
+ "to number of levels.")):
+ df.rename_axis([('C', 'c', 'i')]).reset_index()
+
+ # or too short...
+ levels = [['A', 'a', ''], ['B', 'b', 'i']]
+ df2 = DataFrame([[0, 2], [1, 3]],
+ columns=MultiIndex.from_tuples(levels))
+ idx_col = DataFrame([[0], [1]],
+ columns=MultiIndex.from_tuples([('C', 'c', 'ii')]))
+ expected = pd.concat([idx_col, df2], axis=1)
+ result = df2.rename_axis([('C', 'c')]).reset_index(col_fill='ii')
+ tm.assert_frame_equal(result, expected)
+
+ # ... which is incompatible with col_fill=None
+ with pytest.raises(ValueError,
+ match=("col_fill=None is incompatible with "
+ r"incomplete column name \('C', 'c'\)")):
+ df2.rename_axis([('C', 'c')]).reset_index(col_fill=None)
+
+ # with col_level != 0
+ result = df2.rename_axis([('c', 'ii')]).reset_index(col_level=1,
+ col_fill='C')
+ tm.assert_frame_equal(result, expected)
+
+ def test_set_index_period(self):
+ # GH 6631
+ df = DataFrame(np.random.random(6))
+ idx1 = pd.period_range('2011-01-01', periods=3, freq='M')
+ idx1 = idx1.append(idx1)
+ idx2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H')
+ idx2 = idx2.append(idx2).append(idx2)
+ idx3 = pd.period_range('2005', periods=6, freq='A')
+
+ df = df.set_index(idx1)
+ df = df.set_index(idx2, append=True)
+ df = df.set_index(idx3, append=True)
+
+ expected1 = pd.period_range('2011-01-01', periods=3, freq='M')
+ expected2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H')
+
+ tm.assert_index_equal(df.index.levels[0], expected1)
+ tm.assert_index_equal(df.index.levels[1], expected2)
+ tm.assert_index_equal(df.index.levels[2], idx3)
+
+ tm.assert_index_equal(df.index.get_level_values(0), idx1)
+ tm.assert_index_equal(df.index.get_level_values(1), idx2)
+ tm.assert_index_equal(df.index.get_level_values(2), idx3)
+
+ def test_repeat(self):
+ # GH 9361
+ # fixed by # GH 7891
+ m_idx = MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6), (7, 8)])
+ data = ['a', 'b', 'c', 'd']
+ m_df = Series(data, index=m_idx)
+ assert m_df.repeat(3).shape == (3 * len(data), )
+
+
+class TestSorted(Base):
+ """ everything you wanted to test about sorting """
+
+ def test_sort_index_preserve_levels(self):
+ result = self.frame.sort_index()
+ assert result.index.names == self.frame.index.names
+
+ def test_sorting_repr_8017(self):
+
+ np.random.seed(0)
+ data = np.random.randn(3, 4)
+
+ for gen, extra in [([1., 3., 2., 5.], 4.), ([1, 3, 2, 5], 4),
+ ([Timestamp('20130101'), Timestamp('20130103'),
+ Timestamp('20130102'), Timestamp('20130105')],
+ Timestamp('20130104')),
+ (['1one', '3one', '2one', '5one'], '4one')]:
+ columns = MultiIndex.from_tuples([('red', i) for i in gen])
+ df = DataFrame(data, index=list('def'), columns=columns)
+ df2 = pd.concat([df,
+ DataFrame('world', index=list('def'),
+ columns=MultiIndex.from_tuples(
+ [('red', extra)]))], axis=1)
+
+ # check that the repr is good
+ # make sure that we have a correct sparsified repr
+            # e.g. only one 'red' header in the sparsified output
+ assert str(df2).splitlines()[0].split() == ['red']
+
+ # GH 8017
+ # sorting fails after columns added
+
+ # construct single-dtype then sort
+ result = df.copy().sort_index(axis=1)
+ expected = df.iloc[:, [0, 2, 1, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df2.sort_index(axis=1)
+ expected = df2.iloc[:, [0, 2, 1, 4, 3]]
+ tm.assert_frame_equal(result, expected)
+
+ # setitem then sort
+ result = df.copy()
+ result[('red', extra)] = 'world'
+
+ result = result.sort_index(axis=1)
+ tm.assert_frame_equal(result, expected)
+
+ def test_sort_index_level(self):
+ df = self.frame.copy()
+ df.index = np.arange(len(df))
+
+ # axis=1
+
+ # series
+ a_sorted = self.frame['A'].sort_index(level=0)
+
+ # preserve names
+ assert a_sorted.index.names == self.frame.index.names
+
+ # inplace
+ rs = self.frame.copy()
+ rs.sort_index(level=0, inplace=True)
+ tm.assert_frame_equal(rs, self.frame.sort_index(level=0))
+
+ def test_sort_index_level_large_cardinality(self):
+
+ # #2684 (int64)
+ index = MultiIndex.from_arrays([np.arange(4000)] * 3)
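+        # with 4000 distinct values on each of three levels there are
+        # 4000 ** 3 = 6.4e10 possible keys, well past the int32 maximum
+        # of ~2.1e9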
+ df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64)
+
+ # it works!
+ result = df.sort_index(level=0)
+ assert result.index.lexsort_depth == 3
+
+ # #2684 (int32)
+ index = MultiIndex.from_arrays([np.arange(4000)] * 3)
+ df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32)
+
+ # it works!
+ result = df.sort_index(level=0)
+ assert (result.dtypes.values == df.dtypes.values).all()
+ assert result.index.lexsort_depth == 3
+
+ def test_sort_index_level_by_name(self):
+ self.frame.index.names = ['first', 'second']
+ result = self.frame.sort_index(level='second')
+ expected = self.frame.sort_index(level=1)
+ tm.assert_frame_equal(result, expected)
+
+ def test_sort_index_level_mixed(self):
+ sorted_before = self.frame.sort_index(level=1)
+
+ df = self.frame.copy()
+ df['foo'] = 'bar'
+ sorted_after = df.sort_index(level=1)
+ tm.assert_frame_equal(sorted_before,
+ sorted_after.drop(['foo'], axis=1))
+
+ dft = self.frame.T
+ sorted_before = dft.sort_index(level=1, axis=1)
+ dft['foo', 'three'] = 'bar'
+
+ sorted_after = dft.sort_index(level=1, axis=1)
+ tm.assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1),
+ sorted_after.drop([('foo', 'three')], axis=1))
+
+ def test_is_lexsorted(self):
+ levels = [[0, 1], [0, 1, 2]]
+
+ index = MultiIndex(levels=levels,
+ codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
+ assert index.is_lexsorted()
+
+ index = MultiIndex(levels=levels,
+ codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]])
+ assert not index.is_lexsorted()
+
+ index = MultiIndex(levels=levels,
+ codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]])
+ assert not index.is_lexsorted()
+ assert index.lexsort_depth == 0
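+        # For illustration (not asserted here): lexsort_depth counts how many
+        # leading levels are sorted, so codes [[0, 0, 1, 1], [1, 0, 0, 1]]
+        # would have depth 1 -- level 0 is sorted, but level 1 is not sorted
+        # within the level-0 groups.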
+
+ def test_sort_index_and_reconstruction(self):
+
+ # 15622
+ # lexsortedness should be identical
+        # across MultiIndex construction methods
+
+ df = DataFrame([[1, 1], [2, 2]], index=list('ab'))
+ expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]],
+ index=MultiIndex.from_tuples([(0.5, 'a'),
+ (0.5, 'b'),
+ (0.8, 'a'),
+ (0.8, 'b')]))
+ assert expected.index.is_lexsorted()
+
+ result = DataFrame(
+ [[1, 1], [2, 2], [1, 1], [2, 2]],
+ index=MultiIndex.from_product([[0.5, 0.8], list('ab')]))
+ result = result.sort_index()
+ assert result.index.is_lexsorted()
+ assert result.index.is_monotonic
+
+ tm.assert_frame_equal(result, expected)
+
+ result = DataFrame(
+ [[1, 1], [2, 2], [1, 1], [2, 2]],
+ index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))
+ result = result.sort_index()
+ assert result.index.is_lexsorted()
+
+ tm.assert_frame_equal(result, expected)
+
+ concatted = pd.concat([df, df], keys=[0.8, 0.5])
+ result = concatted.sort_index()
+
+ assert result.index.is_lexsorted()
+ assert result.index.is_monotonic
+
+ tm.assert_frame_equal(result, expected)
+
+ # 14015
+ df = DataFrame([[1, 2], [6, 7]],
+ columns=MultiIndex.from_tuples(
+ [(0, '20160811 12:00:00'),
+ (0, '20160809 12:00:00')],
+ names=['l1', 'Date']))
+
+ df.columns.set_levels(pd.to_datetime(df.columns.levels[1]),
+ level=1,
+ inplace=True)
+ assert not df.columns.is_lexsorted()
+ assert not df.columns.is_monotonic
+ result = df.sort_index(axis=1)
+ assert result.columns.is_lexsorted()
+ assert result.columns.is_monotonic
+ result = df.sort_index(axis=1, level=1)
+ assert result.columns.is_lexsorted()
+ assert result.columns.is_monotonic
+
+ def test_sort_index_and_reconstruction_doc_example(self):
+ # doc example
+ df = DataFrame({'value': [1, 2, 3, 4]},
+ index=MultiIndex(
+ levels=[['a', 'b'], ['bb', 'aa']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))
+ assert df.index.is_lexsorted()
+ assert not df.index.is_monotonic
+
+ # sort it
+ expected = DataFrame({'value': [2, 1, 4, 3]},
+ index=MultiIndex(
+ levels=[['a', 'b'], ['aa', 'bb']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))
+ result = df.sort_index()
+ assert result.index.is_lexsorted()
+ assert result.index.is_monotonic
+
+ tm.assert_frame_equal(result, expected)
+
+ # reconstruct
+ result = df.sort_index().copy()
+ result.index = result.index._sort_levels_monotonic()
+ assert result.index.is_lexsorted()
+ assert result.index.is_monotonic
+
+ tm.assert_frame_equal(result, expected)
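+        # Note: _sort_levels_monotonic() is the reconstruction step used
+        # above -- it stores each level's values in sorted order and remaps
+        # the codes to match, leaving the row order itself unchanged.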
+
+ def test_sort_index_reorder_on_ops(self):
+ # 15687
+ df = DataFrame(
+ np.random.randn(8, 2),
+ index=MultiIndex.from_product(
+ [['a', 'b'], ['big', 'small'], ['red', 'blu']],
+ names=['letter', 'size', 'color']),
+ columns=['near', 'far'])
+ df = df.sort_index()
+
+ def my_func(group):
+ group.index = ['newz', 'newa']
+ return group
+
+ result = df.groupby(level=['letter', 'size']).apply(
+ my_func).sort_index()
+ expected = MultiIndex.from_product(
+ [['a', 'b'], ['big', 'small'], ['newa', 'newz']],
+ names=['letter', 'size', None])
+
+ tm.assert_index_equal(result.index, expected)
+
+ def test_sort_non_lexsorted(self):
+ # degenerate case where we sort but don't
+ # have a satisfying result :<
+ # GH 15797
+ idx = MultiIndex([['A', 'B', 'C'],
+ ['c', 'b', 'a']],
+ [[0, 1, 2, 0, 1, 2],
+ [0, 2, 1, 1, 0, 2]])
+
+ df = DataFrame({'col': range(len(idx))},
+ index=idx,
+ dtype='int64')
+ assert df.index.is_lexsorted() is False
+ assert df.index.is_monotonic is False
+
+        sorted_df = df.sort_index()
+        assert sorted_df.index.is_lexsorted() is True
+        assert sorted_df.index.is_monotonic is True
+
+ expected = DataFrame(
+ {'col': [1, 4, 5, 2]},
+ index=MultiIndex.from_tuples([('B', 'a'), ('B', 'c'),
+ ('C', 'a'), ('C', 'b')]),
+ dtype='int64')
+        result = sorted_df.loc[pd.IndexSlice['B':'C', 'a':'c'], :]
+ tm.assert_frame_equal(result, expected)
+
+ def test_sort_index_nan(self):
+ # GH 14784
+ # incorrect sorting w.r.t. nans
+ tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]]
+ mi = MultiIndex.from_tuples(tuples)
+
+ df = DataFrame(np.arange(16).reshape(4, 4),
+ index=mi, columns=list('ABCD'))
+ s = Series(np.arange(4), index=mi)
+
+ df2 = DataFrame({
+ 'date': pd.to_datetime([
+ '20121002', '20121007', '20130130', '20130202', '20130305',
+ '20121002', '20121207', '20130130', '20130202', '20130305',
+ '20130202', '20130305'
+ ]),
+ 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
+ 'whole_cost': [1790, np.nan, 280, 259, np.nan, 623, 90, 312,
+ np.nan, 301, 359, 801],
+ 'cost': [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12]
+ }).set_index(['date', 'user_id'])
+
+ # sorting frame, default nan position is last
+ result = df.sort_index()
+ expected = df.iloc[[3, 0, 2, 1], :]
+ tm.assert_frame_equal(result, expected)
+
+ # sorting frame, nan position last
+ result = df.sort_index(na_position='last')
+ expected = df.iloc[[3, 0, 2, 1], :]
+ tm.assert_frame_equal(result, expected)
+
+ # sorting frame, nan position first
+ result = df.sort_index(na_position='first')
+ expected = df.iloc[[1, 2, 3, 0], :]
+ tm.assert_frame_equal(result, expected)
+
+ # sorting frame with removed rows
+ result = df2.dropna().sort_index()
+ expected = df2.sort_index().dropna()
+ tm.assert_frame_equal(result, expected)
+
+ # sorting series, default nan position is last
+ result = s.sort_index()
+ expected = s.iloc[[3, 0, 2, 1]]
+ tm.assert_series_equal(result, expected)
+
+ # sorting series, nan position last
+ result = s.sort_index(na_position='last')
+ expected = s.iloc[[3, 0, 2, 1]]
+ tm.assert_series_equal(result, expected)
+
+ # sorting series, nan position first
+ result = s.sort_index(na_position='first')
+ expected = s.iloc[[1, 2, 3, 0]]
+ tm.assert_series_equal(result, expected)
+
+ def test_sort_ascending_list(self):
+ # GH: 16934
+
+ # Set up a Series with a three level MultiIndex
+ arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+ ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
+ [4, 3, 2, 1, 4, 3, 2, 1]]
+ tuples = lzip(*arrays)
+ mi = MultiIndex.from_tuples(tuples, names=['first', 'second', 'third'])
+ s = Series(range(8), index=mi)
+
+ # Sort with boolean ascending
+ result = s.sort_index(level=['third', 'first'], ascending=False)
+ expected = s.iloc[[4, 0, 5, 1, 6, 2, 7, 3]]
+ tm.assert_series_equal(result, expected)
+
+ # Sort with list of boolean ascending
+ result = s.sort_index(level=['third', 'first'],
+ ascending=[False, True])
+ expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]]
+ tm.assert_series_equal(result, expected)
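+        # With a scalar `ascending` every listed level is sorted the same
+        # way; with a list, ascending[i] applies to level[i] ('third', then
+        # 'first'), which is why the two row orders above differ.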
diff --git a/contrib/python/pandas/py2/pandas/tests/test_nanops.py b/contrib/python/pandas/py2/pandas/tests/test_nanops.py
new file mode 100644
index 00000000000..cf5ef6cf15e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_nanops.py
@@ -0,0 +1,1059 @@
+# -*- coding: utf-8 -*-
+from __future__ import division, print_function
+
+from functools import partial
+import warnings
+
+import numpy as np
+import pytest
+
+from pandas.compat.numpy import _np_version_under1p13
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.common import is_integer_dtype
+
+import pandas as pd
+from pandas import Series, isna
+from pandas.core.arrays import DatetimeArray
+import pandas.core.nanops as nanops
+import pandas.util.testing as tm
+
+use_bn = nanops._USE_BOTTLENECK
+
+
+class TestnanopsDataFrame(object):
+
+ def setup_method(self, method):
+ np.random.seed(11235)
+ nanops._USE_BOTTLENECK = False
+
+ arr_shape = (11, 7, 5)
+
+ self.arr_float = np.random.randn(*arr_shape)
+ self.arr_float1 = np.random.randn(*arr_shape)
+ self.arr_complex = self.arr_float + self.arr_float1 * 1j
+ self.arr_int = np.random.randint(-10, 10, arr_shape)
+ self.arr_bool = np.random.randint(0, 2, arr_shape) == 0
+ self.arr_str = np.abs(self.arr_float).astype('S')
+ self.arr_utf = np.abs(self.arr_float).astype('U')
+ self.arr_date = np.random.randint(0, 20000,
+ arr_shape).astype('M8[ns]')
+ self.arr_tdelta = np.random.randint(0, 20000,
+ arr_shape).astype('m8[ns]')
+
+ self.arr_nan = np.tile(np.nan, arr_shape)
+ self.arr_float_nan = np.vstack([self.arr_float, self.arr_nan])
+ self.arr_float1_nan = np.vstack([self.arr_float1, self.arr_nan])
+ self.arr_nan_float1 = np.vstack([self.arr_nan, self.arr_float1])
+ self.arr_nan_nan = np.vstack([self.arr_nan, self.arr_nan])
+
+ self.arr_inf = self.arr_float * np.inf
+ self.arr_float_inf = np.vstack([self.arr_float, self.arr_inf])
+
+ self.arr_nan_inf = np.vstack([self.arr_nan, self.arr_inf])
+ self.arr_float_nan_inf = np.vstack([self.arr_float, self.arr_nan,
+ self.arr_inf])
+ self.arr_nan_nan_inf = np.vstack([self.arr_nan, self.arr_nan,
+ self.arr_inf])
+ self.arr_obj = np.vstack([
+ self.arr_float.astype('O'),
+ self.arr_int.astype('O'),
+ self.arr_bool.astype('O'),
+ self.arr_complex.astype('O'),
+ self.arr_str.astype('O'),
+ self.arr_utf.astype('O'),
+ self.arr_date.astype('O'),
+ self.arr_tdelta.astype('O')
+ ])
+
+ with np.errstate(invalid='ignore'):
+ self.arr_nan_nanj = self.arr_nan + self.arr_nan * 1j
+ self.arr_complex_nan = np.vstack([self.arr_complex,
+ self.arr_nan_nanj])
+
+ self.arr_nan_infj = self.arr_inf * 1j
+ self.arr_complex_nan_infj = np.vstack([self.arr_complex,
+ self.arr_nan_infj])
+
+ self.arr_float_2d = self.arr_float[:, :, 0]
+ self.arr_float1_2d = self.arr_float1[:, :, 0]
+
+ self.arr_nan_2d = self.arr_nan[:, :, 0]
+ self.arr_float_nan_2d = self.arr_float_nan[:, :, 0]
+ self.arr_float1_nan_2d = self.arr_float1_nan[:, :, 0]
+ self.arr_nan_float1_2d = self.arr_nan_float1[:, :, 0]
+
+ self.arr_float_1d = self.arr_float[:, 0, 0]
+ self.arr_float1_1d = self.arr_float1[:, 0, 0]
+
+ self.arr_nan_1d = self.arr_nan[:, 0, 0]
+ self.arr_float_nan_1d = self.arr_float_nan[:, 0, 0]
+ self.arr_float1_nan_1d = self.arr_float1_nan[:, 0, 0]
+ self.arr_nan_float1_1d = self.arr_nan_float1[:, 0, 0]
+
+ def teardown_method(self, method):
+ nanops._USE_BOTTLENECK = use_bn
+
+ def check_results(self, targ, res, axis, check_dtype=True):
+ res = getattr(res, 'asm8', res)
+ res = getattr(res, 'values', res)
+
+ # timedeltas are a beast here
+ def _coerce_tds(targ, res):
+ if hasattr(targ, 'dtype') and targ.dtype == 'm8[ns]':
+ if len(targ) == 1:
+ targ = targ[0].item()
+ res = res.item()
+ else:
+ targ = targ.view('i8')
+ return targ, res
+
+ try:
+ if axis != 0 and hasattr(
+ targ, 'shape') and targ.ndim and targ.shape != res.shape:
+ res = np.split(res, [targ.shape[0]], axis=0)[0]
+ except (ValueError, IndexError):
+ targ, res = _coerce_tds(targ, res)
+
+ try:
+ tm.assert_almost_equal(targ, res, check_dtype=check_dtype)
+ except AssertionError:
+
+ # handle timedelta dtypes
+ if hasattr(targ, 'dtype') and targ.dtype == 'm8[ns]':
+ targ, res = _coerce_tds(targ, res)
+ tm.assert_almost_equal(targ, res, check_dtype=check_dtype)
+ return
+
+ # There are sometimes rounding errors with
+ # complex and object dtypes.
+ # If it isn't one of those, re-raise the error.
+ if not hasattr(res, 'dtype') or res.dtype.kind not in ['c', 'O']:
+ raise
+ # convert object dtypes to something that can be split into
+ # real and imaginary parts
+ if res.dtype.kind == 'O':
+ if targ.dtype.kind != 'O':
+ res = res.astype(targ.dtype)
+ else:
+ try:
+ res = res.astype('c16')
+ except RuntimeError:
+ res = res.astype('f8')
+ try:
+ targ = targ.astype('c16')
+ except RuntimeError:
+ targ = targ.astype('f8')
+ # there should never be a case where numpy returns an object
+ # but nanops doesn't, so make that an exception
+ elif targ.dtype.kind == 'O':
+ raise
+ tm.assert_almost_equal(targ.real, res.real,
+ check_dtype=check_dtype)
+ tm.assert_almost_equal(targ.imag, res.imag,
+ check_dtype=check_dtype)
+
+ def check_fun_data(self, testfunc, targfunc, testarval, targarval,
+ targarnanval, check_dtype=True, empty_targfunc=None,
+ **kwargs):
+ for axis in list(range(targarval.ndim)) + [None]:
+ for skipna in [False, True]:
+ targartempval = targarval if skipna else targarnanval
+ if skipna and empty_targfunc and isna(targartempval).all():
+ targ = empty_targfunc(targartempval, axis=axis, **kwargs)
+ else:
+ targ = targfunc(targartempval, axis=axis, **kwargs)
+
+ try:
+ res = testfunc(testarval, axis=axis, skipna=skipna,
+ **kwargs)
+ self.check_results(targ, res, axis,
+ check_dtype=check_dtype)
+ if skipna:
+ res = testfunc(testarval, axis=axis, **kwargs)
+ self.check_results(targ, res, axis,
+ check_dtype=check_dtype)
+ if axis is None:
+ res = testfunc(testarval, skipna=skipna, **kwargs)
+ self.check_results(targ, res, axis,
+ check_dtype=check_dtype)
+ if skipna and axis is None:
+ res = testfunc(testarval, **kwargs)
+ self.check_results(targ, res, axis,
+ check_dtype=check_dtype)
+ except BaseException as exc:
+ exc.args += ('axis: %s of %s' % (axis, testarval.ndim - 1),
+ 'skipna: %s' % skipna, 'kwargs: %s' % kwargs)
+ raise
+
+ if testarval.ndim <= 1:
+ return
+
+ try:
+ testarval2 = np.take(testarval, 0, axis=-1)
+ targarval2 = np.take(targarval, 0, axis=-1)
+ targarnanval2 = np.take(targarnanval, 0, axis=-1)
+ except ValueError:
+ return
+ self.check_fun_data(testfunc, targfunc, testarval2, targarval2,
+ targarnanval2, check_dtype=check_dtype,
+ empty_targfunc=empty_targfunc, **kwargs)
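+        # The recursive call above re-runs the same checks on a slice with
+        # one fewer dimension (index 0 along the last axis), so each
+        # reduction is exercised on 3-d, 2-d and 1-d inputs in turn.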
+
+ def check_fun(self, testfunc, targfunc, testar, targar=None,
+ targarnan=None, empty_targfunc=None, **kwargs):
+ if targar is None:
+ targar = testar
+ if targarnan is None:
+ targarnan = testar
+ testarval = getattr(self, testar)
+ targarval = getattr(self, targar)
+ targarnanval = getattr(self, targarnan)
+ try:
+ self.check_fun_data(testfunc, targfunc, testarval, targarval,
+ targarnanval, empty_targfunc=empty_targfunc,
+ **kwargs)
+ except BaseException as exc:
+ exc.args += ('testar: %s' % testar, 'targar: %s' % targar,
+ 'targarnan: %s' % targarnan)
+ raise
+
+ def check_funs(self, testfunc, targfunc, allow_complex=True,
+ allow_all_nan=True, allow_str=True, allow_date=True,
+ allow_tdelta=True, allow_obj=True, **kwargs):
+ self.check_fun(testfunc, targfunc, 'arr_float', **kwargs)
+ self.check_fun(testfunc, targfunc, 'arr_float_nan', 'arr_float',
+ **kwargs)
+ self.check_fun(testfunc, targfunc, 'arr_int', **kwargs)
+ self.check_fun(testfunc, targfunc, 'arr_bool', **kwargs)
+ objs = [self.arr_float.astype('O'), self.arr_int.astype('O'),
+ self.arr_bool.astype('O')]
+
+ if allow_all_nan:
+ self.check_fun(testfunc, targfunc, 'arr_nan', **kwargs)
+
+ if allow_complex:
+ self.check_fun(testfunc, targfunc, 'arr_complex', **kwargs)
+ self.check_fun(testfunc, targfunc, 'arr_complex_nan',
+ 'arr_complex', **kwargs)
+ if allow_all_nan:
+ self.check_fun(testfunc, targfunc, 'arr_nan_nanj', **kwargs)
+ objs += [self.arr_complex.astype('O')]
+
+ if allow_str:
+ self.check_fun(testfunc, targfunc, 'arr_str', **kwargs)
+ self.check_fun(testfunc, targfunc, 'arr_utf', **kwargs)
+ objs += [self.arr_str.astype('O'), self.arr_utf.astype('O')]
+
+ if allow_date:
+ try:
+ targfunc(self.arr_date)
+ except TypeError:
+ pass
+ else:
+ self.check_fun(testfunc, targfunc, 'arr_date', **kwargs)
+ objs += [self.arr_date.astype('O')]
+
+ if allow_tdelta:
+ try:
+ targfunc(self.arr_tdelta)
+ except TypeError:
+ pass
+ else:
+ self.check_fun(testfunc, targfunc, 'arr_tdelta', **kwargs)
+ objs += [self.arr_tdelta.astype('O')]
+
+ if allow_obj:
+ self.arr_obj = np.vstack(objs)
+ # some nanops handle object dtypes better than their numpy
+ # counterparts, so the numpy functions need to be given something
+ # else
+ if allow_obj == 'convert':
+ targfunc = partial(self._badobj_wrap, func=targfunc,
+ allow_complex=allow_complex)
+ self.check_fun(testfunc, targfunc, 'arr_obj', **kwargs)
+
+ def _badobj_wrap(self, value, func, allow_complex=True, **kwargs):
+ if value.dtype.kind == 'O':
+ if allow_complex:
+ value = value.astype('c16')
+ else:
+ value = value.astype('f8')
+ return func(value, **kwargs)
+
+ def test_nanany(self):
+ self.check_funs(nanops.nanany, np.any, allow_all_nan=False,
+ allow_str=False, allow_date=False, allow_tdelta=False)
+
+ def test_nanall(self):
+ self.check_funs(nanops.nanall, np.all, allow_all_nan=False,
+ allow_str=False, allow_date=False, allow_tdelta=False)
+
+ def test_nansum(self):
+ self.check_funs(nanops.nansum, np.sum, allow_str=False,
+ allow_date=False, allow_tdelta=True, check_dtype=False,
+ empty_targfunc=np.nansum)
+
+ def test_nanmean(self):
+ self.check_funs(nanops.nanmean, np.mean, allow_complex=False,
+ allow_obj=False, allow_str=False, allow_date=False,
+ allow_tdelta=True)
+
+ def test_nanmean_overflow(self):
+ # GH 10155
+        # In the previous implementation, mean could overflow for int dtypes;
+        # it is now consistent with numpy
+
+ for a in [2 ** 55, -2 ** 55, 20150515061816532]:
+ s = Series(a, index=range(500), dtype=np.int64)
+ result = s.mean()
+ np_result = s.values.mean()
+ assert result == a
+ assert result == np_result
+ assert result.dtype == np.float64
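+        # For illustration: 500 * 2**55 is ~1.8e19, past the int64 maximum
+        # of ~9.2e18, so an integer sum-then-divide mean would wrap around;
+        # accumulating in float64, as numpy does, keeps these cases exact.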
+
+ def test_returned_dtype(self):
+
+ dtypes = [np.int16, np.int32, np.int64, np.float32, np.float64]
+ if hasattr(np, 'float128'):
+ dtypes.append(np.float128)
+
+ for dtype in dtypes:
+ s = Series(range(10), dtype=dtype)
+ group_a = ['mean', 'std', 'var', 'skew', 'kurt']
+ group_b = ['min', 'max']
+ for method in group_a + group_b:
+ result = getattr(s, method)()
+ if is_integer_dtype(dtype) and method in group_a:
+ assert result.dtype == np.float64
+ else:
+ assert result.dtype == dtype
+
+ def test_nanmedian(self):
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", RuntimeWarning)
+ self.check_funs(nanops.nanmedian, np.median, allow_complex=False,
+ allow_str=False, allow_date=False,
+ allow_tdelta=True, allow_obj='convert')
+
+ @pytest.mark.parametrize('ddof', range(3))
+ def test_nanvar(self, ddof):
+ self.check_funs(nanops.nanvar, np.var, allow_complex=False,
+ allow_str=False, allow_date=False,
+ allow_tdelta=True, allow_obj='convert', ddof=ddof)
+
+ @pytest.mark.parametrize('ddof', range(3))
+ def test_nanstd(self, ddof):
+ self.check_funs(nanops.nanstd, np.std, allow_complex=False,
+ allow_str=False, allow_date=False,
+ allow_tdelta=True, allow_obj='convert', ddof=ddof)
+
+ @td.skip_if_no('scipy', min_version='0.17.0')
+ @pytest.mark.parametrize('ddof', range(3))
+ def test_nansem(self, ddof):
+ from scipy.stats import sem
+ with np.errstate(invalid='ignore'):
+ self.check_funs(nanops.nansem, sem, allow_complex=False,
+ allow_str=False, allow_date=False,
+ allow_tdelta=False, allow_obj='convert', ddof=ddof)
+
+ def _minmax_wrap(self, value, axis=None, func=None):
+
+ # numpy warns if all nan
+ res = func(value, axis)
+ if res.dtype.kind == 'm':
+ res = np.atleast_1d(res)
+ return res
+
+ def test_nanmin(self):
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", RuntimeWarning)
+ func = partial(self._minmax_wrap, func=np.min)
+ self.check_funs(nanops.nanmin, func,
+ allow_str=False, allow_obj=False)
+
+ def test_nanmax(self):
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore", RuntimeWarning)
+ func = partial(self._minmax_wrap, func=np.max)
+ self.check_funs(nanops.nanmax, func,
+ allow_str=False, allow_obj=False)
+
+ def _argminmax_wrap(self, value, axis=None, func=None):
+ res = func(value, axis)
+ nans = np.min(value, axis)
+ nullnan = isna(nans)
+ if res.ndim:
+ res[nullnan] = -1
+ elif (hasattr(nullnan, 'all') and nullnan.all() or
+ not hasattr(nullnan, 'all') and nullnan):
+ res = -1
+ return res
+
+ def test_nanargmax(self):
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", RuntimeWarning)
+ func = partial(self._argminmax_wrap, func=np.argmax)
+ self.check_funs(nanops.nanargmax, func,
+ allow_str=False, allow_obj=False,
+ allow_date=True, allow_tdelta=True)
+
+ def test_nanargmin(self):
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", RuntimeWarning)
+ func = partial(self._argminmax_wrap, func=np.argmin)
+ self.check_funs(nanops.nanargmin, func, allow_str=False,
+ allow_obj=False)
+
+ def _skew_kurt_wrap(self, values, axis=None, func=None):
+        if not issubclass(values.dtype.type, np.floating):
+ values = values.astype('f8')
+ result = func(values, axis=axis, bias=False)
+ # fix for handling cases where all elements in an axis are the same
+ if isinstance(result, np.ndarray):
+ result[np.max(values, axis=axis) == np.min(values, axis=axis)] = 0
+ return result
+ elif np.max(values) == np.min(values):
+ return 0.
+ return result
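+        # scipy's bias-corrected skew/kurt divides by a zero variance for
+        # constant input, so the wrapper pins those entries to 0 to match
+        # the nanops convention for constant series.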
+
+ @td.skip_if_no('scipy', min_version='0.17.0')
+ def test_nanskew(self):
+ from scipy.stats import skew
+ func = partial(self._skew_kurt_wrap, func=skew)
+ with np.errstate(invalid='ignore'):
+ self.check_funs(nanops.nanskew, func, allow_complex=False,
+ allow_str=False, allow_date=False,
+ allow_tdelta=False)
+
+ @td.skip_if_no('scipy', min_version='0.17.0')
+ def test_nankurt(self):
+ from scipy.stats import kurtosis
+ func1 = partial(kurtosis, fisher=True)
+ func = partial(self._skew_kurt_wrap, func=func1)
+ with np.errstate(invalid='ignore'):
+ self.check_funs(nanops.nankurt, func, allow_complex=False,
+ allow_str=False, allow_date=False,
+ allow_tdelta=False)
+
+ def test_nanprod(self):
+ self.check_funs(nanops.nanprod, np.prod, allow_str=False,
+ allow_date=False, allow_tdelta=False,
+ empty_targfunc=np.nanprod)
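+        # empty_targfunc=np.nanprod covers the all-NaN + skipna case: the
+        # target then reduces an effectively empty array, whose product is
+        # the identity 1 rather than NaN.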
+
+ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs):
+ res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs)
+ res01 = checkfun(self.arr_float_2d, self.arr_float1_2d,
+ min_periods=len(self.arr_float_2d) - 1, **kwargs)
+ tm.assert_almost_equal(targ0, res00)
+ tm.assert_almost_equal(targ0, res01)
+
+ res10 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d,
+ **kwargs)
+ res11 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d,
+ min_periods=len(self.arr_float_2d) - 1, **kwargs)
+ tm.assert_almost_equal(targ1, res10)
+ tm.assert_almost_equal(targ1, res11)
+
+ targ2 = np.nan
+ res20 = checkfun(self.arr_nan_2d, self.arr_float1_2d, **kwargs)
+ res21 = checkfun(self.arr_float_2d, self.arr_nan_2d, **kwargs)
+ res22 = checkfun(self.arr_nan_2d, self.arr_nan_2d, **kwargs)
+ res23 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d,
+ **kwargs)
+ res24 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d,
+ min_periods=len(self.arr_float_2d) - 1, **kwargs)
+ res25 = checkfun(self.arr_float_2d, self.arr_float1_2d,
+ min_periods=len(self.arr_float_2d) + 1, **kwargs)
+ tm.assert_almost_equal(targ2, res20)
+ tm.assert_almost_equal(targ2, res21)
+ tm.assert_almost_equal(targ2, res22)
+ tm.assert_almost_equal(targ2, res23)
+ tm.assert_almost_equal(targ2, res24)
+ tm.assert_almost_equal(targ2, res25)
+
+ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs):
+ res00 = checkfun(self.arr_float_1d, self.arr_float1_1d, **kwargs)
+ res01 = checkfun(self.arr_float_1d, self.arr_float1_1d,
+ min_periods=len(self.arr_float_1d) - 1, **kwargs)
+ tm.assert_almost_equal(targ0, res00)
+ tm.assert_almost_equal(targ0, res01)
+
+ res10 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d,
+ **kwargs)
+ res11 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d,
+ min_periods=len(self.arr_float_1d) - 1, **kwargs)
+ tm.assert_almost_equal(targ1, res10)
+ tm.assert_almost_equal(targ1, res11)
+
+ targ2 = np.nan
+ res20 = checkfun(self.arr_nan_1d, self.arr_float1_1d, **kwargs)
+ res21 = checkfun(self.arr_float_1d, self.arr_nan_1d, **kwargs)
+ res22 = checkfun(self.arr_nan_1d, self.arr_nan_1d, **kwargs)
+ res23 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d,
+ **kwargs)
+ res24 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d,
+ min_periods=len(self.arr_float_1d) - 1, **kwargs)
+ res25 = checkfun(self.arr_float_1d, self.arr_float1_1d,
+ min_periods=len(self.arr_float_1d) + 1, **kwargs)
+ tm.assert_almost_equal(targ2, res20)
+ tm.assert_almost_equal(targ2, res21)
+ tm.assert_almost_equal(targ2, res22)
+ tm.assert_almost_equal(targ2, res23)
+ tm.assert_almost_equal(targ2, res24)
+ tm.assert_almost_equal(targ2, res25)
+
+ def test_nancorr(self):
+ targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1]
+ targ1 = np.corrcoef(self.arr_float_2d.flat,
+ self.arr_float1_2d.flat)[0, 1]
+ self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1)
+ targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1]
+ targ1 = np.corrcoef(self.arr_float_1d.flat,
+ self.arr_float1_1d.flat)[0, 1]
+ self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1,
+ method='pearson')
+
+ def test_nancorr_pearson(self):
+ targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1]
+ targ1 = np.corrcoef(self.arr_float_2d.flat,
+ self.arr_float1_2d.flat)[0, 1]
+ self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1,
+ method='pearson')
+ targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1]
+ targ1 = np.corrcoef(self.arr_float_1d.flat,
+ self.arr_float1_1d.flat)[0, 1]
+ self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1,
+ method='pearson')
+
+ @td.skip_if_no_scipy
+ def test_nancorr_kendall(self):
+ from scipy.stats import kendalltau
+ targ0 = kendalltau(self.arr_float_2d, self.arr_float1_2d)[0]
+ targ1 = kendalltau(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0]
+ self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1,
+ method='kendall')
+ targ0 = kendalltau(self.arr_float_1d, self.arr_float1_1d)[0]
+ targ1 = kendalltau(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0]
+ self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1,
+ method='kendall')
+
+ @td.skip_if_no_scipy
+ def test_nancorr_spearman(self):
+ from scipy.stats import spearmanr
+ targ0 = spearmanr(self.arr_float_2d, self.arr_float1_2d)[0]
+ targ1 = spearmanr(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0]
+ self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1,
+ method='spearman')
+ targ0 = spearmanr(self.arr_float_1d, self.arr_float1_1d)[0]
+ targ1 = spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0]
+ self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1,
+ method='spearman')
+
+ def test_nancov(self):
+ targ0 = np.cov(self.arr_float_2d, self.arr_float1_2d)[0, 1]
+ targ1 = np.cov(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
+ self.check_nancorr_nancov_2d(nanops.nancov, targ0, targ1)
+ targ0 = np.cov(self.arr_float_1d, self.arr_float1_1d)[0, 1]
+ targ1 = np.cov(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1]
+ self.check_nancorr_nancov_1d(nanops.nancov, targ0, targ1)
+
+ def check_nancomp(self, checkfun, targ0):
+ arr_float = self.arr_float
+ arr_float1 = self.arr_float1
+ arr_nan = self.arr_nan
+ arr_nan_nan = self.arr_nan_nan
+ arr_float_nan = self.arr_float_nan
+ arr_float1_nan = self.arr_float1_nan
+ arr_nan_float1 = self.arr_nan_float1
+
+ while targ0.ndim:
+ try:
+ res0 = checkfun(arr_float, arr_float1)
+ tm.assert_almost_equal(targ0, res0)
+
+ if targ0.ndim > 1:
+ targ1 = np.vstack([targ0, arr_nan])
+ else:
+ targ1 = np.hstack([targ0, arr_nan])
+ res1 = checkfun(arr_float_nan, arr_float1_nan)
+ tm.assert_numpy_array_equal(targ1, res1, check_dtype=False)
+
+ targ2 = arr_nan_nan
+ res2 = checkfun(arr_float_nan, arr_nan_float1)
+ tm.assert_numpy_array_equal(targ2, res2, check_dtype=False)
+ except Exception as exc:
+ exc.args += ('ndim: %s' % arr_float.ndim, )
+ raise
+
+ try:
+ arr_float = np.take(arr_float, 0, axis=-1)
+ arr_float1 = np.take(arr_float1, 0, axis=-1)
+ arr_nan = np.take(arr_nan, 0, axis=-1)
+ arr_nan_nan = np.take(arr_nan_nan, 0, axis=-1)
+ arr_float_nan = np.take(arr_float_nan, 0, axis=-1)
+ arr_float1_nan = np.take(arr_float1_nan, 0, axis=-1)
+ arr_nan_float1 = np.take(arr_nan_float1, 0, axis=-1)
+ targ0 = np.take(targ0, 0, axis=-1)
+ except ValueError:
+ break
+
+ def test_nangt(self):
+ targ0 = self.arr_float > self.arr_float1
+ self.check_nancomp(nanops.nangt, targ0)
+
+ def test_nange(self):
+ targ0 = self.arr_float >= self.arr_float1
+ self.check_nancomp(nanops.nange, targ0)
+
+ def test_nanlt(self):
+ targ0 = self.arr_float < self.arr_float1
+ self.check_nancomp(nanops.nanlt, targ0)
+
+ def test_nanle(self):
+ targ0 = self.arr_float <= self.arr_float1
+ self.check_nancomp(nanops.nanle, targ0)
+
+ def test_naneq(self):
+ targ0 = self.arr_float == self.arr_float1
+ self.check_nancomp(nanops.naneq, targ0)
+
+ def test_nanne(self):
+ targ0 = self.arr_float != self.arr_float1
+ self.check_nancomp(nanops.nanne, targ0)
+
+ def check_bool(self, func, value, correct, *args, **kwargs):
+ while getattr(value, 'ndim', True):
+ try:
+ res0 = func(value, *args, **kwargs)
+ if correct:
+ assert res0
+ else:
+ assert not res0
+ except BaseException as exc:
+ exc.args += ('dim: %s' % getattr(value, 'ndim', value), )
+ raise
+ if not hasattr(value, 'ndim'):
+ break
+ try:
+ value = np.take(value, 0, axis=-1)
+ except ValueError:
+ break
+
+ def test__has_infs(self):
+ pairs = [('arr_complex', False), ('arr_int', False),
+ ('arr_bool', False), ('arr_str', False), ('arr_utf', False),
+                 ('arr_complex_nan', False),
+ ('arr_nan_nanj', False), ('arr_nan_infj', True),
+ ('arr_complex_nan_infj', True)]
+ pairs_float = [('arr_float', False), ('arr_nan', False),
+ ('arr_float_nan', False), ('arr_nan_nan', False),
+ ('arr_float_inf', True), ('arr_inf', True),
+ ('arr_nan_inf', True), ('arr_float_nan_inf', True),
+ ('arr_nan_nan_inf', True)]
+
+ for arr, correct in pairs:
+ val = getattr(self, arr)
+ try:
+ self.check_bool(nanops._has_infs, val, correct)
+ except BaseException as exc:
+ exc.args += (arr, )
+ raise
+
+ for arr, correct in pairs_float:
+ val = getattr(self, arr)
+ try:
+ self.check_bool(nanops._has_infs, val, correct)
+ self.check_bool(nanops._has_infs, val.astype('f4'), correct)
+ self.check_bool(nanops._has_infs, val.astype('f2'), correct)
+ except BaseException as exc:
+ exc.args += (arr, )
+ raise
+
+ def test__isfinite(self):
+ pairs = [('arr_complex', False), ('arr_int', False),
+ ('arr_bool', False), ('arr_str', False), ('arr_utf', False),
+                 ('arr_complex_nan', True),
+ ('arr_nan_nanj', True), ('arr_nan_infj', True),
+ ('arr_complex_nan_infj', True)]
+ pairs_float = [('arr_float', False), ('arr_nan', True),
+ ('arr_float_nan', True), ('arr_nan_nan', True),
+ ('arr_float_inf', True), ('arr_inf', True),
+ ('arr_nan_inf', True), ('arr_float_nan_inf', True),
+ ('arr_nan_nan_inf', True)]
+
+ func1 = lambda x: np.any(nanops._isfinite(x).ravel())
+
+ # TODO: unused?
+ # func2 = lambda x: np.any(nanops._isfinite(x).values.ravel())
+
+ for arr, correct in pairs:
+ val = getattr(self, arr)
+ try:
+ self.check_bool(func1, val, correct)
+ except BaseException as exc:
+ exc.args += (arr, )
+ raise
+
+ for arr, correct in pairs_float:
+ val = getattr(self, arr)
+ try:
+ self.check_bool(func1, val, correct)
+ self.check_bool(func1, val.astype('f4'), correct)
+ self.check_bool(func1, val.astype('f2'), correct)
+ except BaseException as exc:
+ exc.args += (arr, )
+ raise
+
+ def test__bn_ok_dtype(self):
+ assert nanops._bn_ok_dtype(self.arr_float.dtype, 'test')
+ assert nanops._bn_ok_dtype(self.arr_complex.dtype, 'test')
+ assert nanops._bn_ok_dtype(self.arr_int.dtype, 'test')
+ assert nanops._bn_ok_dtype(self.arr_bool.dtype, 'test')
+ assert nanops._bn_ok_dtype(self.arr_str.dtype, 'test')
+ assert nanops._bn_ok_dtype(self.arr_utf.dtype, 'test')
+ assert not nanops._bn_ok_dtype(self.arr_date.dtype, 'test')
+ assert not nanops._bn_ok_dtype(self.arr_tdelta.dtype, 'test')
+ assert not nanops._bn_ok_dtype(self.arr_obj.dtype, 'test')
+
+
+class TestEnsureNumeric(object):
+
+ def test_numeric_values(self):
+ # Test integer
+ assert nanops._ensure_numeric(1) == 1
+
+ # Test float
+ assert nanops._ensure_numeric(1.1) == 1.1
+
+ # Test complex
+ assert nanops._ensure_numeric(1 + 2j) == 1 + 2j
+
+ def test_ndarray(self):
+ # Test numeric ndarray
+ values = np.array([1, 2, 3])
+ assert np.allclose(nanops._ensure_numeric(values), values)
+
+ # Test object ndarray
+ o_values = values.astype(object)
+ assert np.allclose(nanops._ensure_numeric(o_values), values)
+
+ # Test convertible string ndarray
+ s_values = np.array(['1', '2', '3'], dtype=object)
+ assert np.allclose(nanops._ensure_numeric(s_values), values)
+
+ # Test non-convertible string ndarray
+ s_values = np.array(['foo', 'bar', 'baz'], dtype=object)
+ pytest.raises(ValueError, lambda: nanops._ensure_numeric(s_values))
+
+    def test_convertible_values(self):
+ assert np.allclose(nanops._ensure_numeric('1'), 1.0)
+ assert np.allclose(nanops._ensure_numeric('1.1'), 1.1)
+ assert np.allclose(nanops._ensure_numeric('1+1j'), 1 + 1j)
+
+    def test_non_convertible_values(self):
+ pytest.raises(TypeError, lambda: nanops._ensure_numeric('foo'))
+ pytest.raises(TypeError, lambda: nanops._ensure_numeric({}))
+ pytest.raises(TypeError, lambda: nanops._ensure_numeric([]))
+
+
+class TestNanvarFixedValues(object):
+
+ # xref GH10242
+
+ def setup_method(self, method):
+ # Samples from a normal distribution.
+ self.variance = variance = 3.0
+ self.samples = self.prng.normal(scale=variance ** 0.5, size=100000)
+
+ def test_nanvar_all_finite(self):
+ samples = self.samples
+ actual_variance = nanops.nanvar(samples)
+ tm.assert_almost_equal(actual_variance, self.variance,
+ check_less_precise=2)
+
+ def test_nanvar_nans(self):
+ samples = np.nan * np.ones(2 * self.samples.shape[0])
+ samples[::2] = self.samples
+
+ actual_variance = nanops.nanvar(samples, skipna=True)
+ tm.assert_almost_equal(actual_variance, self.variance,
+ check_less_precise=2)
+
+ actual_variance = nanops.nanvar(samples, skipna=False)
+ tm.assert_almost_equal(actual_variance, np.nan, check_less_precise=2)
+
+ def test_nanstd_nans(self):
+ samples = np.nan * np.ones(2 * self.samples.shape[0])
+ samples[::2] = self.samples
+
+ actual_std = nanops.nanstd(samples, skipna=True)
+ tm.assert_almost_equal(actual_std, self.variance ** 0.5,
+ check_less_precise=2)
+
+        actual_std = nanops.nanstd(samples, skipna=False)
+ tm.assert_almost_equal(actual_std, np.nan,
+ check_less_precise=2)
+
+ def test_nanvar_axis(self):
+ # Generate some sample data.
+ samples_norm = self.samples
+ samples_unif = self.prng.uniform(size=samples_norm.shape[0])
+ samples = np.vstack([samples_norm, samples_unif])
+
+ actual_variance = nanops.nanvar(samples, axis=1)
+ tm.assert_almost_equal(actual_variance, np.array(
+ [self.variance, 1.0 / 12]), check_less_precise=2)
+
+ def test_nanvar_ddof(self):
+ n = 5
+ samples = self.prng.uniform(size=(10000, n + 1))
+ samples[:, -1] = np.nan # Force use of our own algorithm.
+
+ variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean()
+ variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean()
+ variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean()
+
+ # The unbiased estimate.
+ var = 1.0 / 12
+ tm.assert_almost_equal(variance_1, var,
+ check_less_precise=2)
+
+ # The underestimated variance.
+ tm.assert_almost_equal(variance_0, (n - 1.0) / n * var,
+ check_less_precise=2)
+
+ # The overestimated variance.
+ tm.assert_almost_equal(variance_2, (n - 1.0) / (n - 2.0) * var,
+ check_less_precise=2)
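+        # Sketch of the expectations above: the sum of squared deviations
+        # has expectation (n - 1) * var, so dividing by (n - ddof) gives
+        # E[estimate] = var * (n - 1) / (n - ddof) for ddof = 0, 1, 2.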
+
+ def test_ground_truth(self):
+ # Test against values that were precomputed with Numpy.
+ samples = np.empty((4, 4))
+ samples[:3, :3] = np.array([[0.97303362, 0.21869576, 0.55560287
+ ], [0.72980153, 0.03109364, 0.99155171],
+ [0.09317602, 0.60078248, 0.15871292]])
+ samples[3] = samples[:, 3] = np.nan
+
+ # Actual variances along axis=0, 1 for ddof=0, 1, 2
+ variance = np.array([[[0.13762259, 0.05619224, 0.11568816
+ ], [0.20643388, 0.08428837, 0.17353224],
+ [0.41286776, 0.16857673, 0.34706449]],
+ [[0.09519783, 0.16435395, 0.05082054
+ ], [0.14279674, 0.24653093, 0.07623082],
+ [0.28559348, 0.49306186, 0.15246163]]])
+
+ # Test nanvar.
+ for axis in range(2):
+ for ddof in range(3):
+ var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof)
+ tm.assert_almost_equal(var[:3], variance[axis, ddof])
+ assert np.isnan(var[3])
+
+ # Test nanstd.
+ for axis in range(2):
+ for ddof in range(3):
+ std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof)
+ tm.assert_almost_equal(std[:3], variance[axis, ddof] ** 0.5)
+ assert np.isnan(std[3])
+
+ def test_nanstd_roundoff(self):
+ # Regression test for GH 10242 (test data taken from GH 10489). Ensure
+ # that variance is stable.
+ data = Series(766897346 * np.ones(10))
+ for ddof in range(3):
+ result = data.std(ddof=ddof)
+ assert result == 0.0
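+        # A naive E[x^2] - E[x]^2 formulation would lose all precision here
+        # through catastrophic cancellation; a numerically stable (shifted)
+        # computation returns exactly 0.0 for a constant series.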
+
+ @property
+ def prng(self):
+ return np.random.RandomState(1234)
+
+
+class TestNanskewFixedValues(object):
+
+ # xref GH 11974
+
+ def setup_method(self, method):
+ # Test data + skewness value (computed with scipy.stats.skew)
+ self.samples = np.sin(np.linspace(0, 1, 200))
+ self.actual_skew = -0.1875895205961754
+
+ def test_constant_series(self):
+ # xref GH 11974
+ for val in [3075.2, 3075.3, 3075.5]:
+ data = val * np.ones(300)
+ skew = nanops.nanskew(data)
+ assert skew == 0.0
+
+ def test_all_finite(self):
+ alpha, beta = 0.3, 0.1
+ left_tailed = self.prng.beta(alpha, beta, size=100)
+ assert nanops.nanskew(left_tailed) < 0
+
+ alpha, beta = 0.1, 0.3
+ right_tailed = self.prng.beta(alpha, beta, size=100)
+ assert nanops.nanskew(right_tailed) > 0
+
+ def test_ground_truth(self):
+ skew = nanops.nanskew(self.samples)
+ tm.assert_almost_equal(skew, self.actual_skew)
+
+ def test_axis(self):
+ samples = np.vstack([self.samples,
+ np.nan * np.ones(len(self.samples))])
+ skew = nanops.nanskew(samples, axis=1)
+ tm.assert_almost_equal(skew, np.array([self.actual_skew, np.nan]))
+
+ def test_nans(self):
+ samples = np.hstack([self.samples, np.nan])
+ skew = nanops.nanskew(samples, skipna=False)
+ assert np.isnan(skew)
+
+ def test_nans_skipna(self):
+ samples = np.hstack([self.samples, np.nan])
+ skew = nanops.nanskew(samples, skipna=True)
+ tm.assert_almost_equal(skew, self.actual_skew)
+
+ @property
+ def prng(self):
+ return np.random.RandomState(1234)
+
+
+class TestNankurtFixedValues(object):
+
+ # xref GH 11974
+
+ def setup_method(self, method):
+ # Test data + kurtosis value (computed with scipy.stats.kurtosis)
+ self.samples = np.sin(np.linspace(0, 1, 200))
+ self.actual_kurt = -1.2058303433799713
+
+ def test_constant_series(self):
+ # xref GH 11974
+ for val in [3075.2, 3075.3, 3075.5]:
+ data = val * np.ones(300)
+ kurt = nanops.nankurt(data)
+ assert kurt == 0.0
+
+ def test_all_finite(self):
+ alpha, beta = 0.3, 0.1
+ left_tailed = self.prng.beta(alpha, beta, size=100)
+ assert nanops.nankurt(left_tailed) < 0
+
+ alpha, beta = 0.1, 0.3
+ right_tailed = self.prng.beta(alpha, beta, size=100)
+ assert nanops.nankurt(right_tailed) > 0
+
+ def test_ground_truth(self):
+ kurt = nanops.nankurt(self.samples)
+ tm.assert_almost_equal(kurt, self.actual_kurt)
+
+ def test_axis(self):
+ samples = np.vstack([self.samples,
+ np.nan * np.ones(len(self.samples))])
+ kurt = nanops.nankurt(samples, axis=1)
+ tm.assert_almost_equal(kurt, np.array([self.actual_kurt, np.nan]))
+
+ def test_nans(self):
+ samples = np.hstack([self.samples, np.nan])
+ kurt = nanops.nankurt(samples, skipna=False)
+ assert np.isnan(kurt)
+
+ def test_nans_skipna(self):
+ samples = np.hstack([self.samples, np.nan])
+ kurt = nanops.nankurt(samples, skipna=True)
+ tm.assert_almost_equal(kurt, self.actual_kurt)
+
+ @property
+ def prng(self):
+ return np.random.RandomState(1234)
+
+
+class TestDatetime64NaNOps(object):
+ @pytest.mark.parametrize('tz', [None, 'UTC'])
+ @pytest.mark.xfail(reason="disabled")
+ # Enabling mean changes the behavior of DataFrame.mean
+ # See https://github.com/pandas-dev/pandas/issues/24752
+ def test_nanmean(self, tz):
+ dti = pd.date_range('2016-01-01', periods=3, tz=tz)
+ expected = dti[1]
+
+ for obj in [dti, DatetimeArray(dti), Series(dti)]:
+ result = nanops.nanmean(obj)
+ assert result == expected
+
+ dti2 = dti.insert(1, pd.NaT)
+
+ for obj in [dti2, DatetimeArray(dti2), Series(dti2)]:
+ result = nanops.nanmean(obj)
+ assert result == expected
+
+
+def test_use_bottleneck():
+
+ if nanops._BOTTLENECK_INSTALLED:
+
+ pd.set_option('use_bottleneck', True)
+ assert pd.get_option('use_bottleneck')
+
+ pd.set_option('use_bottleneck', False)
+ assert not pd.get_option('use_bottleneck')
+
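+        # restore the module-level setting captured at import so later
+        # tests are unaffected by this one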
+ pd.set_option('use_bottleneck', use_bn)
+
+
[email protected]("numpy_op, expected", [
+ (np.sum, 10),
+ (np.nansum, 10),
+ (np.mean, 2.5),
+ (np.nanmean, 2.5),
+ (np.median, 2.5),
+ (np.nanmedian, 2.5),
+ (np.min, 1),
+ (np.max, 4),
+])
+def test_numpy_ops(numpy_op, expected):
+ # GH8383
+ result = numpy_op(pd.Series([1, 2, 3, 4]))
+ assert result == expected
+
+
[email protected]("numpy_op, expected", [
+ (np.nanmin, 1),
+ (np.nanmax, 4),
+])
+def test_numpy_ops_np_version_under1p13(numpy_op, expected):
+ # GH8383
+ result = numpy_op(pd.Series([1, 2, 3, 4]))
+ if _np_version_under1p13:
+ # bug for numpy < 1.13, where result is a series, should be a scalar
+ with pytest.raises(ValueError):
+ assert result == expected
+ else:
+ assert result == expected
+
+
[email protected]("operation", [
+ nanops.nanany,
+ nanops.nanall,
+ nanops.nansum,
+ nanops.nanmean,
+ nanops.nanmedian,
+ nanops.nanstd,
+ nanops.nanvar,
+ nanops.nansem,
+ nanops.nanargmax,
+ nanops.nanargmin,
+ nanops.nanmax,
+ nanops.nanmin,
+ nanops.nanskew,
+ nanops.nankurt,
+ nanops.nanprod,
+])
+def test_nanops_independent_of_mask_param(operation):
+ # GH22764
+ s = pd.Series([1, 2, np.nan, 3, np.nan, 4])
+ mask = s.isna()
+    expected = operation(s)
+    result = operation(s, mask=mask)
+    assert expected == result
diff --git a/contrib/python/pandas/py2/pandas/tests/test_panel.py b/contrib/python/pandas/py2/pandas/tests/test_panel.py
new file mode 100644
index 00000000000..ba0ad72e624
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_panel.py
@@ -0,0 +1,2621 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=W0612,E1101
+
+from datetime import datetime
+import operator
+from warnings import catch_warnings, simplefilter
+
+import numpy as np
+import pytest
+
+from pandas.compat import OrderedDict, StringIO, lrange, range, signature
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.common import is_float_dtype
+
+from pandas import (
+ DataFrame, Index, MultiIndex, Series, compat, date_range, isna, notna)
+from pandas.core.nanops import nanall, nanany
+import pandas.core.panel as panelm
+from pandas.core.panel import Panel
+import pandas.util.testing as tm
+from pandas.util.testing import (
+ assert_almost_equal, assert_frame_equal, assert_panel_equal,
+ assert_series_equal, ensure_clean, makeCustomDataframe as mkdf,
+ makeMixedDataFrame)
+
+from pandas.io.formats.printing import pprint_thing
+from pandas.tseries.offsets import BDay, MonthEnd
+
+
+def make_test_panel():
+ with catch_warnings(record=True):
+ simplefilter("ignore", FutureWarning)
+ _panel = tm.makePanel()
+ tm.add_nans(_panel)
+ _panel = _panel.copy()
+ return _panel
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class PanelTests(object):
+ panel = None
+
+ def test_pickle(self):
+ unpickled = tm.round_trip_pickle(self.panel)
+ assert_frame_equal(unpickled['ItemA'], self.panel['ItemA'])
+
+ def test_rank(self):
+ pytest.raises(NotImplementedError, lambda: self.panel.rank())
+
+ def test_cumsum(self):
+ cumsum = self.panel.cumsum()
+ assert_frame_equal(cumsum['ItemA'], self.panel['ItemA'].cumsum())
+
+ def not_hashable(self):
+ c_empty = Panel()
+ c = Panel(Panel([[[1]]]))
+ pytest.raises(TypeError, hash, c_empty)
+ pytest.raises(TypeError, hash, c)
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class SafeForLongAndSparse(object):
+
+ def test_repr(self):
+ repr(self.panel)
+
+ def test_copy_names(self):
+ for attr in ('major_axis', 'minor_axis'):
+ getattr(self.panel, attr).name = None
+ cp = self.panel.copy()
+ getattr(cp, attr).name = 'foo'
+ assert getattr(self.panel, attr).name is None
+
+ def test_iter(self):
+        assert tm.equalContents(list(self.panel), self.panel.items)
+
+ def test_count(self):
+ f = lambda s: notna(s).sum()
+ self._check_stat_op('count', f, obj=self.panel, has_skipna=False)
+
+ def test_sum(self):
+ self._check_stat_op('sum', np.sum, skipna_alternative=np.nansum)
+
+ def test_mean(self):
+ self._check_stat_op('mean', np.mean)
+
+ def test_prod(self):
+ self._check_stat_op('prod', np.prod, skipna_alternative=np.nanprod)
+
+ @pytest.mark.filterwarnings("ignore:Invalid value:RuntimeWarning")
+ @pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning")
+ def test_median(self):
+ def wrapper(x):
+ if isna(x).any():
+ return np.nan
+ return np.median(x)
+
+ self._check_stat_op('median', wrapper)
+
+ @pytest.mark.filterwarnings("ignore:Invalid value:RuntimeWarning")
+ def test_min(self):
+ self._check_stat_op('min', np.min)
+
+ @pytest.mark.filterwarnings("ignore:Invalid value:RuntimeWarning")
+ def test_max(self):
+ self._check_stat_op('max', np.max)
+
+ @td.skip_if_no_scipy
+ def test_skew(self):
+ from scipy.stats import skew
+
+ def this_skew(x):
+ if len(x) < 3:
+ return np.nan
+ return skew(x, bias=False)
+
+ self._check_stat_op('skew', this_skew)
+
+ def test_var(self):
+ def alt(x):
+ if len(x) < 2:
+ return np.nan
+ return np.var(x, ddof=1)
+
+ self._check_stat_op('var', alt)
+
+ def test_std(self):
+ def alt(x):
+ if len(x) < 2:
+ return np.nan
+ return np.std(x, ddof=1)
+
+ self._check_stat_op('std', alt)
+
+ def test_sem(self):
+ def alt(x):
+ if len(x) < 2:
+ return np.nan
+ return np.std(x, ddof=1) / np.sqrt(len(x))
+
+ self._check_stat_op('sem', alt)
+
+ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True,
+ skipna_alternative=None):
+ if obj is None:
+ obj = self.panel
+
+ # # set some NAs
+ # obj.loc[5:10] = np.nan
+ # obj.loc[15:20, -2:] = np.nan
+
+ f = getattr(obj, name)
+
+ if has_skipna:
+
+ skipna_wrapper = tm._make_skipna_wrapper(alternative,
+ skipna_alternative)
+
+ def wrapper(x):
+ return alternative(np.asarray(x))
+
+ for i in range(obj.ndim):
+ result = f(axis=i, skipna=False)
+ assert_frame_equal(result, obj.apply(wrapper, axis=i))
+ else:
+ skipna_wrapper = alternative
+ wrapper = alternative
+
+ for i in range(obj.ndim):
+ result = f(axis=i)
+ if name in ['sum', 'prod']:
+ assert_frame_equal(result, obj.apply(skipna_wrapper, axis=i))
+
+ pytest.raises(Exception, f, axis=obj.ndim)
+
+ # Unimplemented numeric_only parameter.
+ if 'numeric_only' in signature(f).args:
+ with pytest.raises(NotImplementedError, match=name):
+ f(numeric_only=True)
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class SafeForSparse(object):
+
+ def test_get_axis(self):
+ assert (self.panel._get_axis(0) is self.panel.items)
+ assert (self.panel._get_axis(1) is self.panel.major_axis)
+ assert (self.panel._get_axis(2) is self.panel.minor_axis)
+
+ def test_set_axis(self):
+ new_items = Index(np.arange(len(self.panel.items)))
+ new_major = Index(np.arange(len(self.panel.major_axis)))
+ new_minor = Index(np.arange(len(self.panel.minor_axis)))
+
+ # ensure propagate to potentially prior-cached items too
+ item = self.panel['ItemA']
+ self.panel.items = new_items
+
+ if hasattr(self.panel, '_item_cache'):
+ assert 'ItemA' not in self.panel._item_cache
+ assert self.panel.items is new_items
+
+ # TODO: unused?
+ item = self.panel[0] # noqa
+
+ self.panel.major_axis = new_major
+ assert self.panel[0].index is new_major
+ assert self.panel.major_axis is new_major
+
+ # TODO: unused?
+ item = self.panel[0] # noqa
+
+ self.panel.minor_axis = new_minor
+ assert self.panel[0].columns is new_minor
+ assert self.panel.minor_axis is new_minor
+
+ def test_get_axis_number(self):
+ assert self.panel._get_axis_number('items') == 0
+ assert self.panel._get_axis_number('major') == 1
+ assert self.panel._get_axis_number('minor') == 2
+
+ with pytest.raises(ValueError, match="No axis named foo"):
+ self.panel._get_axis_number('foo')
+
+ with pytest.raises(ValueError, match="No axis named foo"):
+ self.panel.__ge__(self.panel, axis='foo')
+
+ def test_get_axis_name(self):
+ assert self.panel._get_axis_name(0) == 'items'
+ assert self.panel._get_axis_name(1) == 'major_axis'
+ assert self.panel._get_axis_name(2) == 'minor_axis'
+
+ def test_get_plane_axes(self):
+ # what to do here?
+
+ index, columns = self.panel._get_plane_axes('items')
+ index, columns = self.panel._get_plane_axes('major_axis')
+ index, columns = self.panel._get_plane_axes('minor_axis')
+ index, columns = self.panel._get_plane_axes(0)
+
+ def test_truncate(self):
+ dates = self.panel.major_axis
+ start, end = dates[1], dates[5]
+
+ trunced = self.panel.truncate(start, end, axis='major')
+ expected = self.panel['ItemA'].truncate(start, end)
+
+ assert_frame_equal(trunced['ItemA'], expected)
+
+ trunced = self.panel.truncate(before=start, axis='major')
+ expected = self.panel['ItemA'].truncate(before=start)
+
+ assert_frame_equal(trunced['ItemA'], expected)
+
+ trunced = self.panel.truncate(after=end, axis='major')
+ expected = self.panel['ItemA'].truncate(after=end)
+
+ assert_frame_equal(trunced['ItemA'], expected)
+
+ def test_arith(self):
+ self._test_op(self.panel, operator.add)
+ self._test_op(self.panel, operator.sub)
+ self._test_op(self.panel, operator.mul)
+ self._test_op(self.panel, operator.truediv)
+ self._test_op(self.panel, operator.floordiv)
+ self._test_op(self.panel, operator.pow)
+
+ self._test_op(self.panel, lambda x, y: y + x)
+ self._test_op(self.panel, lambda x, y: y - x)
+ self._test_op(self.panel, lambda x, y: y * x)
+ self._test_op(self.panel, lambda x, y: y / x)
+ self._test_op(self.panel, lambda x, y: y ** x)
+
+ self._test_op(self.panel, lambda x, y: x + y) # panel + 1
+ self._test_op(self.panel, lambda x, y: x - y) # panel - 1
+ self._test_op(self.panel, lambda x, y: x * y) # panel * 1
+ self._test_op(self.panel, lambda x, y: x / y) # panel / 1
+ self._test_op(self.panel, lambda x, y: x ** y) # panel ** 1
+
+ pytest.raises(Exception, self.panel.__add__,
+ self.panel['ItemA'])
+
+ @staticmethod
+ def _test_op(panel, op):
+ result = op(panel, 1)
+ assert_frame_equal(result['ItemA'], op(panel['ItemA'], 1))
+
+ def test_keys(self):
+        assert tm.equalContents(list(self.panel.keys()), self.panel.items)
+
+ def test_iteritems(self):
+        # Test panel.iteritems(); just check that iteration works
+ for k, v in self.panel.iteritems():
+ pass
+
+ assert len(list(self.panel.iteritems())) == len(self.panel.items)
+
+ def test_combineFrame(self):
+ def check_op(op, name):
+ # items
+ df = self.panel['ItemA']
+
+ func = getattr(self.panel, name)
+
+ result = func(df, axis='items')
+
+ assert_frame_equal(
+ result['ItemB'], op(self.panel['ItemB'], df))
+
+ # major
+ xs = self.panel.major_xs(self.panel.major_axis[0])
+ result = func(xs, axis='major')
+
+ idx = self.panel.major_axis[1]
+
+ assert_frame_equal(result.major_xs(idx),
+ op(self.panel.major_xs(idx), xs))
+
+ # minor
+ xs = self.panel.minor_xs(self.panel.minor_axis[0])
+ result = func(xs, axis='minor')
+
+ idx = self.panel.minor_axis[1]
+
+ assert_frame_equal(result.minor_xs(idx),
+ op(self.panel.minor_xs(idx), xs))
+
+ ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'pow', 'mod']
+ if not compat.PY3:
+ ops.append('div')
+
+ for op in ops:
+ try:
+ check_op(getattr(operator, op), op)
+ except AttributeError:
+ pprint_thing("Failing operation: %r" % op)
+ raise
+ if compat.PY3:
+ try:
+ check_op(operator.truediv, 'div')
+ except AttributeError:
+ pprint_thing("Failing operation: %r" % 'div')
+ raise
+
+ def test_combinePanel(self):
+ result = self.panel.add(self.panel)
+ assert_panel_equal(result, self.panel * 2)
+
+ def test_neg(self):
+ assert_panel_equal(-self.panel, self.panel * -1)
+
+ # issue 7692
+ def test_raise_when_not_implemented(self):
+ p = Panel(np.arange(3 * 4 * 5).reshape(3, 4, 5),
+ items=['ItemA', 'ItemB', 'ItemC'],
+ major_axis=date_range('20130101', periods=4),
+ minor_axis=list('ABCDE'))
+ d = p.sum(axis=1).iloc[0]
+ ops = ['add', 'sub', 'mul', 'truediv',
+ 'floordiv', 'div', 'mod', 'pow']
+ for op in ops:
+ with pytest.raises(NotImplementedError):
+ getattr(p, op)(d, axis=0)
+
+ def test_select(self):
+ p = self.panel
+
+ # select items
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = p.select(lambda x: x in ('ItemA', 'ItemC'), axis='items')
+ expected = p.reindex(items=['ItemA', 'ItemC'])
+ assert_panel_equal(result, expected)
+
+ # select major_axis
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = p.select(lambda x: x >= datetime(
+ 2000, 1, 15), axis='major')
+ new_major = p.major_axis[p.major_axis >= datetime(2000, 1, 15)]
+ expected = p.reindex(major=new_major)
+ assert_panel_equal(result, expected)
+
+ # select minor_axis
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = p.select(lambda x: x in ('D', 'A'), axis=2)
+ expected = p.reindex(minor=['A', 'D'])
+ assert_panel_equal(result, expected)
+
+ # corner case, empty thing
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = p.select(lambda x: x in ('foo', ), axis='items')
+ assert_panel_equal(result, p.reindex(items=[]))
+
+ def test_get_value(self):
+ for item in self.panel.items:
+ for mjr in self.panel.major_axis[::2]:
+ for mnr in self.panel.minor_axis:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = self.panel.get_value(item, mjr, mnr)
+ expected = self.panel[item][mnr][mjr]
+ assert_almost_equal(result, expected)
+
+ def test_abs(self):
+
+ result = self.panel.abs()
+ result2 = abs(self.panel)
+ expected = np.abs(self.panel)
+ assert_panel_equal(result, expected)
+ assert_panel_equal(result2, expected)
+
+ df = self.panel['ItemA']
+ result = df.abs()
+ result2 = abs(df)
+ expected = np.abs(df)
+ assert_frame_equal(result, expected)
+ assert_frame_equal(result2, expected)
+
+ s = df['A']
+ result = s.abs()
+ result2 = abs(s)
+ expected = np.abs(s)
+ assert_series_equal(result, expected)
+ assert_series_equal(result2, expected)
+ assert result.name == 'A'
+ assert result2.name == 'A'
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class CheckIndexing(object):
+
+ def test_getitem(self):
+ pytest.raises(Exception, self.panel.__getitem__, 'ItemQ')
+
+ def test_delitem_and_pop(self):
+ expected = self.panel['ItemA']
+ result = self.panel.pop('ItemA')
+ assert_frame_equal(expected, result)
+ assert 'ItemA' not in self.panel.items
+
+ del self.panel['ItemB']
+ assert 'ItemB' not in self.panel.items
+ pytest.raises(Exception, self.panel.__delitem__, 'ItemB')
+
+ values = np.empty((3, 3, 3))
+ values[0] = 0
+ values[1] = 1
+ values[2] = 2
+
+ panel = Panel(values, lrange(3), lrange(3), lrange(3))
+
+ # did we delete the right row?
+
+ panelc = panel.copy()
+ del panelc[0]
+ tm.assert_frame_equal(panelc[1], panel[1])
+ tm.assert_frame_equal(panelc[2], panel[2])
+
+ panelc = panel.copy()
+ del panelc[1]
+ tm.assert_frame_equal(panelc[0], panel[0])
+ tm.assert_frame_equal(panelc[2], panel[2])
+
+ panelc = panel.copy()
+ del panelc[2]
+ tm.assert_frame_equal(panelc[1], panel[1])
+ tm.assert_frame_equal(panelc[0], panel[0])
+
+ def test_setitem(self):
+ lp = self.panel.filter(['ItemA', 'ItemB']).to_frame()
+
+ with pytest.raises(TypeError):
+ self.panel['ItemE'] = lp
+
+ # DataFrame
+ df = self.panel['ItemA'][2:].filter(items=['A', 'B'])
+ self.panel['ItemF'] = df
+ self.panel['ItemE'] = df
+
+ df2 = self.panel['ItemF']
+
+ assert_frame_equal(df, df2.reindex(
+ index=df.index, columns=df.columns))
+
+ # scalar
+ self.panel['ItemG'] = 1
+ self.panel['ItemE'] = True
+ assert self.panel['ItemG'].values.dtype == np.int64
+ assert self.panel['ItemE'].values.dtype == np.bool_
+
+ # object dtype
+ self.panel['ItemQ'] = 'foo'
+ assert self.panel['ItemQ'].values.dtype == np.object_
+
+ # boolean dtype
+ self.panel['ItemP'] = self.panel['ItemA'] > 0
+ assert self.panel['ItemP'].values.dtype == np.bool_
+
+ pytest.raises(TypeError, self.panel.__setitem__, 'foo',
+ self.panel.loc[['ItemP']])
+
+ # bad shape
+ p = Panel(np.random.randn(4, 3, 2))
+ msg = (r"shape of value must be \(3, 2\), "
+ r"shape of given object was \(4, 2\)")
+ with pytest.raises(ValueError, match=msg):
+ p[0] = np.random.randn(4, 2)
+
+ def test_setitem_ndarray(self):
+ timeidx = date_range(start=datetime(2009, 1, 1),
+ end=datetime(2009, 12, 31),
+ freq=MonthEnd())
+ lons_coarse = np.linspace(-177.5, 177.5, 72)
+ lats_coarse = np.linspace(-87.5, 87.5, 36)
+ P = Panel(items=timeidx, major_axis=lons_coarse,
+ minor_axis=lats_coarse)
+ data = np.random.randn(72 * 36).reshape((72, 36))
+ key = datetime(2009, 2, 28)
+ P[key] = data
+
+ assert_almost_equal(P[key].values, data)
+
+ def test_set_minor_major(self):
+ # GH 11014
+ df1 = DataFrame(['a', 'a', 'a', np.nan, 'a', np.nan])
+ df2 = DataFrame([1.0, np.nan, 1.0, np.nan, 1.0, 1.0])
+ panel = Panel({'Item1': df1, 'Item2': df2})
+
+ newminor = notna(panel.iloc[:, :, 0])
+ panel.loc[:, :, 'NewMinor'] = newminor
+ assert_frame_equal(panel.loc[:, :, 'NewMinor'],
+ newminor.astype(object))
+
+ newmajor = notna(panel.iloc[:, 0, :])
+ panel.loc[:, 'NewMajor', :] = newmajor
+ assert_frame_equal(panel.loc[:, 'NewMajor', :],
+ newmajor.astype(object))
+
+ def test_major_xs(self):
+ ref = self.panel['ItemA']
+
+ idx = self.panel.major_axis[5]
+ xs = self.panel.major_xs(idx)
+
+ result = xs['ItemA']
+ assert_series_equal(result, ref.xs(idx), check_names=False)
+ assert result.name == 'ItemA'
+
+ # not contained
+ idx = self.panel.major_axis[0] - BDay()
+ pytest.raises(Exception, self.panel.major_xs, idx)
+
+ def test_major_xs_mixed(self):
+ self.panel['ItemD'] = 'foo'
+ xs = self.panel.major_xs(self.panel.major_axis[0])
+ assert xs['ItemA'].dtype == np.float64
+ assert xs['ItemD'].dtype == np.object_
+
+ def test_minor_xs(self):
+ ref = self.panel['ItemA']
+
+ idx = self.panel.minor_axis[1]
+ xs = self.panel.minor_xs(idx)
+
+ assert_series_equal(xs['ItemA'], ref[idx], check_names=False)
+
+ # not contained
+ pytest.raises(Exception, self.panel.minor_xs, 'E')
+
+ def test_minor_xs_mixed(self):
+ self.panel['ItemD'] = 'foo'
+
+ xs = self.panel.minor_xs('D')
+ assert xs['ItemA'].dtype == np.float64
+ assert xs['ItemD'].dtype == np.object_
+
+ def test_xs(self):
+ itemA = self.panel.xs('ItemA', axis=0)
+ expected = self.panel['ItemA']
+ tm.assert_frame_equal(itemA, expected)
+
+ # Get a view by default.
+ itemA_view = self.panel.xs('ItemA', axis=0)
+ itemA_view.values[:] = np.nan
+
+ assert np.isnan(self.panel['ItemA'].values).all()
+
+ # Mixed-type yields a copy.
+ self.panel['strings'] = 'foo'
+ result = self.panel.xs('D', axis=2)
+ assert result._is_copy is not None
+
+ def test_getitem_fancy_labels(self):
+ p = self.panel
+
+ items = p.items[[1, 0]]
+ dates = p.major_axis[::2]
+ cols = ['D', 'C', 'F']
+
+ # all 3 specified
+ with catch_warnings():
+ simplefilter("ignore", FutureWarning)
+ # XXX: warning in _validate_read_indexer
+ assert_panel_equal(p.loc[items, dates, cols],
+ p.reindex(items=items, major=dates, minor=cols))
+
+ # 2 specified
+ assert_panel_equal(p.loc[:, dates, cols],
+ p.reindex(major=dates, minor=cols))
+
+ assert_panel_equal(p.loc[items, :, cols],
+ p.reindex(items=items, minor=cols))
+
+ assert_panel_equal(p.loc[items, dates, :],
+ p.reindex(items=items, major=dates))
+
+ # only 1
+ assert_panel_equal(p.loc[items, :, :], p.reindex(items=items))
+
+ assert_panel_equal(p.loc[:, dates, :], p.reindex(major=dates))
+
+ assert_panel_equal(p.loc[:, :, cols], p.reindex(minor=cols))
+
+ def test_getitem_fancy_slice(self):
+ pass
+
+ def test_getitem_fancy_ints(self):
+ p = self.panel
+
+ # #1603
+ result = p.iloc[:, -1, :]
+ expected = p.loc[:, p.major_axis[-1], :]
+ assert_frame_equal(result, expected)
+
+ def test_getitem_fancy_xs(self):
+ p = self.panel
+ item = 'ItemB'
+
+ date = p.major_axis[5]
+ col = 'C'
+
+ # get DataFrame
+ # item
+ assert_frame_equal(p.loc[item], p[item])
+ assert_frame_equal(p.loc[item, :], p[item])
+ assert_frame_equal(p.loc[item, :, :], p[item])
+
+ # major axis, axis=1
+ assert_frame_equal(p.loc[:, date], p.major_xs(date))
+ assert_frame_equal(p.loc[:, date, :], p.major_xs(date))
+
+ # minor axis, axis=2
+ assert_frame_equal(p.loc[:, :, 'C'], p.minor_xs('C'))
+
+ # get Series
+ assert_series_equal(p.loc[item, date], p[item].loc[date])
+ assert_series_equal(p.loc[item, date, :], p[item].loc[date])
+ assert_series_equal(p.loc[item, :, col], p[item][col])
+ assert_series_equal(p.loc[:, date, col], p.major_xs(date).loc[col])
+
+ def test_getitem_fancy_xs_check_view(self):
+ item = 'ItemB'
+ date = self.panel.major_axis[5]
+
+ # make sure it's always a view
+ NS = slice(None, None)
+
+ # DataFrames
+ comp = assert_frame_equal
+ self._check_view(item, comp)
+ self._check_view((item, NS), comp)
+ self._check_view((item, NS, NS), comp)
+ self._check_view((NS, date), comp)
+ self._check_view((NS, date, NS), comp)
+ self._check_view((NS, NS, 'C'), comp)
+
+ # Series
+ comp = assert_series_equal
+ self._check_view((item, date), comp)
+ self._check_view((item, date, NS), comp)
+ self._check_view((item, NS, 'C'), comp)
+ self._check_view((NS, date, 'C'), comp)
+
+ def test_getitem_callable(self):
+ p = self.panel
+ # GH 12533
+
+ assert_frame_equal(p[lambda x: 'ItemB'], p.loc['ItemB'])
+ assert_panel_equal(p[lambda x: ['ItemB', 'ItemC']],
+ p.loc[['ItemB', 'ItemC']])
+
+ def test_ix_setitem_slice_dataframe(self):
+ a = Panel(items=[1, 2, 3], major_axis=[11, 22, 33],
+ minor_axis=[111, 222, 333])
+ b = DataFrame(np.random.randn(2, 3), index=[111, 333],
+ columns=[1, 2, 3])
+
+ a.loc[:, 22, [111, 333]] = b
+
+ assert_frame_equal(a.loc[:, 22, [111, 333]], b)
+
+ def test_ix_align(self):
+ from pandas import Series
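+        # assignment through .loc should align the Series on its index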
+ b = Series(np.random.randn(10), name=0)
+        b = b.sort_values()  # sort_values returns a copy; keep the result
+ df_orig = Panel(np.random.randn(3, 10, 2))
+ df = df_orig.copy()
+
+ df.loc[0, :, 0] = b
+ assert_series_equal(df.loc[0, :, 0].reindex(b.index), b)
+
+ df = df_orig.swapaxes(0, 1)
+ df.loc[:, 0, 0] = b
+ assert_series_equal(df.loc[:, 0, 0].reindex(b.index), b)
+
+ df = df_orig.swapaxes(1, 2)
+ df.loc[0, 0, :] = b
+ assert_series_equal(df.loc[0, 0, :].reindex(b.index), b)
+
+ def test_ix_frame_align(self):
+ p_orig = tm.makePanel()
+ df = p_orig.iloc[0].copy()
+ assert_frame_equal(p_orig['ItemA'], df)
+
+ p = p_orig.copy()
+ p.iloc[0, :, :] = df
+ assert_panel_equal(p, p_orig)
+
+ p = p_orig.copy()
+ p.iloc[0] = df
+ assert_panel_equal(p, p_orig)
+
+ p = p_orig.copy()
+ p.iloc[0, :, :] = df
+ assert_panel_equal(p, p_orig)
+
+ p = p_orig.copy()
+ p.iloc[0] = df
+ assert_panel_equal(p, p_orig)
+
+ p = p_orig.copy()
+ p.loc['ItemA'] = df
+ assert_panel_equal(p, p_orig)
+
+ p = p_orig.copy()
+ p.loc['ItemA', :, :] = df
+ assert_panel_equal(p, p_orig)
+
+ p = p_orig.copy()
+ p['ItemA'] = df
+ assert_panel_equal(p, p_orig)
+
+ p = p_orig.copy()
+ p.iloc[0, [0, 1, 3, 5], -2:] = df
+ out = p.iloc[0, [0, 1, 3, 5], -2:]
+ assert_frame_equal(out, df.iloc[[0, 1, 3, 5], [2, 3]])
+
+        # GH3830, panel assignment by values/frame
+ for dtype in ['float64', 'int64']:
+
+ panel = Panel(np.arange(40).reshape((2, 4, 5)),
+ items=['a1', 'a2'], dtype=dtype)
+ df1 = panel.iloc[0]
+ df2 = panel.iloc[1]
+
+ tm.assert_frame_equal(panel.loc['a1'], df1)
+ tm.assert_frame_equal(panel.loc['a2'], df2)
+
+ # Assignment by Value Passes for 'a2'
+ panel.loc['a2'] = df1.values
+ tm.assert_frame_equal(panel.loc['a1'], df1)
+ tm.assert_frame_equal(panel.loc['a2'], df1)
+
+            # Assignment by DataFrame works without .loc for 'a2'
+ panel['a2'] = df2
+ tm.assert_frame_equal(panel.loc['a1'], df1)
+ tm.assert_frame_equal(panel.loc['a2'], df2)
+
+            # Assignment by DataFrame via .loc also works for 'a2'
+ panel.loc['a2'] = df2
+ tm.assert_frame_equal(panel.loc['a1'], df1)
+ tm.assert_frame_equal(panel.loc['a2'], df2)
+
+ def _check_view(self, indexer, comp):
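+        # zero the values through the .loc result; a true view propagates
+        # the change back into the copied panel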
+ cp = self.panel.copy()
+ obj = cp.loc[indexer]
+ obj.values[:] = 0
+ assert (obj.values == 0).all()
+ comp(cp.loc[indexer].reindex_like(obj), obj)
+
+ def test_logical_with_nas(self):
+ d = Panel({'ItemA': {'a': [np.nan, False]},
+ 'ItemB': {'a': [True, True]}})
+
+ result = d['ItemA'] | d['ItemB']
+ expected = DataFrame({'a': [np.nan, True]})
+ assert_frame_equal(result, expected)
+
+ # this is autodowncasted here
+ result = d['ItemA'].fillna(False) | d['ItemB']
+ expected = DataFrame({'a': [True, True]})
+ assert_frame_equal(result, expected)
+
+ def test_neg(self):
+ assert_panel_equal(-self.panel, -1 * self.panel)
+
+ def test_invert(self):
+ assert_panel_equal(-(self.panel < 0), ~(self.panel < 0))
+
+ def test_comparisons(self):
+ p1 = tm.makePanel()
+ p2 = tm.makePanel()
+
+ tp = p1.reindex(items=p1.items + ['foo'])
+ df = p1[p1.items[0]]
+
+ def test_comp(func):
+
+ # versus same index
+ result = func(p1, p2)
+ tm.assert_numpy_array_equal(result.values,
+ func(p1.values, p2.values))
+
+ # versus non-indexed same objs
+ pytest.raises(Exception, func, p1, tp)
+
+ # versus different objs
+ pytest.raises(Exception, func, p1, df)
+
+ # versus scalar
+ result3 = func(self.panel, 0)
+ tm.assert_numpy_array_equal(result3.values,
+ func(self.panel.values, 0))
+
+ with np.errstate(invalid='ignore'):
+ test_comp(operator.eq)
+ test_comp(operator.ne)
+ test_comp(operator.lt)
+ test_comp(operator.gt)
+ test_comp(operator.ge)
+ test_comp(operator.le)
+
+ def test_get_value(self):
+ for item in self.panel.items:
+ for mjr in self.panel.major_axis[::2]:
+ for mnr in self.panel.minor_axis:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = self.panel.get_value(item, mjr, mnr)
+ expected = self.panel[item][mnr][mjr]
+ assert_almost_equal(result, expected)
+ with catch_warnings():
+ simplefilter("ignore", FutureWarning)
+ msg = "There must be an argument for each axis"
+ with pytest.raises(TypeError, match=msg):
+ self.panel.get_value('a')
+
+ def test_set_value(self):
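+        # set_value is deprecated; the write must still be visible via
+        # plain indexing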
+ for item in self.panel.items:
+ for mjr in self.panel.major_axis[::2]:
+ for mnr in self.panel.minor_axis:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ self.panel.set_value(item, mjr, mnr, 1.)
+ tm.assert_almost_equal(self.panel[item][mnr][mjr], 1.)
+
+ # resize
+ with catch_warnings():
+ simplefilter("ignore", FutureWarning)
+ res = self.panel.set_value('ItemE', 'foo', 'bar', 1.5)
+ assert isinstance(res, Panel)
+ assert res is not self.panel
+ assert res.get_value('ItemE', 'foo', 'bar') == 1.5
+
+ res3 = self.panel.set_value('ItemE', 'foobar', 'baz', 5)
+ assert is_float_dtype(res3['ItemE'].values)
+
+ msg = ("There must be an argument for each "
+ "axis plus the value provided")
+ with pytest.raises(TypeError, match=msg):
+ self.panel.set_value('a')
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class TestPanel(PanelTests, CheckIndexing, SafeForLongAndSparse,
+ SafeForSparse):
+
+ def setup_method(self, method):
+ self.panel = make_test_panel()
+ self.panel.major_axis.name = None
+ self.panel.minor_axis.name = None
+ self.panel.items.name = None
+
+ def test_constructor(self):
+ # with BlockManager
+ wp = Panel(self.panel._data)
+ assert wp._data is self.panel._data
+
+ wp = Panel(self.panel._data, copy=True)
+ assert wp._data is not self.panel._data
+ tm.assert_panel_equal(wp, self.panel)
+
+        # strings handled properly
+ wp = Panel([[['foo', 'foo', 'foo', ], ['foo', 'foo', 'foo']]])
+ assert wp.values.dtype == np.object_
+
+ vals = self.panel.values
+
+ # no copy
+ wp = Panel(vals)
+ assert wp.values is vals
+
+ # copy
+ wp = Panel(vals, copy=True)
+ assert wp.values is not vals
+
+ # GH #8285, test when scalar data is used to construct a Panel
+ # if dtype is not passed, it should be inferred
+ value_and_dtype = [(1, 'int64'), (3.14, 'float64'),
+ ('foo', np.object_)]
+ for (val, dtype) in value_and_dtype:
+ wp = Panel(val, items=range(2), major_axis=range(3),
+ minor_axis=range(4))
+ vals = np.empty((2, 3, 4), dtype=dtype)
+ vals.fill(val)
+
+ tm.assert_panel_equal(wp, Panel(vals, dtype=dtype))
+
+ # test the case when dtype is passed
+ wp = Panel(1, items=range(2), major_axis=range(3),
+ minor_axis=range(4),
+ dtype='float32')
+ vals = np.empty((2, 3, 4), dtype='float32')
+ vals.fill(1)
+
+ tm.assert_panel_equal(wp, Panel(vals, dtype='float32'))
+
+ def test_constructor_cast(self):
+ zero_filled = self.panel.fillna(0)
+
+ casted = Panel(zero_filled._data, dtype=int)
+ casted2 = Panel(zero_filled.values, dtype=int)
+
+ exp_values = zero_filled.values.astype(int)
+ assert_almost_equal(casted.values, exp_values)
+ assert_almost_equal(casted2.values, exp_values)
+
+ casted = Panel(zero_filled._data, dtype=np.int32)
+ casted2 = Panel(zero_filled.values, dtype=np.int32)
+
+ exp_values = zero_filled.values.astype(np.int32)
+ assert_almost_equal(casted.values, exp_values)
+ assert_almost_equal(casted2.values, exp_values)
+
+ # can't cast
+ data = [[['foo', 'bar', 'baz']]]
+ pytest.raises(ValueError, Panel, data, dtype=float)
+
+ def test_constructor_empty_panel(self):
+ empty = Panel()
+ assert len(empty.items) == 0
+ assert len(empty.major_axis) == 0
+ assert len(empty.minor_axis) == 0
+
+ def test_constructor_observe_dtype(self):
+ # GH #411
+ panel = Panel(items=lrange(3), major_axis=lrange(3),
+ minor_axis=lrange(3), dtype='O')
+ assert panel.values.dtype == np.object_
+
+ def test_constructor_dtypes(self):
+ # GH #797
+
+ def _check_dtype(panel, dtype):
+ for i in panel.items:
+ assert panel[i].values.dtype.name == dtype
+
+        # only NaN-holding dtypes are allowed here
+ for dtype in ['float64', 'float32', 'object']:
+ panel = Panel(items=lrange(2), major_axis=lrange(10),
+ minor_axis=lrange(5), dtype=dtype)
+ _check_dtype(panel, dtype)
+
+ for dtype in ['float64', 'float32', 'int64', 'int32', 'object']:
+ panel = Panel(np.array(np.random.randn(2, 10, 5), dtype=dtype),
+ items=lrange(2),
+ major_axis=lrange(10),
+ minor_axis=lrange(5), dtype=dtype)
+ _check_dtype(panel, dtype)
+
+ for dtype in ['float64', 'float32', 'int64', 'int32', 'object']:
+ panel = Panel(np.array(np.random.randn(2, 10, 5), dtype='O'),
+ items=lrange(2),
+ major_axis=lrange(10),
+ minor_axis=lrange(5), dtype=dtype)
+ _check_dtype(panel, dtype)
+
+ for dtype in ['float64', 'float32', 'int64', 'int32', 'object']:
+ panel = Panel(
+ np.random.randn(2, 10, 5),
+ items=lrange(2), major_axis=lrange(10),
+ minor_axis=lrange(5),
+ dtype=dtype)
+ _check_dtype(panel, dtype)
+
+ for dtype in ['float64', 'float32', 'int64', 'int32', 'object']:
+ df1 = DataFrame(np.random.randn(2, 5),
+ index=lrange(2), columns=lrange(5))
+ df2 = DataFrame(np.random.randn(2, 5),
+ index=lrange(2), columns=lrange(5))
+ panel = Panel.from_dict({'a': df1, 'b': df2}, dtype=dtype)
+ _check_dtype(panel, dtype)
+
+ def test_constructor_fails_with_not_3d_input(self):
+ msg = "The number of dimensions required is 3"
+ with pytest.raises(ValueError, match=msg):
+ Panel(np.random.randn(10, 2))
+
+ def test_consolidate(self):
+ assert self.panel._data.is_consolidated()
+
+ self.panel['foo'] = 1.
+ assert not self.panel._data.is_consolidated()
+
+ panel = self.panel._consolidate()
+ assert panel._data.is_consolidated()
+
+ def test_ctor_dict(self):
+ itema = self.panel['ItemA']
+ itemb = self.panel['ItemB']
+
+ d = {'A': itema, 'B': itemb[5:]}
+ d2 = {'A': itema._series, 'B': itemb[5:]._series}
+ d3 = {'A': None,
+ 'B': DataFrame(itemb[5:]._series),
+ 'C': DataFrame(itema._series)}
+
+ wp = Panel.from_dict(d)
+ wp2 = Panel.from_dict(d2) # nested Dict
+
+ # TODO: unused?
+ wp3 = Panel.from_dict(d3) # noqa
+
+ tm.assert_index_equal(wp.major_axis, self.panel.major_axis)
+ assert_panel_equal(wp, wp2)
+
+ # intersect
+ wp = Panel.from_dict(d, intersect=True)
+ tm.assert_index_equal(wp.major_axis, itemb.index[5:])
+
+ # use constructor
+ assert_panel_equal(Panel(d), Panel.from_dict(d))
+ assert_panel_equal(Panel(d2), Panel.from_dict(d2))
+ assert_panel_equal(Panel(d3), Panel.from_dict(d3))
+
+ # a pathological case
+ d4 = {'A': None, 'B': None}
+
+ # TODO: unused?
+ wp4 = Panel.from_dict(d4) # noqa
+
+ assert_panel_equal(Panel(d4), Panel(items=['A', 'B']))
+
+ # cast
+ dcasted = {k: v.reindex(wp.major_axis).fillna(0)
+ for k, v in compat.iteritems(d)}
+ result = Panel(dcasted, dtype=int)
+ expected = Panel({k: v.astype(int)
+ for k, v in compat.iteritems(dcasted)})
+ assert_panel_equal(result, expected)
+
+ result = Panel(dcasted, dtype=np.int32)
+ expected = Panel({k: v.astype(np.int32)
+ for k, v in compat.iteritems(dcasted)})
+ assert_panel_equal(result, expected)
+
+ def test_constructor_dict_mixed(self):
+ data = {k: v.values for k, v in self.panel.iteritems()}
+ result = Panel(data)
+ exp_major = Index(np.arange(len(self.panel.major_axis)))
+ tm.assert_index_equal(result.major_axis, exp_major)
+
+ result = Panel(data, items=self.panel.items,
+ major_axis=self.panel.major_axis,
+ minor_axis=self.panel.minor_axis)
+ assert_panel_equal(result, self.panel)
+
+ data['ItemC'] = self.panel['ItemC']
+ result = Panel(data)
+ assert_panel_equal(result, self.panel)
+
+ # corner, blow up
+ data['ItemB'] = data['ItemB'][:-1]
+ pytest.raises(Exception, Panel, data)
+
+ data['ItemB'] = self.panel['ItemB'].values[:, :-1]
+ pytest.raises(Exception, Panel, data)
+
+ def test_ctor_orderedDict(self):
+        # 50 unique random int keys
+        keys = list(set(np.random.randint(0, 5000, 100)))[:50]
+ d = OrderedDict([(k, mkdf(10, 5)) for k in keys])
+ p = Panel(d)
+ assert list(p.items) == keys
+
+ p = Panel.from_dict(d)
+ assert list(p.items) == keys
+
+ def test_constructor_resize(self):
+ data = self.panel._data
+ items = self.panel.items[:-1]
+ major = self.panel.major_axis[:-1]
+ minor = self.panel.minor_axis[:-1]
+
+ result = Panel(data, items=items,
+ major_axis=major, minor_axis=minor)
+ expected = self.panel.reindex(
+ items=items, major=major, minor=minor)
+ assert_panel_equal(result, expected)
+
+ result = Panel(data, items=items, major_axis=major)
+ expected = self.panel.reindex(items=items, major=major)
+ assert_panel_equal(result, expected)
+
+ result = Panel(data, items=items)
+ expected = self.panel.reindex(items=items)
+ assert_panel_equal(result, expected)
+
+ result = Panel(data, minor_axis=minor)
+ expected = self.panel.reindex(minor=minor)
+ assert_panel_equal(result, expected)
+
+ def test_from_dict_mixed_orient(self):
+ df = tm.makeDataFrame()
+ df['foo'] = 'bar'
+
+ data = {'k1': df, 'k2': df}
+
+ panel = Panel.from_dict(data, orient='minor')
+
+ assert panel['foo'].values.dtype == np.object_
+ assert panel['A'].values.dtype == np.float64
+
+ def test_constructor_error_msgs(self):
+ msg = (r"Shape of passed values is \(3, 4, 5\), "
+ r"indices imply \(4, 5, 5\)")
+ with pytest.raises(ValueError, match=msg):
+ Panel(np.random.randn(3, 4, 5),
+ lrange(4), lrange(5), lrange(5))
+
+ msg = (r"Shape of passed values is \(3, 4, 5\), "
+ r"indices imply \(5, 4, 5\)")
+ with pytest.raises(ValueError, match=msg):
+ Panel(np.random.randn(3, 4, 5),
+ lrange(5), lrange(4), lrange(5))
+
+ msg = (r"Shape of passed values is \(3, 4, 5\), "
+ r"indices imply \(5, 5, 4\)")
+ with pytest.raises(ValueError, match=msg):
+ Panel(np.random.randn(3, 4, 5),
+ lrange(5), lrange(5), lrange(4))
+
+ def test_conform(self):
+ df = self.panel['ItemA'][:-5].filter(items=['A', 'B'])
+ conformed = self.panel.conform(df)
+
+ tm.assert_index_equal(conformed.index, self.panel.major_axis)
+ tm.assert_index_equal(conformed.columns, self.panel.minor_axis)
+
+ def test_convert_objects(self):
+ # GH 4937
+ p = Panel(dict(A=dict(a=['1', '1.0'])))
+ expected = Panel(dict(A=dict(a=[1, 1.0])))
+ result = p._convert(numeric=True, coerce=True)
+ assert_panel_equal(result, expected)
+
+ def test_dtypes(self):
+
+ result = self.panel.dtypes
+ expected = Series(np.dtype('float64'), index=self.panel.items)
+ assert_series_equal(result, expected)
+
+ def test_astype(self):
+ # GH7271
+ data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
+ panel = Panel(data, ['a', 'b'], ['c', 'd'], ['e', 'f'])
+
+ str_data = np.array([[['1', '2'], ['3', '4']],
+ [['5', '6'], ['7', '8']]])
+ expected = Panel(str_data, ['a', 'b'], ['c', 'd'], ['e', 'f'])
+ assert_panel_equal(panel.astype(str), expected)
+
+ pytest.raises(NotImplementedError, panel.astype, {0: str})
+
+ def test_apply(self):
+ # GH1148
+
+ # ufunc
+ applied = self.panel.apply(np.sqrt)
+ with np.errstate(invalid='ignore'):
+ expected = np.sqrt(self.panel.values)
+ assert_almost_equal(applied.values, expected)
+
+ # ufunc same shape
+ result = self.panel.apply(lambda x: x * 2, axis='items')
+ expected = self.panel * 2
+ assert_panel_equal(result, expected)
+ result = self.panel.apply(lambda x: x * 2, axis='major_axis')
+ expected = self.panel * 2
+ assert_panel_equal(result, expected)
+ result = self.panel.apply(lambda x: x * 2, axis='minor_axis')
+ expected = self.panel * 2
+ assert_panel_equal(result, expected)
+
+ # reduction to DataFrame
+ result = self.panel.apply(lambda x: x.dtype, axis='items')
+ expected = DataFrame(np.dtype('float64'),
+ index=self.panel.major_axis,
+ columns=self.panel.minor_axis)
+ assert_frame_equal(result, expected)
+ result = self.panel.apply(lambda x: x.dtype, axis='major_axis')
+ expected = DataFrame(np.dtype('float64'),
+ index=self.panel.minor_axis,
+ columns=self.panel.items)
+ assert_frame_equal(result, expected)
+ result = self.panel.apply(lambda x: x.dtype, axis='minor_axis')
+ expected = DataFrame(np.dtype('float64'),
+ index=self.panel.major_axis,
+ columns=self.panel.items)
+ assert_frame_equal(result, expected)
+
+ # reductions via other dims
+ expected = self.panel.sum(0)
+ result = self.panel.apply(lambda x: x.sum(), axis='items')
+ assert_frame_equal(result, expected)
+ expected = self.panel.sum(1)
+ result = self.panel.apply(lambda x: x.sum(), axis='major_axis')
+ assert_frame_equal(result, expected)
+ expected = self.panel.sum(2)
+ result = self.panel.apply(lambda x: x.sum(), axis='minor_axis')
+ assert_frame_equal(result, expected)
+
+ # pass kwargs
+ result = self.panel.apply(
+ lambda x, y: x.sum() + y, axis='items', y=5)
+ expected = self.panel.sum(0) + 5
+ assert_frame_equal(result, expected)
+
+ def test_apply_slabs(self):
+
+ # same shape as original
+ result = self.panel.apply(lambda x: x * 2,
+ axis=['items', 'major_axis'])
+ expected = (self.panel * 2).transpose('minor_axis', 'major_axis',
+ 'items')
+ assert_panel_equal(result, expected)
+ result = self.panel.apply(lambda x: x * 2,
+ axis=['major_axis', 'items'])
+ assert_panel_equal(result, expected)
+
+ result = self.panel.apply(lambda x: x * 2,
+ axis=['items', 'minor_axis'])
+ expected = (self.panel * 2).transpose('major_axis', 'minor_axis',
+ 'items')
+ assert_panel_equal(result, expected)
+ result = self.panel.apply(lambda x: x * 2,
+ axis=['minor_axis', 'items'])
+ assert_panel_equal(result, expected)
+
+ result = self.panel.apply(lambda x: x * 2,
+ axis=['major_axis', 'minor_axis'])
+ expected = self.panel * 2
+ assert_panel_equal(result, expected)
+ result = self.panel.apply(lambda x: x * 2,
+ axis=['minor_axis', 'major_axis'])
+ assert_panel_equal(result, expected)
+
+ # reductions
+        result = self.panel.apply(lambda x: x.sum(0),
+                                  axis=['items', 'major_axis'])
+ expected = self.panel.sum(1).T
+ assert_frame_equal(result, expected)
+
+        result = self.panel.apply(lambda x: x.sum(1),
+                                  axis=['items', 'major_axis'])
+ expected = self.panel.sum(0)
+ assert_frame_equal(result, expected)
+
+ # transforms
+ f = lambda x: ((x.T - x.mean(1)) / x.std(1)).T
+
+ # make sure that we don't trigger any warnings
+ result = self.panel.apply(f, axis=['items', 'major_axis'])
+ expected = Panel({ax: f(self.panel.loc[:, :, ax])
+ for ax in self.panel.minor_axis})
+ assert_panel_equal(result, expected)
+
+ result = self.panel.apply(f, axis=['major_axis', 'minor_axis'])
+ expected = Panel({ax: f(self.panel.loc[ax])
+ for ax in self.panel.items})
+ assert_panel_equal(result, expected)
+
+ result = self.panel.apply(f, axis=['minor_axis', 'items'])
+ expected = Panel({ax: f(self.panel.loc[:, ax])
+ for ax in self.panel.major_axis})
+ assert_panel_equal(result, expected)
+
+ # with multi-indexes
+ # GH7469
+        index = MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
+                                        ('two', 'a'), ('two', 'b')])
+ dfa = DataFrame(np.array(np.arange(12, dtype='int64')).reshape(
+ 4, 3), columns=list("ABC"), index=index)
+ dfb = DataFrame(np.array(np.arange(10, 22, dtype='int64')).reshape(
+ 4, 3), columns=list("ABC"), index=index)
+ p = Panel({'f': dfa, 'g': dfb})
+ result = p.apply(lambda x: x.sum(), axis=0)
+
+        # on Windows this will be int32
+ result = result.astype('int64')
+ expected = p.sum(0)
+ assert_frame_equal(result, expected)
+
+ def test_apply_no_or_zero_ndim(self):
+ # GH10332
+ self.panel = Panel(np.random.rand(5, 5, 5))
+
+ result_int = self.panel.apply(lambda df: 0, axis=[1, 2])
+ result_float = self.panel.apply(lambda df: 0.0, axis=[1, 2])
+ result_int64 = self.panel.apply(
+ lambda df: np.int64(0), axis=[1, 2])
+ result_float64 = self.panel.apply(lambda df: np.float64(0.0),
+ axis=[1, 2])
+
+ expected_int = expected_int64 = Series([0] * 5)
+ expected_float = expected_float64 = Series([0.0] * 5)
+
+ assert_series_equal(result_int, expected_int)
+ assert_series_equal(result_int64, expected_int64)
+ assert_series_equal(result_float, expected_float)
+ assert_series_equal(result_float64, expected_float64)
+
+ def test_reindex(self):
+ ref = self.panel['ItemB']
+
+ # items
+ result = self.panel.reindex(items=['ItemA', 'ItemB'])
+ assert_frame_equal(result['ItemB'], ref)
+
+ # major
+ new_major = list(self.panel.major_axis[:10])
+ result = self.panel.reindex(major=new_major)
+ assert_frame_equal(result['ItemB'], ref.reindex(index=new_major))
+
+        # raises an exception when both major and major_axis are passed
+ pytest.raises(Exception, self.panel.reindex,
+ major_axis=new_major,
+ major=new_major)
+
+ # minor
+ new_minor = list(self.panel.minor_axis[:2])
+ result = self.panel.reindex(minor=new_minor)
+ assert_frame_equal(result['ItemB'], ref.reindex(columns=new_minor))
+
+        # raises an exception when both minor and minor_axis are passed
+ pytest.raises(Exception, self.panel.reindex,
+ minor_axis=new_minor,
+ minor=new_minor)
+
+        # this is ok
+ result = self.panel.reindex()
+ assert_panel_equal(result, self.panel)
+ assert result is not self.panel
+
+ # with filling
+ smaller_major = self.panel.major_axis[::5]
+ smaller = self.panel.reindex(major=smaller_major)
+
+ larger = smaller.reindex(major=self.panel.major_axis, method='pad')
+
+ assert_frame_equal(larger.major_xs(self.panel.major_axis[1]),
+ smaller.major_xs(smaller_major[0]))
+
+ # don't necessarily copy
+ result = self.panel.reindex(
+ major=self.panel.major_axis, copy=False)
+ assert_panel_equal(result, self.panel)
+ assert result is self.panel
+
+ def test_reindex_axis_style(self):
+ panel = Panel(np.random.rand(5, 5, 5))
+ expected0 = Panel(panel.values).iloc[[0, 1]]
+ expected1 = Panel(panel.values).iloc[:, [0, 1]]
+ expected2 = Panel(panel.values).iloc[:, :, [0, 1]]
+
+ result = panel.reindex([0, 1], axis=0)
+ assert_panel_equal(result, expected0)
+
+ result = panel.reindex([0, 1], axis=1)
+ assert_panel_equal(result, expected1)
+
+ result = panel.reindex([0, 1], axis=2)
+ assert_panel_equal(result, expected2)
+
+ def test_reindex_multi(self):
+
+ # with and without copy full reindexing
+ result = self.panel.reindex(
+ items=self.panel.items,
+ major=self.panel.major_axis,
+ minor=self.panel.minor_axis, copy=False)
+
+ assert result.items is self.panel.items
+ assert result.major_axis is self.panel.major_axis
+ assert result.minor_axis is self.panel.minor_axis
+
+ result = self.panel.reindex(
+ items=self.panel.items,
+ major=self.panel.major_axis,
+ minor=self.panel.minor_axis, copy=False)
+ assert_panel_equal(result, self.panel)
+
+ # multi-axis indexing consistency
+ # GH 5900
+ df = DataFrame(np.random.randn(4, 3))
+ p = Panel({'Item1': df})
+ expected = Panel({'Item1': df})
+ expected['Item2'] = np.nan
+
+ items = ['Item1', 'Item2']
+ major_axis = np.arange(4)
+ minor_axis = np.arange(3)
+
+ results = []
+ results.append(p.reindex(items=items, major_axis=major_axis,
+ copy=True))
+ results.append(p.reindex(items=items, major_axis=major_axis,
+ copy=False))
+ results.append(p.reindex(items=items, minor_axis=minor_axis,
+ copy=True))
+ results.append(p.reindex(items=items, minor_axis=minor_axis,
+ copy=False))
+ results.append(p.reindex(items=items, major_axis=major_axis,
+ minor_axis=minor_axis, copy=True))
+ results.append(p.reindex(items=items, major_axis=major_axis,
+ minor_axis=minor_axis, copy=False))
+
+ for i, r in enumerate(results):
+ assert_panel_equal(expected, r)
+
+ def test_reindex_like(self):
+ # reindex_like
+ smaller = self.panel.reindex(items=self.panel.items[:-1],
+ major=self.panel.major_axis[:-1],
+ minor=self.panel.minor_axis[:-1])
+ smaller_like = self.panel.reindex_like(smaller)
+ assert_panel_equal(smaller, smaller_like)
+
+ def test_take(self):
+ # axis == 0
+ result = self.panel.take([2, 0, 1], axis=0)
+ expected = self.panel.reindex(items=['ItemC', 'ItemA', 'ItemB'])
+ assert_panel_equal(result, expected)
+
+ # axis >= 1
+ result = self.panel.take([3, 0, 1, 2], axis=2)
+ expected = self.panel.reindex(minor=['D', 'A', 'B', 'C'])
+ assert_panel_equal(result, expected)
+
+ # neg indices ok
+ expected = self.panel.reindex(minor=['D', 'D', 'B', 'C'])
+ result = self.panel.take([3, -1, 1, 2], axis=2)
+ assert_panel_equal(result, expected)
+
+ pytest.raises(Exception, self.panel.take, [4, 0, 1, 2], axis=2)
+
+ def test_sort_index(self):
+ import random
+
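+        # shuffle each axis, then sort_index should restore the original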
+ ritems = list(self.panel.items)
+ rmajor = list(self.panel.major_axis)
+ rminor = list(self.panel.minor_axis)
+ random.shuffle(ritems)
+ random.shuffle(rmajor)
+ random.shuffle(rminor)
+
+ random_order = self.panel.reindex(items=ritems)
+ sorted_panel = random_order.sort_index(axis=0)
+ assert_panel_equal(sorted_panel, self.panel)
+
+ # descending
+ random_order = self.panel.reindex(items=ritems)
+ sorted_panel = random_order.sort_index(axis=0, ascending=False)
+ assert_panel_equal(
+ sorted_panel,
+ self.panel.reindex(items=self.panel.items[::-1]))
+
+ random_order = self.panel.reindex(major=rmajor)
+ sorted_panel = random_order.sort_index(axis=1)
+ assert_panel_equal(sorted_panel, self.panel)
+
+ random_order = self.panel.reindex(minor=rminor)
+ sorted_panel = random_order.sort_index(axis=2)
+ assert_panel_equal(sorted_panel, self.panel)
+
+ def test_fillna(self):
+ filled = self.panel.fillna(0)
+ assert np.isfinite(filled.values).all()
+
+ filled = self.panel.fillna(method='backfill')
+ assert_frame_equal(filled['ItemA'],
+ self.panel['ItemA'].fillna(method='backfill'))
+
+ panel = self.panel.copy()
+ panel['str'] = 'foo'
+
+ filled = panel.fillna(method='backfill')
+ assert_frame_equal(filled['ItemA'],
+ panel['ItemA'].fillna(method='backfill'))
+
+ empty = self.panel.reindex(items=[])
+ filled = empty.fillna(0)
+ assert_panel_equal(filled, empty)
+
+ pytest.raises(ValueError, self.panel.fillna)
+ pytest.raises(ValueError, self.panel.fillna, 5, method='ffill')
+
+ pytest.raises(TypeError, self.panel.fillna, [1, 2])
+ pytest.raises(TypeError, self.panel.fillna, (1, 2))
+
+ # limit not implemented when only value is specified
+ p = Panel(np.random.randn(3, 4, 5))
+ p.iloc[0:2, 0:2, 0:2] = np.nan
+ pytest.raises(NotImplementedError,
+ lambda: p.fillna(999, limit=1))
+
+        # Test in-place fillna
+ # Expected result
+ expected = Panel([[[0, 1], [2, 1]], [[10, 11], [12, 11]]],
+ items=['a', 'b'], minor_axis=['x', 'y'],
+ dtype=np.float64)
+ # method='ffill'
+ p1 = Panel([[[0, 1], [2, np.nan]], [[10, 11], [12, np.nan]]],
+ items=['a', 'b'], minor_axis=['x', 'y'],
+ dtype=np.float64)
+ p1.fillna(method='ffill', inplace=True)
+ assert_panel_equal(p1, expected)
+
+ # method='bfill'
+ p2 = Panel([[[0, np.nan], [2, 1]], [[10, np.nan], [12, 11]]],
+ items=['a', 'b'], minor_axis=['x', 'y'],
+ dtype=np.float64)
+ p2.fillna(method='bfill', inplace=True)
+ assert_panel_equal(p2, expected)
+
+ def test_ffill_bfill(self):
+ assert_panel_equal(self.panel.ffill(),
+ self.panel.fillna(method='ffill'))
+ assert_panel_equal(self.panel.bfill(),
+ self.panel.fillna(method='bfill'))
+
+ def test_truncate_fillna_bug(self):
+ # #1823
+ result = self.panel.truncate(before=None, after=None, axis='items')
+
+ # it works!
+ result.fillna(value=0.0)
+
+ def test_swapaxes(self):
+ result = self.panel.swapaxes('items', 'minor')
+ assert result.items is self.panel.minor_axis
+
+ result = self.panel.swapaxes('items', 'major')
+ assert result.items is self.panel.major_axis
+
+ result = self.panel.swapaxes('major', 'minor')
+ assert result.major_axis is self.panel.minor_axis
+
+ panel = self.panel.copy()
+ result = panel.swapaxes('major', 'minor')
+ panel.values[0, 0, 1] = np.nan
+ expected = panel.swapaxes('major', 'minor')
+ assert_panel_equal(result, expected)
+
+ # this should also work
+ result = self.panel.swapaxes(0, 1)
+ assert result.items is self.panel.major_axis
+
+        # this works, but returns a copy
+ result = self.panel.swapaxes('items', 'items')
+ assert_panel_equal(self.panel, result)
+ assert id(self.panel) != id(result)
+
+ def test_transpose(self):
+ result = self.panel.transpose('minor', 'major', 'items')
+ expected = self.panel.swapaxes('items', 'minor')
+ assert_panel_equal(result, expected)
+
+ # test kwargs
+ result = self.panel.transpose(items='minor', major='major',
+ minor='items')
+ expected = self.panel.swapaxes('items', 'minor')
+ assert_panel_equal(result, expected)
+
+        # test mixture of args
+ result = self.panel.transpose(
+ 'minor', major='major', minor='items')
+ expected = self.panel.swapaxes('items', 'minor')
+ assert_panel_equal(result, expected)
+
+ result = self.panel.transpose('minor',
+ 'major',
+ minor='items')
+ expected = self.panel.swapaxes('items', 'minor')
+ assert_panel_equal(result, expected)
+
+ # duplicate axes
+ with pytest.raises(TypeError,
+ match='not enough/duplicate arguments'):
+ self.panel.transpose('minor', maj='major', minor='items')
+
+ with pytest.raises(ValueError,
+ match='repeated axis in transpose'):
+ self.panel.transpose('minor', 'major', major='minor',
+ minor='items')
+
+ result = self.panel.transpose(2, 1, 0)
+ assert_panel_equal(result, expected)
+
+ result = self.panel.transpose('minor', 'items', 'major')
+ expected = self.panel.swapaxes('items', 'minor')
+ expected = expected.swapaxes('major', 'minor')
+ assert_panel_equal(result, expected)
+
+ result = self.panel.transpose(2, 0, 1)
+ assert_panel_equal(result, expected)
+
+ pytest.raises(ValueError, self.panel.transpose, 0, 0, 1)
+
+ def test_transpose_copy(self):
+ panel = self.panel.copy()
+ result = panel.transpose(2, 0, 1, copy=True)
+ expected = panel.swapaxes('items', 'minor')
+ expected = expected.swapaxes('major', 'minor')
+ assert_panel_equal(result, expected)
+
+ panel.values[0, 1, 1] = np.nan
+ assert notna(result.values[1, 0, 1])
+
+ def test_to_frame(self):
+ # filtered
+ filtered = self.panel.to_frame()
+ expected = self.panel.to_frame().dropna(how='any')
+ assert_frame_equal(filtered, expected)
+
+ # unfiltered
+ unfiltered = self.panel.to_frame(filter_observations=False)
+ assert_panel_equal(unfiltered.to_panel(), self.panel)
+
+ # names
+ assert unfiltered.index.names == ('major', 'minor')
+
+ # unsorted, round trip
+ df = self.panel.to_frame(filter_observations=False)
+ unsorted = df.take(np.random.permutation(len(df)))
+ pan = unsorted.to_panel()
+ assert_panel_equal(pan, self.panel)
+
+ # preserve original index names
+ df = DataFrame(np.random.randn(6, 2),
+ index=[['a', 'a', 'b', 'b', 'c', 'c'],
+ [0, 1, 0, 1, 0, 1]],
+ columns=['one', 'two'])
+ df.index.names = ['foo', 'bar']
+ df.columns.name = 'baz'
+
+ rdf = df.to_panel().to_frame()
+ assert rdf.index.names == df.index.names
+ assert rdf.columns.names == df.columns.names
+
+ def test_to_frame_mixed(self):
+ panel = self.panel.fillna(0)
+ panel['str'] = 'foo'
+ panel['bool'] = panel['ItemA'] > 0
+
+ lp = panel.to_frame()
+ wp = lp.to_panel()
+ assert wp['bool'].values.dtype == np.bool_
+ # Previously, this was mutating the underlying
+ # index and changing its name
+ assert_frame_equal(wp['bool'], panel['bool'], check_names=False)
+
+ # GH 8704
+ # with categorical
+ df = panel.to_frame()
+ df['category'] = df['str'].astype('category')
+
+ # to_panel
+ # TODO: this converts back to object
+ p = df.to_panel()
+ expected = panel.copy()
+ expected['category'] = 'foo'
+ assert_panel_equal(p, expected)
+
+ def test_to_frame_multi_major(self):
+ idx = MultiIndex.from_tuples(
+ [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')])
+ df = DataFrame([[1, 'a', 1], [2, 'b', 1],
+ [3, 'c', 1], [4, 'd', 1]],
+ columns=['A', 'B', 'C'], index=idx)
+ wp = Panel({'i1': df, 'i2': df})
+ expected_idx = MultiIndex.from_tuples(
+ [
+ (1, 'one', 'A'), (1, 'one', 'B'),
+ (1, 'one', 'C'), (1, 'two', 'A'),
+ (1, 'two', 'B'), (1, 'two', 'C'),
+ (2, 'one', 'A'), (2, 'one', 'B'),
+ (2, 'one', 'C'), (2, 'two', 'A'),
+ (2, 'two', 'B'), (2, 'two', 'C')
+ ],
+ names=[None, None, 'minor'])
+ expected = DataFrame({'i1': [1, 'a', 1, 2, 'b', 1, 3,
+ 'c', 1, 4, 'd', 1],
+ 'i2': [1, 'a', 1, 2, 'b',
+ 1, 3, 'c', 1, 4, 'd', 1]},
+ index=expected_idx)
+ result = wp.to_frame()
+ assert_frame_equal(result, expected)
+
+ wp.iloc[0, 0].iloc[0] = np.nan # BUG on setting. GH #5773
+ result = wp.to_frame()
+ assert_frame_equal(result, expected[1:])
+
+ idx = MultiIndex.from_tuples(
+ [(1, 'two'), (1, 'one'), (2, 'one'), (np.nan, 'two')])
+ df = DataFrame([[1, 'a', 1], [2, 'b', 1],
+ [3, 'c', 1], [4, 'd', 1]],
+ columns=['A', 'B', 'C'], index=idx)
+ wp = Panel({'i1': df, 'i2': df})
+ ex_idx = MultiIndex.from_tuples([(1, 'two', 'A'), (1, 'two', 'B'),
+ (1, 'two', 'C'),
+ (1, 'one', 'A'),
+ (1, 'one', 'B'),
+ (1, 'one', 'C'),
+ (2, 'one', 'A'),
+ (2, 'one', 'B'),
+ (2, 'one', 'C'),
+ (np.nan, 'two', 'A'),
+ (np.nan, 'two', 'B'),
+ (np.nan, 'two', 'C')],
+ names=[None, None, 'minor'])
+ expected.index = ex_idx
+ result = wp.to_frame()
+ assert_frame_equal(result, expected)
+
+ def test_to_frame_multi_major_minor(self):
+ cols = MultiIndex(levels=[['C_A', 'C_B'], ['C_1', 'C_2']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
+        idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'),
+                                      (2, 'two'), (3, 'three'), (4, 'four')])
+ df = DataFrame([[1, 2, 11, 12], [3, 4, 13, 14],
+ ['a', 'b', 'w', 'x'],
+ ['c', 'd', 'y', 'z'], [-1, -2, -3, -4],
+ [-5, -6, -7, -8]], columns=cols, index=idx)
+ wp = Panel({'i1': df, 'i2': df})
+
+ exp_idx = MultiIndex.from_tuples(
+ [(1, 'one', 'C_A', 'C_1'), (1, 'one', 'C_A', 'C_2'),
+ (1, 'one', 'C_B', 'C_1'), (1, 'one', 'C_B', 'C_2'),
+ (1, 'two', 'C_A', 'C_1'), (1, 'two', 'C_A', 'C_2'),
+ (1, 'two', 'C_B', 'C_1'), (1, 'two', 'C_B', 'C_2'),
+ (2, 'one', 'C_A', 'C_1'), (2, 'one', 'C_A', 'C_2'),
+ (2, 'one', 'C_B', 'C_1'), (2, 'one', 'C_B', 'C_2'),
+ (2, 'two', 'C_A', 'C_1'), (2, 'two', 'C_A', 'C_2'),
+ (2, 'two', 'C_B', 'C_1'), (2, 'two', 'C_B', 'C_2'),
+ (3, 'three', 'C_A', 'C_1'), (3, 'three', 'C_A', 'C_2'),
+ (3, 'three', 'C_B', 'C_1'), (3, 'three', 'C_B', 'C_2'),
+ (4, 'four', 'C_A', 'C_1'), (4, 'four', 'C_A', 'C_2'),
+ (4, 'four', 'C_B', 'C_1'), (4, 'four', 'C_B', 'C_2')],
+ names=[None, None, None, None])
+        exp_val = [[1, 1], [2, 2], [11, 11], [12, 12],
+                   [3, 3], [4, 4], [13, 13], [14, 14],
+                   ['a', 'a'], ['b', 'b'], ['w', 'w'], ['x', 'x'],
+                   ['c', 'c'], ['d', 'd'], ['y', 'y'], ['z', 'z'],
+                   [-1, -1], [-2, -2], [-3, -3], [-4, -4],
+                   [-5, -5], [-6, -6], [-7, -7], [-8, -8]]
+ result = wp.to_frame()
+ expected = DataFrame(exp_val, columns=['i1', 'i2'], index=exp_idx)
+ assert_frame_equal(result, expected)
+
+ def test_to_frame_multi_drop_level(self):
+ idx = MultiIndex.from_tuples([(1, 'one'), (2, 'one'), (2, 'two')])
+ df = DataFrame({'A': [np.nan, 1, 2]}, index=idx)
+ wp = Panel({'i1': df, 'i2': df})
+ result = wp.to_frame()
+ exp_idx = MultiIndex.from_tuples(
+ [(2, 'one', 'A'), (2, 'two', 'A')],
+ names=[None, None, 'minor'])
+ expected = DataFrame({'i1': [1., 2], 'i2': [1., 2]}, index=exp_idx)
+ assert_frame_equal(result, expected)
+
+ def test_to_panel_na_handling(self):
+ df = DataFrame(np.random.randint(0, 10, size=20).reshape((10, 2)),
+ index=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
+ [0, 1, 2, 3, 4, 5, 2, 3, 4, 5]])
+
+ panel = df.to_panel()
+ assert isna(panel[0].loc[1, [0, 1]]).all()
+
+ def test_to_panel_duplicates(self):
+ # #2441
+ df = DataFrame({'a': [0, 0, 1], 'b': [1, 1, 1], 'c': [1, 2, 3]})
+ idf = df.set_index(['a', 'b'])
+
+ with pytest.raises(ValueError, match='non-uniquely indexed'):
+ idf.to_panel()
+
+ def test_panel_dups(self):
+
+ # GH 4960
+ # duplicates in an index
+
+ # items
+ data = np.random.randn(5, 100, 5)
+ no_dup_panel = Panel(data, items=list("ABCDE"))
+ panel = Panel(data, items=list("AACDE"))
+
+ expected = no_dup_panel['A']
+ result = panel.iloc[0]
+ assert_frame_equal(result, expected)
+
+ expected = no_dup_panel['E']
+ result = panel.loc['E']
+ assert_frame_equal(result, expected)
+
+ expected = no_dup_panel.loc[['A', 'B']]
+ expected.items = ['A', 'A']
+ result = panel.loc['A']
+ assert_panel_equal(result, expected)
+
+ # major
+ data = np.random.randn(5, 5, 5)
+ no_dup_panel = Panel(data, major_axis=list("ABCDE"))
+ panel = Panel(data, major_axis=list("AACDE"))
+
+ expected = no_dup_panel.loc[:, 'A']
+ result = panel.iloc[:, 0]
+ assert_frame_equal(result, expected)
+
+ expected = no_dup_panel.loc[:, 'E']
+ result = panel.loc[:, 'E']
+ assert_frame_equal(result, expected)
+
+ expected = no_dup_panel.loc[:, ['A', 'B']]
+ expected.major_axis = ['A', 'A']
+ result = panel.loc[:, 'A']
+ assert_panel_equal(result, expected)
+
+ # minor
+ data = np.random.randn(5, 100, 5)
+ no_dup_panel = Panel(data, minor_axis=list("ABCDE"))
+ panel = Panel(data, minor_axis=list("AACDE"))
+
+ expected = no_dup_panel.loc[:, :, 'A']
+ result = panel.iloc[:, :, 0]
+ assert_frame_equal(result, expected)
+
+ expected = no_dup_panel.loc[:, :, 'E']
+ result = panel.loc[:, :, 'E']
+ assert_frame_equal(result, expected)
+
+ expected = no_dup_panel.loc[:, :, ['A', 'B']]
+ expected.minor_axis = ['A', 'A']
+ result = panel.loc[:, :, 'A']
+ assert_panel_equal(result, expected)
+
+ def test_filter(self):
+ pass
+
+ def test_compound(self):
+ compounded = self.panel.compound()
+
+ assert_series_equal(compounded['ItemA'],
+ (1 + self.panel['ItemA']).product(0) - 1,
+ check_names=False)
+
+ def test_shift(self):
+ # major
+ idx = self.panel.major_axis[0]
+ idx_lag = self.panel.major_axis[1]
+ shifted = self.panel.shift(1)
+ assert_frame_equal(self.panel.major_xs(idx),
+ shifted.major_xs(idx_lag))
+
+ # minor
+ idx = self.panel.minor_axis[0]
+ idx_lag = self.panel.minor_axis[1]
+ shifted = self.panel.shift(1, axis='minor')
+ assert_frame_equal(self.panel.minor_xs(idx),
+ shifted.minor_xs(idx_lag))
+
+ # items
+ idx = self.panel.items[0]
+ idx_lag = self.panel.items[1]
+ shifted = self.panel.shift(1, axis='items')
+ assert_frame_equal(self.panel[idx], shifted[idx_lag])
+
+ # negative numbers, #2164
+ result = self.panel.shift(-1)
+ expected = Panel({i: f.shift(-1)[:-1]
+ for i, f in self.panel.iteritems()})
+ assert_panel_equal(result, expected)
+
+ # mixed dtypes #6959
+ data = [('item ' + ch, makeMixedDataFrame())
+ for ch in list('abcde')]
+ data = dict(data)
+ mixed_panel = Panel.from_dict(data, orient='minor')
+ shifted = mixed_panel.shift(1)
+ assert_series_equal(mixed_panel.dtypes, shifted.dtypes)
+
+ def test_tshift(self):
+ # PeriodIndex
+ ps = tm.makePeriodPanel()
+ shifted = ps.tshift(1)
+ unshifted = shifted.tshift(-1)
+
+ assert_panel_equal(unshifted, ps)
+
+ shifted2 = ps.tshift(freq='B')
+ assert_panel_equal(shifted, shifted2)
+
+ shifted3 = ps.tshift(freq=BDay())
+ assert_panel_equal(shifted, shifted3)
+
+ with pytest.raises(ValueError, match='does not match'):
+ ps.tshift(freq='M')
+
+ # DatetimeIndex
+ panel = make_test_panel()
+ shifted = panel.tshift(1)
+ unshifted = shifted.tshift(-1)
+
+ assert_panel_equal(panel, unshifted)
+
+ shifted2 = panel.tshift(freq=panel.major_axis.freq)
+ assert_panel_equal(shifted, shifted2)
+
+ inferred_ts = Panel(panel.values, items=panel.items,
+ major_axis=Index(np.asarray(panel.major_axis)),
+ minor_axis=panel.minor_axis)
+ shifted = inferred_ts.tshift(1)
+ unshifted = shifted.tshift(-1)
+ assert_panel_equal(shifted, panel.tshift(1))
+ assert_panel_equal(unshifted, inferred_ts)
+
+ no_freq = panel.iloc[:, [0, 5, 7], :]
+ pytest.raises(ValueError, no_freq.tshift)
+
+ def test_pct_change(self):
+ df1 = DataFrame({'c1': [1, 2, 5], 'c2': [3, 4, 6]})
+ df2 = df1 + 1
+ df3 = DataFrame({'c1': [3, 4, 7], 'c2': [5, 6, 8]})
+ wp = Panel({'i1': df1, 'i2': df2, 'i3': df3})
+ # major, 1
+ result = wp.pct_change() # axis='major'
+ expected = Panel({'i1': df1.pct_change(),
+ 'i2': df2.pct_change(),
+ 'i3': df3.pct_change()})
+ assert_panel_equal(result, expected)
+ result = wp.pct_change(axis=1)
+ assert_panel_equal(result, expected)
+ # major, 2
+ result = wp.pct_change(periods=2)
+ expected = Panel({'i1': df1.pct_change(2),
+ 'i2': df2.pct_change(2),
+ 'i3': df3.pct_change(2)})
+ assert_panel_equal(result, expected)
+ # minor, 1
+ result = wp.pct_change(axis='minor')
+ expected = Panel({'i1': df1.pct_change(axis=1),
+ 'i2': df2.pct_change(axis=1),
+ 'i3': df3.pct_change(axis=1)})
+ assert_panel_equal(result, expected)
+ result = wp.pct_change(axis=2)
+ assert_panel_equal(result, expected)
+ # minor, 2
+ result = wp.pct_change(periods=2, axis='minor')
+ expected = Panel({'i1': df1.pct_change(periods=2, axis=1),
+ 'i2': df2.pct_change(periods=2, axis=1),
+ 'i3': df3.pct_change(periods=2, axis=1)})
+ assert_panel_equal(result, expected)
+ # items, 1
+ result = wp.pct_change(axis='items')
+ expected = Panel(
+ {'i1': DataFrame({'c1': [np.nan, np.nan, np.nan],
+ 'c2': [np.nan, np.nan, np.nan]}),
+ 'i2': DataFrame({'c1': [1, 0.5, .2],
+ 'c2': [1. / 3, 0.25, 1. / 6]}),
+ 'i3': DataFrame({'c1': [.5, 1. / 3, 1. / 6],
+ 'c2': [.25, .2, 1. / 7]})})
+ assert_panel_equal(result, expected)
+ result = wp.pct_change(axis=0)
+ assert_panel_equal(result, expected)
+ # items, 2
+ result = wp.pct_change(periods=2, axis='items')
+ expected = Panel(
+ {'i1': DataFrame({'c1': [np.nan, np.nan, np.nan],
+ 'c2': [np.nan, np.nan, np.nan]}),
+ 'i2': DataFrame({'c1': [np.nan, np.nan, np.nan],
+ 'c2': [np.nan, np.nan, np.nan]}),
+ 'i3': DataFrame({'c1': [2, 1, .4],
+ 'c2': [2. / 3, .5, 1. / 3]})})
+ assert_panel_equal(result, expected)
+
+ def test_round(self):
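+        # includes exact halves (3.5, 94.5, -76.5); np.around rounds
+        # half to even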
+ values = [[[-3.2, 2.2], [0, -4.8213], [3.123, 123.12],
+ [-1566.213, 88.88], [-12, 94.5]],
+ [[-5.82, 3.5], [6.21, -73.272], [-9.087, 23.12],
+ [272.212, -99.99], [23, -76.5]]]
+ evalues = [[[float(np.around(i)) for i in j] for j in k]
+ for k in values]
+ p = Panel(values, items=['Item1', 'Item2'],
+ major_axis=date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B'])
+ expected = Panel(evalues, items=['Item1', 'Item2'],
+ major_axis=date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B'])
+ result = p.round()
+ assert_panel_equal(expected, result)
+
+ def test_numpy_round(self):
+ values = [[[-3.2, 2.2], [0, -4.8213], [3.123, 123.12],
+ [-1566.213, 88.88], [-12, 94.5]],
+ [[-5.82, 3.5], [6.21, -73.272], [-9.087, 23.12],
+ [272.212, -99.99], [23, -76.5]]]
+ evalues = [[[float(np.around(i)) for i in j] for j in k]
+ for k in values]
+ p = Panel(values, items=['Item1', 'Item2'],
+ major_axis=date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B'])
+ expected = Panel(evalues, items=['Item1', 'Item2'],
+ major_axis=date_range('1/1/2000', periods=5),
+ minor_axis=['A', 'B'])
+ result = np.round(p)
+ assert_panel_equal(expected, result)
+
+ msg = "the 'out' parameter is not supported"
+ with pytest.raises(ValueError, match=msg):
+ np.round(p, out=p)
+
+    # Panel is being removed before NumPy enforces this, so just ignore
+ @pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning")
+ def test_multiindex_get(self):
+ ind = MultiIndex.from_tuples(
+ [('a', 1), ('a', 2), ('b', 1), ('b', 2)],
+ names=['first', 'second'])
+ wp = Panel(np.random.random((4, 5, 5)),
+ items=ind,
+ major_axis=np.arange(5),
+ minor_axis=np.arange(5))
+ f1 = wp['a']
+ f2 = wp.loc['a']
+ assert_panel_equal(f1, f2)
+
+ assert (f1.items == [1, 2]).all()
+ assert (f2.items == [1, 2]).all()
+
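+        # smoke test: constructing a subset MultiIndex should not raise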
+ MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)],
+ names=['first', 'second'])
+
+ @pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning")
+ def test_multiindex_blocks(self):
+ ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)],
+ names=['first', 'second'])
+ wp = Panel(self.panel._data)
+ wp.items = ind
+ f1 = wp['a']
+ assert (f1.items == [1, 2]).all()
+
+ f1 = wp[('b', 1)]
+ assert (f1.columns == ['A', 'B', 'C', 'D']).all()
+
+ def test_repr_empty(self):
+ empty = Panel()
+ repr(empty)
+
+    # ignore our own FutureWarning, since Panel is being removed
+ @pytest.mark.filterwarnings("ignore:Using:FutureWarning")
+ def test_rename(self):
+ mapper = {'ItemA': 'foo', 'ItemB': 'bar', 'ItemC': 'baz'}
+
+ renamed = self.panel.rename(items=mapper)
+ exp = Index(['foo', 'bar', 'baz'])
+ tm.assert_index_equal(renamed.items, exp)
+
+ renamed = self.panel.rename(minor_axis=str.lower)
+ exp = Index(['a', 'b', 'c', 'd'])
+ tm.assert_index_equal(renamed.minor_axis, exp)
+
+ # don't copy
+ renamed_nocopy = self.panel.rename(items=mapper, copy=False)
+ renamed_nocopy['foo'] = 3.
+ assert (self.panel['ItemA'].values == 3).all()
+
+ def test_get_attr(self):
+ assert_frame_equal(self.panel['ItemA'], self.panel.ItemA)
+
+ # specific cases from #3440
+ self.panel['a'] = self.panel['ItemA']
+ assert_frame_equal(self.panel['a'], self.panel.a)
+ self.panel['i'] = self.panel['ItemA']
+ assert_frame_equal(self.panel['i'], self.panel.i)
+
+ def test_from_frame_level1_unsorted(self):
+ tuples = [('MSFT', 3), ('MSFT', 2), ('AAPL', 2), ('AAPL', 1),
+ ('MSFT', 1)]
+ midx = MultiIndex.from_tuples(tuples)
+ df = DataFrame(np.random.rand(5, 4), index=midx)
+ p = df.to_panel()
+ assert_frame_equal(p.minor_xs(2), df.xs(2, level=1).sort_index())
+
+ def test_to_excel(self):
+ try:
+ import xlwt # noqa
+ import xlrd # noqa
+ import openpyxl # noqa
+ from pandas.io.excel import ExcelFile
+ except ImportError:
+ pytest.skip("need xlwt xlrd openpyxl")
+
+ for ext in ['xls', 'xlsx']:
+ with ensure_clean('__tmp__.' + ext) as path:
+ self.panel.to_excel(path)
+ try:
+ reader = ExcelFile(path)
+ except ImportError:
+ pytest.skip("need xlwt xlrd openpyxl")
+
+ for item, df in self.panel.iteritems():
+ recdf = reader.parse(str(item), index_col=0)
+ assert_frame_equal(df, recdf)
+
+ def test_to_excel_xlsxwriter(self):
+ try:
+ import xlrd # noqa
+ import xlsxwriter # noqa
+ from pandas.io.excel import ExcelFile
+ except ImportError:
+ pytest.skip("Requires xlrd and xlsxwriter. Skipping test.")
+
+ with ensure_clean('__tmp__.xlsx') as path:
+ self.panel.to_excel(path, engine='xlsxwriter')
+ try:
+ reader = ExcelFile(path)
+ except ImportError as e:
+ pytest.skip("cannot write excel file: %s" % e)
+
+ for item, df in self.panel.iteritems():
+ recdf = reader.parse(str(item), index_col=0)
+ assert_frame_equal(df, recdf)
+
+ @pytest.mark.filterwarnings("ignore:'.reindex:FutureWarning")
+ def test_dropna(self):
+ p = Panel(np.random.randn(4, 5, 6), major_axis=list('abcde'))
+ p.loc[:, ['b', 'd'], 0] = np.nan
+
+ result = p.dropna(axis=1)
+ exp = p.loc[:, ['a', 'c', 'e'], :]
+ assert_panel_equal(result, exp)
+ inp = p.copy()
+ inp.dropna(axis=1, inplace=True)
+ assert_panel_equal(inp, exp)
+
+ result = p.dropna(axis=1, how='all')
+ assert_panel_equal(result, p)
+
+ p.loc[:, ['b', 'd'], :] = np.nan
+ result = p.dropna(axis=1, how='all')
+ exp = p.loc[:, ['a', 'c', 'e'], :]
+ assert_panel_equal(result, exp)
+
+ p = Panel(np.random.randn(4, 5, 6), items=list('abcd'))
+ p.loc[['b'], :, 0] = np.nan
+
+ result = p.dropna()
+ exp = p.loc[['a', 'c', 'd']]
+ assert_panel_equal(result, exp)
+
+ result = p.dropna(how='all')
+ assert_panel_equal(result, p)
+
+ p.loc['b'] = np.nan
+ result = p.dropna(how='all')
+ exp = p.loc[['a', 'c', 'd']]
+ assert_panel_equal(result, exp)
+
+ def test_drop(self):
+ df = DataFrame({"A": [1, 2], "B": [3, 4]})
+ panel = Panel({"One": df, "Two": df})
+
+ def check_drop(drop_val, axis_number, aliases, expected):
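+            # dropping via the numeric axis and each string alias must agree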
+ try:
+ actual = panel.drop(drop_val, axis=axis_number)
+ assert_panel_equal(actual, expected)
+ for alias in aliases:
+ actual = panel.drop(drop_val, axis=alias)
+ assert_panel_equal(actual, expected)
+ except AssertionError:
+ pprint_thing("Failed with axis_number %d and aliases: %s" %
+ (axis_number, aliases))
+ raise
+ # Items
+ expected = Panel({"One": df})
+ check_drop('Two', 0, ['items'], expected)
+
+ pytest.raises(KeyError, panel.drop, 'Three')
+
+ # errors = 'ignore'
+ dropped = panel.drop('Three', errors='ignore')
+ assert_panel_equal(dropped, panel)
+ dropped = panel.drop(['Two', 'Three'], errors='ignore')
+ expected = Panel({"One": df})
+ assert_panel_equal(dropped, expected)
+
+ # Major
+ exp_df = DataFrame({"A": [2], "B": [4]}, index=[1])
+ expected = Panel({"One": exp_df, "Two": exp_df})
+ check_drop(0, 1, ['major_axis', 'major'], expected)
+
+ exp_df = DataFrame({"A": [1], "B": [3]}, index=[0])
+ expected = Panel({"One": exp_df, "Two": exp_df})
+ check_drop([1], 1, ['major_axis', 'major'], expected)
+
+ # Minor
+ exp_df = df[['B']]
+ expected = Panel({"One": exp_df, "Two": exp_df})
+ check_drop(["A"], 2, ['minor_axis', 'minor'], expected)
+
+ exp_df = df[['A']]
+ expected = Panel({"One": exp_df, "Two": exp_df})
+ check_drop("B", 2, ['minor_axis', 'minor'], expected)
+
+ def test_update(self):
+ pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]],
+ [[1.5, np.nan, 3.], [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]]])
+
+ other = Panel(
+ [[[3.6, 2., np.nan], [np.nan, np.nan, 7]]], items=[1])
+
+ pan.update(other)
+
+ expected = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.], [1.5, np.nan, 3.]],
+ [[3.6, 2., 3], [1.5, np.nan, 7],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]]])
+
+ assert_panel_equal(pan, expected)
+
+ def test_update_from_dict(self):
+ pan = Panel({'one': DataFrame([[1.5, np.nan, 3],
+ [1.5, np.nan, 3],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]]),
+ 'two': DataFrame([[1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]])})
+
+ other = {'two': DataFrame(
+ [[3.6, 2., np.nan], [np.nan, np.nan, 7]])}
+
+ pan.update(other)
+
+ expected = Panel(
+ {'one': DataFrame([[1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]]),
+ 'two': DataFrame([[3.6, 2., 3],
+ [1.5, np.nan, 7],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]])
+ }
+ )
+
+ assert_panel_equal(pan, expected)
+
+ def test_update_nooverwrite(self):
+ pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]],
+ [[1.5, np.nan, 3.], [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]]])
+
+ other = Panel(
+ [[[3.6, 2., np.nan], [np.nan, np.nan, 7]]], items=[1])
+
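+        # overwrite=False only fills positions that are NaN in pan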
+ pan.update(other, overwrite=False)
+
+ expected = Panel([[[1.5, np.nan, 3], [1.5, np.nan, 3],
+ [1.5, np.nan, 3.], [1.5, np.nan, 3.]],
+ [[1.5, 2., 3.], [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]]])
+
+ assert_panel_equal(pan, expected)
+
+ def test_update_filtered(self):
+ pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]],
+ [[1.5, np.nan, 3.], [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]]])
+
+ other = Panel(
+ [[[3.6, 2., np.nan], [np.nan, np.nan, 7]]], items=[1])
+
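+        # filter_func selects which existing values may be replaced
+        # (here: values > 2); NaNs in other never overwrite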
+ pan.update(other, filter_func=lambda x: x > 2)
+
+ expected = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.], [1.5, np.nan, 3.]],
+ [[1.5, np.nan, 3], [1.5, np.nan, 7],
+ [1.5, np.nan, 3.], [1.5, np.nan, 3.]]])
+
+ assert_panel_equal(pan, expected)
+
+ @pytest.mark.parametrize('bad_kwarg, exception, msg', [
+ # errors must be 'ignore' or 'raise'
+ ({'errors': 'something'}, ValueError, 'The parameter errors must.*'),
+ ({'join': 'inner'}, NotImplementedError, 'Only left join is supported')
+ ])
+ def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
+ pan = Panel([[[1.5, np.nan, 3.]]])
+ with pytest.raises(exception, match=msg):
+ pan.update(pan, **bad_kwarg)
+
+ def test_update_raise_on_overlap(self):
+ pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]],
+ [[1.5, np.nan, 3.], [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.],
+ [1.5, np.nan, 3.]]])
+
+ with pytest.raises(ValueError, match='Data overlaps'):
+ pan.update(pan, errors='raise')
+
+ @pytest.mark.parametrize('raise_conflict', [True, False])
+ def test_update_deprecation(self, raise_conflict):
+ pan = Panel([[[1.5, np.nan, 3.]]])
+ other = Panel([[[]]])
+ with tm.assert_produces_warning(FutureWarning):
+ pan.update(other, raise_conflict=raise_conflict)
+
+ def test_all_any(self):
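+        # reductions along each axis must match the nanops results
+        # (transposed for axes 1 and 2)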
+ assert (self.panel.all(axis=0).values == nanall(
+ self.panel, axis=0)).all()
+ assert (self.panel.all(axis=1).values == nanall(
+ self.panel, axis=1).T).all()
+ assert (self.panel.all(axis=2).values == nanall(
+ self.panel, axis=2).T).all()
+ assert (self.panel.any(axis=0).values == nanany(
+ self.panel, axis=0)).all()
+ assert (self.panel.any(axis=1).values == nanany(
+ self.panel, axis=1).T).all()
+ assert (self.panel.any(axis=2).values == nanany(
+ self.panel, axis=2).T).all()
+
+ def test_all_any_unhandled(self):
+ pytest.raises(NotImplementedError, self.panel.all, bool_only=True)
+ pytest.raises(NotImplementedError, self.panel.any, bool_only=True)
+
+ # GH issue 15960
+ def test_sort_values(self):
+ pytest.raises(NotImplementedError, self.panel.sort_values)
+ pytest.raises(NotImplementedError, self.panel.sort_values, 'ItemA')
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+class TestPanelFrame(object):
+ """
+ Check that conversions to and from Panel to DataFrame work.
+ """
+
+ def setup_method(self, method):
+ panel = make_test_panel()
+ self.panel = panel.to_frame()
+ self.unfiltered_panel = panel.to_frame(filter_observations=False)
+
+ def test_ops_differently_indexed(self):
+ # trying to set non-identically indexed panel
+ wp = self.panel.to_panel()
+ wp2 = wp.reindex(major=wp.major_axis[:-1])
+ lp2 = wp2.to_frame()
+
+ result = self.panel + lp2
+ assert_frame_equal(result.reindex(lp2.index), lp2 * 2)
+
+ # careful, mutation
+ self.panel['foo'] = lp2['ItemA']
+ assert_series_equal(self.panel['foo'].reindex(lp2.index),
+ lp2['ItemA'],
+ check_names=False)
+
+ def test_ops_scalar(self):
+ result = self.panel.mul(2)
+ expected = DataFrame.__mul__(self.panel, 2)
+ assert_frame_equal(result, expected)
+
+ def test_combineFrame(self):
+ wp = self.panel.to_panel()
+ result = self.panel.add(wp['ItemA'].stack(), axis=0)
+ assert_frame_equal(result.to_panel()['ItemA'], wp['ItemA'] * 2)
+
+ def test_combinePanel(self):
+ wp = self.panel.to_panel()
+ result = self.panel.add(self.panel)
+ wide_result = result.to_panel()
+ assert_frame_equal(wp['ItemA'] * 2, wide_result['ItemA'])
+
+        # one item (smoke test: should not raise)
+ result = self.panel.add(self.panel.filter(['ItemA']))
+
+ def test_combine_scalar(self):
+ result = self.panel.mul(2)
+ expected = DataFrame(self.panel._data) * 2
+ assert_frame_equal(result, expected)
+
+ def test_combine_series(self):
+ s = self.panel['ItemA'][:10]
+ result = self.panel.add(s, axis=0)
+ expected = DataFrame.add(self.panel, s, axis=0)
+ assert_frame_equal(result, expected)
+
+ s = self.panel.iloc[5]
+ result = self.panel + s
+ expected = DataFrame.add(self.panel, s, axis=1)
+ assert_frame_equal(result, expected)
+
+ def test_operators(self):
+ wp = self.panel.to_panel()
+ result = (self.panel + 1).to_panel()
+ assert_frame_equal(wp['ItemA'] + 1, result['ItemA'])
+
+ def test_arith_flex_panel(self):
+ ops = ['add', 'sub', 'mul', 'div',
+ 'truediv', 'pow', 'floordiv', 'mod']
+ if not compat.PY3:
+ aliases = {}
+ else:
+ aliases = {'div': 'truediv'}
+ self.panel = self.panel.to_panel()
+
+ for n in [np.random.randint(-50, -1), np.random.randint(1, 50), 0]:
+ for op in ops:
+ alias = aliases.get(op, op)
+ f = getattr(operator, alias)
+ exp = f(self.panel, n)
+ result = getattr(self.panel, op)(n)
+ assert_panel_equal(result, exp, check_panel_type=True)
+
+ # rops
+ r_f = lambda x, y: f(y, x)
+ exp = r_f(self.panel, n)
+ result = getattr(self.panel, 'r' + op)(n)
+ assert_panel_equal(result, exp)
+
+    def test_sort(self):
+        def is_sorted(arr):
+            # non-decreasing over every adjacent pair; codes may repeat,
+            # so use >= with .all() rather than the too-weak (>).any()
+            return (arr[1:] >= arr[:-1]).all()
+
+ sorted_minor = self.panel.sort_index(level=1)
+ assert is_sorted(sorted_minor.index.codes[1])
+
+ sorted_major = sorted_minor.sort_index(level=0)
+ assert is_sorted(sorted_major.index.codes[0])
+
+ def test_to_string(self):
+ buf = StringIO()
+ self.panel.to_string(buf)
+
+ def test_to_sparse(self):
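+        # note: self.panel is a DataFrame here (setup_method assigns
+        # panel.to_frame()), so the isinstance guard below makes this body a
+        # no-op for this class; the raising branch targets Panel instances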
+ if isinstance(self.panel, Panel):
+ msg = 'sparsifying is not supported'
+ with pytest.raises(NotImplementedError, match=msg):
+ self.panel.to_sparse
+
+ def test_truncate(self):
+ dates = self.panel.index.levels[0]
+ start, end = dates[1], dates[5]
+
+ trunced = self.panel.truncate(start, end).to_panel()
+ expected = self.panel.to_panel()['ItemA'].truncate(start, end)
+
+ # TODO truncate drops index.names
+ assert_frame_equal(trunced['ItemA'], expected, check_names=False)
+
+ trunced = self.panel.truncate(before=start).to_panel()
+ expected = self.panel.to_panel()['ItemA'].truncate(before=start)
+
+ # TODO truncate drops index.names
+ assert_frame_equal(trunced['ItemA'], expected, check_names=False)
+
+ trunced = self.panel.truncate(after=end).to_panel()
+ expected = self.panel.to_panel()['ItemA'].truncate(after=end)
+
+ # TODO truncate drops index.names
+ assert_frame_equal(trunced['ItemA'], expected, check_names=False)
+
+ # truncate on dates that aren't in there
+ wp = self.panel.to_panel()
+ new_index = wp.major_axis[::5]
+
+ wp2 = wp.reindex(major=new_index)
+
+ lp2 = wp2.to_frame()
+ lp_trunc = lp2.truncate(wp.major_axis[2], wp.major_axis[-2])
+
+ wp_trunc = wp2.truncate(wp.major_axis[2], wp.major_axis[-2])
+
+ assert_panel_equal(wp_trunc, lp_trunc.to_panel())
+
+        # raises when the truncation bounds are reversed (before > after)
+ pytest.raises(Exception, lp2.truncate, wp.major_axis[-2],
+ wp.major_axis[2])
+
+ def test_axis_dummies(self):
+ from pandas.core.reshape.reshape import make_axis_dummies
+
+ minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8)
+ assert len(minor_dummies.columns) == len(self.panel.index.levels[1])
+
+ major_dummies = make_axis_dummies(self.panel, 'major').astype(np.uint8)
+ assert len(major_dummies.columns) == len(self.panel.index.levels[0])
+
+ mapping = {'A': 'one', 'B': 'one', 'C': 'two', 'D': 'two'}
+
+ transformed = make_axis_dummies(self.panel, 'minor',
+ transform=mapping.get).astype(np.uint8)
+ assert len(transformed.columns) == 2
+ tm.assert_index_equal(transformed.columns, Index(['one', 'two']))
+
+ # TODO: test correctness
+
+ def test_get_dummies(self):
+ from pandas.core.reshape.reshape import get_dummies, make_axis_dummies
+
+ self.panel['Label'] = self.panel.index.codes[1]
+ minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8)
+ dummies = get_dummies(self.panel['Label'])
+ tm.assert_numpy_array_equal(dummies.values, minor_dummies.values)
+
+ def test_mean(self):
+ means = self.panel.mean(level='minor')
+
+ # test versus Panel version
+ wide_means = self.panel.to_panel().mean('major')
+ assert_frame_equal(means, wide_means)
+
+ def test_sum(self):
+ sums = self.panel.sum(level='minor')
+
+ # test versus Panel version
+ wide_sums = self.panel.to_panel().sum('major')
+ assert_frame_equal(sums, wide_sums)
+
+ def test_count(self):
+ index = self.panel.index
+
+ major_count = self.panel.count(level=0)['ItemA']
+ level_codes = index.codes[0]
+ for i, idx in enumerate(index.levels[0]):
+ assert major_count[i] == (level_codes == i).sum()
+
+ minor_count = self.panel.count(level=1)['ItemA']
+ level_codes = index.codes[1]
+ for i, idx in enumerate(index.levels[1]):
+ assert minor_count[i] == (level_codes == i).sum()
+
+ def test_join(self):
+ lp1 = self.panel.filter(['ItemA', 'ItemB'])
+ lp2 = self.panel.filter(['ItemC'])
+
+ joined = lp1.join(lp2)
+
+ assert len(joined.columns) == 3
+
+ pytest.raises(Exception, lp1.join,
+ self.panel.filter(['ItemB', 'ItemC']))
+
+
+def test_panel_index():
+ index = panelm.panel_index([1, 2, 3, 4], [1, 2, 3])
+ expected = MultiIndex.from_arrays([np.tile([1, 2, 3, 4], 3),
+ np.repeat([1, 2, 3], 4)],
+ names=['time', 'panel'])
+ tm.assert_index_equal(index, expected)
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
+def test_panel_np_all():
+ wp = Panel({"A": DataFrame({'b': [1, 2]})})
+ result = np.all(wp)
+ assert result == np.bool_(True)
diff --git a/contrib/python/pandas/py2/pandas/tests/test_register_accessor.py b/contrib/python/pandas/py2/pandas/tests/test_register_accessor.py
new file mode 100644
index 00000000000..acc18ed7ad0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_register_accessor.py
@@ -0,0 +1,89 @@
+import contextlib
+
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
[email protected]
+def ensure_removed(obj, attr):
+ """Ensure that an attribute added to 'obj' during the test is
+ removed when we're done"""
+ try:
+ yield
+ finally:
+ try:
+ delattr(obj, attr)
+ except AttributeError:
+ pass
+ obj._accessors.discard(attr)
+
+
+class MyAccessor(object):
+
+ def __init__(self, obj):
+ self.obj = obj
+ self.item = 'item'
+
+ @property
+ def prop(self):
+ return self.item
+
+ def method(self):
+ return self.item
+
+
[email protected]('obj, registrar', [
+ (pd.Series, pd.api.extensions.register_series_accessor),
+ (pd.DataFrame, pd.api.extensions.register_dataframe_accessor),
+ (pd.Index, pd.api.extensions.register_index_accessor)
+])
+def test_register(obj, registrar):
+ with ensure_removed(obj, 'mine'):
+ before = set(dir(obj))
+ registrar('mine')(MyAccessor)
+ assert obj([]).mine.prop == 'item'
+ after = set(dir(obj))
+ assert (before ^ after) == {'mine'}
+ assert 'mine' in obj._accessors
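+        # (in this pandas version the registrar attaches a CachedAccessor
+        # descriptor named 'mine' to the class, so MyAccessor is instantiated
+        # lazily on first attribute access and cached on the instance)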
+
+
+def test_accessor_works():
+ with ensure_removed(pd.Series, 'mine'):
+ pd.api.extensions.register_series_accessor('mine')(MyAccessor)
+
+ s = pd.Series([1, 2])
+ assert s.mine.obj is s
+
+ assert s.mine.prop == 'item'
+ assert s.mine.method() == 'item'
+
+
+def test_overwrite_warns():
+ # Need to restore mean
+ mean = pd.Series.mean
+ try:
+ with tm.assert_produces_warning(UserWarning) as w:
+ pd.api.extensions.register_series_accessor('mean')(MyAccessor)
+ s = pd.Series([1, 2])
+ assert s.mean.prop == 'item'
+ msg = str(w[0].message)
+ assert 'mean' in msg
+ assert 'MyAccessor' in msg
+ assert 'Series' in msg
+ finally:
+ pd.Series.mean = mean
+
+
+def test_raises_attribute_error():
+
+ with ensure_removed(pd.Series, 'bad'):
+
+ @pd.api.extensions.register_series_accessor("bad")
+ class Bad(object):
+ def __init__(self, data):
+ raise AttributeError("whoops")
+
+ with pytest.raises(AttributeError, match="whoops"):
+ pd.Series([]).bad
diff --git a/contrib/python/pandas/py2/pandas/tests/test_sorting.py b/contrib/python/pandas/py2/pandas/tests/test_sorting.py
new file mode 100644
index 00000000000..5391deed719
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_sorting.py
@@ -0,0 +1,435 @@
+from collections import defaultdict
+from datetime import datetime
+from itertools import product
+import warnings
+
+import numpy as np
+from numpy import nan
+import pytest
+
+from pandas import (
+ DataFrame, MultiIndex, Series, compat, concat, merge, to_datetime)
+from pandas.core import common as com
+from pandas.core.sorting import (
+ decons_group_index, get_group_index, is_int64_overflow_possible,
+ lexsort_indexer, nargsort, safe_sort)
+from pandas.util import testing as tm
+from pandas.util.testing import assert_frame_equal, assert_series_equal
+
+
+class TestSorting(object):
+
+ @pytest.mark.slow
+ def test_int64_overflow(self):
+
+ B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500)))
+ A = np.arange(2500)
+ df = DataFrame({'A': A,
+ 'B': B,
+ 'C': A,
+ 'D': B,
+ 'E': A,
+ 'F': B,
+ 'G': A,
+ 'H': B,
+ 'values': np.random.randn(2500)})
+
+ lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])
+ rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A'])
+
+ left = lg.sum()['values']
+ right = rg.sum()['values']
+
+ exp_index, _ = left.index.sortlevel()
+ tm.assert_index_equal(left.index, exp_index)
+
+ exp_index, _ = right.index.sortlevel(0)
+ tm.assert_index_equal(right.index, exp_index)
+
+        tups = list(map(tuple,
+                        df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']].values))
+ tups = com.asarray_tuplesafe(tups)
+
+ expected = df.groupby(tups).sum()['values']
+
+ for k, v in compat.iteritems(expected):
+ assert left[k] == right[k[::-1]]
+ assert left[k] == v
+ assert len(left) == len(right)
+
+ def test_int64_overflow_moar(self):
+
+ # GH9096
+ values = range(55109)
+ data = DataFrame.from_dict(
+ {'a': values, 'b': values, 'c': values, 'd': values})
+ grouped = data.groupby(['a', 'b', 'c', 'd'])
+ assert len(grouped) == len(values)
+
+ arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
+ i = np.random.choice(len(arr), len(arr) * 4)
+        arr = np.vstack((arr, arr[i]))  # add some duplicate rows
+
+ i = np.random.permutation(len(arr))
+ arr = arr[i] # shuffle rows
+
+ df = DataFrame(arr, columns=list('abcde'))
+ df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10
+ gr = df.groupby(list('abcde'))
+
+ # verify this is testing what it is supposed to test!
+ assert is_int64_overflow_possible(gr.grouper.shape)
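+        # (overflow is "possible" when the product of the per-key
+        # cardinalities in gr.grouper.shape exceeds the int64 range, which
+        # forces groupby onto its compressed-key path, the code under test)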
+
+ # manually compute groupings
+ jim, joe = defaultdict(list), defaultdict(list)
+ for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']):
+ jim[key].append(a)
+ joe[key].append(b)
+
+ assert len(gr) == len(jim)
+ mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde'))
+
+ def aggr(func):
+ f = lambda a: np.fromiter(map(func, a), dtype='f8')
+ arr = np.vstack((f(jim.values()), f(joe.values()))).T
+ res = DataFrame(arr, columns=['jim', 'joe'], index=mi)
+ return res.sort_index()
+
+ assert_frame_equal(gr.mean(), aggr(np.mean))
+ assert_frame_equal(gr.median(), aggr(np.median))
+
+ def test_lexsort_indexer(self):
+ keys = [[nan] * 5 + list(range(100)) + [nan] * 5]
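+        # layout: positions 0-4 and 105-109 hold NaN, positions 5-104 hold
+        # 0..99; e.g. ascending with na_position='last' must therefore give
+        # 5..104 followed by the NaN positions 0..4 and 105..109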
+ # orders=True, na_position='last'
+ result = lexsort_indexer(keys, orders=True, na_position='last')
+ exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
+ tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+ # orders=True, na_position='first'
+ result = lexsort_indexer(keys, orders=True, na_position='first')
+ exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
+ tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+ # orders=False, na_position='last'
+ result = lexsort_indexer(keys, orders=False, na_position='last')
+ exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
+ tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+ # orders=False, na_position='first'
+ result = lexsort_indexer(keys, orders=False, na_position='first')
+ exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
+ tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+ def test_nargsort(self):
+ # np.argsort(items) places NaNs last
+ items = [nan] * 5 + list(range(100)) + [nan] * 5
+ # np.argsort(items2) may not place NaNs first
+ items2 = np.array(items, dtype='O')
+
+ # mergesort is the most difficult to get right because we want it to be
+ # stable.
+
+ # According to numpy/core/tests/test_multiarray, """The number of
+ # sorted items must be greater than ~50 to check the actual algorithm
+ # because quick and merge sort fall over to insertion sort for small
+ # arrays."""
+
+ # mergesort, ascending=True, na_position='last'
+ result = nargsort(items, kind='mergesort', ascending=True,
+ na_position='last')
+ exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
+ tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
+
+ # mergesort, ascending=True, na_position='first'
+ result = nargsort(items, kind='mergesort', ascending=True,
+ na_position='first')
+ exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
+ tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
+
+ # mergesort, ascending=False, na_position='last'
+ result = nargsort(items, kind='mergesort', ascending=False,
+ na_position='last')
+ exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
+ tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
+
+ # mergesort, ascending=False, na_position='first'
+ result = nargsort(items, kind='mergesort', ascending=False,
+ na_position='first')
+ exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
+ tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
+
+ # mergesort, ascending=True, na_position='last'
+ result = nargsort(items2, kind='mergesort', ascending=True,
+ na_position='last')
+ exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
+ tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
+
+ # mergesort, ascending=True, na_position='first'
+ result = nargsort(items2, kind='mergesort', ascending=True,
+ na_position='first')
+ exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
+ tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
+
+ # mergesort, ascending=False, na_position='last'
+ result = nargsort(items2, kind='mergesort', ascending=False,
+ na_position='last')
+ exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
+ tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
+
+ # mergesort, ascending=False, na_position='first'
+ result = nargsort(items2, kind='mergesort', ascending=False,
+ na_position='first')
+ exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
+ tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
+
+ def test_nargsort_datetimearray_warning(self):
+ # https://github.com/pandas-dev/pandas/issues/25439
+ # can be removed once the FutureWarning for np.array(DTA) is removed
+ data = to_datetime([0, 2, 0, 1]).tz_localize('Europe/Brussels')
+ with tm.assert_produces_warning(None):
+ nargsort(data)
+
+
+class TestMerge(object):
+
+ @pytest.mark.slow
+ def test_int64_overflow_issues(self):
+
+ # #2690, combinatorial explosion
+ df1 = DataFrame(np.random.randn(1000, 7),
+ columns=list('ABCDEF') + ['G1'])
+ df2 = DataFrame(np.random.randn(1000, 7),
+ columns=list('ABCDEF') + ['G2'])
+
+ # it works!
+ result = merge(df1, df2, how='outer')
+ assert len(result) == 2000
+
+ low, high, n = -1 << 10, 1 << 10, 1 << 20
+ left = DataFrame(np.random.randint(low, high, (n, 7)),
+ columns=list('ABCDEFG'))
+ left['left'] = left.sum(axis=1)
+
+ # one-2-one match
+ i = np.random.permutation(len(left))
+ right = left.iloc[i].copy()
+ right.columns = right.columns[:-1].tolist() + ['right']
+ right.index = np.arange(len(right))
+ right['right'] *= -1
+
+ out = merge(left, right, how='outer')
+ assert len(out) == len(left)
+ assert_series_equal(out['left'], - out['right'], check_names=False)
+ result = out.iloc[:, :-2].sum(axis=1)
+ assert_series_equal(out['left'], result, check_names=False)
+ assert result.name is None
+
+ out.sort_values(out.columns.tolist(), inplace=True)
+ out.index = np.arange(len(out))
+ for how in ['left', 'right', 'outer', 'inner']:
+ assert_frame_equal(out, merge(left, right, how=how, sort=True))
+
+ # check that left merge w/ sort=False maintains left frame order
+ out = merge(left, right, how='left', sort=False)
+ assert_frame_equal(left, out[left.columns.tolist()])
+
+ out = merge(right, left, how='left', sort=False)
+ assert_frame_equal(right, out[right.columns.tolist()])
+
+ # one-2-many/none match
+ n = 1 << 11
+ left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'),
+ columns=list('ABCDEFG'))
+
+ # confirm that this is checking what it is supposed to check
+ shape = left.apply(Series.nunique).values
+ assert is_int64_overflow_possible(shape)
+
+ # add duplicates to left frame
+ left = concat([left, left], ignore_index=True)
+
+ right = DataFrame(np.random.randint(low, high, (n // 2, 7))
+ .astype('int64'),
+ columns=list('ABCDEFG'))
+
+ # add duplicates & overlap with left to the right frame
+ i = np.random.choice(len(left), n)
+ right = concat([right, right, left.iloc[i]], ignore_index=True)
+
+ left['left'] = np.random.randn(len(left))
+ right['right'] = np.random.randn(len(right))
+
+ # shuffle left & right frames
+ i = np.random.permutation(len(left))
+ left = left.iloc[i].copy()
+ left.index = np.arange(len(left))
+
+ i = np.random.permutation(len(right))
+ right = right.iloc[i].copy()
+ right.index = np.arange(len(right))
+
+ # manually compute outer merge
+ ldict, rdict = defaultdict(list), defaultdict(list)
+
+ for idx, row in left.set_index(list('ABCDEFG')).iterrows():
+ ldict[idx].append(row['left'])
+
+ for idx, row in right.set_index(list('ABCDEFG')).iterrows():
+ rdict[idx].append(row['right'])
+
+ vals = []
+ for k, lval in ldict.items():
+ rval = rdict.get(k, [np.nan])
+ for lv, rv in product(lval, rval):
+ vals.append(k + tuple([lv, rv]))
+
+ for k, rval in rdict.items():
+ if k not in ldict:
+ for rv in rval:
+ vals.append(k + tuple([np.nan, rv]))
+
+ def align(df):
+ df = df.sort_values(df.columns.tolist())
+ df.index = np.arange(len(df))
+ return df
+
+ def verify_order(df):
+ kcols = list('ABCDEFG')
+ assert_frame_equal(df[kcols].copy(),
+ df[kcols].sort_values(kcols, kind='mergesort'))
+
+ out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right'])
+ out = align(out)
+
+ jmask = {'left': out['left'].notna(),
+ 'right': out['right'].notna(),
+ 'inner': out['left'].notna() & out['right'].notna(),
+ 'outer': np.ones(len(out), dtype='bool')}
+
+ for how in 'left', 'right', 'outer', 'inner':
+ mask = jmask[how]
+ frame = align(out[mask].copy())
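+            # for the non-outer joins the mask must select a strict,
+            # non-empty subset of the manual outer merge (all() ^ any() is
+            # True only when some, but not all, entries are True)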
+ assert mask.all() ^ mask.any() or how == 'outer'
+
+ for sort in [False, True]:
+ res = merge(left, right, how=how, sort=sort)
+ if sort:
+ verify_order(res)
+
+ # as in GH9092 dtypes break with outer/right join
+ assert_frame_equal(frame, align(res),
+ check_dtype=how not in ('right', 'outer'))
+
+
+def test_decons():
+
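+    # get_group_index encodes the k label arrays into a single int64 key via
+    # mixed-radix arithmetic over `shape` (for shape (4, 5, 6), labels
+    # (i, j, k) map to i*5*6 + j*6 + k); decons_group_index inverts that,
+    # so the round-trip must reproduce the label arrays exactly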
+ def testit(label_list, shape):
+ group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+ label_list2 = decons_group_index(group_index, shape)
+
+ for a, b in zip(label_list, label_list2):
+ tm.assert_numpy_array_equal(a, b)
+
+ shape = (4, 5, 6)
+ label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64),
+ np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64),
+ np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64)]
+ testit(label_list, shape)
+
+ shape = (10000, 10000)
+ label_list = [np.tile(np.arange(10000, dtype=np.int64), 5),
+ np.tile(np.arange(10000, dtype=np.int64), 5)]
+ testit(label_list, shape)
+
+
+class TestSafeSort(object):
+
+ def test_basic_sort(self):
+ values = [3, 1, 2, 0, 4]
+ result = safe_sort(values)
+ expected = np.array([0, 1, 2, 3, 4])
+ tm.assert_numpy_array_equal(result, expected)
+
+ values = list("baaacb")
+ result = safe_sort(values)
+ expected = np.array(list("aaabbc"), dtype='object')
+ tm.assert_numpy_array_equal(result, expected)
+
+ values = []
+ result = safe_sort(values)
+ expected = np.array([])
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_labels(self):
+ values = [3, 1, 2, 0, 4]
+ expected = np.array([0, 1, 2, 3, 4])
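+        # sorting remaps each old label i (which points at values[i]) to the
+        # position of values[i] in the sorted output, e.g. label 0 -> 3
+        # because values[0] == 3 lands at sorted position 3; the na_sentinel
+        # passes through unchanged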
+
+ labels = [0, 1, 1, 2, 3, 0, -1, 4]
+ result, result_labels = safe_sort(values, labels)
+ expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp)
+ tm.assert_numpy_array_equal(result, expected)
+ tm.assert_numpy_array_equal(result_labels, expected_labels)
+
+ # na_sentinel
+ labels = [0, 1, 1, 2, 3, 0, 99, 4]
+ result, result_labels = safe_sort(values, labels,
+ na_sentinel=99)
+ expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp)
+ tm.assert_numpy_array_equal(result, expected)
+ tm.assert_numpy_array_equal(result_labels, expected_labels)
+
+ # out of bound indices
+ labels = [0, 101, 102, 2, 3, 0, 99, 4]
+ result, result_labels = safe_sort(values, labels)
+ expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp)
+ tm.assert_numpy_array_equal(result, expected)
+ tm.assert_numpy_array_equal(result_labels, expected_labels)
+
+ labels = []
+ result, result_labels = safe_sort(values, labels)
+ expected_labels = np.array([], dtype=np.intp)
+ tm.assert_numpy_array_equal(result, expected)
+ tm.assert_numpy_array_equal(result_labels, expected_labels)
+
+ def test_mixed_integer(self):
+ values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object)
+ result = safe_sort(values)
+ expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ values = np.array(['b', 1, 0, 'a'], dtype=object)
+ labels = [0, 1, 2, 3, 0, -1, 1]
+ result, result_labels = safe_sort(values, labels)
+ expected = np.array([0, 1, 'a', 'b'], dtype=object)
+ expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp)
+ tm.assert_numpy_array_equal(result, expected)
+ tm.assert_numpy_array_equal(result_labels, expected_labels)
+
+ def test_mixed_integer_from_list(self):
+ values = ['b', 1, 0, 'a', 0, 'b']
+ result = safe_sort(values)
+ expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_unsortable(self):
+ # GH 13714
+ arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object)
+ if compat.PY2:
+ # RuntimeWarning: tp_compare didn't return -1 or -2 for exception
+ with warnings.catch_warnings():
+ pytest.raises(TypeError, safe_sort, arr)
+ else:
+ pytest.raises(TypeError, safe_sort, arr)
+
+ def test_exceptions(self):
+ with pytest.raises(TypeError,
+ match="Only list-like objects are allowed"):
+ safe_sort(values=1)
+
+ with pytest.raises(TypeError,
+ match="Only list-like objects or None"):
+ safe_sort(values=[0, 1, 2], labels=1)
+
+ with pytest.raises(ValueError,
+ match="values should be unique"):
+ safe_sort(values=[0, 1, 2, 1], labels=[0, 1])
diff --git a/contrib/python/pandas/py2/pandas/tests/test_strings.py b/contrib/python/pandas/py2/pandas/tests/test_strings.py
new file mode 100644
index 00000000000..7cea3be03d1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_strings.py
@@ -0,0 +1,3426 @@
+# -*- coding: utf-8 -*-
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime, timedelta
+import re
+
+import numpy as np
+from numpy import nan as NA
+from numpy.random import randint
+import pytest
+
+import pandas.compat as compat
+from pandas.compat import PY3, range, u
+
+from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna
+import pandas.core.strings as strings
+import pandas.util.testing as tm
+from pandas.util.testing import assert_index_equal, assert_series_equal
+
+
+def assert_series_or_index_equal(left, right):
+ if isinstance(left, Series):
+ assert_series_equal(left, right)
+ else: # Index
+ assert_index_equal(left, right)
+
+
+_any_string_method = [
+ ('cat', (), {'sep': ','}), # noqa: E241
+ ('cat', (Series(list('zyx')),), {'sep': ',', # noqa: E241
+ 'join': 'left'}),
+ ('center', (10,), {}), # noqa: E241
+ ('contains', ('a',), {}), # noqa: E241
+ ('count', ('a',), {}), # noqa: E241
+ ('decode', ('UTF-8',), {}), # noqa: E241
+ ('encode', ('UTF-8',), {}), # noqa: E241
+ ('endswith', ('a',), {}), # noqa: E241
+ ('extract', ('([a-z]*)',), {'expand': False}), # noqa: E241
+ ('extract', ('([a-z]*)',), {'expand': True}), # noqa: E241
+ ('extractall', ('([a-z]*)',), {}), # noqa: E241
+ ('find', ('a',), {}), # noqa: E241
+ ('findall', ('a',), {}), # noqa: E241
+ ('get', (0,), {}), # noqa: E241
+ # because "index" (and "rindex") fail intentionally
+ # if the string is not found, search only for empty string
+ ('index', ('',), {}), # noqa: E241
+ ('join', (',',), {}), # noqa: E241
+ ('ljust', (10,), {}), # noqa: E241
+ ('match', ('a',), {}), # noqa: E241
+ ('normalize', ('NFC',), {}), # noqa: E241
+ ('pad', (10,), {}), # noqa: E241
+ ('partition', (' ',), {'expand': False}), # noqa: E241
+ ('partition', (' ',), {'expand': True}), # noqa: E241
+ ('repeat', (3,), {}), # noqa: E241
+ ('replace', ('a', 'z',), {}), # noqa: E241
+ ('rfind', ('a',), {}), # noqa: E241
+ ('rindex', ('',), {}), # noqa: E241
+ ('rjust', (10,), {}), # noqa: E241
+ ('rpartition', (' ',), {'expand': False}), # noqa: E241
+ ('rpartition', (' ',), {'expand': True}), # noqa: E241
+ ('slice', (0, 1,), {}), # noqa: E241
+ ('slice_replace', (0, 1, 'z',), {}), # noqa: E241
+ ('split', (' ',), {'expand': False}), # noqa: E241
+ ('split', (' ',), {'expand': True}), # noqa: E241
+ ('startswith', ('a',), {}), # noqa: E241
+ # translating unicode points of "a" to "d"
+ ('translate', ({97: 100},), {}), # noqa: E241
+ ('wrap', (2,), {}), # noqa: E241
+ ('zfill', (10,), {}) # noqa: E241
+] + list(zip([
+ # methods without positional arguments: zip with empty tuple and empty dict
+ 'capitalize', 'cat', 'get_dummies',
+ 'isalnum', 'isalpha', 'isdecimal',
+ 'isdigit', 'islower', 'isnumeric',
+ 'isspace', 'istitle', 'isupper',
+ 'len', 'lower', 'lstrip', 'partition',
+ 'rpartition', 'rsplit', 'rstrip',
+ 'slice', 'slice_replace', 'split',
+ 'strip', 'swapcase', 'title', 'upper'
+], [()] * 100, [{}] * 100))
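+# (the [()] * 100 / [{}] * 100 paddings are deliberately longer than the name
+# list; zip truncates to the shortest input, so each name gets () and {})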
+ids, _, _ = zip(*_any_string_method) # use method name as fixture-id
+
+
+# test that the above list captures all methods of StringMethods
+missing_methods = {f for f in dir(strings.StringMethods)
+ if not f.startswith('_')} - set(ids)
+assert not missing_methods
+
+
[email protected](params=_any_string_method, ids=ids)
+def any_string_method(request):
+ """
+ Fixture for all public methods of `StringMethods`
+
+ This fixture returns a tuple of the method name and sample arguments
+ necessary to call the method.
+
+ Returns
+ -------
+ method_name : str
+ The name of the method in `StringMethods`
+ args : tuple
+ Sample values for the positional arguments
+ kwargs : dict
+ Sample values for the keyword arguments
+
+ Examples
+ --------
+ >>> def test_something(any_string_method):
+ ... s = pd.Series(['a', 'b', np.nan, 'd'])
+ ...
+ ... method_name, args, kwargs = any_string_method
+ ... method = getattr(s.str, method_name)
+ ... # will not raise
+ ... method(*args, **kwargs)
+ """
+ return request.param
+
+
+# subset of the full set from pandas/conftest.py
+_any_allowed_skipna_inferred_dtype = [
+ ('string', ['a', np.nan, 'c']),
+ ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]),
+ ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']),
+ ('empty', [np.nan, np.nan, np.nan]),
+ ('empty', []),
+ ('mixed-integer', ['a', np.nan, 2])
+]
+ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id
+
+
[email protected](params=_any_allowed_skipna_inferred_dtype, ids=ids)
+def any_allowed_skipna_inferred_dtype(request):
+ """
+ Fixture for all (inferred) dtypes allowed in StringMethods.__init__
+
+ The covered (inferred) types are:
+ * 'string'
+ * 'unicode' (if PY2)
+ * 'empty'
+ * 'bytes' (if PY3)
+ * 'mixed-integer'
+
+ Returns
+ -------
+ inferred_dtype : str
+ The string for the inferred dtype from _libs.lib.infer_dtype
+ values : np.ndarray
+ An array of object dtype that will be inferred to have
+ `inferred_dtype`
+
+ Examples
+ --------
+ >>> import pandas._libs.lib as lib
+ >>>
+ >>> def test_something(any_allowed_skipna_inferred_dtype):
+ ... inferred_dtype, values = any_allowed_skipna_inferred_dtype
+ ... # will pass
+ ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype
+ """
+ inferred_dtype, values = request.param
+ values = np.array(values, dtype=object) # object dtype to avoid casting
+
+ # correctness of inference tested in tests/dtypes/test_inference.py
+ return inferred_dtype, values
+
+
+class TestStringMethods(object):
+
+ def test_api(self):
+
+ # GH 6106, GH 9322
+ assert Series.str is strings.StringMethods
+ assert isinstance(Series(['']).str, strings.StringMethods)
+
+ @pytest.mark.parametrize('dtype', [object, 'category'])
+ @pytest.mark.parametrize('box', [Series, Index])
+ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype):
+ # one instance of parametrized fixture
+ inferred_dtype, values = any_skipna_inferred_dtype
+
+ t = box(values, dtype=dtype) # explicit dtype to avoid casting
+
+ # TODO: get rid of these xfails
+ if dtype == 'category' and inferred_dtype in ['period', 'interval']:
+ pytest.xfail(reason='Conversion to numpy array fails because '
+ 'the ._values-attribute is not a numpy array for '
+ 'PeriodArray/IntervalArray; see GH 23553')
+ if box == Index and inferred_dtype in ['empty', 'bytes']:
+ pytest.xfail(reason='Raising too restrictively; '
+ 'solved by GH 23167')
+ if (box == Index and dtype == object
+ and inferred_dtype in ['boolean', 'date', 'time']):
+ pytest.xfail(reason='Inferring incorrectly because of NaNs; '
+ 'solved by GH 23167')
+ if (box == Series
+ and (dtype == object and inferred_dtype not in [
+ 'string', 'unicode', 'empty',
+ 'bytes', 'mixed', 'mixed-integer'])
+ or (dtype == 'category'
+ and inferred_dtype in ['decimal', 'boolean', 'time'])):
+ pytest.xfail(reason='Not raising correctly; solved by GH 23167')
+
+ types_passing_constructor = ['string', 'unicode', 'empty',
+ 'bytes', 'mixed', 'mixed-integer']
+ if inferred_dtype in types_passing_constructor:
+ # GH 6106
+ assert isinstance(t.str, strings.StringMethods)
+ else:
+ # GH 9184, GH 23011, GH 23163
+ with pytest.raises(AttributeError, match='Can only use .str '
+ 'accessor with string values.*'):
+ t.str
+ assert not hasattr(t, 'str')
+
+ @pytest.mark.parametrize('dtype', [object, 'category'])
+ @pytest.mark.parametrize('box', [Series, Index])
+ def test_api_per_method(self, box, dtype,
+ any_allowed_skipna_inferred_dtype,
+ any_string_method):
+ # this test does not check correctness of the different methods,
+ # just that the methods work on the specified (inferred) dtypes,
+ # and raise on all others
+
+ # one instance of each parametrized fixture
+ inferred_dtype, values = any_allowed_skipna_inferred_dtype
+ method_name, args, kwargs = any_string_method
+
+ # TODO: get rid of these xfails
+ if (method_name not in ['encode', 'decode', 'len']
+ and inferred_dtype == 'bytes'):
+            pytest.xfail(reason='Not raising for "bytes", see GH 23011; '
+ 'Also: malformed method names, see GH 23551; '
+ 'solved by GH 23167')
+ if (method_name == 'cat'
+ and inferred_dtype in ['mixed', 'mixed-integer']):
+ pytest.xfail(reason='Bad error message; should raise better; '
+ 'solved by GH 23167')
+ if box == Index and inferred_dtype in ['empty', 'bytes']:
+ pytest.xfail(reason='Raising too restrictively; '
+ 'solved by GH 23167')
+ if (box == Index and dtype == object
+ and inferred_dtype in ['boolean', 'date', 'time']):
+ pytest.xfail(reason='Inferring incorrectly because of NaNs; '
+ 'solved by GH 23167')
+
+ t = box(values, dtype=dtype) # explicit dtype to avoid casting
+ method = getattr(t.str, method_name)
+
+ bytes_allowed = method_name in ['encode', 'decode', 'len']
+ # as of v0.23.4, all methods except 'cat' are very lenient with the
+ # allowed data types, just returning NaN for entries that error.
+ # This could be changed with an 'errors'-kwarg to the `str`-accessor,
+ # see discussion in GH 13877
+ mixed_allowed = method_name not in ['cat']
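+        # e.g. Series(['a', 1]).str.upper() returns ['A', NaN] in this
+        # version rather than raising; 'cat' is the one method that rejects
+        # mixed content up front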
+
+ allowed_types = (['string', 'unicode', 'empty']
+ + ['bytes'] * bytes_allowed
+ + ['mixed', 'mixed-integer'] * mixed_allowed)
+
+ if inferred_dtype in allowed_types:
+ # xref GH 23555, GH 23556
+ method(*args, **kwargs) # works!
+ else:
+ # GH 23011, GH 23163
+ msg = ('Cannot use .str.{name} with values of inferred dtype '
+ '{inferred_dtype!r}.'.format(name=method_name,
+ inferred_dtype=inferred_dtype))
+ with pytest.raises(TypeError, match=msg):
+ method(*args, **kwargs)
+
+ def test_api_for_categorical(self, any_string_method):
+ # https://github.com/pandas-dev/pandas/issues/10661
+ s = Series(list('aabb'))
+ s = s + " " + s
+ c = s.astype('category')
+ assert isinstance(c.str, strings.StringMethods)
+
+ method_name, args, kwargs = any_string_method
+
+ result = getattr(c.str, method_name)(*args, **kwargs)
+ expected = getattr(s.str, method_name)(*args, **kwargs)
+
+ if isinstance(result, DataFrame):
+ tm.assert_frame_equal(result, expected)
+ elif isinstance(result, Series):
+ tm.assert_series_equal(result, expected)
+ else:
+ # str.cat(others=None) returns string, for example
+ assert result == expected
+
+ def test_iter(self):
+ # GH3638
+ strs = 'google', 'wikimedia', 'wikipedia', 'wikitravel'
+ ds = Series(strs)
+
+ for s in ds.str:
+ # iter must yield a Series
+ assert isinstance(s, Series)
+
+ # indices of each yielded Series should be equal to the index of
+ # the original Series
+ tm.assert_index_equal(s.index, ds.index)
+
+ for el in s:
+ # each element of the series is either a basestring/str or nan
+ assert isinstance(el, compat.string_types) or isna(el)
+
+ # desired behavior is to iterate until everything would be nan on the
+ # next iter so make sure the last element of the iterator was 'l' in
+ # this case since 'wikitravel' is the longest string
+ assert s.dropna().values.item() == 'l'
+
+ def test_iter_empty(self):
+ ds = Series([], dtype=object)
+
+ i, s = 100, 1
+
+ for i, s in enumerate(ds.str):
+ pass
+
+        # nothing to iterate over, so the loop body never runs and the
+        # values assigned above should remain unchanged
+ assert i == 100
+ assert s == 1
+
+ def test_iter_single_element(self):
+ ds = Series(['a'])
+
+ for i, s in enumerate(ds.str):
+ pass
+
+ assert not i
+ assert_series_equal(ds, s)
+
+ def test_iter_object_try_string(self):
+        ds = Series([slice(None, randint(10), randint(10, 20))
+                     for _ in range(4)])
+
+ i, s = 100, 'h'
+
+ for i, s in enumerate(ds.str):
+ pass
+
+ assert i == 100
+ assert s == 'h'
+
+ @pytest.mark.parametrize('box', [Series, Index])
+ @pytest.mark.parametrize('other', [None, Series, Index])
+ def test_str_cat_name(self, box, other):
+ # GH 21053
+ values = ['a', 'b']
+ if other:
+ other = other(values)
+ else:
+ other = values
+ result = box(values, name='name').str.cat(other, sep=',', join='left')
+ assert result.name == 'name'
+
+ @pytest.mark.parametrize('box', [Series, Index])
+ def test_str_cat(self, box):
+ # test_cat above tests "str_cat" from ndarray;
+        # here testing "str.cat" from Series/Index to ndarray/list
+ s = box(['a', 'a', 'b', 'b', 'c', np.nan])
+
+ # single array
+ result = s.str.cat()
+ expected = 'aabbc'
+ assert result == expected
+
+ result = s.str.cat(na_rep='-')
+ expected = 'aabbc-'
+ assert result == expected
+
+ result = s.str.cat(sep='_', na_rep='NA')
+ expected = 'a_a_b_b_c_NA'
+ assert result == expected
+
+ t = np.array(['a', np.nan, 'b', 'd', 'foo', np.nan], dtype=object)
+ expected = box(['aa', 'a-', 'bb', 'bd', 'cfoo', '--'])
+
+ # Series/Index with array
+ result = s.str.cat(t, na_rep='-')
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with list
+ result = s.str.cat(list(t), na_rep='-')
+ assert_series_or_index_equal(result, expected)
+
+ # errors for incorrect lengths
+ rgx = 'All arrays must be same length, except those having an index.*'
+ z = Series(['1', '2', '3'])
+
+ with pytest.raises(ValueError, match=rgx):
+ s.str.cat(z)
+
+ with pytest.raises(ValueError, match=rgx):
+ s.str.cat(z.values)
+
+ with pytest.raises(ValueError, match=rgx):
+ s.str.cat(list(z))
+
+ @pytest.mark.parametrize('box', [Series, Index])
+ def test_str_cat_raises_intuitive_error(self, box):
+ # GH 11334
+ s = box(['a', 'b', 'c', 'd'])
+ message = "Did you mean to supply a `sep` keyword?"
+ with pytest.raises(ValueError, match=message):
+ s.str.cat('|')
+ with pytest.raises(ValueError, match=message):
+ s.str.cat(' ')
+
+ @pytest.mark.parametrize('sep', ['', None])
+ @pytest.mark.parametrize('dtype_target', ['object', 'category'])
+ @pytest.mark.parametrize('dtype_caller', ['object', 'category'])
+ @pytest.mark.parametrize('box', [Series, Index])
+ def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep):
+ s = Index(['a', 'a', 'b', 'a'], dtype=dtype_caller)
+ s = s if box == Index else Series(s, index=s)
+ t = Index(['b', 'a', 'b', 'c'], dtype=dtype_target)
+
+ expected = Index(['ab', 'aa', 'bb', 'ac'])
+ expected = expected if box == Index else Series(expected, index=s)
+
+ # Series/Index with unaligned Index
+ with tm.assert_produces_warning(expected_warning=FutureWarning):
+ # FutureWarning to switch to alignment by default
+ result = s.str.cat(t, sep=sep)
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with Series having matching Index
+ t = Series(t, index=s)
+ result = s.str.cat(t, sep=sep)
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with Series.values
+ result = s.str.cat(t.values, sep=sep)
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with Series having different Index
+ t = Series(t.values, index=t)
+ with tm.assert_produces_warning(expected_warning=FutureWarning):
+ # FutureWarning to switch to alignment by default
+ result = s.str.cat(t, sep=sep)
+ assert_series_or_index_equal(result, expected)
+
+ @pytest.mark.parametrize('box', [Series, Index])
+ def test_str_cat_mixed_inputs(self, box):
+ s = Index(['a', 'b', 'c', 'd'])
+ s = s if box == Index else Series(s, index=s)
+
+ t = Series(['A', 'B', 'C', 'D'], index=s.values)
+ d = concat([t, Series(s, index=s)], axis=1)
+
+ expected = Index(['aAa', 'bBb', 'cCc', 'dDd'])
+ expected = expected if box == Index else Series(expected.values,
+ index=s.values)
+
+ # Series/Index with DataFrame
+ result = s.str.cat(d)
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with two-dimensional ndarray
+ result = s.str.cat(d.values)
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with list of Series
+ result = s.str.cat([t, s])
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with mixed list of Series/array
+ result = s.str.cat([t, s.values])
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with list of list-likes
+ with tm.assert_produces_warning(expected_warning=FutureWarning):
+ # nested list-likes will be deprecated
+ result = s.str.cat([t.values, list(s)])
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with list of Series; different indexes
+ t.index = ['b', 'c', 'd', 'a']
+ with tm.assert_produces_warning(expected_warning=FutureWarning):
+ # FutureWarning to switch to alignment by default
+ result = s.str.cat([t, s])
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with mixed list; different indexes
+ with tm.assert_produces_warning(expected_warning=FutureWarning):
+ # FutureWarning to switch to alignment by default
+ result = s.str.cat([t, s.values])
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with DataFrame; different indexes
+ d.index = ['b', 'c', 'd', 'a']
+ with tm.assert_produces_warning(expected_warning=FutureWarning):
+ # FutureWarning to switch to alignment by default
+ result = s.str.cat(d)
+ assert_series_or_index_equal(result, expected)
+
+ # Series/Index with iterator of list-likes
+ with tm.assert_produces_warning(expected_warning=FutureWarning):
+ # nested list-likes will be deprecated
+ result = s.str.cat(iter([t.values, list(s)]))
+ assert_series_or_index_equal(result, expected)
+
+ # errors for incorrect lengths
+ rgx = 'All arrays must be same length, except those having an index.*'
+ z = Series(['1', '2', '3'])
+ e = concat([z, z], axis=1)
+
+ # DataFrame
+ with pytest.raises(ValueError, match=rgx):
+ s.str.cat(e)
+
+ # two-dimensional ndarray
+ with pytest.raises(ValueError, match=rgx):
+ s.str.cat(e.values)
+
+ # list of Series
+ with pytest.raises(ValueError, match=rgx):
+ s.str.cat([z, s])
+
+ # list of list-likes
+ with pytest.raises(ValueError, match=rgx):
+ s.str.cat([z.values, s.values])
+
+ # mixed list of Series/list-like
+ with pytest.raises(ValueError, match=rgx):
+ s.str.cat([z.values, s])
+
+ # errors for incorrect arguments in list-like
+ rgx = 'others must be Series, Index, DataFrame,.*'
+ # make sure None/NaN do not crash checks in _get_series_list
+ u = Series(['a', np.nan, 'c', None])
+
+ # mix of string and Series
+ with pytest.raises(TypeError, match=rgx):
+ s.str.cat([u, 'u'])
+
+ # DataFrame in list
+ with pytest.raises(TypeError, match=rgx):
+ s.str.cat([u, d])
+
+ # 2-dim ndarray in list
+ with pytest.raises(TypeError, match=rgx):
+ s.str.cat([u, d.values])
+
+ # nested lists
+ with pytest.raises(TypeError, match=rgx):
+ s.str.cat([u, [u, d]])
+
+ # forbidden input type: set
+ # GH 23009
+ with pytest.raises(TypeError, match=rgx):
+ s.str.cat(set(u))
+
+ # forbidden input type: set in list
+ # GH 23009
+ with pytest.raises(TypeError, match=rgx):
+ s.str.cat([u, set(u)])
+
+ # other forbidden input type, e.g. int
+ with pytest.raises(TypeError, match=rgx):
+ s.str.cat(1)
+
+ @pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right'])
+ @pytest.mark.parametrize('box', [Series, Index])
+ def test_str_cat_align_indexed(self, box, join):
+ # https://github.com/pandas-dev/pandas/issues/18657
+ s = Series(['a', 'b', 'c', 'd'], index=['a', 'b', 'c', 'd'])
+ t = Series(['D', 'A', 'E', 'B'], index=['d', 'a', 'e', 'b'])
+ sa, ta = s.align(t, join=join)
+ # result after manual alignment of inputs
+ expected = sa.str.cat(ta, na_rep='-')
+
+ if box == Index:
+ s = Index(s)
+ sa = Index(sa)
+ expected = Index(expected)
+
+ result = s.str.cat(t, join=join, na_rep='-')
+ assert_series_or_index_equal(result, expected)
+
+ @pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right'])
+ def test_str_cat_align_mixed_inputs(self, join):
+ s = Series(['a', 'b', 'c', 'd'])
+ t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1])
+ d = concat([t, t], axis=1)
+
+ expected_outer = Series(['aaa', 'bbb', 'c--', 'ddd', '-ee'])
+ expected = expected_outer.loc[s.index.join(t.index, how=join)]
+
+ # list of Series
+ result = s.str.cat([t, t], join=join, na_rep='-')
+ tm.assert_series_equal(result, expected)
+
+ # DataFrame
+ result = s.str.cat(d, join=join, na_rep='-')
+ tm.assert_series_equal(result, expected)
+
+ # mixed list of indexed/unindexed
+ u = np.array(['A', 'B', 'C', 'D'])
+ expected_outer = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-'])
+        # joint index of rhs [t, u]; u will be forced to have the index of s
+ rhs_idx = t.index & s.index if join == 'inner' else t.index | s.index
+
+ expected = expected_outer.loc[s.index.join(rhs_idx, how=join)]
+ result = s.str.cat([t, u], join=join, na_rep='-')
+ tm.assert_series_equal(result, expected)
+
+ with tm.assert_produces_warning(expected_warning=FutureWarning):
+ # nested list-likes will be deprecated
+ result = s.str.cat([t, list(u)], join=join, na_rep='-')
+ tm.assert_series_equal(result, expected)
+
+ # errors for incorrect lengths
+ rgx = r'If `others` contains arrays or lists \(or other list-likes.*'
+ z = Series(['1', '2', '3']).values
+
+ # unindexed object of wrong length
+ with pytest.raises(ValueError, match=rgx):
+ s.str.cat(z, join=join)
+
+ # unindexed object of wrong length in list
+ with pytest.raises(ValueError, match=rgx):
+ s.str.cat([t, z], join=join)
+
+ @pytest.mark.parametrize('box', [Series, Index])
+ @pytest.mark.parametrize('other', [Series, Index])
+ def test_str_cat_all_na(self, box, other):
+ # GH 24044
+
+ # check that all NaNs in caller / target work
+ s = Index(['a', 'b', 'c', 'd'])
+ s = s if box == Index else Series(s, index=s)
+ t = other([np.nan] * 4, dtype=object)
+ # add index of s for alignment
+ t = t if other == Index else Series(t, index=s)
+
+ # all-NA target
+ if box == Series:
+ expected = Series([np.nan] * 4, index=s.index, dtype=object)
+ else: # box == Index
+ expected = Index([np.nan] * 4, dtype=object)
+ result = s.str.cat(t, join='left')
+ assert_series_or_index_equal(result, expected)
+
+ # all-NA caller (only for Series)
+ if other == Series:
+ expected = Series([np.nan] * 4, dtype=object, index=t.index)
+ result = t.str.cat(s, join='left')
+ tm.assert_series_equal(result, expected)
+
+ def test_str_cat_special_cases(self):
+ s = Series(['a', 'b', 'c', 'd'])
+ t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1])
+
+ # iterator of elements with different types
+ expected = Series(['aaa', 'bbb', 'c-c', 'ddd', '-e-'])
+ result = s.str.cat(iter([t, s.values]), join='outer', na_rep='-')
+ tm.assert_series_equal(result, expected)
+
+ # right-align with different indexes in others
+ expected = Series(['aa-', 'd-d'], index=[0, 3])
+ result = s.str.cat([t.loc[[0]], t.loc[[3]]], join='right', na_rep='-')
+ tm.assert_series_equal(result, expected)
+
+ def test_cat_on_filtered_index(self):
+ df = DataFrame(index=MultiIndex.from_product(
+ [[2011, 2012], [1, 2, 3]], names=['year', 'month']))
+
+ df = df.reset_index()
+ df = df[df.month > 1]
+
+ str_year = df.year.astype('str')
+ str_month = df.month.astype('str')
+ str_both = str_year.str.cat(str_month, sep=' ')
+
+ assert str_both.loc[1] == '2011 2'
+
+ str_multiple = str_year.str.cat([str_month, str_month], sep=' ')
+
+ assert str_multiple.loc[1] == '2011 2 2'
+
+ def test_count(self):
+ values = np.array(['foo', 'foofoo', NA, 'foooofooofommmfoo'],
+ dtype=np.object_)
+
+ result = strings.str_count(values, 'f[o]+')
+ exp = np.array([1, 2, NA, 4])
+ tm.assert_numpy_array_equal(result, exp)
+
+ result = Series(values).str.count('f[o]+')
+ exp = Series([1, 2, NA, 4])
+ assert isinstance(result, Series)
+ tm.assert_series_equal(result, exp)
+
+ # mixed
+ mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
+ rs = strings.str_count(mixed, 'a')
+ xp = np.array([1, NA, 0, NA, NA, 0, NA, NA, NA])
+ tm.assert_numpy_array_equal(rs, xp)
+
+ rs = Series(mixed).str.count('a')
+ xp = Series([1, NA, 0, NA, NA, 0, NA, NA, NA])
+ assert isinstance(rs, Series)
+ tm.assert_series_equal(rs, xp)
+
+ # unicode
+ values = [u('foo'), u('foofoo'), NA, u('foooofooofommmfoo')]
+
+ result = strings.str_count(values, 'f[o]+')
+ exp = np.array([1, 2, NA, 4])
+ tm.assert_numpy_array_equal(result, exp)
+
+ result = Series(values).str.count('f[o]+')
+ exp = Series([1, 2, NA, 4])
+ assert isinstance(result, Series)
+ tm.assert_series_equal(result, exp)
+
+ def test_contains(self):
+ values = np.array(['foo', NA, 'fooommm__foo',
+ 'mmm_', 'foommm[_]+bar'], dtype=np.object_)
+ pat = 'mmm[_]+'
+
+ result = strings.str_contains(values, pat)
+ expected = np.array([False, NA, True, True, False], dtype=np.object_)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = strings.str_contains(values, pat, regex=False)
+ expected = np.array([False, NA, False, False, True], dtype=np.object_)
+ tm.assert_numpy_array_equal(result, expected)
+
+ values = ['foo', 'xyz', 'fooommm__foo', 'mmm_']
+ result = strings.str_contains(values, pat)
+ expected = np.array([False, False, True, True])
+ assert result.dtype == np.bool_
+ tm.assert_numpy_array_equal(result, expected)
+
+ # case insensitive using regex
+ values = ['Foo', 'xYz', 'fOOomMm__fOo', 'MMM_']
+ result = strings.str_contains(values, 'FOO|mmm', case=False)
+ expected = np.array([True, False, True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ # case insensitive without regex
+ result = strings.str_contains(values, 'foo', regex=False, case=False)
+ expected = np.array([True, False, True, False])
+ tm.assert_numpy_array_equal(result, expected)
+
+ # mixed
+ mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
+ rs = strings.str_contains(mixed, 'o')
+ xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA],
+ dtype=np.object_)
+ tm.assert_numpy_array_equal(rs, xp)
+
+ rs = Series(mixed).str.contains('o')
+ xp = Series([False, NA, False, NA, NA, True, NA, NA, NA])
+ assert isinstance(rs, Series)
+ tm.assert_series_equal(rs, xp)
+
+ # unicode
+ values = np.array([u'foo', NA, u'fooommm__foo', u'mmm_'],
+ dtype=np.object_)
+ pat = 'mmm[_]+'
+
+ result = strings.str_contains(values, pat)
+ expected = np.array([False, np.nan, True, True], dtype=np.object_)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = strings.str_contains(values, pat, na=False)
+ expected = np.array([False, False, True, True])
+ tm.assert_numpy_array_equal(result, expected)
+
+ values = np.array(['foo', 'xyz', 'fooommm__foo', 'mmm_'],
+ dtype=np.object_)
+ result = strings.str_contains(values, pat)
+ expected = np.array([False, False, True, True])
+ assert result.dtype == np.bool_
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_contains_for_object_category(self):
+ # gh 22158
+
+ # na for category
+ values = Series(["a", "b", "c", "a", np.nan], dtype="category")
+ result = values.str.contains('a', na=True)
+ expected = Series([True, False, False, True, True])
+ tm.assert_series_equal(result, expected)
+
+ result = values.str.contains('a', na=False)
+ expected = Series([True, False, False, True, False])
+ tm.assert_series_equal(result, expected)
+
+ # na for objects
+ values = Series(["a", "b", "c", "a", np.nan])
+ result = values.str.contains('a', na=True)
+ expected = Series([True, False, False, True, True])
+ tm.assert_series_equal(result, expected)
+
+ result = values.str.contains('a', na=False)
+ expected = Series([True, False, False, True, False])
+ tm.assert_series_equal(result, expected)
+
+ def test_startswith(self):
+ values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])
+
+ result = values.str.startswith('foo')
+ exp = Series([False, NA, True, False, False, NA, True])
+ tm.assert_series_equal(result, exp)
+
+ # mixed
+ mixed = np.array(['a', NA, 'b', True, datetime.today(),
+ 'foo', None, 1, 2.], dtype=np.object_)
+ rs = strings.str_startswith(mixed, 'f')
+ xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA],
+ dtype=np.object_)
+ tm.assert_numpy_array_equal(rs, xp)
+
+ rs = Series(mixed).str.startswith('f')
+ assert isinstance(rs, Series)
+ xp = Series([False, NA, False, NA, NA, True, NA, NA, NA])
+ tm.assert_series_equal(rs, xp)
+
+ # unicode
+ values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA,
+ u('foo')])
+
+ result = values.str.startswith('foo')
+ exp = Series([False, NA, True, False, False, NA, True])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.startswith('foo', na=True)
+ tm.assert_series_equal(result, exp.fillna(True).astype(bool))
+
+ def test_endswith(self):
+ values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])
+
+ result = values.str.endswith('foo')
+ exp = Series([False, NA, False, False, True, NA, True])
+ tm.assert_series_equal(result, exp)
+
+ # mixed
+ mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
+ rs = strings.str_endswith(mixed, 'f')
+ xp = np.array([False, NA, False, NA, NA, False, NA, NA, NA],
+ dtype=np.object_)
+ tm.assert_numpy_array_equal(rs, xp)
+
+ rs = Series(mixed).str.endswith('f')
+ xp = Series([False, NA, False, NA, NA, False, NA, NA, NA])
+ assert isinstance(rs, Series)
+ tm.assert_series_equal(rs, xp)
+
+ # unicode
+ values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA,
+ u('foo')])
+
+ result = values.str.endswith('foo')
+ exp = Series([False, NA, False, False, True, NA, True])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.endswith('foo', na=False)
+ tm.assert_series_equal(result, exp.fillna(False).astype(bool))
+
+ def test_title(self):
+ values = Series(["FOO", "BAR", NA, "Blah", "blurg"])
+
+ result = values.str.title()
+ exp = Series(["Foo", "Bar", NA, "Blah", "Blurg"])
+ tm.assert_series_equal(result, exp)
+
+ # mixed
+ mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None,
+ 1, 2.])
+ mixed = mixed.str.title()
+ exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA])
+ tm.assert_almost_equal(mixed, exp)
+
+ # unicode
+ values = Series([u("FOO"), NA, u("bar"), u("Blurg")])
+
+ results = values.str.title()
+ exp = Series([u("Foo"), NA, u("Bar"), u("Blurg")])
+
+ tm.assert_series_equal(results, exp)
+
+ def test_lower_upper(self):
+ values = Series(['om', NA, 'nom', 'nom'])
+
+ result = values.str.upper()
+ exp = Series(['OM', NA, 'NOM', 'NOM'])
+ tm.assert_series_equal(result, exp)
+
+ result = result.str.lower()
+ tm.assert_series_equal(result, values)
+
+ # mixed
+ mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1,
+ 2.])
+ mixed = mixed.str.upper()
+ rs = Series(mixed).str.lower()
+ xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
+ assert isinstance(rs, Series)
+ tm.assert_series_equal(rs, xp)
+
+ # unicode
+ values = Series([u('om'), NA, u('nom'), u('nom')])
+
+ result = values.str.upper()
+ exp = Series([u('OM'), NA, u('NOM'), u('NOM')])
+ tm.assert_series_equal(result, exp)
+
+ result = result.str.lower()
+ tm.assert_series_equal(result, values)
+
+ def test_capitalize(self):
+ values = Series(["FOO", "BAR", NA, "Blah", "blurg"])
+ result = values.str.capitalize()
+ exp = Series(["Foo", "Bar", NA, "Blah", "Blurg"])
+ tm.assert_series_equal(result, exp)
+
+ # mixed
+ mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None,
+ 1, 2.])
+ mixed = mixed.str.capitalize()
+ exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA])
+ tm.assert_almost_equal(mixed, exp)
+
+ # unicode
+ values = Series([u("FOO"), NA, u("bar"), u("Blurg")])
+ results = values.str.capitalize()
+ exp = Series([u("Foo"), NA, u("Bar"), u("Blurg")])
+ tm.assert_series_equal(results, exp)
+
+ def test_swapcase(self):
+ values = Series(["FOO", "BAR", NA, "Blah", "blurg"])
+ result = values.str.swapcase()
+ exp = Series(["foo", "bar", NA, "bLAH", "BLURG"])
+ tm.assert_series_equal(result, exp)
+
+ # mixed
+ mixed = Series(["FOO", NA, "bar", True, datetime.today(), "Blah", None,
+ 1, 2.])
+ mixed = mixed.str.swapcase()
+ exp = Series(["foo", NA, "BAR", NA, NA, "bLAH", NA, NA, NA])
+ tm.assert_almost_equal(mixed, exp)
+
+ # unicode
+ values = Series([u("FOO"), NA, u("bar"), u("Blurg")])
+ results = values.str.swapcase()
+ exp = Series([u("foo"), NA, u("BAR"), u("bLURG")])
+ tm.assert_series_equal(results, exp)
+
+ def test_casemethods(self):
+ values = ['aaa', 'bbb', 'CCC', 'Dddd', 'eEEE']
+ s = Series(values)
+ assert s.str.lower().tolist() == [v.lower() for v in values]
+ assert s.str.upper().tolist() == [v.upper() for v in values]
+ assert s.str.title().tolist() == [v.title() for v in values]
+ assert s.str.capitalize().tolist() == [v.capitalize() for v in values]
+ assert s.str.swapcase().tolist() == [v.swapcase() for v in values]
+
+ def test_replace(self):
+ values = Series(['fooBAD__barBAD', NA])
+
+ result = values.str.replace('BAD[_]*', '')
+ exp = Series(['foobar', NA])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.replace('BAD[_]*', '', n=1)
+ exp = Series(['foobarBAD', NA])
+ tm.assert_series_equal(result, exp)
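+ # n limits the number of replacements per string, like the count
+ # argument of re.sub: n=1 replaces only the first occurrence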
+
+ # mixed
+ mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
+ None, 1, 2.])
+
+ rs = Series(mixed).str.replace('BAD[_]*', '')
+ xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ # unicode
+ values = Series([u('fooBAD__barBAD'), NA])
+
+ result = values.str.replace('BAD[_]*', '')
+ exp = Series([u('foobar'), NA])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.replace('BAD[_]*', '', n=1)
+ exp = Series([u('foobarBAD'), NA])
+ tm.assert_series_equal(result, exp)
+
+ # flags + unicode
+ values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
+ exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
+ result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE)
+ tm.assert_series_equal(result, exp)
+
+ # GH 13438
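+ # repl must be a string or a callable; any other type should raise
+ # TypeError for both Series and Index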
+ for klass in (Series, Index):
+ for repl in (None, 3, {'a': 'b'}):
+ for data in (['a', 'b', None], ['a', 'b', 'c', 'ad']):
+ values = klass(data)
+ pytest.raises(TypeError, values.str.replace, 'a', repl)
+
+ def test_replace_callable(self):
+ # GH 15055
+ values = Series(['fooBAD__barBAD', NA])
+
+ # test with callable
+ repl = lambda m: m.group(0).swapcase()
+ result = values.str.replace('[a-z][A-Z]{2}', repl, n=2)
+ exp = Series(['foObaD__baRbaD', NA])
+ tm.assert_series_equal(result, exp)
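+ # the callable receives each match object and its return value is
+ # used as the replacement, mirroring re.sub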
+
+ # test with wrong number of arguments, raising an error
+ if compat.PY2:
+ p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?'
+ else:
+ p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ '
+ r'(?(3)required )positional arguments?')
+
+ repl = lambda: None
+ with pytest.raises(TypeError, match=p_err):
+ values.str.replace('a', repl)
+
+ repl = lambda m, x: None
+ with pytest.raises(TypeError, match=p_err):
+ values.str.replace('a', repl)
+
+ repl = lambda m, x, y=None: None
+ with pytest.raises(TypeError, match=p_err):
+ values.str.replace('a', repl)
+
+ # test regex named groups
+ values = Series(['Foo Bar Baz', NA])
+ pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
+ repl = lambda m: m.group('middle').swapcase()
+ result = values.str.replace(pat, repl)
+ exp = Series(['bAR', NA])
+ tm.assert_series_equal(result, exp)
+
+ def test_replace_compiled_regex(self):
+ # GH 15446
+ values = Series(['fooBAD__barBAD', NA])
+
+ # test with compiled regex
+ pat = re.compile(r'BAD[_]*')
+ result = values.str.replace(pat, '')
+ exp = Series(['foobar', NA])
+ tm.assert_series_equal(result, exp)
+
+ # mixed
+ mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
+ None, 1, 2.])
+
+ rs = Series(mixed).str.replace(pat, '')
+ xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ # unicode
+ values = Series([u('fooBAD__barBAD'), NA])
+
+ result = values.str.replace(pat, '')
+ exp = Series([u('foobar'), NA])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.replace(pat, '', n=1)
+ exp = Series([u('foobarBAD'), NA])
+ tm.assert_series_equal(result, exp)
+
+ # flags + unicode
+ values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
+ exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
+ pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
+ result = values.str.replace(pat, ", ")
+ tm.assert_series_equal(result, exp)
+
+ # case and flags cannot be combined with a compiled regex;
+ # both should raise ValueError
+ values = Series(['fooBAD__barBAD__bad', NA])
+ pat = re.compile(r'BAD[_]*')
+
+ with pytest.raises(ValueError,
+ match="case and flags cannot be"):
+ result = values.str.replace(pat, '', flags=re.IGNORECASE)
+
+ with pytest.raises(ValueError,
+ match="case and flags cannot be"):
+ result = values.str.replace(pat, '', case=False)
+
+ with pytest.raises(ValueError,
+ match="case and flags cannot be"):
+ result = values.str.replace(pat, '', case=True)
+
+ # test with callable
+ values = Series(['fooBAD__barBAD', NA])
+ repl = lambda m: m.group(0).swapcase()
+ pat = re.compile('[a-z][A-Z]{2}')
+ result = values.str.replace(pat, repl, n=2)
+ exp = Series(['foObaD__baRbaD', NA])
+ tm.assert_series_equal(result, exp)
+
+ def test_replace_literal(self):
+ # GH16808 literal replace (regex=False vs regex=True)
+ values = Series(['f.o', 'foo', NA])
+ exp = Series(['bao', 'bao', NA])
+ result = values.str.replace('f.', 'ba')
+ tm.assert_series_equal(result, exp)
+
+ exp = Series(['bao', 'foo', NA])
+ result = values.str.replace('f.', 'ba', regex=False)
+ tm.assert_series_equal(result, exp)
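+ # with regex=False the pattern is a literal string, so 'f.' only
+ # matches the two characters "f" and "."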
+
+ # Cannot do a literal replace if given a callable repl or compiled
+ # pattern
+ callable_repl = lambda m: m.group(0).swapcase()
+ compiled_pat = re.compile('[a-z][A-Z]{2}')
+
+ pytest.raises(ValueError, values.str.replace, 'abc', callable_repl,
+ regex=False)
+ pytest.raises(ValueError, values.str.replace, compiled_pat, '',
+ regex=False)
+
+ def test_repeat(self):
+ values = Series(['a', 'b', NA, 'c', NA, 'd'])
+
+ result = values.str.repeat(3)
+ exp = Series(['aaa', 'bbb', NA, 'ccc', NA, 'ddd'])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.repeat([1, 2, 3, 4, 5, 6])
+ exp = Series(['a', 'bb', NA, 'cccc', NA, 'dddddd'])
+ tm.assert_series_equal(result, exp)
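+ # a sequence of ints applies the repeats elementwise, i.e.
+ # result[i] == values[i] * n[i] for non-null entries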
+
+ # mixed
+ mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1,
+ 2.])
+
+ rs = Series(mixed).str.repeat(3)
+ xp = Series(['aaa', NA, 'bbb', NA, NA, 'foofoofoo', NA, NA, NA])
+ assert isinstance(rs, Series)
+ tm.assert_series_equal(rs, xp)
+
+ # unicode
+ values = Series([u('a'), u('b'), NA, u('c'), NA, u('d')])
+
+ result = values.str.repeat(3)
+ exp = Series([u('aaa'), u('bbb'), NA, u('ccc'), NA, u('ddd')])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.repeat([1, 2, 3, 4, 5, 6])
+ exp = Series([u('a'), u('bb'), NA, u('cccc'), NA, u('dddddd')])
+ tm.assert_series_equal(result, exp)
+
+ def test_match(self):
+ # New match behavior introduced in 0.13
+ values = Series(['fooBAD__barBAD', NA, 'foo'])
+ result = values.str.match('.*(BAD[_]+).*(BAD)')
+ exp = Series([True, NA, False])
+ tm.assert_series_equal(result, exp)
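+ # match returns a boolean Series (does the regex match at the start
+ # of each string); capture groups in the pattern do not change that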
+
+ values = Series(['fooBAD__barBAD', NA, 'foo'])
+ result = values.str.match('.*BAD[_]+.*BAD')
+ exp = Series([True, NA, False])
+ tm.assert_series_equal(result, exp)
+
+ # mixed
+ mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
+ 'foo', None, 1, 2.])
+ rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
+ xp = Series([True, NA, True, NA, NA, False, NA, NA, NA])
+ assert isinstance(rs, Series)
+ tm.assert_series_equal(rs, xp)
+
+ # unicode
+ values = Series([u('fooBAD__barBAD'), NA, u('foo')])
+ result = values.str.match('.*(BAD[_]+).*(BAD)')
+ exp = Series([True, NA, False])
+ tm.assert_series_equal(result, exp)
+
+ # na GH #6609
+ res = Series(['a', 0, np.nan]).str.match('a', na=False)
+ exp = Series([True, False, False])
+ assert_series_equal(exp, res)
+ res = Series(['a', 0, np.nan]).str.match('a')
+ exp = Series([True, np.nan, np.nan])
+ assert_series_equal(exp, res)
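+ # without na=, missing and non-string entries propagate as NaN,
+ # leaving an object-dtype result instead of bool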
+
+ def test_extract_expand_None(self):
+ values = Series(['fooBAD__barBAD', NA, 'foo'])
+ with pytest.raises(ValueError,
+ match='expand must be True or False'):
+ values.str.extract('.*(BAD[_]+).*(BAD)', expand=None)
+
+ def test_extract_expand_unspecified(self):
+ values = Series(['fooBAD__barBAD', NA, 'foo'])
+ result_unspecified = values.str.extract('.*(BAD[_]+).*')
+ assert isinstance(result_unspecified, DataFrame)
+ result_true = values.str.extract('.*(BAD[_]+).*', expand=True)
+ tm.assert_frame_equal(result_unspecified, result_true)
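+ # i.e. expand defaults to True, so leaving it unspecified returns a
+ # DataFrame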
+
+ def test_extract_expand_False(self):
+ # Contains tests like those in test_match and some others.
+ values = Series(['fooBAD__barBAD', NA, 'foo'])
+ er = [NA, NA] # empty row
+
+ result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False)
+ exp = DataFrame([['BAD__', 'BAD'], er, er])
+ tm.assert_frame_equal(result, exp)
+
+ # mixed
+ mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
+ 'foo', None, 1, 2.])
+
+ rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=False)
+ exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er, er, er,
+ er, er])
+ tm.assert_frame_equal(rs, exp)
+
+ # unicode
+ values = Series([u('fooBAD__barBAD'), NA, u('foo')])
+
+ result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False)
+ exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
+ tm.assert_frame_equal(result, exp)
+
+ # GH9980
+ # Index only works with one regex group since
+ # multi-group would expand to a frame
+ idx = Index(['A1', 'A2', 'A3', 'A4', 'B5'])
+ with pytest.raises(ValueError, match="supported"):
+ idx.str.extract('([AB])([123])', expand=False)
+
+ # these should work for both Series and Index
+ for klass in [Series, Index]:
+ # no groups
+ s_or_idx = klass(['A1', 'B2', 'C3'])
+ f = lambda: s_or_idx.str.extract('[ABC][123]', expand=False)
+ pytest.raises(ValueError, f)
+
+ # only non-capturing groups
+ f = lambda: s_or_idx.str.extract('(?:[AB]).*', expand=False)
+ pytest.raises(ValueError, f)
+
+ # single group renames series/index properly
+ s_or_idx = klass(['A1', 'A2'])
+ result = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=False)
+ assert result.name == 'uno'
+
+ exp = klass(['A', 'A'], name='uno')
+ if klass == Series:
+ tm.assert_series_equal(result, exp)
+ else:
+ tm.assert_index_equal(result, exp)
+
+ s = Series(['A1', 'B2', 'C3'])
+ # one group, no matches
+ result = s.str.extract('(_)', expand=False)
+ exp = Series([NA, NA, NA], dtype=object)
+ tm.assert_series_equal(result, exp)
+
+ # two groups, no matches
+ result = s.str.extract('(_)(_)', expand=False)
+ exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object)
+ tm.assert_frame_equal(result, exp)
+
+ # one group, some matches
+ result = s.str.extract('([AB])[123]', expand=False)
+ exp = Series(['A', 'B', NA])
+ tm.assert_series_equal(result, exp)
+
+ # two groups, some matches
+ result = s.str.extract('([AB])([123])', expand=False)
+ exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
+ tm.assert_frame_equal(result, exp)
+
+ # one named group
+ result = s.str.extract('(?P<letter>[AB])', expand=False)
+ exp = Series(['A', 'B', NA], name='letter')
+ tm.assert_series_equal(result, exp)
+
+ # two named groups
+ result = s.str.extract('(?P<letter>[AB])(?P<number>[123])',
+ expand=False)
+ exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]],
+ columns=['letter', 'number'])
+ tm.assert_frame_equal(result, exp)
+
+ # mix named and unnamed groups
+ result = s.str.extract('([AB])(?P<number>[123])', expand=False)
+ exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]],
+ columns=[0, 'number'])
+ tm.assert_frame_equal(result, exp)
+
+ # one normal group, one non-capturing group
+ result = s.str.extract('([AB])(?:[123])', expand=False)
+ exp = Series(['A', 'B', NA])
+ tm.assert_series_equal(result, exp)
+
+ # two normal groups, one non-capturing group
+ result = Series(['A11', 'B22', 'C33']).str.extract(
+ '([AB])([123])(?:[123])', expand=False)
+ exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
+ tm.assert_frame_equal(result, exp)
+
+ # one optional group followed by one normal group
+ result = Series(['A1', 'B2', '3']).str.extract(
+ '(?P<letter>[AB])?(?P<number>[123])', expand=False)
+ exp = DataFrame([['A', '1'], ['B', '2'], [NA, '3']],
+ columns=['letter', 'number'])
+ tm.assert_frame_equal(result, exp)
+
+ # one normal group followed by one optional group
+ result = Series(['A1', 'B2', 'C']).str.extract(
+ '(?P<letter>[ABC])(?P<number>[123])?', expand=False)
+ exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]],
+ columns=['letter', 'number'])
+ tm.assert_frame_equal(result, exp)
+
+ # GH6348
+ # not passing index to the extractor
+ def check_index(index):
+ data = ['A1', 'B2', 'C']
+ index = index[:len(data)]
+ s = Series(data, index=index)
+ result = s.str.extract(r'(\d)', expand=False)
+ exp = Series(['1', '2', NA], index=index)
+ tm.assert_series_equal(result, exp)
+
+ result = Series(data, index=index).str.extract(
+ r'(?P<letter>\D)(?P<number>\d)?', expand=False)
+ e_list = [
+ ['A', '1'],
+ ['B', '2'],
+ ['C', NA]
+ ]
+ exp = DataFrame(e_list, columns=['letter', 'number'], index=index)
+ tm.assert_frame_equal(result, exp)
+
+ i_funs = [
+ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex,
+ tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex
+ ]
+ for index in i_funs:
+ check_index(index())
+
+ # single series name is preserved.
+ s = Series(['a3', 'b3', 'c2'], name='bob')
+ r = s.str.extract(r'(?P<sue>[a-z])', expand=False)
+ e = Series(['a', 'b', 'c'], name='sue')
+ tm.assert_series_equal(r, e)
+ assert r.name == e.name
+
+ def test_extract_expand_True(self):
+ # Contains tests like those in test_match and some others.
+ values = Series(['fooBAD__barBAD', NA, 'foo'])
+ er = [NA, NA] # empty row
+
+ result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=True)
+ exp = DataFrame([['BAD__', 'BAD'], er, er])
+ tm.assert_frame_equal(result, exp)
+
+ # mixed
+ mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
+ 'foo', None, 1, 2.])
+
+ rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=True)
+ exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er,
+ er, er, er, er])
+ tm.assert_frame_equal(rs, exp)
+
+ # unicode
+ values = Series([u('fooBAD__barBAD'), NA, u('foo')])
+
+ result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=True)
+ exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
+ tm.assert_frame_equal(result, exp)
+
+ # these should work for both Series and Index
+ for klass in [Series, Index]:
+ # no groups
+ s_or_idx = klass(['A1', 'B2', 'C3'])
+ f = lambda: s_or_idx.str.extract('[ABC][123]', expand=True)
+ pytest.raises(ValueError, f)
+
+ # only non-capturing groups
+ f = lambda: s_or_idx.str.extract('(?:[AB]).*', expand=True)
+ pytest.raises(ValueError, f)
+
+ # single group renames series/index properly
+ s_or_idx = klass(['A1', 'A2'])
+ result_df = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=True)
+ assert isinstance(result_df, DataFrame)
+ result_series = result_df['uno']
+ assert_series_equal(result_series, Series(['A', 'A'], name='uno'))
+
+ def test_extract_series(self):
+ # extract should give the same result whether or not the
+ # series has a name.
+ for series_name in None, "series_name":
+ s = Series(['A1', 'B2', 'C3'], name=series_name)
+ # one group, no matches
+ result = s.str.extract('(_)', expand=True)
+ exp = DataFrame([NA, NA, NA], dtype=object)
+ tm.assert_frame_equal(result, exp)
+
+ # two groups, no matches
+ result = s.str.extract('(_)(_)', expand=True)
+ exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object)
+ tm.assert_frame_equal(result, exp)
+
+ # one group, some matches
+ result = s.str.extract('([AB])[123]', expand=True)
+ exp = DataFrame(['A', 'B', NA])
+ tm.assert_frame_equal(result, exp)
+
+ # two groups, some matches
+ result = s.str.extract('([AB])([123])', expand=True)
+ exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
+ tm.assert_frame_equal(result, exp)
+
+ # one named group
+ result = s.str.extract('(?P<letter>[AB])', expand=True)
+ exp = DataFrame({"letter": ['A', 'B', NA]})
+ tm.assert_frame_equal(result, exp)
+
+ # two named groups
+ result = s.str.extract(
+ '(?P<letter>[AB])(?P<number>[123])',
+ expand=True)
+ e_list = [
+ ['A', '1'],
+ ['B', '2'],
+ [NA, NA]
+ ]
+ exp = DataFrame(e_list, columns=['letter', 'number'])
+ tm.assert_frame_equal(result, exp)
+
+ # mix named and unnamed groups
+ result = s.str.extract('([AB])(?P<number>[123])', expand=True)
+ exp = DataFrame(e_list, columns=[0, 'number'])
+ tm.assert_frame_equal(result, exp)
+
+ # one normal group, one non-capturing group
+ result = s.str.extract('([AB])(?:[123])', expand=True)
+ exp = DataFrame(['A', 'B', NA])
+ tm.assert_frame_equal(result, exp)
+
+ def test_extract_optional_groups(self):
+
+ # two normal groups, one non-capturing group
+ result = Series(['A11', 'B22', 'C33']).str.extract(
+ '([AB])([123])(?:[123])', expand=True)
+ exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
+ tm.assert_frame_equal(result, exp)
+
+ # one optional group followed by one normal group
+ result = Series(['A1', 'B2', '3']).str.extract(
+ '(?P<letter>[AB])?(?P<number>[123])', expand=True)
+ e_list = [
+ ['A', '1'],
+ ['B', '2'],
+ [NA, '3']
+ ]
+ exp = DataFrame(e_list, columns=['letter', 'number'])
+ tm.assert_frame_equal(result, exp)
+
+ # one normal group followed by one optional group
+ result = Series(['A1', 'B2', 'C']).str.extract(
+ '(?P<letter>[ABC])(?P<number>[123])?', expand=True)
+ e_list = [
+ ['A', '1'],
+ ['B', '2'],
+ ['C', NA]
+ ]
+ exp = DataFrame(e_list, columns=['letter', 'number'])
+ tm.assert_frame_equal(result, exp)
+
+ # GH6348
+ # not passing index to the extractor
+ def check_index(index):
+ data = ['A1', 'B2', 'C']
+ index = index[:len(data)]
+ result = Series(data, index=index).str.extract(
+ r'(\d)', expand=True)
+ exp = DataFrame(['1', '2', NA], index=index)
+ tm.assert_frame_equal(result, exp)
+
+ result = Series(data, index=index).str.extract(
+ r'(?P<letter>\D)(?P<number>\d)?', expand=True)
+ e_list = [
+ ['A', '1'],
+ ['B', '2'],
+ ['C', NA]
+ ]
+ exp = DataFrame(e_list, columns=['letter', 'number'], index=index)
+ tm.assert_frame_equal(result, exp)
+
+ i_funs = [
+ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex,
+ tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex
+ ]
+ for index in i_funs:
+ check_index(index())
+
+ def test_extract_single_group_returns_frame(self):
+ # GH11386 extract should always return DataFrame, even when
+ # there is only one group. Prior to v0.18.0, extract returned
+ # Series when there was only one group in the regex.
+ s = Series(['a3', 'b3', 'c2'], name='series_name')
+ r = s.str.extract(r'(?P<letter>[a-z])', expand=True)
+ e = DataFrame({"letter": ['a', 'b', 'c']})
+ tm.assert_frame_equal(r, e)
+
+ def test_extractall(self):
+ subject_list = [
+ "dave@google.com",
+ "tdhock5@gmail.com",
+ "maudelaperriere@gmail.com",
+ "rob@gmail.com some text steve@gmail.com",
+ "a@b.com some text c@d.com and e@f.com",
+ np.nan,
+ "",
+ ]
+ expected_tuples = [
+ ("dave", "google", "com"),
+ ("tdhock5", "gmail", "com"),
+ ("maudelaperriere", "gmail", "com"),
+ ("rob", "gmail", "com"), ("steve", "gmail", "com"),
+ ("a", "b", "com"), ("c", "d", "com"), ("e", "f", "com"),
+ ]
+ named_pattern = r"""
+ (?P<user>[a-z0-9]+)
+ @
+ (?P<domain>[a-z]+)
+ \.
+ (?P<tld>[a-z]{2,4})
+ """
+ expected_columns = ["user", "domain", "tld"]
+ S = Series(subject_list)
+ # extractall should return a DataFrame with one row for each
+ # match, indexed by the subject from which the match came.
+ expected_index = MultiIndex.from_tuples([
+ (0, 0),
+ (1, 0),
+ (2, 0),
+ (3, 0),
+ (3, 1),
+ (4, 0),
+ (4, 1),
+ (4, 2),
+ ], names=(None, "match"))
+ expected_df = DataFrame(
+ expected_tuples, expected_index, expected_columns)
+ computed_df = S.str.extractall(named_pattern, re.VERBOSE)
+ tm.assert_frame_equal(computed_df, expected_df)
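+ # the last index level is named "match" and numbers the matches
+ # within each subject (0, 1, ...)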
+
+ # The index of the input Series should be used to construct
+ # the index of the output DataFrame:
+ series_index = MultiIndex.from_tuples([
+ ("single", "Dave"),
+ ("single", "Toby"),
+ ("single", "Maude"),
+ ("multiple", "robAndSteve"),
+ ("multiple", "abcdef"),
+ ("none", "missing"),
+ ("none", "empty"),
+ ])
+ Si = Series(subject_list, series_index)
+ expected_index = MultiIndex.from_tuples([
+ ("single", "Dave", 0),
+ ("single", "Toby", 0),
+ ("single", "Maude", 0),
+ ("multiple", "robAndSteve", 0),
+ ("multiple", "robAndSteve", 1),
+ ("multiple", "abcdef", 0),
+ ("multiple", "abcdef", 1),
+ ("multiple", "abcdef", 2),
+ ], names=(None, None, "match"))
+ expected_df = DataFrame(
+ expected_tuples, expected_index, expected_columns)
+ computed_df = Si.str.extractall(named_pattern, re.VERBOSE)
+ tm.assert_frame_equal(computed_df, expected_df)
+
+ # MultiIndexed subject with names.
+ Sn = Series(subject_list, series_index)
+ Sn.index.names = ("matches", "description")
+ expected_index.names = ("matches", "description", "match")
+ expected_df = DataFrame(
+ expected_tuples, expected_index, expected_columns)
+ computed_df = Sn.str.extractall(named_pattern, re.VERBOSE)
+ tm.assert_frame_equal(computed_df, expected_df)
+
+ # optional groups.
+ subject_list = ['', 'A1', '32']
+ named_pattern = '(?P<letter>[AB])?(?P<number>[123])'
+ computed_df = Series(subject_list).str.extractall(named_pattern)
+ expected_index = MultiIndex.from_tuples([
+ (1, 0),
+ (2, 0),
+ (2, 1),
+ ], names=(None, "match"))
+ expected_df = DataFrame([
+ ('A', '1'),
+ (NA, '3'),
+ (NA, '2'),
+ ], expected_index, columns=['letter', 'number'])
+ tm.assert_frame_equal(computed_df, expected_df)
+
+ # only one of two groups has a name.
+ pattern = '([AB])?(?P<number>[123])'
+ computed_df = Series(subject_list).str.extractall(pattern)
+ expected_df = DataFrame([
+ ('A', '1'),
+ (NA, '3'),
+ (NA, '2'),
+ ], expected_index, columns=[0, 'number'])
+ tm.assert_frame_equal(computed_df, expected_df)
+
+ def test_extractall_single_group(self):
+ # extractall(one named group) returns DataFrame with one named
+ # column.
+ s = Series(['a3', 'b3', 'd4c2'], name='series_name')
+ r = s.str.extractall(r'(?P<letter>[a-z])')
+ i = MultiIndex.from_tuples([
+ (0, 0),
+ (1, 0),
+ (2, 0),
+ (2, 1),
+ ], names=(None, "match"))
+ e = DataFrame({"letter": ['a', 'b', 'd', 'c']}, i)
+ tm.assert_frame_equal(r, e)
+
+ # extractall(one un-named group) returns DataFrame with one
+ # un-named column.
+ r = s.str.extractall(r'([a-z])')
+ e = DataFrame(['a', 'b', 'd', 'c'], i)
+ tm.assert_frame_equal(r, e)
+
+ def test_extractall_single_group_with_quantifier(self):
+ # extractall(one un-named group with quantifier) returns
+ # DataFrame with one un-named column (GH13382).
+ s = Series(['ab3', 'abc3', 'd4cd2'], name='series_name')
+ r = s.str.extractall(r'([a-z]+)')
+ i = MultiIndex.from_tuples([
+ (0, 0),
+ (1, 0),
+ (2, 0),
+ (2, 1),
+ ], names=(None, "match"))
+ e = DataFrame(['ab', 'abc', 'd', 'cd'], i)
+ tm.assert_frame_equal(r, e)
+
+ @pytest.mark.parametrize('data, names', [
+ ([], (None, )),
+ ([], ('i1', )),
+ ([], (None, 'i2')),
+ ([], ('i1', 'i2')),
+ (['a3', 'b3', 'd4c2'], (None, )),
+ (['a3', 'b3', 'd4c2'], ('i1', 'i2')),
+ (['a3', 'b3', 'd4c2'], (None, 'i2')),
+ (['a3', 'b3', 'd4c2'], ('i1', 'i2')),
+ ])
+ def test_extractall_no_matches(self, data, names):
+ # GH19075 extractall with no matches should return a valid MultiIndex
+ n = len(data)
+ if len(names) == 1:
+ i = Index(range(n), name=names[0])
+ else:
+ a = (tuple([i] * (n - 1)) for i in range(n))
+ i = MultiIndex.from_tuples(a, names=names)
+ s = Series(data, name='series_name', index=i, dtype='object')
+ ei = MultiIndex.from_tuples([], names=(names + ('match',)))
+
+ # one un-named group.
+ r = s.str.extractall('(z)')
+ e = DataFrame(columns=[0], index=ei)
+ tm.assert_frame_equal(r, e)
+
+ # two un-named groups.
+ r = s.str.extractall('(z)(z)')
+ e = DataFrame(columns=[0, 1], index=ei)
+ tm.assert_frame_equal(r, e)
+
+ # one named group.
+ r = s.str.extractall('(?P<first>z)')
+ e = DataFrame(columns=["first"], index=ei)
+ tm.assert_frame_equal(r, e)
+
+ # two named groups.
+ r = s.str.extractall('(?P<first>z)(?P<second>z)')
+ e = DataFrame(columns=["first", "second"], index=ei)
+ tm.assert_frame_equal(r, e)
+
+ # one named, one un-named.
+ r = s.str.extractall('(z)(?P<second>z)')
+ e = DataFrame(columns=[0, "second"], index=ei)
+ tm.assert_frame_equal(r, e)
+
+ def test_extractall_stringindex(self):
+ s = Series(["a1a2", "b1", "c1"], name='xxx')
+ res = s.str.extractall(r"[ab](?P<digit>\d)")
+ exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)],
+ names=[None, 'match'])
+ exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
+ tm.assert_frame_equal(res, exp)
+
+ # an Index should give the same result as a default, unnamed index;
+ # index.name does not affect the result
+ for idx in [Index(["a1a2", "b1", "c1"]),
+ Index(["a1a2", "b1", "c1"], name='xxx')]:
+
+ res = idx.str.extractall(r"[ab](?P<digit>\d)")
+ tm.assert_frame_equal(res, exp)
+
+ s = Series(["a1a2", "b1", "c1"], name='s_name',
+ index=Index(["XX", "yy", "zz"], name='idx_name'))
+ res = s.str.extractall(r"[ab](?P<digit>\d)")
+ exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)],
+ names=["idx_name", 'match'])
+ exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
+ tm.assert_frame_equal(res, exp)
+
+ def test_extractall_errors(self):
+ # It does not make sense to use extractall with a regex that has no
+ # capture groups: the result is a DataFrame with one column per
+ # capture group.
+ s = Series(['a3', 'b3', 'd4c2'], name='series_name')
+ with pytest.raises(ValueError, match="no capture groups"):
+ s.str.extractall(r'[a-z]')
+
+ def test_extract_index_one_two_groups(self):
+ s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"],
+ name='series_name')
+ r = s.index.str.extract(r'([A-Z])', expand=True)
+ e = DataFrame(['A', "B", "D"])
+ tm.assert_frame_equal(r, e)
+
+ # Prior to v0.18.0, index.str.extract(regex with one group)
+ # returned Index. With more than one group, extract raised an
+ # error (GH9980). Now extract always returns DataFrame.
+ r = s.index.str.extract(
+ r'(?P<letter>[A-Z])(?P<digit>[0-9])', expand=True)
+ e_list = [
+ ("A", "3"),
+ ("B", "3"),
+ ("D", "4"),
+ ]
+ e = DataFrame(e_list, columns=["letter", "digit"])
+ tm.assert_frame_equal(r, e)
+
+ def test_extractall_same_as_extract(self):
+ s = Series(['a3', 'b3', 'c2'], name='series_name')
+
+ pattern_two_noname = r'([a-z])([0-9])'
+ extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
+ has_multi_index = s.str.extractall(pattern_two_noname)
+ no_multi_index = has_multi_index.xs(0, level="match")
+ tm.assert_frame_equal(extract_two_noname, no_multi_index)
+
+ pattern_two_named = r'(?P<letter>[a-z])(?P<digit>[0-9])'
+ extract_two_named = s.str.extract(pattern_two_named, expand=True)
+ has_multi_index = s.str.extractall(pattern_two_named)
+ no_multi_index = has_multi_index.xs(0, level="match")
+ tm.assert_frame_equal(extract_two_named, no_multi_index)
+
+ pattern_one_named = r'(?P<group_name>[a-z])'
+ extract_one_named = s.str.extract(pattern_one_named, expand=True)
+ has_multi_index = s.str.extractall(pattern_one_named)
+ no_multi_index = has_multi_index.xs(0, level="match")
+ tm.assert_frame_equal(extract_one_named, no_multi_index)
+
+ pattern_one_noname = r'([a-z])'
+ extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
+ has_multi_index = s.str.extractall(pattern_one_noname)
+ no_multi_index = has_multi_index.xs(0, level="match")
+ tm.assert_frame_equal(extract_one_noname, no_multi_index)
+
+ def test_extractall_same_as_extract_subject_index(self):
+ # same as above tests, but s has an MultiIndex.
+ i = MultiIndex.from_tuples([
+ ("A", "first"),
+ ("B", "second"),
+ ("C", "third"),
+ ], names=("capital", "ordinal"))
+ s = Series(['a3', 'b3', 'c2'], i, name='series_name')
+
+ pattern_two_noname = r'([a-z])([0-9])'
+ extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
+ has_match_index = s.str.extractall(pattern_two_noname)
+ no_match_index = has_match_index.xs(0, level="match")
+ tm.assert_frame_equal(extract_two_noname, no_match_index)
+
+ pattern_two_named = r'(?P<letter>[a-z])(?P<digit>[0-9])'
+ extract_two_named = s.str.extract(pattern_two_named, expand=True)
+ has_match_index = s.str.extractall(pattern_two_named)
+ no_match_index = has_match_index.xs(0, level="match")
+ tm.assert_frame_equal(extract_two_named, no_match_index)
+
+ pattern_one_named = r'(?P<group_name>[a-z])'
+ extract_one_named = s.str.extract(pattern_one_named, expand=True)
+ has_match_index = s.str.extractall(pattern_one_named)
+ no_match_index = has_match_index.xs(0, level="match")
+ tm.assert_frame_equal(extract_one_named, no_match_index)
+
+ pattern_one_noname = r'([a-z])'
+ extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
+ has_match_index = s.str.extractall(pattern_one_noname)
+ no_match_index = has_match_index.xs(0, level="match")
+ tm.assert_frame_equal(extract_one_noname, no_match_index)
+
+ def test_empty_str_methods(self):
+ empty_str = empty = Series(dtype=object)
+ empty_int = Series(dtype=int)
+ empty_bool = Series(dtype=bool)
+ empty_bytes = Series(dtype=object)
+
+ # GH7241
+ # (extract) on empty series
+
+ tm.assert_series_equal(empty_str, empty.str.cat(empty))
+ assert '' == empty.str.cat()
+ tm.assert_series_equal(empty_str, empty.str.title())
+ tm.assert_series_equal(empty_int, empty.str.count('a'))
+ tm.assert_series_equal(empty_bool, empty.str.contains('a'))
+ tm.assert_series_equal(empty_bool, empty.str.startswith('a'))
+ tm.assert_series_equal(empty_bool, empty.str.endswith('a'))
+ tm.assert_series_equal(empty_str, empty.str.lower())
+ tm.assert_series_equal(empty_str, empty.str.upper())
+ tm.assert_series_equal(empty_str, empty.str.replace('a', 'b'))
+ tm.assert_series_equal(empty_str, empty.str.repeat(3))
+ tm.assert_series_equal(empty_bool, empty.str.match('^a'))
+ tm.assert_frame_equal(
+ DataFrame(columns=[0], dtype=str),
+ empty.str.extract('()', expand=True))
+ tm.assert_frame_equal(
+ DataFrame(columns=[0, 1], dtype=str),
+ empty.str.extract('()()', expand=True))
+ tm.assert_series_equal(
+ empty_str,
+ empty.str.extract('()', expand=False))
+ tm.assert_frame_equal(
+ DataFrame(columns=[0, 1], dtype=str),
+ empty.str.extract('()()', expand=False))
+ tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies())
+ tm.assert_series_equal(empty_str, empty_str.str.join(''))
+ tm.assert_series_equal(empty_int, empty.str.len())
+ tm.assert_series_equal(empty_str, empty_str.str.findall('a'))
+ tm.assert_series_equal(empty_int, empty.str.find('a'))
+ tm.assert_series_equal(empty_int, empty.str.rfind('a'))
+ tm.assert_series_equal(empty_str, empty.str.pad(42))
+ tm.assert_series_equal(empty_str, empty.str.center(42))
+ tm.assert_series_equal(empty_str, empty.str.split('a'))
+ tm.assert_series_equal(empty_str, empty.str.rsplit('a'))
+ tm.assert_series_equal(empty_str,
+ empty.str.partition('a', expand=False))
+ tm.assert_series_equal(empty_str,
+ empty.str.rpartition('a', expand=False))
+ tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
+ tm.assert_series_equal(empty_str, empty.str.slice(step=1))
+ tm.assert_series_equal(empty_str, empty.str.strip())
+ tm.assert_series_equal(empty_str, empty.str.lstrip())
+ tm.assert_series_equal(empty_str, empty.str.rstrip())
+ tm.assert_series_equal(empty_str, empty.str.wrap(42))
+ tm.assert_series_equal(empty_str, empty.str.get(0))
+ tm.assert_series_equal(empty_str, empty_bytes.str.decode('ascii'))
+ tm.assert_series_equal(empty_bytes, empty.str.encode('ascii'))
+ tm.assert_series_equal(empty_str, empty.str.isalnum())
+ tm.assert_series_equal(empty_str, empty.str.isalpha())
+ tm.assert_series_equal(empty_str, empty.str.isdigit())
+ tm.assert_series_equal(empty_str, empty.str.isspace())
+ tm.assert_series_equal(empty_str, empty.str.islower())
+ tm.assert_series_equal(empty_str, empty.str.isupper())
+ tm.assert_series_equal(empty_str, empty.str.istitle())
+ tm.assert_series_equal(empty_str, empty.str.isnumeric())
+ tm.assert_series_equal(empty_str, empty.str.isdecimal())
+ tm.assert_series_equal(empty_str, empty.str.capitalize())
+ tm.assert_series_equal(empty_str, empty.str.swapcase())
+ tm.assert_series_equal(empty_str, empty.str.normalize('NFC'))
+ if compat.PY3:
+ table = str.maketrans('a', 'b')
+ else:
+ import string
+ table = string.maketrans('a', 'b')
+ tm.assert_series_equal(empty_str, empty.str.translate(table))
+
+ def test_empty_str_methods_to_frame(self):
+ empty = Series(dtype=str)
+ empty_df = DataFrame([])
+ tm.assert_frame_equal(empty_df, empty.str.partition('a'))
+ tm.assert_frame_equal(empty_df, empty.str.rpartition('a'))
+
+ def test_ismethods(self):
+ values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' ']
+ str_s = Series(values)
+ alnum_e = [True, True, True, True, True, False, True, True, False,
+ False]
+ alpha_e = [True, True, True, False, False, False, True, False, False,
+ False]
+ digit_e = [False, False, False, True, False, False, False, True, False,
+ False]
+
+ # TODO: unused
+ num_e = [False, False, False, True, False, False, # noqa
+ False, True, False, False]
+
+ space_e = [False, False, False, False, False, False, False, False,
+ False, True]
+ lower_e = [False, True, False, False, False, False, False, False,
+ False, False]
+ upper_e = [True, False, False, False, True, False, True, False, False,
+ False]
+ title_e = [True, False, True, False, True, False, False, False, False,
+ False]
+
+ tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e))
+ tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e))
+ tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e))
+ tm.assert_series_equal(str_s.str.isspace(), Series(space_e))
+ tm.assert_series_equal(str_s.str.islower(), Series(lower_e))
+ tm.assert_series_equal(str_s.str.isupper(), Series(upper_e))
+ tm.assert_series_equal(str_s.str.istitle(), Series(title_e))
+
+ assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values]
+ assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values]
+ assert str_s.str.isdigit().tolist() == [v.isdigit() for v in values]
+ assert str_s.str.isspace().tolist() == [v.isspace() for v in values]
+ assert str_s.str.islower().tolist() == [v.islower() for v in values]
+ assert str_s.str.isupper().tolist() == [v.isupper() for v in values]
+ assert str_s.str.istitle().tolist() == [v.istitle() for v in values]
+
+ def test_isnumeric(self):
+ # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER
+ # 0x2605: ★ not number
+ # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
+ # 0xFF13: ３ FULLWIDTH DIGIT THREE
+ values = ['A', '3', u'¼', u'★', u'፸', u'３', 'four']
+ s = Series(values)
+ numeric_e = [False, True, True, False, True, True, False]
+ decimal_e = [False, True, False, False, False, True, False]
+ tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
+ tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
+
+ unicodes = [u'A', u'3', u'¼', u'★', u'፸', u'３', u'four']
+ assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes]
+ assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes]
+
+ values = ['A', np.nan, u'¼', u'★', np.nan, u'３', 'four']
+ s = Series(values)
+ numeric_e = [False, np.nan, True, False, np.nan, True, False]
+ decimal_e = [False, np.nan, False, False, np.nan, True, False]
+ tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
+ tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
+
+ def test_get_dummies(self):
+ s = Series(['a|b', 'a|c', np.nan])
+ result = s.str.get_dummies('|')
+ expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+ columns=list('abc'))
+ tm.assert_frame_equal(result, expected)
+
+ s = Series(['a;b', 'a', 7])
+ result = s.str.get_dummies(';')
+ expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]],
+ columns=list('7ab'))
+ tm.assert_frame_equal(result, expected)
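+ # non-string entries are coerced with str(), so the integer 7 above
+ # becomes the dummy column '7'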
+
+ # GH9980, GH8028
+ idx = Index(['a|b', 'a|c', 'b|c'])
+ result = idx.str.get_dummies('|')
+
+ expected = MultiIndex.from_tuples([(1, 1, 0), (1, 0, 1),
+ (0, 1, 1)], names=('a', 'b', 'c'))
+ tm.assert_index_equal(result, expected)
+
+ def test_get_dummies_with_name_dummy(self):
+ # GH 12180
+ # Dummies named 'name' should work as expected
+ s = Series(['a', 'b,name', 'b'])
+ result = s.str.get_dummies(',')
+ expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]],
+ columns=['a', 'b', 'name'])
+ tm.assert_frame_equal(result, expected)
+
+ idx = Index(['a|b', 'name|c', 'b|name'])
+ result = idx.str.get_dummies('|')
+
+ expected = MultiIndex.from_tuples([(1, 1, 0, 0), (0, 0, 1, 1),
+ (0, 1, 0, 1)],
+ names=('a', 'b', 'c', 'name'))
+ tm.assert_index_equal(result, expected)
+
+ def test_join(self):
+ values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
+ result = values.str.split('_').str.join('_')
+ tm.assert_series_equal(values, result)
+
+ # mixed
+ mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(),
+ 'foo', None, 1, 2.])
+
+ rs = Series(mixed).str.split('_').str.join('_')
+ xp = Series(['a_b', NA, 'asdf_cas_asdf', NA, NA, 'foo', NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ # unicode
+ values = Series([u('a_b_c'), u('c_d_e'), np.nan, u('f_g_h')])
+ result = values.str.split('_').str.join('_')
+ tm.assert_series_equal(values, result)
+
+ def test_len(self):
+ values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo'])
+
+ result = values.str.len()
+ exp = values.map(lambda x: len(x) if notna(x) else NA)
+ tm.assert_series_equal(result, exp)
+
+ # mixed
+ mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(),
+ 'foo', None, 1, 2.])
+
+ rs = Series(mixed).str.len()
+ xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ # unicode
+ values = Series([u('foo'), u('fooo'), u('fooooo'), np.nan,
+ u('fooooooo')])
+
+ result = values.str.len()
+ exp = values.map(lambda x: len(x) if notna(x) else NA)
+ tm.assert_series_equal(result, exp)
+
+ def test_findall(self):
+ values = Series(['fooBAD__barBAD', NA, 'foo', 'BAD'])
+
+ result = values.str.findall('BAD[_]*')
+ exp = Series([['BAD__', 'BAD'], NA, [], ['BAD']])
+ tm.assert_almost_equal(result, exp)
+
+ # mixed
+ mixed = Series(['fooBAD__barBAD', NA, 'foo', True, datetime.today(),
+ 'BAD', None, 1, 2.])
+
+ rs = Series(mixed).str.findall('BAD[_]*')
+ xp = Series([['BAD__', 'BAD'], NA, [], NA, NA, ['BAD'], NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ # unicode
+ values = Series([u('fooBAD__barBAD'), NA, u('foo'), u('BAD')])
+
+ result = values.str.findall('BAD[_]*')
+ exp = Series([[u('BAD__'), u('BAD')], NA, [], [u('BAD')]])
+ tm.assert_almost_equal(result, exp)
+
+ def test_find(self):
+ values = Series(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF', 'XXXX'])
+ result = values.str.find('EF')
+ tm.assert_series_equal(result, Series([4, 3, 1, 0, -1]))
+ expected = np.array([v.find('EF') for v in values.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
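+ # find returns -1 when the substring is absent, matching str.find;
+ # str.index raises ValueError instead (see test_index)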
+
+ result = values.str.rfind('EF')
+ tm.assert_series_equal(result, Series([4, 5, 7, 4, -1]))
+ expected = np.array([v.rfind('EF') for v in values.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = values.str.find('EF', 3)
+ tm.assert_series_equal(result, Series([4, 3, 7, 4, -1]))
+ expected = np.array([v.find('EF', 3) for v in values.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = values.str.rfind('EF', 3)
+ tm.assert_series_equal(result, Series([4, 5, 7, 4, -1]))
+ expected = np.array([v.rfind('EF', 3) for v in values.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = values.str.find('EF', 3, 6)
+ tm.assert_series_equal(result, Series([4, 3, -1, 4, -1]))
+ expected = np.array([v.find('EF', 3, 6) for v in values.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = values.str.rfind('EF', 3, 6)
+ tm.assert_series_equal(result, Series([4, 3, -1, 4, -1]))
+ expected = np.array([v.rfind('EF', 3, 6) for v in values.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ with pytest.raises(TypeError,
+ match="expected a string object, not int"):
+ result = values.str.find(0)
+
+ with pytest.raises(TypeError,
+ match="expected a string object, not int"):
+ result = values.str.rfind(0)
+
+ def test_find_nan(self):
+ values = Series(['ABCDEFG', np.nan, 'DEFGHIJEF', np.nan, 'XXXX'])
+ result = values.str.find('EF')
+ tm.assert_series_equal(result, Series([4, np.nan, 1, np.nan, -1]))
+
+ result = values.str.rfind('EF')
+ tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
+
+ result = values.str.find('EF', 3)
+ tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
+
+ result = values.str.rfind('EF', 3)
+ tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
+
+ result = values.str.find('EF', 3, 6)
+ tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))
+
+ result = values.str.rfind('EF', 3, 6)
+ tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))
+
+ def test_index(self):
+
+ def _check(result, expected):
+ if isinstance(result, Series):
+ tm.assert_series_equal(result, expected)
+ else:
+ tm.assert_index_equal(result, expected)
+
+ for klass in [Series, Index]:
+ s = klass(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF'])
+
+ result = s.str.index('EF')
+ _check(result, klass([4, 3, 1, 0]))
+ expected = np.array([v.index('EF') for v in s.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = s.str.rindex('EF')
+ _check(result, klass([4, 5, 7, 4]))
+ expected = np.array([v.rindex('EF') for v in s.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = s.str.index('EF', 3)
+ _check(result, klass([4, 3, 7, 4]))
+ expected = np.array([v.index('EF', 3) for v in s.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = s.str.rindex('EF', 3)
+ _check(result, klass([4, 5, 7, 4]))
+ expected = np.array([v.rindex('EF', 3) for v in s.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = s.str.index('E', 4, 8)
+ _check(result, klass([4, 5, 7, 4]))
+ expected = np.array([v.index('E', 4, 8) for v in s.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = s.str.rindex('E', 0, 5)
+ _check(result, klass([4, 3, 1, 4]))
+ expected = np.array([v.rindex('E', 0, 5) for v in s.values],
+ dtype=np.int64)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ with pytest.raises(ValueError, match="substring not found"):
+ result = s.str.index('DE')
+
+ msg = "expected a string object, not int"
+ with pytest.raises(TypeError, match=msg):
+ result = s.str.index(0)
+
+ # test with nan
+ s = Series(['abcb', 'ab', 'bcbe', np.nan])
+ result = s.str.index('b')
+ tm.assert_series_equal(result, Series([1, 1, 0, np.nan]))
+ result = s.str.rindex('b')
+ tm.assert_series_equal(result, Series([3, 1, 2, np.nan]))
+
+ def test_pad(self):
+ values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])
+
+ result = values.str.pad(5, side='left')
+ exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee'])
+ tm.assert_almost_equal(result, exp)
+
+ result = values.str.pad(5, side='right')
+ exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee'])
+ tm.assert_almost_equal(result, exp)
+
+ result = values.str.pad(5, side='both')
+ exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee'])
+ tm.assert_almost_equal(result, exp)
+
+ # mixed
+ mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2.
+ ])
+
+ rs = Series(mixed).str.pad(5, side='left')
+ xp = Series([' a', NA, ' b', NA, NA, ' ee', NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2.
+ ])
+
+ rs = Series(mixed).str.pad(5, side='right')
+ xp = Series(['a ', NA, 'b ', NA, NA, 'ee ', NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2.
+ ])
+
+ rs = Series(mixed).str.pad(5, side='both')
+ xp = Series([' a ', NA, ' b ', NA, NA, ' ee ', NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ # unicode
+ values = Series([u('a'), u('b'), NA, u('c'), NA, u('eeeeee')])
+
+ result = values.str.pad(5, side='left')
+ exp = Series([u(' a'), u(' b'), NA, u(' c'), NA, u('eeeeee')])
+ tm.assert_almost_equal(result, exp)
+
+ result = values.str.pad(5, side='right')
+ exp = Series([u('a '), u('b '), NA, u('c '), NA, u('eeeeee')])
+ tm.assert_almost_equal(result, exp)
+
+ result = values.str.pad(5, side='both')
+ exp = Series([u(' a '), u(' b '), NA, u(' c '), NA, u('eeeeee')])
+ tm.assert_almost_equal(result, exp)
+
+ def test_pad_fillchar(self):
+
+ values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])
+
+ result = values.str.pad(5, side='left', fillchar='X')
+ exp = Series(['XXXXa', 'XXXXb', NA, 'XXXXc', NA, 'eeeeee'])
+ tm.assert_almost_equal(result, exp)
+
+ result = values.str.pad(5, side='right', fillchar='X')
+ exp = Series(['aXXXX', 'bXXXX', NA, 'cXXXX', NA, 'eeeeee'])
+ tm.assert_almost_equal(result, exp)
+
+ result = values.str.pad(5, side='both', fillchar='X')
+ exp = Series(['XXaXX', 'XXbXX', NA, 'XXcXX', NA, 'eeeeee'])
+ tm.assert_almost_equal(result, exp)
+
+ msg = "fillchar must be a character, not str"
+ with pytest.raises(TypeError, match=msg):
+ result = values.str.pad(5, fillchar='XY')
+
+ msg = "fillchar must be a character, not int"
+ with pytest.raises(TypeError, match=msg):
+ result = values.str.pad(5, fillchar=5)
+
+ @pytest.mark.parametrize("f", ['center', 'ljust', 'rjust', 'zfill', 'pad'])
+ def test_pad_width(self, f):
+ # see gh-13598
+ s = Series(['1', '22', 'a', 'bb'])
+ msg = "width must be of integer type, not*"
+
+ with pytest.raises(TypeError, match=msg):
+ getattr(s.str, f)('f')
+
+ def test_translate(self):
+
+ def _check(result, expected):
+ if isinstance(result, Series):
+ tm.assert_series_equal(result, expected)
+ else:
+ tm.assert_index_equal(result, expected)
+
+ for klass in [Series, Index]:
+ s = klass(['abcdefg', 'abcc', 'cdddfg', 'cdefggg'])
+ if not compat.PY3:
+ import string
+ table = string.maketrans('abc', 'cde')
+ else:
+ table = str.maketrans('abc', 'cde')
+ result = s.str.translate(table)
+ expected = klass(['cdedefg', 'cdee', 'edddfg', 'edefggg'])
+ _check(result, expected)
+
+ # use of deletechars is python 2 only
+ if not compat.PY3:
+ result = s.str.translate(table, deletechars='fg')
+ expected = klass(['cdede', 'cdee', 'eddd', 'ede'])
+ _check(result, expected)
+
+ result = s.str.translate(None, deletechars='fg')
+ expected = klass(['abcde', 'abcc', 'cddd', 'cde'])
+ _check(result, expected)
+ else:
+ msg = "deletechars is not a valid argument"
+ with pytest.raises(ValueError, match=msg):
+ result = s.str.translate(table, deletechars='fg')
+
+ # Series with non-string values
+ s = Series(['a', 'b', 'c', 1.2])
+ expected = Series(['c', 'd', 'e', np.nan])
+ result = s.str.translate(table)
+ tm.assert_series_equal(result, expected)
+
+ def test_center_ljust_rjust(self):
+ values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])
+
+ result = values.str.center(5)
+ exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee'])
+ tm.assert_almost_equal(result, exp)
+
+ result = values.str.ljust(5)
+ exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee'])
+ tm.assert_almost_equal(result, exp)
+
+ result = values.str.rjust(5)
+ exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee'])
+ tm.assert_almost_equal(result, exp)
+
+ # mixed
+ mixed = Series(['a', NA, 'b', True, datetime.today(), 'c', 'eee', None,
+ 1, 2.])
+
+ rs = Series(mixed).str.center(5)
+ xp = Series([' a ', NA, ' b ', NA, NA, ' c ', ' eee ', NA, NA, NA
+ ])
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ rs = Series(mixed).str.ljust(5)
+ xp = Series(['a ', NA, 'b ', NA, NA, 'c ', 'eee ', NA, NA, NA
+ ])
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ rs = Series(mixed).str.rjust(5)
+ xp = Series([' a', NA, ' b', NA, NA, ' c', ' eee', NA, NA, NA
+ ])
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ # unicode
+ values = Series([u('a'), u('b'), NA, u('c'), NA, u('eeeeee')])
+
+ result = values.str.center(5)
+ exp = Series([u(' a '), u(' b '), NA, u(' c '), NA, u('eeeeee')])
+ tm.assert_almost_equal(result, exp)
+
+ result = values.str.ljust(5)
+ exp = Series([u('a '), u('b '), NA, u('c '), NA, u('eeeeee')])
+ tm.assert_almost_equal(result, exp)
+
+ result = values.str.rjust(5)
+ exp = Series([u(' a'), u(' b'), NA, u(' c'), NA, u('eeeeee')])
+ tm.assert_almost_equal(result, exp)
+
+ def test_center_ljust_rjust_fillchar(self):
+ values = Series(['a', 'bb', 'cccc', 'ddddd', 'eeeeee'])
+
+ result = values.str.center(5, fillchar='X')
+ expected = Series(['XXaXX', 'XXbbX', 'Xcccc', 'ddddd', 'eeeeee'])
+ tm.assert_series_equal(result, expected)
+ expected = np.array([v.center(5, 'X') for v in values.values],
+ dtype=np.object_)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = values.str.ljust(5, fillchar='X')
+ expected = Series(['aXXXX', 'bbXXX', 'ccccX', 'ddddd', 'eeeeee'])
+ tm.assert_series_equal(result, expected)
+ expected = np.array([v.ljust(5, 'X') for v in values.values],
+ dtype=np.object_)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = values.str.rjust(5, fillchar='X')
+ expected = Series(['XXXXa', 'XXXbb', 'Xcccc', 'ddddd', 'eeeeee'])
+ tm.assert_series_equal(result, expected)
+ expected = np.array([v.rjust(5, 'X') for v in values.values],
+ dtype=np.object_)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ # If fillchar is not a character, normal str raises TypeError
+ # 'aaa'.ljust(5, 'XY')
+ # TypeError: must be char, not str
+ template = "fillchar must be a character, not {dtype}"
+
+ with pytest.raises(TypeError, match=template.format(dtype="str")):
+ values.str.center(5, fillchar='XY')
+
+ with pytest.raises(TypeError, match=template.format(dtype="str")):
+ values.str.ljust(5, fillchar='XY')
+
+ with pytest.raises(TypeError, match=template.format(dtype="str")):
+ values.str.rjust(5, fillchar='XY')
+
+ with pytest.raises(TypeError, match=template.format(dtype="int")):
+ values.str.center(5, fillchar=1)
+
+ with pytest.raises(TypeError, match=template.format(dtype="int")):
+ values.str.ljust(5, fillchar=1)
+
+ with pytest.raises(TypeError, match=template.format(dtype="int")):
+ values.str.rjust(5, fillchar=1)
+
+ def test_zfill(self):
+ values = Series(['1', '22', 'aaa', '333', '45678'])
+
+ result = values.str.zfill(5)
+ expected = Series(['00001', '00022', '00aaa', '00333', '45678'])
+ tm.assert_series_equal(result, expected)
+ expected = np.array([v.zfill(5) for v in values.values],
+ dtype=np.object_)
+ tm.assert_numpy_array_equal(result.values, expected)
+
+ result = values.str.zfill(3)
+ expected = Series(['001', '022', 'aaa', '333', '45678'])
+ tm.assert_series_equal(result, expected)
+ expected = np.array([v.zfill(3) for v in values.values],
+ dtype=np.object_)
+ tm.assert_numpy_array_equal(result.values, expected)
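+ # for these values zfill matches str.zfill: left-pad with '0' up to
+ # width, leaving strings that are already longer unchanged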
+
+ values = Series(['1', np.nan, 'aaa', np.nan, '45678'])
+ result = values.str.zfill(5)
+ expected = Series(['00001', np.nan, '00aaa', np.nan, '45678'])
+ tm.assert_series_equal(result, expected)
+
+ def test_split(self):
+ values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+
+ result = values.str.split('_')
+ exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']])
+ tm.assert_series_equal(result, exp)
+
+ # more than one char
+ values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
+ result = values.str.split('__')
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.split('__', expand=False)
+ tm.assert_series_equal(result, exp)
+
+ # mixed
+ mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1,
+ 2.])
+ result = mixed.str.split('_')
+ exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA
+ ])
+ assert isinstance(result, Series)
+ tm.assert_almost_equal(result, exp)
+
+ result = mixed.str.split('_', expand=False)
+ assert isinstance(result, Series)
+ tm.assert_almost_equal(result, exp)
+
+ # unicode
+ values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
+
+ result = values.str.split('_')
+ exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA,
+ [u('f'), u('g'), u('h')]])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.split('_', expand=False)
+ tm.assert_series_equal(result, exp)
+
+ # regex split
+ values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
+ result = values.str.split('[,_]')
+ exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA,
+ [u('f'), u('g'), u('h')]])
+ tm.assert_series_equal(result, exp)
+
+ def test_rsplit(self):
+ values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+ result = values.str.rsplit('_')
+ exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']])
+ tm.assert_series_equal(result, exp)
+
+ # more than one char
+ values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
+ result = values.str.rsplit('__')
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.rsplit('__', expand=False)
+ tm.assert_series_equal(result, exp)
+
+ # mixed
+ mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1,
+ 2.])
+ result = mixed.str.rsplit('_')
+ exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA
+ ])
+ assert isinstance(result, Series)
+ tm.assert_almost_equal(result, exp)
+
+ result = mixed.str.rsplit('_', expand=False)
+ assert isinstance(result, Series)
+ tm.assert_almost_equal(result, exp)
+
+ # unicode
+ values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
+ result = values.str.rsplit('_')
+ exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA,
+ [u('f'), u('g'), u('h')]])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.rsplit('_', expand=False)
+ tm.assert_series_equal(result, exp)
+
+ # regex split is not supported by rsplit
+ values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
+ result = values.str.rsplit('[,_]')
+ exp = Series([[u('a,b_c')], [u('c_d,e')], NA, [u('f,g,h')]])
+ tm.assert_series_equal(result, exp)
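+ # i.e. rsplit always treats pat as a literal separator, so the
+ # regex-looking '[,_]' matches nothing and each string is unsplit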
+
+ # setting max number of splits, make sure it's from reverse
+ values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+ result = values.str.rsplit('_', n=1)
+ exp = Series([['a_b', 'c'], ['c_d', 'e'], NA, ['f_g', 'h']])
+ tm.assert_series_equal(result, exp)
+
+ def test_split_blank_string(self):
+ # expand blank split GH 20067
+ values = Series([''], name='test')
+ result = values.str.split(expand=True)
+ exp = DataFrame([[]])
+ tm.assert_frame_equal(result, exp)
+
+ values = Series(['a b c', 'a b', '', ' '], name='test')
+ result = values.str.split(expand=True)
+ exp = DataFrame([['a', 'b', 'c'], ['a', 'b', np.nan],
+ [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]])
+ tm.assert_frame_equal(result, exp)
+
+ def test_split_noargs(self):
+ # #1859
+ s = Series(['Wes McKinney', 'Travis Oliphant'])
+ result = s.str.split()
+ expected = ['Travis', 'Oliphant']
+ assert result[1] == expected
+ result = s.str.rsplit()
+ assert result[1] == expected
+
+ def test_split_maxsplit(self):
+ # n=0 and n=-1 both mean "no limit" (re.split uses 0, str.split uses -1)
+ s = Series(['bd asdf jfg', 'kjasdflqw asdfnfk'])
+
+ result = s.str.split(n=-1)
+ xp = s.str.split()
+ tm.assert_series_equal(result, xp)
+
+ result = s.str.split(n=0)
+ tm.assert_series_equal(result, xp)
+
+ xp = s.str.split('asdf')
+ result = s.str.split('asdf', n=0)
+ tm.assert_series_equal(result, xp)
+
+ result = s.str.split('asdf', n=-1)
+ tm.assert_series_equal(result, xp)
+
+ def test_split_no_pat_with_nonzero_n(self):
+ s = Series(['split once', 'split once too!'])
+ result = s.str.split(n=1)
+ expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']})
+ tm.assert_series_equal(expected, result, check_index_type=False)
+
+ def test_split_to_dataframe(self):
+ s = Series(['nosplit', 'alsonosplit'])
+ result = s.str.split('_', expand=True)
+ exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
+ tm.assert_frame_equal(result, exp)
+
+ s = Series(['some_equal_splits', 'with_no_nans'])
+ result = s.str.split('_', expand=True)
+ exp = DataFrame({0: ['some', 'with'],
+ 1: ['equal', 'no'],
+ 2: ['splits', 'nans']})
+ tm.assert_frame_equal(result, exp)
+
+ s = Series(['some_unequal_splits', 'one_of_these_things_is_not'])
+ result = s.str.split('_', expand=True)
+ exp = DataFrame({0: ['some', 'one'],
+ 1: ['unequal', 'of'],
+ 2: ['splits', 'these'],
+ 3: [NA, 'things'],
+ 4: [NA, 'is'],
+ 5: [NA, 'not']})
+ tm.assert_frame_equal(result, exp)
+
+ s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
+ result = s.str.split('_', expand=True)
+ exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
+ index=['preserve', 'me'])
+ tm.assert_frame_equal(result, exp)
+
+ with pytest.raises(ValueError, match="expand must be"):
+ s.str.split('_', expand="not_a_boolean")
+
+ def test_split_to_multiindex_expand(self):
+ # https://github.com/pandas-dev/pandas/issues/23677
+
+ idx = Index(['nosplit', 'alsonosplit', np.nan])
+ result = idx.str.split('_', expand=True)
+ exp = idx
+ tm.assert_index_equal(result, exp)
+ assert result.nlevels == 1
+
+ idx = Index(['some_equal_splits', 'with_no_nans', np.nan, None])
+ result = idx.str.split('_', expand=True)
+ exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
+ ('with', 'no', 'nans'),
+ [np.nan, np.nan, np.nan],
+ [None, None, None]])
+ tm.assert_index_equal(result, exp)
+ assert result.nlevels == 3
+
+ idx = Index(['some_unequal_splits',
+ 'one_of_these_things_is_not',
+ np.nan, None])
+ result = idx.str.split('_', expand=True)
+ exp = MultiIndex.from_tuples([('some', 'unequal', 'splits',
+ NA, NA, NA),
+ ('one', 'of', 'these',
+ 'things', 'is', 'not'),
+ (np.nan, np.nan, np.nan,
+ np.nan, np.nan, np.nan),
+ (None, None, None,
+ None, None, None)])
+ tm.assert_index_equal(result, exp)
+ assert result.nlevels == 6
+
+ with pytest.raises(ValueError, match="expand must be"):
+ idx.str.split('_', expand="not_a_boolean")
+
+ def test_rsplit_to_dataframe_expand(self):
+ s = Series(['nosplit', 'alsonosplit'])
+ result = s.str.rsplit('_', expand=True)
+ exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
+ tm.assert_frame_equal(result, exp)
+
+ s = Series(['some_equal_splits', 'with_no_nans'])
+ result = s.str.rsplit('_', expand=True)
+ exp = DataFrame({0: ['some', 'with'],
+ 1: ['equal', 'no'],
+ 2: ['splits', 'nans']})
+ tm.assert_frame_equal(result, exp)
+
+ result = s.str.rsplit('_', expand=True, n=2)
+ exp = DataFrame({0: ['some', 'with'],
+ 1: ['equal', 'no'],
+ 2: ['splits', 'nans']})
+ tm.assert_frame_equal(result, exp)
+
+ result = s.str.rsplit('_', expand=True, n=1)
+ exp = DataFrame({0: ['some_equal', 'with_no'], 1: ['splits', 'nans']})
+ tm.assert_frame_equal(result, exp)
+
+ s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
+ result = s.str.rsplit('_', expand=True)
+ exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
+ index=['preserve', 'me'])
+ tm.assert_frame_equal(result, exp)
+
+ def test_rsplit_to_multiindex_expand(self):
+ idx = Index(['nosplit', 'alsonosplit'])
+ result = idx.str.rsplit('_', expand=True)
+ exp = idx
+ tm.assert_index_equal(result, exp)
+ assert result.nlevels == 1
+
+ idx = Index(['some_equal_splits', 'with_no_nans'])
+ result = idx.str.rsplit('_', expand=True)
+ exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
+ ('with', 'no', 'nans')])
+ tm.assert_index_equal(result, exp)
+ assert result.nlevels == 3
+
+ idx = Index(['some_equal_splits', 'with_no_nans'])
+ result = idx.str.rsplit('_', expand=True, n=1)
+ exp = MultiIndex.from_tuples([('some_equal', 'splits'),
+ ('with_no', 'nans')])
+ tm.assert_index_equal(result, exp)
+ assert result.nlevels == 2
+
+ def test_split_nan_expand(self):
+ # gh-18450
+ s = Series(["foo,bar,baz", NA])
+ result = s.str.split(",", expand=True)
+ exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]])
+ tm.assert_frame_equal(result, exp)
+
+ # check that these are actually np.nan and not None
+ # TODO see GH 18463
+ # tm.assert_frame_equal does not differentiate
+ assert all(np.isnan(x) for x in result.iloc[1])
+
+ def test_split_with_name(self):
+ # GH 12617
+
+ # should preserve name
+ s = Series(['a,b', 'c,d'], name='xxx')
+ res = s.str.split(',')
+ exp = Series([['a', 'b'], ['c', 'd']], name='xxx')
+ tm.assert_series_equal(res, exp)
+
+ res = s.str.split(',', expand=True)
+ exp = DataFrame([['a', 'b'], ['c', 'd']])
+ tm.assert_frame_equal(res, exp)
+
+ idx = Index(['a,b', 'c,d'], name='xxx')
+ res = idx.str.split(',')
+ exp = Index([['a', 'b'], ['c', 'd']], name='xxx')
+ assert res.nlevels == 1
+ tm.assert_index_equal(res, exp)
+
+ res = idx.str.split(',', expand=True)
+ exp = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')])
+ assert res.nlevels == 2
+ tm.assert_index_equal(res, exp)
+
+ def test_partition_series(self):
+ # https://github.com/pandas-dev/pandas/issues/23558
+
+ values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
+
+ result = values.str.partition('_', expand=False)
+ exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA,
+ ('f', '_', 'g_h'), None])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.rpartition('_', expand=False)
+ exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA,
+ ('f_g', '_', 'h'), None])
+ tm.assert_series_equal(result, exp)
+
+ # more than one char
+ values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h', None])
+ result = values.str.partition('__', expand=False)
+ exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA,
+ ('f', '__', 'g__h'), None])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.rpartition('__', expand=False)
+ exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA,
+ ('f__g', '__', 'h'), None])
+ tm.assert_series_equal(result, exp)
+
+ # no separator given: defaults to a single space
+ values = Series(['a b c', 'c d e', NA, 'f g h', None])
+ result = values.str.partition(expand=False)
+ exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA,
+ ('f', ' ', 'g h'), None])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.rpartition(expand=False)
+ exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA,
+ ('f g', ' ', 'h'), None])
+ tm.assert_series_equal(result, exp)
+
+ # separator not found
+ values = Series(['abc', 'cde', NA, 'fgh', None])
+ result = values.str.partition('_', expand=False)
+ exp = Series([('abc', '', ''), ('cde', '', ''), NA,
+ ('fgh', '', ''), None])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.rpartition('_', expand=False)
+ exp = Series([('', '', 'abc'), ('', '', 'cde'), NA,
+ ('', '', 'fgh'), None])
+ tm.assert_series_equal(result, exp)
+
+ # unicode
+ values = Series([u'a_b_c', u'c_d_e', NA, u'f_g_h'])
+
+ result = values.str.partition('_', expand=False)
+ exp = Series([(u'a', u'_', u'b_c'), (u'c', u'_', u'd_e'),
+ NA, (u'f', u'_', u'g_h')])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.rpartition('_', expand=False)
+ exp = Series([(u'a_b', u'_', u'c'), (u'c_d', u'_', u'e'),
+ NA, (u'f_g', u'_', u'h')])
+ tm.assert_series_equal(result, exp)
+
+ # compare to standard lib
+ values = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF'])
+ result = values.str.partition('_', expand=False).tolist()
+ assert result == [v.partition('_') for v in values]
+ result = values.str.rpartition('_', expand=False).tolist()
+ assert result == [v.rpartition('_') for v in values]
+
+ def test_partition_index(self):
+ # https://github.com/pandas-dev/pandas/issues/23558
+
+ values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan, None])
+
+ result = values.str.partition('_', expand=False)
+ exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'),
+ ('f', '_', 'g_h'), np.nan, None]))
+ tm.assert_index_equal(result, exp)
+ assert result.nlevels == 1
+
+ result = values.str.rpartition('_', expand=False)
+ exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'),
+ ('f_g', '_', 'h'), np.nan, None]))
+ tm.assert_index_equal(result, exp)
+ assert result.nlevels == 1
+
+ result = values.str.partition('_')
+ exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'),
+ ('f', '_', 'g_h'), (np.nan, np.nan, np.nan),
+ (None, None, None)])
+ tm.assert_index_equal(result, exp)
+ assert isinstance(result, MultiIndex)
+ assert result.nlevels == 3
+
+ result = values.str.rpartition('_')
+ exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'),
+ ('f_g', '_', 'h'), (np.nan, np.nan, np.nan),
+ (None, None, None)])
+ tm.assert_index_equal(result, exp)
+ assert isinstance(result, MultiIndex)
+ assert result.nlevels == 3
+
+ def test_partition_to_dataframe(self):
+ # https://github.com/pandas-dev/pandas/issues/23558
+
+ values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
+ result = values.str.partition('_')
+ exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
+ 1: ['_', '_', np.nan, '_', None],
+ 2: ['b_c', 'd_e', np.nan, 'g_h', None]})
+ tm.assert_frame_equal(result, exp)
+
+ result = values.str.rpartition('_')
+ exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
+ 1: ['_', '_', np.nan, '_', None],
+ 2: ['c', 'e', np.nan, 'h', None]})
+ tm.assert_frame_equal(result, exp)
+
+ values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
+ result = values.str.partition('_', expand=True)
+ exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
+ 1: ['_', '_', np.nan, '_', None],
+ 2: ['b_c', 'd_e', np.nan, 'g_h', None]})
+ tm.assert_frame_equal(result, exp)
+
+ result = values.str.rpartition('_', expand=True)
+ exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
+ 1: ['_', '_', np.nan, '_', None],
+ 2: ['c', 'e', np.nan, 'h', None]})
+ tm.assert_frame_equal(result, exp)
+
+ def test_partition_with_name(self):
+ # GH 12617
+
+ s = Series(['a,b', 'c,d'], name='xxx')
+ res = s.str.partition(',')
+ exp = DataFrame({0: ['a', 'c'], 1: [',', ','], 2: ['b', 'd']})
+ tm.assert_frame_equal(res, exp)
+
+ # should preserve name
+ res = s.str.partition(',', expand=False)
+ exp = Series([('a', ',', 'b'), ('c', ',', 'd')], name='xxx')
+ tm.assert_series_equal(res, exp)
+
+ idx = Index(['a,b', 'c,d'], name='xxx')
+ res = idx.str.partition(',')
+ exp = MultiIndex.from_tuples([('a', ',', 'b'), ('c', ',', 'd')])
+ assert res.nlevels == 3
+ tm.assert_index_equal(res, exp)
+
+ # should preserve name
+ res = idx.str.partition(',', expand=False)
+ exp = Index(np.array([('a', ',', 'b'), ('c', ',', 'd')]), name='xxx')
+ assert res.nlevels == 1
+ tm.assert_index_equal(res, exp)
+
+ def test_partition_deprecation(self):
+ # GH 22676; depr kwarg "pat" in favor of "sep"
+ values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+
+ # str.partition
+ # using sep -> no warning
+ expected = values.str.partition(sep='_')
+ with tm.assert_produces_warning(FutureWarning):
+ result = values.str.partition(pat='_')
+ tm.assert_frame_equal(result, expected)
+
+ # str.rpartition
+ # using sep -> no warning
+ expected = values.str.rpartition(sep='_')
+ with tm.assert_produces_warning(FutureWarning):
+ result = values.str.rpartition(pat='_')
+ tm.assert_frame_equal(result, expected)
+
+ def test_pipe_failures(self):
+ # #2119
+ s = Series(['A|B|C'])
+
+ result = s.str.split('|')
+ exp = Series([['A', 'B', 'C']])
+
+ tm.assert_series_equal(result, exp)
+
+ result = s.str.replace('|', ' ')
+ exp = Series(['A B C'])
+
+ tm.assert_series_equal(result, exp)
+
+ def test_slice(self):
+ values = Series(['aafootwo', 'aabartwo', NA, 'aabazqux'])
+
+ result = values.str.slice(2, 5)
+ exp = Series(['foo', 'bar', NA, 'baz'])
+ tm.assert_series_equal(result, exp)
+
+ for start, stop, step in [(0, 3, -1), (None, None, -1), (3, 10, 2),
+ (3, 0, -1)]:
+ try:
+ result = values.str.slice(start, stop, step)
+ expected = Series([s[start:stop:step] if not isna(s) else NA
+ for s in values])
+ tm.assert_series_equal(result, expected)
+ except IndexError:
+ print('failed on %s:%s:%s' % (start, stop, step))
+ raise
+
+ # mixed
+ mixed = Series(['aafootwo', NA, 'aabartwo', True, datetime.today(),
+ None, 1, 2.])
+
+ rs = Series(mixed).str.slice(2, 5)
+ xp = Series(['foo', NA, 'bar', NA, NA, NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ rs = Series(mixed).str.slice(2, 5, -1)
+ xp = Series(['oof', NA, 'rab', NA, NA, NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ # unicode
+ values = Series([u('aafootwo'), u('aabartwo'), NA, u('aabazqux')])
+
+ result = values.str.slice(2, 5)
+ exp = Series([u('foo'), u('bar'), NA, u('baz')])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.slice(0, -1, 2)
+ exp = Series([u('afow'), u('abrw'), NA, u('abzu')])
+ tm.assert_series_equal(result, exp)
+
+ def test_slice_replace(self):
+ values = Series(['short', 'a bit longer', 'evenlongerthanthat',
+ '', NA])
+
+ exp = Series(['shrt', 'a it longer', 'evnlongerthanthat', '', NA])
+ result = values.str.slice_replace(2, 3)
+ tm.assert_series_equal(result, exp)
+
+ exp = Series(['shzrt', 'a zit longer', 'evznlongerthanthat', 'z', NA])
+ result = values.str.slice_replace(2, 3, 'z')
+ tm.assert_series_equal(result, exp)
+
+ exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat',
+ 'z', NA])
+ result = values.str.slice_replace(2, 2, 'z')
+ tm.assert_series_equal(result, exp)
+
+ exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat',
+ 'z', NA])
+ result = values.str.slice_replace(2, 1, 'z')
+ tm.assert_series_equal(result, exp)
+
+ exp = Series(['shorz', 'a bit longez', 'evenlongerthanthaz', 'z', NA])
+ result = values.str.slice_replace(-1, None, 'z')
+ tm.assert_series_equal(result, exp)
+
+ exp = Series(['zrt', 'zer', 'zat', 'z', NA])
+ result = values.str.slice_replace(None, -2, 'z')
+ tm.assert_series_equal(result, exp)
+
+ exp = Series(['shortz', 'a bit znger', 'evenlozerthanthat', 'z', NA])
+ result = values.str.slice_replace(6, 8, 'z')
+ tm.assert_series_equal(result, exp)
+
+ exp = Series(['zrt', 'a zit longer', 'evenlongzerthanthat', 'z', NA])
+ result = values.str.slice_replace(-10, 3, 'z')
+ tm.assert_series_equal(result, exp)
+
+ def test_strip_lstrip_rstrip(self):
+ values = Series([' aa ', ' bb \n', NA, 'cc '])
+
+ result = values.str.strip()
+ exp = Series(['aa', 'bb', NA, 'cc'])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.lstrip()
+ exp = Series(['aa ', 'bb \n', NA, 'cc '])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.rstrip()
+ exp = Series([' aa', ' bb', NA, 'cc'])
+ tm.assert_series_equal(result, exp)
+
+ def test_strip_lstrip_rstrip_mixed(self):
+ # mixed
+ mixed = Series([' aa ', NA, ' bb \t\n', True, datetime.today(), None,
+ 1, 2.])
+
+ rs = Series(mixed).str.strip()
+ xp = Series(['aa', NA, 'bb', NA, NA, NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ rs = Series(mixed).str.lstrip()
+ xp = Series(['aa ', NA, 'bb \t\n', NA, NA, NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ rs = Series(mixed).str.rstrip()
+ xp = Series([' aa', NA, ' bb', NA, NA, NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ def test_strip_lstrip_rstrip_unicode(self):
+ # unicode
+ values = Series([u(' aa '), u(' bb \n'), NA, u('cc ')])
+
+ result = values.str.strip()
+ exp = Series([u('aa'), u('bb'), NA, u('cc')])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.lstrip()
+ exp = Series([u('aa '), u('bb \n'), NA, u('cc ')])
+ tm.assert_series_equal(result, exp)
+
+ result = values.str.rstrip()
+ exp = Series([u(' aa'), u(' bb'), NA, u('cc')])
+ tm.assert_series_equal(result, exp)
+
+ def test_strip_lstrip_rstrip_args(self):
+ values = Series(['xxABCxx', 'xx BNSD', 'LDFJH xx'])
+
+ rs = values.str.strip('x')
+ xp = Series(['ABC', ' BNSD', 'LDFJH '])
+ assert_series_equal(rs, xp)
+
+ rs = values.str.lstrip('x')
+ xp = Series(['ABCxx', ' BNSD', 'LDFJH xx'])
+ assert_series_equal(rs, xp)
+
+ rs = values.str.rstrip('x')
+ xp = Series(['xxABC', 'xx BNSD', 'LDFJH '])
+ assert_series_equal(rs, xp)
+
+ def test_strip_lstrip_rstrip_args_unicode(self):
+ values = Series([u('xxABCxx'), u('xx BNSD'), u('LDFJH xx')])
+
+ rs = values.str.strip(u('x'))
+ xp = Series(['ABC', ' BNSD', 'LDFJH '])
+ assert_series_equal(rs, xp)
+
+ rs = values.str.lstrip(u('x'))
+ xp = Series(['ABCxx', ' BNSD', 'LDFJH xx'])
+ assert_series_equal(rs, xp)
+
+ rs = values.str.rstrip(u('x'))
+ xp = Series(['xxABC', 'xx BNSD', 'LDFJH '])
+ assert_series_equal(rs, xp)
+
+ def test_wrap(self):
+ # test values are: two words less than width, two words equal to width,
+ # two words greater than width, one word less than width, one word
+ # equal to width, one word greater than width, multiple tokens with
+ # trailing whitespace equal to width
+ values = Series([u('hello world'), u('hello world!'),
+ u('hello world!!'), u('abcdefabcde'),
+ u('abcdefabcdef'), u('abcdefabcdefa'),
+ u('ab ab ab ab '), u('ab ab ab ab a'), u('\t')])
+
+ # expected values
+ xp = Series([u('hello world'), u('hello world!'), u('hello\nworld!!'),
+ u('abcdefabcde'), u('abcdefabcdef'), u('abcdefabcdef\na'),
+ u('ab ab ab ab'), u('ab ab ab ab\na'), u('')])
+
+ rs = values.str.wrap(12, break_long_words=True)
+ assert_series_equal(rs, xp)
+
+ # test with leading/trailing whitespace (non-unicode), NaN, and
+ # non-ascii unicode
+ values = Series([' pre ', np.nan,
+ u('\xac\u20ac\U00008000 abadcafe')])
+ xp = Series([' pre', NA, u('\xac\u20ac\U00008000 ab\nadcafe')])
+ rs = values.str.wrap(6)
+ assert_series_equal(rs, xp)
+
+ def test_get(self):
+ values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
+
+ result = values.str.split('_').str.get(1)
+ expected = Series(['b', 'd', np.nan, 'g'])
+ tm.assert_series_equal(result, expected)
+
+ # mixed
+ mixed = Series(['a_b_c', NA, 'c_d_e', True, datetime.today(), None, 1,
+ 2.])
+
+ rs = Series(mixed).str.split('_').str.get(1)
+ xp = Series(['b', NA, 'd', NA, NA, NA, NA, NA])
+
+ assert isinstance(rs, Series)
+ tm.assert_almost_equal(rs, xp)
+
+ # unicode
+ values = Series([u('a_b_c'), u('c_d_e'), np.nan, u('f_g_h')])
+
+ result = values.str.split('_').str.get(1)
+ expected = Series([u('b'), u('d'), np.nan, u('g')])
+ tm.assert_series_equal(result, expected)
+
+ # bounds testing
+ values = Series(['1_2_3_4_5', '6_7_8_9_10', '11_12'])
+
+ # positive index
+ result = values.str.split('_').str.get(2)
+ expected = Series(['3', '8', np.nan])
+ tm.assert_series_equal(result, expected)
+
+ # negative index
+ result = values.str.split('_').str.get(-3)
+ expected = Series(['3', '8', np.nan])
+ tm.assert_series_equal(result, expected)
+
+ def test_get_complex(self):
+ # GH 20671, getting value not in dict raising `KeyError`
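+ # .str.get(i) indexes into each element with obj[i] and returns
+ # NaN when that lookup raises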
+ values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3},
+ {1: 'a', 2: 'b', 3: 'c'}])
+
+ result = values.str.get(1)
+ expected = Series([2, 2, np.nan, 'a'])
+ tm.assert_series_equal(result, expected)
+
+ result = values.str.get(-1)
+ expected = Series([3, 3, np.nan, np.nan])
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('to_type', [tuple, list, np.array])
+ def test_get_complex_nested(self, to_type):
+ values = Series([to_type([to_type([1, 2])])])
+
+ result = values.str.get(0)
+ expected = Series([to_type([1, 2])])
+ tm.assert_series_equal(result, expected)
+
+ result = values.str.get(1)
+ expected = Series([np.nan])
+ tm.assert_series_equal(result, expected)
+
+ def test_contains_moar(self):
+ # PR #1179
+ s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA,
+ 'CABA', 'dog', 'cat'])
+
+ result = s.str.contains('a')
+ expected = Series([False, False, False, True, True, False, np.nan,
+ False, False, True])
+ assert_series_equal(result, expected)
+
+ result = s.str.contains('a', case=False)
+ expected = Series([True, False, False, True, True, False, np.nan, True,
+ False, True])
+ assert_series_equal(result, expected)
+
+ result = s.str.contains('Aa')
+ expected = Series([False, False, False, True, False, False, np.nan,
+ False, False, False])
+ assert_series_equal(result, expected)
+
+ result = s.str.contains('ba')
+ expected = Series([False, False, False, True, False, False, np.nan,
+ False, False, False])
+ assert_series_equal(result, expected)
+
+ result = s.str.contains('ba', case=False)
+ expected = Series([False, False, False, True, True, False, np.nan,
+ True, False, False])
+ assert_series_equal(result, expected)
+
+ def test_contains_nan(self):
+ # PR #14171
+ s = Series([np.nan, np.nan, np.nan], dtype=np.object_)
+
+ result = s.str.contains('foo', na=False)
+ expected = Series([False, False, False], dtype=np.bool_)
+ assert_series_equal(result, expected)
+
+ result = s.str.contains('foo', na=True)
+ expected = Series([True, True, True], dtype=np.bool_)
+ assert_series_equal(result, expected)
+
+ result = s.str.contains('foo', na="foo")
+ expected = Series(["foo", "foo", "foo"], dtype=np.object_)
+ assert_series_equal(result, expected)
+
+ result = s.str.contains('foo')
+ expected = Series([np.nan, np.nan, np.nan], dtype=np.object_)
+ assert_series_equal(result, expected)
+
+ def test_replace_moar(self):
+ # PR #1179
+ s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA',
+ 'dog', 'cat'])
+
+ result = s.str.replace('A', 'YYY')
+ expected = Series(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', NA,
+ 'CYYYBYYY', 'dog', 'cat'])
+ assert_series_equal(result, expected)
+
+ result = s.str.replace('A', 'YYY', case=False)
+ expected = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', '', NA,
+ 'CYYYBYYY', 'dog', 'cYYYt'])
+ assert_series_equal(result, expected)
+
+ result = s.str.replace('^.a|dog', 'XX-XX ', case=False)
+ expected = Series(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', NA,
+ 'XX-XX BA', 'XX-XX ', 'XX-XX t'])
+ assert_series_equal(result, expected)
+
+ def test_string_slice_get_syntax(self):
+ s = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', NA, 'CYYYBYYY',
+ 'dog', 'cYYYt'])
+
+ result = s.str[0]
+ expected = s.str.get(0)
+ assert_series_equal(result, expected)
+
+ result = s.str[:3]
+ expected = s.str.slice(stop=3)
+ assert_series_equal(result, expected)
+
+ result = s.str[2::-1]
+ expected = s.str.slice(start=2, step=-1)
+ assert_series_equal(result, expected)
+
+ def test_string_slice_out_of_bounds(self):
+ s = Series([(1, 2), (1, ), (3, 4, 5)])
+
+ result = s.str[1]
+ expected = Series([2, np.nan, 4])
+
+ assert_series_equal(result, expected)
+
+ s = Series(['foo', 'b', 'ba'])
+ result = s.str[1]
+ expected = Series(['o', np.nan, 'a'])
+ assert_series_equal(result, expected)
+
+ def test_match_findall_flags(self):
+ data = {'Dave': '[email protected]',
+ 'Steve': '[email protected]',
+ 'Rob': '[email protected]',
+ 'Wes': np.nan}
+ data = Series(data)
+
+ pat = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
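+ # three capture groups: user name, domain, and TLD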
+
+ result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
+ assert result.iloc[0].tolist() == ['dave', 'google', 'com']
+
+ result = data.str.match(pat, flags=re.IGNORECASE)
+ assert result[0]
+
+ result = data.str.findall(pat, flags=re.IGNORECASE)
+ assert result[0][0] == ('dave', 'google', 'com')
+
+ result = data.str.count(pat, flags=re.IGNORECASE)
+ assert result[0] == 1
+
+ with tm.assert_produces_warning(UserWarning):
+ result = data.str.contains(pat, flags=re.IGNORECASE)
+ assert result[0]
+
+ def test_encode_decode(self):
+ base = Series([u('a'), u('b'), u('a\xe4')])
+ series = base.str.encode('utf-8')
+
+ f = lambda x: x.decode('utf-8')
+ result = series.str.decode('utf-8')
+ exp = series.map(f)
+
+ tm.assert_series_equal(result, exp)
+
+ def test_encode_decode_errors(self):
+ encodeBase = Series([u('a'), u('b'), u('a\x9d')])
+
+ pytest.raises(UnicodeEncodeError, encodeBase.str.encode, 'cp1252')
+
+ f = lambda x: x.encode('cp1252', 'ignore')
+ result = encodeBase.str.encode('cp1252', 'ignore')
+ exp = encodeBase.map(f)
+ tm.assert_series_equal(result, exp)
+
+ decodeBase = Series([b'a', b'b', b'a\x9d'])
+
+ pytest.raises(UnicodeDecodeError, decodeBase.str.decode, 'cp1252')
+
+ f = lambda x: x.decode('cp1252', 'ignore')
+ result = decodeBase.str.decode('cp1252', 'ignore')
+ exp = decodeBase.map(f)
+
+ tm.assert_series_equal(result, exp)
+
+ def test_normalize(self):
+ values = ['ABC', u'ＡＢＣ', u'１２３', np.nan, u'ｱｲｴ']
+ s = Series(values, index=['a', 'b', 'c', 'd', 'e'])
+
+ normed = [u'ABC', u'ABC', u'123', np.nan, u'アイエ']
+ expected = Series(normed, index=['a', 'b', 'c', 'd', 'e'])
+
+ result = s.str.normalize('NFKC')
+ tm.assert_series_equal(result, expected)
+
+ expected = Series([u'ABC', u'ＡＢＣ', u'１２３', np.nan, u'ｱｲｴ'],
+ index=['a', 'b', 'c', 'd', 'e'])
+
+ result = s.str.normalize('NFC')
+ tm.assert_series_equal(result, expected)
+
+ with pytest.raises(ValueError, match="invalid normalization form"):
+ s.str.normalize('xxx')
+
+ s = Index([u'ＡＢＣ', u'１２３', u'ｱｲｴ'])
+ expected = Index([u'ABC', u'123', u'アイエ'])
+ result = s.str.normalize('NFKC')
+ tm.assert_index_equal(result, expected)
+
+ def test_index_str_accessor_visibility(self):
+ from pandas.core.strings import StringMethods
+
+ if not compat.PY3:
+ cases = [(['a', 'b'], 'string'), (['a', u('b')], 'mixed'),
+ ([u('a'), u('b')], 'unicode'),
+ (['a', 'b', 1], 'mixed-integer'),
+ (['a', 'b', 1.3], 'mixed'),
+ (['a', 'b', 1.3, 1], 'mixed-integer'),
+ (['aa', datetime(2011, 1, 1)], 'mixed')]
+ else:
+ cases = [(['a', 'b'], 'string'), (['a', u('b')], 'string'),
+ ([u('a'), u('b')], 'string'),
+ (['a', 'b', 1], 'mixed-integer'),
+ (['a', 'b', 1.3], 'mixed'),
+ (['a', 'b', 1.3, 1], 'mixed-integer'),
+ (['aa', datetime(2011, 1, 1)], 'mixed')]
+ for values, tp in cases:
+ idx = Index(values)
+ assert isinstance(Series(values).str, StringMethods)
+ assert isinstance(idx.str, StringMethods)
+ assert idx.inferred_type == tp
+
+ cases = [([1, np.nan], 'floating'),
+ ([datetime(2011, 1, 1)], 'datetime64'),
+ ([timedelta(1)], 'timedelta64')]
+ for values, tp in cases:
+ idx = Index(values)
+ message = 'Can only use .str accessor with string values'
+ with pytest.raises(AttributeError, match=message):
+ Series(values).str
+ with pytest.raises(AttributeError, match=message):
+ idx.str
+ assert idx.inferred_type == tp
+
+ # a MultiIndex has a mixed dtype, but the .str accessor is not
+ # allowed on it
+ idx = MultiIndex.from_tuples([('a', 'b'), ('a', 'b')])
+ assert idx.inferred_type == 'mixed'
+ message = 'Can only use .str accessor with Index, not MultiIndex'
+ with pytest.raises(AttributeError, match=message):
+ idx.str
+
+ def test_str_accessor_no_new_attributes(self):
+ # https://github.com/pandas-dev/pandas/issues/10673
+ s = Series(list('aabbcde'))
+ with pytest.raises(AttributeError,
+ match="You cannot add any new attribute"):
+ s.str.xlabel = "a"
+
+ def test_method_on_bytes(self):
+ lhs = Series(np.array(list('abc'), 'S1').astype(object))
+ rhs = Series(np.array(list('def'), 'S1').astype(object))
+ if compat.PY3:
+ pytest.raises(TypeError, lhs.str.cat, rhs)
+ else:
+ result = lhs.str.cat(rhs)
+ expected = Series(np.array(
+ ['ad', 'be', 'cf'], 'S2').astype(object))
+ tm.assert_series_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/test_take.py b/contrib/python/pandas/py2/pandas/tests/test_take.py
new file mode 100644
index 00000000000..c9e4ed90b1d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_take.py
@@ -0,0 +1,468 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime
+import re
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslib import iNaT
+from pandas.compat import long
+
+import pandas.core.algorithms as algos
+import pandas.util.testing as tm
+
+
[email protected](params=[True, False])
+def writeable(request):
+ return request.param
+
+
+# Check that take_nd works both with writeable arrays
+# (in which case the fast typed memory-view implementation is used)
+# and read-only arrays alike.
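+# Each param is a (dtype, can_hold_na) pair: dtypes that cannot
+# represent NaN must raise rather than silently fill.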
+@pytest.fixture(params=[
+ (np.float64, True),
+ (np.float32, True),
+ (np.uint64, False),
+ (np.uint32, False),
+ (np.uint16, False),
+ (np.uint8, False),
+ (np.int64, False),
+ (np.int32, False),
+ (np.int16, False),
+ (np.int8, False),
+ (np.object_, True),
+ (np.bool, False),
+])
+def dtype_can_hold_na(request):
+ return request.param
+
+
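+# Each param is (dtype, fill_value, expected out dtype): filling with a
+# value the input dtype cannot hold upcasts the result (e.g. int32
+# filled with 2.0 -> float64, bool filled with '' -> object).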
+@pytest.fixture(params=[
+ (np.int8, np.int16(127), np.int8),
+ (np.int8, np.int16(128), np.int16),
+ (np.int32, 1, np.int32),
+ (np.int32, 2.0, np.float64),
+ (np.int32, 3.0 + 4.0j, np.complex128),
+ (np.int32, True, np.object_),
+ (np.int32, "", np.object_),
+ (np.float64, 1, np.float64),
+ (np.float64, 2.0, np.float64),
+ (np.float64, 3.0 + 4.0j, np.complex128),
+ (np.float64, True, np.object_),
+ (np.float64, "", np.object_),
+ (np.complex128, 1, np.complex128),
+ (np.complex128, 2.0, np.complex128),
+ (np.complex128, 3.0 + 4.0j, np.complex128),
+ (np.complex128, True, np.object_),
+ (np.complex128, "", np.object_),
+ (np.bool_, 1, np.object_),
+ (np.bool_, 2.0, np.object_),
+ (np.bool_, 3.0 + 4.0j, np.object_),
+ (np.bool_, True, np.bool_),
+ (np.bool_, '', np.object_),
+])
+def dtype_fill_out_dtype(request):
+ return request.param
+
+
+class TestTake(object):
+ # Standard incompatible fill error.
+ fill_error = re.compile("Incompatible type for fill_value")
+
+ def test_1d_with_out(self, dtype_can_hold_na, writeable):
+ dtype, can_hold_na = dtype_can_hold_na
+
+ data = np.random.randint(0, 2, 4).astype(dtype)
+ data.flags.writeable = writeable
+
+ indexer = [2, 1, 0, 1]
+ out = np.empty(4, dtype=dtype)
+ algos.take_1d(data, indexer, out=out)
+
+ expected = data.take(indexer)
+ tm.assert_almost_equal(out, expected)
+
+ indexer = [2, 1, 0, -1]
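+ # a -1 in the indexer marks a position to be filled with NaN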
+ out = np.empty(4, dtype=dtype)
+
+ if can_hold_na:
+ algos.take_1d(data, indexer, out=out)
+ expected = data.take(indexer)
+ expected[3] = np.nan
+ tm.assert_almost_equal(out, expected)
+ else:
+ with pytest.raises(TypeError, match=self.fill_error):
+ algos.take_1d(data, indexer, out=out)
+
+ # No Exception otherwise.
+ data.take(indexer, out=out)
+
+ def test_1d_fill_nonna(self, dtype_fill_out_dtype):
+ dtype, fill_value, out_dtype = dtype_fill_out_dtype
+ data = np.random.randint(0, 2, 4).astype(dtype)
+ indexer = [2, 1, 0, -1]
+
+ result = algos.take_1d(data, indexer, fill_value=fill_value)
+ assert ((result[[0, 1, 2]] == data[[2, 1, 0]]).all())
+ assert (result[3] == fill_value)
+ assert (result.dtype == out_dtype)
+
+ indexer = [2, 1, 0, 1]
+
+ result = algos.take_1d(data, indexer, fill_value=fill_value)
+ assert ((result[[0, 1, 2, 3]] == data[indexer]).all())
+ assert (result.dtype == dtype)
+
+ def test_2d_with_out(self, dtype_can_hold_na, writeable):
+ dtype, can_hold_na = dtype_can_hold_na
+
+ data = np.random.randint(0, 2, (5, 3)).astype(dtype)
+ data.flags.writeable = writeable
+
+ indexer = [2, 1, 0, 1]
+ out0 = np.empty((4, 3), dtype=dtype)
+ out1 = np.empty((5, 4), dtype=dtype)
+ algos.take_nd(data, indexer, out=out0, axis=0)
+ algos.take_nd(data, indexer, out=out1, axis=1)
+
+ expected0 = data.take(indexer, axis=0)
+ expected1 = data.take(indexer, axis=1)
+ tm.assert_almost_equal(out0, expected0)
+ tm.assert_almost_equal(out1, expected1)
+
+ indexer = [2, 1, 0, -1]
+ out0 = np.empty((4, 3), dtype=dtype)
+ out1 = np.empty((5, 4), dtype=dtype)
+
+ if can_hold_na:
+ algos.take_nd(data, indexer, out=out0, axis=0)
+ algos.take_nd(data, indexer, out=out1, axis=1)
+
+ expected0 = data.take(indexer, axis=0)
+ expected1 = data.take(indexer, axis=1)
+ expected0[3, :] = np.nan
+ expected1[:, 3] = np.nan
+
+ tm.assert_almost_equal(out0, expected0)
+ tm.assert_almost_equal(out1, expected1)
+ else:
+ for i, out in enumerate([out0, out1]):
+ with pytest.raises(TypeError, match=self.fill_error):
+ algos.take_nd(data, indexer, out=out, axis=i)
+
+ # No Exception otherwise.
+ data.take(indexer, out=out, axis=i)
+
+ def test_2d_fill_nonna(self, dtype_fill_out_dtype):
+ dtype, fill_value, out_dtype = dtype_fill_out_dtype
+ data = np.random.randint(0, 2, (5, 3)).astype(dtype)
+ indexer = [2, 1, 0, -1]
+
+ result = algos.take_nd(data, indexer, axis=0,
+ fill_value=fill_value)
+ assert ((result[[0, 1, 2], :] == data[[2, 1, 0], :]).all())
+ assert ((result[3, :] == fill_value).all())
+ assert (result.dtype == out_dtype)
+
+ result = algos.take_nd(data, indexer, axis=1,
+ fill_value=fill_value)
+ assert ((result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all())
+ assert ((result[:, 3] == fill_value).all())
+ assert (result.dtype == out_dtype)
+
+ indexer = [2, 1, 0, 1]
+ result = algos.take_nd(data, indexer, axis=0,
+ fill_value=fill_value)
+ assert ((result[[0, 1, 2, 3], :] == data[indexer, :]).all())
+ assert (result.dtype == dtype)
+
+ result = algos.take_nd(data, indexer, axis=1,
+ fill_value=fill_value)
+ assert ((result[:, [0, 1, 2, 3]] == data[:, indexer]).all())
+ assert (result.dtype == dtype)
+
+ def test_3d_with_out(self, dtype_can_hold_na):
+ dtype, can_hold_na = dtype_can_hold_na
+
+ data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype)
+ indexer = [2, 1, 0, 1]
+
+ out0 = np.empty((4, 4, 3), dtype=dtype)
+ out1 = np.empty((5, 4, 3), dtype=dtype)
+ out2 = np.empty((5, 4, 4), dtype=dtype)
+
+ algos.take_nd(data, indexer, out=out0, axis=0)
+ algos.take_nd(data, indexer, out=out1, axis=1)
+ algos.take_nd(data, indexer, out=out2, axis=2)
+
+ expected0 = data.take(indexer, axis=0)
+ expected1 = data.take(indexer, axis=1)
+ expected2 = data.take(indexer, axis=2)
+
+ tm.assert_almost_equal(out0, expected0)
+ tm.assert_almost_equal(out1, expected1)
+ tm.assert_almost_equal(out2, expected2)
+
+ indexer = [2, 1, 0, -1]
+ out0 = np.empty((4, 4, 3), dtype=dtype)
+ out1 = np.empty((5, 4, 3), dtype=dtype)
+ out2 = np.empty((5, 4, 4), dtype=dtype)
+
+ if can_hold_na:
+ algos.take_nd(data, indexer, out=out0, axis=0)
+ algos.take_nd(data, indexer, out=out1, axis=1)
+ algos.take_nd(data, indexer, out=out2, axis=2)
+
+ expected0 = data.take(indexer, axis=0)
+ expected1 = data.take(indexer, axis=1)
+ expected2 = data.take(indexer, axis=2)
+
+ expected0[3, :, :] = np.nan
+ expected1[:, 3, :] = np.nan
+ expected2[:, :, 3] = np.nan
+
+ tm.assert_almost_equal(out0, expected0)
+ tm.assert_almost_equal(out1, expected1)
+ tm.assert_almost_equal(out2, expected2)
+ else:
+ for i, out in enumerate([out0, out1, out2]):
+ with pytest.raises(TypeError, match=self.fill_error):
+ algos.take_nd(data, indexer, out=out, axis=i)
+
+ # No Exception otherwise.
+ data.take(indexer, out=out, axis=i)
+
+ def test_3d_fill_nonna(self, dtype_fill_out_dtype):
+ dtype, fill_value, out_dtype = dtype_fill_out_dtype
+
+ data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype)
+ indexer = [2, 1, 0, -1]
+
+ result = algos.take_nd(data, indexer, axis=0,
+ fill_value=fill_value)
+ assert ((result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all())
+ assert ((result[3, :, :] == fill_value).all())
+ assert (result.dtype == out_dtype)
+
+ result = algos.take_nd(data, indexer, axis=1,
+ fill_value=fill_value)
+ assert ((result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all())
+ assert ((result[:, 3, :] == fill_value).all())
+ assert (result.dtype == out_dtype)
+
+ result = algos.take_nd(data, indexer, axis=2,
+ fill_value=fill_value)
+ assert ((result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all())
+ assert ((result[:, :, 3] == fill_value).all())
+ assert (result.dtype == out_dtype)
+
+ indexer = [2, 1, 0, 1]
+ result = algos.take_nd(data, indexer, axis=0,
+ fill_value=fill_value)
+ assert ((result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all())
+ assert (result.dtype == dtype)
+
+ result = algos.take_nd(data, indexer, axis=1,
+ fill_value=fill_value)
+ assert ((result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all())
+ assert (result.dtype == dtype)
+
+ result = algos.take_nd(data, indexer, axis=2,
+ fill_value=fill_value)
+ assert ((result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all())
+ assert (result.dtype == dtype)
+
+ def test_1d_other_dtypes(self):
+ arr = np.random.randn(10).astype(np.float32)
+
+ indexer = [1, 2, 3, -1]
+ result = algos.take_1d(arr, indexer)
+ expected = arr.take(indexer)
+ expected[-1] = np.nan
+ tm.assert_almost_equal(result, expected)
+
+ def test_2d_other_dtypes(self):
+ arr = np.random.randn(10, 5).astype(np.float32)
+
+ indexer = [1, 2, 3, -1]
+
+ # axis=0
+ result = algos.take_nd(arr, indexer, axis=0)
+ expected = arr.take(indexer, axis=0)
+ expected[-1] = np.nan
+ tm.assert_almost_equal(result, expected)
+
+ # axis=1
+ result = algos.take_nd(arr, indexer, axis=1)
+ expected = arr.take(indexer, axis=1)
+ expected[:, -1] = np.nan
+ tm.assert_almost_equal(result, expected)
+
+ def test_1d_bool(self):
+ arr = np.array([0, 1, 0], dtype=bool)
+
+ result = algos.take_1d(arr, [0, 2, 2, 1])
+ expected = arr.take([0, 2, 2, 1])
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.take_1d(arr, [0, 2, -1])
+ assert result.dtype == np.object_
+
+ def test_2d_bool(self):
+ arr = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 1]], dtype=bool)
+
+ result = algos.take_nd(arr, [0, 2, 2, 1])
+ expected = arr.take([0, 2, 2, 1], axis=0)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.take_nd(arr, [0, 2, 2, 1], axis=1)
+ expected = arr.take([0, 2, 2, 1], axis=1)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = algos.take_nd(arr, [0, 2, -1])
+ assert result.dtype == np.object_
+
+ def test_2d_float32(self):
+ arr = np.random.randn(4, 3).astype(np.float32)
+ indexer = [0, 2, -1, 1, -1]
+
+ # axis=0
+ result = algos.take_nd(arr, indexer, axis=0)
+ result2 = np.empty_like(result)
+ algos.take_nd(arr, indexer, axis=0, out=result2)
+ tm.assert_almost_equal(result, result2)
+
+ expected = arr.take(indexer, axis=0)
+ expected[[2, 4], :] = np.nan
+ tm.assert_almost_equal(result, expected)
+
+ # take_nd also accepts a float32 out buffer directly; no upcast
+ # to float64 is required
+ out = np.empty((len(indexer), arr.shape[1]), dtype='float32')
+ algos.take_nd(arr, indexer, out=out)
+
+ # axis=1
+ result = algos.take_nd(arr, indexer, axis=1)
+ result2 = np.empty_like(result)
+ algos.take_nd(arr, indexer, axis=1, out=result2)
+ tm.assert_almost_equal(result, result2)
+
+ expected = arr.take(indexer, axis=1)
+ expected[:, [2, 4]] = np.nan
+ tm.assert_almost_equal(result, expected)
+
+ def test_2d_datetime64(self):
+ # 2005/01/01 - 2006/01/01
+ arr = np.random.randint(
+ long(11045376), long(11360736), (5, 3)) * 100000000000
+ arr = arr.view(dtype='datetime64[ns]')
+ indexer = [0, 2, -1, 1, -1]
+
+ # axis=0
+ result = algos.take_nd(arr, indexer, axis=0)
+ result2 = np.empty_like(result)
+ algos.take_nd(arr, indexer, axis=0, out=result2)
+ tm.assert_almost_equal(result, result2)
+
+ expected = arr.take(indexer, axis=0)
+ expected.view(np.int64)[[2, 4], :] = iNaT
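+ # NaT is stored as iNaT (the minimal int64) in the underlying buffer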
+ tm.assert_almost_equal(result, expected)
+
+ result = algos.take_nd(arr, indexer, axis=0,
+ fill_value=datetime(2007, 1, 1))
+ result2 = np.empty_like(result)
+ algos.take_nd(arr, indexer, out=result2, axis=0,
+ fill_value=datetime(2007, 1, 1))
+ tm.assert_almost_equal(result, result2)
+
+ expected = arr.take(indexer, axis=0)
+ expected[[2, 4], :] = datetime(2007, 1, 1)
+ tm.assert_almost_equal(result, expected)
+
+ # axis=1
+ result = algos.take_nd(arr, indexer, axis=1)
+ result2 = np.empty_like(result)
+ algos.take_nd(arr, indexer, axis=1, out=result2)
+ tm.assert_almost_equal(result, result2)
+
+ expected = arr.take(indexer, axis=1)
+ expected.view(np.int64)[:, [2, 4]] = iNaT
+ tm.assert_almost_equal(result, expected)
+
+ result = algos.take_nd(arr, indexer, axis=1,
+ fill_value=datetime(2007, 1, 1))
+ result2 = np.empty_like(result)
+ algos.take_nd(arr, indexer, out=result2, axis=1,
+ fill_value=datetime(2007, 1, 1))
+ tm.assert_almost_equal(result, result2)
+
+ expected = arr.take(indexer, axis=1)
+ expected[:, [2, 4]] = datetime(2007, 1, 1)
+ tm.assert_almost_equal(result, expected)
+
+ def test_take_axis_0(self):
+ arr = np.arange(12).reshape(4, 3)
+ result = algos.take(arr, [0, -1])
+ expected = np.array([[0, 1, 2], [9, 10, 11]])
+ tm.assert_numpy_array_equal(result, expected)
+
+ # allow_fill=True
+ result = algos.take(arr, [0, -1], allow_fill=True, fill_value=0)
+ expected = np.array([[0, 1, 2], [0, 0, 0]])
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_take_axis_1(self):
+ arr = np.arange(12).reshape(4, 3)
+ result = algos.take(arr, [0, -1], axis=1)
+ expected = np.array([[0, 2], [3, 5], [6, 8], [9, 11]])
+ tm.assert_numpy_array_equal(result, expected)
+
+ # allow_fill=True
+ result = algos.take(arr, [0, -1], axis=1, allow_fill=True,
+ fill_value=0)
+ expected = np.array([[0, 0], [3, 0], [6, 0], [9, 0]])
+ tm.assert_numpy_array_equal(result, expected)
+
+
+class TestExtensionTake(object):
+ # The take method found in pd.api.extensions
+
+ def test_bounds_check_large(self):
+ arr = np.array([1, 2])
+ with pytest.raises(IndexError):
+ algos.take(arr, [2, 3], allow_fill=True)
+
+ with pytest.raises(IndexError):
+ algos.take(arr, [2, 3], allow_fill=False)
+
+ def test_bounds_check_small(self):
+ arr = np.array([1, 2, 3], dtype=np.int64)
+ indexer = [0, -1, -2]
+ with pytest.raises(ValueError):
+ algos.take(arr, indexer, allow_fill=True)
+
+ result = algos.take(arr, indexer)
+ expected = np.array([1, 3, 2], dtype=np.int64)
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize('allow_fill', [True, False])
+ def test_take_empty(self, allow_fill):
+ arr = np.array([], dtype=np.int64)
+ # empty take is ok
+ result = algos.take(arr, [], allow_fill=allow_fill)
+ tm.assert_numpy_array_equal(arr, result)
+
+ with pytest.raises(IndexError):
+ algos.take(arr, [0], allow_fill=allow_fill)
+
+ def test_take_na_empty(self):
+ result = algos.take(np.array([]), [-1, -1], allow_fill=True,
+ fill_value=0.0)
+ expected = np.array([0., 0.])
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_take_coerces_list(self):
+ arr = [1, 2, 3]
+ result = algos.take(arr, [0, 0])
+ expected = np.array([1, 1])
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/test_window.py b/contrib/python/pandas/py2/pandas/tests/test_window.py
new file mode 100644
index 00000000000..e816d4c0434
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/test_window.py
@@ -0,0 +1,4073 @@
+from collections import OrderedDict
+from datetime import datetime, timedelta
+from itertools import product
+import warnings
+from warnings import catch_warnings
+
+import numpy as np
+from numpy.random import randn
+import pytest
+
+from pandas.compat import range, zip
+from pandas.errors import UnsupportedFunctionCall
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+ DataFrame, Index, Series, Timestamp, bdate_range, concat, isna, notna)
+from pandas.core.base import SpecificationError
+from pandas.core.sorting import safe_sort
+import pandas.core.window as rwindow
+import pandas.util.testing as tm
+
+import pandas.tseries.offsets as offsets
+
+N, K = 100, 10
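+# sizes of the random test data built by Base._create_data:
+# an N-row series and an N x K frame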
+
+
+def assert_equal(left, right):
+ if isinstance(left, Series):
+ tm.assert_series_equal(left, right)
+ else:
+ tm.assert_frame_equal(left, right)
+
+
[email protected](params=[True, False])
+def raw(request):
+ return request.param
+
+
[email protected](params=['triang', 'blackman', 'hamming', 'bartlett', 'bohman',
+ 'blackmanharris', 'nuttall', 'barthann'])
+def win_types(request):
+ return request.param
+
+
[email protected](params=['kaiser', 'gaussian', 'general_gaussian'])
+def win_types_special(request):
+ return request.param
+
+
+class Base(object):
+
+ _nan_locs = np.arange(20, 40)
+ _inf_locs = np.array([])
+
+ def _create_data(self):
+ arr = randn(N)
+ arr[self._nan_locs] = np.NaN
+
+ self.arr = arr
+ self.rng = bdate_range(datetime(2009, 1, 1), periods=N)
+ self.series = Series(arr.copy(), index=self.rng)
+ self.frame = DataFrame(randn(N, K), index=self.rng,
+ columns=np.arange(K))
+
+
+class TestApi(Base):
+
+ def setup_method(self, method):
+ self._create_data()
+
+ def test_getitem(self):
+
+ r = self.frame.rolling(window=5)
+ tm.assert_index_equal(r._selected_obj.columns, self.frame.columns)
+
+ r = self.frame.rolling(window=5)[1]
+ assert r._selected_obj.name == self.frame.columns[1]
+
+ # technically this is allowed
+ r = self.frame.rolling(window=5)[1, 3]
+ tm.assert_index_equal(r._selected_obj.columns,
+ self.frame.columns[[1, 3]])
+
+ r = self.frame.rolling(window=5)[[1, 3]]
+ tm.assert_index_equal(r._selected_obj.columns,
+ self.frame.columns[[1, 3]])
+
+ def test_select_bad_cols(self):
+ df = DataFrame([[1, 2]], columns=['A', 'B'])
+ g = df.rolling(window=5)
+ pytest.raises(KeyError, g.__getitem__, ['C']) # g[['C']]
+
+ pytest.raises(KeyError, g.__getitem__, ['A', 'C']) # g[['A', 'C']]
+ with pytest.raises(KeyError, match='^[^A]+$'):
+ # A should not be referenced as a bad column...
+ # will have to rethink regex if you change message!
+ g[['A', 'C']]
+
+ def test_attribute_access(self):
+
+ df = DataFrame([[1, 2]], columns=['A', 'B'])
+ r = df.rolling(window=5)
+ tm.assert_series_equal(r.A.sum(), r['A'].sum())
+ pytest.raises(AttributeError, lambda: r.F)
+
+ def tests_skip_nuisance(self):
+
+ df = DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'})
+ r = df.rolling(window=3)
+ result = r[['A', 'B']].sum()
+ expected = DataFrame({'A': [np.nan, np.nan, 3, 6, 9],
+ 'B': [np.nan, np.nan, 18, 21, 24]},
+ columns=list('AB'))
+ tm.assert_frame_equal(result, expected)
+
+ def test_skip_sum_object_raises(self):
+ df = DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'})
+ r = df.rolling(window=3)
+
+ with pytest.raises(TypeError, match='cannot handle this type'):
+ r.sum()
+
+ def test_agg(self):
+ df = DataFrame({'A': range(5), 'B': range(0, 10, 2)})
+
+ r = df.rolling(window=3)
+ a_mean = r['A'].mean()
+ a_std = r['A'].std()
+ a_sum = r['A'].sum()
+ b_mean = r['B'].mean()
+ b_std = r['B'].std()
+ b_sum = r['B'].sum()
+
+ result = r.aggregate([np.mean, np.std])
+ expected = concat([a_mean, a_std, b_mean, b_std], axis=1)
+ expected.columns = pd.MultiIndex.from_product([['A', 'B'], ['mean',
+ 'std']])
+ tm.assert_frame_equal(result, expected)
+
+ result = r.aggregate({'A': np.mean, 'B': np.std})
+
+ expected = concat([a_mean, b_std], axis=1)
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ result = r.aggregate({'A': ['mean', 'std']})
+ expected = concat([a_mean, a_std], axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A',
+ 'std')])
+ tm.assert_frame_equal(result, expected)
+
+ result = r['A'].aggregate(['mean', 'sum'])
+ expected = concat([a_mean, a_sum], axis=1)
+ expected.columns = ['mean', 'sum']
+ tm.assert_frame_equal(result, expected)
+
+ with catch_warnings(record=True):
+ # using a dict with renaming
+ warnings.simplefilter("ignore", FutureWarning)
+ result = r.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}})
+ expected = concat([a_mean, a_sum], axis=1)
+ expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
+ ('A', 'sum')])
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ with catch_warnings(record=True):
+ warnings.simplefilter("ignore", FutureWarning)
+ result = r.aggregate({'A': {'mean': 'mean',
+ 'sum': 'sum'},
+ 'B': {'mean2': 'mean',
+ 'sum2': 'sum'}})
+ expected = concat([a_mean, a_sum, b_mean, b_sum], axis=1)
+ exp_cols = [('A', 'mean'), ('A', 'sum'), ('B', 'mean2'), ('B', 'sum2')]
+ expected.columns = pd.MultiIndex.from_tuples(exp_cols)
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ result = r.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']})
+ expected = concat([a_mean, a_std, b_mean, b_std], axis=1)
+
+ exp_cols = [('A', 'mean'), ('A', 'std'), ('B', 'mean'), ('B', 'std')]
+ expected.columns = pd.MultiIndex.from_tuples(exp_cols)
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ def test_agg_apply(self, raw):
+
+ # passed lambda
+ df = DataFrame({'A': range(5), 'B': range(0, 10, 2)})
+
+ r = df.rolling(window=3)
+ a_sum = r['A'].sum()
+
+ result = r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)})
+ rcustom = r['B'].apply(lambda x: np.std(x, ddof=1), raw=raw)
+ expected = concat([a_sum, rcustom], axis=1)
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ def test_agg_consistency(self):
+
+ df = DataFrame({'A': range(5), 'B': range(0, 10, 2)})
+ r = df.rolling(window=3)
+
+ result = r.agg([np.sum, np.mean]).columns
+ expected = pd.MultiIndex.from_product([list('AB'), ['sum', 'mean']])
+ tm.assert_index_equal(result, expected)
+
+ result = r['A'].agg([np.sum, np.mean]).columns
+ expected = Index(['sum', 'mean'])
+ tm.assert_index_equal(result, expected)
+
+ result = r.agg({'A': [np.sum, np.mean]}).columns
+ expected = pd.MultiIndex.from_tuples([('A', 'sum'), ('A', 'mean')])
+ tm.assert_index_equal(result, expected)
+
+ def test_agg_nested_dicts(self):
+
+ # API change for disallowing these types of nested dicts
+ df = DataFrame({'A': range(5), 'B': range(0, 10, 2)})
+ r = df.rolling(window=3)
+
+ def f():
+ r.aggregate({'r1': {'A': ['mean', 'sum']},
+ 'r2': {'B': ['mean', 'sum']}})
+
+ pytest.raises(SpecificationError, f)
+
+ expected = concat([r['A'].mean(), r['A'].std(),
+ r['B'].mean(), r['B'].std()], axis=1)
+ expected.columns = pd.MultiIndex.from_tuples(
+ [('ra', 'mean'), ('ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
+ with catch_warnings(record=True):
+ warnings.simplefilter("ignore", FutureWarning)
+ result = r[['A', 'B']].agg({'A': {'ra': ['mean', 'std']},
+ 'B': {'rb': ['mean', 'std']}})
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ with catch_warnings(record=True):
+ warnings.simplefilter("ignore", FutureWarning)
+ result = r.agg({'A': {'ra': ['mean', 'std']},
+ 'B': {'rb': ['mean', 'std']}})
+ expected.columns = pd.MultiIndex.from_tuples(
+ [('A', 'ra', 'mean'), ('A', 'ra', 'std'),
+ ('B', 'rb', 'mean'), ('B', 'rb', 'std')])
+ tm.assert_frame_equal(result, expected, check_like=True)
+
+ def test_count_nonnumeric_types(self):
+ # GH12541
+ cols = ['int', 'float', 'string', 'datetime', 'timedelta', 'periods',
+ 'fl_inf', 'fl_nan', 'str_nan', 'dt_nat', 'periods_nat']
+
+ df = DataFrame(
+ {'int': [1, 2, 3],
+ 'float': [4., 5., 6.],
+ 'string': list('abc'),
+ 'datetime': pd.date_range('20170101', periods=3),
+ 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s'),
+ 'periods': [pd.Period('2012-01'), pd.Period('2012-02'),
+ pd.Period('2012-03')],
+ 'fl_inf': [1., 2., np.Inf],
+ 'fl_nan': [1., 2., np.NaN],
+ 'str_nan': ['aa', 'bb', np.NaN],
+ 'dt_nat': [Timestamp('20170101'), Timestamp('20170203'),
+ Timestamp(None)],
+ 'periods_nat': [pd.Period('2012-01'), pd.Period('2012-02'),
+ pd.Period(None)]},
+ columns=cols)
+
+ expected = DataFrame(
+ {'int': [1., 2., 2.],
+ 'float': [1., 2., 2.],
+ 'string': [1., 2., 2.],
+ 'datetime': [1., 2., 2.],
+ 'timedelta': [1., 2., 2.],
+ 'periods': [1., 2., 2.],
+ 'fl_inf': [1., 2., 2.],
+ 'fl_nan': [1., 2., 1.],
+ 'str_nan': [1., 2., 1.],
+ 'dt_nat': [1., 2., 1.],
+ 'periods_nat': [1., 2., 1.]},
+ columns=cols)
+
+ result = df.rolling(window=2).count()
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(1).count()
+ expected = df.notna().astype(float)
+ tm.assert_frame_equal(result, expected)
+
+ @td.skip_if_no_scipy
+ @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning")
+ def test_window_with_args(self):
+ # make sure that we are aggregating window functions correctly with args
+ r = Series(np.random.randn(100)).rolling(window=10, min_periods=1,
+ win_type='gaussian')
+ expected = concat([r.mean(std=10), r.mean(std=.01)], axis=1)
+ expected.columns = ['<lambda>', '<lambda>']
+ result = r.aggregate([lambda x: x.mean(std=10),
+ lambda x: x.mean(std=.01)])
+ tm.assert_frame_equal(result, expected)
+
+ def a(x):
+ return x.mean(std=10)
+
+ def b(x):
+ return x.mean(std=0.01)
+
+ expected = concat([r.mean(std=10), r.mean(std=.01)], axis=1)
+ expected.columns = ['a', 'b']
+ result = r.aggregate([a, b])
+ tm.assert_frame_equal(result, expected)
+
+ def test_preserve_metadata(self):
+ # GH 10565
+ s = Series(np.arange(100), name='foo')
+
+ s2 = s.rolling(30).sum()
+ s3 = s.rolling(20).sum()
+ assert s2.name == 'foo'
+ assert s3.name == 'foo'
+
+ @pytest.mark.parametrize("func,window_size,expected_vals", [
+ ('rolling', 2, [[np.nan, np.nan, np.nan, np.nan],
+ [15., 20., 25., 20.],
+ [25., 30., 35., 30.],
+ [np.nan, np.nan, np.nan, np.nan],
+ [20., 30., 35., 30.],
+ [35., 40., 60., 40.],
+ [60., 80., 85., 80]]),
+ ('expanding', None, [[10., 10., 20., 20.],
+ [15., 20., 25., 20.],
+ [20., 30., 30., 20.],
+ [10., 10., 30., 30.],
+ [20., 30., 35., 30.],
+ [26.666667, 40., 50., 30.],
+ [40., 80., 60., 30.]])])
+ def test_multiple_agg_funcs(self, func, window_size, expected_vals):
+ # GH 15072
+ df = pd.DataFrame([
+ ['A', 10, 20],
+ ['A', 20, 30],
+ ['A', 30, 40],
+ ['B', 10, 30],
+ ['B', 30, 40],
+ ['B', 40, 80],
+ ['B', 80, 90]], columns=['stock', 'low', 'high'])
+
+ f = getattr(df.groupby('stock'), func)
+ if window_size:
+ window = f(window_size)
+ else:
+ window = f()
+
+ index = pd.MultiIndex.from_tuples([
+ ('A', 0), ('A', 1), ('A', 2),
+ ('B', 3), ('B', 4), ('B', 5), ('B', 6)], names=['stock', None])
+ columns = pd.MultiIndex.from_tuples([
+ ('low', 'mean'), ('low', 'max'), ('high', 'mean'),
+ ('high', 'min')])
+ expected = pd.DataFrame(expected_vals, index=index, columns=columns)
+
+ result = window.agg(OrderedDict((
+ ('low', ['mean', 'max']),
+ ('high', ['mean', 'min']),
+ )))
+
+ tm.assert_frame_equal(result, expected)
+
+
[email protected]("ignore:can't resolve package:ImportWarning")
+class TestWindow(Base):
+
+ def setup_method(self, method):
+ self._create_data()
+
+ @td.skip_if_no_scipy
+ @pytest.mark.parametrize(
+ 'which', ['series', 'frame'])
+ def test_constructor(self, which):
+ # GH 12669
+
+ o = getattr(self, which)
+ c = o.rolling
+
+ # valid
+ c(win_type='boxcar', window=2, min_periods=1)
+ c(win_type='boxcar', window=2, min_periods=1, center=True)
+ c(win_type='boxcar', window=2, min_periods=1, center=False)
+
+ # not valid
+ for w in [2., 'foo', np.array([2])]:
+ with pytest.raises(ValueError):
+ c(win_type='boxcar', window=2, min_periods=w)
+ with pytest.raises(ValueError):
+ c(win_type='boxcar', window=2, min_periods=1, center=w)
+
+ for wt in ['foobar', 1]:
+ with pytest.raises(ValueError):
+ c(win_type=wt, window=2)
+
+ @td.skip_if_no_scipy
+ @pytest.mark.parametrize(
+ 'which', ['series', 'frame'])
+ def test_constructor_with_win_type(self, which, win_types):
+ # GH 12669
+ o = getattr(self, which)
+ c = o.rolling
+ c(win_type=win_types, window=2)
+
+ @pytest.mark.parametrize(
+ 'method', ['sum', 'mean'])
+ def test_numpy_compat(self, method):
+ # see gh-12811
+ w = rwindow.Window(Series([2, 4, 6]), window=[0, 2])
+
+ msg = "numpy operations are not valid with window objects"
+
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(w, method)(1, 2, 3)
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(w, method)(dtype=np.float64)
+
+
+class TestRolling(Base):
+
+ def setup_method(self, method):
+ self._create_data()
+
+ def test_doc_string(self):
+
+ df = DataFrame({'B': [0, 1, 2, np.nan, 4]})
+ df
+ df.rolling(2).sum()
+ df.rolling(2, min_periods=1).sum()
+
+ @pytest.mark.parametrize(
+ 'which', ['series', 'frame'])
+ def test_constructor(self, which):
+ # GH 12669
+
+ o = getattr(self, which)
+ c = o.rolling
+
+ # valid
+ c(window=2)
+ c(window=2, min_periods=1)
+ c(window=2, min_periods=1, center=True)
+ c(window=2, min_periods=1, center=False)
+
+ # GH 13383
+ # each invalid window needs its own raises block, otherwise the
+ # second call would never execute
+ with pytest.raises(ValueError):
+ c(0)
+ with pytest.raises(ValueError):
+ c(-1)
+
+ # not valid
+ for w in [2., 'foo', np.array([2])]:
+ with pytest.raises(ValueError):
+ c(window=w)
+ with pytest.raises(ValueError):
+ c(window=2, min_periods=w)
+ with pytest.raises(ValueError):
+ c(window=2, min_periods=1, center=w)
+
+ @td.skip_if_no_scipy
+ @pytest.mark.parametrize(
+ 'which', ['series', 'frame'])
+ def test_constructor_with_win_type(self, which):
+ # GH 13383
+ o = getattr(self, which)
+ c = o.rolling
+ with pytest.raises(ValueError):
+ c(-1, win_type='boxcar')
+
+ @pytest.mark.parametrize(
+ 'window', [timedelta(days=3), pd.Timedelta(days=3)])
+ def test_constructor_with_timedelta_window(self, window):
+ # GH 15440
+ n = 10
+ df = DataFrame({'value': np.arange(n)},
+ index=pd.date_range('2015-12-24', periods=n, freq="D"))
+ expected_data = np.append([0., 1.], np.arange(3., 27., 3))
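+ # trailing 3-day window sums over values 0..9: [0, 1, 3, 6, ..., 24]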
+
+ result = df.rolling(window=window).sum()
+ expected = DataFrame({'value': expected_data},
+ index=pd.date_range('2015-12-24', periods=n,
+ freq="D"))
+ tm.assert_frame_equal(result, expected)
+ expected = df.rolling('3D').sum()
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ 'window', [timedelta(days=3), pd.Timedelta(days=3), '3D'])
+ def test_constructor_timedelta_window_and_minperiods(self, window, raw):
+ # GH 15305
+ n = 10
+ df = DataFrame({'value': np.arange(n)},
+ index=pd.date_range('2017-08-08', periods=n, freq="D"))
+ expected = DataFrame(
+ {'value': np.append([np.NaN, 1.], np.arange(3., 27., 3))},
+ index=pd.date_range('2017-08-08', periods=n, freq="D"))
+ result_roll_sum = df.rolling(window=window, min_periods=2).sum()
+ result_roll_generic = df.rolling(window=window,
+ min_periods=2).apply(sum, raw=raw)
+ tm.assert_frame_equal(result_roll_sum, expected)
+ tm.assert_frame_equal(result_roll_generic, expected)
+
+ @pytest.mark.parametrize(
+ 'method', ['std', 'mean', 'sum', 'max', 'min', 'var'])
+ def test_numpy_compat(self, method):
+ # see gh-12811
+ r = rwindow.Rolling(Series([2, 4, 6]), window=2)
+
+ msg = "numpy operations are not valid with window objects"
+
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(r, method)(1, 2, 3)
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(r, method)(dtype=np.float64)
+
+ def test_closed(self):
+ df = DataFrame({'A': [0, 1, 2, 3, 4]})
+ # closed only allowed for datetimelike
+ with pytest.raises(ValueError):
+ df.rolling(window=3, closed='neither')
+
+ @pytest.mark.parametrize("func", ['min', 'max'])
+ def test_closed_one_entry(self, func):
+ # GH24718
+ ser = pd.Series(data=[2], index=pd.date_range('2000', periods=1))
+ result = getattr(ser.rolling('10D', closed='left'), func)()
+ tm.assert_series_equal(result, pd.Series([np.nan], index=ser.index))
+
+ @pytest.mark.parametrize("func", ['min', 'max'])
+ def test_closed_one_entry_groupby(self, func):
+ # GH24718
+ ser = pd.DataFrame(data={'A': [1, 1, 2], 'B': [3, 2, 1]},
+ index=pd.date_range('2000', periods=3))
+ result = getattr(
+ ser.groupby('A', sort=False)['B'].rolling('10D', closed='left'),
+ func)()
+ exp_idx = pd.MultiIndex.from_arrays(arrays=[[1, 1, 2], ser.index],
+ names=('A', None))
+ expected = pd.Series(data=[np.nan, 3, np.nan], index=exp_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("input_dtype", ['int', 'float'])
+ @pytest.mark.parametrize("func,closed,expected", [
+ ('min', 'right', [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]),
+ ('min', 'both', [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]),
+ ('min', 'neither', [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]),
+ ('min', 'left', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]),
+ ('max', 'right', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+ ('max', 'both', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+ ('max', 'neither', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]),
+ ('max', 'left', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8])
+ ])
+ def test_closed_min_max_datetime(self, input_dtype,
+ func, closed,
+ expected):
+ # see gh-21704
+ ser = pd.Series(data=np.arange(10).astype(input_dtype),
+ index=pd.date_range('2000', periods=10))
+
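+        # the closed parameter fixes which window endpoints are included:
+        # for a '3D' window ending at day i, 'right' covers days i-2..i,
+        # 'left' days i-3..i-1, 'both' days i-3..i and 'neither' days
+        # i-2..i-1; the expected values above are the min/max over those
+        # spans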
+ result = getattr(ser.rolling('3D', closed=closed), func)()
+ expected = pd.Series(expected, index=ser.index)
+ tm.assert_series_equal(result, expected)
+
+ def test_closed_uneven(self):
+ # see gh-21704
+ ser = pd.Series(data=np.arange(10),
+ index=pd.date_range('2000', periods=10))
+
+ # uneven
+ ser = ser.drop(index=ser.index[[1, 5]])
+ result = ser.rolling('3D', closed='left').min()
+ expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6],
+ index=ser.index)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("func,closed,expected", [
+ ('min', 'right', [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]),
+ ('min', 'both', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]),
+ ('min', 'neither', [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]),
+ ('min', 'left', [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]),
+ ('max', 'right', [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]),
+ ('max', 'both', [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]),
+ ('max', 'neither', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]),
+ ('max', 'left', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan])
+ ])
+ def test_closed_min_max_minp(self, func, closed, expected):
+ # see gh-21704
+ ser = pd.Series(data=np.arange(10),
+ index=pd.date_range('2000', periods=10))
+ ser[ser.index[-3:]] = np.nan
+ result = getattr(ser.rolling('3D', min_periods=2, closed=closed),
+ func)()
+ expected = pd.Series(expected, index=ser.index)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('roller', ['1s', 1])
+ def tests_empty_df_rolling(self, roller):
+ # GH 15819 Verifies that datetime and integer rolling windows can be
+ # applied to empty DataFrames
+ expected = DataFrame()
+ result = DataFrame().rolling(roller).sum()
+ tm.assert_frame_equal(result, expected)
+
+ # Verifies that datetime and integer rolling windows can be applied to
+ # empty DataFrames with datetime index
+ expected = DataFrame(index=pd.DatetimeIndex([]))
+ result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum()
+ tm.assert_frame_equal(result, expected)
+
+ def test_missing_minp_zero(self):
+ # https://github.com/pandas-dev/pandas/pull/18921
+ # minp=0
+ x = pd.Series([np.nan])
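+        # with min_periods=0 even an all-NaN window "qualifies", so sum
+        # returns the empty-sum identity 0.0; min_periods=1 requires at
+        # least one non-NaN observation and yields NaN instead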
+ result = x.rolling(1, min_periods=0).sum()
+ expected = pd.Series([0.0])
+ tm.assert_series_equal(result, expected)
+
+ # minp=1
+ result = x.rolling(1, min_periods=1).sum()
+ expected = pd.Series([np.nan])
+ tm.assert_series_equal(result, expected)
+
+ def test_missing_minp_zero_variable(self):
+ # https://github.com/pandas-dev/pandas/pull/18921
+ x = pd.Series([np.nan] * 4,
+ index=pd.DatetimeIndex(['2017-01-01', '2017-01-04',
+ '2017-01-06', '2017-01-07']))
+ result = x.rolling(pd.Timedelta("2d"), min_periods=0).sum()
+ expected = pd.Series(0.0, index=x.index)
+ tm.assert_series_equal(result, expected)
+
+ def test_multi_index_names(self):
+
+ # GH 16789, 16825
+ cols = pd.MultiIndex.from_product([['A', 'B'], ['C', 'D', 'E']],
+ names=['1', '2'])
+ df = DataFrame(np.ones((10, 6)), columns=cols)
+ result = df.rolling(3).cov()
+
+ tm.assert_index_equal(result.columns, df.columns)
+ assert result.index.names == [None, '1', '2']
+
+ @pytest.mark.parametrize('klass', [pd.Series, pd.DataFrame])
+ def test_iter_raises(self, klass):
+ # https://github.com/pandas-dev/pandas/issues/11704
+ # Iteration over a Window
+ obj = klass([1, 2, 3, 4])
+ with pytest.raises(NotImplementedError):
+ iter(obj.rolling(2))
+
+ def test_rolling_axis(self, axis_frame):
+ # see gh-23372.
+ df = DataFrame(np.ones((10, 20)))
+ axis = df._get_axis_number(axis_frame)
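+
+        # rolling(3).sum() over a frame of ones is NaN until the window
+        # fills, then a constant 3.0, whichever axis the window runs along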
+
+ if axis == 0:
+ expected = DataFrame({
+ i: [np.nan] * 2 + [3.0] * 8
+ for i in range(20)
+ })
+ else:
+ # axis == 1
+ expected = DataFrame([
+ [np.nan] * 2 + [3.0] * 18
+ ] * 10)
+
+ result = df.rolling(3, axis=axis_frame).sum()
+ tm.assert_frame_equal(result, expected)
+
+
+class TestExpanding(Base):
+
+ def setup_method(self, method):
+ self._create_data()
+
+ def test_doc_string(self):
+
+ df = DataFrame({'B': [0, 1, 2, np.nan, 4]})
+ df
+ df.expanding(2).sum()
+
+ @pytest.mark.parametrize(
+ 'which', ['series', 'frame'])
+ def test_constructor(self, which):
+ # GH 12669
+
+ o = getattr(self, which)
+ c = o.expanding
+
+ # valid
+ c(min_periods=1)
+ c(min_periods=1, center=True)
+ c(min_periods=1, center=False)
+
+ # not valid
+ for w in [2., 'foo', np.array([2])]:
+ with pytest.raises(ValueError):
+ c(min_periods=w)
+ with pytest.raises(ValueError):
+ c(min_periods=1, center=w)
+
+ @pytest.mark.parametrize(
+ 'method', ['std', 'mean', 'sum', 'max', 'min', 'var'])
+ def test_numpy_compat(self, method):
+ # see gh-12811
+ e = rwindow.Expanding(Series([2, 4, 6]), window=2)
+
+ msg = "numpy operations are not valid with window objects"
+
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(e, method)(1, 2, 3)
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(e, method)(dtype=np.float64)
+
+ @pytest.mark.parametrize(
+ 'expander',
+ [1, pytest.param('ls', marks=pytest.mark.xfail(
+ reason='GH#16425 expanding with '
+ 'offset not supported'))])
+ def test_empty_df_expanding(self, expander):
+ # GH 15819 Verifies that datetime and integer expanding windows can be
+ # applied to empty DataFrames
+
+ expected = DataFrame()
+ result = DataFrame().expanding(expander).sum()
+ tm.assert_frame_equal(result, expected)
+
+ # Verifies that datetime and integer expanding windows can be applied
+ # to empty DataFrames with datetime index
+ expected = DataFrame(index=pd.DatetimeIndex([]))
+ result = DataFrame(
+ index=pd.DatetimeIndex([])).expanding(expander).sum()
+ tm.assert_frame_equal(result, expected)
+
+ def test_missing_minp_zero(self):
+ # https://github.com/pandas-dev/pandas/pull/18921
+ # minp=0
+ x = pd.Series([np.nan])
+ result = x.expanding(min_periods=0).sum()
+ expected = pd.Series([0.0])
+ tm.assert_series_equal(result, expected)
+
+ # minp=1
+ result = x.expanding(min_periods=1).sum()
+ expected = pd.Series([np.nan])
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('klass', [pd.Series, pd.DataFrame])
+ def test_iter_raises(self, klass):
+ # https://github.com/pandas-dev/pandas/issues/11704
+ # Iteration over a Window
+ obj = klass([1, 2, 3, 4])
+ with pytest.raises(NotImplementedError):
+ iter(obj.expanding(2))
+
+ def test_expanding_axis(self, axis_frame):
+ # see gh-23372.
+ df = DataFrame(np.ones((10, 20)))
+ axis = df._get_axis_number(axis_frame)
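+
+        # expanding(min_periods=3).sum() over ones simply counts the
+        # elements seen so far: NaN, NaN, 3.0, 4.0, ... up to the length
+        # of the axis being expanded along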
+
+ if axis == 0:
+ expected = DataFrame({
+ i: [np.nan] * 2 + [float(j) for j in range(3, 11)]
+ for i in range(20)
+ })
+ else:
+ # axis == 1
+ expected = DataFrame([
+ [np.nan] * 2 + [float(i) for i in range(3, 21)]
+ ] * 10)
+
+ result = df.expanding(3, axis=axis_frame).sum()
+ tm.assert_frame_equal(result, expected)
+
+
+class TestEWM(Base):
+
+ def setup_method(self, method):
+ self._create_data()
+
+ def test_doc_string(self):
+
+ df = DataFrame({'B': [0, 1, 2, np.nan, 4]})
+ df
+ df.ewm(com=0.5).mean()
+
+ @pytest.mark.parametrize(
+ 'which', ['series', 'frame'])
+ def test_constructor(self, which):
+ o = getattr(self, which)
+ c = o.ewm
+
+ # valid
+ c(com=0.5)
+ c(span=1.5)
+ c(alpha=0.5)
+ c(halflife=0.75)
+ c(com=0.5, span=None)
+ c(alpha=0.5, com=None)
+ c(halflife=0.75, alpha=None)
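+
+        # the four decay parameters are interchangeable spellings of one
+        # smoothing factor: alpha = 1 / (1 + com) = 2 / (span + 1)
+        #                         = 1 - exp(log(0.5) / halflife)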
+
+ # not valid: mutually exclusive
+ with pytest.raises(ValueError):
+ c(com=0.5, alpha=0.5)
+ with pytest.raises(ValueError):
+ c(span=1.5, halflife=0.75)
+ with pytest.raises(ValueError):
+ c(alpha=0.5, span=1.5)
+
+ # not valid: com < 0
+ with pytest.raises(ValueError):
+ c(com=-0.5)
+
+ # not valid: span < 1
+ with pytest.raises(ValueError):
+ c(span=0.5)
+
+ # not valid: halflife <= 0
+ with pytest.raises(ValueError):
+ c(halflife=0)
+
+ # not valid: alpha <= 0 or alpha > 1
+ for alpha in (-0.5, 1.5):
+ with pytest.raises(ValueError):
+ c(alpha=alpha)
+
+ @pytest.mark.parametrize(
+ 'method', ['std', 'mean', 'var'])
+ def test_numpy_compat(self, method):
+ # see gh-12811
+ e = rwindow.EWM(Series([2, 4, 6]), alpha=0.5)
+
+ msg = "numpy operations are not valid with window objects"
+
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(e, method)(1, 2, 3)
+ with pytest.raises(UnsupportedFunctionCall, match=msg):
+ getattr(e, method)(dtype=np.float64)
+
+
+# gh-12373: rolling functions error on float32 data
+# make sure rolling functions work for different dtypes
+#
+# NOTE that these are yielded tests and so _create_data
+# is explicitly called.
+#
+# further note that we are only checking rolling for full dtype
+# compliance (though both expanding and ewm inherit)
+class Dtype(object):
+ window = 2
+
+ funcs = {
+ 'count': lambda v: v.count(),
+ 'max': lambda v: v.max(),
+ 'min': lambda v: v.min(),
+ 'sum': lambda v: v.sum(),
+ 'mean': lambda v: v.mean(),
+ 'std': lambda v: v.std(),
+ 'var': lambda v: v.var(),
+ 'median': lambda v: v.median()
+ }
+
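+    # a quick sanity check of the hard-coded expectations below: sr1 is
+    # np.arange(5), so its window-2 rolling sums are [nan, 0+1, 1+2,
+    # 2+3, 3+4] = [nan, 1, 3, 5, 7], matching expects['sr1']['sum']
+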
+ def get_expects(self):
+ expects = {
+ 'sr1': {
+ 'count': Series([1, 2, 2, 2, 2], dtype='float64'),
+ 'max': Series([np.nan, 1, 2, 3, 4], dtype='float64'),
+ 'min': Series([np.nan, 0, 1, 2, 3], dtype='float64'),
+ 'sum': Series([np.nan, 1, 3, 5, 7], dtype='float64'),
+ 'mean': Series([np.nan, .5, 1.5, 2.5, 3.5], dtype='float64'),
+ 'std': Series([np.nan] + [np.sqrt(.5)] * 4, dtype='float64'),
+ 'var': Series([np.nan, .5, .5, .5, .5], dtype='float64'),
+ 'median': Series([np.nan, .5, 1.5, 2.5, 3.5], dtype='float64')
+ },
+ 'sr2': {
+ 'count': Series([1, 2, 2, 2, 2], dtype='float64'),
+ 'max': Series([np.nan, 10, 8, 6, 4], dtype='float64'),
+ 'min': Series([np.nan, 8, 6, 4, 2], dtype='float64'),
+ 'sum': Series([np.nan, 18, 14, 10, 6], dtype='float64'),
+ 'mean': Series([np.nan, 9, 7, 5, 3], dtype='float64'),
+ 'std': Series([np.nan] + [np.sqrt(2)] * 4, dtype='float64'),
+ 'var': Series([np.nan, 2, 2, 2, 2], dtype='float64'),
+ 'median': Series([np.nan, 9, 7, 5, 3], dtype='float64')
+ },
+ 'df': {
+ 'count': DataFrame({0: Series([1, 2, 2, 2, 2]),
+ 1: Series([1, 2, 2, 2, 2])},
+ dtype='float64'),
+ 'max': DataFrame({0: Series([np.nan, 2, 4, 6, 8]),
+ 1: Series([np.nan, 3, 5, 7, 9])},
+ dtype='float64'),
+ 'min': DataFrame({0: Series([np.nan, 0, 2, 4, 6]),
+ 1: Series([np.nan, 1, 3, 5, 7])},
+ dtype='float64'),
+ 'sum': DataFrame({0: Series([np.nan, 2, 6, 10, 14]),
+ 1: Series([np.nan, 4, 8, 12, 16])},
+ dtype='float64'),
+ 'mean': DataFrame({0: Series([np.nan, 1, 3, 5, 7]),
+ 1: Series([np.nan, 2, 4, 6, 8])},
+ dtype='float64'),
+ 'std': DataFrame({0: Series([np.nan] + [np.sqrt(2)] * 4),
+ 1: Series([np.nan] + [np.sqrt(2)] * 4)},
+ dtype='float64'),
+ 'var': DataFrame({0: Series([np.nan, 2, 2, 2, 2]),
+ 1: Series([np.nan, 2, 2, 2, 2])},
+ dtype='float64'),
+ 'median': DataFrame({0: Series([np.nan, 1, 3, 5, 7]),
+ 1: Series([np.nan, 2, 4, 6, 8])},
+ dtype='float64'),
+ }
+ }
+ return expects
+
+ def _create_dtype_data(self, dtype):
+ sr1 = Series(np.arange(5), dtype=dtype)
+ sr2 = Series(np.arange(10, 0, -2), dtype=dtype)
+ df = DataFrame(np.arange(10).reshape((5, 2)), dtype=dtype)
+
+ data = {
+ 'sr1': sr1,
+ 'sr2': sr2,
+ 'df': df
+ }
+
+ return data
+
+ def _create_data(self):
+ self.data = self._create_dtype_data(self.dtype)
+ self.expects = self.get_expects()
+
+ def test_dtypes(self):
+ self._create_data()
+ for f_name, d_name in product(self.funcs.keys(), self.data.keys()):
+
+ f = self.funcs[f_name]
+ d = self.data[d_name]
+ exp = self.expects[d_name][f_name]
+ self.check_dtypes(f, f_name, d, d_name, exp)
+
+ def check_dtypes(self, f, f_name, d, d_name, exp):
+ roll = d.rolling(window=self.window)
+ result = f(roll)
+ tm.assert_almost_equal(result, exp)
+
+
+class TestDtype_object(Dtype):
+ dtype = object
+
+
+class Dtype_integer(Dtype):
+ pass
+
+
+class TestDtype_int8(Dtype_integer):
+ dtype = np.int8
+
+
+class TestDtype_int16(Dtype_integer):
+ dtype = np.int16
+
+
+class TestDtype_int32(Dtype_integer):
+ dtype = np.int32
+
+
+class TestDtype_int64(Dtype_integer):
+ dtype = np.int64
+
+
+class Dtype_uinteger(Dtype):
+ pass
+
+
+class TestDtype_uint8(Dtype_uinteger):
+ dtype = np.uint8
+
+
+class TestDtype_uint16(Dtype_uinteger):
+ dtype = np.uint16
+
+
+class TestDtype_uint32(Dtype_uinteger):
+ dtype = np.uint32
+
+
+class TestDtype_uint64(Dtype_uinteger):
+ dtype = np.uint64
+
+
+class Dtype_float(Dtype):
+ pass
+
+
+class TestDtype_float16(Dtype_float):
+ dtype = np.float16
+
+
+class TestDtype_float32(Dtype_float):
+ dtype = np.float32
+
+
+class TestDtype_float64(Dtype_float):
+ dtype = np.float64
+
+
+class TestDtype_category(Dtype):
+ dtype = 'category'
+ include_df = False
+
+ def _create_dtype_data(self, dtype):
+ sr1 = Series(range(5), dtype=dtype)
+ sr2 = Series(range(10, 0, -2), dtype=dtype)
+
+ data = {
+ 'sr1': sr1,
+ 'sr2': sr2
+ }
+
+ return data
+
+
+class DatetimeLike(Dtype):
+
+ def check_dtypes(self, f, f_name, d, d_name, exp):
+
+ roll = d.rolling(window=self.window)
+
+ if f_name == 'count':
+ result = f(roll)
+ tm.assert_almost_equal(result, exp)
+
+ else:
+
+            # other methods are not implemented ATM
+ with pytest.raises(NotImplementedError):
+ f(roll)
+
+
+class TestDtype_timedelta(DatetimeLike):
+ dtype = np.dtype('m8[ns]')
+
+
+class TestDtype_datetime(DatetimeLike):
+ dtype = np.dtype('M8[ns]')
+
+
+class TestDtype_datetime64UTC(DatetimeLike):
+ dtype = 'datetime64[ns, UTC]'
+
+ def _create_data(self):
+ pytest.skip("direct creation of extension dtype "
+ "datetime64[ns, UTC] is not supported ATM")
+
+
[email protected]("ignore:can't resolve package:ImportWarning")
+class TestMoments(Base):
+
+ def setup_method(self, method):
+ self._create_data()
+
+ def test_centered_axis_validation(self):
+
+ # ok
+ Series(np.ones(10)).rolling(window=3, center=True, axis=0).mean()
+
+ # bad axis
+ with pytest.raises(ValueError):
+ Series(np.ones(10)).rolling(window=3, center=True, axis=1).mean()
+
+        # ok
+ DataFrame(np.ones((10, 10))).rolling(window=3, center=True,
+ axis=0).mean()
+ DataFrame(np.ones((10, 10))).rolling(window=3, center=True,
+ axis=1).mean()
+
+ # bad axis
+ with pytest.raises(ValueError):
+ (DataFrame(np.ones((10, 10)))
+ .rolling(window=3, center=True, axis=2).mean())
+
+ def test_rolling_sum(self):
+ self._check_moment_func(np.nansum, name='sum',
+ zero_min_periods_equal=False)
+
+ def test_rolling_count(self):
+ counter = lambda x: np.isfinite(x).astype(float).sum()
+ self._check_moment_func(counter, name='count', has_min_periods=False,
+ fill_value=0)
+
+ def test_rolling_mean(self):
+ self._check_moment_func(np.mean, name='mean')
+
+ @td.skip_if_no_scipy
+ def test_cmov_mean(self):
+ # GH 8238
+ vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48,
+ 10.63, 14.48])
+ result = Series(vals).rolling(5, center=True).mean()
+ expected = Series([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516,
+ 12.818, 12.952, np.nan, np.nan])
+ tm.assert_series_equal(expected, result)
+
+ @td.skip_if_no_scipy
+ def test_cmov_window(self):
+ # GH 8238
+ vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48,
+ 10.63, 14.48])
+ result = Series(vals).rolling(5, win_type='boxcar', center=True).mean()
+ expected = Series([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516,
+ 12.818, 12.952, np.nan, np.nan])
+ tm.assert_series_equal(expected, result)
+
+ @td.skip_if_no_scipy
+ def test_cmov_window_corner(self):
+ # GH 8238
+ # all nan
+ vals = pd.Series([np.nan] * 10)
+ result = vals.rolling(5, center=True, win_type='boxcar').mean()
+ assert np.isnan(result).all()
+
+ # empty
+ vals = pd.Series([])
+ result = vals.rolling(5, center=True, win_type='boxcar').mean()
+ assert len(result) == 0
+
+ # shorter than window
+ vals = pd.Series(np.random.randn(5))
+ result = vals.rolling(10, win_type='boxcar').mean()
+ assert np.isnan(result).all()
+ assert len(result) == 5
+
+ @td.skip_if_no_scipy
+ def test_cmov_window_frame(self):
+        # GH 8238
+        vals = np.array([[12.18, 3.64], [10.18, 9.16], [13.24, 14.61],
+                         [4.51, 8.11], [6.15, 11.44], [9.14, 6.21],
+                         [11.31, 10.67], [2.94, 6.51], [9.42, 8.39],
+                         [12.44, 7.34]])
+
+        xp = np.array([[np.nan, np.nan], [np.nan, np.nan], [9.252, 9.392],
+                       [8.644, 9.906], [8.87, 10.208], [6.81, 8.588],
+                       [7.792, 8.644], [9.05, 7.824], [np.nan, np.nan],
+                       [np.nan, np.nan]])
+
+ # DataFrame
+ rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).mean()
+ tm.assert_frame_equal(DataFrame(xp), rs)
+
+ # invalid method
+ with pytest.raises(AttributeError):
+ (DataFrame(vals).rolling(5, win_type='boxcar', center=True)
+ .std())
+
+ # sum
+        xp = np.array([[np.nan, np.nan], [np.nan, np.nan], [46.26, 46.96],
+                       [43.22, 49.53], [44.35, 51.04], [34.05, 42.94],
+                       [38.96, 43.22], [45.25, 39.12], [np.nan, np.nan],
+                       [np.nan, np.nan]])
+
+ rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).sum()
+ tm.assert_frame_equal(DataFrame(xp), rs)
+
+ @td.skip_if_no_scipy
+ def test_cmov_window_na_min_periods(self):
+ # min_periods
+ vals = Series(np.random.randn(10))
+ vals[4] = np.nan
+ vals[8] = np.nan
+
+ xp = vals.rolling(5, min_periods=4, center=True).mean()
+ rs = vals.rolling(5, win_type='boxcar', min_periods=4,
+ center=True).mean()
+ tm.assert_series_equal(xp, rs)
+
+ @td.skip_if_no_scipy
+ def test_cmov_window_regular(self, win_types):
+ # GH 8238
+ vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48,
+ 10.63, 14.48])
+ xps = {
+ 'hamming': [np.nan, np.nan, 8.71384, 9.56348, 12.38009, 14.03687,
+ 13.8567, 11.81473, np.nan, np.nan],
+ 'triang': [np.nan, np.nan, 9.28667, 10.34667, 12.00556, 13.33889,
+ 13.38, 12.33667, np.nan, np.nan],
+ 'barthann': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, 14.3675,
+ 14.0825, 11.5675, np.nan, np.nan],
+ 'bohman': [np.nan, np.nan, 7.61599, 9.1764, 12.83559, 14.17267,
+ 14.65923, 11.10401, np.nan, np.nan],
+ 'blackmanharris': [np.nan, np.nan, 6.97691, 9.16438, 13.05052,
+ 14.02156, 15.10512, 10.74574, np.nan, np.nan],
+ 'nuttall': [np.nan, np.nan, 7.04618, 9.16786, 13.02671, 14.03559,
+ 15.05657, 10.78514, np.nan, np.nan],
+ 'blackman': [np.nan, np.nan, 7.73345, 9.17869, 12.79607, 14.20036,
+ 14.57726, 11.16988, np.nan, np.nan],
+ 'bartlett': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, 14.3675,
+ 14.0825, 11.5675, np.nan, np.nan]
+ }
+
+ xp = Series(xps[win_types])
+ rs = Series(vals).rolling(5, win_type=win_types, center=True).mean()
+ tm.assert_series_equal(xp, rs)
+
+ @td.skip_if_no_scipy
+ def test_cmov_window_regular_linear_range(self, win_types):
+ # GH 8238
+ vals = np.array(range(10), dtype=np.float)
+ xp = vals.copy()
+ xp[:2] = np.nan
+ xp[-2:] = np.nan
+ xp = Series(xp)
+
+ rs = Series(vals).rolling(5, win_type=win_types, center=True).mean()
+ tm.assert_series_equal(xp, rs)
+
+ @td.skip_if_no_scipy
+ def test_cmov_window_regular_missing_data(self, win_types):
+ # GH 8238
+ vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan,
+ 10.63, 14.48])
+ xps = {
+ 'bartlett': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, 9.1925,
+ 12.5575, 14.3675, 15.61667, 13.655],
+ 'blackman': [np.nan, np.nan, 9.04582, 11.41536, 7.73345, 9.17869,
+ 12.79607, 14.20036, 15.8706, 13.655],
+ 'barthann': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, 9.1925,
+ 12.5575, 14.3675, 15.61667, 13.655],
+ 'bohman': [np.nan, np.nan, 8.9444, 11.56327, 7.61599, 9.1764,
+ 12.83559, 14.17267, 15.90976, 13.655],
+ 'hamming': [np.nan, np.nan, 9.59321, 10.29694, 8.71384, 9.56348,
+ 12.38009, 14.20565, 15.24694, 13.69758],
+ 'nuttall': [np.nan, np.nan, 8.47693, 12.2821, 7.04618, 9.16786,
+ 13.02671, 14.03673, 16.08759, 13.65553],
+ 'triang': [np.nan, np.nan, 9.33167, 9.76125, 9.28667, 10.34667,
+ 12.00556, 13.82125, 14.49429, 13.765],
+ 'blackmanharris': [np.nan, np.nan, 8.42526, 12.36824, 6.97691,
+ 9.16438, 13.05052, 14.02175, 16.1098, 13.65509]
+ }
+
+ xp = Series(xps[win_types])
+ rs = Series(vals).rolling(5, win_type=win_types, min_periods=3).mean()
+ tm.assert_series_equal(xp, rs)
+
+ @td.skip_if_no_scipy
+ def test_cmov_window_special(self, win_types_special):
+ # GH 8238
+ kwds = {
+ 'kaiser': {'beta': 1.},
+ 'gaussian': {'std': 1.},
+ 'general_gaussian': {'power': 2., 'width': 2.}}
+
+ vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48,
+ 10.63, 14.48])
+
+ xps = {
+ 'gaussian': [np.nan, np.nan, 8.97297, 9.76077, 12.24763, 13.89053,
+ 13.65671, 12.01002, np.nan, np.nan],
+ 'general_gaussian': [np.nan, np.nan, 9.85011, 10.71589, 11.73161,
+ 13.08516, 12.95111, 12.74577, np.nan, np.nan],
+ 'kaiser': [np.nan, np.nan, 9.86851, 11.02969, 11.65161, 12.75129,
+ 12.90702, 12.83757, np.nan, np.nan]
+ }
+
+ xp = Series(xps[win_types_special])
+ rs = Series(vals).rolling(
+ 5, win_type=win_types_special, center=True).mean(
+ **kwds[win_types_special])
+ tm.assert_series_equal(xp, rs)
+
+ @td.skip_if_no_scipy
+ def test_cmov_window_special_linear_range(self, win_types_special):
+ # GH 8238
+ kwds = {
+ 'kaiser': {'beta': 1.},
+ 'gaussian': {'std': 1.},
+ 'general_gaussian': {'power': 2., 'width': 2.},
+ 'slepian': {'width': 0.5}}
+
+ vals = np.array(range(10), dtype=np.float)
+ xp = vals.copy()
+ xp[:2] = np.nan
+ xp[-2:] = np.nan
+ xp = Series(xp)
+
+ rs = Series(vals).rolling(
+ 5, win_type=win_types_special, center=True).mean(
+ **kwds[win_types_special])
+ tm.assert_series_equal(xp, rs)
+
+ def test_rolling_median(self):
+ self._check_moment_func(np.median, name='median')
+
+ def test_rolling_min(self):
+ self._check_moment_func(np.min, name='min')
+
+ a = pd.Series([1, 2, 3, 4, 5])
+ result = a.rolling(window=100, min_periods=1).min()
+ expected = pd.Series(np.ones(len(a)))
+ tm.assert_series_equal(result, expected)
+
+ with pytest.raises(ValueError):
+ pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).min()
+
+ def test_rolling_max(self):
+ self._check_moment_func(np.max, name='max')
+
+ a = pd.Series([1, 2, 3, 4, 5], dtype=np.float64)
+ b = a.rolling(window=100, min_periods=1).max()
+ tm.assert_almost_equal(a, b)
+
+ with pytest.raises(ValueError):
+ pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).max()
+
+ @pytest.mark.parametrize('q', [0.0, .1, .5, .9, 1.0])
+ def test_rolling_quantile(self, q):
+
+ def scoreatpercentile(a, per):
+ values = np.sort(a, axis=0)
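+
+            # linear interpolation between order statistics: quantile
+            # per maps to fractional position per * (n - 1) in the
+            # sorted sample, mirroring numpy's default 'linear'
+            # percentile behaviour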
+
+ idx = int(per / 1. * (values.shape[0] - 1))
+
+ if idx == values.shape[0] - 1:
+ retval = values[-1]
+
+ else:
+ qlow = float(idx) / float(values.shape[0] - 1)
+ qhig = float(idx + 1) / float(values.shape[0] - 1)
+ vlow = values[idx]
+ vhig = values[idx + 1]
+ retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow)
+
+ return retval
+
+ def quantile_func(x):
+ return scoreatpercentile(x, q)
+
+ self._check_moment_func(quantile_func, name='quantile',
+ quantile=q)
+
+ def test_rolling_quantile_np_percentile(self):
+        # #9413: Tests that rolling window's quantile default behavior
+        # is analogous to NumPy's percentile
+ row = 10
+ col = 5
+ idx = pd.date_range('20100101', periods=row, freq='B')
+ df = DataFrame(np.random.rand(row * col).reshape((row, -1)), index=idx)
+
+ df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0)
+ np_percentile = np.percentile(df, [25, 50, 75], axis=0)
+
+ tm.assert_almost_equal(df_quantile.values, np.array(np_percentile))
+
+ @pytest.mark.parametrize('quantile', [0.0, 0.1, 0.45, 0.5, 1])
+ @pytest.mark.parametrize('interpolation', ['linear', 'lower', 'higher',
+ 'nearest', 'midpoint'])
+ @pytest.mark.parametrize('data', [[1., 2., 3., 4., 5., 6., 7.],
+ [8., 1., 3., 4., 5., 2., 6., 7.],
+ [0., np.nan, 0.2, np.nan, 0.4],
+ [np.nan, np.nan, np.nan, np.nan],
+ [np.nan, 0.1, np.nan, 0.3, 0.4, 0.5],
+ [0.5], [np.nan, 0.7, 0.6]])
+ def test_rolling_quantile_interpolation_options(self, quantile,
+ interpolation, data):
+ # Tests that rolling window's quantile behavior is analogous to
+ # Series' quantile for each interpolation option
+ s = Series(data)
+
+ q1 = s.quantile(quantile, interpolation)
+ q2 = s.expanding(min_periods=1).quantile(
+ quantile, interpolation).iloc[-1]
+
+ if np.isnan(q1):
+ assert np.isnan(q2)
+ else:
+ assert q1 == q2
+
+ def test_invalid_quantile_value(self):
+ data = np.arange(5)
+ s = Series(data)
+
+ with pytest.raises(ValueError, match="Interpolation 'invalid'"
+ " is not supported"):
+ s.rolling(len(data), min_periods=1).quantile(
+ 0.5, interpolation='invalid')
+
+ def test_rolling_quantile_param(self):
+ ser = Series([0.0, .1, .5, .9, 1.0])
+
+ with pytest.raises(ValueError):
+ ser.rolling(3).quantile(-0.1)
+
+ with pytest.raises(ValueError):
+ ser.rolling(3).quantile(10.0)
+
+ with pytest.raises(TypeError):
+ ser.rolling(3).quantile('foo')
+
+ def test_rolling_apply(self, raw):
+ # suppress warnings about empty slices, as we are deliberately testing
+ # with a 0-length Series
+
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore",
+ message=".*(empty slice|0 for slice).*",
+ category=RuntimeWarning)
+
+ def f(x):
+ return x[np.isfinite(x)].mean()
+
+ self._check_moment_func(np.mean, name='apply', func=f, raw=raw)
+
+ expected = Series([])
+ result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw)
+ tm.assert_series_equal(result, expected)
+
+ # gh-8080
+ s = Series([None, None, None])
+ result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw)
+ expected = Series([1., 2., 2.])
+ tm.assert_series_equal(result, expected)
+
+ result = s.rolling(2, min_periods=0).apply(len, raw=raw)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('klass', [Series, DataFrame])
+ @pytest.mark.parametrize(
+ 'method', [lambda x: x.rolling(window=2), lambda x: x.expanding()])
+ def test_apply_future_warning(self, klass, method):
+
+ # gh-5071
+ s = klass(np.arange(3))
+
+ with tm.assert_produces_warning(FutureWarning):
+ method(s).apply(lambda x: len(x))
+
+ def test_rolling_apply_out_of_bounds(self, raw):
+ # gh-1850
+ vals = pd.Series([1, 2, 3, 4])
+
+ result = vals.rolling(10).apply(np.sum, raw=raw)
+ assert result.isna().all()
+
+ result = vals.rolling(10, min_periods=1).apply(np.sum, raw=raw)
+ expected = pd.Series([1, 3, 6, 10], dtype=float)
+ tm.assert_almost_equal(result, expected)
+
+ @pytest.mark.parametrize('window', [2, '2s'])
+ def test_rolling_apply_with_pandas_objects(self, window):
+ # 5071
+ df = pd.DataFrame({'A': np.random.randn(5),
+ 'B': np.random.randint(0, 10, size=5)},
+ index=pd.date_range('20130101', periods=5, freq='s'))
+
+ # we have an equal spaced timeseries index
+ # so simulate removing the first period
+ def f(x):
+ if x.index[0] == df.index[0]:
+ return np.nan
+ return x.iloc[-1]
+
+ result = df.rolling(window).apply(f, raw=False)
+ expected = df.iloc[2:].reindex_like(df)
+ tm.assert_frame_equal(result, expected)
+
+ with pytest.raises(AttributeError):
+ df.rolling(window).apply(f, raw=True)
+
+ def test_rolling_std(self):
+ self._check_moment_func(lambda x: np.std(x, ddof=1),
+ name='std')
+ self._check_moment_func(lambda x: np.std(x, ddof=0),
+ name='std', ddof=0)
+
+ def test_rolling_std_1obs(self):
+ vals = pd.Series([1., 2., 3., 4., 5.])
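+
+        # one observation leaves zero degrees of freedom under ddof=1
+        # (n - 1 = 0), hence NaN; under ddof=0 the population std of a
+        # single point is a well-defined 0.0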
+
+ result = vals.rolling(1, min_periods=1).std()
+ expected = pd.Series([np.nan] * 5)
+ tm.assert_series_equal(result, expected)
+
+ result = vals.rolling(1, min_periods=1).std(ddof=0)
+ expected = pd.Series([0.] * 5)
+ tm.assert_series_equal(result, expected)
+
+ result = (pd.Series([np.nan, np.nan, 3, 4, 5])
+ .rolling(3, min_periods=2).std())
+ assert np.isnan(result[2])
+
+ def test_rolling_std_neg_sqrt(self):
+        # unit test from Bottleneck: test move_nanstd for neg sqrt
+
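+        # floating-point cancellation in the running second moment can
+        # produce a tiny negative variance; the implementation must clamp
+        # it to zero before the square root so the result stays finite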
+ a = pd.Series([0.0011448196318903589, 0.00028718669878572767,
+ 0.00028718669878572767, 0.00028718669878572767,
+ 0.00028718669878572767])
+ b = a.rolling(window=3).std()
+ assert np.isfinite(b[2:]).all()
+
+ b = a.ewm(span=3).std()
+ assert np.isfinite(b[2:]).all()
+
+ def test_rolling_var(self):
+ self._check_moment_func(lambda x: np.var(x, ddof=1),
+ name='var')
+ self._check_moment_func(lambda x: np.var(x, ddof=0),
+ name='var', ddof=0)
+
+ @td.skip_if_no_scipy
+ def test_rolling_skew(self):
+ from scipy.stats import skew
+ self._check_moment_func(lambda x: skew(x, bias=False), name='skew')
+
+ @td.skip_if_no_scipy
+ def test_rolling_kurt(self):
+ from scipy.stats import kurtosis
+ self._check_moment_func(lambda x: kurtosis(x, bias=False),
+ name='kurt')
+
+ def _check_moment_func(self, static_comp, name, has_min_periods=True,
+ has_center=True, has_time_rule=True,
+ fill_value=None, zero_min_periods_equal=True,
+ **kwargs):
+
+ def get_result(obj, window, min_periods=None, center=False):
+ r = obj.rolling(window=window, min_periods=min_periods,
+ center=center)
+ return getattr(r, name)(**kwargs)
+
+ series_result = get_result(self.series, window=50)
+ assert isinstance(series_result, Series)
+ tm.assert_almost_equal(series_result.iloc[-1],
+ static_comp(self.series[-50:]))
+
+ frame_result = get_result(self.frame, window=50)
+ assert isinstance(frame_result, DataFrame)
+ tm.assert_series_equal(
+ frame_result.iloc[-1, :],
+ self.frame.iloc[-50:, :].apply(static_comp, axis=0, raw=raw),
+ check_names=False)
+
+ # check time_rule works
+ if has_time_rule:
+ win = 25
+ minp = 10
+ series = self.series[::2].resample('B').mean()
+ frame = self.frame[::2].resample('B').mean()
+
+ if has_min_periods:
+ series_result = get_result(series, window=win,
+ min_periods=minp)
+ frame_result = get_result(frame, window=win,
+ min_periods=minp)
+ else:
+ series_result = get_result(series, window=win)
+ frame_result = get_result(frame, window=win)
+
+ last_date = series_result.index[-1]
+ prev_date = last_date - 24 * offsets.BDay()
+
+ trunc_series = self.series[::2].truncate(prev_date, last_date)
+ trunc_frame = self.frame[::2].truncate(prev_date, last_date)
+
+ tm.assert_almost_equal(series_result[-1],
+ static_comp(trunc_series))
+
+ tm.assert_series_equal(frame_result.xs(last_date),
+ trunc_frame.apply(static_comp, raw=raw),
+ check_names=False)
+
+ # excluding NaNs correctly
+ obj = Series(randn(50))
+ obj[:10] = np.NaN
+ obj[-10:] = np.NaN
+ if has_min_periods:
+ result = get_result(obj, 50, min_periods=30)
+ tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10]))
+
+ # min_periods is working correctly
+ result = get_result(obj, 20, min_periods=15)
+ assert isna(result.iloc[23])
+ assert not isna(result.iloc[24])
+
+ assert not isna(result.iloc[-6])
+ assert isna(result.iloc[-5])
+
+ obj2 = Series(randn(20))
+ result = get_result(obj2, 10, min_periods=5)
+ assert isna(result.iloc[3])
+ assert notna(result.iloc[4])
+
+ if zero_min_periods_equal:
+ # min_periods=0 may be equivalent to min_periods=1
+ result0 = get_result(obj, 20, min_periods=0)
+ result1 = get_result(obj, 20, min_periods=1)
+ tm.assert_almost_equal(result0, result1)
+ else:
+ result = get_result(obj, 50)
+ tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10]))
+
+ # window larger than series length (#7297)
+ if has_min_periods:
+ for minp in (0, len(self.series) - 1, len(self.series)):
+ result = get_result(self.series, len(self.series) + 1,
+ min_periods=minp)
+ expected = get_result(self.series, len(self.series),
+ min_periods=minp)
+ nan_mask = isna(result)
+ tm.assert_series_equal(nan_mask, isna(expected))
+
+ nan_mask = ~nan_mask
+ tm.assert_almost_equal(result[nan_mask],
+ expected[nan_mask])
+ else:
+ result = get_result(self.series, len(self.series) + 1)
+ expected = get_result(self.series, len(self.series))
+ nan_mask = isna(result)
+ tm.assert_series_equal(nan_mask, isna(expected))
+
+ nan_mask = ~nan_mask
+ tm.assert_almost_equal(result[nan_mask], expected[nan_mask])
+
+ # check center=True
+ if has_center:
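+            # centering is emulated by padding 9 trailing NaNs, running
+            # the trailing-window version, and shifting the result back
+            # by (20 - 1) // 2 = 9 positions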
+ if has_min_periods:
+ result = get_result(obj, 20, min_periods=15, center=True)
+ expected = get_result(
+ pd.concat([obj, Series([np.NaN] * 9)]), 20,
+ min_periods=15)[9:].reset_index(drop=True)
+ else:
+ result = get_result(obj, 20, center=True)
+ expected = get_result(
+ pd.concat([obj, Series([np.NaN] * 9)]),
+ 20)[9:].reset_index(drop=True)
+
+ tm.assert_series_equal(result, expected)
+
+ # shifter index
+ s = ['x%d' % x for x in range(12)]
+
+ if has_min_periods:
+ minp = 10
+
+ series_xp = get_result(
+ self.series.reindex(list(self.series.index) + s),
+ window=25,
+ min_periods=minp).shift(-12).reindex(self.series.index)
+ frame_xp = get_result(
+ self.frame.reindex(list(self.frame.index) + s),
+ window=25,
+ min_periods=minp).shift(-12).reindex(self.frame.index)
+
+ series_rs = get_result(self.series, window=25,
+ min_periods=minp, center=True)
+ frame_rs = get_result(self.frame, window=25, min_periods=minp,
+ center=True)
+
+ else:
+ series_xp = get_result(
+ self.series.reindex(list(self.series.index) + s),
+ window=25).shift(-12).reindex(self.series.index)
+ frame_xp = get_result(
+ self.frame.reindex(list(self.frame.index) + s),
+ window=25).shift(-12).reindex(self.frame.index)
+
+ series_rs = get_result(self.series, window=25, center=True)
+ frame_rs = get_result(self.frame, window=25, center=True)
+
+ if fill_value is not None:
+ series_xp = series_xp.fillna(fill_value)
+ frame_xp = frame_xp.fillna(fill_value)
+ tm.assert_series_equal(series_xp, series_rs)
+ tm.assert_frame_equal(frame_xp, frame_rs)
+
+ def test_ewma(self):
+ self._check_ew(name='mean')
+
+ vals = pd.Series(np.zeros(1000))
+ vals[5] = 1
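+        # an impulse through an unadjusted EWM yields the geometric
+        # sequence alpha * (1 - alpha)**k, which sums to ~1 up to the
+        # truncated tail, hence the 1e-2 tolerance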
+ result = vals.ewm(span=100, adjust=False).mean().sum()
+ assert np.abs(result - 1) < 1e-2
+
+ @pytest.mark.parametrize('adjust', [True, False])
+ @pytest.mark.parametrize('ignore_na', [True, False])
+ def test_ewma_cases(self, adjust, ignore_na):
+ # try adjust/ignore_na args matrix
+
+ s = Series([1.0, 2.0, 4.0, 8.0])
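+
+        # with com=2.0 the smoothing factor is alpha = 1 / (1 + com)
+        # = 1/3; e.g. the second adjusted value is
+        # (2 + (2/3) * 1) / (1 + 2/3) = 1.6 and the second unadjusted
+        # value is (2/3) * 1 + (1/3) * 2 = 1.333333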
+
+ if adjust:
+ expected = Series([1.0, 1.6, 2.736842, 4.923077])
+ else:
+ expected = Series([1.0, 1.333333, 2.222222, 4.148148])
+
+ result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean()
+ tm.assert_series_equal(result, expected)
+
+ def test_ewma_nan_handling(self):
+ s = Series([1.] + [np.nan] * 5 + [1.])
+ result = s.ewm(com=5).mean()
+ tm.assert_series_equal(result, Series([1.] * len(s)))
+
+ s = Series([np.nan] * 2 + [1.] + [np.nan] * 2 + [1.])
+ result = s.ewm(com=5).mean()
+ tm.assert_series_equal(result, Series([np.nan] * 2 + [1.] * 4))
+
+ # GH 7603
+ s0 = Series([np.nan, 1., 101.])
+ s1 = Series([1., np.nan, 101.])
+ s2 = Series([np.nan, 1., np.nan, np.nan, 101., np.nan])
+ s3 = Series([1., np.nan, 101., 50.])
+ com = 2.
+ alpha = 1. / (1. + com)
+
+ def simple_wma(s, w):
+ return (s.multiply(w).cumsum() / w.cumsum()).fillna(method='ffill')
+
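+        # simple_wma reproduces the EWM mean as an explicit weighted
+        # running average; each case below lists, per position, the
+        # weight each observation carries under the given
+        # adjust/ignore_na combination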
+ for (s, adjust, ignore_na, w) in [
+ (s0, True, False, [np.nan, (1. - alpha), 1.]),
+ (s0, True, True, [np.nan, (1. - alpha), 1.]),
+ (s0, False, False, [np.nan, (1. - alpha), alpha]),
+ (s0, False, True, [np.nan, (1. - alpha), alpha]),
+ (s1, True, False, [(1. - alpha) ** 2, np.nan, 1.]),
+ (s1, True, True, [(1. - alpha), np.nan, 1.]),
+ (s1, False, False, [(1. - alpha) ** 2, np.nan, alpha]),
+ (s1, False, True, [(1. - alpha), np.nan, alpha]),
+                (s2, True, False, [np.nan, (1. - alpha) ** 3, np.nan,
+                                   np.nan, 1., np.nan]),
+                (s2, True, True, [np.nan, (1. - alpha), np.nan, np.nan,
+                                  1., np.nan]),
+                (s2, False, False, [np.nan, (1. - alpha) ** 3, np.nan,
+                                    np.nan, alpha, np.nan]),
+                (s2, False, True, [np.nan, (1. - alpha), np.nan, np.nan,
+                                   alpha, np.nan]),
+                (s3, True, False, [(1. - alpha) ** 3, np.nan,
+                                   (1. - alpha), 1.]),
+                (s3, True, True, [(1. - alpha) ** 2, np.nan,
+                                  (1. - alpha), 1.]),
+                (s3, False, False, [(1. - alpha) ** 3, np.nan,
+                                    (1. - alpha) * alpha,
+                                    alpha * ((1. - alpha) ** 2 + alpha)]),
+                (s3, False, True, [(1. - alpha) ** 2, np.nan,
+                                   (1. - alpha) * alpha, alpha])]:
+ expected = simple_wma(s, Series(w))
+ result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean()
+
+ tm.assert_series_equal(result, expected)
+ if ignore_na is False:
+ # check that ignore_na defaults to False
+ result = s.ewm(com=com, adjust=adjust).mean()
+ tm.assert_series_equal(result, expected)
+
+ def test_ewmvar(self):
+ self._check_ew(name='var')
+
+ def test_ewmvol(self):
+ self._check_ew(name='vol')
+
+ def test_ewma_span_com_args(self):
+ A = self.series.ewm(com=9.5).mean()
+ B = self.series.ewm(span=20).mean()
+ tm.assert_almost_equal(A, B)
+
+ with pytest.raises(ValueError):
+ self.series.ewm(com=9.5, span=20)
+ with pytest.raises(ValueError):
+ self.series.ewm().mean()
+
+ def test_ewma_halflife_arg(self):
+ A = self.series.ewm(com=13.932726172912965).mean()
+ B = self.series.ewm(halflife=10.0).mean()
+ tm.assert_almost_equal(A, B)
+
+ with pytest.raises(ValueError):
+ self.series.ewm(span=20, halflife=50)
+ with pytest.raises(ValueError):
+ self.series.ewm(com=9.5, halflife=50)
+ with pytest.raises(ValueError):
+ self.series.ewm(com=9.5, span=20, halflife=50)
+ with pytest.raises(ValueError):
+ self.series.ewm()
+
+ def test_ewm_alpha(self):
+ # GH 10789
+ s = Series(self.arr)
+ a = s.ewm(alpha=0.61722699889169674).mean()
+ b = s.ewm(com=0.62014947789973052).mean()
+ c = s.ewm(span=2.240298955799461).mean()
+ d = s.ewm(halflife=0.721792864318).mean()
+ tm.assert_series_equal(a, b)
+ tm.assert_series_equal(a, c)
+ tm.assert_series_equal(a, d)
+
+ def test_ewm_alpha_arg(self):
+ # GH 10789
+ s = self.series
+ with pytest.raises(ValueError):
+ s.ewm()
+ with pytest.raises(ValueError):
+ s.ewm(com=10.0, alpha=0.5)
+ with pytest.raises(ValueError):
+ s.ewm(span=10.0, alpha=0.5)
+ with pytest.raises(ValueError):
+ s.ewm(halflife=10.0, alpha=0.5)
+
+ def test_ewm_domain_checks(self):
+ # GH 12492
+ s = Series(self.arr)
+ # com must satisfy: com >= 0
+ pytest.raises(ValueError, s.ewm, com=-0.1)
+ s.ewm(com=0.0)
+ s.ewm(com=0.1)
+ # span must satisfy: span >= 1
+ pytest.raises(ValueError, s.ewm, span=-0.1)
+ pytest.raises(ValueError, s.ewm, span=0.0)
+ pytest.raises(ValueError, s.ewm, span=0.9)
+ s.ewm(span=1.0)
+ s.ewm(span=1.1)
+ # halflife must satisfy: halflife > 0
+ pytest.raises(ValueError, s.ewm, halflife=-0.1)
+ pytest.raises(ValueError, s.ewm, halflife=0.0)
+ s.ewm(halflife=0.1)
+ # alpha must satisfy: 0 < alpha <= 1
+ pytest.raises(ValueError, s.ewm, alpha=-0.1)
+ pytest.raises(ValueError, s.ewm, alpha=0.0)
+ s.ewm(alpha=0.1)
+ s.ewm(alpha=1.0)
+ pytest.raises(ValueError, s.ewm, alpha=1.1)
+
+ @pytest.mark.parametrize('method', ['mean', 'vol', 'var'])
+ def test_ew_empty_series(self, method):
+ vals = pd.Series([], dtype=np.float64)
+
+ ewm = vals.ewm(3)
+ result = getattr(ewm, method)()
+ tm.assert_almost_equal(result, vals)
+
+ def _check_ew(self, name=None, preserve_nan=False):
+ series_result = getattr(self.series.ewm(com=10), name)()
+ assert isinstance(series_result, Series)
+
+ frame_result = getattr(self.frame.ewm(com=10), name)()
+ assert type(frame_result) == DataFrame
+
+ result = getattr(self.series.ewm(com=10), name)()
+ if preserve_nan:
+ assert result[self._nan_locs].isna().all()
+
+ # excluding NaNs correctly
+ arr = randn(50)
+ arr[:10] = np.NaN
+ arr[-10:] = np.NaN
+ s = Series(arr)
+
+ # check min_periods
+ # GH 7898
+ result = getattr(s.ewm(com=50, min_periods=2), name)()
+ assert result[:11].isna().all()
+ assert not result[11:].isna().any()
+
+ for min_periods in (0, 1):
+ result = getattr(s.ewm(com=50, min_periods=min_periods), name)()
+ if name == 'mean':
+ assert result[:10].isna().all()
+ assert not result[10:].isna().any()
+ else:
+ # ewm.std, ewm.vol, ewm.var (with bias=False) require at least
+ # two values
+ assert result[:11].isna().all()
+ assert not result[11:].isna().any()
+
+ # check series of length 0
+ result = getattr(Series().ewm(com=50, min_periods=min_periods),
+ name)()
+ tm.assert_series_equal(result, Series())
+
+ # check series of length 1
+ result = getattr(Series([1.]).ewm(50, min_periods=min_periods),
+ name)()
+ if name == 'mean':
+ tm.assert_series_equal(result, Series([1.]))
+ else:
+ # ewm.std, ewm.vol, ewm.var with bias=False require at least
+ # two values
+ tm.assert_series_equal(result, Series([np.NaN]))
+
+ # pass in ints
+ result2 = getattr(Series(np.arange(50)).ewm(span=10), name)()
+ assert result2.dtype == np.float_
+
+
+class TestPairwise(object):
+
+ # GH 7738
+ df1s = [DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]),
+ DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]),
+ DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]),
+ DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]],
+ columns=['C', 'C']),
+ DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]),
+ DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]),
+ DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]),
+ DataFrame([[2., 4.], [1., 2.], [5., 2.], [8., 1.]],
+ columns=[1, 0.]),
+ DataFrame([[2, 4.], [1, 2.], [5, 2.], [8, 1.]],
+ columns=[0, 1.]),
+ DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.]],
+ columns=[1., 'X']), ]
+ df2 = DataFrame([[None, 1, 1], [None, 1, 2],
+ [None, 3, 2], [None, 8, 1]], columns=['Y', 'Z', 'X'])
+ s = Series([1, 1, 3, 8])
+
+ def compare(self, result, expected):
+
+ # since we have sorted the results
+ # we can only compare non-nans
+ result = result.dropna().values
+ expected = expected.dropna().values
+
+ tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
+ @pytest.mark.parametrize('f', [lambda x: x.cov(), lambda x: x.corr()])
+ def test_no_flex(self, f):
+
+ # DataFrame methods (which do not call _flex_binary_moment())
+
+ results = [f(df) for df in self.df1s]
+ for (df, result) in zip(self.df1s, results):
+ tm.assert_index_equal(result.index, df.columns)
+ tm.assert_index_equal(result.columns, df.columns)
+ for i, result in enumerate(results):
+ if i > 0:
+ self.compare(result, results[0])
+
+ @pytest.mark.parametrize(
+ 'f', [lambda x: x.expanding().cov(pairwise=True),
+ lambda x: x.expanding().corr(pairwise=True),
+ lambda x: x.rolling(window=3).cov(pairwise=True),
+ lambda x: x.rolling(window=3).corr(pairwise=True),
+ lambda x: x.ewm(com=3).cov(pairwise=True),
+ lambda x: x.ewm(com=3).corr(pairwise=True)])
+ def test_pairwise_with_self(self, f):
+
+ # DataFrame with itself, pairwise=True
+ # note that we may construct the 1st level of the MI
+        # in a non-monotonic way, so compare accordingly
+ results = []
+ for i, df in enumerate(self.df1s):
+ result = f(df)
+ tm.assert_index_equal(result.index.levels[0],
+ df.index,
+ check_names=False)
+ tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]),
+ safe_sort(df.columns.unique()))
+ tm.assert_index_equal(result.columns, df.columns)
+            results.append(result)
+
+ for i, result in enumerate(results):
+ if i > 0:
+ self.compare(result, results[0])
+
+ @pytest.mark.parametrize(
+ 'f', [lambda x: x.expanding().cov(pairwise=False),
+ lambda x: x.expanding().corr(pairwise=False),
+ lambda x: x.rolling(window=3).cov(pairwise=False),
+ lambda x: x.rolling(window=3).corr(pairwise=False),
+ lambda x: x.ewm(com=3).cov(pairwise=False),
+ lambda x: x.ewm(com=3).corr(pairwise=False), ])
+ def test_no_pairwise_with_self(self, f):
+
+ # DataFrame with itself, pairwise=False
+ results = [f(df) for df in self.df1s]
+ for (df, result) in zip(self.df1s, results):
+ tm.assert_index_equal(result.index, df.index)
+ tm.assert_index_equal(result.columns, df.columns)
+ for i, result in enumerate(results):
+ if i > 0:
+ self.compare(result, results[0])
+
+ @pytest.mark.parametrize(
+ 'f', [lambda x, y: x.expanding().cov(y, pairwise=True),
+ lambda x, y: x.expanding().corr(y, pairwise=True),
+ lambda x, y: x.rolling(window=3).cov(y, pairwise=True),
+ lambda x, y: x.rolling(window=3).corr(y, pairwise=True),
+ lambda x, y: x.ewm(com=3).cov(y, pairwise=True),
+ lambda x, y: x.ewm(com=3).corr(y, pairwise=True), ])
+ def test_pairwise_with_other(self, f):
+
+ # DataFrame with another DataFrame, pairwise=True
+ results = [f(df, self.df2) for df in self.df1s]
+ for (df, result) in zip(self.df1s, results):
+ tm.assert_index_equal(result.index.levels[0],
+ df.index,
+ check_names=False)
+ tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]),
+ safe_sort(self.df2.columns.unique()))
+ for i, result in enumerate(results):
+ if i > 0:
+ self.compare(result, results[0])
+
+ @pytest.mark.parametrize(
+ 'f', [lambda x, y: x.expanding().cov(y, pairwise=False),
+ lambda x, y: x.expanding().corr(y, pairwise=False),
+ lambda x, y: x.rolling(window=3).cov(y, pairwise=False),
+ lambda x, y: x.rolling(window=3).corr(y, pairwise=False),
+ lambda x, y: x.ewm(com=3).cov(y, pairwise=False),
+ lambda x, y: x.ewm(com=3).corr(y, pairwise=False), ])
+ def test_no_pairwise_with_other(self, f):
+
+ # DataFrame with another DataFrame, pairwise=False
+ results = [f(df, self.df2) if df.columns.is_unique else None
+ for df in self.df1s]
+ for (df, result) in zip(self.df1s, results):
+ if result is not None:
+ with catch_warnings(record=True):
+ warnings.simplefilter("ignore", RuntimeWarning)
+ # we can have int and str columns
+ expected_index = df.index.union(self.df2.index)
+ expected_columns = df.columns.union(self.df2.columns)
+ tm.assert_index_equal(result.index, expected_index)
+ tm.assert_index_equal(result.columns, expected_columns)
+ else:
+ with pytest.raises(ValueError,
+ match="'arg1' columns are not unique"):
+ f(df, self.df2)
+ with pytest.raises(ValueError,
+ match="'arg2' columns are not unique"):
+ f(self.df2, df)
+
+ @pytest.mark.parametrize(
+ 'f', [lambda x, y: x.expanding().cov(y),
+ lambda x, y: x.expanding().corr(y),
+ lambda x, y: x.rolling(window=3).cov(y),
+ lambda x, y: x.rolling(window=3).corr(y),
+ lambda x, y: x.ewm(com=3).cov(y),
+ lambda x, y: x.ewm(com=3).corr(y), ])
+ def test_pairwise_with_series(self, f):
+
+ # DataFrame with a Series
+ results = ([f(df, self.s) for df in self.df1s] +
+ [f(self.s, df) for df in self.df1s])
+ for (df, result) in zip(self.df1s, results):
+ tm.assert_index_equal(result.index, df.index)
+ tm.assert_index_equal(result.columns, df.columns)
+ for i, result in enumerate(results):
+ if i > 0:
+ self.compare(result, results[0])
+
+
+# create the data only once as we are not setting it
+def _create_consistency_data():
+ def create_series():
+ return [Series(),
+ Series([np.nan]),
+ Series([np.nan, np.nan]),
+ Series([3.]),
+ Series([np.nan, 3.]),
+ Series([3., np.nan]),
+ Series([1., 3.]),
+ Series([2., 2.]),
+ Series([3., 1.]),
+ Series([5., 5., 5., 5., np.nan, np.nan, np.nan, 5., 5., np.nan,
+ np.nan]),
+ Series([np.nan, 5., 5., 5., np.nan, np.nan, np.nan, 5., 5.,
+ np.nan, np.nan]),
+ Series([np.nan, np.nan, 5., 5., np.nan, np.nan, np.nan, 5., 5.,
+ np.nan, np.nan]),
+ Series([np.nan, 3., np.nan, 3., 4., 5., 6., np.nan, np.nan, 7.,
+ 12., 13., 14., 15.]),
+ Series([np.nan, 5., np.nan, 2., 4., 0., 9., np.nan, np.nan, 3.,
+ 12., 13., 14., 15.]),
+ Series([2., 3., np.nan, 3., 4., 5., 6., np.nan, np.nan, 7.,
+ 12., 13., 14., 15.]),
+ Series([2., 5., np.nan, 2., 4., 0., 9., np.nan, np.nan, 3.,
+ 12., 13., 14., 15.]),
+ Series(range(10)),
+ Series(range(20, 0, -2)), ]
+
+ def create_dataframes():
+ return ([DataFrame(),
+ DataFrame(columns=['a']),
+ DataFrame(columns=['a', 'a']),
+ DataFrame(columns=['a', 'b']),
+ DataFrame(np.arange(10).reshape((5, 2))),
+ DataFrame(np.arange(25).reshape((5, 5))),
+ DataFrame(np.arange(25).reshape((5, 5)),
+ columns=['a', 'b', 99, 'd', 'd'])] +
+ [DataFrame(s) for s in create_series()])
+
+ def is_constant(x):
+ values = x.values.ravel()
+ return len(set(values[notna(values)])) == 1
+
+ def no_nans(x):
+ return x.notna().all().all()
+
+    # data is a tuple(object, is_constant, no_nans)
+ data = create_series() + create_dataframes()
+
+ return [(x, is_constant(x), no_nans(x)) for x in data]
+
+
+_consistency_data = _create_consistency_data()
+
+
+def _rolling_consistency_cases():
+ for window in [1, 2, 3, 10, 20]:
+        for min_periods in sorted({0, 1, 2, 3, 4, window}):
+ if min_periods and (min_periods > window):
+ continue
+ for center in [False, True]:
+ yield window, min_periods, center
+
+
+class TestMomentsConsistency(Base):
+ base_functions = [
+ (lambda v: Series(v).count(), None, 'count'),
+ (lambda v: Series(v).max(), None, 'max'),
+ (lambda v: Series(v).min(), None, 'min'),
+ (lambda v: Series(v).sum(), None, 'sum'),
+ (lambda v: Series(v).mean(), None, 'mean'),
+ (lambda v: Series(v).std(), 1, 'std'),
+ (lambda v: Series(v).cov(Series(v)), None, 'cov'),
+ (lambda v: Series(v).corr(Series(v)), None, 'corr'),
+ (lambda v: Series(v).var(), 1, 'var'),
+
+        # restore once GH 8086 is fixed
+        # (lambda v: Series(v).skew(), 3, 'skew'),
+        # (lambda v: Series(v).kurt(), 4, 'kurt'),
+
+        # restore once GH 8084 is fixed
+        # (lambda v: Series(v).quantile(0.3), None, 'quantile'),
+
+ (lambda v: Series(v).median(), None, 'median'),
+ (np.nanmax, 1, 'max'),
+ (np.nanmin, 1, 'min'),
+ (np.nansum, 1, 'sum'),
+ (np.nanmean, 1, 'mean'),
+ (lambda v: np.nanstd(v, ddof=1), 1, 'std'),
+ (lambda v: np.nanvar(v, ddof=1), 1, 'var'),
+ (np.nanmedian, 1, 'median'),
+ ]
+ no_nan_functions = [
+ (np.max, None, 'max'),
+ (np.min, None, 'min'),
+ (np.sum, None, 'sum'),
+ (np.mean, None, 'mean'),
+ (lambda v: np.std(v, ddof=1), 1, 'std'),
+ (lambda v: np.var(v, ddof=1), 1, 'var'),
+ (np.median, None, 'median'),
+ ]
+
+ def _create_data(self):
+ super(TestMomentsConsistency, self)._create_data()
+ self.data = _consistency_data
+
+ def setup_method(self, method):
+ self._create_data()
+
+ def _test_moments_consistency(self, min_periods, count, mean, mock_mean,
+ corr, var_unbiased=None, std_unbiased=None,
+ cov_unbiased=None, var_biased=None,
+ std_biased=None, cov_biased=None,
+ var_debiasing_factors=None):
+ def _non_null_values(x):
+ values = x.values.ravel()
+ return set(values[notna(values)].tolist())
+
+ for (x, is_constant, no_nans) in self.data:
+ count_x = count(x)
+ mean_x = mean(x)
+
+ if mock_mean:
+ # check that mean equals mock_mean
+ expected = mock_mean(x)
+ assert_equal(mean_x, expected.astype('float64'))
+
+ # check that correlation of a series with itself is either 1 or NaN
+ corr_x_x = corr(x, x)
+
+            # restore once rolling_cov(x, x) is identically equal to
+            # var(x):
+            # assert _non_null_values(corr_x_x).issubset(set([1.]))
+
+ if is_constant:
+ exp = x.max() if isinstance(x, Series) else x.max().max()
+
+ # check mean of constant series
+ expected = x * np.nan
+ expected[count_x >= max(min_periods, 1)] = exp
+ assert_equal(mean_x, expected)
+
+ # check correlation of constant series with itself is NaN
+ expected[:] = np.nan
+ assert_equal(corr_x_x, expected)
+
+ if var_unbiased and var_biased and var_debiasing_factors:
+ # check variance debiasing factors
+ var_unbiased_x = var_unbiased(x)
+ var_biased_x = var_biased(x)
+ var_debiasing_factors_x = var_debiasing_factors(x)
+ assert_equal(var_unbiased_x, var_biased_x *
+ var_debiasing_factors_x)
+
+ for (std, var, cov) in [(std_biased, var_biased, cov_biased),
+ (std_unbiased, var_unbiased, cov_unbiased)
+ ]:
+
+ # check that var(x), std(x), and cov(x) are all >= 0
+ var_x = var(x)
+ std_x = std(x)
+ assert not (var_x < 0).any().any()
+ assert not (std_x < 0).any().any()
+ if cov:
+ cov_x_x = cov(x, x)
+ assert not (cov_x_x < 0).any().any()
+
+ # check that var(x) == cov(x, x)
+ assert_equal(var_x, cov_x_x)
+
+ # check that var(x) == std(x)^2
+ assert_equal(var_x, std_x * std_x)
+
+ if var is var_biased:
+ # check that biased var(x) == mean(x^2) - mean(x)^2
+ mean_x2 = mean(x * x)
+ assert_equal(var_x, mean_x2 - (mean_x * mean_x))
+
+ if is_constant:
+ # check that variance of constant series is identically 0
+ assert not (var_x > 0).any().any()
+ expected = x * np.nan
+ expected[count_x >= max(min_periods, 1)] = 0.
+ if var is var_unbiased:
+ expected[count_x < 2] = np.nan
+ assert_equal(var_x, expected)
+
+ if isinstance(x, Series):
+ for (y, is_constant, no_nans) in self.data:
+ if not x.isna().equals(y.isna()):
+ # can only easily test two Series with similar
+ # structure
+ continue
+
+ # check that cor(x, y) is symmetric
+ corr_x_y = corr(x, y)
+ corr_y_x = corr(y, x)
+ assert_equal(corr_x_y, corr_y_x)
+
+ if cov:
+ # check that cov(x, y) is symmetric
+ cov_x_y = cov(x, y)
+ cov_y_x = cov(y, x)
+ assert_equal(cov_x_y, cov_y_x)
+
+ # check that cov(x, y) == (var(x+y) - var(x) -
+ # var(y)) / 2
+ var_x_plus_y = var(x + y)
+ var_y = var(y)
+ assert_equal(cov_x_y, 0.5 *
+ (var_x_plus_y - var_x - var_y))
+
+ # check that corr(x, y) == cov(x, y) / (std(x) *
+ # std(y))
+ std_y = std(y)
+ assert_equal(corr_x_y, cov_x_y / (std_x * std_y))
+
+ if cov is cov_biased:
+ # check that biased cov(x, y) == mean(x*y) -
+ # mean(x)*mean(y)
+ mean_y = mean(y)
+ mean_x_times_y = mean(x * y)
+ assert_equal(cov_x_y, mean_x_times_y -
+ (mean_x * mean_y))
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize('min_periods', [0, 1, 2, 3, 4])
+ @pytest.mark.parametrize('adjust', [True, False])
+ @pytest.mark.parametrize('ignore_na', [True, False])
+ def test_ewm_consistency(self, min_periods, adjust, ignore_na):
+ def _weights(s, com, adjust, ignore_na):
+ if isinstance(s, DataFrame):
+ if not len(s.columns):
+ return DataFrame(index=s.index, columns=s.columns)
+ w = concat([
+ _weights(s.iloc[:, i], com=com, adjust=adjust,
+ ignore_na=ignore_na)
+ for i, _ in enumerate(s.columns)], axis=1)
+ w.index = s.index
+ w.columns = s.columns
+ return w
+
+ w = Series(np.nan, index=s.index)
+ alpha = 1. / (1. + com)
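+            # adjusted weights grow as (1 - alpha)**-i, so after
+            # normalisation observation i carries weight proportional to
+            # (1 - alpha)**(n - 1 - i); the s.iat[i] == s.iat[i] checks
+            # below are NaN tests (NaN != NaN)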
+ if ignore_na:
+ w[s.notna()] = _weights(s[s.notna()], com=com,
+ adjust=adjust, ignore_na=False)
+ elif adjust:
+ for i in range(len(s)):
+ if s.iat[i] == s.iat[i]:
+ w.iat[i] = pow(1. / (1. - alpha), i)
+ else:
+ sum_wts = 0.
+ prev_i = -1
+ for i in range(len(s)):
+ if s.iat[i] == s.iat[i]:
+ if prev_i == -1:
+ w.iat[i] = 1.
+ else:
+ w.iat[i] = alpha * sum_wts / pow(1. - alpha,
+ i - prev_i)
+ sum_wts += w.iat[i]
+ prev_i = i
+ return w
+
+ def _variance_debiasing_factors(s, com, adjust, ignore_na):
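+            # unbiased-variance correction for weighted observations:
+            # factor = (sum w)**2 / ((sum w)**2 - sum(w**2)); with equal
+            # weights this reduces to the familiar n / (n - 1)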
+ weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na)
+ cum_sum = weights.cumsum().fillna(method='ffill')
+ cum_sum_sq = (weights * weights).cumsum().fillna(method='ffill')
+ numerator = cum_sum * cum_sum
+ denominator = numerator - cum_sum_sq
+ denominator[denominator <= 0.] = np.nan
+ return numerator / denominator
+
+ def _ewma(s, com, min_periods, adjust, ignore_na):
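+            # a mock EWMA: explicit weighted running mean, NaN-masked
+            # wherever fewer than min_periods observations have been seen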
+ weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na)
+ result = s.multiply(weights).cumsum().divide(weights.cumsum(
+ )).fillna(method='ffill')
+ result[s.expanding().count() < (max(min_periods, 1) if min_periods
+ else 1)] = np.nan
+ return result
+
+ com = 3.
+ # test consistency between different ewm* moments
+ self._test_moments_consistency(
+ min_periods=min_periods,
+ count=lambda x: x.expanding().count(),
+ mean=lambda x: x.ewm(com=com, min_periods=min_periods,
+ adjust=adjust,
+ ignore_na=ignore_na).mean(),
+ mock_mean=lambda x: _ewma(x, com=com,
+ min_periods=min_periods,
+ adjust=adjust,
+ ignore_na=ignore_na),
+ corr=lambda x, y: x.ewm(com=com, min_periods=min_periods,
+ adjust=adjust,
+ ignore_na=ignore_na).corr(y),
+ var_unbiased=lambda x: (
+ x.ewm(com=com, min_periods=min_periods,
+ adjust=adjust,
+ ignore_na=ignore_na).var(bias=False)),
+ std_unbiased=lambda x: (
+ x.ewm(com=com, min_periods=min_periods,
+ adjust=adjust, ignore_na=ignore_na)
+ .std(bias=False)),
+ cov_unbiased=lambda x, y: (
+ x.ewm(com=com, min_periods=min_periods,
+ adjust=adjust, ignore_na=ignore_na)
+ .cov(y, bias=False)),
+ var_biased=lambda x: (
+ x.ewm(com=com, min_periods=min_periods,
+ adjust=adjust, ignore_na=ignore_na)
+ .var(bias=True)),
+ std_biased=lambda x: x.ewm(com=com, min_periods=min_periods,
+ adjust=adjust,
+ ignore_na=ignore_na).std(bias=True),
+ cov_biased=lambda x, y: (
+ x.ewm(com=com, min_periods=min_periods,
+ adjust=adjust, ignore_na=ignore_na)
+ .cov(y, bias=True)),
+ var_debiasing_factors=lambda x: (
+ _variance_debiasing_factors(x, com=com, adjust=adjust,
+ ignore_na=ignore_na)))
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize(
+ 'min_periods', [0, 1, 2, 3, 4])
+ def test_expanding_consistency(self, min_periods):
+
+ # suppress warnings about empty slices, as we are deliberately testing
+ # with empty/0-length Series/DataFrames
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore",
+ message=".*(empty slice|0 for slice).*",
+ category=RuntimeWarning)
+
+ # test consistency between different expanding_* moments
+ self._test_moments_consistency(
+ min_periods=min_periods,
+ count=lambda x: x.expanding().count(),
+ mean=lambda x: x.expanding(
+ min_periods=min_periods).mean(),
+ mock_mean=lambda x: x.expanding(
+ min_periods=min_periods).sum() / x.expanding().count(),
+ corr=lambda x, y: x.expanding(
+ min_periods=min_periods).corr(y),
+ var_unbiased=lambda x: x.expanding(
+ min_periods=min_periods).var(),
+ std_unbiased=lambda x: x.expanding(
+ min_periods=min_periods).std(),
+ cov_unbiased=lambda x, y: x.expanding(
+ min_periods=min_periods).cov(y),
+ var_biased=lambda x: x.expanding(
+ min_periods=min_periods).var(ddof=0),
+ std_biased=lambda x: x.expanding(
+ min_periods=min_periods).std(ddof=0),
+ cov_biased=lambda x, y: x.expanding(
+ min_periods=min_periods).cov(y, ddof=0),
+ var_debiasing_factors=lambda x: (
+ x.expanding().count() /
+ (x.expanding().count() - 1.)
+ .replace(0., np.nan)))
+
+ # test consistency between expanding_xyz() and either (a)
+ # expanding_apply of Series.xyz(), or (b) expanding_apply of
+ # np.nanxyz()
+ for (x, is_constant, no_nans) in self.data:
+ functions = self.base_functions
+
+ # GH 8269
+ if no_nans:
+ functions = self.base_functions + self.no_nan_functions
+ for (f, require_min_periods, name) in functions:
+ expanding_f = getattr(
+ x.expanding(min_periods=min_periods), name)
+
+ if (require_min_periods and
+ (min_periods is not None) and
+ (min_periods < require_min_periods)):
+ continue
+
+ if name == 'count':
+ expanding_f_result = expanding_f()
+ expanding_apply_f_result = x.expanding(
+ min_periods=0).apply(func=f, raw=True)
+ else:
+ if name in ['cov', 'corr']:
+ expanding_f_result = expanding_f(
+ pairwise=False)
+ else:
+ expanding_f_result = expanding_f()
+ expanding_apply_f_result = x.expanding(
+ min_periods=min_periods).apply(func=f, raw=True)
+
+ # GH 9422
+ if name in ['sum', 'prod']:
+ assert_equal(expanding_f_result,
+ expanding_apply_f_result)
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize(
+ 'window,min_periods,center', list(_rolling_consistency_cases()))
+ def test_rolling_consistency(self, window, min_periods, center):
+
+ # suppress warnings about empty slices, as we are deliberately testing
+ # with empty/0-length Series/DataFrames
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore",
+ message=".*(empty slice|0 for slice).*",
+ category=RuntimeWarning)
+
+ # test consistency between different rolling_* moments
+ self._test_moments_consistency(
+ min_periods=min_periods,
+ count=lambda x: (
+ x.rolling(window=window, center=center)
+ .count()),
+ mean=lambda x: (
+ x.rolling(window=window, min_periods=min_periods,
+ center=center).mean()),
+ mock_mean=lambda x: (
+ x.rolling(window=window,
+ min_periods=min_periods,
+ center=center).sum()
+ .divide(x.rolling(window=window,
+ min_periods=min_periods,
+ center=center).count())),
+ corr=lambda x, y: (
+ x.rolling(window=window, min_periods=min_periods,
+ center=center).corr(y)),
+
+ var_unbiased=lambda x: (
+ x.rolling(window=window, min_periods=min_periods,
+ center=center).var()),
+
+ std_unbiased=lambda x: (
+ x.rolling(window=window, min_periods=min_periods,
+ center=center).std()),
+
+ cov_unbiased=lambda x, y: (
+ x.rolling(window=window, min_periods=min_periods,
+ center=center).cov(y)),
+
+ var_biased=lambda x: (
+ x.rolling(window=window, min_periods=min_periods,
+ center=center).var(ddof=0)),
+
+ std_biased=lambda x: (
+ x.rolling(window=window, min_periods=min_periods,
+ center=center).std(ddof=0)),
+
+ cov_biased=lambda x, y: (
+ x.rolling(window=window, min_periods=min_periods,
+ center=center).cov(y, ddof=0)),
+ var_debiasing_factors=lambda x: (
+ x.rolling(window=window, center=center).count()
+ .divide((x.rolling(window=window, center=center)
+ .count() - 1.)
+ .replace(0., np.nan))))
+
+ # test consistency between rolling_xyz() and either (a)
+ # rolling_apply of Series.xyz(), or (b) rolling_apply of
+ # np.nanxyz()
+ for (x, is_constant, no_nans) in self.data:
+ functions = self.base_functions
+
+ # GH 8269
+ if no_nans:
+ functions = self.base_functions + self.no_nan_functions
+ for (f, require_min_periods, name) in functions:
+ rolling_f = getattr(
+ x.rolling(window=window, center=center,
+ min_periods=min_periods), name)
+
+ if require_min_periods and (
+ min_periods is not None) and (
+ min_periods < require_min_periods):
+ continue
+
+ if name == 'count':
+ rolling_f_result = rolling_f()
+ rolling_apply_f_result = x.rolling(
+ window=window, min_periods=0,
+ center=center).apply(func=f, raw=True)
+ else:
+ if name in ['cov', 'corr']:
+ rolling_f_result = rolling_f(
+ pairwise=False)
+ else:
+ rolling_f_result = rolling_f()
+ rolling_apply_f_result = x.rolling(
+ window=window, min_periods=min_periods,
+ center=center).apply(func=f, raw=True)
+
+ # GH 9422
+ if name in ['sum', 'prod']:
+ assert_equal(rolling_f_result,
+ rolling_apply_f_result)
+
+ # binary moments
+ def test_rolling_cov(self):
+ A = self.series
+ B = A + randn(len(A))
+
+ result = A.rolling(window=50, min_periods=25).cov(B)
+ tm.assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1])
+
+ def test_rolling_cov_pairwise(self):
+ self._check_pairwise_moment('rolling', 'cov', window=10, min_periods=5)
+
+ def test_rolling_corr(self):
+ A = self.series
+ B = A + randn(len(A))
+
+ result = A.rolling(window=50, min_periods=25).corr(B)
+ tm.assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1])
+
+ # test for correct bias correction
+ a = tm.makeTimeSeries()
+ b = tm.makeTimeSeries()
+ a[:5] = np.nan
+ b[:10] = np.nan
+
+ result = a.rolling(window=len(a), min_periods=1).corr(b)
+ tm.assert_almost_equal(result[-1], a.corr(b))
+
+ def test_rolling_corr_pairwise(self):
+ self._check_pairwise_moment('rolling', 'corr', window=10,
+ min_periods=5)
+
+ @pytest.mark.parametrize('window', range(7))
+ def test_rolling_corr_with_zero_variance(self, window):
+ # GH 18430
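+ # a constant series has zero variance, so its correlation with
+ # anything is 0/0 and must come back as NaN for every window size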
+ s = pd.Series(np.zeros(20))
+ other = pd.Series(np.arange(20))
+
+ assert s.rolling(window=window).corr(other=other).isna().all()
+
+ def _check_pairwise_moment(self, dispatch, name, **kwargs):
+ def get_result(obj, obj2=None):
+ return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2)
+
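+ # the pairwise result is indexed by (index, column); slice out
+ # column 1 vs column 5 and compare with the direct binary call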
+ result = get_result(self.frame)
+ result = result.loc[(slice(None), 1), 5]
+ result.index = result.index.droplevel(1)
+ expected = get_result(self.frame[1], self.frame[5])
+ tm.assert_series_equal(result, expected, check_names=False)
+
+ def test_flex_binary_moment(self):
+ # GH3155
+ # don't blow the stack
+ pytest.raises(TypeError, rwindow._flex_binary_moment, 5, 6, None)
+
+ def test_corr_sanity(self):
+ # GH 3155
+ df = DataFrame(np.array(
+ [[0.87024726, 0.18505595], [0.64355431, 0.3091617],
+ [0.92372966, 0.50552513], [0.00203756, 0.04520709],
+ [0.84780328, 0.33394331], [0.78369152, 0.63919667]]))
+
+ res = df[0].rolling(5, center=True).corr(df[1])
+ assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res)
+
+ # and some fuzzing
+ for _ in range(10):
+ df = DataFrame(np.random.rand(30, 2))
+ res = df[0].rolling(5, center=True).corr(df[1])
+ try:
+ assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res)
+ except AssertionError:
+ print(res)
+
+ @pytest.mark.parametrize('method', ['corr', 'cov'])
+ def test_flex_binary_frame(self, method):
+ series = self.frame[1]
+
+ res = getattr(series.rolling(window=10), method)(self.frame)
+ res2 = getattr(self.frame.rolling(window=10), method)(series)
+ exp = self.frame.apply(lambda x: getattr(
+ series.rolling(window=10), method)(x))
+
+ tm.assert_frame_equal(res, exp)
+ tm.assert_frame_equal(res2, exp)
+
+ frame2 = self.frame.copy()
+ frame2.values[:] = np.random.randn(*frame2.shape)
+
+ res3 = getattr(self.frame.rolling(window=10), method)(frame2)
+ exp = DataFrame({k: getattr(self.frame[k].rolling(
+ window=10), method)(frame2[k]) for k in self.frame})
+ tm.assert_frame_equal(res3, exp)
+
+ def test_ewmcov(self):
+ self._check_binary_ew('cov')
+
+ def test_ewmcov_pairwise(self):
+ self._check_pairwise_moment('ewm', 'cov', span=10, min_periods=5)
+
+ def test_ewmcorr(self):
+ self._check_binary_ew('corr')
+
+ def test_ewmcorr_pairwise(self):
+ self._check_pairwise_moment('ewm', 'corr', span=10, min_periods=5)
+
+ def _check_binary_ew(self, name):
+ def func(A, B, com, **kwargs):
+ return getattr(A.ewm(com, **kwargs), name)(B)
+
+ A = Series(randn(50), index=np.arange(50))
+ B = A[2:] + randn(48)
+
+ A[:10] = np.NaN
+ B[-10:] = np.NaN
+
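+ # A is missing its first 10 values and B (which starts at index 2)
+ # its last 10, so valid pairs begin at index 10; with min_periods=5
+ # the first finite output appears at index 14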
+ result = func(A, B, 20, min_periods=5)
+ assert np.isnan(result.values[:14]).all()
+ assert not np.isnan(result.values[14:]).any()
+
+ # GH 7898
+ for min_periods in (0, 1, 2):
+ result = func(A, B, 20, min_periods=min_periods)
+ # binary functions (ewmcov, ewmcorr) with bias=False require at
+ # least two values
+ assert np.isnan(result.values[:11]).all()
+ assert not np.isnan(result.values[11:]).any()
+
+ # check series of length 0
+ result = func(Series([]), Series([]), 50, min_periods=min_periods)
+ tm.assert_series_equal(result, Series([]))
+
+ # check series of length 1
+ result = func(
+ Series([1.]), Series([1.]), 50, min_periods=min_periods)
+ tm.assert_series_equal(result, Series([np.NaN]))
+
+ pytest.raises(Exception, func, A, randn(50), 20, min_periods=5)
+
+ def test_expanding_apply_args_kwargs(self, raw):
+
+ def mean_w_arg(x, const):
+ return np.mean(x) + const
+
+ df = DataFrame(np.random.rand(20, 3))
+
+ expected = df.expanding().apply(np.mean, raw=raw) + 20.
+
+ result = df.expanding().apply(mean_w_arg,
+ raw=raw,
+ args=(20, ))
+ tm.assert_frame_equal(result, expected)
+
+ result = df.expanding().apply(mean_w_arg,
+ raw=raw,
+ kwargs={'const': 20})
+ tm.assert_frame_equal(result, expected)
+
+ def test_expanding_corr(self):
+ A = self.series.dropna()
+ B = (A + randn(len(A)))[:-5]
+
+ result = A.expanding().corr(B)
+
+ rolling_result = A.rolling(window=len(A), min_periods=1).corr(B)
+
+ tm.assert_almost_equal(rolling_result, result)
+
+ def test_expanding_count(self):
+ result = self.series.expanding().count()
+ tm.assert_almost_equal(result, self.series.rolling(
+ window=len(self.series)).count())
+
+ def test_expanding_quantile(self):
+ result = self.series.expanding().quantile(0.5)
+
+ rolling_result = self.series.rolling(window=len(self.series),
+ min_periods=1).quantile(0.5)
+
+ tm.assert_almost_equal(result, rolling_result)
+
+ def test_expanding_cov(self):
+ A = self.series
+ B = (A + randn(len(A)))[:-5]
+
+ result = A.expanding().cov(B)
+
+ rolling_result = A.rolling(window=len(A), min_periods=1).cov(B)
+
+ tm.assert_almost_equal(rolling_result, result)
+
+ def test_expanding_cov_pairwise(self):
+ result = self.frame.expanding().cov()
+
+ rolling_result = self.frame.rolling(window=len(self.frame),
+ min_periods=1).cov()
+
+ tm.assert_frame_equal(result, rolling_result)
+
+ def test_expanding_corr_pairwise(self):
+ result = self.frame.expanding().corr()
+
+ rolling_result = self.frame.rolling(window=len(self.frame),
+ min_periods=1).corr()
+ tm.assert_frame_equal(result, rolling_result)
+
+ def test_expanding_cov_diff_index(self):
+ # GH 7512
+ s1 = Series([1, 2, 3], index=[0, 1, 2])
+ s2 = Series([1, 3], index=[0, 2])
+ result = s1.expanding().cov(s2)
+ expected = Series([None, None, 2.0])
+ tm.assert_series_equal(result, expected)
+
+ s2a = Series([1, None, 3], index=[0, 1, 2])
+ result = s1.expanding().cov(s2a)
+ tm.assert_series_equal(result, expected)
+
+ s1 = Series([7, 8, 10], index=[0, 1, 3])
+ s2 = Series([7, 9, 10], index=[0, 2, 3])
+ result = s1.expanding().cov(s2)
+ expected = Series([None, None, None, 4.5])
+ tm.assert_series_equal(result, expected)
+
+ def test_expanding_corr_diff_index(self):
+ # GH 7512
+ s1 = Series([1, 2, 3], index=[0, 1, 2])
+ s2 = Series([1, 3], index=[0, 2])
+ result = s1.expanding().corr(s2)
+ expected = Series([None, None, 1.0])
+ tm.assert_series_equal(result, expected)
+
+ s2a = Series([1, None, 3], index=[0, 1, 2])
+ result = s1.expanding().corr(s2a)
+ tm.assert_series_equal(result, expected)
+
+ s1 = Series([7, 8, 10], index=[0, 1, 3])
+ s2 = Series([7, 9, 10], index=[0, 2, 3])
+ result = s1.expanding().corr(s2)
+ expected = Series([None, None, None, 1.])
+ tm.assert_series_equal(result, expected)
+
+ def test_rolling_cov_diff_length(self):
+ # GH 7512
+ s1 = Series([1, 2, 3], index=[0, 1, 2])
+ s2 = Series([1, 3], index=[0, 2])
+ result = s1.rolling(window=3, min_periods=2).cov(s2)
+ expected = Series([None, None, 2.0])
+ tm.assert_series_equal(result, expected)
+
+ s2a = Series([1, None, 3], index=[0, 1, 2])
+ result = s1.rolling(window=3, min_periods=2).cov(s2a)
+ tm.assert_series_equal(result, expected)
+
+ def test_rolling_corr_diff_length(self):
+ # GH 7512
+ s1 = Series([1, 2, 3], index=[0, 1, 2])
+ s2 = Series([1, 3], index=[0, 2])
+ result = s1.rolling(window=3, min_periods=2).corr(s2)
+ expected = Series([None, None, 1.0])
+ tm.assert_series_equal(result, expected)
+
+ s2a = Series([1, None, 3], index=[0, 1, 2])
+ result = s1.rolling(window=3, min_periods=2).corr(s2a)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ 'f',
+ [
+ lambda x: (x.rolling(window=10, min_periods=5)
+ .cov(x, pairwise=False)),
+ lambda x: (x.rolling(window=10, min_periods=5)
+ .corr(x, pairwise=False)),
+ lambda x: x.rolling(window=10, min_periods=5).max(),
+ lambda x: x.rolling(window=10, min_periods=5).min(),
+ lambda x: x.rolling(window=10, min_periods=5).sum(),
+ lambda x: x.rolling(window=10, min_periods=5).mean(),
+ lambda x: x.rolling(window=10, min_periods=5).std(),
+ lambda x: x.rolling(window=10, min_periods=5).var(),
+ lambda x: x.rolling(window=10, min_periods=5).skew(),
+ lambda x: x.rolling(window=10, min_periods=5).kurt(),
+ lambda x: x.rolling(
+ window=10, min_periods=5).quantile(quantile=0.5),
+ lambda x: x.rolling(window=10, min_periods=5).median(),
+ lambda x: x.rolling(window=10, min_periods=5).apply(
+ sum, raw=False),
+ lambda x: x.rolling(window=10, min_periods=5).apply(
+ sum, raw=True),
+ lambda x: x.rolling(win_type='boxcar',
+ window=10, min_periods=5).mean()])
+ def test_rolling_functions_window_non_shrinkage(self, f):
+ # GH 7764
+ s = Series(range(4))
+ s_expected = Series(np.nan, index=s.index)
+ df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B'])
+ df_expected = DataFrame(np.nan, index=df.index, columns=df.columns)
+
+ try:
+ s_result = f(s)
+ tm.assert_series_equal(s_result, s_expected)
+
+ df_result = f(df)
+ tm.assert_frame_equal(df_result, df_expected)
+ except (ImportError):
+
+ # scipy needed for rolling_window
+ pytest.skip("scipy not available")
+
+ def test_rolling_functions_window_non_shrinkage_binary(self):
+
+ # corr/cov return a MI DataFrame
+ df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]],
+ columns=Index(['A', 'B'], name='foo'),
+ index=Index(range(4), name='bar'))
+ df_expected = DataFrame(
+ columns=Index(['A', 'B'], name='foo'),
+ index=pd.MultiIndex.from_product([df.index, df.columns],
+ names=['bar', 'foo']),
+ dtype='float64')
+ functions = [lambda x: (x.rolling(window=10, min_periods=5)
+ .cov(x, pairwise=True)),
+ lambda x: (x.rolling(window=10, min_periods=5)
+ .corr(x, pairwise=True))]
+ for f in functions:
+ df_result = f(df)
+ tm.assert_frame_equal(df_result, df_expected)
+
+ def test_moment_functions_zero_length(self):
+ # GH 8056
+ s = Series()
+ s_expected = s
+ df1 = DataFrame()
+ df1_expected = df1
+ df2 = DataFrame(columns=['a'])
+ df2['a'] = df2['a'].astype('float64')
+ df2_expected = df2
+
+ functions = [lambda x: x.expanding().count(),
+ lambda x: x.expanding(min_periods=5).cov(
+ x, pairwise=False),
+ lambda x: x.expanding(min_periods=5).corr(
+ x, pairwise=False),
+ lambda x: x.expanding(min_periods=5).max(),
+ lambda x: x.expanding(min_periods=5).min(),
+ lambda x: x.expanding(min_periods=5).sum(),
+ lambda x: x.expanding(min_periods=5).mean(),
+ lambda x: x.expanding(min_periods=5).std(),
+ lambda x: x.expanding(min_periods=5).var(),
+ lambda x: x.expanding(min_periods=5).skew(),
+ lambda x: x.expanding(min_periods=5).kurt(),
+ lambda x: x.expanding(min_periods=5).quantile(0.5),
+ lambda x: x.expanding(min_periods=5).median(),
+ lambda x: x.expanding(min_periods=5).apply(
+ sum, raw=False),
+ lambda x: x.expanding(min_periods=5).apply(
+ sum, raw=True),
+ lambda x: x.rolling(window=10).count(),
+ lambda x: x.rolling(window=10, min_periods=5).cov(
+ x, pairwise=False),
+ lambda x: x.rolling(window=10, min_periods=5).corr(
+ x, pairwise=False),
+ lambda x: x.rolling(window=10, min_periods=5).max(),
+ lambda x: x.rolling(window=10, min_periods=5).min(),
+ lambda x: x.rolling(window=10, min_periods=5).sum(),
+ lambda x: x.rolling(window=10, min_periods=5).mean(),
+ lambda x: x.rolling(window=10, min_periods=5).std(),
+ lambda x: x.rolling(window=10, min_periods=5).var(),
+ lambda x: x.rolling(window=10, min_periods=5).skew(),
+ lambda x: x.rolling(window=10, min_periods=5).kurt(),
+ lambda x: x.rolling(
+ window=10, min_periods=5).quantile(0.5),
+ lambda x: x.rolling(window=10, min_periods=5).median(),
+ lambda x: x.rolling(window=10, min_periods=5).apply(
+ sum, raw=False),
+ lambda x: x.rolling(window=10, min_periods=5).apply(
+ sum, raw=True),
+ lambda x: x.rolling(win_type='boxcar',
+ window=10, min_periods=5).mean(),
+ ]
+ for f in functions:
+ try:
+ s_result = f(s)
+ tm.assert_series_equal(s_result, s_expected)
+
+ df1_result = f(df1)
+ tm.assert_frame_equal(df1_result, df1_expected)
+
+ df2_result = f(df2)
+ tm.assert_frame_equal(df2_result, df2_expected)
+ except (ImportError):
+
+ # scipy needed for rolling_window
+ continue
+
+ def test_moment_functions_zero_length_pairwise(self):
+
+ df1 = DataFrame()
+ df1_expected = df1
+ df2 = DataFrame(columns=Index(['a'], name='foo'),
+ index=Index([], name='bar'))
+ df2['a'] = df2['a'].astype('float64')
+
+ df1_expected = DataFrame(
+ index=pd.MultiIndex.from_product([df1.index, df1.columns]),
+ columns=Index([]))
+ df2_expected = DataFrame(
+ index=pd.MultiIndex.from_product([df2.index, df2.columns],
+ names=['bar', 'foo']),
+ columns=Index(['a'], name='foo'),
+ dtype='float64')
+
+ functions = [lambda x: (x.expanding(min_periods=5)
+ .cov(x, pairwise=True)),
+ lambda x: (x.expanding(min_periods=5)
+ .corr(x, pairwise=True)),
+ lambda x: (x.rolling(window=10, min_periods=5)
+ .cov(x, pairwise=True)),
+ lambda x: (x.rolling(window=10, min_periods=5)
+ .corr(x, pairwise=True)),
+ ]
+ for f in functions:
+ df1_result = f(df1)
+ tm.assert_frame_equal(df1_result, df1_expected)
+
+ df2_result = f(df2)
+ tm.assert_frame_equal(df2_result, df2_expected)
+
+ def test_expanding_cov_pairwise_diff_length(self):
+ # GH 7512
+ df1 = DataFrame([[1, 5], [3, 2], [3, 9]],
+ columns=Index(['A', 'B'], name='foo'))
+ df1a = DataFrame([[1, 5], [3, 9]],
+ index=[0, 2],
+ columns=Index(['A', 'B'], name='foo'))
+ df2 = DataFrame([[5, 6], [None, None], [2, 1]],
+ columns=Index(['X', 'Y'], name='foo'))
+ df2a = DataFrame([[5, 6], [2, 1]],
+ index=[0, 2],
+ columns=Index(['X', 'Y'], name='foo'))
+ # TODO: xref gh-15826
+ # .loc is not preserving the names
+ result1 = df1.expanding().cov(df2, pairwise=True).loc[2]
+ result2 = df1.expanding().cov(df2a, pairwise=True).loc[2]
+ result3 = df1a.expanding().cov(df2, pairwise=True).loc[2]
+ result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2]
+ expected = DataFrame([[-3.0, -6.0], [-5.0, -10.0]],
+ columns=Index(['A', 'B'], name='foo'),
+ index=Index(['X', 'Y'], name='foo'))
+ tm.assert_frame_equal(result1, expected)
+ tm.assert_frame_equal(result2, expected)
+ tm.assert_frame_equal(result3, expected)
+ tm.assert_frame_equal(result4, expected)
+
+ def test_expanding_corr_pairwise_diff_length(self):
+ # GH 7512
+ df1 = DataFrame([[1, 2], [3, 2], [3, 4]],
+ columns=['A', 'B'],
+ index=Index(range(3), name='bar'))
+ df1a = DataFrame([[1, 2], [3, 4]],
+ index=Index([0, 2], name='bar'),
+ columns=['A', 'B'])
+ df2 = DataFrame([[5, 6], [None, None], [2, 1]],
+ columns=['X', 'Y'],
+ index=Index(range(3), name='bar'))
+ df2a = DataFrame([[5, 6], [2, 1]],
+ index=Index([0, 2], name='bar'),
+ columns=['X', 'Y'])
+ result1 = df1.expanding().corr(df2, pairwise=True).loc[2]
+ result2 = df1.expanding().corr(df2a, pairwise=True).loc[2]
+ result3 = df1a.expanding().corr(df2, pairwise=True).loc[2]
+ result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2]
+ expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]],
+ columns=['A', 'B'],
+ index=Index(['X', 'Y']))
+ tm.assert_frame_equal(result1, expected)
+ tm.assert_frame_equal(result2, expected)
+ tm.assert_frame_equal(result3, expected)
+ tm.assert_frame_equal(result4, expected)
+
+ def test_rolling_skew_edge_cases(self):
+
+ all_nan = Series([np.NaN] * 5)
+
+ # yields all NaN (0 variance)
+ d = Series([1] * 5)
+ x = d.rolling(window=5).skew()
+ tm.assert_series_equal(all_nan, x)
+
+ # yields all NaN (window too small)
+ d = Series(np.random.randn(5))
+ x = d.rolling(window=2).skew()
+ tm.assert_series_equal(all_nan, x)
+
+ # yields [NaN, NaN, NaN, 0.177994, 1.548824]
+ d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401
+ ])
+ expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824])
+ x = d.rolling(window=4).skew()
+ tm.assert_series_equal(expected, x)
+
+ def test_rolling_kurt_edge_cases(self):
+
+ all_nan = Series([np.NaN] * 5)
+
+ # yields all NaN (0 variance)
+ d = Series([1] * 5)
+ x = d.rolling(window=5).kurt()
+ tm.assert_series_equal(all_nan, x)
+
+ # yields all NaN (window too small)
+ d = Series(np.random.randn(5))
+ x = d.rolling(window=3).kurt()
+ tm.assert_series_equal(all_nan, x)
+
+ # yields [NaN, NaN, NaN, 1.224307, 2.671499]
+ d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401
+ ])
+ expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499])
+ x = d.rolling(window=4).kurt()
+ tm.assert_series_equal(expected, x)
+
+ def test_rolling_skew_eq_value_fperr(self):
+ # GH 18804: rolling skew over all-equal values should return NaN
+ a = Series([1.1] * 15).rolling(window=10).skew()
+ assert np.isnan(a).all()
+
+ def test_rolling_kurt_eq_value_fperr(self):
+ # GH 18804: rolling kurt over all-equal values should return NaN
+ a = Series([1.1] * 15).rolling(window=10).kurt()
+ assert np.isnan(a).all()
+
+ @pytest.mark.parametrize('func,static_comp', [('sum', np.sum),
+ ('mean', np.mean),
+ ('max', np.max),
+ ('min', np.min)],
+ ids=['sum', 'mean', 'max', 'min'])
+ def test_expanding_func(self, func, static_comp):
+ def expanding_func(x, min_periods=1, center=False, axis=0):
+ exp = x.expanding(min_periods=min_periods,
+ center=center, axis=axis)
+ return getattr(exp, func)()
+ self._check_expanding(expanding_func, static_comp, preserve_nan=False)
+
+ def test_expanding_apply(self, raw):
+
+ def expanding_mean(x, min_periods=1):
+
+ exp = x.expanding(min_periods=min_periods)
+ result = exp.apply(lambda x: x.mean(), raw=raw)
+ return result
+
+ # TODO(jreback), needed to add preserve_nan=False
+ # here to make this pass
+ self._check_expanding(expanding_mean, np.mean, preserve_nan=False)
+
+ ser = Series([])
+ tm.assert_series_equal(ser, ser.expanding().apply(
+ lambda x: x.mean(), raw=raw))
+
+ # GH 8080
+ s = Series([None, None, None])
+ result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw)
+ expected = Series([1., 2., 3.])
+ tm.assert_series_equal(result, expected)
+
+ def _check_expanding(self, func, static_comp, has_min_periods=True,
+ has_time_rule=True, preserve_nan=True):
+
+ series_result = func(self.series)
+ assert isinstance(series_result, Series)
+ frame_result = func(self.frame)
+ assert isinstance(frame_result, DataFrame)
+
+ result = func(self.series)
+ tm.assert_almost_equal(result[10], static_comp(self.series[:11]))
+
+ if preserve_nan:
+ assert result.iloc[self._nan_locs].isna().all()
+
+ ser = Series(randn(50))
+
+ if has_min_periods:
+ result = func(ser, min_periods=30)
+ assert result[:29].isna().all()
+ tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50]))
+
+ # min_periods is working correctly
+ result = func(ser, min_periods=15)
+ assert isna(result.iloc[13])
+ assert notna(result.iloc[14])
+
+ ser2 = Series(randn(20))
+ result = func(ser2, min_periods=5)
+ assert isna(result[3])
+ assert notna(result[4])
+
+ # min_periods=0
+ result0 = func(ser, min_periods=0)
+ result1 = func(ser, min_periods=1)
+ tm.assert_almost_equal(result0, result1)
+ else:
+ result = func(ser)
+ tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50]))
+
+ def test_rolling_max_gh6297(self):
+ """Replicate result expected in GH #6297"""
+
+ indices = [datetime(1975, 1, i) for i in range(1, 6)]
+ # So that we can have 2 datapoints on one of the days
+ indices.append(datetime(1975, 1, 3, 6, 0))
+ series = Series(range(1, 7), index=indices)
+ # Use floats instead of ints as values
+ series = series.map(lambda x: float(x))
+ # Sort chronologically
+ series = series.sort_index()
+
+ expected = Series([1.0, 2.0, 6.0, 4.0, 5.0],
+ index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
+ x = series.resample('D').max().rolling(window=1).max()
+ tm.assert_series_equal(expected, x)
+
+ def test_rolling_max_resample(self):
+
+ indices = [datetime(1975, 1, i) for i in range(1, 6)]
+ # So that we can have 3 datapoints on last day (4, 10, and 20)
+ indices.append(datetime(1975, 1, 5, 1))
+ indices.append(datetime(1975, 1, 5, 2))
+ series = Series(list(range(0, 5)) + [10, 20], index=indices)
+ # Use floats instead of ints as values
+ series = series.map(lambda x: float(x))
+ # Sort chronologically
+ series = series.sort_index()
+
+ # Default how should be max
+ expected = Series([0.0, 1.0, 2.0, 3.0, 20.0],
+ index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
+ x = series.resample('D').max().rolling(window=1).max()
+ tm.assert_series_equal(expected, x)
+
+ # Now specify median (10.0)
+ expected = Series([0.0, 1.0, 2.0, 3.0, 10.0],
+ index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
+ x = series.resample('D').median().rolling(window=1).max()
+ tm.assert_series_equal(expected, x)
+
+ # Now specify mean (4+10+20)/3
+ v = (4.0 + 10.0 + 20.0) / 3.0
+ expected = Series([0.0, 1.0, 2.0, 3.0, v],
+ index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
+ x = series.resample('D').mean().rolling(window=1).max()
+ tm.assert_series_equal(expected, x)
+
+ def test_rolling_min_resample(self):
+
+ indices = [datetime(1975, 1, i) for i in range(1, 6)]
+ # So that we can have 3 datapoints on last day (4, 10, and 20)
+ indices.append(datetime(1975, 1, 5, 1))
+ indices.append(datetime(1975, 1, 5, 2))
+ series = Series(list(range(0, 5)) + [10, 20], index=indices)
+ # Use floats instead of ints as values
+ series = series.map(lambda x: float(x))
+ # Sort chronologically
+ series = series.sort_index()
+
+ # Default how should be min
+ expected = Series([0.0, 1.0, 2.0, 3.0, 4.0],
+ index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
+ r = series.resample('D').min().rolling(window=1)
+ tm.assert_series_equal(expected, r.min())
+
+ def test_rolling_median_resample(self):
+
+ indices = [datetime(1975, 1, i) for i in range(1, 6)]
+ # So that we can have 3 datapoints on last day (4, 10, and 20)
+ indices.append(datetime(1975, 1, 5, 1))
+ indices.append(datetime(1975, 1, 5, 2))
+ series = Series(list(range(0, 5)) + [10, 20], index=indices)
+ # Use floats instead of ints as values
+ series = series.map(lambda x: float(x))
+ # Sort chronologically
+ series = series.sort_index()
+
+ # Default how should be median
+ expected = Series([0.0, 1.0, 2.0, 3.0, 10],
+ index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
+ x = series.resample('D').median().rolling(window=1).median()
+ tm.assert_series_equal(expected, x)
+
+ def test_rolling_median_memory_error(self):
+ # GH11722
+ n = 20000
+ Series(np.random.randn(n)).rolling(window=2, center=False).median()
+ Series(np.random.randn(n)).rolling(window=2, center=False).median()
+
+ def test_rolling_min_max_numeric_types(self):
+
+ # GH12373
+ types_test = [np.dtype("f{}".format(width)) for width in [4, 8]]
+ types_test.extend([np.dtype("{}{}".format(sign, width))
+ for width in [1, 2, 4, 8] for sign in "ui"])
+ for data_type in types_test:
+ # Just testing that these don't throw exceptions and that
+ # the return type is float64. Other tests will cover quantitative
+ # correctness
+ result = (DataFrame(np.arange(20, dtype=data_type))
+ .rolling(window=5).max())
+ assert result.dtypes[0] == np.dtype("f8")
+ result = (DataFrame(np.arange(20, dtype=data_type))
+ .rolling(window=5).min())
+ assert result.dtypes[0] == np.dtype("f8")
+
+
+class TestGrouperGrouping(object):
+
+ def setup_method(self, method):
+ self.series = Series(np.arange(10))
+ self.frame = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8,
+ 'B': np.arange(40)})
+
+ def test_mutated(self):
+
+ def f():
+ self.frame.groupby('A', foo=1)
+ pytest.raises(TypeError, f)
+
+ g = self.frame.groupby('A')
+ assert not g.mutated
+ g = self.frame.groupby('A', mutated=True)
+ assert g.mutated
+
+ def test_getitem(self):
+ g = self.frame.groupby('A')
+ g_mutated = self.frame.groupby('A', mutated=True)
+
+ expected = g_mutated.B.apply(lambda x: x.rolling(2).mean())
+
+ result = g.rolling(2).mean().B
+ tm.assert_series_equal(result, expected)
+
+ result = g.rolling(2).B.mean()
+ tm.assert_series_equal(result, expected)
+
+ result = g.B.rolling(2).mean()
+ tm.assert_series_equal(result, expected)
+
+ result = self.frame.B.groupby(self.frame.A).rolling(2).mean()
+ tm.assert_series_equal(result, expected)
+
+ def test_getitem_multiple(self):
+
+ # GH 13174
+ g = self.frame.groupby('A')
+ r = g.rolling(2)
+ g_mutated = self.frame.groupby('A', mutated=True)
+ expected = g_mutated.B.apply(lambda x: x.rolling(2).count())
+
+ result = r.B.count()
+ tm.assert_series_equal(result, expected)
+
+ result = r.B.count()
+ tm.assert_series_equal(result, expected)
+
+ def test_rolling(self):
+ g = self.frame.groupby('A')
+ r = g.rolling(window=4)
+
+ for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']:
+
+ result = getattr(r, f)()
+ expected = g.apply(lambda x: getattr(x.rolling(4), f)())
+ tm.assert_frame_equal(result, expected)
+
+ for f in ['std', 'var']:
+ result = getattr(r, f)(ddof=1)
+ expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1))
+ tm.assert_frame_equal(result, expected)
+
+ result = r.quantile(0.5)
+ expected = g.apply(lambda x: x.rolling(4).quantile(0.5))
+ tm.assert_frame_equal(result, expected)
+
+ def test_rolling_corr_cov(self):
+ g = self.frame.groupby('A')
+ r = g.rolling(window=4)
+
+ for f in ['corr', 'cov']:
+ result = getattr(r, f)(self.frame)
+
+ def func(x):
+ return getattr(x.rolling(4), f)(self.frame)
+ expected = g.apply(func)
+ tm.assert_frame_equal(result, expected)
+
+ result = getattr(r.B, f)(pairwise=True)
+
+ def func(x):
+ return getattr(x.B.rolling(4), f)(pairwise=True)
+ expected = g.apply(func)
+ tm.assert_series_equal(result, expected)
+
+ def test_rolling_apply(self, raw):
+ g = self.frame.groupby('A')
+ r = g.rolling(window=4)
+
+ # reduction
+ result = r.apply(lambda x: x.sum(), raw=raw)
+ expected = g.apply(
+ lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw))
+ tm.assert_frame_equal(result, expected)
+
+ def test_rolling_apply_mutability(self):
+ # GH 14013
+ df = pd.DataFrame({'A': ['foo'] * 3 + ['bar'] * 3, 'B': [1] * 6})
+ g = df.groupby('A')
+
+ mi = pd.MultiIndex.from_tuples([('bar', 3), ('bar', 4), ('bar', 5),
+ ('foo', 0), ('foo', 1), ('foo', 2)])
+
+ mi.names = ['A', None]
+ # Grouped column should not be a part of the output
+ expected = pd.DataFrame([np.nan, 2., 2.] * 2, columns=['B'], index=mi)
+
+ result = g.rolling(window=2).sum()
+ tm.assert_frame_equal(result, expected)
+
+ # Call an arbitrary function on the groupby
+ g.sum()
+
+ # Make sure nothing has been mutated
+ result = g.rolling(window=2).sum()
+ tm.assert_frame_equal(result, expected)
+
+ def test_expanding(self):
+ g = self.frame.groupby('A')
+ r = g.expanding()
+
+ for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']:
+
+ result = getattr(r, f)()
+ expected = g.apply(lambda x: getattr(x.expanding(), f)())
+ tm.assert_frame_equal(result, expected)
+
+ for f in ['std', 'var']:
+ result = getattr(r, f)(ddof=0)
+ expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0))
+ tm.assert_frame_equal(result, expected)
+
+ result = r.quantile(0.5)
+ expected = g.apply(lambda x: x.expanding().quantile(0.5))
+ tm.assert_frame_equal(result, expected)
+
+ def test_expanding_corr_cov(self):
+ g = self.frame.groupby('A')
+ r = g.expanding()
+
+ for f in ['corr', 'cov']:
+ result = getattr(r, f)(self.frame)
+
+ def func(x):
+ return getattr(x.expanding(), f)(self.frame)
+ expected = g.apply(func)
+ tm.assert_frame_equal(result, expected)
+
+ result = getattr(r.B, f)(pairwise=True)
+
+ def func(x):
+ return getattr(x.B.expanding(), f)(pairwise=True)
+ expected = g.apply(func)
+ tm.assert_series_equal(result, expected)
+
+ def test_expanding_apply(self, raw):
+ g = self.frame.groupby('A')
+ r = g.expanding()
+
+ # reduction
+ result = r.apply(lambda x: x.sum(), raw=raw)
+ expected = g.apply(
+ lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw))
+ tm.assert_frame_equal(result, expected)
+
+
+class TestRollingTS(object):
+
+ # rolling time-series friendly
+ # xref GH13327
+
+ def setup_method(self, method):
+
+ self.regular = DataFrame({'A': pd.date_range('20130101',
+ periods=5,
+ freq='s'),
+ 'B': range(5)}).set_index('A')
+
+ self.ragged = DataFrame({'B': range(5)})
+ self.ragged.index = [Timestamp('20130101 09:00:00'),
+ Timestamp('20130101 09:00:02'),
+ Timestamp('20130101 09:00:03'),
+ Timestamp('20130101 09:00:05'),
+ Timestamp('20130101 09:00:06')]
+
+ def test_doc_string(self):
+
+ df = DataFrame({'B': [0, 1, 2, np.nan, 4]},
+ index=[Timestamp('20130101 09:00:00'),
+ Timestamp('20130101 09:00:02'),
+ Timestamp('20130101 09:00:03'),
+ Timestamp('20130101 09:00:05'),
+ Timestamp('20130101 09:00:06')])
+ df
+ df.rolling('2s').sum()
+
+ def test_valid(self):
+
+ df = self.regular
+
+ # not a valid freq
+ with pytest.raises(ValueError):
+ df.rolling(window='foobar')
+
+ # not a datetimelike index
+ with pytest.raises(ValueError):
+ df.reset_index().rolling(window='foobar')
+
+ # non-fixed freqs
+ for freq in ['2MS', pd.offsets.MonthBegin(2)]:
+ with pytest.raises(ValueError):
+ df.rolling(window=freq)
+
+ for freq in ['1D', pd.offsets.Day(2), '2ms']:
+ df.rolling(window=freq)
+
+ # non-integer min_periods
+ for minp in [1.0, 'foo', np.array([1, 2, 3])]:
+ with pytest.raises(ValueError):
+ df.rolling(window='1D', min_periods=minp)
+
+ # center is not implemented
+ with pytest.raises(NotImplementedError):
+ df.rolling(window='1D', center=True)
+
+ def test_on(self):
+
+ df = self.regular
+
+ # not a valid column
+ with pytest.raises(ValueError):
+ df.rolling(window='2s', on='foobar')
+
+ # column is valid
+ df = df.copy()
+ df['C'] = pd.date_range('20130101', periods=len(df))
+ df.rolling(window='2d', on='C').sum()
+
+ # invalid columns
+ with pytest.raises(ValueError):
+ df.rolling(window='2d', on='B')
+
+ # ok even though 'on' is not among the selected columns
+ df.rolling(window='2d', on='C').B.sum()
+
+ def test_monotonic_on(self):
+
+ # on/index must be monotonic
+ df = DataFrame({'A': pd.date_range('20130101',
+ periods=5,
+ freq='s'),
+ 'B': range(5)})
+
+ assert df.A.is_monotonic
+ df.rolling('2s', on='A').sum()
+
+ df = df.set_index('A')
+ assert df.index.is_monotonic
+ df.rolling('2s').sum()
+
+ # non-monotonic
+ df.index = reversed(df.index.tolist())
+ assert not df.index.is_monotonic
+
+ with pytest.raises(ValueError):
+ df.rolling('2s').sum()
+
+ df = df.reset_index()
+ with pytest.raises(ValueError):
+ df.rolling('2s', on='A').sum()
+
+ def test_frame_on(self):
+
+ df = DataFrame({'B': range(5),
+ 'C': pd.date_range('20130101 09:00:00',
+ periods=5,
+ freq='3s')})
+
+ df['A'] = [Timestamp('20130101 09:00:00'),
+ Timestamp('20130101 09:00:02'),
+ Timestamp('20130101 09:00:03'),
+ Timestamp('20130101 09:00:05'),
+ Timestamp('20130101 09:00:06')]
+
+ # simulate the index-based rolling by using the 'on' keyword
+ expected = (df.set_index('A')
+ .rolling('2s')
+ .B
+ .sum()
+ .reset_index(drop=True)
+ )
+
+ result = (df.rolling('2s', on='A')
+ .B
+ .sum()
+ )
+ tm.assert_series_equal(result, expected)
+
+ # test as a frame
+ # we should be ignoring the 'on' as an aggregation column
+ # note that the expected is built via set_index -> compute ->
+ # reset_index, so its columns must be reordered to match the
+ # actual result, which keeps the original column order
+ expected = (df.set_index('A')
+ .rolling('2s')[['B']]
+ .sum()
+ .reset_index()[['B', 'A']]
+ )
+
+ result = (df.rolling('2s', on='A')[['B']]
+ .sum()
+ )
+ tm.assert_frame_equal(result, expected)
+
+ def test_frame_on2(self):
+
+ # using multiple aggregation columns
+ df = DataFrame({'A': [0, 1, 2, 3, 4],
+ 'B': [0, 1, 2, np.nan, 4],
+ 'C': Index([Timestamp('20130101 09:00:00'),
+ Timestamp('20130101 09:00:02'),
+ Timestamp('20130101 09:00:03'),
+ Timestamp('20130101 09:00:05'),
+ Timestamp('20130101 09:00:06')])},
+ columns=['A', 'C', 'B'])
+
+ expected1 = DataFrame({'A': [0., 1, 3, 3, 7],
+ 'B': [0, 1, 3, np.nan, 4],
+ 'C': df['C']},
+ columns=['A', 'C', 'B'])
+
+ result = df.rolling('2s', on='C').sum()
+ expected = expected1
+ tm.assert_frame_equal(result, expected)
+
+ expected = Series([0, 1, 3, np.nan, 4], name='B')
+ result = df.rolling('2s', on='C').B.sum()
+ tm.assert_series_equal(result, expected)
+
+ expected = expected1[['A', 'B', 'C']]
+ result = df.rolling('2s', on='C')[['A', 'B', 'C']].sum()
+ tm.assert_frame_equal(result, expected)
+
+ def test_basic_regular(self):
+
+ df = self.regular.copy()
+
+ df.index = pd.date_range('20130101', periods=5, freq='D')
+ expected = df.rolling(window=1, min_periods=1).sum()
+ result = df.rolling(window='1D').sum()
+ tm.assert_frame_equal(result, expected)
+
+ df.index = pd.date_range('20130101', periods=5, freq='2D')
+ expected = df.rolling(window=1, min_periods=1).sum()
+ result = df.rolling(window='2D', min_periods=1).sum()
+ tm.assert_frame_equal(result, expected)
+
+ expected = df.rolling(window=1, min_periods=1).sum()
+ result = df.rolling(window='2D', min_periods=1).sum()
+ tm.assert_frame_equal(result, expected)
+
+ expected = df.rolling(window=1).sum()
+ result = df.rolling(window='2D').sum()
+ tm.assert_frame_equal(result, expected)
+
+ def test_min_periods(self):
+
+ # compare for min_periods
+ df = self.regular
+
+ # these are slightly different specs that agree on a regular index
+ expected = df.rolling(2, min_periods=1).sum()
+ result = df.rolling('2s').sum()
+ tm.assert_frame_equal(result, expected)
+
+ expected = df.rolling(2, min_periods=1).sum()
+ result = df.rolling('2s', min_periods=1).sum()
+ tm.assert_frame_equal(result, expected)
+
+ def test_closed(self):
+
+ # xref GH13965
+
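+ # with a 2s window ending at each timestamp, `closed` controls
+ # whether the window endpoints are included; the expected sums
+ # below enumerate all four options on this ragged index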
+ df = DataFrame({'A': [1] * 5},
+ index=[Timestamp('20130101 09:00:01'),
+ Timestamp('20130101 09:00:02'),
+ Timestamp('20130101 09:00:03'),
+ Timestamp('20130101 09:00:04'),
+ Timestamp('20130101 09:00:06')])
+
+ # closed must be 'right', 'left', 'both', 'neither'
+ with pytest.raises(ValueError):
+ self.regular.rolling(window='2s', closed="blabla")
+
+ expected = df.copy()
+ expected["A"] = [1.0, 2, 2, 2, 1]
+ result = df.rolling('2s', closed='right').sum()
+ tm.assert_frame_equal(result, expected)
+
+ # default should be 'right'
+ result = df.rolling('2s').sum()
+ tm.assert_frame_equal(result, expected)
+
+ expected = df.copy()
+ expected["A"] = [1.0, 2, 3, 3, 2]
+ result = df.rolling('2s', closed='both').sum()
+ tm.assert_frame_equal(result, expected)
+
+ expected = df.copy()
+ expected["A"] = [np.nan, 1.0, 2, 2, 1]
+ result = df.rolling('2s', closed='left').sum()
+ tm.assert_frame_equal(result, expected)
+
+ expected = df.copy()
+ expected["A"] = [np.nan, 1.0, 1, 1, np.nan]
+ result = df.rolling('2s', closed='neither').sum()
+ tm.assert_frame_equal(result, expected)
+
+ def test_ragged_sum(self):
+
+ df = self.ragged
+ result = df.rolling(window='1s', min_periods=1).sum()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 2, 3, 4]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='2s', min_periods=1).sum()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 3, 3, 7]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='2s', min_periods=2).sum()
+ expected = df.copy()
+ expected['B'] = [np.nan, np.nan, 3, np.nan, 7]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='3s', min_periods=1).sum()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 3, 5, 7]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='3s').sum()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 3, 5, 7]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='4s', min_periods=1).sum()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 3, 6, 9]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='4s', min_periods=3).sum()
+ expected = df.copy()
+ expected['B'] = [np.nan, np.nan, 3, 6, 9]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='5s', min_periods=1).sum()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 3, 6, 10]
+ tm.assert_frame_equal(result, expected)
+
+ def test_ragged_mean(self):
+
+ df = self.ragged
+ result = df.rolling(window='1s', min_periods=1).mean()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 2, 3, 4]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='2s', min_periods=1).mean()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 1.5, 3.0, 3.5]
+ tm.assert_frame_equal(result, expected)
+
+ def test_ragged_median(self):
+
+ df = self.ragged
+ result = df.rolling(window='1s', min_periods=1).median()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 2, 3, 4]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='2s', min_periods=1).median()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 1.5, 3.0, 3.5]
+ tm.assert_frame_equal(result, expected)
+
+ def test_ragged_quantile(self):
+
+ df = self.ragged
+ result = df.rolling(window='1s', min_periods=1).quantile(0.5)
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 2, 3, 4]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='2s', min_periods=1).quantile(0.5)
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 1.5, 3.0, 3.5]
+ tm.assert_frame_equal(result, expected)
+
+ def test_ragged_std(self):
+
+ df = self.ragged
+ result = df.rolling(window='1s', min_periods=1).std(ddof=0)
+ expected = df.copy()
+ expected['B'] = [0.0] * 5
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='1s', min_periods=1).std(ddof=1)
+ expected = df.copy()
+ expected['B'] = [np.nan] * 5
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='3s', min_periods=1).std(ddof=0)
+ expected = df.copy()
+ expected['B'] = [0.0] + [0.5] * 4
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='5s', min_periods=1).std(ddof=1)
+ expected = df.copy()
+ expected['B'] = [np.nan, 0.707107, 1.0, 1.0, 1.290994]
+ tm.assert_frame_equal(result, expected)
+
+ def test_ragged_var(self):
+
+ df = self.ragged
+ result = df.rolling(window='1s', min_periods=1).var(ddof=0)
+ expected = df.copy()
+ expected['B'] = [0.0] * 5
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='1s', min_periods=1).var(ddof=1)
+ expected = df.copy()
+ expected['B'] = [np.nan] * 5
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='3s', min_periods=1).var(ddof=0)
+ expected = df.copy()
+ expected['B'] = [0.0] + [0.25] * 4
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='5s', min_periods=1).var(ddof=1)
+ expected = df.copy()
+ expected['B'] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.]
+ tm.assert_frame_equal(result, expected)
+
+ def test_ragged_skew(self):
+
+ df = self.ragged
+ result = df.rolling(window='3s', min_periods=1).skew()
+ expected = df.copy()
+ expected['B'] = [np.nan] * 5
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='5s', min_periods=1).skew()
+ expected = df.copy()
+ expected['B'] = [np.nan] * 2 + [0.0, 0.0, 0.0]
+ tm.assert_frame_equal(result, expected)
+
+ def test_ragged_kurt(self):
+
+ df = self.ragged
+ result = df.rolling(window='3s', min_periods=1).kurt()
+ expected = df.copy()
+ expected['B'] = [np.nan] * 5
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='5s', min_periods=1).kurt()
+ expected = df.copy()
+ expected['B'] = [np.nan] * 4 + [-1.2]
+ tm.assert_frame_equal(result, expected)
+
+ def test_ragged_count(self):
+
+ df = self.ragged
+ result = df.rolling(window='1s', min_periods=1).count()
+ expected = df.copy()
+ expected['B'] = [1.0, 1, 1, 1, 1]
+ tm.assert_frame_equal(result, expected)
+
+ df = self.ragged
+ result = df.rolling(window='1s').count()
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='2s', min_periods=1).count()
+ expected = df.copy()
+ expected['B'] = [1.0, 1, 2, 1, 2]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='2s', min_periods=2).count()
+ expected = df.copy()
+ expected['B'] = [np.nan, np.nan, 2, np.nan, 2]
+ tm.assert_frame_equal(result, expected)
+
+ def test_regular_min(self):
+
+ df = DataFrame({'A': pd.date_range('20130101',
+ periods=5,
+ freq='s'),
+ 'B': [0.0, 1, 2, 3, 4]}).set_index('A')
+ result = df.rolling('1s').min()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 2, 3, 4]
+ tm.assert_frame_equal(result, expected)
+
+ df = DataFrame({'A': pd.date_range('20130101',
+ periods=5,
+ freq='s'),
+ 'B': [5, 4, 3, 4, 5]}).set_index('A')
+
+ tm.assert_frame_equal(result, expected)
+ result = df.rolling('2s').min()
+ expected = df.copy()
+ expected['B'] = [5.0, 4, 3, 3, 4]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling('5s').min()
+ expected = df.copy()
+ expected['B'] = [5.0, 4, 3, 3, 3]
+ tm.assert_frame_equal(result, expected)
+
+ def test_ragged_min(self):
+
+ df = self.ragged
+
+ result = df.rolling(window='1s', min_periods=1).min()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 2, 3, 4]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='2s', min_periods=1).min()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 1, 3, 3]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='5s', min_periods=1).min()
+ expected = df.copy()
+ expected['B'] = [0.0, 0, 0, 1, 1]
+ tm.assert_frame_equal(result, expected)
+
+ def test_perf_min(self):
+
+ N = 10000
+
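+ # on a regular 1s-frequency index a '2s' (or '200s') time window
+ # should match the equivalent integer window, up to float noise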
+ dfp = DataFrame({'B': np.random.randn(N)},
+ index=pd.date_range('20130101',
+ periods=N,
+ freq='s'))
+ expected = dfp.rolling(2, min_periods=1).min()
+ result = dfp.rolling('2s').min()
+ assert ((result - expected) < 0.01).all().bool()
+
+ expected = dfp.rolling(200, min_periods=1).min()
+ result = dfp.rolling('200s').min()
+ assert ((result - expected) < 0.01).all().bool()
+
+ def test_ragged_max(self):
+
+ df = self.ragged
+
+ result = df.rolling(window='1s', min_periods=1).max()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 2, 3, 4]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='2s', min_periods=1).max()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 2, 3, 4]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='5s', min_periods=1).max()
+ expected = df.copy()
+ expected['B'] = [0.0, 1, 2, 3, 4]
+ tm.assert_frame_equal(result, expected)
+
+ def test_ragged_apply(self, raw):
+
+ df = self.ragged
+
+ f = lambda x: 1
+ result = df.rolling(window='1s', min_periods=1).apply(f, raw=raw)
+ expected = df.copy()
+ expected['B'] = 1.
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='2s', min_periods=1).apply(f, raw=raw)
+ expected = df.copy()
+ expected['B'] = 1.
+ tm.assert_frame_equal(result, expected)
+
+ result = df.rolling(window='5s', min_periods=1).apply(f, raw=raw)
+ expected = df.copy()
+ expected['B'] = 1.
+ tm.assert_frame_equal(result, expected)
+
+ def test_all(self):
+
+ # simple comparison of integer vs time-based windowing
+ df = self.regular * 2
+ er = df.rolling(window=1)
+ r = df.rolling(window='1s')
+
+ for f in ['sum', 'mean', 'count', 'median', 'std',
+ 'var', 'kurt', 'skew', 'min', 'max']:
+
+ result = getattr(r, f)()
+ expected = getattr(er, f)()
+ tm.assert_frame_equal(result, expected)
+
+ result = r.quantile(0.5)
+ expected = er.quantile(0.5)
+ tm.assert_frame_equal(result, expected)
+
+ def test_all_apply(self, raw):
+
+ df = self.regular * 2
+ er = df.rolling(window=1)
+ r = df.rolling(window='1s')
+
+ result = r.apply(lambda x: 1, raw=raw)
+ expected = er.apply(lambda x: 1, raw=raw)
+ tm.assert_frame_equal(result, expected)
+
+ def test_all2(self):
+
+ # more sophisticated comparison of integer vs.
+ # time-based windowing
+ df = DataFrame({'B': np.arange(50)},
+ index=pd.date_range('20130101',
+ periods=50, freq='H')
+ )
+ # in-range data
+ dft = df.between_time("09:00", "16:00")
+
+ r = dft.rolling(window='5H')
+
+ for f in ['sum', 'mean', 'count', 'median', 'std',
+ 'var', 'kurt', 'skew', 'min', 'max']:
+
+ result = getattr(r, f)()
+
+ # we need to roll the days separately
+ # to compare with a time-based roll
+ # finally groupby-apply will return a multi-index
+ # so we need to drop the day
+ def agg_by_day(x):
+ x = x.between_time("09:00", "16:00")
+ return getattr(x.rolling(5, min_periods=1), f)()
+ expected = df.groupby(df.index.day).apply(
+ agg_by_day).reset_index(level=0, drop=True)
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_groupby_monotonic(self):
+
+ # GH 15130
+ # we don't need to validate monotonicity when grouping
+
+ data = [
+ ['David', '1/1/2015', 100], ['David', '1/5/2015', 500],
+ ['David', '5/30/2015', 50], ['David', '7/25/2015', 50],
+ ['Ryan', '1/4/2014', 100], ['Ryan', '1/19/2015', 500],
+ ['Ryan', '3/31/2016', 50], ['Joe', '7/1/2015', 100],
+ ['Joe', '9/9/2015', 500], ['Joe', '10/15/2015', 50]]
+
+ df = DataFrame(data=data, columns=['name', 'date', 'amount'])
+ df['date'] = pd.to_datetime(df['date'])
+
+ expected = df.set_index('date').groupby('name').apply(
+ lambda x: x.rolling('180D')['amount'].sum())
+ result = df.groupby('name').rolling('180D', on='date')['amount'].sum()
+ tm.assert_series_equal(result, expected)
+
+ def test_non_monotonic(self):
+ # GH 13966 (similar to #15130, closed by #15175)
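+ # 'B' is monotonic within each group of 'A' even though the column
+ # as a whole is not, so the grouped rolling still works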
+
+ dates = pd.date_range(start='2016-01-01 09:30:00',
+ periods=20, freq='s')
+ df = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8,
+ 'B': np.concatenate((dates, dates)),
+ 'C': np.arange(40)})
+
+ result = df.groupby('A').rolling('4s', on='B').C.mean()
+ expected = df.set_index('B').groupby('A').apply(
+ lambda x: x.rolling('4s')['C'].mean())
+ tm.assert_series_equal(result, expected)
+
+ df2 = df.sort_values('B')
+ result = df2.groupby('A').rolling('4s', on='B').C.mean()
+ tm.assert_series_equal(result, expected)
+
+ def test_rolling_cov_offset(self):
+ # GH16058
+
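+ # for consecutive integers, the sample covariance over any 2-point
+ # window is 0.5 and over any 3-point window is 1.0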
+ idx = pd.date_range('2017-01-01', periods=24, freq='1h')
+ ss = Series(np.arange(len(idx)), index=idx)
+
+ result = ss.rolling('2h').cov()
+ expected = Series([np.nan] + [0.5] * (len(idx) - 1), index=idx)
+ tm.assert_series_equal(result, expected)
+
+ expected2 = ss.rolling(2, min_periods=1).cov()
+ tm.assert_series_equal(result, expected2)
+
+ result = ss.rolling('3h').cov()
+ expected = Series([np.nan, 0.5] + [1.0] * (len(idx) - 2), index=idx)
+ tm.assert_series_equal(result, expected)
+
+ expected2 = ss.rolling(3, min_periods=1).cov()
+ tm.assert_series_equal(result, expected2)
diff --git a/contrib/python/pandas/py2/pandas/tests/tools/__init__.py b/contrib/python/pandas/py2/pandas/tests/tools/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tools/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/tools/test_numeric.py b/contrib/python/pandas/py2/pandas/tests/tools/test_numeric.py
new file mode 100644
index 00000000000..537881f3a5e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tools/test_numeric.py
@@ -0,0 +1,440 @@
+import decimal
+
+import numpy as np
+from numpy import iinfo
+import pytest
+
+import pandas as pd
+from pandas import to_numeric
+from pandas.util import testing as tm
+
+
+class TestToNumeric(object):
+
+ def test_empty(self):
+ # see gh-16302
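+ # an empty object Series converts to int64 by default; with
+ # downcast='integer' it should shrink to the smallest size, int8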
+ s = pd.Series([], dtype=object)
+
+ res = to_numeric(s)
+ expected = pd.Series([], dtype=np.int64)
+
+ tm.assert_series_equal(res, expected)
+
+ # Original issue example
+ res = to_numeric(s, errors='coerce', downcast='integer')
+ expected = pd.Series([], dtype=np.int8)
+
+ tm.assert_series_equal(res, expected)
+
+ def test_series(self):
+ s = pd.Series(['1', '-3.14', '7'])
+ res = to_numeric(s)
+ expected = pd.Series([1, -3.14, 7])
+ tm.assert_series_equal(res, expected)
+
+ s = pd.Series(['1', '-3.14', 7])
+ res = to_numeric(s)
+ tm.assert_series_equal(res, expected)
+
+ def test_series_numeric(self):
+ s = pd.Series([1, 3, 4, 5], index=list('ABCD'), name='XXX')
+ res = to_numeric(s)
+ tm.assert_series_equal(res, s)
+
+ s = pd.Series([1., 3., 4., 5.], index=list('ABCD'), name='XXX')
+ res = to_numeric(s)
+ tm.assert_series_equal(res, s)
+
+ # bool is regarded as numeric
+ s = pd.Series([True, False, True, True],
+ index=list('ABCD'), name='XXX')
+ res = to_numeric(s)
+ tm.assert_series_equal(res, s)
+
+ def test_error(self):
+ s = pd.Series([1, -3.14, 'apple'])
+ msg = 'Unable to parse string "apple" at position 2'
+ with pytest.raises(ValueError, match=msg):
+ to_numeric(s, errors='raise')
+
+ res = to_numeric(s, errors='ignore')
+ expected = pd.Series([1, -3.14, 'apple'])
+ tm.assert_series_equal(res, expected)
+
+ res = to_numeric(s, errors='coerce')
+ expected = pd.Series([1, -3.14, np.nan])
+ tm.assert_series_equal(res, expected)
+
+ s = pd.Series(['orange', 1, -3.14, 'apple'])
+ msg = 'Unable to parse string "orange" at position 0'
+ with pytest.raises(ValueError, match=msg):
+ to_numeric(s, errors='raise')
+
+ def test_error_seen_bool(self):
+ s = pd.Series([True, False, 'apple'])
+ msg = 'Unable to parse string "apple" at position 2'
+ with pytest.raises(ValueError, match=msg):
+ to_numeric(s, errors='raise')
+
+ res = to_numeric(s, errors='ignore')
+ expected = pd.Series([True, False, 'apple'])
+ tm.assert_series_equal(res, expected)
+
+ # coerces to float
+ res = to_numeric(s, errors='coerce')
+ expected = pd.Series([1., 0., np.nan])
+ tm.assert_series_equal(res, expected)
+
+ def test_list(self):
+ s = ['1', '-3.14', '7']
+ res = to_numeric(s)
+ expected = np.array([1, -3.14, 7])
+ tm.assert_numpy_array_equal(res, expected)
+
+ def test_list_numeric(self):
+ s = [1, 3, 4, 5]
+ res = to_numeric(s)
+ tm.assert_numpy_array_equal(res, np.array(s, dtype=np.int64))
+
+ s = [1., 3., 4., 5.]
+ res = to_numeric(s)
+ tm.assert_numpy_array_equal(res, np.array(s))
+
+ # bool is regarded as numeric
+ s = [True, False, True, True]
+ res = to_numeric(s)
+ tm.assert_numpy_array_equal(res, np.array(s))
+
+ def test_numeric(self):
+ s = pd.Series([1, -3.14, 7], dtype='O')
+ res = to_numeric(s)
+ expected = pd.Series([1, -3.14, 7])
+ tm.assert_series_equal(res, expected)
+
+ s = pd.Series([1, -3.14, 7])
+ res = to_numeric(s)
+ tm.assert_series_equal(res, expected)
+
+ # GH 14827
+ df = pd.DataFrame(dict(
+ a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), '0.1'],
+ b=[1.0, 2.0, 3.0, 4.0],
+ ))
+ expected = pd.DataFrame(dict(
+ a=[1.2, 3.14, np.inf, 0.1],
+ b=[1.0, 2.0, 3.0, 4.0],
+ ))
+
+ # Test to_numeric over one column
+ df_copy = df.copy()
+ df_copy['a'] = df_copy['a'].apply(to_numeric)
+ tm.assert_frame_equal(df_copy, expected)
+
+ # Test to_numeric over multiple columns
+ df_copy = df.copy()
+ df_copy[['a', 'b']] = df_copy[['a', 'b']].apply(to_numeric)
+ tm.assert_frame_equal(df_copy, expected)
+
+ def test_numeric_lists_and_arrays(self):
+ # Test to_numeric with embedded lists and arrays
+ df = pd.DataFrame(dict(
+ a=[[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1]
+ ))
+ df['a'] = df['a'].apply(to_numeric)
+ expected = pd.DataFrame(dict(
+ a=[[3.14, 1.0], 1.6, 0.1],
+ ))
+ tm.assert_frame_equal(df, expected)
+
+ df = pd.DataFrame(dict(
+ a=[np.array([decimal.Decimal(3.14), 1.0]), 0.1]
+ ))
+ df['a'] = df['a'].apply(to_numeric)
+ expected = pd.DataFrame(dict(
+ a=[[3.14, 1.0], 0.1],
+ ))
+ tm.assert_frame_equal(df, expected)
+
+ def test_all_nan(self):
+ s = pd.Series(['a', 'b', 'c'])
+ res = to_numeric(s, errors='coerce')
+ expected = pd.Series([np.nan, np.nan, np.nan])
+ tm.assert_series_equal(res, expected)
+
+ @pytest.mark.parametrize("errors", [None, "ignore", "raise", "coerce"])
+ def test_type_check(self, errors):
+ # see gh-11776
+ df = pd.DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
+ kwargs = dict(errors=errors) if errors is not None else dict()
+ error_ctx = pytest.raises(TypeError, match="1-d array")
+
+ with error_ctx:
+ to_numeric(df, **kwargs)
+
+ def test_scalar(self):
+ assert pd.to_numeric(1) == 1
+ assert pd.to_numeric(1.1) == 1.1
+
+ assert pd.to_numeric('1') == 1
+ assert pd.to_numeric('1.1') == 1.1
+
+ with pytest.raises(ValueError):
+ to_numeric('XX', errors='raise')
+
+ assert to_numeric('XX', errors='ignore') == 'XX'
+ assert np.isnan(to_numeric('XX', errors='coerce'))
+
+ def test_numeric_dtypes(self):
+ idx = pd.Index([1, 2, 3], name='xxx')
+ res = pd.to_numeric(idx)
+ tm.assert_index_equal(res, idx)
+
+ res = pd.to_numeric(pd.Series(idx, name='xxx'))
+ tm.assert_series_equal(res, pd.Series(idx, name='xxx'))
+
+ res = pd.to_numeric(idx.values)
+ tm.assert_numpy_array_equal(res, idx.values)
+
+ idx = pd.Index([1., np.nan, 3., np.nan], name='xxx')
+ res = pd.to_numeric(idx)
+ tm.assert_index_equal(res, idx)
+
+ res = pd.to_numeric(pd.Series(idx, name='xxx'))
+ tm.assert_series_equal(res, pd.Series(idx, name='xxx'))
+
+ res = pd.to_numeric(idx.values)
+ tm.assert_numpy_array_equal(res, idx.values)
+
+ def test_str(self):
+ idx = pd.Index(['1', '2', '3'], name='xxx')
+ exp = np.array([1, 2, 3], dtype='int64')
+ res = pd.to_numeric(idx)
+ tm.assert_index_equal(res, pd.Index(exp, name='xxx'))
+
+ res = pd.to_numeric(pd.Series(idx, name='xxx'))
+ tm.assert_series_equal(res, pd.Series(exp, name='xxx'))
+
+ res = pd.to_numeric(idx.values)
+ tm.assert_numpy_array_equal(res, exp)
+
+ idx = pd.Index(['1.5', '2.7', '3.4'], name='xxx')
+ exp = np.array([1.5, 2.7, 3.4])
+ res = pd.to_numeric(idx)
+ tm.assert_index_equal(res, pd.Index(exp, name='xxx'))
+
+ res = pd.to_numeric(pd.Series(idx, name='xxx'))
+ tm.assert_series_equal(res, pd.Series(exp, name='xxx'))
+
+ res = pd.to_numeric(idx.values)
+ tm.assert_numpy_array_equal(res, exp)
+
+ def test_datetime_like(self, tz_naive_fixture):
+ idx = pd.date_range("20130101", periods=3,
+ tz=tz_naive_fixture, name="xxx")
+ res = pd.to_numeric(idx)
+ tm.assert_index_equal(res, pd.Index(idx.asi8, name="xxx"))
+
+ res = pd.to_numeric(pd.Series(idx, name="xxx"))
+ tm.assert_series_equal(res, pd.Series(idx.asi8, name="xxx"))
+
+ res = pd.to_numeric(idx.values)
+ tm.assert_numpy_array_equal(res, idx.asi8)
+
+ def test_timedelta(self):
+ idx = pd.timedelta_range('1 days', periods=3, freq='D', name='xxx')
+ res = pd.to_numeric(idx)
+ tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))
+
+ res = pd.to_numeric(pd.Series(idx, name='xxx'))
+ tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))
+
+ res = pd.to_numeric(idx.values)
+ tm.assert_numpy_array_equal(res, idx.asi8)
+
+ def test_period(self):
+ idx = pd.period_range('2011-01', periods=3, freq='M', name='xxx')
+ res = pd.to_numeric(idx)
+ tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))
+
+ # TODO: enable when we can support native PeriodDtype
+ # res = pd.to_numeric(pd.Series(idx, name='xxx'))
+ # tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))
+
+ def test_non_hashable(self):
+ # see gh-13324
+ s = pd.Series([[10.0, 2], 1.0, 'apple'])
+ res = pd.to_numeric(s, errors='coerce')
+ tm.assert_series_equal(res, pd.Series([np.nan, 1.0, np.nan]))
+
+ res = pd.to_numeric(s, errors='ignore')
+ tm.assert_series_equal(res, pd.Series([[10.0, 2], 1.0, 'apple']))
+
+ with pytest.raises(TypeError, match="Invalid object type"):
+ pd.to_numeric(s)
+
+ @pytest.mark.parametrize("data", [
+ ["1", 2, 3],
+ [1, 2, 3],
+ np.array(["1970-01-02", "1970-01-03",
+ "1970-01-04"], dtype="datetime64[D]")
+ ])
+ def test_downcast_basic(self, data):
+ # see gh-13352
+ invalid_downcast = "unsigned-integer"
+ msg = "invalid downcasting method provided"
+
+ with pytest.raises(ValueError, match=msg):
+ pd.to_numeric(data, downcast=invalid_downcast)
+
+ expected = np.array([1, 2, 3], dtype=np.int64)
+
+ # Basic function tests.
+ res = pd.to_numeric(data)
+ tm.assert_numpy_array_equal(res, expected)
+
+ res = pd.to_numeric(data, downcast=None)
+ tm.assert_numpy_array_equal(res, expected)
+
+ # Basic dtype support.
+ smallest_uint_dtype = np.dtype(np.typecodes["UnsignedInteger"][0])
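+ # np.typecodes["UnsignedInteger"] is ordered narrowest-to-widest, so
+ # its first character is the smallest unsigned dtype (np.uint8)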
+
+ # Support for float dtypes smaller than np.float32 is few and far
+ # between, so float32 is the smallest float dtype we downcast to.
+ float_32_char = np.dtype(np.float32).char
+ smallest_float_dtype = float_32_char
+
+ expected = np.array([1, 2, 3], dtype=smallest_uint_dtype)
+ res = pd.to_numeric(data, downcast="unsigned")
+ tm.assert_numpy_array_equal(res, expected)
+
+ expected = np.array([1, 2, 3], dtype=smallest_float_dtype)
+ res = pd.to_numeric(data, downcast="float")
+ tm.assert_numpy_array_equal(res, expected)
+
+ @pytest.mark.parametrize("signed_downcast", ["integer", "signed"])
+ @pytest.mark.parametrize("data", [
+ ["1", 2, 3],
+ [1, 2, 3],
+ np.array(["1970-01-02", "1970-01-03",
+ "1970-01-04"], dtype="datetime64[D]")
+ ])
+ def test_signed_downcast(self, data, signed_downcast):
+ # see gh-13352
+ smallest_int_dtype = np.dtype(np.typecodes["Integer"][0])
+ expected = np.array([1, 2, 3], dtype=smallest_int_dtype)
+
+ res = pd.to_numeric(data, downcast=signed_downcast)
+ tm.assert_numpy_array_equal(res, expected)
+
+ def test_ignore_downcast_invalid_data(self):
+ # If we can't successfully cast the given
+ # data to a numeric dtype, do not bother
+ # with the downcast parameter.
+ data = ["foo", 2, 3]
+ expected = np.array(data, dtype=object)
+
+ res = pd.to_numeric(data, errors="ignore",
+ downcast="unsigned")
+ tm.assert_numpy_array_equal(res, expected)
+
+ def test_ignore_downcast_neg_to_unsigned(self):
+ # Cannot cast to an unsigned integer
+ # because we have a negative number.
+ data = ["-1", 2, 3]
+ expected = np.array([-1, 2, 3], dtype=np.int64)
+
+ res = pd.to_numeric(data, downcast="unsigned")
+ tm.assert_numpy_array_equal(res, expected)
+
+ @pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"])
+ @pytest.mark.parametrize("data,expected", [
+ (["1.1", 2, 3],
+ np.array([1.1, 2, 3], dtype=np.float64)),
+ ([10000.0, 20000, 3000, 40000.36, 50000, 50000.00],
+ np.array([10000.0, 20000, 3000,
+ 40000.36, 50000, 50000.00], dtype=np.float64))
+ ])
+ def test_ignore_downcast_cannot_convert_float(
+ self, data, expected, downcast):
+ # Cannot cast to an integer (signed or unsigned)
+ # because we have float values.
+ res = pd.to_numeric(data, downcast=downcast)
+ tm.assert_numpy_array_equal(res, expected)
+
+ @pytest.mark.parametrize("downcast,expected_dtype", [
+ ("integer", np.int16),
+ ("signed", np.int16),
+ ("unsigned", np.uint16)
+ ])
+ def test_downcast_not8bit(self, downcast, expected_dtype):
+ # the smallest integer dtype need not be np.(u)int8
+ data = ["256", 257, 258]
+
+ expected = np.array([256, 257, 258], dtype=expected_dtype)
+ res = pd.to_numeric(data, downcast=downcast)
+ tm.assert_numpy_array_equal(res, expected)
+
+ @pytest.mark.parametrize("dtype,downcast,min_max", [
+ ("int8", "integer", [iinfo(np.int8).min,
+ iinfo(np.int8).max]),
+ ("int16", "integer", [iinfo(np.int16).min,
+ iinfo(np.int16).max]),
+ ('int32', "integer", [iinfo(np.int32).min,
+ iinfo(np.int32).max]),
+ ('int64', "integer", [iinfo(np.int64).min,
+ iinfo(np.int64).max]),
+ ('uint8', "unsigned", [iinfo(np.uint8).min,
+ iinfo(np.uint8).max]),
+ ('uint16', "unsigned", [iinfo(np.uint16).min,
+ iinfo(np.uint16).max]),
+ ('uint32', "unsigned", [iinfo(np.uint32).min,
+ iinfo(np.uint32).max]),
+ ('uint64', "unsigned", [iinfo(np.uint64).min,
+ iinfo(np.uint64).max]),
+ ('int16', "integer", [iinfo(np.int8).min,
+ iinfo(np.int8).max + 1]),
+ ('int32', "integer", [iinfo(np.int16).min,
+ iinfo(np.int16).max + 1]),
+ ('int64', "integer", [iinfo(np.int32).min,
+ iinfo(np.int32).max + 1]),
+ ('int16', "integer", [iinfo(np.int8).min - 1,
+ iinfo(np.int16).max]),
+ ('int32', "integer", [iinfo(np.int16).min - 1,
+ iinfo(np.int32).max]),
+ ('int64', "integer", [iinfo(np.int32).min - 1,
+ iinfo(np.int64).max]),
+ ('uint16', "unsigned", [iinfo(np.uint8).min,
+ iinfo(np.uint8).max + 1]),
+ ('uint32', "unsigned", [iinfo(np.uint16).min,
+ iinfo(np.uint16).max + 1]),
+ ('uint64', "unsigned", [iinfo(np.uint32).min,
+ iinfo(np.uint32).max + 1])
+ ])
+ def test_downcast_limits(self, dtype, downcast, min_max):
+ # see gh-14404: test the limits of each downcast.
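+ # Values at a dtype's exact min/max should downcast to that dtype;
+ # exceeding either bound by one falls through to the next wider dtype.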
+ series = pd.to_numeric(pd.Series(min_max), downcast=downcast)
+ assert series.dtype == dtype
+
+ def test_coerce_uint64_conflict(self):
+ # see gh-17007 and gh-17125
+ #
+ # Still returns float despite the uint64-nan conflict,
+ # which would otherwise force casting to object.
+ df = pd.DataFrame({"a": [200, 300, "", "NaN", 30000000000000000000]})
+ expected = pd.Series([200, 300, np.nan, np.nan,
+ 30000000000000000000], dtype=float, name="a")
+ result = to_numeric(df["a"], errors="coerce")
+ tm.assert_series_equal(result, expected)
+
+ s = pd.Series(["12345678901234567890", "1234567890", "ITEM"])
+ expected = pd.Series([12345678901234567890,
+ 1234567890, np.nan], dtype=float)
+ result = to_numeric(s, errors="coerce")
+ tm.assert_series_equal(result, expected)
+
+ # For completeness, check against "ignore" and "raise"
+ result = to_numeric(s, errors="ignore")
+ tm.assert_series_equal(result, s)
+
+ msg = "Unable to parse string"
+ with pytest.raises(ValueError, match=msg):
+ to_numeric(s, errors="raise")
diff --git a/contrib/python/pandas/py2/pandas/tests/tseries/__init__.py b/contrib/python/pandas/py2/pandas/tests/tseries/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tseries/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/tseries/offsets/__init__.py b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/__init__.py
new file mode 100644
index 00000000000..40a96afc6ff
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/__init__.py
@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-
diff --git a/contrib/python/pandas/py2/pandas/tests/tseries/offsets/common.py b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/common.py
new file mode 100644
index 00000000000..2e8eb224bca
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/common.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+"""
+Assertion helpers for offsets tests
+"""
+
+
+def assert_offset_equal(offset, base, expected):
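+ # an offset can be applied three equivalent ways; all must agree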
+ actual = offset + base
+ actual_swapped = base + offset
+ actual_apply = offset.apply(base)
+ try:
+ assert actual == expected
+ assert actual_swapped == expected
+ assert actual_apply == expected
+ except AssertionError:
+ raise AssertionError("\nExpected: %s\nActual: %s\nFor Offset: %s"
+ "\nAt Date: %s" %
+ (expected, actual, offset, base))
+
+
+def assert_onOffset(offset, date, expected):
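+ # onOffset reports whether `date` lies exactly on the offset's schedule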
+ actual = offset.onOffset(date)
+ assert actual == expected, ("\nExpected: %s\nActual: %s\nFor Offset: %s"
+ "\nAt Date: %s" %
+ (expected, actual, offset, date))
diff --git a/contrib/python/pandas/py2/pandas/tests/tseries/offsets/conftest.py b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/conftest.py
new file mode 100644
index 00000000000..c192a56b205
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/conftest.py
@@ -0,0 +1,21 @@
+import pytest
+
+import pandas.tseries.offsets as offsets
+
+
[email protected](params=[getattr(offsets, o) for o in offsets.__all__])
+def offset_types(request):
+ """
+ Fixture for all the datetime offsets available for a time series.
+ """
+ return request.param
+
+
[email protected](params=[getattr(offsets, o) for o in offsets.__all__ if
+ issubclass(getattr(offsets, o), offsets.MonthOffset)
+ and o != 'MonthOffset'])
+def month_classes(request):
+ """
+ Fixture for month based datetime offsets available for a time series.
+ """
+ return request.param
diff --git a/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_fiscal.py b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_fiscal.py
new file mode 100644
index 00000000000..a5d7460921f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_fiscal.py
@@ -0,0 +1,657 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for Fiscal Year and Fiscal Quarter offset classes
+"""
+from datetime import datetime
+
+from dateutil.relativedelta import relativedelta
+import pytest
+
+from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG
+
+from pandas import Timestamp
+
+from pandas.tseries.frequencies import get_offset
+from pandas.tseries.offsets import FY5253, FY5253Quarter
+
+from .common import assert_offset_equal, assert_onOffset
+from .test_offsets import Base, WeekDay
+
+
+def makeFY5253LastOfMonthQuarter(*args, **kwds):
+ return FY5253Quarter(*args, variation="last", **kwds)
+
+
+def makeFY5253NearestEndMonthQuarter(*args, **kwds):
+ return FY5253Quarter(*args, variation="nearest", **kwds)
+
+
+def makeFY5253NearestEndMonth(*args, **kwds):
+ return FY5253(*args, variation="nearest", **kwds)
+
+
+def makeFY5253LastOfMonth(*args, **kwds):
+ return FY5253(*args, variation="last", **kwds)
+
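+# Note: FY5253 models a 52-53 week fiscal year. variation="last" ends the
+# year on the last <weekday> of startingMonth, while variation="nearest"
+# ends it on the <weekday> nearest to that month's last day.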
+
+def test_get_offset_name():
+ assert (makeFY5253LastOfMonthQuarter(
+ weekday=1, startingMonth=3,
+ qtr_with_extra_week=4).freqstr == "REQ-L-MAR-TUE-4")
+ assert (makeFY5253NearestEndMonthQuarter(
+ weekday=1, startingMonth=3,
+ qtr_with_extra_week=3).freqstr == "REQ-N-MAR-TUE-3")
+
+
+def test_get_offset():
+ with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG):
+ get_offset('gibberish')
+ with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG):
+ get_offset('QS-JAN-B')
+
+ pairs = [
+ ("RE-N-DEC-MON",
+ makeFY5253NearestEndMonth(weekday=0, startingMonth=12)),
+ ("RE-L-DEC-TUE",
+ makeFY5253LastOfMonth(weekday=1, startingMonth=12)),
+ ("REQ-L-MAR-TUE-4",
+ makeFY5253LastOfMonthQuarter(weekday=1,
+ startingMonth=3,
+ qtr_with_extra_week=4)),
+ ("REQ-L-DEC-MON-3",
+ makeFY5253LastOfMonthQuarter(weekday=0,
+ startingMonth=12,
+ qtr_with_extra_week=3)),
+ ("REQ-N-DEC-MON-3",
+ makeFY5253NearestEndMonthQuarter(weekday=0,
+ startingMonth=12,
+ qtr_with_extra_week=3))]
+
+ for name, expected in pairs:
+ offset = get_offset(name)
+ assert offset == expected, ("Expected %r to yield %r (actual: %r)" %
+ (name, expected, offset))
+
+
+class TestFY5253LastOfMonth(Base):
+ offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8,
+ weekday=WeekDay.SAT)
+ offset_lom_sat_sep = makeFY5253LastOfMonth(1, startingMonth=9,
+ weekday=WeekDay.SAT)
+
+ on_offset_cases = [
+ # From Wikipedia (see:
+ # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar#Last_Saturday_of_the_month_at_fiscal_year_end)
+ (offset_lom_sat_aug, datetime(2006, 8, 26), True),
+ (offset_lom_sat_aug, datetime(2007, 8, 25), True),
+ (offset_lom_sat_aug, datetime(2008, 8, 30), True),
+ (offset_lom_sat_aug, datetime(2009, 8, 29), True),
+ (offset_lom_sat_aug, datetime(2010, 8, 28), True),
+ (offset_lom_sat_aug, datetime(2011, 8, 27), True),
+ (offset_lom_sat_aug, datetime(2012, 8, 25), True),
+ (offset_lom_sat_aug, datetime(2013, 8, 31), True),
+ (offset_lom_sat_aug, datetime(2014, 8, 30), True),
+ (offset_lom_sat_aug, datetime(2015, 8, 29), True),
+ (offset_lom_sat_aug, datetime(2016, 8, 27), True),
+ (offset_lom_sat_aug, datetime(2017, 8, 26), True),
+ (offset_lom_sat_aug, datetime(2018, 8, 25), True),
+ (offset_lom_sat_aug, datetime(2019, 8, 31), True),
+
+ (offset_lom_sat_aug, datetime(2006, 8, 27), False),
+ (offset_lom_sat_aug, datetime(2007, 8, 28), False),
+ (offset_lom_sat_aug, datetime(2008, 8, 31), False),
+ (offset_lom_sat_aug, datetime(2009, 8, 30), False),
+ (offset_lom_sat_aug, datetime(2010, 8, 29), False),
+ (offset_lom_sat_aug, datetime(2011, 8, 28), False),
+
+ (offset_lom_sat_aug, datetime(2006, 8, 25), False),
+ (offset_lom_sat_aug, datetime(2007, 8, 24), False),
+ (offset_lom_sat_aug, datetime(2008, 8, 29), False),
+ (offset_lom_sat_aug, datetime(2009, 8, 28), False),
+ (offset_lom_sat_aug, datetime(2010, 8, 27), False),
+ (offset_lom_sat_aug, datetime(2011, 8, 26), False),
+ (offset_lom_sat_aug, datetime(2019, 8, 30), False),
+
+ # From GMCR (see for example:
+ # http://yahoo.brand.edgar-online.com/Default.aspx?
+ # companyid=3184&formtypeID=7)
+ (offset_lom_sat_sep, datetime(2010, 9, 25), True),
+ (offset_lom_sat_sep, datetime(2011, 9, 24), True),
+ (offset_lom_sat_sep, datetime(2012, 9, 29), True)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+ def test_apply(self):
+ offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8,
+ weekday=WeekDay.SAT)
+ offset_lom_aug_sat_1 = makeFY5253LastOfMonth(n=1, startingMonth=8,
+ weekday=WeekDay.SAT)
+
+ date_seq_lom_aug_sat = [datetime(2006, 8, 26), datetime(2007, 8, 25),
+ datetime(2008, 8, 30), datetime(2009, 8, 29),
+ datetime(2010, 8, 28), datetime(2011, 8, 27),
+ datetime(2012, 8, 25), datetime(2013, 8, 31),
+ datetime(2014, 8, 30), datetime(2015, 8, 29),
+ datetime(2016, 8, 27)]
+
+ tests = [
+ (offset_lom_aug_sat, date_seq_lom_aug_sat),
+ (offset_lom_aug_sat_1, date_seq_lom_aug_sat),
+ (offset_lom_aug_sat, [
+ datetime(2006, 8, 25)] + date_seq_lom_aug_sat),
+ (offset_lom_aug_sat_1, [
+ datetime(2006, 8, 27)] + date_seq_lom_aug_sat[1:]),
+ (makeFY5253LastOfMonth(n=-1, startingMonth=8,
+ weekday=WeekDay.SAT),
+ list(reversed(date_seq_lom_aug_sat))),
+ ]
+ for test in tests:
+ offset, data = test
+ current = data[0]
+ for datum in data[1:]:
+ current = current + offset
+ assert current == datum
+
+
+class TestFY5253NearestEndMonth(Base):
+
+ def test_get_year_end(self):
+ assert (makeFY5253NearestEndMonth(
+ startingMonth=8, weekday=WeekDay.SAT).get_year_end(
+ datetime(2013, 1, 1)) == datetime(2013, 8, 31))
+ assert (makeFY5253NearestEndMonth(
+ startingMonth=8, weekday=WeekDay.SUN).get_year_end(
+ datetime(2013, 1, 1)) == datetime(2013, 9, 1))
+ assert (makeFY5253NearestEndMonth(
+ startingMonth=8, weekday=WeekDay.FRI).get_year_end(
+ datetime(2013, 1, 1)) == datetime(2013, 8, 30))
+
+ offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
+ variation="nearest")
+ assert (offset_n.get_year_end(datetime(2012, 1, 1)) ==
+ datetime(2013, 1, 1))
+ assert (offset_n.get_year_end(datetime(2012, 1, 10)) ==
+ datetime(2013, 1, 1))
+
+ assert (offset_n.get_year_end(datetime(2013, 1, 1)) ==
+ datetime(2013, 12, 31))
+ assert (offset_n.get_year_end(datetime(2013, 1, 2)) ==
+ datetime(2013, 12, 31))
+ assert (offset_n.get_year_end(datetime(2013, 1, 3)) ==
+ datetime(2013, 12, 31))
+ assert (offset_n.get_year_end(datetime(2013, 1, 10)) ==
+ datetime(2013, 12, 31))
+
+ JNJ = FY5253(n=1, startingMonth=12, weekday=6, variation="nearest")
+ assert (JNJ.get_year_end(datetime(2006, 1, 1)) ==
+ datetime(2006, 12, 31))
+
+ offset_lom_aug_sat = makeFY5253NearestEndMonth(1, startingMonth=8,
+ weekday=WeekDay.SAT)
+ offset_lom_aug_thu = makeFY5253NearestEndMonth(1, startingMonth=8,
+ weekday=WeekDay.THU)
+ offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
+ variation="nearest")
+
+ on_offset_cases = [
+ # From Wikipedia (see:
+ # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar
+ # #Saturday_nearest_the_end_of_month)
+ # 2006-09-02 2006 September 2
+ # 2007-09-01 2007 September 1
+ # 2008-08-30 2008 August 30 (leap year)
+ # 2009-08-29 2009 August 29
+ # 2010-08-28 2010 August 28
+ # 2011-09-03 2011 September 3
+ # 2012-09-01 2012 September 1 (leap year)
+ # 2013-08-31 2013 August 31
+ # 2014-08-30 2014 August 30
+ # 2015-08-29 2015 August 29
+ # 2016-09-03 2016 September 3 (leap year)
+ # 2017-09-02 2017 September 2
+ # 2018-09-01 2018 September 1
+ # 2019-08-31 2019 August 31
+ (offset_lom_aug_sat, datetime(2006, 9, 2), True),
+ (offset_lom_aug_sat, datetime(2007, 9, 1), True),
+ (offset_lom_aug_sat, datetime(2008, 8, 30), True),
+ (offset_lom_aug_sat, datetime(2009, 8, 29), True),
+ (offset_lom_aug_sat, datetime(2010, 8, 28), True),
+ (offset_lom_aug_sat, datetime(2011, 9, 3), True),
+
+ (offset_lom_aug_sat, datetime(2016, 9, 3), True),
+ (offset_lom_aug_sat, datetime(2017, 9, 2), True),
+ (offset_lom_aug_sat, datetime(2018, 9, 1), True),
+ (offset_lom_aug_sat, datetime(2019, 8, 31), True),
+
+ (offset_lom_aug_sat, datetime(2006, 8, 27), False),
+ (offset_lom_aug_sat, datetime(2007, 8, 28), False),
+ (offset_lom_aug_sat, datetime(2008, 8, 31), False),
+ (offset_lom_aug_sat, datetime(2009, 8, 30), False),
+ (offset_lom_aug_sat, datetime(2010, 8, 29), False),
+ (offset_lom_aug_sat, datetime(2011, 8, 28), False),
+
+ (offset_lom_aug_sat, datetime(2006, 8, 25), False),
+ (offset_lom_aug_sat, datetime(2007, 8, 24), False),
+ (offset_lom_aug_sat, datetime(2008, 8, 29), False),
+ (offset_lom_aug_sat, datetime(2009, 8, 28), False),
+ (offset_lom_aug_sat, datetime(2010, 8, 27), False),
+ (offset_lom_aug_sat, datetime(2011, 8, 26), False),
+ (offset_lom_aug_sat, datetime(2019, 8, 30), False),
+
+ # From Micron, see:
+ # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7
+ (offset_lom_aug_thu, datetime(2012, 8, 30), True),
+ (offset_lom_aug_thu, datetime(2011, 9, 1), True),
+
+ (offset_n, datetime(2012, 12, 31), False),
+ (offset_n, datetime(2013, 1, 1), True),
+ (offset_n, datetime(2013, 1, 2), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+ def test_apply(self):
+ date_seq_nem_8_sat = [datetime(2006, 9, 2), datetime(2007, 9, 1),
+ datetime(2008, 8, 30), datetime(2009, 8, 29),
+ datetime(2010, 8, 28), datetime(2011, 9, 3)]
+
+ JNJ = [datetime(2005, 1, 2), datetime(2006, 1, 1),
+ datetime(2006, 12, 31), datetime(2007, 12, 30),
+ datetime(2008, 12, 28), datetime(2010, 1, 3),
+ datetime(2011, 1, 2), datetime(2012, 1, 1),
+ datetime(2012, 12, 30)]
+
+ DEC_SAT = FY5253(n=-1, startingMonth=12, weekday=5,
+ variation="nearest")
+
+ tests = [
+ (makeFY5253NearestEndMonth(startingMonth=8,
+ weekday=WeekDay.SAT),
+ date_seq_nem_8_sat),
+ (makeFY5253NearestEndMonth(n=1, startingMonth=8,
+ weekday=WeekDay.SAT),
+ date_seq_nem_8_sat),
+ (makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT),
+ [datetime(2006, 9, 1)] + date_seq_nem_8_sat),
+ (makeFY5253NearestEndMonth(n=1, startingMonth=8,
+ weekday=WeekDay.SAT),
+ [datetime(2006, 9, 3)] + date_seq_nem_8_sat[1:]),
+ (makeFY5253NearestEndMonth(n=-1, startingMonth=8,
+ weekday=WeekDay.SAT),
+ list(reversed(date_seq_nem_8_sat))),
+ (makeFY5253NearestEndMonth(n=1, startingMonth=12,
+ weekday=WeekDay.SUN), JNJ),
+ (makeFY5253NearestEndMonth(n=-1, startingMonth=12,
+ weekday=WeekDay.SUN),
+ list(reversed(JNJ))),
+ (makeFY5253NearestEndMonth(n=1, startingMonth=12,
+ weekday=WeekDay.SUN),
+ [datetime(2005, 1, 2), datetime(2006, 1, 1)]),
+ (makeFY5253NearestEndMonth(n=1, startingMonth=12,
+ weekday=WeekDay.SUN),
+ [datetime(2006, 1, 2), datetime(2006, 12, 31)]),
+ (DEC_SAT, [datetime(2013, 1, 15), datetime(2012, 12, 29)])
+ ]
+ for test in tests:
+ offset, data = test
+ current = data[0]
+ for datum in data[1:]:
+ current = current + offset
+ assert current == datum
+
+
+class TestFY5253LastOfMonthQuarter(Base):
+
+ def test_isAnchored(self):
+ assert makeFY5253LastOfMonthQuarter(
+ startingMonth=1, weekday=WeekDay.SAT,
+ qtr_with_extra_week=4).isAnchored()
+ assert makeFY5253LastOfMonthQuarter(
+ weekday=WeekDay.SAT, startingMonth=3,
+ qtr_with_extra_week=4).isAnchored()
+ assert not makeFY5253LastOfMonthQuarter(
+ 2, startingMonth=1, weekday=WeekDay.SAT,
+ qtr_with_extra_week=4).isAnchored()
+
+ def test_equality(self):
+ assert (makeFY5253LastOfMonthQuarter(
+ startingMonth=1, weekday=WeekDay.SAT,
+ qtr_with_extra_week=4) == makeFY5253LastOfMonthQuarter(
+ startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4))
+ assert (makeFY5253LastOfMonthQuarter(
+ startingMonth=1, weekday=WeekDay.SAT,
+ qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter(
+ startingMonth=1, weekday=WeekDay.SUN, qtr_with_extra_week=4))
+ assert (makeFY5253LastOfMonthQuarter(
+ startingMonth=1, weekday=WeekDay.SAT,
+ qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter(
+ startingMonth=2, weekday=WeekDay.SAT, qtr_with_extra_week=4))
+
+ def test_offset(self):
+ offset = makeFY5253LastOfMonthQuarter(1, startingMonth=9,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=4)
+ offset2 = makeFY5253LastOfMonthQuarter(2, startingMonth=9,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=4)
+ offset4 = makeFY5253LastOfMonthQuarter(4, startingMonth=9,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=4)
+
+ offset_neg1 = makeFY5253LastOfMonthQuarter(-1, startingMonth=9,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=4)
+ offset_neg2 = makeFY5253LastOfMonthQuarter(-2, startingMonth=9,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=4)
+
+ GMCR = [datetime(2010, 3, 27), datetime(2010, 6, 26),
+ datetime(2010, 9, 25), datetime(2010, 12, 25),
+ datetime(2011, 3, 26), datetime(2011, 6, 25),
+ datetime(2011, 9, 24), datetime(2011, 12, 24),
+ datetime(2012, 3, 24), datetime(2012, 6, 23),
+ datetime(2012, 9, 29), datetime(2012, 12, 29),
+ datetime(2013, 3, 30), datetime(2013, 6, 29)]
+
+ assert_offset_equal(offset, base=GMCR[0], expected=GMCR[1])
+ assert_offset_equal(offset, base=GMCR[0] + relativedelta(days=-1),
+ expected=GMCR[0])
+ assert_offset_equal(offset, base=GMCR[1], expected=GMCR[2])
+
+ assert_offset_equal(offset2, base=GMCR[0], expected=GMCR[2])
+ assert_offset_equal(offset4, base=GMCR[0], expected=GMCR[4])
+
+ assert_offset_equal(offset_neg1, base=GMCR[-1], expected=GMCR[-2])
+ assert_offset_equal(offset_neg1,
+ base=GMCR[-1] + relativedelta(days=+1),
+ expected=GMCR[-1])
+ assert_offset_equal(offset_neg2, base=GMCR[-1], expected=GMCR[-3])
+
+ date = GMCR[0] + relativedelta(days=-1)
+ for expected in GMCR:
+ assert_offset_equal(offset, date, expected)
+ date = date + offset
+
+ date = GMCR[-1] + relativedelta(days=+1)
+ for expected in reversed(GMCR):
+ assert_offset_equal(offset_neg1, date, expected)
+ date = date + offset_neg1
+
+ lomq_aug_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=8,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=4)
+ lomq_sep_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=9,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=4)
+
+ on_offset_cases = [
+ # From Wikipedia
+ (lomq_aug_sat_4, datetime(2006, 8, 26), True),
+ (lomq_aug_sat_4, datetime(2007, 8, 25), True),
+ (lomq_aug_sat_4, datetime(2008, 8, 30), True),
+ (lomq_aug_sat_4, datetime(2009, 8, 29), True),
+ (lomq_aug_sat_4, datetime(2010, 8, 28), True),
+ (lomq_aug_sat_4, datetime(2011, 8, 27), True),
+ (lomq_aug_sat_4, datetime(2019, 8, 31), True),
+
+ (lomq_aug_sat_4, datetime(2006, 8, 27), False),
+ (lomq_aug_sat_4, datetime(2007, 8, 28), False),
+ (lomq_aug_sat_4, datetime(2008, 8, 31), False),
+ (lomq_aug_sat_4, datetime(2009, 8, 30), False),
+ (lomq_aug_sat_4, datetime(2010, 8, 29), False),
+ (lomq_aug_sat_4, datetime(2011, 8, 28), False),
+
+ (lomq_aug_sat_4, datetime(2006, 8, 25), False),
+ (lomq_aug_sat_4, datetime(2007, 8, 24), False),
+ (lomq_aug_sat_4, datetime(2008, 8, 29), False),
+ (lomq_aug_sat_4, datetime(2009, 8, 28), False),
+ (lomq_aug_sat_4, datetime(2010, 8, 27), False),
+ (lomq_aug_sat_4, datetime(2011, 8, 26), False),
+ (lomq_aug_sat_4, datetime(2019, 8, 30), False),
+
+ # From GMCR
+ (lomq_sep_sat_4, datetime(2010, 9, 25), True),
+ (lomq_sep_sat_4, datetime(2011, 9, 24), True),
+ (lomq_sep_sat_4, datetime(2012, 9, 29), True),
+
+ (lomq_sep_sat_4, datetime(2013, 6, 29), True),
+ (lomq_sep_sat_4, datetime(2012, 6, 23), True),
+ (lomq_sep_sat_4, datetime(2012, 6, 30), False),
+
+ (lomq_sep_sat_4, datetime(2013, 3, 30), True),
+ (lomq_sep_sat_4, datetime(2012, 3, 24), True),
+
+ (lomq_sep_sat_4, datetime(2012, 12, 29), True),
+ (lomq_sep_sat_4, datetime(2011, 12, 24), True),
+
+ # INTC (extra week in Q1)
+ # See: http://www.intc.com/releasedetail.cfm?ReleaseID=542844
+ (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=1),
+ datetime(2011, 4, 2), True),
+
+ # see: http://google.brand.edgar-online.com/?sym=INTC&formtypeID=7
+ (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=1),
+ datetime(2012, 12, 29), True),
+ (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=1),
+ datetime(2011, 12, 31), True),
+ (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=1),
+ datetime(2010, 12, 25), True)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+ def test_year_has_extra_week(self):
+ # End of long Q1
+ assert makeFY5253LastOfMonthQuarter(
+ 1, startingMonth=12, weekday=WeekDay.SAT,
+ qtr_with_extra_week=1).year_has_extra_week(datetime(2011, 4, 2))
+
+ # Start of long Q1
+ assert makeFY5253LastOfMonthQuarter(
+ 1, startingMonth=12, weekday=WeekDay.SAT,
+ qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 26))
+
+ # End of year before year with long Q1
+ assert not makeFY5253LastOfMonthQuarter(
+ 1, startingMonth=12, weekday=WeekDay.SAT,
+ qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 25))
+
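+ # every year in the range other than the long years checked
+ # below (1994, 2000, 2005, 2011) is an ordinary 52-week year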
+ for year in [x
+ for x in range(1994, 2011 + 1)
+ if x not in [2011, 2005, 2000, 1994]]:
+ assert not makeFY5253LastOfMonthQuarter(
+ 1, startingMonth=12, weekday=WeekDay.SAT,
+ qtr_with_extra_week=1).year_has_extra_week(
+ datetime(year, 4, 2))
+
+ # Other long years
+ assert makeFY5253LastOfMonthQuarter(
+ 1, startingMonth=12, weekday=WeekDay.SAT,
+ qtr_with_extra_week=1).year_has_extra_week(datetime(2005, 4, 2))
+
+ assert makeFY5253LastOfMonthQuarter(
+ 1, startingMonth=12, weekday=WeekDay.SAT,
+ qtr_with_extra_week=1).year_has_extra_week(datetime(2000, 4, 2))
+
+ assert makeFY5253LastOfMonthQuarter(
+ 1, startingMonth=12, weekday=WeekDay.SAT,
+ qtr_with_extra_week=1).year_has_extra_week(datetime(1994, 4, 2))
+
+ def test_get_weeks(self):
+ sat_dec_1 = makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=1)
+ sat_dec_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+ weekday=WeekDay.SAT,
+ qtr_with_extra_week=4)
+
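+ # in a 53-week year the designated quarter absorbs the extra week
+ # (14 weeks); all four quarters are 13 weeks in ordinary years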
+ assert sat_dec_1.get_weeks(datetime(2011, 4, 2)) == [14, 13, 13, 13]
+ assert sat_dec_4.get_weeks(datetime(2011, 4, 2)) == [13, 13, 13, 14]
+ assert sat_dec_1.get_weeks(datetime(2010, 12, 25)) == [13, 13, 13, 13]
+
+
+class TestFY5253NearestEndMonthQuarter(Base):
+
+ offset_nem_sat_aug_4 = makeFY5253NearestEndMonthQuarter(
+ 1, startingMonth=8, weekday=WeekDay.SAT,
+ qtr_with_extra_week=4)
+ offset_nem_thu_aug_4 = makeFY5253NearestEndMonthQuarter(
+ 1, startingMonth=8, weekday=WeekDay.THU,
+ qtr_with_extra_week=4)
+ offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
+ variation="nearest")
+
+ on_offset_cases = [
+ # From Wikipedia
+ (offset_nem_sat_aug_4, datetime(2006, 9, 2), True),
+ (offset_nem_sat_aug_4, datetime(2007, 9, 1), True),
+ (offset_nem_sat_aug_4, datetime(2008, 8, 30), True),
+ (offset_nem_sat_aug_4, datetime(2009, 8, 29), True),
+ (offset_nem_sat_aug_4, datetime(2010, 8, 28), True),
+ (offset_nem_sat_aug_4, datetime(2011, 9, 3), True),
+
+ (offset_nem_sat_aug_4, datetime(2016, 9, 3), True),
+ (offset_nem_sat_aug_4, datetime(2017, 9, 2), True),
+ (offset_nem_sat_aug_4, datetime(2018, 9, 1), True),
+ (offset_nem_sat_aug_4, datetime(2019, 8, 31), True),
+
+ (offset_nem_sat_aug_4, datetime(2006, 8, 27), False),
+ (offset_nem_sat_aug_4, datetime(2007, 8, 28), False),
+ (offset_nem_sat_aug_4, datetime(2008, 8, 31), False),
+ (offset_nem_sat_aug_4, datetime(2009, 8, 30), False),
+ (offset_nem_sat_aug_4, datetime(2010, 8, 29), False),
+ (offset_nem_sat_aug_4, datetime(2011, 8, 28), False),
+
+ (offset_nem_sat_aug_4, datetime(2006, 8, 25), False),
+ (offset_nem_sat_aug_4, datetime(2007, 8, 24), False),
+ (offset_nem_sat_aug_4, datetime(2008, 8, 29), False),
+ (offset_nem_sat_aug_4, datetime(2009, 8, 28), False),
+ (offset_nem_sat_aug_4, datetime(2010, 8, 27), False),
+ (offset_nem_sat_aug_4, datetime(2011, 8, 26), False),
+ (offset_nem_sat_aug_4, datetime(2019, 8, 30), False),
+
+ # From Micron, see:
+ # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7
+ (offset_nem_thu_aug_4, datetime(2012, 8, 30), True),
+ (offset_nem_thu_aug_4, datetime(2011, 9, 1), True),
+
+ # See: http://google.brand.edgar-online.com/?sym=MU&formtypeID=13
+ (offset_nem_thu_aug_4, datetime(2013, 5, 30), True),
+ (offset_nem_thu_aug_4, datetime(2013, 2, 28), True),
+ (offset_nem_thu_aug_4, datetime(2012, 11, 29), True),
+ (offset_nem_thu_aug_4, datetime(2012, 5, 31), True),
+ (offset_nem_thu_aug_4, datetime(2007, 3, 1), True),
+ (offset_nem_thu_aug_4, datetime(1994, 3, 3), True),
+
+ (offset_n, datetime(2012, 12, 31), False),
+ (offset_n, datetime(2013, 1, 1), True),
+ (offset_n, datetime(2013, 1, 2), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+ def test_offset(self):
+ offset = makeFY5253NearestEndMonthQuarter(1, startingMonth=8,
+ weekday=WeekDay.THU,
+ qtr_with_extra_week=4)
+
+ MU = [datetime(2012, 5, 31),
+ datetime(2012, 8, 30), datetime(2012, 11, 29),
+ datetime(2013, 2, 28), datetime(2013, 5, 30)]
+
+ date = MU[0] + relativedelta(days=-1)
+ for expected in MU:
+ assert_offset_equal(offset, date, expected)
+ date = date + offset
+
+ assert_offset_equal(offset,
+ datetime(2012, 5, 31),
+ datetime(2012, 8, 30))
+ assert_offset_equal(offset,
+ datetime(2012, 5, 30),
+ datetime(2012, 5, 31))
+
+ offset2 = FY5253Quarter(weekday=5, startingMonth=12, variation="last",
+ qtr_with_extra_week=4)
+
+ assert_offset_equal(offset2,
+ datetime(2013, 1, 15),
+ datetime(2013, 3, 30))
+
+
+def test_bunched_yearends():
+ # GH#14774 cases with two fiscal year-ends in the same calendar-year
+ fy = FY5253(n=1, weekday=5, startingMonth=12, variation='nearest')
+ dt = Timestamp('2004-01-01')
+ assert fy.rollback(dt) == Timestamp('2002-12-28')
+ assert (-fy).apply(dt) == Timestamp('2002-12-28')
+ assert dt - fy == Timestamp('2002-12-28')
+
+ assert fy.rollforward(dt) == Timestamp('2004-01-03')
+ assert fy.apply(dt) == Timestamp('2004-01-03')
+ assert fy + dt == Timestamp('2004-01-03')
+ assert dt + fy == Timestamp('2004-01-03')
+
+ # Same thing, but starting from a Timestamp in the previous year.
+ dt = Timestamp('2003-12-31')
+ assert fy.rollback(dt) == Timestamp('2002-12-28')
+ assert (-fy).apply(dt) == Timestamp('2002-12-28')
+ assert dt - fy == Timestamp('2002-12-28')
+
+
+def test_fy5253_last_onoffset():
+ # GH#18877 dates on the year-end but not normalized to midnight
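+ # the onOffset fast path must agree with the add-then-subtract round
+ # trip below, which leaves ts unchanged only when ts is on the offset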
+ offset = FY5253(n=-5, startingMonth=5, variation="last", weekday=0)
+ ts = Timestamp('1984-05-28 06:29:43.955911354+0200',
+ tz='Europe/San_Marino')
+ fast = offset.onOffset(ts)
+ slow = (ts + offset) - offset == ts
+ assert fast == slow
+
+
+def test_fy5253_nearest_onoffset():
+ # GH#18877 dates on the year-end but not normalized to midnight
+ offset = FY5253(n=3, startingMonth=7, variation="nearest", weekday=2)
+ ts = Timestamp('2032-07-28 00:12:59.035729419+0000', tz='Africa/Dakar')
+ fast = offset.onOffset(ts)
+ slow = (ts + offset) - offset == ts
+ assert fast == slow
+
+
+def test_fy5253qtr_onoffset_nearest():
+ # GH#19036
+ ts = Timestamp('1985-09-02 23:57:46.232550356-0300',
+ tz='Atlantic/Bermuda')
+ offset = FY5253Quarter(n=3, qtr_with_extra_week=1, startingMonth=2,
+ variation="nearest", weekday=0)
+ fast = offset.onOffset(ts)
+ slow = (ts + offset) - offset == ts
+ assert fast == slow
+
+
+def test_fy5253qtr_onoffset_last():
+ # GH#19036
+ offset = FY5253Quarter(n=-2, qtr_with_extra_week=1,
+ startingMonth=7, variation="last", weekday=2)
+ ts = Timestamp('2011-01-26 19:03:40.331096129+0200',
+ tz='Africa/Windhoek')
+ slow = (ts + offset) - offset == ts
+ fast = offset.onOffset(ts)
+ assert fast == slow
diff --git a/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_offsets.py b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_offsets.py
new file mode 100644
index 00000000000..ac395597058
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_offsets.py
@@ -0,0 +1,3143 @@
+from datetime import date, datetime, timedelta
+from distutils.version import LooseVersion
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import (
+ NaT, OutOfBoundsDatetime, Timestamp, conversion, timezones)
+from pandas._libs.tslibs.frequencies import (
+ INVALID_FREQ_ERR_MSG, get_freq_code, get_freq_str)
+import pandas._libs.tslibs.offsets as liboffsets
+import pandas.compat as compat
+from pandas.compat import range
+from pandas.compat.numpy import np_datetime64_compat
+
+from pandas.core.indexes.datetimes import DatetimeIndex, _to_M8, date_range
+from pandas.core.series import Series
+import pandas.util.testing as tm
+
+from pandas.io.pickle import read_pickle
+from pandas.tseries.frequencies import _offset_map, get_offset
+from pandas.tseries.holiday import USFederalHolidayCalendar
+import pandas.tseries.offsets as offsets
+from pandas.tseries.offsets import (
+ FY5253, BDay, BMonthBegin, BMonthEnd, BQuarterBegin, BQuarterEnd,
+ BusinessHour, BYearBegin, BYearEnd, CBMonthBegin, CBMonthEnd, CDay,
+ CustomBusinessHour, DateOffset, Day, Easter, FY5253Quarter,
+ LastWeekOfMonth, MonthBegin, MonthEnd, Nano, QuarterBegin, QuarterEnd,
+ SemiMonthBegin, SemiMonthEnd, Tick, Week, WeekOfMonth, YearBegin, YearEnd)
+
+from .common import assert_offset_equal, assert_onOffset
+
+
+class WeekDay(object):
+ # TODO: Remove: This is not used outside of tests
+ MON = 0
+ TUE = 1
+ WED = 2
+ THU = 3
+ FRI = 4
+ SAT = 5
+ SUN = 6
+
+
+####
+# Misc function tests
+####
+
+
+def test_to_M8():
+ valb = datetime(2007, 10, 1)
+ valu = _to_M8(valb)
+ assert isinstance(valu, np.datetime64)
+
+
+#####
+# DateOffset Tests
+#####
+
+
+class Base(object):
+ _offset = None
+ d = Timestamp(datetime(2008, 1, 2))
+
+ timezones = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern',
+ 'dateutil/Asia/Tokyo', 'dateutil/US/Pacific']
+
+ def _get_offset(self, klass, value=1, normalize=False):
+ # create instance from offset class
+ if klass is FY5253:
+ klass = klass(n=value, startingMonth=1, weekday=1,
+ variation='last', normalize=normalize)
+ elif klass is FY5253Quarter:
+ klass = klass(n=value, startingMonth=1, weekday=1,
+ qtr_with_extra_week=1, variation='last',
+ normalize=normalize)
+ elif klass is LastWeekOfMonth:
+ klass = klass(n=value, weekday=5, normalize=normalize)
+ elif klass is WeekOfMonth:
+ klass = klass(n=value, week=1, weekday=5, normalize=normalize)
+ elif klass is Week:
+ klass = klass(n=value, weekday=5, normalize=normalize)
+ elif klass is DateOffset:
+ klass = klass(days=value, normalize=normalize)
+ else:
+ try:
+ klass = klass(value, normalize=normalize)
+ except Exception:
+ klass = klass(normalize=normalize)
+ return klass
+
+ def test_apply_out_of_range(self, tz_naive_fixture):
+ tz = tz_naive_fixture
+ if self._offset is None:
+ return
+
+ # try to create an out-of-bounds result timestamp; if we can't
+ # create the offset, skip
+ try:
+ if self._offset in (BusinessHour, CustomBusinessHour):
+ # Using 10000 in BusinessHour fails in tz check because of DST
+ # difference
+ offset = self._get_offset(self._offset, value=100000)
+ else:
+ offset = self._get_offset(self._offset, value=10000)
+
+ result = Timestamp('20080101') + offset
+ assert isinstance(result, datetime)
+ assert result.tzinfo is None
+
+ # Check tz is preserved
+ t = Timestamp('20080101', tz=tz)
+ result = t + offset
+ assert isinstance(result, datetime)
+ assert t.tzinfo == result.tzinfo
+
+ except OutOfBoundsDatetime:
+ raise
+ except (ValueError, KeyError):
+ # we are creating an invalid offset
+ # so ignore
+ pass
+
+ def test_offsets_compare_equal(self):
+ # root cause of GH#456: __ne__ was not implemented
+ if self._offset is None:
+ return
+ offset1 = self._offset()
+ offset2 = self._offset()
+ assert not offset1 != offset2
+ assert offset1 == offset2
+
+ def test_rsub(self):
+ if self._offset is None or not hasattr(self, "offset2"):
+ # i.e. skip for TestCommon and YQM subclasses that do not have
+ # offset2 attr
+ return
+ assert self.d - self.offset2 == (-self.offset2).apply(self.d)
+
+ def test_radd(self):
+ if self._offset is None or not hasattr(self, "offset2"):
+ # i.e. skip for TestCommon and YQM subclasses that do not have
+ # offset2 attr
+ return
+ assert self.d + self.offset2 == self.offset2 + self.d
+
+ def test_sub(self):
+ if self._offset is None or not hasattr(self, "offset2"):
+ # i.e. skip for TestCommon and YQM subclasses that do not have
+ # offset2 attr
+ return
+ off = self.offset2
+ with pytest.raises(Exception):
+ off - self.d
+
+ assert 2 * off - off == off
+ assert self.d - self.offset2 == self.d + self._offset(-2)
+ assert self.d - self.offset2 == self.d - (2 * off - off)
+
+ def testMult1(self):
+ if self._offset is None or not hasattr(self, "offset1"):
+ # i.e. skip for TestCommon and YQM subclasses that do not have
+ # offset1 attr
+ return
+ assert self.d + 10 * self.offset1 == self.d + self._offset(10)
+ assert self.d + 5 * self.offset1 == self.d + self._offset(5)
+
+ def testMult2(self):
+ if self._offset is None:
+ return
+ assert self.d + (-5 * self._offset(-10)) == self.d + self._offset(50)
+ assert self.d + (-3 * self._offset(-2)) == self.d + self._offset(6)
+
+ def test_compare_str(self):
+ # GH#23524
+ # comparing to strings that cannot be cast to DateOffsets should
+ # not raise for __eq__ or __ne__
+ if self._offset is None:
+ return
+ off = self._get_offset(self._offset)
+
+ assert not off == "infer"
+ assert off != "foo"
+ # Note: inequalities are only implemented for Tick subclasses;
+ # tests for this are in test_ticks
+
+
+class TestCommon(Base):
+ # expected values created by Base._get_offset
+ # are applied to 2011/01/01 09:00 (Saturday)
+ # and used for .apply and .rollforward
+ expecteds = {'Day': Timestamp('2011-01-02 09:00:00'),
+ 'DateOffset': Timestamp('2011-01-02 09:00:00'),
+ 'BusinessDay': Timestamp('2011-01-03 09:00:00'),
+ 'CustomBusinessDay': Timestamp('2011-01-03 09:00:00'),
+ 'CustomBusinessMonthEnd': Timestamp('2011-01-31 09:00:00'),
+ 'CustomBusinessMonthBegin': Timestamp('2011-01-03 09:00:00'),
+ 'MonthBegin': Timestamp('2011-02-01 09:00:00'),
+ 'BusinessMonthBegin': Timestamp('2011-01-03 09:00:00'),
+ 'MonthEnd': Timestamp('2011-01-31 09:00:00'),
+ 'SemiMonthEnd': Timestamp('2011-01-15 09:00:00'),
+ 'SemiMonthBegin': Timestamp('2011-01-15 09:00:00'),
+ 'BusinessMonthEnd': Timestamp('2011-01-31 09:00:00'),
+ 'YearBegin': Timestamp('2012-01-01 09:00:00'),
+ 'BYearBegin': Timestamp('2011-01-03 09:00:00'),
+ 'YearEnd': Timestamp('2011-12-31 09:00:00'),
+ 'BYearEnd': Timestamp('2011-12-30 09:00:00'),
+ 'QuarterBegin': Timestamp('2011-03-01 09:00:00'),
+ 'BQuarterBegin': Timestamp('2011-03-01 09:00:00'),
+ 'QuarterEnd': Timestamp('2011-03-31 09:00:00'),
+ 'BQuarterEnd': Timestamp('2011-03-31 09:00:00'),
+ 'BusinessHour': Timestamp('2011-01-03 10:00:00'),
+ 'CustomBusinessHour': Timestamp('2011-01-03 10:00:00'),
+ 'WeekOfMonth': Timestamp('2011-01-08 09:00:00'),
+ 'LastWeekOfMonth': Timestamp('2011-01-29 09:00:00'),
+ 'FY5253Quarter': Timestamp('2011-01-25 09:00:00'),
+ 'FY5253': Timestamp('2011-01-25 09:00:00'),
+ 'Week': Timestamp('2011-01-08 09:00:00'),
+ 'Easter': Timestamp('2011-04-24 09:00:00'),
+ 'Hour': Timestamp('2011-01-01 10:00:00'),
+ 'Minute': Timestamp('2011-01-01 09:01:00'),
+ 'Second': Timestamp('2011-01-01 09:00:01'),
+ 'Milli': Timestamp('2011-01-01 09:00:00.001000'),
+ 'Micro': Timestamp('2011-01-01 09:00:00.000001'),
+ 'Nano': Timestamp(np_datetime64_compat(
+ '2011-01-01T09:00:00.000000001Z'))}
+
+ def test_immutable(self, offset_types):
+ # GH#21341 check that __setattr__ raises
+ offset = self._get_offset(offset_types)
+ with pytest.raises(AttributeError):
+ offset.normalize = True
+ with pytest.raises(AttributeError):
+ offset.n = 91
+
+ def test_return_type(self, offset_types):
+ offset = self._get_offset(offset_types)
+
+ # make sure that we are returning a Timestamp
+ result = Timestamp('20080101') + offset
+ assert isinstance(result, Timestamp)
+
+ # make sure that we are returning NaT
+ assert NaT + offset is NaT
+ assert offset + NaT is NaT
+
+ assert NaT - offset is NaT
+ assert (-offset).apply(NaT) is NaT
+
+ def test_offset_n(self, offset_types):
+ offset = self._get_offset(offset_types)
+ assert offset.n == 1
+
+ neg_offset = offset * -1
+ assert neg_offset.n == -1
+
+ mul_offset = offset * 3
+ assert mul_offset.n == 3
+
+ def test_offset_freqstr(self, offset_types):
+ offset = self._get_offset(offset_types)
+
+ freqstr = offset.freqstr
+ if freqstr not in ('<Easter>',
+ "<DateOffset: days=1>",
+ 'LWOM-SAT', ):
+ code = get_offset(freqstr)
+ assert offset.rule_code == code
+
+ def _check_offsetfunc_works(self, offset, funcname, dt, expected,
+ normalize=False):
+
+ if normalize and issubclass(offset, Tick):
+ # normalize=True disallowed for Tick subclasses GH#21427
+ return
+
+ offset_s = self._get_offset(offset, normalize=normalize)
+ func = getattr(offset_s, funcname)
+
+ result = func(dt)
+ assert isinstance(result, Timestamp)
+ assert result == expected
+
+ result = func(Timestamp(dt))
+ assert isinstance(result, Timestamp)
+ assert result == expected
+
+ # see gh-14101
+ exp_warning = None
+ ts = Timestamp(dt) + Nano(5)
+
+ if (offset_s.__class__.__name__ == 'DateOffset' and
+ (funcname == 'apply' or normalize) and
+ ts.nanosecond > 0):
+ exp_warning = UserWarning
+
+ # test nanosecond is preserved
+ with tm.assert_produces_warning(exp_warning,
+ check_stacklevel=False):
+ result = func(ts)
+ assert isinstance(result, Timestamp)
+ if normalize is False:
+ assert result == expected + Nano(5)
+ else:
+ assert result == expected
+
+ if isinstance(dt, np.datetime64):
+ # test tz when input is datetime or Timestamp
+ return
+
+ for tz in self.timezones:
+ expected_localize = expected.tz_localize(tz)
+ tz_obj = timezones.maybe_get_tz(tz)
+ dt_tz = conversion.localize_pydatetime(dt, tz_obj)
+
+ result = func(dt_tz)
+ assert isinstance(result, Timestamp)
+ assert result == expected_localize
+
+ result = func(Timestamp(dt, tz=tz))
+ assert isinstance(result, Timestamp)
+ assert result == expected_localize
+
+ # see gh-14101
+ exp_warning = None
+ ts = Timestamp(dt, tz=tz) + Nano(5)
+
+ if (offset_s.__class__.__name__ == 'DateOffset' and
+ (funcname == 'apply' or normalize) and
+ ts.nanosecond > 0):
+ exp_warning = UserWarning
+
+ # test nanosecond is preserved
+ with tm.assert_produces_warning(exp_warning,
+ check_stacklevel=False):
+ result = func(ts)
+ assert isinstance(result, Timestamp)
+ if normalize is False:
+ assert result == expected_localize + Nano(5)
+ else:
+ assert result == expected_localize
+
+ def test_apply(self, offset_types):
+ sdt = datetime(2011, 1, 1, 9, 0)
+ ndt = np_datetime64_compat('2011-01-01 09:00Z')
+
+ for dt in [sdt, ndt]:
+ expected = self.expecteds[offset_types.__name__]
+ self._check_offsetfunc_works(offset_types, 'apply', dt, expected)
+
+ expected = Timestamp(expected.date())
+ self._check_offsetfunc_works(offset_types, 'apply', dt, expected,
+ normalize=True)
+
+ def test_rollforward(self, offset_types):
+ expecteds = self.expecteds.copy()
+
+ # result will not be changed if the target is on the offset
+ no_changes = ['Day', 'MonthBegin', 'SemiMonthBegin', 'YearBegin',
+ 'Week', 'Hour', 'Minute', 'Second', 'Milli', 'Micro',
+ 'Nano', 'DateOffset']
+ for n in no_changes:
+ expecteds[n] = Timestamp('2011/01/01 09:00')
+
+ expecteds['BusinessHour'] = Timestamp('2011-01-03 09:00:00')
+ expecteds['CustomBusinessHour'] = Timestamp('2011-01-03 09:00:00')
+
+ # but will be changed when normalize=True
+ norm_expected = expecteds.copy()
+ for k in norm_expected:
+ norm_expected[k] = Timestamp(norm_expected[k].date())
+
+ normalized = {'Day': Timestamp('2011-01-02 00:00:00'),
+ 'DateOffset': Timestamp('2011-01-02 00:00:00'),
+ 'MonthBegin': Timestamp('2011-02-01 00:00:00'),
+ 'SemiMonthBegin': Timestamp('2011-01-15 00:00:00'),
+ 'YearBegin': Timestamp('2012-01-01 00:00:00'),
+ 'Week': Timestamp('2011-01-08 00:00:00'),
+ 'Hour': Timestamp('2011-01-01 00:00:00'),
+ 'Minute': Timestamp('2011-01-01 00:00:00'),
+ 'Second': Timestamp('2011-01-01 00:00:00'),
+ 'Milli': Timestamp('2011-01-01 00:00:00'),
+ 'Micro': Timestamp('2011-01-01 00:00:00')}
+ norm_expected.update(normalized)
+
+ sdt = datetime(2011, 1, 1, 9, 0)
+ ndt = np_datetime64_compat('2011-01-01 09:00Z')
+
+ for dt in [sdt, ndt]:
+ expected = expecteds[offset_types.__name__]
+ self._check_offsetfunc_works(offset_types, 'rollforward', dt,
+ expected)
+ expected = norm_expected[offset_types.__name__]
+ self._check_offsetfunc_works(offset_types, 'rollforward', dt,
+ expected, normalize=True)
+
+ def test_rollback(self, offset_types):
+ expecteds = {'BusinessDay': Timestamp('2010-12-31 09:00:00'),
+ 'CustomBusinessDay': Timestamp('2010-12-31 09:00:00'),
+ 'CustomBusinessMonthEnd':
+ Timestamp('2010-12-31 09:00:00'),
+ 'CustomBusinessMonthBegin':
+ Timestamp('2010-12-01 09:00:00'),
+ 'BusinessMonthBegin': Timestamp('2010-12-01 09:00:00'),
+ 'MonthEnd': Timestamp('2010-12-31 09:00:00'),
+ 'SemiMonthEnd': Timestamp('2010-12-31 09:00:00'),
+ 'BusinessMonthEnd': Timestamp('2010-12-31 09:00:00'),
+ 'BYearBegin': Timestamp('2010-01-01 09:00:00'),
+ 'YearEnd': Timestamp('2010-12-31 09:00:00'),
+ 'BYearEnd': Timestamp('2010-12-31 09:00:00'),
+ 'QuarterBegin': Timestamp('2010-12-01 09:00:00'),
+ 'BQuarterBegin': Timestamp('2010-12-01 09:00:00'),
+ 'QuarterEnd': Timestamp('2010-12-31 09:00:00'),
+ 'BQuarterEnd': Timestamp('2010-12-31 09:00:00'),
+ 'BusinessHour': Timestamp('2010-12-31 17:00:00'),
+ 'CustomBusinessHour': Timestamp('2010-12-31 17:00:00'),
+ 'WeekOfMonth': Timestamp('2010-12-11 09:00:00'),
+ 'LastWeekOfMonth': Timestamp('2010-12-25 09:00:00'),
+ 'FY5253Quarter': Timestamp('2010-10-26 09:00:00'),
+ 'FY5253': Timestamp('2010-01-26 09:00:00'),
+ 'Easter': Timestamp('2010-04-04 09:00:00')}
+
+ # result will not be changed if the target is on the offset
+ for n in ['Day', 'MonthBegin', 'SemiMonthBegin', 'YearBegin', 'Week',
+ 'Hour', 'Minute', 'Second', 'Milli', 'Micro', 'Nano',
+ 'DateOffset']:
+ expecteds[n] = Timestamp('2011/01/01 09:00')
+
+ # but will be changed when normalize=True
+ norm_expected = expecteds.copy()
+ for k in norm_expected:
+ norm_expected[k] = Timestamp(norm_expected[k].date())
+
+ normalized = {'Day': Timestamp('2010-12-31 00:00:00'),
+ 'DateOffset': Timestamp('2010-12-31 00:00:00'),
+ 'MonthBegin': Timestamp('2010-12-01 00:00:00'),
+ 'SemiMonthBegin': Timestamp('2010-12-15 00:00:00'),
+ 'YearBegin': Timestamp('2010-01-01 00:00:00'),
+ 'Week': Timestamp('2010-12-25 00:00:00'),
+ 'Hour': Timestamp('2011-01-01 00:00:00'),
+ 'Minute': Timestamp('2011-01-01 00:00:00'),
+ 'Second': Timestamp('2011-01-01 00:00:00'),
+ 'Milli': Timestamp('2011-01-01 00:00:00'),
+ 'Micro': Timestamp('2011-01-01 00:00:00')}
+ norm_expected.update(normalized)
+
+ sdt = datetime(2011, 1, 1, 9, 0)
+ ndt = np_datetime64_compat('2011-01-01 09:00Z')
+
+ for dt in [sdt, ndt]:
+ expected = expecteds[offset_types.__name__]
+ self._check_offsetfunc_works(offset_types, 'rollback', dt,
+ expected)
+
+ expected = norm_expected[offset_types.__name__]
+ self._check_offsetfunc_works(offset_types, 'rollback', dt,
+ expected, normalize=True)
+
+ def test_onOffset(self, offset_types):
+ dt = self.expecteds[offset_types.__name__]
+ offset_s = self._get_offset(offset_types)
+ assert offset_s.onOffset(dt)
+
+ # when normalize=True, onOffset checks that the time is 00:00:00
+ if issubclass(offset_types, Tick):
+ # normalize=True disallowed for Tick subclasses GH#21427
+ return
+ offset_n = self._get_offset(offset_types, normalize=True)
+ assert not offset_n.onOffset(dt)
+
+ if offset_types in (BusinessHour, CustomBusinessHour):
+ # In the default BusinessHour (9:00-17:00), a normalized time
+ # cannot fall within the business-hour range
+ return
+ date = datetime(dt.year, dt.month, dt.day)
+ assert offset_n.onOffset(date)
+
+ def test_add(self, offset_types, tz_naive_fixture):
+ tz = tz_naive_fixture
+ dt = datetime(2011, 1, 1, 9, 0)
+
+ offset_s = self._get_offset(offset_types)
+ expected = self.expecteds[offset_types.__name__]
+
+ result_dt = dt + offset_s
+ result_ts = Timestamp(dt) + offset_s
+ for result in [result_dt, result_ts]:
+ assert isinstance(result, Timestamp)
+ assert result == expected
+
+ expected_localize = expected.tz_localize(tz)
+ result = Timestamp(dt, tz=tz) + offset_s
+ assert isinstance(result, Timestamp)
+ assert result == expected_localize
+
+ # normalize=True, disallowed for Tick subclasses GH#21427
+ if issubclass(offset_types, Tick):
+ return
+ offset_s = self._get_offset(offset_types, normalize=True)
+ expected = Timestamp(expected.date())
+
+ result_dt = dt + offset_s
+ result_ts = Timestamp(dt) + offset_s
+ for result in [result_dt, result_ts]:
+ assert isinstance(result, Timestamp)
+ assert result == expected
+
+ expected_localize = expected.tz_localize(tz)
+ result = Timestamp(dt, tz=tz) + offset_s
+ assert isinstance(result, Timestamp)
+ assert result == expected_localize
+
+ def test_pickle_v0_15_2(self, datapath):
+ offsets = {'DateOffset': DateOffset(years=1),
+ 'MonthBegin': MonthBegin(1),
+ 'Day': Day(1),
+ 'YearBegin': YearBegin(1),
+ 'Week': Week(1)}
+
+ pickle_path = datapath('tseries', 'offsets', 'data',
+ 'dateoffset_0_15_2.pickle')
+ # This code was executed once on v0.15.2 to generate the pickle:
+ # with open(pickle_path, 'wb') as f: pickle.dump(offsets, f)
+ #
+ tm.assert_dict_equal(offsets, read_pickle(pickle_path))
+
+
+class TestDateOffset(Base):
+
+ def setup_method(self, method):
+ self.d = Timestamp(datetime(2008, 1, 2))
+ _offset_map.clear()
+
+ def test_repr(self):
+ repr(DateOffset())
+ repr(DateOffset(2))
+ repr(2 * DateOffset())
+ repr(2 * DateOffset(months=2))
+
+ def test_mul(self):
+ assert DateOffset(2) == 2 * DateOffset(1)
+ assert DateOffset(2) == DateOffset(1) * 2
+
+ def test_constructor(self):
+
+ assert ((self.d + DateOffset(months=2)) == datetime(2008, 3, 2))
+ assert ((self.d - DateOffset(months=2)) == datetime(2007, 11, 2))
+
+ assert ((self.d + DateOffset(2)) == datetime(2008, 1, 4))
+
+ assert not DateOffset(2).isAnchored()
+ assert DateOffset(1).isAnchored()
+
+ d = datetime(2008, 1, 31)
+ assert ((d + DateOffset(months=1)) == datetime(2008, 2, 29))
+
+ def test_copy(self):
+ assert (DateOffset(months=2).copy() == DateOffset(months=2))
+
+ def test_eq(self):
+ offset1 = DateOffset(days=1)
+ offset2 = DateOffset(days=365)
+
+ assert offset1 != offset2
+
+
+class TestBusinessDay(Base):
+ _offset = BDay
+
+ def setup_method(self, method):
+ self.d = datetime(2008, 1, 1)
+
+ self.offset = BDay()
+ self.offset1 = self.offset
+ self.offset2 = BDay(2)
+
+ def test_different_normalize_equals(self):
+ # GH#21404 changed __eq__ to return False when `normalize` doesn't match
+ offset = self._offset()
+ offset2 = self._offset(normalize=True)
+ assert offset != offset2
+
+ def test_repr(self):
+ assert repr(self.offset) == '<BusinessDay>'
+ assert repr(self.offset2) == '<2 * BusinessDays>'
+
+ if compat.PY37:
+ expected = '<BusinessDay: offset=datetime.timedelta(days=1)>'
+ else:
+ expected = '<BusinessDay: offset=datetime.timedelta(1)>'
+ assert repr(self.offset + timedelta(1)) == expected
+
+ def test_with_offset(self):
+ offset = self.offset + timedelta(hours=2)
+
+ assert (self.d + offset) == datetime(2008, 1, 2, 2)
+
+ def test_eq(self):
+ assert self.offset2 == self.offset2
+
+ def test_mul(self):
+ pass
+
+ def test_hash(self):
+ assert hash(self.offset2) == hash(self.offset2)
+
+ def test_call(self):
+ assert self.offset2(self.d) == datetime(2008, 1, 3)
+
+ def testRollback1(self):
+ assert BDay(10).rollback(self.d) == self.d
+
+ def testRollback2(self):
+ assert (BDay(10).rollback(datetime(2008, 1, 5)) ==
+ datetime(2008, 1, 4))
+
+ def testRollforward1(self):
+ assert BDay(10).rollforward(self.d) == self.d
+
+ def testRollforward2(self):
+ assert (BDay(10).rollforward(datetime(2008, 1, 5)) ==
+ datetime(2008, 1, 7))
+
+ def test_roll_date_object(self):
+ offset = BDay()
+
+ dt = date(2012, 9, 15)
+
+ result = offset.rollback(dt)
+ assert result == datetime(2012, 9, 14)
+
+ result = offset.rollforward(dt)
+ assert result == datetime(2012, 9, 17)
+
+ offset = offsets.Day()
+ result = offset.rollback(dt)
+ assert result == datetime(2012, 9, 15)
+
+ result = offset.rollforward(dt)
+ assert result == datetime(2012, 9, 15)
+
+ def test_onOffset(self):
+ tests = [(BDay(), datetime(2008, 1, 1), True),
+ (BDay(), datetime(2008, 1, 5), False)]
+
+ for offset, d, expected in tests:
+ assert_onOffset(offset, d, expected)
+
+ apply_cases = []
+ apply_cases.append((BDay(), {
+ datetime(2008, 1, 1): datetime(2008, 1, 2),
+ datetime(2008, 1, 4): datetime(2008, 1, 7),
+ datetime(2008, 1, 5): datetime(2008, 1, 7),
+ datetime(2008, 1, 6): datetime(2008, 1, 7),
+ datetime(2008, 1, 7): datetime(2008, 1, 8)}))
+
+ apply_cases.append((2 * BDay(), {
+ datetime(2008, 1, 1): datetime(2008, 1, 3),
+ datetime(2008, 1, 4): datetime(2008, 1, 8),
+ datetime(2008, 1, 5): datetime(2008, 1, 8),
+ datetime(2008, 1, 6): datetime(2008, 1, 8),
+ datetime(2008, 1, 7): datetime(2008, 1, 9)}))
+
+ apply_cases.append((-BDay(), {
+ datetime(2008, 1, 1): datetime(2007, 12, 31),
+ datetime(2008, 1, 4): datetime(2008, 1, 3),
+ datetime(2008, 1, 5): datetime(2008, 1, 4),
+ datetime(2008, 1, 6): datetime(2008, 1, 4),
+ datetime(2008, 1, 7): datetime(2008, 1, 4),
+ datetime(2008, 1, 8): datetime(2008, 1, 7)}))
+
+ apply_cases.append((-2 * BDay(), {
+ datetime(2008, 1, 1): datetime(2007, 12, 28),
+ datetime(2008, 1, 4): datetime(2008, 1, 2),
+ datetime(2008, 1, 5): datetime(2008, 1, 3),
+ datetime(2008, 1, 6): datetime(2008, 1, 3),
+ datetime(2008, 1, 7): datetime(2008, 1, 3),
+ datetime(2008, 1, 8): datetime(2008, 1, 4),
+ datetime(2008, 1, 9): datetime(2008, 1, 7)}))
+
+ apply_cases.append((BDay(0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2008, 1, 4): datetime(2008, 1, 4),
+ datetime(2008, 1, 5): datetime(2008, 1, 7),
+ datetime(2008, 1, 6): datetime(2008, 1, 7),
+ datetime(2008, 1, 7): datetime(2008, 1, 7)}))
+
+ @pytest.mark.parametrize('case', apply_cases)
+ def test_apply(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ def test_apply_large_n(self):
+ dt = datetime(2012, 10, 23)
+
+ result = dt + BDay(10)
+ assert result == datetime(2012, 11, 6)
+
+ result = dt + BDay(100) - BDay(100)
+ assert result == dt
+
+ off = BDay() * 6
+ rs = datetime(2012, 1, 1) - off
+ xp = datetime(2011, 12, 23)
+ assert rs == xp
+
+ st = datetime(2011, 12, 18)
+ rs = st + off
+ xp = datetime(2011, 12, 26)
+ assert rs == xp
+
+ off = BDay() * 10
+ rs = datetime(2014, 1, 5) + off # see #5890
+ xp = datetime(2014, 1, 17)
+ assert rs == xp
+
+    def test_apply_corner(self):
+        with pytest.raises(TypeError):
+            BDay().apply(BMonthEnd())
+
+
+class TestBusinessHour(Base):
+ _offset = BusinessHour
+
+ def setup_method(self, method):
+ self.d = datetime(2014, 7, 1, 10, 00)
+
+ self.offset1 = BusinessHour()
+ self.offset2 = BusinessHour(n=3)
+
+ self.offset3 = BusinessHour(n=-1)
+ self.offset4 = BusinessHour(n=-4)
+
+ from datetime import time as dt_time
+ self.offset5 = BusinessHour(start=dt_time(11, 0), end=dt_time(14, 30))
+ self.offset6 = BusinessHour(start='20:00', end='05:00')
+ self.offset7 = BusinessHour(n=-2, start=dt_time(21, 30),
+ end=dt_time(6, 30))
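+        # note: offset6 and offset7 have end < start, i.e. the business-hour
+        # window wraps past midnight into the next calendar day (e.g.
+        # 20:00-05:00); the rollback/rollforward cases below rely on this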
+
+ def test_constructor_errors(self):
+ from datetime import time as dt_time
+ with pytest.raises(ValueError):
+ BusinessHour(start=dt_time(11, 0, 5))
+ with pytest.raises(ValueError):
+ BusinessHour(start='AAA')
+ with pytest.raises(ValueError):
+ BusinessHour(start='14:00:05')
+
+ def test_different_normalize_equals(self):
+        # GH#21404 changed __eq__ to return False when `normalize` doesn't match
+ offset = self._offset()
+ offset2 = self._offset(normalize=True)
+ assert offset != offset2
+
+ def test_repr(self):
+ assert repr(self.offset1) == '<BusinessHour: BH=09:00-17:00>'
+ assert repr(self.offset2) == '<3 * BusinessHours: BH=09:00-17:00>'
+ assert repr(self.offset3) == '<-1 * BusinessHour: BH=09:00-17:00>'
+ assert repr(self.offset4) == '<-4 * BusinessHours: BH=09:00-17:00>'
+
+ assert repr(self.offset5) == '<BusinessHour: BH=11:00-14:30>'
+ assert repr(self.offset6) == '<BusinessHour: BH=20:00-05:00>'
+ assert repr(self.offset7) == '<-2 * BusinessHours: BH=21:30-06:30>'
+
+ def test_with_offset(self):
+ expected = Timestamp('2014-07-01 13:00')
+
+ assert self.d + BusinessHour() * 3 == expected
+ assert self.d + BusinessHour(n=3) == expected
+
+ def test_eq(self):
+ for offset in [self.offset1, self.offset2, self.offset3, self.offset4]:
+ assert offset == offset
+
+ assert BusinessHour() != BusinessHour(-1)
+ assert BusinessHour(start='09:00') == BusinessHour()
+ assert BusinessHour(start='09:00') != BusinessHour(start='09:01')
+ assert (BusinessHour(start='09:00', end='17:00') !=
+ BusinessHour(start='17:00', end='09:01'))
+
+ def test_hash(self):
+ for offset in [self.offset1, self.offset2, self.offset3, self.offset4]:
+ assert hash(offset) == hash(offset)
+
+ def test_call(self):
+ assert self.offset1(self.d) == datetime(2014, 7, 1, 11)
+ assert self.offset2(self.d) == datetime(2014, 7, 1, 13)
+ assert self.offset3(self.d) == datetime(2014, 6, 30, 17)
+ assert self.offset4(self.d) == datetime(2014, 6, 30, 14)
+
+ def test_sub(self):
+        # we have to override test_sub here because self.offset2 is not
+        # defined as self._offset(2)
+ off = self.offset2
+ with pytest.raises(Exception):
+ off - self.d
+ assert 2 * off - off == off
+
+ assert self.d - self.offset2 == self.d + self._offset(-3)
+
+ def testRollback1(self):
+ assert self.offset1.rollback(self.d) == self.d
+ assert self.offset2.rollback(self.d) == self.d
+ assert self.offset3.rollback(self.d) == self.d
+ assert self.offset4.rollback(self.d) == self.d
+ assert self.offset5.rollback(self.d) == datetime(2014, 6, 30, 14, 30)
+ assert self.offset6.rollback(self.d) == datetime(2014, 7, 1, 5, 0)
+ assert self.offset7.rollback(self.d) == datetime(2014, 7, 1, 6, 30)
+
+ d = datetime(2014, 7, 1, 0)
+ assert self.offset1.rollback(d) == datetime(2014, 6, 30, 17)
+ assert self.offset2.rollback(d) == datetime(2014, 6, 30, 17)
+ assert self.offset3.rollback(d) == datetime(2014, 6, 30, 17)
+ assert self.offset4.rollback(d) == datetime(2014, 6, 30, 17)
+ assert self.offset5.rollback(d) == datetime(2014, 6, 30, 14, 30)
+ assert self.offset6.rollback(d) == d
+ assert self.offset7.rollback(d) == d
+
+ assert self._offset(5).rollback(self.d) == self.d
+
+ def testRollback2(self):
+ assert (self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) ==
+ datetime(2014, 7, 4, 17, 0))
+
+ def testRollforward1(self):
+ assert self.offset1.rollforward(self.d) == self.d
+ assert self.offset2.rollforward(self.d) == self.d
+ assert self.offset3.rollforward(self.d) == self.d
+ assert self.offset4.rollforward(self.d) == self.d
+ assert (self.offset5.rollforward(self.d) ==
+ datetime(2014, 7, 1, 11, 0))
+ assert (self.offset6.rollforward(self.d) ==
+ datetime(2014, 7, 1, 20, 0))
+ assert (self.offset7.rollforward(self.d) ==
+ datetime(2014, 7, 1, 21, 30))
+
+ d = datetime(2014, 7, 1, 0)
+ assert self.offset1.rollforward(d) == datetime(2014, 7, 1, 9)
+ assert self.offset2.rollforward(d) == datetime(2014, 7, 1, 9)
+ assert self.offset3.rollforward(d) == datetime(2014, 7, 1, 9)
+ assert self.offset4.rollforward(d) == datetime(2014, 7, 1, 9)
+ assert self.offset5.rollforward(d) == datetime(2014, 7, 1, 11)
+ assert self.offset6.rollforward(d) == d
+ assert self.offset7.rollforward(d) == d
+
+ assert self._offset(5).rollforward(self.d) == self.d
+
+ def testRollforward2(self):
+ assert (self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) ==
+ datetime(2014, 7, 7, 9))
+
+ def test_roll_date_object(self):
+ offset = BusinessHour()
+
+ dt = datetime(2014, 7, 6, 15, 0)
+
+ result = offset.rollback(dt)
+ assert result == datetime(2014, 7, 4, 17)
+
+ result = offset.rollforward(dt)
+ assert result == datetime(2014, 7, 7, 9)
+
+ normalize_cases = []
+ normalize_cases.append((BusinessHour(normalize=True), {
+ datetime(2014, 7, 1, 8): datetime(2014, 7, 1),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 2),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 2),
+ datetime(2014, 7, 1, 23): datetime(2014, 7, 2),
+ datetime(2014, 7, 1, 0): datetime(2014, 7, 1),
+ datetime(2014, 7, 4, 15): datetime(2014, 7, 4),
+ datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4),
+ datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7),
+ datetime(2014, 7, 5, 23): datetime(2014, 7, 7),
+ datetime(2014, 7, 6, 10): datetime(2014, 7, 7)}))
+
+ normalize_cases.append((BusinessHour(-1, normalize=True), {
+ datetime(2014, 7, 1, 8): datetime(2014, 6, 30),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 1),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 1),
+ datetime(2014, 7, 1, 10): datetime(2014, 6, 30),
+ datetime(2014, 7, 1, 0): datetime(2014, 6, 30),
+ datetime(2014, 7, 7, 10): datetime(2014, 7, 4),
+ datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7),
+ datetime(2014, 7, 5, 23): datetime(2014, 7, 4),
+ datetime(2014, 7, 6, 10): datetime(2014, 7, 4)}))
+
+ normalize_cases.append((BusinessHour(1, normalize=True, start='17:00',
+ end='04:00'), {
+ datetime(2014, 7, 1, 8): datetime(2014, 7, 1),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 1),
+ datetime(2014, 7, 1, 23): datetime(2014, 7, 2),
+ datetime(2014, 7, 2, 2): datetime(2014, 7, 2),
+ datetime(2014, 7, 2, 3): datetime(2014, 7, 2),
+ datetime(2014, 7, 4, 23): datetime(2014, 7, 5),
+ datetime(2014, 7, 5, 2): datetime(2014, 7, 5),
+ datetime(2014, 7, 7, 2): datetime(2014, 7, 7),
+ datetime(2014, 7, 7, 17): datetime(2014, 7, 7)}))
+
+ @pytest.mark.parametrize('case', normalize_cases)
+ def test_normalize(self, case):
+ offset, cases = case
+ for dt, expected in compat.iteritems(cases):
+ assert offset.apply(dt) == expected
+
+ on_offset_cases = []
+ on_offset_cases.append((BusinessHour(), {
+ datetime(2014, 7, 1, 9): True,
+ datetime(2014, 7, 1, 8, 59): False,
+ datetime(2014, 7, 1, 8): False,
+ datetime(2014, 7, 1, 17): True,
+ datetime(2014, 7, 1, 17, 1): False,
+ datetime(2014, 7, 1, 18): False,
+ datetime(2014, 7, 5, 9): False,
+ datetime(2014, 7, 6, 12): False}))
+
+ on_offset_cases.append((BusinessHour(start='10:00', end='15:00'), {
+ datetime(2014, 7, 1, 9): False,
+ datetime(2014, 7, 1, 10): True,
+ datetime(2014, 7, 1, 15): True,
+ datetime(2014, 7, 1, 15, 1): False,
+ datetime(2014, 7, 5, 12): False,
+ datetime(2014, 7, 6, 12): False}))
+
+ on_offset_cases.append((BusinessHour(start='19:00', end='05:00'), {
+ datetime(2014, 7, 1, 9, 0): False,
+ datetime(2014, 7, 1, 10, 0): False,
+ datetime(2014, 7, 1, 15): False,
+ datetime(2014, 7, 1, 15, 1): False,
+ datetime(2014, 7, 5, 12, 0): False,
+ datetime(2014, 7, 6, 12, 0): False,
+ datetime(2014, 7, 1, 19, 0): True,
+ datetime(2014, 7, 2, 0, 0): True,
+ datetime(2014, 7, 4, 23): True,
+ datetime(2014, 7, 5, 1): True,
+ datetime(2014, 7, 5, 5, 0): True,
+ datetime(2014, 7, 6, 23, 0): False,
+ datetime(2014, 7, 7, 3, 0): False}))
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, cases = case
+ for dt, expected in compat.iteritems(cases):
+ assert offset.onOffset(dt) == expected
+
+ opening_time_cases = []
+    # the opening time should be affected only by the sign of n, not by
+    # n's magnitude or by `end`
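+    # (e.g. in the first case block below, BusinessHour(), BusinessHour(n=2)
+    # and BusinessHour(n=4, end='15:00') all map datetime(2014, 7, 1, 11) to
+    # the same next opening time, datetime(2014, 7, 2, 9); the negative-n
+    # blocks simply swap the next/previous results)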
+ opening_time_cases.append(([BusinessHour(), BusinessHour(n=2),
+ BusinessHour(n=4), BusinessHour(end='10:00'),
+ BusinessHour(n=2, end='4:00'),
+ BusinessHour(n=4, end='15:00')], {
+ datetime(2014, 7, 1, 11): (datetime(2014, 7, 2, 9),
+ datetime(2014, 7, 1, 9)),
+ datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 9),
+ datetime(2014, 7, 1, 9)),
+ datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 9),
+ datetime(2014, 7, 1, 9)),
+ datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 9),
+ datetime(2014, 7, 1, 9)),
+        # if the timestamp falls exactly on the opening time, the next
+        # opening time is that same timestamp
+ datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9),
+ datetime(2014, 7, 2, 9)),
+ datetime(2014, 7, 2, 10): (datetime(2014, 7, 3, 9),
+ datetime(2014, 7, 2, 9)),
+        # 2014-07-05 is Saturday
+ datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 9),
+ datetime(2014, 7, 4, 9)),
+ datetime(2014, 7, 4, 10): (datetime(2014, 7, 7, 9),
+ datetime(2014, 7, 4, 9)),
+ datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 9),
+ datetime(2014, 7, 4, 9)),
+ datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 9),
+ datetime(2014, 7, 4, 9)),
+ datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 9),
+ datetime(2014, 7, 4, 9)),
+ datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 8, 9),
+ datetime(2014, 7, 7, 9))}))
+
+ opening_time_cases.append(([BusinessHour(start='11:15'),
+ BusinessHour(n=2, start='11:15'),
+ BusinessHour(n=3, start='11:15'),
+ BusinessHour(start='11:15', end='10:00'),
+ BusinessHour(n=2, start='11:15', end='4:00'),
+ BusinessHour(n=3, start='11:15',
+ end='15:00')], {
+ datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 11, 15),
+ datetime(2014, 6, 30, 11, 15)),
+ datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 11, 15),
+ datetime(2014, 7, 1, 11, 15)),
+ datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 11, 15),
+ datetime(2014, 7, 1, 11, 15)),
+ datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 11, 15),
+ datetime(2014, 7, 1, 11, 15)),
+ datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 11, 15),
+ datetime(2014, 7, 1, 11, 15)),
+ datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 11, 15),
+ datetime(2014, 7, 1, 11, 15)),
+ datetime(2014, 7, 2, 11, 15): (datetime(2014, 7, 2, 11, 15),
+ datetime(2014, 7, 2, 11, 15)),
+ datetime(2014, 7, 2, 11, 15, 1): (datetime(2014, 7, 3, 11, 15),
+ datetime(2014, 7, 2, 11, 15)),
+ datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 11, 15),
+ datetime(2014, 7, 4, 11, 15)),
+ datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 11, 15),
+ datetime(2014, 7, 3, 11, 15)),
+ datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 11, 15),
+ datetime(2014, 7, 4, 11, 15)),
+ datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 11, 15),
+ datetime(2014, 7, 4, 11, 15)),
+ datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 11, 15),
+ datetime(2014, 7, 4, 11, 15)),
+ datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 11, 15),
+ datetime(2014, 7, 4, 11, 15))}))
+
+ opening_time_cases.append(([BusinessHour(-1), BusinessHour(n=-2),
+ BusinessHour(n=-4),
+ BusinessHour(n=-1, end='10:00'),
+ BusinessHour(n=-2, end='4:00'),
+ BusinessHour(n=-4, end='15:00')], {
+ datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 9),
+ datetime(2014, 7, 2, 9)),
+ datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 9),
+ datetime(2014, 7, 2, 9)),
+ datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 9),
+ datetime(2014, 7, 2, 9)),
+ datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 9),
+ datetime(2014, 7, 2, 9)),
+ datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9),
+ datetime(2014, 7, 2, 9)),
+ datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 9),
+ datetime(2014, 7, 3, 9)),
+ datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 9),
+ datetime(2014, 7, 7, 9)),
+ datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 9),
+ datetime(2014, 7, 7, 9)),
+ datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 9),
+ datetime(2014, 7, 7, 9)),
+ datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 9),
+ datetime(2014, 7, 7, 9)),
+ datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 9),
+ datetime(2014, 7, 7, 9)),
+ datetime(2014, 7, 7, 9): (datetime(2014, 7, 7, 9),
+ datetime(2014, 7, 7, 9)),
+ datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 9),
+ datetime(2014, 7, 8, 9))}))
+
+ opening_time_cases.append(([BusinessHour(start='17:00', end='05:00'),
+ BusinessHour(n=3, start='17:00',
+ end='03:00')], {
+ datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 17),
+ datetime(2014, 6, 30, 17)),
+ datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 17),
+ datetime(2014, 7, 1, 17)),
+ datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 17),
+ datetime(2014, 7, 1, 17)),
+ datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 17),
+ datetime(2014, 7, 1, 17)),
+ datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 17),
+ datetime(2014, 7, 1, 17)),
+ datetime(2014, 7, 4, 17): (datetime(2014, 7, 4, 17),
+ datetime(2014, 7, 4, 17)),
+ datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 17),
+ datetime(2014, 7, 4, 17)),
+ datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 17),
+ datetime(2014, 7, 3, 17)),
+ datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 17),
+ datetime(2014, 7, 4, 17)),
+ datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 17),
+ datetime(2014, 7, 4, 17)),
+ datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 17),
+ datetime(2014, 7, 4, 17)),
+ datetime(2014, 7, 7, 17, 1): (datetime(2014, 7, 8, 17),
+ datetime(2014, 7, 7, 17)), }))
+
+ opening_time_cases.append(([BusinessHour(-1, start='17:00', end='05:00'),
+ BusinessHour(n=-2, start='17:00',
+ end='03:00')], {
+ datetime(2014, 7, 1, 11): (datetime(2014, 6, 30, 17),
+ datetime(2014, 7, 1, 17)),
+ datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 17),
+ datetime(2014, 7, 2, 17)),
+ datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 17),
+ datetime(2014, 7, 2, 17)),
+ datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 17),
+ datetime(2014, 7, 2, 17)),
+ datetime(2014, 7, 2, 9): (datetime(2014, 7, 1, 17),
+ datetime(2014, 7, 2, 17)),
+ datetime(2014, 7, 2, 16, 59): (datetime(2014, 7, 1, 17),
+ datetime(2014, 7, 2, 17)),
+ datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 17),
+ datetime(2014, 7, 7, 17)),
+ datetime(2014, 7, 4, 10): (datetime(2014, 7, 3, 17),
+ datetime(2014, 7, 4, 17)),
+ datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 17),
+ datetime(2014, 7, 7, 17)),
+ datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 17),
+ datetime(2014, 7, 7, 17)),
+ datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 17),
+ datetime(2014, 7, 7, 17)),
+ datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17),
+ datetime(2014, 7, 8, 17))}))
+
+ @pytest.mark.parametrize('case', opening_time_cases)
+ def test_opening_time(self, case):
+ _offsets, cases = case
+ for offset in _offsets:
+ for dt, (exp_next, exp_prev) in compat.iteritems(cases):
+ assert offset._next_opening_time(dt) == exp_next
+ assert offset._prev_opening_time(dt) == exp_prev
+
+ apply_cases = []
+ apply_cases.append((BusinessHour(), {
+ datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12),
+ datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14),
+ datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16),
+ datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 9),
+ datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 2, 9, 30, 15),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 10),
+ datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 12),
+ # out of business hours
+ datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10),
+ datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10),
+ datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10),
+        # Saturday
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10),
+ datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10),
+ datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30),
+ datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30)}))
+
+ apply_cases.append((BusinessHour(4), {
+ datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15),
+ datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9),
+ datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 11),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 12),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 13),
+ datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 15),
+ datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 13),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13),
+ datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13),
+ datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13),
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13),
+ datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13),
+ datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30),
+ datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30)}))
+
+ apply_cases.append((BusinessHour(-1), {
+ datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 10),
+ datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 12),
+ datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 14),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 15),
+ datetime(2014, 7, 1, 10): datetime(2014, 6, 30, 17),
+ datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 15, 30, 15),
+ datetime(2014, 7, 1, 9, 30, 15): datetime(2014, 6, 30, 16, 30, 15),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 16),
+ datetime(2014, 7, 1, 5): datetime(2014, 6, 30, 16),
+ datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 10),
+ # out of business hours
+ datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 16),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 16),
+ datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 16),
+ datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 16),
+        # Saturday
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 16),
+ datetime(2014, 7, 7, 9): datetime(2014, 7, 4, 16),
+ datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 16, 30),
+ datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 16, 30, 30)}))
+
+ apply_cases.append((BusinessHour(-4), {
+ datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 15),
+ datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17),
+ datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 11),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 12),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13),
+ datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15),
+ datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13),
+ datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 13),
+ datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13),
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13),
+ datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 13),
+ datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 13, 30),
+ datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 13, 30, 30)}))
+
+ apply_cases.append((BusinessHour(start='13:00', end='16:00'), {
+ datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14),
+ datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14),
+ datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 13),
+ datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 14),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 14),
+ datetime(2014, 7, 1, 15, 30, 15): datetime(2014, 7, 2, 13, 30, 15),
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 14),
+ datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14)}))
+
+ apply_cases.append((BusinessHour(n=2, start='13:00', end='16:00'), {
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 15),
+ datetime(2014, 7, 2, 14): datetime(2014, 7, 3, 13),
+ datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15),
+ datetime(2014, 7, 2, 14, 30): datetime(2014, 7, 3, 13, 30),
+ datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15),
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15),
+ datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 15),
+ datetime(2014, 7, 4, 14, 30): datetime(2014, 7, 7, 13, 30),
+ datetime(2014, 7, 4, 14, 30, 30): datetime(2014, 7, 7, 13, 30, 30)}))
+
+ apply_cases.append((BusinessHour(n=-1, start='13:00', end='16:00'), {
+ datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15),
+ datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 15),
+ datetime(2014, 7, 2, 14): datetime(2014, 7, 1, 16),
+ datetime(2014, 7, 2, 15): datetime(2014, 7, 2, 14),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 15),
+ datetime(2014, 7, 2, 16): datetime(2014, 7, 2, 15),
+ datetime(2014, 7, 2, 13, 30, 15): datetime(2014, 7, 1, 15, 30, 15),
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 15),
+ datetime(2014, 7, 7, 11): datetime(2014, 7, 4, 15)}))
+
+ apply_cases.append((BusinessHour(n=-3, start='10:00', end='16:00'), {
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13),
+ datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 11),
+ datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13),
+ datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 16),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13),
+ datetime(2014, 7, 2, 11, 30): datetime(2014, 7, 1, 14, 30),
+ datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13),
+ datetime(2014, 7, 4, 10): datetime(2014, 7, 3, 13),
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13),
+ datetime(2014, 7, 4, 16): datetime(2014, 7, 4, 13),
+ datetime(2014, 7, 4, 12, 30): datetime(2014, 7, 3, 15, 30),
+ datetime(2014, 7, 4, 12, 30, 30): datetime(2014, 7, 3, 15, 30, 30)}))
+
+ apply_cases.append((BusinessHour(start='19:00', end='05:00'), {
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 20),
+ datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 20),
+ datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 20),
+ datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 20),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 20),
+ datetime(2014, 7, 2, 4, 30): datetime(2014, 7, 2, 19, 30),
+ datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 1),
+ datetime(2014, 7, 4, 10): datetime(2014, 7, 4, 20),
+ datetime(2014, 7, 4, 23): datetime(2014, 7, 5, 0),
+ datetime(2014, 7, 5, 0): datetime(2014, 7, 5, 1),
+ datetime(2014, 7, 5, 4): datetime(2014, 7, 7, 19),
+ datetime(2014, 7, 5, 4, 30): datetime(2014, 7, 7, 19, 30),
+ datetime(2014, 7, 5, 4, 30, 30): datetime(2014, 7, 7, 19, 30, 30)}))
+
+ apply_cases.append((BusinessHour(n=-1, start='19:00', end='05:00'), {
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4),
+ datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4),
+ datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4),
+ datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4),
+ datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4),
+ datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30),
+ datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23),
+ datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4),
+ datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22),
+ datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23),
+ datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 3),
+ datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 5, 4, 30),
+ datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 5, 4, 30, 30)}))
+
+ @pytest.mark.parametrize('case', apply_cases)
+ def test_apply(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ apply_large_n_cases = []
+ # A week later
+ apply_large_n_cases.append((BusinessHour(40), {
+ datetime(2014, 7, 1, 11): datetime(2014, 7, 8, 11),
+ datetime(2014, 7, 1, 13): datetime(2014, 7, 8, 13),
+ datetime(2014, 7, 1, 15): datetime(2014, 7, 8, 15),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 8, 16),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 9, 9),
+ datetime(2014, 7, 2, 11): datetime(2014, 7, 9, 11),
+ datetime(2014, 7, 2, 8): datetime(2014, 7, 9, 9),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 10, 9),
+ datetime(2014, 7, 2, 23): datetime(2014, 7, 10, 9),
+ datetime(2014, 7, 3, 0): datetime(2014, 7, 10, 9),
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 9),
+ datetime(2014, 7, 4, 18): datetime(2014, 7, 14, 9),
+ datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 14, 9, 30),
+ datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 14, 9, 30, 30)}))
+
+ # 3 days and 1 hour before
+ apply_large_n_cases.append((BusinessHour(-25), {
+ datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10),
+ datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 12),
+ datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 16),
+ datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 17),
+ datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10),
+ datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 16),
+ datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 16),
+ datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 16),
+ datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 16),
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 16),
+ datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 16),
+ datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 16, 30),
+ datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30)}))
+
+ # 5 days and 3 hours later
+ apply_large_n_cases.append((BusinessHour(28, start='21:00', end='02:00'), {
+ datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0),
+ datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 1),
+ datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21),
+ datetime(2014, 7, 2, 2): datetime(2014, 7, 10, 0),
+ datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0),
+ datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23),
+ datetime(2014, 7, 4, 2): datetime(2014, 7, 12, 0),
+ datetime(2014, 7, 4, 3): datetime(2014, 7, 12, 0),
+ datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23),
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 15, 0),
+ datetime(2014, 7, 6, 18): datetime(2014, 7, 15, 0),
+ datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0),
+ datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30)}))
+
+ @pytest.mark.parametrize('case', apply_large_n_cases)
+ def test_apply_large_n(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ def test_apply_nanoseconds(self):
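+        # business-hour arithmetic should preserve sub-second components:
+        # a Timestamp nudged by Nano(5) keeps that +/-5ns offset after
+        # the shift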
+ tests = []
+
+ tests.append((BusinessHour(),
+ {Timestamp('2014-07-04 15:00') + Nano(5): Timestamp(
+ '2014-07-04 16:00') + Nano(5),
+ Timestamp('2014-07-04 16:00') + Nano(5): Timestamp(
+ '2014-07-07 09:00') + Nano(5),
+ Timestamp('2014-07-04 16:00') - Nano(5): Timestamp(
+ '2014-07-04 17:00') - Nano(5)}))
+
+ tests.append((BusinessHour(-1),
+ {Timestamp('2014-07-04 15:00') + Nano(5): Timestamp(
+ '2014-07-04 14:00') + Nano(5),
+ Timestamp('2014-07-04 10:00') + Nano(5): Timestamp(
+ '2014-07-04 09:00') + Nano(5),
+ Timestamp('2014-07-04 10:00') - Nano(5): Timestamp(
+ '2014-07-03 17:00') - Nano(5), }))
+
+ for offset, cases in tests:
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ def test_datetimeindex(self):
+ idx1 = date_range(start='2014-07-04 15:00', end='2014-07-08 10:00',
+ freq='BH')
+ idx2 = date_range(start='2014-07-04 15:00', periods=12, freq='BH')
+ idx3 = date_range(end='2014-07-08 10:00', periods=12, freq='BH')
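+        # 'BH' is the date_range frequency alias for BusinessHour()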
+ expected = DatetimeIndex(['2014-07-04 15:00', '2014-07-04 16:00',
+ '2014-07-07 09:00',
+ '2014-07-07 10:00', '2014-07-07 11:00',
+ '2014-07-07 12:00',
+ '2014-07-07 13:00', '2014-07-07 14:00',
+ '2014-07-07 15:00',
+ '2014-07-07 16:00', '2014-07-08 09:00',
+ '2014-07-08 10:00'],
+ freq='BH')
+ for idx in [idx1, idx2, idx3]:
+ tm.assert_index_equal(idx, expected)
+
+ idx1 = date_range(start='2014-07-04 15:45', end='2014-07-08 10:45',
+ freq='BH')
+ idx2 = date_range(start='2014-07-04 15:45', periods=12, freq='BH')
+ idx3 = date_range(end='2014-07-08 10:45', periods=12, freq='BH')
+
+ expected = DatetimeIndex(['2014-07-04 15:45', '2014-07-04 16:45',
+ '2014-07-07 09:45',
+ '2014-07-07 10:45', '2014-07-07 11:45',
+ '2014-07-07 12:45',
+ '2014-07-07 13:45', '2014-07-07 14:45',
+ '2014-07-07 15:45',
+ '2014-07-07 16:45', '2014-07-08 09:45',
+ '2014-07-08 10:45'],
+ freq='BH')
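+        # the index constructed above is immediately overwritten below, so
+        # idx2 and idx3 are effectively compared against idx1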
+ expected = idx1
+ for idx in [idx1, idx2, idx3]:
+ tm.assert_index_equal(idx, expected)
+
+
+class TestCustomBusinessHour(Base):
+ _offset = CustomBusinessHour
+ holidays = ['2014-06-27', datetime(2014, 6, 30),
+ np.datetime64('2014-07-02')]
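+    # holidays may be given as ISO strings, datetime objects or
+    # np.datetime64 values interchangeably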
+
+ def setup_method(self, method):
+ # 2014 Calendar to check custom holidays
+ # Sun Mon Tue Wed Thu Fri Sat
+ # 6/22 23 24 25 26 27 28
+ # 29 30 7/1 2 3 4 5
+ # 6 7 8 9 10 11 12
+ self.d = datetime(2014, 7, 1, 10, 00)
+ self.offset1 = CustomBusinessHour(weekmask='Tue Wed Thu Fri')
+
+ self.offset2 = CustomBusinessHour(holidays=self.holidays)
+
+ def test_constructor_errors(self):
+ from datetime import time as dt_time
+ with pytest.raises(ValueError):
+ CustomBusinessHour(start=dt_time(11, 0, 5))
+ with pytest.raises(ValueError):
+ CustomBusinessHour(start='AAA')
+ with pytest.raises(ValueError):
+ CustomBusinessHour(start='14:00:05')
+
+ def test_different_normalize_equals(self):
+        # GH#21404 changed __eq__ to return False when `normalize` doesn't match
+ offset = self._offset()
+ offset2 = self._offset(normalize=True)
+ assert offset != offset2
+
+ def test_repr(self):
+ assert repr(self.offset1) == '<CustomBusinessHour: CBH=09:00-17:00>'
+ assert repr(self.offset2) == '<CustomBusinessHour: CBH=09:00-17:00>'
+
+ def test_with_offset(self):
+ expected = Timestamp('2014-07-01 13:00')
+
+ assert self.d + CustomBusinessHour() * 3 == expected
+ assert self.d + CustomBusinessHour(n=3) == expected
+
+ def test_eq(self):
+ for offset in [self.offset1, self.offset2]:
+ assert offset == offset
+
+ assert CustomBusinessHour() != CustomBusinessHour(-1)
+ assert (CustomBusinessHour(start='09:00') ==
+ CustomBusinessHour())
+ assert (CustomBusinessHour(start='09:00') !=
+ CustomBusinessHour(start='09:01'))
+ assert (CustomBusinessHour(start='09:00', end='17:00') !=
+ CustomBusinessHour(start='17:00', end='09:01'))
+
+ assert (CustomBusinessHour(weekmask='Tue Wed Thu Fri') !=
+ CustomBusinessHour(weekmask='Mon Tue Wed Thu Fri'))
+ assert (CustomBusinessHour(holidays=['2014-06-27']) !=
+ CustomBusinessHour(holidays=['2014-06-28']))
+
+ def test_sub(self):
+ # override the Base.test_sub implementation because self.offset2 is
+ # defined differently in this class than the test expects
+ pass
+
+ def test_hash(self):
+ assert hash(self.offset1) == hash(self.offset1)
+ assert hash(self.offset2) == hash(self.offset2)
+
+ def test_call(self):
+ assert self.offset1(self.d) == datetime(2014, 7, 1, 11)
+ assert self.offset2(self.d) == datetime(2014, 7, 1, 11)
+
+ def testRollback1(self):
+ assert self.offset1.rollback(self.d) == self.d
+ assert self.offset2.rollback(self.d) == self.d
+
+ d = datetime(2014, 7, 1, 0)
+
+        # 2014/07/01 is Tuesday; 06/30 is a Monday, which offset1's
+        # weekmask excludes
+ assert self.offset1.rollback(d) == datetime(2014, 6, 27, 17)
+
+ # 2014/6/30 and 2014/6/27 are holidays
+ assert self.offset2.rollback(d) == datetime(2014, 6, 26, 17)
+
+ def testRollback2(self):
+ assert (self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) ==
+ datetime(2014, 7, 4, 17, 0))
+
+ def testRollforward1(self):
+ assert self.offset1.rollforward(self.d) == self.d
+ assert self.offset2.rollforward(self.d) == self.d
+
+ d = datetime(2014, 7, 1, 0)
+ assert self.offset1.rollforward(d) == datetime(2014, 7, 1, 9)
+ assert self.offset2.rollforward(d) == datetime(2014, 7, 1, 9)
+
+ def testRollforward2(self):
+ assert (self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) ==
+ datetime(2014, 7, 7, 9))
+
+ def test_roll_date_object(self):
+ offset = BusinessHour()
+
+ dt = datetime(2014, 7, 6, 15, 0)
+
+ result = offset.rollback(dt)
+ assert result == datetime(2014, 7, 4, 17)
+
+ result = offset.rollforward(dt)
+ assert result == datetime(2014, 7, 7, 9)
+
+ normalize_cases = []
+ normalize_cases.append((
+ CustomBusinessHour(normalize=True, holidays=holidays),
+ {datetime(2014, 7, 1, 8): datetime(2014, 7, 1),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 3),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 3),
+ datetime(2014, 7, 1, 23): datetime(2014, 7, 3),
+ datetime(2014, 7, 1, 0): datetime(2014, 7, 1),
+ datetime(2014, 7, 4, 15): datetime(2014, 7, 4),
+ datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4),
+ datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7),
+ datetime(2014, 7, 5, 23): datetime(2014, 7, 7),
+ datetime(2014, 7, 6, 10): datetime(2014, 7, 7)}))
+
+ normalize_cases.append((
+ CustomBusinessHour(-1, normalize=True, holidays=holidays),
+ {datetime(2014, 7, 1, 8): datetime(2014, 6, 26),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 1),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 1),
+ datetime(2014, 7, 1, 10): datetime(2014, 6, 26),
+ datetime(2014, 7, 1, 0): datetime(2014, 6, 26),
+ datetime(2014, 7, 7, 10): datetime(2014, 7, 4),
+ datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7),
+ datetime(2014, 7, 5, 23): datetime(2014, 7, 4),
+ datetime(2014, 7, 6, 10): datetime(2014, 7, 4)}))
+
+ normalize_cases.append((
+ CustomBusinessHour(1, normalize=True,
+ start='17:00', end='04:00',
+ holidays=holidays),
+ {datetime(2014, 7, 1, 8): datetime(2014, 7, 1),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 1),
+ datetime(2014, 7, 1, 23): datetime(2014, 7, 2),
+ datetime(2014, 7, 2, 2): datetime(2014, 7, 2),
+ datetime(2014, 7, 2, 3): datetime(2014, 7, 3),
+ datetime(2014, 7, 4, 23): datetime(2014, 7, 5),
+ datetime(2014, 7, 5, 2): datetime(2014, 7, 5),
+ datetime(2014, 7, 7, 2): datetime(2014, 7, 7),
+ datetime(2014, 7, 7, 17): datetime(2014, 7, 7)}))
+
+ @pytest.mark.parametrize('norm_cases', normalize_cases)
+ def test_normalize(self, norm_cases):
+ offset, cases = norm_cases
+ for dt, expected in compat.iteritems(cases):
+ assert offset.apply(dt) == expected
+
+ def test_onOffset(self):
+ tests = []
+
+ tests.append((CustomBusinessHour(start='10:00', end='15:00',
+ holidays=self.holidays),
+ {datetime(2014, 7, 1, 9): False,
+ datetime(2014, 7, 1, 10): True,
+ datetime(2014, 7, 1, 15): True,
+ datetime(2014, 7, 1, 15, 1): False,
+ datetime(2014, 7, 5, 12): False,
+ datetime(2014, 7, 6, 12): False}))
+
+ for offset, cases in tests:
+ for dt, expected in compat.iteritems(cases):
+ assert offset.onOffset(dt) == expected
+
+ apply_cases = []
+ apply_cases.append((
+ CustomBusinessHour(holidays=holidays),
+ {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12),
+ datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14),
+ datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16),
+ datetime(2014, 7, 1, 19): datetime(2014, 7, 3, 10),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 9),
+ datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 3, 9, 30, 15),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 10),
+ datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 10),
+ # out of business hours
+ datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 10),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10),
+ datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10),
+ datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10),
+         # Saturday
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10),
+ datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10),
+ datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30),
+ datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30)}))
+
+ apply_cases.append((
+ CustomBusinessHour(4, holidays=holidays),
+ {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15),
+ datetime(2014, 7, 1, 13): datetime(2014, 7, 3, 9),
+ datetime(2014, 7, 1, 15): datetime(2014, 7, 3, 11),
+ datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 12),
+ datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 13),
+ datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 13),
+ datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 13),
+ datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13),
+ datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13),
+ datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13),
+ datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13),
+ datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13),
+ datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30),
+ datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30)}))
+
+ @pytest.mark.parametrize('apply_case', apply_cases)
+ def test_apply(self, apply_case):
+ offset, cases = apply_case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ nano_cases = []
+ nano_cases.append(
+ (CustomBusinessHour(holidays=holidays),
+ {Timestamp('2014-07-01 15:00') + Nano(5):
+ Timestamp('2014-07-01 16:00') + Nano(5),
+ Timestamp('2014-07-01 16:00') + Nano(5):
+ Timestamp('2014-07-03 09:00') + Nano(5),
+ Timestamp('2014-07-01 16:00') - Nano(5):
+ Timestamp('2014-07-01 17:00') - Nano(5)}))
+
+ nano_cases.append(
+ (CustomBusinessHour(-1, holidays=holidays),
+ {Timestamp('2014-07-01 15:00') + Nano(5):
+ Timestamp('2014-07-01 14:00') + Nano(5),
+ Timestamp('2014-07-01 10:00') + Nano(5):
+ Timestamp('2014-07-01 09:00') + Nano(5),
+ Timestamp('2014-07-01 10:00') - Nano(5):
+ Timestamp('2014-06-26 17:00') - Nano(5)}))
+
+ @pytest.mark.parametrize('nano_case', nano_cases)
+ def test_apply_nanoseconds(self, nano_case):
+ offset, cases = nano_case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+
+class TestCustomBusinessDay(Base):
+ _offset = CDay
+
+ def setup_method(self, method):
+ self.d = datetime(2008, 1, 1)
+ self.nd = np_datetime64_compat('2008-01-01 00:00:00Z')
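+        # np.datetime64 input; test_call below checks that CDay handles it
+        # the same way as a plain datetime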
+
+ self.offset = CDay()
+ self.offset1 = self.offset
+ self.offset2 = CDay(2)
+
+ def test_different_normalize_equals(self):
+        # GH#21404 changed __eq__ to return False when `normalize` doesn't match
+ offset = self._offset()
+ offset2 = self._offset(normalize=True)
+ assert offset != offset2
+
+ def test_repr(self):
+ assert repr(self.offset) == '<CustomBusinessDay>'
+ assert repr(self.offset2) == '<2 * CustomBusinessDays>'
+
+ if compat.PY37:
+ expected = '<BusinessDay: offset=datetime.timedelta(days=1)>'
+ else:
+ expected = '<BusinessDay: offset=datetime.timedelta(1)>'
+ assert repr(self.offset + timedelta(1)) == expected
+
+ def test_with_offset(self):
+ offset = self.offset + timedelta(hours=2)
+
+ assert (self.d + offset) == datetime(2008, 1, 2, 2)
+
+ def test_eq(self):
+ assert self.offset2 == self.offset2
+
+ def test_mul(self):
+ pass
+
+ def test_hash(self):
+ assert hash(self.offset2) == hash(self.offset2)
+
+ def test_call(self):
+ assert self.offset2(self.d) == datetime(2008, 1, 3)
+ assert self.offset2(self.nd) == datetime(2008, 1, 3)
+
+ def testRollback1(self):
+ assert CDay(10).rollback(self.d) == self.d
+
+ def testRollback2(self):
+ assert (CDay(10).rollback(datetime(2008, 1, 5)) ==
+ datetime(2008, 1, 4))
+
+ def testRollforward1(self):
+ assert CDay(10).rollforward(self.d) == self.d
+
+ def testRollforward2(self):
+ assert (CDay(10).rollforward(datetime(2008, 1, 5)) ==
+ datetime(2008, 1, 7))
+
+ def test_roll_date_object(self):
+ offset = CDay()
+
+ dt = date(2012, 9, 15)
+
+ result = offset.rollback(dt)
+ assert result == datetime(2012, 9, 14)
+
+ result = offset.rollforward(dt)
+ assert result == datetime(2012, 9, 17)
+
+ offset = offsets.Day()
+ result = offset.rollback(dt)
+ assert result == datetime(2012, 9, 15)
+
+ result = offset.rollforward(dt)
+ assert result == datetime(2012, 9, 15)
+
+ on_offset_cases = [(CDay(), datetime(2008, 1, 1), True),
+ (CDay(), datetime(2008, 1, 5), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, d, expected = case
+ assert_onOffset(offset, d, expected)
+
+ apply_cases = []
+ apply_cases.append((CDay(), {
+ datetime(2008, 1, 1): datetime(2008, 1, 2),
+ datetime(2008, 1, 4): datetime(2008, 1, 7),
+ datetime(2008, 1, 5): datetime(2008, 1, 7),
+ datetime(2008, 1, 6): datetime(2008, 1, 7),
+ datetime(2008, 1, 7): datetime(2008, 1, 8)}))
+
+ apply_cases.append((2 * CDay(), {
+ datetime(2008, 1, 1): datetime(2008, 1, 3),
+ datetime(2008, 1, 4): datetime(2008, 1, 8),
+ datetime(2008, 1, 5): datetime(2008, 1, 8),
+ datetime(2008, 1, 6): datetime(2008, 1, 8),
+ datetime(2008, 1, 7): datetime(2008, 1, 9)}))
+
+ apply_cases.append((-CDay(), {
+ datetime(2008, 1, 1): datetime(2007, 12, 31),
+ datetime(2008, 1, 4): datetime(2008, 1, 3),
+ datetime(2008, 1, 5): datetime(2008, 1, 4),
+ datetime(2008, 1, 6): datetime(2008, 1, 4),
+ datetime(2008, 1, 7): datetime(2008, 1, 4),
+ datetime(2008, 1, 8): datetime(2008, 1, 7)}))
+
+ apply_cases.append((-2 * CDay(), {
+ datetime(2008, 1, 1): datetime(2007, 12, 28),
+ datetime(2008, 1, 4): datetime(2008, 1, 2),
+ datetime(2008, 1, 5): datetime(2008, 1, 3),
+ datetime(2008, 1, 6): datetime(2008, 1, 3),
+ datetime(2008, 1, 7): datetime(2008, 1, 3),
+ datetime(2008, 1, 8): datetime(2008, 1, 4),
+ datetime(2008, 1, 9): datetime(2008, 1, 7)}))
+
+ apply_cases.append((CDay(0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2008, 1, 4): datetime(2008, 1, 4),
+ datetime(2008, 1, 5): datetime(2008, 1, 7),
+ datetime(2008, 1, 6): datetime(2008, 1, 7),
+ datetime(2008, 1, 7): datetime(2008, 1, 7)}))
+
+ @pytest.mark.parametrize('case', apply_cases)
+ def test_apply(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ def test_apply_large_n(self):
+ dt = datetime(2012, 10, 23)
+
+ result = dt + CDay(10)
+ assert result == datetime(2012, 11, 6)
+
+ result = dt + CDay(100) - CDay(100)
+ assert result == dt
+
+ off = CDay() * 6
+ rs = datetime(2012, 1, 1) - off
+ xp = datetime(2011, 12, 23)
+ assert rs == xp
+
+ st = datetime(2011, 12, 18)
+ rs = st + off
+ xp = datetime(2011, 12, 26)
+ assert rs == xp
+
+    def test_apply_corner(self):
+        with pytest.raises(Exception):
+            CDay().apply(BMonthEnd())
+
+ def test_holidays(self):
+ # Define a TradingDay offset
+ holidays = ['2012-05-01', datetime(2013, 5, 1),
+ np.datetime64('2014-05-01')]
+ tday = CDay(holidays=holidays)
+ for year in range(2012, 2015):
+ dt = datetime(year, 4, 30)
+ xp = datetime(year, 5, 2)
+ rs = dt + tday
+ assert rs == xp
+
+ def test_weekmask(self):
+ weekmask_saudi = 'Sat Sun Mon Tue Wed' # Thu-Fri Weekend
+ weekmask_uae = '1111001' # Fri-Sat Weekend
+ weekmask_egypt = [1, 1, 1, 1, 0, 0, 1] # Fri-Sat Weekend
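+        # three equivalent weekmask spellings: a day-name string, a
+        # '0'/'1' string, and a list of 0/1 ints, all ordered Monday
+        # through Sunday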
+ bday_saudi = CDay(weekmask=weekmask_saudi)
+ bday_uae = CDay(weekmask=weekmask_uae)
+ bday_egypt = CDay(weekmask=weekmask_egypt)
+ dt = datetime(2013, 5, 1)
+ xp_saudi = datetime(2013, 5, 4)
+ xp_uae = datetime(2013, 5, 2)
+ xp_egypt = datetime(2013, 5, 2)
+ assert xp_saudi == dt + bday_saudi
+ assert xp_uae == dt + bday_uae
+ assert xp_egypt == dt + bday_egypt
+ xp2 = datetime(2013, 5, 5)
+ assert xp2 == dt + 2 * bday_saudi
+ assert xp2 == dt + 2 * bday_uae
+ assert xp2 == dt + 2 * bday_egypt
+
+ def test_weekmask_and_holidays(self):
+ weekmask_egypt = 'Sun Mon Tue Wed Thu' # Fri-Sat Weekend
+ holidays = ['2012-05-01', datetime(2013, 5, 1),
+ np.datetime64('2014-05-01')]
+ bday_egypt = CDay(holidays=holidays, weekmask=weekmask_egypt)
+ dt = datetime(2013, 4, 30)
+ xp_egypt = datetime(2013, 5, 5)
+ assert xp_egypt == dt + 2 * bday_egypt
+
+ @pytest.mark.filterwarnings("ignore:Non:pandas.errors.PerformanceWarning")
+ def test_calendar(self):
+ calendar = USFederalHolidayCalendar()
+ dt = datetime(2014, 1, 17)
+ assert_offset_equal(CDay(calendar=calendar), dt, datetime(2014, 1, 21))
+
+ def test_roundtrip_pickle(self):
+ def _check_roundtrip(obj):
+ unpickled = tm.round_trip_pickle(obj)
+ assert unpickled == obj
+
+ _check_roundtrip(self.offset)
+ _check_roundtrip(self.offset2)
+ _check_roundtrip(self.offset * 2)
+
+ def test_pickle_compat_0_14_1(self, datapath):
+        hdays = [datetime(2013, 1, 1) for _ in range(4)]
+ pth = datapath('tseries', 'offsets', 'data', 'cday-0.14.1.pickle')
+ cday0_14_1 = read_pickle(pth)
+ cday = CDay(holidays=hdays)
+ assert cday == cday0_14_1
+
+
+class CustomBusinessMonthBase(object):
+
+ def setup_method(self, method):
+ self.d = datetime(2008, 1, 1)
+
+ self.offset = self._offset()
+ self.offset1 = self.offset
+ self.offset2 = self._offset(2)
+
+ def test_eq(self):
+ assert self.offset2 == self.offset2
+
+ def test_mul(self):
+ pass
+
+ def test_hash(self):
+ assert hash(self.offset2) == hash(self.offset2)
+
+ def test_roundtrip_pickle(self):
+ def _check_roundtrip(obj):
+ unpickled = tm.round_trip_pickle(obj)
+ assert unpickled == obj
+
+ _check_roundtrip(self._offset())
+ _check_roundtrip(self._offset(2))
+ _check_roundtrip(self._offset() * 2)
+
+ def test_copy(self):
+ # GH 17452
+ off = self._offset(weekmask='Mon Wed Fri')
+ assert off == off.copy()
+
+
+class TestCustomBusinessMonthEnd(CustomBusinessMonthBase, Base):
+ _offset = CBMonthEnd
+
+ def test_different_normalize_equals(self):
+        # GH#21404 changed __eq__ to return False when `normalize` doesn't match
+ offset = self._offset()
+ offset2 = self._offset(normalize=True)
+ assert offset != offset2
+
+ def test_repr(self):
+ assert repr(self.offset) == '<CustomBusinessMonthEnd>'
+ assert repr(self.offset2) == '<2 * CustomBusinessMonthEnds>'
+
+ def testCall(self):
+ assert self.offset2(self.d) == datetime(2008, 2, 29)
+
+ def testRollback1(self):
+ assert (CDay(10).rollback(datetime(2007, 12, 31)) ==
+ datetime(2007, 12, 31))
+
+ def testRollback2(self):
+ assert CBMonthEnd(10).rollback(self.d) == datetime(2007, 12, 31)
+
+ def testRollforward1(self):
+ assert CBMonthEnd(10).rollforward(self.d) == datetime(2008, 1, 31)
+
+ def test_roll_date_object(self):
+ offset = CBMonthEnd()
+
+ dt = date(2012, 9, 15)
+
+ result = offset.rollback(dt)
+ assert result == datetime(2012, 8, 31)
+
+ result = offset.rollforward(dt)
+ assert result == datetime(2012, 9, 28)
+
+ offset = offsets.Day()
+ result = offset.rollback(dt)
+ assert result == datetime(2012, 9, 15)
+
+ result = offset.rollforward(dt)
+ assert result == datetime(2012, 9, 15)
+
+ on_offset_cases = [(CBMonthEnd(), datetime(2008, 1, 31), True),
+ (CBMonthEnd(), datetime(2008, 1, 1), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, d, expected = case
+ assert_onOffset(offset, d, expected)
+
+ apply_cases = []
+ apply_cases.append((CBMonthEnd(), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 2, 7): datetime(2008, 2, 29)}))
+
+ apply_cases.append((2 * CBMonthEnd(), {
+ datetime(2008, 1, 1): datetime(2008, 2, 29),
+ datetime(2008, 2, 7): datetime(2008, 3, 31)}))
+
+ apply_cases.append((-CBMonthEnd(), {
+ datetime(2008, 1, 1): datetime(2007, 12, 31),
+ datetime(2008, 2, 8): datetime(2008, 1, 31)}))
+
+ apply_cases.append((-2 * CBMonthEnd(), {
+ datetime(2008, 1, 1): datetime(2007, 11, 30),
+ datetime(2008, 2, 9): datetime(2007, 12, 31)}))
+
+ apply_cases.append((CBMonthEnd(0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 2, 7): datetime(2008, 2, 29)}))
+
+ @pytest.mark.parametrize('case', apply_cases)
+ def test_apply(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ def test_apply_large_n(self):
+ dt = datetime(2012, 10, 23)
+
+ result = dt + CBMonthEnd(10)
+ assert result == datetime(2013, 7, 31)
+
+ result = dt + CDay(100) - CDay(100)
+ assert result == dt
+
+ off = CBMonthEnd() * 6
+ rs = datetime(2012, 1, 1) - off
+ xp = datetime(2011, 7, 29)
+ assert rs == xp
+
+ st = datetime(2011, 12, 18)
+ rs = st + off
+ xp = datetime(2012, 5, 31)
+ assert rs == xp
+
+ def test_holidays(self):
+        # Define a custom business-month-end offset around holidays
+ holidays = ['2012-01-31', datetime(2012, 2, 28),
+ np.datetime64('2012-02-29')]
+ bm_offset = CBMonthEnd(holidays=holidays)
+ dt = datetime(2012, 1, 1)
+ assert dt + bm_offset == datetime(2012, 1, 30)
+ assert dt + 2 * bm_offset == datetime(2012, 2, 27)
+
+ @pytest.mark.filterwarnings("ignore:Non:pandas.errors.PerformanceWarning")
+ def test_datetimeindex(self):
+ from pandas.tseries.holiday import USFederalHolidayCalendar
+ hcal = USFederalHolidayCalendar()
+ freq = CBMonthEnd(calendar=hcal)
+
+ assert (date_range(start='20120101', end='20130101',
+ freq=freq).tolist()[0] == datetime(2012, 1, 31))
+
+
+class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base):
+ _offset = CBMonthBegin
+
+ def test_different_normalize_equals(self):
+        # GH#21404 changed __eq__ to return False when `normalize` doesn't match
+ offset = self._offset()
+ offset2 = self._offset(normalize=True)
+ assert offset != offset2
+
+ def test_repr(self):
+ assert repr(self.offset) == '<CustomBusinessMonthBegin>'
+ assert repr(self.offset2) == '<2 * CustomBusinessMonthBegins>'
+
+ def testCall(self):
+ assert self.offset2(self.d) == datetime(2008, 3, 3)
+
+ def testRollback1(self):
+ assert (CDay(10).rollback(datetime(2007, 12, 31)) ==
+ datetime(2007, 12, 31))
+
+ def testRollback2(self):
+ assert CBMonthBegin(10).rollback(self.d) == datetime(2008, 1, 1)
+
+ def testRollforward1(self):
+ assert CBMonthBegin(10).rollforward(self.d) == datetime(2008, 1, 1)
+
+ def test_roll_date_object(self):
+ offset = CBMonthBegin()
+
+ dt = date(2012, 9, 15)
+
+ result = offset.rollback(dt)
+ assert result == datetime(2012, 9, 3)
+
+ result = offset.rollforward(dt)
+ assert result == datetime(2012, 10, 1)
+
+ offset = offsets.Day()
+ result = offset.rollback(dt)
+ assert result == datetime(2012, 9, 15)
+
+ result = offset.rollforward(dt)
+ assert result == datetime(2012, 9, 15)
+
+ on_offset_cases = [(CBMonthBegin(), datetime(2008, 1, 1), True),
+ (CBMonthBegin(), datetime(2008, 1, 31), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+ apply_cases = []
+ apply_cases.append((CBMonthBegin(), {
+ datetime(2008, 1, 1): datetime(2008, 2, 1),
+ datetime(2008, 2, 7): datetime(2008, 3, 3)}))
+
+ apply_cases.append((2 * CBMonthBegin(), {
+ datetime(2008, 1, 1): datetime(2008, 3, 3),
+ datetime(2008, 2, 7): datetime(2008, 4, 1)}))
+
+ apply_cases.append((-CBMonthBegin(), {
+ datetime(2008, 1, 1): datetime(2007, 12, 3),
+ datetime(2008, 2, 8): datetime(2008, 2, 1)}))
+
+ apply_cases.append((-2 * CBMonthBegin(), {
+ datetime(2008, 1, 1): datetime(2007, 11, 1),
+ datetime(2008, 2, 9): datetime(2008, 1, 1)}))
+
+ apply_cases.append((CBMonthBegin(0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2008, 1, 7): datetime(2008, 2, 1)}))
+
+ @pytest.mark.parametrize('case', apply_cases)
+ def test_apply(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ def test_apply_large_n(self):
+ dt = datetime(2012, 10, 23)
+
+ result = dt + CBMonthBegin(10)
+ assert result == datetime(2013, 8, 1)
+
+ result = dt + CDay(100) - CDay(100)
+ assert result == dt
+
+ off = CBMonthBegin() * 6
+ rs = datetime(2012, 1, 1) - off
+ xp = datetime(2011, 7, 1)
+ assert rs == xp
+
+ st = datetime(2011, 12, 18)
+ rs = st + off
+
+ xp = datetime(2012, 6, 1)
+ assert rs == xp
+
+ def test_holidays(self):
+        # Define a custom business-month-begin offset around holidays
+ holidays = ['2012-02-01', datetime(2012, 2, 2),
+ np.datetime64('2012-03-01')]
+ bm_offset = CBMonthBegin(holidays=holidays)
+ dt = datetime(2012, 1, 1)
+
+ assert dt + bm_offset == datetime(2012, 1, 2)
+ assert dt + 2 * bm_offset == datetime(2012, 2, 3)
+
+ @pytest.mark.filterwarnings("ignore:Non:pandas.errors.PerformanceWarning")
+ def test_datetimeindex(self):
+ hcal = USFederalHolidayCalendar()
+ cbmb = CBMonthBegin(calendar=hcal)
+ assert (date_range(start='20120101', end='20130101',
+ freq=cbmb).tolist()[0] == datetime(2012, 1, 3))
+
+
+class TestWeek(Base):
+ _offset = Week
+ d = Timestamp(datetime(2008, 1, 2))
+ offset1 = _offset()
+ offset2 = _offset(2)
+
+ def test_repr(self):
+ assert repr(Week(weekday=0)) == "<Week: weekday=0>"
+ assert repr(Week(n=-1, weekday=0)) == "<-1 * Week: weekday=0>"
+ assert repr(Week(n=-2, weekday=0)) == "<-2 * Weeks: weekday=0>"
+
+ def test_corner(self):
+ with pytest.raises(ValueError):
+ Week(weekday=7)
+
+ with pytest.raises(ValueError, match="Day must be"):
+ Week(weekday=-1)
+
+ def test_isAnchored(self):
+ assert Week(weekday=0).isAnchored()
+ assert not Week().isAnchored()
+ assert not Week(2, weekday=2).isAnchored()
+ assert not Week(2).isAnchored()
+
+ offset_cases = []
+ # not business week
+ offset_cases.append((Week(), {
+ datetime(2008, 1, 1): datetime(2008, 1, 8),
+ datetime(2008, 1, 4): datetime(2008, 1, 11),
+ datetime(2008, 1, 5): datetime(2008, 1, 12),
+ datetime(2008, 1, 6): datetime(2008, 1, 13),
+ datetime(2008, 1, 7): datetime(2008, 1, 14)}))
+
+ # Mon
+ offset_cases.append((Week(weekday=0), {
+ datetime(2007, 12, 31): datetime(2008, 1, 7),
+ datetime(2008, 1, 4): datetime(2008, 1, 7),
+ datetime(2008, 1, 5): datetime(2008, 1, 7),
+ datetime(2008, 1, 6): datetime(2008, 1, 7),
+ datetime(2008, 1, 7): datetime(2008, 1, 14)}))
+
+ # n=0 -> roll forward. Mon
+ offset_cases.append((Week(0, weekday=0), {
+ datetime(2007, 12, 31): datetime(2007, 12, 31),
+ datetime(2008, 1, 4): datetime(2008, 1, 7),
+ datetime(2008, 1, 5): datetime(2008, 1, 7),
+ datetime(2008, 1, 6): datetime(2008, 1, 7),
+ datetime(2008, 1, 7): datetime(2008, 1, 7)}))
+
+    # n=-2 -> roll back two Tuesdays
+ offset_cases.append((Week(-2, weekday=1), {
+ datetime(2010, 4, 6): datetime(2010, 3, 23),
+ datetime(2010, 4, 8): datetime(2010, 3, 30),
+ datetime(2010, 4, 5): datetime(2010, 3, 23)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ @pytest.mark.parametrize('weekday', range(7))
+ def test_onOffset(self, weekday):
+ offset = Week(weekday=weekday)
+
+ for day in range(1, 8):
+ date = datetime(2008, 1, day)
+
+            expected = (day % 7 == weekday)
+ assert_onOffset(offset, date, expected)
+
+
+class TestWeekOfMonth(Base):
+ _offset = WeekOfMonth
+ offset1 = _offset()
+ offset2 = _offset(2)
+
+ def test_constructor(self):
+ with pytest.raises(ValueError, match="^Week"):
+ WeekOfMonth(n=1, week=4, weekday=0)
+
+ with pytest.raises(ValueError, match="^Week"):
+ WeekOfMonth(n=1, week=-1, weekday=0)
+
+ with pytest.raises(ValueError, match="^Day"):
+ WeekOfMonth(n=1, week=0, weekday=-1)
+
+ with pytest.raises(ValueError, match="^Day"):
+ WeekOfMonth(n=1, week=0, weekday=-7)
+
+ def test_repr(self):
+ assert (repr(WeekOfMonth(weekday=1, week=2)) ==
+ "<WeekOfMonth: week=2, weekday=1>")
+
+ def test_offset(self):
+ date1 = datetime(2011, 1, 4) # 1st Tuesday of Month
+ date2 = datetime(2011, 1, 11) # 2nd Tuesday of Month
+ date3 = datetime(2011, 1, 18) # 3rd Tuesday of Month
+ date4 = datetime(2011, 1, 25) # 4th Tuesday of Month
+
+        # each case is (n, week, weekday, dt, expected); e.g. week=2,
+        # weekday=1 targets the 3rd Tuesday of the month (week is 0-indexed)
+ test_cases = [
+ (-2, 2, 1, date1, datetime(2010, 11, 16)),
+ (-2, 2, 1, date2, datetime(2010, 11, 16)),
+ (-2, 2, 1, date3, datetime(2010, 11, 16)),
+ (-2, 2, 1, date4, datetime(2010, 12, 21)),
+
+ (-1, 2, 1, date1, datetime(2010, 12, 21)),
+ (-1, 2, 1, date2, datetime(2010, 12, 21)),
+ (-1, 2, 1, date3, datetime(2010, 12, 21)),
+ (-1, 2, 1, date4, datetime(2011, 1, 18)),
+
+ (0, 0, 1, date1, datetime(2011, 1, 4)),
+ (0, 0, 1, date2, datetime(2011, 2, 1)),
+ (0, 0, 1, date3, datetime(2011, 2, 1)),
+ (0, 0, 1, date4, datetime(2011, 2, 1)),
+ (0, 1, 1, date1, datetime(2011, 1, 11)),
+ (0, 1, 1, date2, datetime(2011, 1, 11)),
+ (0, 1, 1, date3, datetime(2011, 2, 8)),
+ (0, 1, 1, date4, datetime(2011, 2, 8)),
+ (0, 0, 1, date1, datetime(2011, 1, 4)),
+ (0, 1, 1, date2, datetime(2011, 1, 11)),
+ (0, 2, 1, date3, datetime(2011, 1, 18)),
+ (0, 3, 1, date4, datetime(2011, 1, 25)),
+
+ (1, 0, 0, date1, datetime(2011, 2, 7)),
+ (1, 0, 0, date2, datetime(2011, 2, 7)),
+ (1, 0, 0, date3, datetime(2011, 2, 7)),
+ (1, 0, 0, date4, datetime(2011, 2, 7)),
+ (1, 0, 1, date1, datetime(2011, 2, 1)),
+ (1, 0, 1, date2, datetime(2011, 2, 1)),
+ (1, 0, 1, date3, datetime(2011, 2, 1)),
+ (1, 0, 1, date4, datetime(2011, 2, 1)),
+ (1, 0, 2, date1, datetime(2011, 1, 5)),
+ (1, 0, 2, date2, datetime(2011, 2, 2)),
+ (1, 0, 2, date3, datetime(2011, 2, 2)),
+ (1, 0, 2, date4, datetime(2011, 2, 2)),
+
+ (1, 2, 1, date1, datetime(2011, 1, 18)),
+ (1, 2, 1, date2, datetime(2011, 1, 18)),
+ (1, 2, 1, date3, datetime(2011, 2, 15)),
+ (1, 2, 1, date4, datetime(2011, 2, 15)),
+
+ (2, 2, 1, date1, datetime(2011, 2, 15)),
+ (2, 2, 1, date2, datetime(2011, 2, 15)),
+ (2, 2, 1, date3, datetime(2011, 3, 15)),
+ (2, 2, 1, date4, datetime(2011, 3, 15))]
+
+ for n, week, weekday, dt, expected in test_cases:
+ offset = WeekOfMonth(n, week=week, weekday=weekday)
+ assert_offset_equal(offset, dt, expected)
+
+ # try subtracting
+ result = datetime(2011, 2, 1) - WeekOfMonth(week=1, weekday=2)
+ assert result == datetime(2011, 1, 12)
+
+ result = datetime(2011, 2, 3) - WeekOfMonth(week=0, weekday=2)
+ assert result == datetime(2011, 2, 2)
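+
+        # (WeekOfMonth(week=1, weekday=2) anchors on the second Wednesday
+        # of the month, so stepping back from 2011-02-01 lands on
+        # 2011-01-12; week=0 anchors on the first Wednesday.)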
+
+ on_offset_cases = [(0, 0, datetime(2011, 2, 7), True),
+ (0, 0, datetime(2011, 2, 6), False),
+ (0, 0, datetime(2011, 2, 14), False),
+ (1, 0, datetime(2011, 2, 14), True),
+ (0, 1, datetime(2011, 2, 1), True),
+ (0, 1, datetime(2011, 2, 8), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ week, weekday, dt, expected = case
+ offset = WeekOfMonth(week=week, weekday=weekday)
+ assert offset.onOffset(dt) == expected
+
+
+class TestLastWeekOfMonth(Base):
+ _offset = LastWeekOfMonth
+ offset1 = _offset()
+ offset2 = _offset(2)
+
+ def test_constructor(self):
+ with pytest.raises(ValueError, match="^N cannot be 0"):
+ LastWeekOfMonth(n=0, weekday=1)
+
+ with pytest.raises(ValueError, match="^Day"):
+ LastWeekOfMonth(n=1, weekday=-1)
+
+ with pytest.raises(ValueError, match="^Day"):
+ LastWeekOfMonth(n=1, weekday=7)
+
+ def test_offset(self):
+ # Saturday
+ last_sat = datetime(2013, 8, 31)
+ next_sat = datetime(2013, 9, 28)
+ offset_sat = LastWeekOfMonth(n=1, weekday=5)
+
+ one_day_before = (last_sat + timedelta(days=-1))
+ assert one_day_before + offset_sat == last_sat
+
+ one_day_after = (last_sat + timedelta(days=+1))
+ assert one_day_after + offset_sat == next_sat
+
+        # Test on that day
+ assert last_sat + offset_sat == next_sat
+
+        # Thursday
+        offset_thur = LastWeekOfMonth(n=1, weekday=3)
+ last_thurs = datetime(2013, 1, 31)
+ next_thurs = datetime(2013, 2, 28)
+
+ one_day_before = last_thurs + timedelta(days=-1)
+ assert one_day_before + offset_thur == last_thurs
+
+ one_day_after = last_thurs + timedelta(days=+1)
+ assert one_day_after + offset_thur == next_thurs
+
+ # Test on that day
+ assert last_thurs + offset_thur == next_thurs
+
+ three_before = last_thurs + timedelta(days=-3)
+ assert three_before + offset_thur == last_thurs
+
+ two_after = last_thurs + timedelta(days=+2)
+ assert two_after + offset_thur == next_thurs
+
+ offset_sunday = LastWeekOfMonth(n=1, weekday=WeekDay.SUN)
+ assert datetime(2013, 7, 31) + offset_sunday == datetime(2013, 8, 25)
+
+ on_offset_cases = [
+ (WeekDay.SUN, datetime(2013, 1, 27), True),
+ (WeekDay.SAT, datetime(2013, 3, 30), True),
+ (WeekDay.MON, datetime(2013, 2, 18), False), # Not the last Mon
+ (WeekDay.SUN, datetime(2013, 2, 25), False), # Not a SUN
+ (WeekDay.MON, datetime(2013, 2, 25), True),
+ (WeekDay.SAT, datetime(2013, 11, 30), True),
+
+ (WeekDay.SAT, datetime(2006, 8, 26), True),
+ (WeekDay.SAT, datetime(2007, 8, 25), True),
+ (WeekDay.SAT, datetime(2008, 8, 30), True),
+ (WeekDay.SAT, datetime(2009, 8, 29), True),
+ (WeekDay.SAT, datetime(2010, 8, 28), True),
+ (WeekDay.SAT, datetime(2011, 8, 27), True),
+ (WeekDay.SAT, datetime(2019, 8, 31), True)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ weekday, dt, expected = case
+ offset = LastWeekOfMonth(weekday=weekday)
+ assert offset.onOffset(dt) == expected
+
+
+class TestSemiMonthEnd(Base):
+ _offset = SemiMonthEnd
+ offset1 = _offset()
+ offset2 = _offset(2)
+
+ def test_offset_whole_year(self):
+ dates = (datetime(2007, 12, 31),
+ datetime(2008, 1, 15),
+ datetime(2008, 1, 31),
+ datetime(2008, 2, 15),
+ datetime(2008, 2, 29),
+ datetime(2008, 3, 15),
+ datetime(2008, 3, 31),
+ datetime(2008, 4, 15),
+ datetime(2008, 4, 30),
+ datetime(2008, 5, 15),
+ datetime(2008, 5, 31),
+ datetime(2008, 6, 15),
+ datetime(2008, 6, 30),
+ datetime(2008, 7, 15),
+ datetime(2008, 7, 31),
+ datetime(2008, 8, 15),
+ datetime(2008, 8, 31),
+ datetime(2008, 9, 15),
+ datetime(2008, 9, 30),
+ datetime(2008, 10, 15),
+ datetime(2008, 10, 31),
+ datetime(2008, 11, 15),
+ datetime(2008, 11, 30),
+ datetime(2008, 12, 15),
+ datetime(2008, 12, 31))
+
+ for base, exp_date in zip(dates[:-1], dates[1:]):
+ assert_offset_equal(SemiMonthEnd(), base, exp_date)
+
+ # ensure .apply_index works as expected
+ s = DatetimeIndex(dates[:-1])
+ with tm.assert_produces_warning(None):
+ # GH#22535 check that we don't get a FutureWarning from adding
+ # an integer array to PeriodIndex
+ result = SemiMonthEnd().apply_index(s)
+
+ exp = DatetimeIndex(dates[1:])
+ tm.assert_index_equal(result, exp)
+
+ # ensure generating a range with DatetimeIndex gives same result
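+        # ('SM' is the alias for SemiMonthEnd: the 15th and the last day
+        # of each month, with the default day_of_month=15)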
+ result = date_range(start=dates[0], end=dates[-1], freq='SM')
+ exp = DatetimeIndex(dates)
+ tm.assert_index_equal(result, exp)
+
+ offset_cases = []
+ offset_cases.append((SemiMonthEnd(), {
+ datetime(2008, 1, 1): datetime(2008, 1, 15),
+ datetime(2008, 1, 15): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 2, 15),
+ datetime(2006, 12, 14): datetime(2006, 12, 15),
+ datetime(2006, 12, 29): datetime(2006, 12, 31),
+ datetime(2006, 12, 31): datetime(2007, 1, 15),
+ datetime(2007, 1, 1): datetime(2007, 1, 15),
+ datetime(2006, 12, 1): datetime(2006, 12, 15),
+ datetime(2006, 12, 15): datetime(2006, 12, 31)}))
+
+ offset_cases.append((SemiMonthEnd(day_of_month=20), {
+ datetime(2008, 1, 1): datetime(2008, 1, 20),
+ datetime(2008, 1, 15): datetime(2008, 1, 20),
+ datetime(2008, 1, 21): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 2, 20),
+ datetime(2006, 12, 14): datetime(2006, 12, 20),
+ datetime(2006, 12, 29): datetime(2006, 12, 31),
+ datetime(2006, 12, 31): datetime(2007, 1, 20),
+ datetime(2007, 1, 1): datetime(2007, 1, 20),
+ datetime(2006, 12, 1): datetime(2006, 12, 20),
+ datetime(2006, 12, 15): datetime(2006, 12, 20)}))
+
+ offset_cases.append((SemiMonthEnd(0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 15),
+ datetime(2008, 1, 16): datetime(2008, 1, 31),
+ datetime(2008, 1, 15): datetime(2008, 1, 15),
+ datetime(2008, 1, 31): datetime(2008, 1, 31),
+ datetime(2006, 12, 29): datetime(2006, 12, 31),
+ datetime(2006, 12, 31): datetime(2006, 12, 31),
+ datetime(2007, 1, 1): datetime(2007, 1, 15)}))
+
+ offset_cases.append((SemiMonthEnd(0, day_of_month=16), {
+ datetime(2008, 1, 1): datetime(2008, 1, 16),
+ datetime(2008, 1, 16): datetime(2008, 1, 16),
+ datetime(2008, 1, 15): datetime(2008, 1, 16),
+ datetime(2008, 1, 31): datetime(2008, 1, 31),
+ datetime(2006, 12, 29): datetime(2006, 12, 31),
+ datetime(2006, 12, 31): datetime(2006, 12, 31),
+ datetime(2007, 1, 1): datetime(2007, 1, 16)}))
+
+ offset_cases.append((SemiMonthEnd(2), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 2, 29),
+ datetime(2006, 12, 29): datetime(2007, 1, 15),
+ datetime(2006, 12, 31): datetime(2007, 1, 31),
+ datetime(2007, 1, 1): datetime(2007, 1, 31),
+ datetime(2007, 1, 16): datetime(2007, 2, 15),
+ datetime(2006, 11, 1): datetime(2006, 11, 30)}))
+
+ offset_cases.append((SemiMonthEnd(-1), {
+ datetime(2008, 6, 30): datetime(2008, 6, 15),
+ datetime(2008, 12, 31): datetime(2008, 12, 15),
+ datetime(2006, 12, 29): datetime(2006, 12, 15),
+ datetime(2006, 12, 30): datetime(2006, 12, 15),
+ datetime(2007, 1, 1): datetime(2006, 12, 31)}))
+
+ offset_cases.append((SemiMonthEnd(-1, day_of_month=4), {
+ datetime(2007, 1, 4): datetime(2006, 12, 31),
+ datetime(2008, 6, 30): datetime(2008, 6, 4),
+ datetime(2008, 12, 31): datetime(2008, 12, 4),
+ datetime(2006, 12, 5): datetime(2006, 12, 4),
+ datetime(2006, 12, 30): datetime(2006, 12, 4),
+ datetime(2007, 1, 1): datetime(2006, 12, 31)}))
+
+ offset_cases.append((SemiMonthEnd(-2), {
+ datetime(2008, 6, 30): datetime(2008, 5, 31),
+ datetime(2008, 3, 15): datetime(2008, 2, 15),
+ datetime(2008, 12, 31): datetime(2008, 11, 30),
+ datetime(2006, 12, 29): datetime(2006, 11, 30),
+ datetime(2006, 12, 14): datetime(2006, 11, 15),
+ datetime(2007, 1, 1): datetime(2006, 12, 15)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_apply_index(self, case):
+ offset, cases = case
+ s = DatetimeIndex(cases.keys())
+ with tm.assert_produces_warning(None):
+ # GH#22535 check that we don't get a FutureWarning from adding
+ # an integer array to PeriodIndex
+ result = offset.apply_index(s)
+
+ exp = DatetimeIndex(cases.values())
+ tm.assert_index_equal(result, exp)
+
+ on_offset_cases = [(datetime(2007, 12, 31), True),
+ (datetime(2007, 12, 15), True),
+ (datetime(2007, 12, 14), False),
+ (datetime(2007, 12, 1), False),
+ (datetime(2008, 2, 29), True)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ dt, expected = case
+ assert_onOffset(SemiMonthEnd(), dt, expected)
+
+ @pytest.mark.parametrize('klass', [Series, DatetimeIndex])
+ def test_vectorized_offset_addition(self, klass):
+ s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'),
+ Timestamp('2000-02-15', tz='US/Central')], name='a')
+
+ with tm.assert_produces_warning(None):
+ # GH#22535 check that we don't get a FutureWarning from adding
+ # an integer array to PeriodIndex
+ result = s + SemiMonthEnd()
+ result2 = SemiMonthEnd() + s
+
+ exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'),
+ Timestamp('2000-02-29', tz='US/Central')], name='a')
+ tm.assert_equal(result, exp)
+ tm.assert_equal(result2, exp)
+
+ s = klass([Timestamp('2000-01-01 00:15:00', tz='US/Central'),
+ Timestamp('2000-02-01', tz='US/Central')], name='a')
+
+ with tm.assert_produces_warning(None):
+ # GH#22535 check that we don't get a FutureWarning from adding
+ # an integer array to PeriodIndex
+ result = s + SemiMonthEnd()
+ result2 = SemiMonthEnd() + s
+
+ exp = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'),
+ Timestamp('2000-02-15', tz='US/Central')], name='a')
+ tm.assert_equal(result, exp)
+ tm.assert_equal(result2, exp)
+
+
+class TestSemiMonthBegin(Base):
+ _offset = SemiMonthBegin
+ offset1 = _offset()
+ offset2 = _offset(2)
+
+ def test_offset_whole_year(self):
+ dates = (datetime(2007, 12, 15),
+ datetime(2008, 1, 1),
+ datetime(2008, 1, 15),
+ datetime(2008, 2, 1),
+ datetime(2008, 2, 15),
+ datetime(2008, 3, 1),
+ datetime(2008, 3, 15),
+ datetime(2008, 4, 1),
+ datetime(2008, 4, 15),
+ datetime(2008, 5, 1),
+ datetime(2008, 5, 15),
+ datetime(2008, 6, 1),
+ datetime(2008, 6, 15),
+ datetime(2008, 7, 1),
+ datetime(2008, 7, 15),
+ datetime(2008, 8, 1),
+ datetime(2008, 8, 15),
+ datetime(2008, 9, 1),
+ datetime(2008, 9, 15),
+ datetime(2008, 10, 1),
+ datetime(2008, 10, 15),
+ datetime(2008, 11, 1),
+ datetime(2008, 11, 15),
+ datetime(2008, 12, 1),
+ datetime(2008, 12, 15))
+
+ for base, exp_date in zip(dates[:-1], dates[1:]):
+ assert_offset_equal(SemiMonthBegin(), base, exp_date)
+
+ # ensure .apply_index works as expected
+ s = DatetimeIndex(dates[:-1])
+ with tm.assert_produces_warning(None):
+ # GH#22535 check that we don't get a FutureWarning from adding
+ # an integer array to PeriodIndex
+ result = SemiMonthBegin().apply_index(s)
+
+ exp = DatetimeIndex(dates[1:])
+ tm.assert_index_equal(result, exp)
+
+ # ensure generating a range with DatetimeIndex gives same result
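+        # ('SMS' is the alias for SemiMonthBegin: the 1st and 15th of
+        # each month, with the default day_of_month=15)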
+ result = date_range(start=dates[0], end=dates[-1], freq='SMS')
+ exp = DatetimeIndex(dates)
+ tm.assert_index_equal(result, exp)
+
+ offset_cases = []
+ offset_cases.append((SemiMonthBegin(), {
+ datetime(2008, 1, 1): datetime(2008, 1, 15),
+ datetime(2008, 1, 15): datetime(2008, 2, 1),
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2006, 12, 14): datetime(2006, 12, 15),
+ datetime(2006, 12, 29): datetime(2007, 1, 1),
+ datetime(2006, 12, 31): datetime(2007, 1, 1),
+ datetime(2007, 1, 1): datetime(2007, 1, 15),
+ datetime(2006, 12, 1): datetime(2006, 12, 15),
+ datetime(2006, 12, 15): datetime(2007, 1, 1)}))
+
+ offset_cases.append((SemiMonthBegin(day_of_month=20), {
+ datetime(2008, 1, 1): datetime(2008, 1, 20),
+ datetime(2008, 1, 15): datetime(2008, 1, 20),
+ datetime(2008, 1, 21): datetime(2008, 2, 1),
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2006, 12, 14): datetime(2006, 12, 20),
+ datetime(2006, 12, 29): datetime(2007, 1, 1),
+ datetime(2006, 12, 31): datetime(2007, 1, 1),
+ datetime(2007, 1, 1): datetime(2007, 1, 20),
+ datetime(2006, 12, 1): datetime(2006, 12, 20),
+ datetime(2006, 12, 15): datetime(2006, 12, 20)}))
+
+ offset_cases.append((SemiMonthBegin(0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2008, 1, 16): datetime(2008, 2, 1),
+ datetime(2008, 1, 15): datetime(2008, 1, 15),
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2006, 12, 29): datetime(2007, 1, 1),
+ datetime(2006, 12, 2): datetime(2006, 12, 15),
+ datetime(2007, 1, 1): datetime(2007, 1, 1)}))
+
+ offset_cases.append((SemiMonthBegin(0, day_of_month=16), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2008, 1, 16): datetime(2008, 1, 16),
+ datetime(2008, 1, 15): datetime(2008, 1, 16),
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2006, 12, 29): datetime(2007, 1, 1),
+ datetime(2006, 12, 31): datetime(2007, 1, 1),
+ datetime(2007, 1, 5): datetime(2007, 1, 16),
+ datetime(2007, 1, 1): datetime(2007, 1, 1)}))
+
+ offset_cases.append((SemiMonthBegin(2), {
+ datetime(2008, 1, 1): datetime(2008, 2, 1),
+ datetime(2008, 1, 31): datetime(2008, 2, 15),
+ datetime(2006, 12, 1): datetime(2007, 1, 1),
+ datetime(2006, 12, 29): datetime(2007, 1, 15),
+ datetime(2006, 12, 15): datetime(2007, 1, 15),
+ datetime(2007, 1, 1): datetime(2007, 2, 1),
+ datetime(2007, 1, 16): datetime(2007, 2, 15),
+ datetime(2006, 11, 1): datetime(2006, 12, 1)}))
+
+ offset_cases.append((SemiMonthBegin(-1), {
+ datetime(2008, 6, 30): datetime(2008, 6, 15),
+ datetime(2008, 6, 14): datetime(2008, 6, 1),
+ datetime(2008, 12, 31): datetime(2008, 12, 15),
+ datetime(2006, 12, 29): datetime(2006, 12, 15),
+ datetime(2006, 12, 15): datetime(2006, 12, 1),
+ datetime(2007, 1, 1): datetime(2006, 12, 15)}))
+
+ offset_cases.append((SemiMonthBegin(-1, day_of_month=4), {
+ datetime(2007, 1, 4): datetime(2007, 1, 1),
+ datetime(2008, 6, 30): datetime(2008, 6, 4),
+ datetime(2008, 12, 31): datetime(2008, 12, 4),
+ datetime(2006, 12, 5): datetime(2006, 12, 4),
+ datetime(2006, 12, 30): datetime(2006, 12, 4),
+ datetime(2006, 12, 2): datetime(2006, 12, 1),
+ datetime(2007, 1, 1): datetime(2006, 12, 4)}))
+
+ offset_cases.append((SemiMonthBegin(-2), {
+ datetime(2008, 6, 30): datetime(2008, 6, 1),
+ datetime(2008, 6, 14): datetime(2008, 5, 15),
+ datetime(2008, 12, 31): datetime(2008, 12, 1),
+ datetime(2006, 12, 29): datetime(2006, 12, 1),
+ datetime(2006, 12, 15): datetime(2006, 11, 15),
+ datetime(2007, 1, 1): datetime(2006, 12, 1)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_apply_index(self, case):
+ offset, cases = case
+ s = DatetimeIndex(cases.keys())
+
+ with tm.assert_produces_warning(None):
+ # GH#22535 check that we don't get a FutureWarning from adding
+ # an integer array to PeriodIndex
+ result = offset.apply_index(s)
+
+ exp = DatetimeIndex(cases.values())
+ tm.assert_index_equal(result, exp)
+
+ on_offset_cases = [(datetime(2007, 12, 1), True),
+ (datetime(2007, 12, 15), True),
+ (datetime(2007, 12, 14), False),
+ (datetime(2007, 12, 31), False),
+ (datetime(2008, 2, 15), True)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ dt, expected = case
+ assert_onOffset(SemiMonthBegin(), dt, expected)
+
+ @pytest.mark.parametrize('klass', [Series, DatetimeIndex])
+ def test_vectorized_offset_addition(self, klass):
+ s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'),
+ Timestamp('2000-02-15', tz='US/Central')], name='a')
+ with tm.assert_produces_warning(None):
+ # GH#22535 check that we don't get a FutureWarning from adding
+ # an integer array to PeriodIndex
+ result = s + SemiMonthBegin()
+ result2 = SemiMonthBegin() + s
+
+ exp = klass([Timestamp('2000-02-01 00:15:00', tz='US/Central'),
+ Timestamp('2000-03-01', tz='US/Central')], name='a')
+ tm.assert_equal(result, exp)
+ tm.assert_equal(result2, exp)
+
+ s = klass([Timestamp('2000-01-01 00:15:00', tz='US/Central'),
+ Timestamp('2000-02-01', tz='US/Central')], name='a')
+ with tm.assert_produces_warning(None):
+ # GH#22535 check that we don't get a FutureWarning from adding
+ # an integer array to PeriodIndex
+ result = s + SemiMonthBegin()
+ result2 = SemiMonthBegin() + s
+
+ exp = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'),
+ Timestamp('2000-02-15', tz='US/Central')], name='a')
+ tm.assert_equal(result, exp)
+ tm.assert_equal(result2, exp)
+
+
+def test_Easter():
+ assert_offset_equal(Easter(), datetime(2010, 1, 1), datetime(2010, 4, 4))
+ assert_offset_equal(Easter(), datetime(2010, 4, 5), datetime(2011, 4, 24))
+ assert_offset_equal(Easter(2), datetime(2010, 1, 1), datetime(2011, 4, 24))
+
+ assert_offset_equal(Easter(), datetime(2010, 4, 4), datetime(2011, 4, 24))
+ assert_offset_equal(Easter(2), datetime(2010, 4, 4), datetime(2012, 4, 8))
+
+ assert_offset_equal(-Easter(), datetime(2011, 1, 1), datetime(2010, 4, 4))
+ assert_offset_equal(-Easter(), datetime(2010, 4, 5), datetime(2010, 4, 4))
+ assert_offset_equal(-Easter(2),
+ datetime(2011, 1, 1),
+ datetime(2009, 4, 12))
+
+ assert_offset_equal(-Easter(), datetime(2010, 4, 4), datetime(2009, 4, 12))
+ assert_offset_equal(-Easter(2),
+ datetime(2010, 4, 4),
+ datetime(2008, 3, 23))
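+
+    # The anchor dates above come from the Western (Gregorian) Easter
+    # computus as implemented in dateutil, which the Easter offset uses
+    # to step to the next or previous Easter Sunday.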
+
+
+class TestOffsetNames(object):
+
+ def test_get_offset_name(self):
+ assert BDay().freqstr == 'B'
+ assert BDay(2).freqstr == '2B'
+ assert BMonthEnd().freqstr == 'BM'
+ assert Week(weekday=0).freqstr == 'W-MON'
+ assert Week(weekday=1).freqstr == 'W-TUE'
+ assert Week(weekday=2).freqstr == 'W-WED'
+ assert Week(weekday=3).freqstr == 'W-THU'
+ assert Week(weekday=4).freqstr == 'W-FRI'
+
+ assert LastWeekOfMonth(weekday=WeekDay.SUN).freqstr == "LWOM-SUN"
+
+
+def test_get_offset():
+ with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG):
+ get_offset('gibberish')
+ with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG):
+ get_offset('QS-JAN-B')
+
+ pairs = [
+ ('B', BDay()), ('b', BDay()), ('bm', BMonthEnd()),
+ ('Bm', BMonthEnd()), ('W-MON', Week(weekday=0)),
+ ('W-TUE', Week(weekday=1)), ('W-WED', Week(weekday=2)),
+ ('W-THU', Week(weekday=3)), ('W-FRI', Week(weekday=4))]
+
+ for name, expected in pairs:
+ offset = get_offset(name)
+ assert offset == expected, ("Expected %r to yield %r (actual: %r)" %
+ (name, expected, offset))
+
+
+def test_get_offset_legacy():
+ pairs = [('w@Sat', Week(weekday=5))]
+ for name, expected in pairs:
+ with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG):
+ get_offset(name)
+
+
+class TestOffsetAliases(object):
+
+ def setup_method(self, method):
+ _offset_map.clear()
+
+ def test_alias_equality(self):
+ for k, v in compat.iteritems(_offset_map):
+ if v is None:
+ continue
+ assert k == v.copy()
+
+ def test_rule_code(self):
+ lst = ['M', 'MS', 'BM', 'BMS', 'D', 'B', 'H', 'T', 'S', 'L', 'U']
+ for k in lst:
+ assert k == get_offset(k).rule_code
+ # should be cached - this is kind of an internals test...
+ assert k in _offset_map
+ assert k == (get_offset(k) * 3).rule_code
+
+ suffix_lst = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']
+ base = 'W'
+ for v in suffix_lst:
+ alias = '-'.join([base, v])
+ assert alias == get_offset(alias).rule_code
+ assert alias == (get_offset(alias) * 5).rule_code
+
+ suffix_lst = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG',
+ 'SEP', 'OCT', 'NOV', 'DEC']
+ base_lst = ['A', 'AS', 'BA', 'BAS', 'Q', 'QS', 'BQ', 'BQS']
+ for base in base_lst:
+ for v in suffix_lst:
+ alias = '-'.join([base, v])
+ assert alias == get_offset(alias).rule_code
+ assert alias == (get_offset(alias) * 5).rule_code
+
+ lst = ['M', 'D', 'B', 'H', 'T', 'S', 'L', 'U']
+ for k in lst:
+ code, stride = get_freq_code('3' + k)
+ assert isinstance(code, int)
+ assert stride == 3
+ assert k == get_freq_str(code)
+
+
+def test_dateoffset_misc():
+ oset = offsets.DateOffset(months=2, days=4)
+ # it works
+ oset.freqstr
+
+ assert (not offsets.DateOffset(months=2) == 2)
+
+
+def test_freq_offsets():
+ off = BDay(1, offset=timedelta(0, 1800))
+ assert (off.freqstr == 'B+30Min')
+
+ off = BDay(1, offset=timedelta(0, -1800))
+ assert (off.freqstr == 'B-30Min')
+
+
+class TestReprNames(object):
+
+ def test_str_for_named_is_name(self):
+ # look at all the amazing combinations!
+ month_prefixes = ['A', 'AS', 'BA', 'BAS', 'Q', 'BQ', 'BQS', 'QS']
+ names = [prefix + '-' + month
+ for prefix in month_prefixes
+ for month in ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
+ 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']]
+ days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']
+ names += ['W-' + day for day in days]
+ names += ['WOM-' + week + day
+ for week in ('1', '2', '3', '4') for day in days]
+ _offset_map.clear()
+ for name in names:
+ offset = get_offset(name)
+ assert offset.freqstr == name
+
+
+def get_utc_offset_hours(ts):
+ # take a Timestamp and compute total hours of utc offset
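+    # (e.g. a US/Eastern timestamp during standard time returns -5.0)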
+ o = ts.utcoffset()
+ return (o.days * 24 * 3600 + o.seconds) / 3600.0
+
+
+class TestDST(object):
+ """
+ test DateOffset additions over Daylight Savings Time
+ """
+ # one microsecond before the DST transition
+ ts_pre_fallback = "2013-11-03 01:59:59.999999"
+ ts_pre_springfwd = "2013-03-10 01:59:59.999999"
+
+ # test both basic names and dateutil timezones
+ timezone_utc_offsets = {
+ 'US/Eastern': dict(utc_offset_daylight=-4,
+ utc_offset_standard=-5, ),
+ 'dateutil/US/Pacific': dict(utc_offset_daylight=-7,
+ utc_offset_standard=-8, )
+ }
+ valid_date_offsets_singular = [
+ 'weekday', 'day', 'hour', 'minute', 'second', 'microsecond'
+ ]
+ valid_date_offsets_plural = [
+ 'weeks', 'days',
+ 'hours', 'minutes', 'seconds',
+ 'milliseconds', 'microseconds'
+ ]
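+    # singular kwargs like 'hour' *replace* that field of the result,
+    # while plural kwargs like 'hours' *add* to it (relativedelta
+    # semantics); _test_offset below checks each behaviour accordingly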
+
+ def _test_all_offsets(self, n, **kwds):
+        valid_offsets = (self.valid_date_offsets_plural if n > 1
+                         else self.valid_date_offsets_singular)
+
+ for name in valid_offsets:
+ self._test_offset(offset_name=name, offset_n=n, **kwds)
+
+ def _test_offset(self, offset_name, offset_n, tstart, expected_utc_offset):
+ offset = DateOffset(**{offset_name: offset_n})
+
+ t = tstart + offset
+ if expected_utc_offset is not None:
+ assert get_utc_offset_hours(t) == expected_utc_offset
+
+ if offset_name == 'weeks':
+ # dates should match
+ assert t.date() == timedelta(days=7 * offset.kwds[
+ 'weeks']) + tstart.date()
+ # expect the same day of week, hour of day, minute, second, ...
+ assert (t.dayofweek == tstart.dayofweek and
+ t.hour == tstart.hour and
+ t.minute == tstart.minute and
+ t.second == tstart.second)
+ elif offset_name == 'days':
+ # dates should match
+ assert timedelta(offset.kwds['days']) + tstart.date() == t.date()
+ # expect the same hour of day, minute, second, ...
+ assert (t.hour == tstart.hour and
+ t.minute == tstart.minute and
+ t.second == tstart.second)
+ elif offset_name in self.valid_date_offsets_singular:
+ # expect the singular offset value to match between tstart and t
+ datepart_offset = getattr(t, offset_name
+ if offset_name != 'weekday' else
+ 'dayofweek')
+ assert datepart_offset == offset.kwds[offset_name]
+ else:
+ # the offset should be the same as if it was done in UTC
+ assert (t == (tstart.tz_convert('UTC') + offset)
+ .tz_convert('US/Pacific'))
+
+ def _make_timestamp(self, string, hrs_offset, tz):
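+        # build e.g. '2013-11-03 01:59:59.999999-0400': parsing at a fixed
+        # UTC offset and then converting to tz pins which side of the DST
+        # transition the start time falls on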
+ if hrs_offset >= 0:
+ offset_string = '{hrs:02d}00'.format(hrs=hrs_offset)
+ else:
+ offset_string = '-{hrs:02d}00'.format(hrs=-1 * hrs_offset)
+ return Timestamp(string + offset_string).tz_convert(tz)
+
+ def test_fallback_plural(self):
+ # test moving from daylight savings to standard time
+ import dateutil
+ for tz, utc_offsets in self.timezone_utc_offsets.items():
+ hrs_pre = utc_offsets['utc_offset_daylight']
+ hrs_post = utc_offsets['utc_offset_standard']
+
+ if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'):
+                # dateutil 2.6.0 itself has buggy ambiguous-time behavior,
+                # so the assertion is only exercised on older versions
+ # GH 14621
+ # https://github.com/dateutil/dateutil/issues/321
+ self._test_all_offsets(
+ n=3, tstart=self._make_timestamp(self.ts_pre_fallback,
+ hrs_pre, tz),
+ expected_utc_offset=hrs_post)
+ elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'):
+                # fixed in later versions, but the test is still skipped
+ continue
+
+ def test_springforward_plural(self):
+ # test moving from standard to daylight savings
+ for tz, utc_offsets in self.timezone_utc_offsets.items():
+ hrs_pre = utc_offsets['utc_offset_standard']
+ hrs_post = utc_offsets['utc_offset_daylight']
+ self._test_all_offsets(
+ n=3, tstart=self._make_timestamp(self.ts_pre_springfwd,
+ hrs_pre, tz),
+ expected_utc_offset=hrs_post)
+
+ def test_fallback_singular(self):
+        # for singular offsets we don't necessarily know which utc offset
+        # the new Timestamp will wind up in (adding 1 month may land in a
+        # different utc offset than adding 1 second), so we don't specify
+        # an expected_utc_offset
+ for tz, utc_offsets in self.timezone_utc_offsets.items():
+ hrs_pre = utc_offsets['utc_offset_standard']
+ self._test_all_offsets(n=1, tstart=self._make_timestamp(
+ self.ts_pre_fallback, hrs_pre, tz), expected_utc_offset=None)
+
+ def test_springforward_singular(self):
+ for tz, utc_offsets in self.timezone_utc_offsets.items():
+ hrs_pre = utc_offsets['utc_offset_standard']
+ self._test_all_offsets(n=1, tstart=self._make_timestamp(
+ self.ts_pre_springfwd, hrs_pre, tz), expected_utc_offset=None)
+
+ offset_classes = {MonthBegin: ['11/2/2012', '12/1/2012'],
+ MonthEnd: ['11/2/2012', '11/30/2012'],
+ BMonthBegin: ['11/2/2012', '12/3/2012'],
+ BMonthEnd: ['11/2/2012', '11/30/2012'],
+ CBMonthBegin: ['11/2/2012', '12/3/2012'],
+ CBMonthEnd: ['11/2/2012', '11/30/2012'],
+ SemiMonthBegin: ['11/2/2012', '11/15/2012'],
+ SemiMonthEnd: ['11/2/2012', '11/15/2012'],
+ Week: ['11/2/2012', '11/9/2012'],
+ YearBegin: ['11/2/2012', '1/1/2013'],
+ YearEnd: ['11/2/2012', '12/31/2012'],
+ BYearBegin: ['11/2/2012', '1/1/2013'],
+ BYearEnd: ['11/2/2012', '12/31/2012'],
+ QuarterBegin: ['11/2/2012', '12/1/2012'],
+ QuarterEnd: ['11/2/2012', '12/31/2012'],
+ BQuarterBegin: ['11/2/2012', '12/3/2012'],
+ BQuarterEnd: ['11/2/2012', '12/31/2012'],
+ Day: ['11/4/2012', '11/4/2012 23:00']}.items()
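+
+    # 11/4/2012 was the US DST fall-back date. Note the Day case: Day()
+    # is a fixed 24-hour increment rather than a calendar day, and that
+    # wall-clock day is 25 hours long in US/Eastern, so midnight + Day()
+    # lands on 23:00 of the same date.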
+
+ @pytest.mark.parametrize('tup', offset_classes)
+ def test_all_offset_classes(self, tup):
+ offset, test_values = tup
+
+ first = Timestamp(test_values[0], tz='US/Eastern') + offset()
+ second = Timestamp(test_values[1], tz='US/Eastern')
+ assert first == second
+
+
+# ---------------------------------------------------------------------
+def test_get_offset_day_error():
+    # subclasses of _BaseOffset must override the _day_opt attribute;
+    # otherwise we should get a NotImplementedError
+
+ with pytest.raises(NotImplementedError):
+ DateOffset()._get_offset_day(datetime.now())
+
+
+def test_valid_default_arguments(offset_types):
+    # GH#19142 check that calling the constructors without passing
+    # any keyword arguments produces valid offsets
+ cls = offset_types
+ cls()
+
+
[email protected]('kwd', sorted(list(liboffsets.relativedelta_kwds)))
+def test_valid_month_attributes(kwd, month_classes):
+ # GH#18226
+ cls = month_classes
+ # check that we cannot create e.g. MonthEnd(weeks=3)
+ with pytest.raises(TypeError):
+ cls(**{kwd: 3})
+
+
[email protected]('kwd', sorted(list(liboffsets.relativedelta_kwds)))
+def test_valid_relativedelta_kwargs(kwd):
+ # Check that all the arguments specified in liboffsets.relativedelta_kwds
+ # are in fact valid relativedelta keyword args
+ DateOffset(**{kwd: 1})
+
+
[email protected]('kwd', sorted(list(liboffsets.relativedelta_kwds)))
+def test_valid_tick_attributes(kwd, tick_classes):
+ # GH#18226
+ cls = tick_classes
+ # check that we cannot create e.g. Hour(weeks=3)
+ with pytest.raises(TypeError):
+ cls(**{kwd: 3})
+
+
+def test_validate_n_error():
+ with pytest.raises(TypeError):
+ DateOffset(n='Doh!')
+
+ with pytest.raises(TypeError):
+ MonthBegin(n=timedelta(1))
+
+ with pytest.raises(TypeError):
+ BDay(n=np.array([1, 2], dtype=np.int64))
+
+
+def test_require_integers(offset_types):
+ cls = offset_types
+ with pytest.raises(ValueError):
+ cls(n=1.5)
+
+
+def test_tick_normalize_raises(tick_classes):
+ # check that trying to create a Tick object with normalize=True raises
+ # GH#21427
+ cls = tick_classes
+ with pytest.raises(ValueError):
+ cls(n=3, normalize=True)
+
+
+def test_weeks_onoffset():
+ # GH#18510 Week with weekday = None, normalize = False should always
+ # be onOffset
+ offset = Week(n=2, weekday=None)
+ ts = Timestamp('1862-01-13 09:03:34.873477378+0210', tz='Africa/Lusaka')
+ fast = offset.onOffset(ts)
+ slow = (ts + offset) - offset == ts
+ assert fast == slow
+
+ # negative n
+    offset = Week(n=-2, weekday=None)
+ ts = Timestamp('1856-10-24 16:18:36.556360110-0717', tz='Pacific/Easter')
+ fast = offset.onOffset(ts)
+ slow = (ts + offset) - offset == ts
+ assert fast == slow
+
+
+def test_weekofmonth_onoffset():
+ # GH#18864
+ # Make sure that nanoseconds don't trip up onOffset (and with it apply)
+ offset = WeekOfMonth(n=2, week=2, weekday=0)
+ ts = Timestamp('1916-05-15 01:14:49.583410462+0422', tz='Asia/Qyzylorda')
+ fast = offset.onOffset(ts)
+ slow = (ts + offset) - offset == ts
+ assert fast == slow
+
+ # negative n
+ offset = WeekOfMonth(n=-3, week=1, weekday=0)
+ ts = Timestamp('1980-12-08 03:38:52.878321185+0500', tz='Asia/Oral')
+ fast = offset.onOffset(ts)
+ slow = (ts + offset) - offset == ts
+ assert fast == slow
+
+
+def test_last_week_of_month_on_offset():
+ # GH#19036, GH#18977 _adjust_dst was incorrect for LastWeekOfMonth
+ offset = LastWeekOfMonth(n=4, weekday=6)
+ ts = Timestamp('1917-05-27 20:55:27.084284178+0200',
+ tz='Europe/Warsaw')
+ slow = (ts + offset) - offset == ts
+ fast = offset.onOffset(ts)
+ assert fast == slow
+
+ # negative n
+ offset = LastWeekOfMonth(n=-4, weekday=5)
+ ts = Timestamp('2005-08-27 05:01:42.799392561-0500',
+ tz='America/Rainy_River')
+ slow = (ts + offset) - offset == ts
+ fast = offset.onOffset(ts)
+ assert fast == slow
diff --git a/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_offsets_properties.py b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_offsets_properties.py
new file mode 100644
index 00000000000..cd5f2a2a25e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_offsets_properties.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+"""
+Behavioral based tests for offsets and date_range.
+
+This file is adapted from https://github.com/pandas-dev/pandas/pull/18761 -
+which was more ambitious but less idiomatic in its use of Hypothesis.
+
+You may wish to consult the previous version for inspiration on further
+tests, or when trying to pin down the bugs exposed by the tests below.
+"""
+import warnings
+
+from hypothesis import assume, given, strategies as st
+from hypothesis.extra.dateutil import timezones as dateutil_timezones
+from hypothesis.extra.pytz import timezones as pytz_timezones
+import pytest
+
+import pandas as pd
+
+from pandas.tseries.offsets import (
+ BMonthBegin, BMonthEnd, BQuarterBegin, BQuarterEnd, BYearBegin, BYearEnd,
+ MonthBegin, MonthEnd, QuarterBegin, QuarterEnd, YearBegin, YearEnd)
+
+# ----------------------------------------------------------------
+# Helpers for generating random data
+
+with warnings.catch_warnings():
+ warnings.simplefilter('ignore')
+    min_dt = pd.Timestamp(1900, 1, 1).to_pydatetime()
+    max_dt = pd.Timestamp(2100, 1, 1).to_pydatetime()
+
+gen_date_range = st.builds(
+ pd.date_range,
+ start=st.datetimes(
+ # TODO: Choose the min/max values more systematically
+ min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(),
+ max_value=pd.Timestamp(2100, 1, 1).to_pydatetime()
+ ),
+ periods=st.integers(min_value=2, max_value=100),
+ freq=st.sampled_from('Y Q M D H T s ms us ns'.split()),
+ tz=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()),
+)
+
+gen_random_datetime = st.datetimes(
+ min_value=min_dt,
+ max_value=max_dt,
+ timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones())
+)
+
+# The strategy for each type is registered in conftest.py, as they don't carry
+# enough runtime information (e.g. type hints) to infer how to build them.
+gen_yqm_offset = st.one_of(*map(st.from_type, [
+ MonthBegin, MonthEnd, BMonthBegin, BMonthEnd,
+ QuarterBegin, QuarterEnd, BQuarterBegin, BQuarterEnd,
+ YearBegin, YearEnd, BYearBegin, BYearEnd
+]))
+
+
+# ----------------------------------------------------------------
+# Offset-specific behaviour tests
+
+
+# Based on CI runs: Always passes on OSX, fails on Linux, sometimes on Windows
[email protected](strict=False, reason='inconsistent between OSs, Pythons')
+@given(gen_random_datetime, gen_yqm_offset)
+def test_on_offset_implementations(dt, offset):
+ assume(not offset.normalize)
+ # check that the class-specific implementations of onOffset match
+ # the general case definition:
+ # (dt + offset) - offset == dt
+ compare = (dt + offset) - offset
+ assert offset.onOffset(dt) == (compare == dt)
+
+
+@given(gen_yqm_offset, gen_date_range)
+def test_apply_index_implementations(offset, rng):
+ # offset.apply_index(dti)[i] should match dti[i] + offset
+ assume(offset.n != 0) # TODO: test for that case separately
+
+ # rng = pd.date_range(start='1/1/2000', periods=100000, freq='T')
+ ser = pd.Series(rng)
+
+ res = rng + offset
+ res_v2 = offset.apply_index(rng)
+ assert (res == res_v2).all()
+
+ assert res[0] == rng[0] + offset
+ assert res[-1] == rng[-1] + offset
+ res2 = ser + offset
+ # apply_index is only for indexes, not series, so no res2_v2
+ assert res2.iloc[0] == ser.iloc[0] + offset
+ assert res2.iloc[-1] == ser.iloc[-1] + offset
+ # TODO: Check randomly assorted entries, not just first/last
+
+
+@given(gen_yqm_offset)
+def test_shift_across_dst(offset):
+ # GH#18319 check that 1) timezone is correctly normalized and
+ # 2) that hour is not incorrectly changed by this normalization
+ # Note that dti includes a transition across DST boundary
+ dti = pd.date_range(start='2017-10-30 12:00:00', end='2017-11-06',
+ freq='D', tz='US/Eastern')
+ assert (dti.hour == 12).all() # we haven't screwed up yet
+
+ res = dti + offset
+ assert (res.hour == 12).all()
diff --git a/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_ticks.py b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_ticks.py
new file mode 100644
index 00000000000..f4b012ec189
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_ticks.py
@@ -0,0 +1,320 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for offsets.Tick and subclasses
+"""
+from __future__ import division
+
+from datetime import datetime, timedelta
+
+from hypothesis import assume, example, given, settings, strategies as st
+import numpy as np
+import pytest
+
+from pandas import Timedelta, Timestamp
+
+from pandas.tseries import offsets
+from pandas.tseries.offsets import Hour, Micro, Milli, Minute, Nano, Second
+
+from .common import assert_offset_equal
+
+# ---------------------------------------------------------------------
+# Test Helpers
+
+tick_classes = [Hour, Minute, Second, Milli, Micro, Nano]
+
+
+# ---------------------------------------------------------------------
+
+
+def test_apply_ticks():
+ result = offsets.Hour(3).apply(offsets.Hour(4))
+ exp = offsets.Hour(7)
+ assert (result == exp)
+
+
+def test_delta_to_tick():
+ delta = timedelta(3)
+
+ tick = offsets._delta_to_tick(delta)
+ assert (tick == offsets.Day(3))
+
+ td = Timedelta(nanoseconds=5)
+ tick = offsets._delta_to_tick(td)
+ assert tick == Nano(5)
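+
+    # in both cases _delta_to_tick picks the coarsest Tick subclass that
+    # represents the delta exactly: whole days become Day, a 5ns
+    # Timedelta becomes Nano(5)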
+
+
[email protected]('cls', tick_classes)
+@settings(deadline=None) # GH 24641
+@example(n=2, m=3)
+@example(n=800, m=300)
+@example(n=1000, m=5)
+@given(n=st.integers(-999, 999), m=st.integers(-999, 999))
+def test_tick_add_sub(cls, n, m):
+ # For all Tick subclasses and all integers n, m, we should have
+ # tick(n) + tick(m) == tick(n+m)
+ # tick(n) - tick(m) == tick(n-m)
+ left = cls(n)
+ right = cls(m)
+ expected = cls(n + m)
+
+ assert left + right == expected
+ assert left.apply(right) == expected
+
+ expected = cls(n - m)
+ assert left - right == expected
+
+
[email protected]('cls', tick_classes)
+@settings(deadline=None)
+@example(n=2, m=3)
+@given(n=st.integers(-999, 999), m=st.integers(-999, 999))
+def test_tick_equality(cls, n, m):
+ assume(m != n)
+ # tick == tock iff tick.n == tock.n
+ left = cls(n)
+ right = cls(m)
+ assert left != right
+ assert not (left == right)
+
+ right = cls(n)
+ assert left == right
+ assert not (left != right)
+
+ if n != 0:
+ assert cls(n) != cls(-n)
+
+
+# ---------------------------------------------------------------------
+
+
+def test_Hour():
+ assert_offset_equal(Hour(),
+ datetime(2010, 1, 1), datetime(2010, 1, 1, 1))
+ assert_offset_equal(Hour(-1),
+ datetime(2010, 1, 1, 1), datetime(2010, 1, 1))
+ assert_offset_equal(2 * Hour(),
+ datetime(2010, 1, 1), datetime(2010, 1, 1, 2))
+ assert_offset_equal(-1 * Hour(),
+ datetime(2010, 1, 1, 1), datetime(2010, 1, 1))
+
+ assert Hour(3) + Hour(2) == Hour(5)
+ assert Hour(3) - Hour(2) == Hour()
+
+ assert Hour(4) != Hour(1)
+
+
+def test_Minute():
+ assert_offset_equal(Minute(),
+ datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 1))
+ assert_offset_equal(Minute(-1),
+ datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1))
+ assert_offset_equal(2 * Minute(),
+ datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 2))
+ assert_offset_equal(-1 * Minute(),
+ datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1))
+
+ assert Minute(3) + Minute(2) == Minute(5)
+ assert Minute(3) - Minute(2) == Minute()
+ assert Minute(5) != Minute()
+
+
+def test_Second():
+ assert_offset_equal(Second(),
+ datetime(2010, 1, 1),
+ datetime(2010, 1, 1, 0, 0, 1))
+ assert_offset_equal(Second(-1),
+ datetime(2010, 1, 1, 0, 0, 1),
+ datetime(2010, 1, 1))
+ assert_offset_equal(2 * Second(),
+ datetime(2010, 1, 1),
+ datetime(2010, 1, 1, 0, 0, 2))
+ assert_offset_equal(-1 * Second(),
+ datetime(2010, 1, 1, 0, 0, 1),
+ datetime(2010, 1, 1))
+
+ assert Second(3) + Second(2) == Second(5)
+ assert Second(3) - Second(2) == Second()
+
+
+def test_Millisecond():
+ assert_offset_equal(Milli(),
+ datetime(2010, 1, 1),
+ datetime(2010, 1, 1, 0, 0, 0, 1000))
+ assert_offset_equal(Milli(-1),
+ datetime(2010, 1, 1, 0, 0, 0, 1000),
+ datetime(2010, 1, 1))
+ assert_offset_equal(Milli(2),
+ datetime(2010, 1, 1),
+ datetime(2010, 1, 1, 0, 0, 0, 2000))
+ assert_offset_equal(2 * Milli(),
+ datetime(2010, 1, 1),
+ datetime(2010, 1, 1, 0, 0, 0, 2000))
+ assert_offset_equal(-1 * Milli(),
+ datetime(2010, 1, 1, 0, 0, 0, 1000),
+ datetime(2010, 1, 1))
+
+ assert Milli(3) + Milli(2) == Milli(5)
+ assert Milli(3) - Milli(2) == Milli()
+
+
+def test_MillisecondTimestampArithmetic():
+ assert_offset_equal(Milli(),
+ Timestamp('2010-01-01'),
+ Timestamp('2010-01-01 00:00:00.001'))
+ assert_offset_equal(Milli(-1),
+ Timestamp('2010-01-01 00:00:00.001'),
+ Timestamp('2010-01-01'))
+
+
+def test_Microsecond():
+ assert_offset_equal(Micro(),
+ datetime(2010, 1, 1),
+ datetime(2010, 1, 1, 0, 0, 0, 1))
+ assert_offset_equal(Micro(-1),
+ datetime(2010, 1, 1, 0, 0, 0, 1),
+ datetime(2010, 1, 1))
+
+ assert_offset_equal(2 * Micro(),
+ datetime(2010, 1, 1),
+ datetime(2010, 1, 1, 0, 0, 0, 2))
+ assert_offset_equal(-1 * Micro(),
+ datetime(2010, 1, 1, 0, 0, 0, 1),
+ datetime(2010, 1, 1))
+
+ assert Micro(3) + Micro(2) == Micro(5)
+ assert Micro(3) - Micro(2) == Micro()
+
+
+def test_NanosecondGeneric():
+ timestamp = Timestamp(datetime(2010, 1, 1))
+ assert timestamp.nanosecond == 0
+
+ result = timestamp + Nano(10)
+ assert result.nanosecond == 10
+
+ reverse_result = Nano(10) + timestamp
+ assert reverse_result.nanosecond == 10
+
+
+def test_Nanosecond():
+ timestamp = Timestamp(datetime(2010, 1, 1))
+ assert_offset_equal(Nano(),
+ timestamp,
+ timestamp + np.timedelta64(1, 'ns'))
+ assert_offset_equal(Nano(-1),
+ timestamp + np.timedelta64(1, 'ns'),
+ timestamp)
+ assert_offset_equal(2 * Nano(),
+ timestamp,
+ timestamp + np.timedelta64(2, 'ns'))
+ assert_offset_equal(-1 * Nano(),
+ timestamp + np.timedelta64(1, 'ns'),
+ timestamp)
+
+ assert Nano(3) + Nano(2) == Nano(5)
+ assert Nano(3) - Nano(2) == Nano()
+
+ # GH9284
+ assert Nano(1) + Nano(10) == Nano(11)
+ assert Nano(5) + Micro(1) == Nano(1005)
+ assert Micro(5) + Nano(1) == Nano(5001)
+
+
[email protected]('kls, expected',
+ [(Hour, Timedelta(hours=5)),
+ (Minute, Timedelta(hours=2, minutes=3)),
+ (Second, Timedelta(hours=2, seconds=3)),
+ (Milli, Timedelta(hours=2, milliseconds=3)),
+ (Micro, Timedelta(hours=2, microseconds=3)),
+ (Nano, Timedelta(hours=2, nanoseconds=3))])
+def test_tick_addition(kls, expected):
+ offset = kls(3)
+ result = offset + Timedelta(hours=2)
+ assert isinstance(result, Timedelta)
+ assert result == expected
+
+
[email protected]('cls', tick_classes)
+def test_tick_division(cls):
+ off = cls(10)
+
+ assert off / cls(5) == 2
+ assert off / 2 == cls(5)
+ assert off / 2.0 == cls(5)
+
+ assert off / off.delta == 1
+ assert off / off.delta.to_timedelta64() == 1
+
+ assert off / Nano(1) == off.delta / Nano(1).delta
+
+ if cls is not Nano:
+ # A case where we end up with a smaller class
+ result = off / 1000
+ assert isinstance(result, offsets.Tick)
+ assert not isinstance(result, cls)
+ assert result.delta == off.delta / 1000
+
+ if cls._inc < Timedelta(seconds=1):
+ # Case where we end up with a bigger class
+ result = off / .001
+ assert isinstance(result, offsets.Tick)
+ assert not isinstance(result, cls)
+ assert result.delta == off.delta / .001
+
+
[email protected]('cls1', tick_classes)
[email protected]('cls2', tick_classes)
+def test_tick_zero(cls1, cls2):
+ assert cls1(0) == cls2(0)
+ assert cls1(0) + cls2(0) == cls1(0)
+
+ if cls1 is not Nano:
+ assert cls1(2) + cls2(0) == cls1(2)
+
+ if cls1 is Nano:
+ assert cls1(2) + Nano(0) == cls1(2)
+
+
[email protected]('cls', tick_classes)
+def test_tick_equalities(cls):
+ assert cls() == cls(1)
+
+
[email protected]('cls', tick_classes)
+def test_tick_offset(cls):
+ assert not cls().isAnchored()
+
+
[email protected]('cls', tick_classes)
+def test_compare_ticks(cls):
+ three = cls(3)
+ four = cls(4)
+
+ assert three < cls(4)
+ assert cls(3) < four
+ assert four > cls(3)
+ assert cls(4) > three
+ assert cls(3) == cls(3)
+ assert cls(3) != cls(4)
+
+
[email protected]('cls', tick_classes)
+def test_compare_ticks_to_strs(cls):
+ # GH#23524
+ off = cls(19)
+
+    # These tests should work with any strings, but we are particularly
+    # interested in "infer", as that comparison is convenient to make in
+    # Datetime/Timedelta Array/Index constructors
+ assert not off == "infer"
+ assert not "foo" == off
+
+ for left, right in [("infer", off), (off, "infer")]:
+ with pytest.raises(TypeError):
+ left < right
+ with pytest.raises(TypeError):
+ left <= right
+ with pytest.raises(TypeError):
+ left > right
+ with pytest.raises(TypeError):
+ left >= right
diff --git a/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_yqm_offsets.py b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_yqm_offsets.py
new file mode 100644
index 00000000000..8023ee3139d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tseries/offsets/test_yqm_offsets.py
@@ -0,0 +1,1027 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for Year, Quarter, and Month-based DateOffset subclasses
+"""
+from datetime import datetime
+
+import pytest
+
+import pandas as pd
+from pandas import Timestamp, compat
+
+from pandas.tseries.offsets import (
+ BMonthBegin, BMonthEnd, BQuarterBegin, BQuarterEnd, BYearBegin, BYearEnd,
+ MonthBegin, MonthEnd, QuarterBegin, QuarterEnd, YearBegin, YearEnd)
+
+from .common import assert_offset_equal, assert_onOffset
+from .test_offsets import Base
+
+# --------------------------------------------------------------------
+# Misc
+
+
+def test_quarterly_dont_normalize():
+ date = datetime(2012, 3, 31, 5, 30)
+
+ offsets = (QuarterBegin, QuarterEnd, BQuarterEnd, BQuarterBegin)
+
+ for klass in offsets:
+ result = date + klass()
+ assert (result.time() == date.time())
+
+
[email protected]('n', [-2, 1])
[email protected]('cls', [MonthBegin, MonthEnd,
+ BMonthBegin, BMonthEnd,
+ QuarterBegin, QuarterEnd,
+ BQuarterBegin, BQuarterEnd,
+ YearBegin, YearEnd,
+ BYearBegin, BYearEnd])
+def test_apply_index(cls, n):
+ offset = cls(n=n)
+ rng = pd.date_range(start='1/1/2000', periods=100000, freq='T')
+ ser = pd.Series(rng)
+
+ res = rng + offset
+ res_v2 = offset.apply_index(rng)
+ assert (res == res_v2).all()
+ assert res[0] == rng[0] + offset
+ assert res[-1] == rng[-1] + offset
+ res2 = ser + offset
+ # apply_index is only for indexes, not series, so no res2_v2
+ assert res2.iloc[0] == ser.iloc[0] + offset
+ assert res2.iloc[-1] == ser.iloc[-1] + offset
+
+
[email protected]('offset', [QuarterBegin(), QuarterEnd(),
+ BQuarterBegin(), BQuarterEnd()])
+def test_on_offset(offset):
+ dates = [datetime(2016, m, d)
+ for m in [10, 11, 12]
+ for d in [1, 2, 3, 28, 29, 30, 31] if not (m == 11 and d == 31)]
+ for date in dates:
+ res = offset.onOffset(date)
+ slow_version = date == (date + offset) - offset
+ assert res == slow_version
+
+
+# --------------------------------------------------------------------
+# Months
+
+class TestMonthBegin(Base):
+ _offset = MonthBegin
+
+ offset_cases = []
+ # NOTE: I'm not entirely happy with the logic here for Begin -ss
+ # see thread 'offset conventions' on the ML
+ offset_cases.append((MonthBegin(), {
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2008, 2, 1): datetime(2008, 3, 1),
+ datetime(2006, 12, 31): datetime(2007, 1, 1),
+ datetime(2006, 12, 1): datetime(2007, 1, 1),
+ datetime(2007, 1, 31): datetime(2007, 2, 1)}))
+
+ offset_cases.append((MonthBegin(0), {
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2006, 12, 3): datetime(2007, 1, 1),
+ datetime(2007, 1, 31): datetime(2007, 2, 1)}))
+
+ offset_cases.append((MonthBegin(2), {
+ datetime(2008, 2, 29): datetime(2008, 4, 1),
+ datetime(2008, 1, 31): datetime(2008, 3, 1),
+ datetime(2006, 12, 31): datetime(2007, 2, 1),
+ datetime(2007, 12, 28): datetime(2008, 2, 1),
+ datetime(2007, 1, 1): datetime(2007, 3, 1),
+ datetime(2006, 11, 1): datetime(2007, 1, 1)}))
+
+ offset_cases.append((MonthBegin(-1), {
+ datetime(2007, 1, 1): datetime(2006, 12, 1),
+ datetime(2008, 5, 31): datetime(2008, 5, 1),
+ datetime(2008, 12, 31): datetime(2008, 12, 1),
+ datetime(2006, 12, 29): datetime(2006, 12, 1),
+ datetime(2006, 1, 2): datetime(2006, 1, 1)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+
+class TestMonthEnd(Base):
+ _offset = MonthEnd
+
+ def test_day_of_month(self):
+ dt = datetime(2007, 1, 1)
+ offset = MonthEnd()
+
+ result = dt + offset
+ assert result == Timestamp(2007, 1, 31)
+
+ result = result + offset
+ assert result == Timestamp(2007, 2, 28)
+
+ def test_normalize(self):
+ dt = datetime(2007, 1, 1, 3)
+
+ result = dt + MonthEnd(normalize=True)
+ expected = dt.replace(hour=0) + MonthEnd()
+ assert result == expected
+
+ offset_cases = []
+ offset_cases.append((MonthEnd(), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 2, 29),
+ datetime(2006, 12, 29): datetime(2006, 12, 31),
+ datetime(2006, 12, 31): datetime(2007, 1, 31),
+ datetime(2007, 1, 1): datetime(2007, 1, 31),
+ datetime(2006, 12, 1): datetime(2006, 12, 31)}))
+
+ offset_cases.append((MonthEnd(0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 1, 31),
+ datetime(2006, 12, 29): datetime(2006, 12, 31),
+ datetime(2006, 12, 31): datetime(2006, 12, 31),
+ datetime(2007, 1, 1): datetime(2007, 1, 31)}))
+
+ offset_cases.append((MonthEnd(2), {
+ datetime(2008, 1, 1): datetime(2008, 2, 29),
+ datetime(2008, 1, 31): datetime(2008, 3, 31),
+ datetime(2006, 12, 29): datetime(2007, 1, 31),
+ datetime(2006, 12, 31): datetime(2007, 2, 28),
+ datetime(2007, 1, 1): datetime(2007, 2, 28),
+ datetime(2006, 11, 1): datetime(2006, 12, 31)}))
+
+ offset_cases.append((MonthEnd(-1), {
+ datetime(2007, 1, 1): datetime(2006, 12, 31),
+ datetime(2008, 6, 30): datetime(2008, 5, 31),
+ datetime(2008, 12, 31): datetime(2008, 11, 30),
+ datetime(2006, 12, 29): datetime(2006, 11, 30),
+ datetime(2006, 12, 30): datetime(2006, 11, 30),
+ datetime(2007, 1, 1): datetime(2006, 12, 31)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ on_offset_cases = [(MonthEnd(), datetime(2007, 12, 31), True),
+ (MonthEnd(), datetime(2008, 1, 1), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+
+class TestBMonthBegin(Base):
+ _offset = BMonthBegin
+
+ def test_offsets_compare_equal(self):
+ # root cause of #456
+ offset1 = BMonthBegin()
+ offset2 = BMonthBegin()
+ assert not offset1 != offset2
+
+ offset_cases = []
+ offset_cases.append((BMonthBegin(), {
+ datetime(2008, 1, 1): datetime(2008, 2, 1),
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2006, 12, 29): datetime(2007, 1, 1),
+ datetime(2006, 12, 31): datetime(2007, 1, 1),
+ datetime(2006, 9, 1): datetime(2006, 10, 2),
+ datetime(2007, 1, 1): datetime(2007, 2, 1),
+ datetime(2006, 12, 1): datetime(2007, 1, 1)}))
+
+ offset_cases.append((BMonthBegin(0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2006, 10, 2): datetime(2006, 10, 2),
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2006, 12, 29): datetime(2007, 1, 1),
+ datetime(2006, 12, 31): datetime(2007, 1, 1),
+ datetime(2006, 9, 15): datetime(2006, 10, 2)}))
+
+ offset_cases.append((BMonthBegin(2), {
+ datetime(2008, 1, 1): datetime(2008, 3, 3),
+ datetime(2008, 1, 15): datetime(2008, 3, 3),
+ datetime(2006, 12, 29): datetime(2007, 2, 1),
+ datetime(2006, 12, 31): datetime(2007, 2, 1),
+ datetime(2007, 1, 1): datetime(2007, 3, 1),
+ datetime(2006, 11, 1): datetime(2007, 1, 1)}))
+
+ offset_cases.append((BMonthBegin(-1), {
+ datetime(2007, 1, 1): datetime(2006, 12, 1),
+ datetime(2008, 6, 30): datetime(2008, 6, 2),
+ datetime(2008, 6, 1): datetime(2008, 5, 1),
+ datetime(2008, 3, 10): datetime(2008, 3, 3),
+ datetime(2008, 12, 31): datetime(2008, 12, 1),
+ datetime(2006, 12, 29): datetime(2006, 12, 1),
+ datetime(2006, 12, 30): datetime(2006, 12, 1),
+ datetime(2007, 1, 1): datetime(2006, 12, 1)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ on_offset_cases = [(BMonthBegin(), datetime(2007, 12, 31), False),
+ (BMonthBegin(), datetime(2008, 1, 1), True),
+ (BMonthBegin(), datetime(2001, 4, 2), True),
+ (BMonthBegin(), datetime(2008, 3, 3), True)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+
+class TestBMonthEnd(Base):
+ _offset = BMonthEnd
+
+ def test_normalize(self):
+ dt = datetime(2007, 1, 1, 3)
+
+ result = dt + BMonthEnd(normalize=True)
+ expected = dt.replace(hour=0) + BMonthEnd()
+ assert result == expected
+
+ def test_offsets_compare_equal(self):
+ # root cause of #456
+ offset1 = BMonthEnd()
+ offset2 = BMonthEnd()
+ assert not offset1 != offset2
+
+ offset_cases = []
+ offset_cases.append((BMonthEnd(), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 2, 29),
+ datetime(2006, 12, 29): datetime(2007, 1, 31),
+ datetime(2006, 12, 31): datetime(2007, 1, 31),
+ datetime(2007, 1, 1): datetime(2007, 1, 31),
+ datetime(2006, 12, 1): datetime(2006, 12, 29)}))
+
+ offset_cases.append((BMonthEnd(0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 1, 31),
+ datetime(2006, 12, 29): datetime(2006, 12, 29),
+ datetime(2006, 12, 31): datetime(2007, 1, 31),
+ datetime(2007, 1, 1): datetime(2007, 1, 31)}))
+
+ offset_cases.append((BMonthEnd(2), {
+ datetime(2008, 1, 1): datetime(2008, 2, 29),
+ datetime(2008, 1, 31): datetime(2008, 3, 31),
+ datetime(2006, 12, 29): datetime(2007, 2, 28),
+ datetime(2006, 12, 31): datetime(2007, 2, 28),
+ datetime(2007, 1, 1): datetime(2007, 2, 28),
+ datetime(2006, 11, 1): datetime(2006, 12, 29)}))
+
+ offset_cases.append((BMonthEnd(-1), {
+ datetime(2007, 1, 1): datetime(2006, 12, 29),
+ datetime(2008, 6, 30): datetime(2008, 5, 30),
+ datetime(2008, 12, 31): datetime(2008, 11, 28),
+ datetime(2006, 12, 29): datetime(2006, 11, 30),
+ datetime(2006, 12, 30): datetime(2006, 12, 29),
+ datetime(2007, 1, 1): datetime(2006, 12, 29)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ on_offset_cases = [(BMonthEnd(), datetime(2007, 12, 31), True),
+ (BMonthEnd(), datetime(2008, 1, 1), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+# --------------------------------------------------------------------
+# Quarters
+
+
+class TestQuarterBegin(Base):
+    _offset = QuarterBegin
+
+ def test_repr(self):
+ expected = "<QuarterBegin: startingMonth=3>"
+ assert repr(QuarterBegin()) == expected
+ expected = "<QuarterBegin: startingMonth=3>"
+ assert repr(QuarterBegin(startingMonth=3)) == expected
+ expected = "<QuarterBegin: startingMonth=1>"
+ assert repr(QuarterBegin(startingMonth=1)) == expected
+
+ def test_isAnchored(self):
+ assert QuarterBegin(startingMonth=1).isAnchored()
+ assert QuarterBegin().isAnchored()
+ assert not QuarterBegin(2, startingMonth=1).isAnchored()
+
+ def test_offset_corner_case(self):
+ # corner
+ offset = QuarterBegin(n=-1, startingMonth=1)
+ assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1)
+
+ offset_cases = []
+ offset_cases.append((QuarterBegin(startingMonth=1), {
+ datetime(2007, 12, 1): datetime(2008, 1, 1),
+ datetime(2008, 1, 1): datetime(2008, 4, 1),
+ datetime(2008, 2, 15): datetime(2008, 4, 1),
+ datetime(2008, 2, 29): datetime(2008, 4, 1),
+ datetime(2008, 3, 15): datetime(2008, 4, 1),
+ datetime(2008, 3, 31): datetime(2008, 4, 1),
+ datetime(2008, 4, 15): datetime(2008, 7, 1),
+ datetime(2008, 4, 1): datetime(2008, 7, 1)}))
+
+ offset_cases.append((QuarterBegin(startingMonth=2), {
+ datetime(2008, 1, 1): datetime(2008, 2, 1),
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2008, 1, 15): datetime(2008, 2, 1),
+ datetime(2008, 2, 29): datetime(2008, 5, 1),
+ datetime(2008, 3, 15): datetime(2008, 5, 1),
+ datetime(2008, 3, 31): datetime(2008, 5, 1),
+ datetime(2008, 4, 15): datetime(2008, 5, 1),
+ datetime(2008, 4, 30): datetime(2008, 5, 1)}))
+
+ offset_cases.append((QuarterBegin(startingMonth=1, n=0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2008, 12, 1): datetime(2009, 1, 1),
+ datetime(2008, 2, 15): datetime(2008, 4, 1),
+ datetime(2008, 2, 29): datetime(2008, 4, 1),
+ datetime(2008, 3, 15): datetime(2008, 4, 1),
+ datetime(2008, 3, 31): datetime(2008, 4, 1),
+ datetime(2008, 4, 15): datetime(2008, 7, 1),
+ datetime(2008, 4, 30): datetime(2008, 7, 1)}))
+
+ offset_cases.append((QuarterBegin(startingMonth=1, n=-1), {
+ datetime(2008, 1, 1): datetime(2007, 10, 1),
+ datetime(2008, 1, 31): datetime(2008, 1, 1),
+ datetime(2008, 2, 15): datetime(2008, 1, 1),
+ datetime(2008, 2, 29): datetime(2008, 1, 1),
+ datetime(2008, 3, 15): datetime(2008, 1, 1),
+ datetime(2008, 3, 31): datetime(2008, 1, 1),
+ datetime(2008, 4, 15): datetime(2008, 4, 1),
+ datetime(2008, 4, 30): datetime(2008, 4, 1),
+ datetime(2008, 7, 1): datetime(2008, 4, 1)}))
+
+ offset_cases.append((QuarterBegin(startingMonth=1, n=2), {
+ datetime(2008, 1, 1): datetime(2008, 7, 1),
+ datetime(2008, 2, 15): datetime(2008, 7, 1),
+ datetime(2008, 2, 29): datetime(2008, 7, 1),
+ datetime(2008, 3, 15): datetime(2008, 7, 1),
+ datetime(2008, 3, 31): datetime(2008, 7, 1),
+ datetime(2008, 4, 15): datetime(2008, 10, 1),
+ datetime(2008, 4, 1): datetime(2008, 10, 1)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+
+class TestQuarterEnd(Base):
+ _offset = QuarterEnd
+
+ def test_repr(self):
+ expected = "<QuarterEnd: startingMonth=3>"
+ assert repr(QuarterEnd()) == expected
+ expected = "<QuarterEnd: startingMonth=3>"
+ assert repr(QuarterEnd(startingMonth=3)) == expected
+ expected = "<QuarterEnd: startingMonth=1>"
+ assert repr(QuarterEnd(startingMonth=1)) == expected
+
+ def test_isAnchored(self):
+ assert QuarterEnd(startingMonth=1).isAnchored()
+ assert QuarterEnd().isAnchored()
+ assert not QuarterEnd(2, startingMonth=1).isAnchored()
+
+ def test_offset_corner_case(self):
+ # corner case: n=-1 rolls back to the most recent quarter end
+ offset = QuarterEnd(n=-1, startingMonth=1)
+ assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31)
+
+ offset_cases = []
+ offset_cases.append((QuarterEnd(startingMonth=1), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 4, 30),
+ datetime(2008, 2, 15): datetime(2008, 4, 30),
+ datetime(2008, 2, 29): datetime(2008, 4, 30),
+ datetime(2008, 3, 15): datetime(2008, 4, 30),
+ datetime(2008, 3, 31): datetime(2008, 4, 30),
+ datetime(2008, 4, 15): datetime(2008, 4, 30),
+ datetime(2008, 4, 30): datetime(2008, 7, 31)}))
+
+ offset_cases.append((QuarterEnd(startingMonth=2), {
+ datetime(2008, 1, 1): datetime(2008, 2, 29),
+ datetime(2008, 1, 31): datetime(2008, 2, 29),
+ datetime(2008, 2, 15): datetime(2008, 2, 29),
+ datetime(2008, 2, 29): datetime(2008, 5, 31),
+ datetime(2008, 3, 15): datetime(2008, 5, 31),
+ datetime(2008, 3, 31): datetime(2008, 5, 31),
+ datetime(2008, 4, 15): datetime(2008, 5, 31),
+ datetime(2008, 4, 30): datetime(2008, 5, 31)}))
+
+ offset_cases.append((QuarterEnd(startingMonth=1, n=0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 1, 31),
+ datetime(2008, 2, 15): datetime(2008, 4, 30),
+ datetime(2008, 2, 29): datetime(2008, 4, 30),
+ datetime(2008, 3, 15): datetime(2008, 4, 30),
+ datetime(2008, 3, 31): datetime(2008, 4, 30),
+ datetime(2008, 4, 15): datetime(2008, 4, 30),
+ datetime(2008, 4, 30): datetime(2008, 4, 30)}))
+
+ offset_cases.append((QuarterEnd(startingMonth=1, n=-1), {
+ datetime(2008, 1, 1): datetime(2007, 10, 31),
+ datetime(2008, 1, 31): datetime(2007, 10, 31),
+ datetime(2008, 2, 15): datetime(2008, 1, 31),
+ datetime(2008, 2, 29): datetime(2008, 1, 31),
+ datetime(2008, 3, 15): datetime(2008, 1, 31),
+ datetime(2008, 3, 31): datetime(2008, 1, 31),
+ datetime(2008, 4, 15): datetime(2008, 1, 31),
+ datetime(2008, 4, 30): datetime(2008, 1, 31),
+ datetime(2008, 7, 1): datetime(2008, 4, 30)}))
+
+ offset_cases.append((QuarterEnd(startingMonth=1, n=2), {
+ datetime(2008, 1, 31): datetime(2008, 7, 31),
+ datetime(2008, 2, 15): datetime(2008, 7, 31),
+ datetime(2008, 2, 29): datetime(2008, 7, 31),
+ datetime(2008, 3, 15): datetime(2008, 7, 31),
+ datetime(2008, 3, 31): datetime(2008, 7, 31),
+ datetime(2008, 4, 15): datetime(2008, 7, 31),
+ datetime(2008, 4, 30): datetime(2008, 10, 31)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ on_offset_cases = [
+ (QuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True),
+ (QuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False),
+ (QuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False),
+ (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False),
+ (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False),
+ (QuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True),
+ (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False),
+ (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 31), False),
+ (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False),
+ (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True),
+ (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 31), True),
+ (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True),
+ (QuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), True),
+ (QuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 31), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+
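+# Note: the business-quarter variants behave like QuarterBegin/QuarterEnd
+# but move weekend anchors to the nearest business day; Apr 1, 2007 is a
+# Sunday, so the cases below map into Mon Apr 2 instead, e.g.
+#   datetime(2007, 3, 15) + BQuarterBegin(startingMonth=1) == datetime(2007, 4, 2)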
+class TestBQuarterBegin(Base):
+ _offset = BQuarterBegin
+
+ def test_repr(self):
+ expected = "<BusinessQuarterBegin: startingMonth=3>"
+ assert repr(BQuarterBegin()) == expected
+ expected = "<BusinessQuarterBegin: startingMonth=3>"
+ assert repr(BQuarterBegin(startingMonth=3)) == expected
+ expected = "<BusinessQuarterBegin: startingMonth=1>"
+ assert repr(BQuarterBegin(startingMonth=1)) == expected
+
+ def test_isAnchored(self):
+ assert BQuarterBegin(startingMonth=1).isAnchored()
+ assert BQuarterBegin().isAnchored()
+ assert not BQuarterBegin(2, startingMonth=1).isAnchored()
+
+ def test_offset_corner_case(self):
+ # corner case: Apr 1, 2007 is a Sunday, so the business quarter
+ # start is Mon Apr 2
+ offset = BQuarterBegin(n=-1, startingMonth=1)
+ assert datetime(2007, 4, 3) + offset == datetime(2007, 4, 2)
+
+ offset_cases = []
+ offset_cases.append((BQuarterBegin(startingMonth=1), {
+ datetime(2008, 1, 1): datetime(2008, 4, 1),
+ datetime(2008, 1, 31): datetime(2008, 4, 1),
+ datetime(2008, 2, 15): datetime(2008, 4, 1),
+ datetime(2008, 2, 29): datetime(2008, 4, 1),
+ datetime(2008, 3, 15): datetime(2008, 4, 1),
+ datetime(2008, 3, 31): datetime(2008, 4, 1),
+ datetime(2008, 4, 15): datetime(2008, 7, 1),
+ datetime(2007, 3, 15): datetime(2007, 4, 2),
+ datetime(2007, 2, 28): datetime(2007, 4, 2),
+ datetime(2007, 1, 1): datetime(2007, 4, 2),
+ datetime(2007, 4, 15): datetime(2007, 7, 2),
+ datetime(2007, 7, 1): datetime(2007, 7, 2),
+ datetime(2007, 4, 1): datetime(2007, 4, 2),
+ datetime(2007, 4, 2): datetime(2007, 7, 2),
+ datetime(2008, 4, 30): datetime(2008, 7, 1)}))
+
+ offset_cases.append((BQuarterBegin(startingMonth=2), {
+ datetime(2008, 1, 1): datetime(2008, 2, 1),
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2008, 1, 15): datetime(2008, 2, 1),
+ datetime(2008, 2, 29): datetime(2008, 5, 1),
+ datetime(2008, 3, 15): datetime(2008, 5, 1),
+ datetime(2008, 3, 31): datetime(2008, 5, 1),
+ datetime(2008, 4, 15): datetime(2008, 5, 1),
+ datetime(2008, 8, 15): datetime(2008, 11, 3),
+ datetime(2008, 9, 15): datetime(2008, 11, 3),
+ datetime(2008, 11, 1): datetime(2008, 11, 3),
+ datetime(2008, 4, 30): datetime(2008, 5, 1)}))
+
+ offset_cases.append((BQuarterBegin(startingMonth=1, n=0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2007, 12, 31): datetime(2008, 1, 1),
+ datetime(2008, 2, 15): datetime(2008, 4, 1),
+ datetime(2008, 2, 29): datetime(2008, 4, 1),
+ datetime(2008, 1, 15): datetime(2008, 4, 1),
+ datetime(2008, 2, 27): datetime(2008, 4, 1),
+ datetime(2008, 3, 15): datetime(2008, 4, 1),
+ datetime(2007, 4, 1): datetime(2007, 4, 2),
+ datetime(2007, 4, 2): datetime(2007, 4, 2),
+ datetime(2007, 7, 1): datetime(2007, 7, 2),
+ datetime(2007, 4, 15): datetime(2007, 7, 2),
+ datetime(2007, 7, 2): datetime(2007, 7, 2)}))
+
+ offset_cases.append((BQuarterBegin(startingMonth=1, n=-1), {
+ datetime(2008, 1, 1): datetime(2007, 10, 1),
+ datetime(2008, 1, 31): datetime(2008, 1, 1),
+ datetime(2008, 2, 15): datetime(2008, 1, 1),
+ datetime(2008, 2, 29): datetime(2008, 1, 1),
+ datetime(2008, 3, 15): datetime(2008, 1, 1),
+ datetime(2008, 3, 31): datetime(2008, 1, 1),
+ datetime(2008, 4, 15): datetime(2008, 4, 1),
+ datetime(2007, 7, 3): datetime(2007, 7, 2),
+ datetime(2007, 4, 3): datetime(2007, 4, 2),
+ datetime(2007, 7, 2): datetime(2007, 4, 2),
+ datetime(2008, 4, 1): datetime(2008, 1, 1)}))
+
+ offset_cases.append((BQuarterBegin(startingMonth=1, n=2), {
+ datetime(2008, 1, 1): datetime(2008, 7, 1),
+ datetime(2008, 1, 15): datetime(2008, 7, 1),
+ datetime(2008, 2, 29): datetime(2008, 7, 1),
+ datetime(2008, 3, 15): datetime(2008, 7, 1),
+ datetime(2007, 3, 31): datetime(2007, 7, 2),
+ datetime(2007, 4, 15): datetime(2007, 10, 1),
+ datetime(2008, 4, 30): datetime(2008, 10, 1)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+
+class TestBQuarterEnd(Base):
+ _offset = BQuarterEnd
+
+ def test_repr(self):
+ expected = "<BusinessQuarterEnd: startingMonth=3>"
+ assert repr(BQuarterEnd()) == expected
+ expected = "<BusinessQuarterEnd: startingMonth=3>"
+ assert repr(BQuarterEnd(startingMonth=3)) == expected
+ expected = "<BusinessQuarterEnd: startingMonth=1>"
+ assert repr(BQuarterEnd(startingMonth=1)) == expected
+
+ def test_isAnchored(self):
+ assert BQuarterEnd(startingMonth=1).isAnchored()
+ assert BQuarterEnd().isAnchored()
+ assert not BQuarterEnd(2, startingMonth=1).isAnchored()
+
+ def test_offset_corner_case(self):
+ # corner case: Jan 31, 2010 is a Sunday, so the business quarter
+ # end is Fri Jan 29
+ offset = BQuarterEnd(n=-1, startingMonth=1)
+ assert datetime(2010, 1, 31) + offset == datetime(2010, 1, 29)
+
+ offset_cases = []
+ offset_cases.append((BQuarterEnd(startingMonth=1), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 4, 30),
+ datetime(2008, 2, 15): datetime(2008, 4, 30),
+ datetime(2008, 2, 29): datetime(2008, 4, 30),
+ datetime(2008, 3, 15): datetime(2008, 4, 30),
+ datetime(2008, 3, 31): datetime(2008, 4, 30),
+ datetime(2008, 4, 15): datetime(2008, 4, 30),
+ datetime(2008, 4, 30): datetime(2008, 7, 31)}))
+
+ offset_cases.append((BQuarterEnd(startingMonth=2), {
+ datetime(2008, 1, 1): datetime(2008, 2, 29),
+ datetime(2008, 1, 31): datetime(2008, 2, 29),
+ datetime(2008, 2, 15): datetime(2008, 2, 29),
+ datetime(2008, 2, 29): datetime(2008, 5, 30),
+ datetime(2008, 3, 15): datetime(2008, 5, 30),
+ datetime(2008, 3, 31): datetime(2008, 5, 30),
+ datetime(2008, 4, 15): datetime(2008, 5, 30),
+ datetime(2008, 4, 30): datetime(2008, 5, 30)}))
+
+ offset_cases.append((BQuarterEnd(startingMonth=1, n=0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 1, 31),
+ datetime(2008, 2, 15): datetime(2008, 4, 30),
+ datetime(2008, 2, 29): datetime(2008, 4, 30),
+ datetime(2008, 3, 15): datetime(2008, 4, 30),
+ datetime(2008, 3, 31): datetime(2008, 4, 30),
+ datetime(2008, 4, 15): datetime(2008, 4, 30),
+ datetime(2008, 4, 30): datetime(2008, 4, 30)}))
+
+ offset_cases.append((BQuarterEnd(startingMonth=1, n=-1), {
+ datetime(2008, 1, 1): datetime(2007, 10, 31),
+ datetime(2008, 1, 31): datetime(2007, 10, 31),
+ datetime(2008, 2, 15): datetime(2008, 1, 31),
+ datetime(2008, 2, 29): datetime(2008, 1, 31),
+ datetime(2008, 3, 15): datetime(2008, 1, 31),
+ datetime(2008, 3, 31): datetime(2008, 1, 31),
+ datetime(2008, 4, 15): datetime(2008, 1, 31),
+ datetime(2008, 4, 30): datetime(2008, 1, 31)}))
+
+ offset_cases.append((BQuarterEnd(startingMonth=1, n=2), {
+ datetime(2008, 1, 31): datetime(2008, 7, 31),
+ datetime(2008, 2, 15): datetime(2008, 7, 31),
+ datetime(2008, 2, 29): datetime(2008, 7, 31),
+ datetime(2008, 3, 15): datetime(2008, 7, 31),
+ datetime(2008, 3, 31): datetime(2008, 7, 31),
+ datetime(2008, 4, 15): datetime(2008, 7, 31),
+ datetime(2008, 4, 30): datetime(2008, 10, 31)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ on_offset_cases = [
+ (BQuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True),
+ (BQuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False),
+ (BQuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False),
+ (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False),
+ (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False),
+ (BQuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True),
+ (BQuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False),
+ (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False),
+ (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False),
+ (BQuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False),
+ (BQuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False),
+ (BQuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True),
+ (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False),
+ (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False),
+ (BQuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False),
+ (BQuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), True),
+ (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False),
+ (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False),
+ (BQuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False),
+ (BQuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True),
+ (BQuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False),
+ (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), True),
+ (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), False),
+ (BQuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False),
+ (BQuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False),
+ (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), True),
+ (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+# --------------------------------------------------------------------
+# Years
+
+
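+# Note: year offsets default to calendar-year anchors (Jan 1 for
+# YearBegin, Dec 31 for YearEnd); passing ``month`` re-anchors them, so
+# the YearBegin(month=4) cases below describe an April-anchored year, e.g.
+#   datetime(2007, 3, 1) + YearBegin(month=4) == datetime(2007, 4, 1)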
+class TestYearBegin(Base):
+ _offset = YearBegin
+
+ def test_misspecified(self):
+ pytest.raises(ValueError, YearBegin, month=13)
+
+ offset_cases = []
+ offset_cases.append((YearBegin(), {
+ datetime(2008, 1, 1): datetime(2009, 1, 1),
+ datetime(2008, 6, 30): datetime(2009, 1, 1),
+ datetime(2008, 12, 31): datetime(2009, 1, 1),
+ datetime(2005, 12, 30): datetime(2006, 1, 1),
+ datetime(2005, 12, 31): datetime(2006, 1, 1)}))
+
+ offset_cases.append((YearBegin(0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2008, 6, 30): datetime(2009, 1, 1),
+ datetime(2008, 12, 31): datetime(2009, 1, 1),
+ datetime(2005, 12, 30): datetime(2006, 1, 1),
+ datetime(2005, 12, 31): datetime(2006, 1, 1)}))
+
+ offset_cases.append((YearBegin(3), {
+ datetime(2008, 1, 1): datetime(2011, 1, 1),
+ datetime(2008, 6, 30): datetime(2011, 1, 1),
+ datetime(2008, 12, 31): datetime(2011, 1, 1),
+ datetime(2005, 12, 30): datetime(2008, 1, 1),
+ datetime(2005, 12, 31): datetime(2008, 1, 1)}))
+
+ offset_cases.append((YearBegin(-1), {
+ datetime(2007, 1, 15): datetime(2007, 1, 1),
+ datetime(2008, 6, 30): datetime(2008, 1, 1),
+ datetime(2008, 12, 31): datetime(2008, 1, 1),
+ datetime(2006, 12, 29): datetime(2006, 1, 1),
+ datetime(2006, 12, 30): datetime(2006, 1, 1),
+ datetime(2007, 1, 1): datetime(2006, 1, 1)}))
+
+ offset_cases.append((YearBegin(-2), {
+ datetime(2007, 1, 1): datetime(2005, 1, 1),
+ datetime(2008, 6, 30): datetime(2007, 1, 1),
+ datetime(2008, 12, 31): datetime(2007, 1, 1)}))
+
+ offset_cases.append((YearBegin(month=4), {
+ datetime(2007, 4, 1): datetime(2008, 4, 1),
+ datetime(2007, 4, 15): datetime(2008, 4, 1),
+ datetime(2007, 3, 1): datetime(2007, 4, 1),
+ datetime(2007, 12, 15): datetime(2008, 4, 1),
+ datetime(2012, 1, 31): datetime(2012, 4, 1)}))
+
+ offset_cases.append((YearBegin(0, month=4), {
+ datetime(2007, 4, 1): datetime(2007, 4, 1),
+ datetime(2007, 3, 1): datetime(2007, 4, 1),
+ datetime(2007, 12, 15): datetime(2008, 4, 1),
+ datetime(2012, 1, 31): datetime(2012, 4, 1)}))
+
+ offset_cases.append((YearBegin(4, month=4), {
+ datetime(2007, 4, 1): datetime(2011, 4, 1),
+ datetime(2007, 4, 15): datetime(2011, 4, 1),
+ datetime(2007, 3, 1): datetime(2010, 4, 1),
+ datetime(2007, 12, 15): datetime(2011, 4, 1),
+ datetime(2012, 1, 31): datetime(2015, 4, 1)}))
+
+ offset_cases.append((YearBegin(-1, month=4), {
+ datetime(2007, 4, 1): datetime(2006, 4, 1),
+ datetime(2007, 3, 1): datetime(2006, 4, 1),
+ datetime(2007, 12, 15): datetime(2007, 4, 1),
+ datetime(2012, 1, 31): datetime(2011, 4, 1)}))
+
+ offset_cases.append((YearBegin(-3, month=4), {
+ datetime(2007, 4, 1): datetime(2004, 4, 1),
+ datetime(2007, 3, 1): datetime(2004, 4, 1),
+ datetime(2007, 12, 15): datetime(2005, 4, 1),
+ datetime(2012, 1, 31): datetime(2009, 4, 1)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ on_offset_cases = [(YearBegin(), datetime(2007, 1, 3), False),
+ (YearBegin(), datetime(2008, 1, 1), True),
+ (YearBegin(), datetime(2006, 12, 31), False),
+ (YearBegin(), datetime(2006, 1, 2), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+
+class TestYearEnd(Base):
+ _offset = YearEnd
+
+ def test_misspecified(self):
+ pytest.raises(ValueError, YearEnd, month=13)
+
+ offset_cases = []
+ offset_cases.append((YearEnd(), {
+ datetime(2008, 1, 1): datetime(2008, 12, 31),
+ datetime(2008, 6, 30): datetime(2008, 12, 31),
+ datetime(2008, 12, 31): datetime(2009, 12, 31),
+ datetime(2005, 12, 30): datetime(2005, 12, 31),
+ datetime(2005, 12, 31): datetime(2006, 12, 31)}))
+
+ offset_cases.append((YearEnd(0), {
+ datetime(2008, 1, 1): datetime(2008, 12, 31),
+ datetime(2008, 6, 30): datetime(2008, 12, 31),
+ datetime(2008, 12, 31): datetime(2008, 12, 31),
+ datetime(2005, 12, 30): datetime(2005, 12, 31)}))
+
+ offset_cases.append((YearEnd(-1), {
+ datetime(2008, 6, 30): datetime(2007, 12, 31),
+ datetime(2008, 12, 31): datetime(2007, 12, 31),
+ datetime(2006, 12, 29): datetime(2005, 12, 31),
+ datetime(2006, 12, 30): datetime(2005, 12, 31),
+ datetime(2007, 1, 1): datetime(2006, 12, 31)}))
+
+ offset_cases.append((YearEnd(-2), {
+ datetime(2007, 1, 1): datetime(2005, 12, 31),
+ datetime(2008, 6, 30): datetime(2006, 12, 31),
+ datetime(2008, 12, 31): datetime(2006, 12, 31)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ on_offset_cases = [(YearEnd(), datetime(2007, 12, 31), True),
+ (YearEnd(), datetime(2008, 1, 1), False),
+ (YearEnd(), datetime(2006, 12, 31), True),
+ (YearEnd(), datetime(2006, 12, 29), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+
+class TestYearEndDiffMonth(Base):
+ _offset = YearEnd
+
+ offset_cases = []
+ offset_cases.append((YearEnd(month=3),
+ {datetime(2008, 1, 1): datetime(2008, 3, 31),
+ datetime(2008, 2, 15): datetime(2008, 3, 31),
+ datetime(2008, 3, 31): datetime(2009, 3, 31),
+ datetime(2008, 3, 30): datetime(2008, 3, 31),
+ datetime(2005, 3, 31): datetime(2006, 3, 31),
+ datetime(2006, 7, 30): datetime(2007, 3, 31)}))
+
+ offset_cases.append((YearEnd(0, month=3),
+ {datetime(2008, 1, 1): datetime(2008, 3, 31),
+ datetime(2008, 2, 28): datetime(2008, 3, 31),
+ datetime(2008, 3, 31): datetime(2008, 3, 31),
+ datetime(2005, 3, 30): datetime(2005, 3, 31)}))
+
+ offset_cases.append((YearEnd(-1, month=3),
+ {datetime(2007, 1, 1): datetime(2006, 3, 31),
+ datetime(2008, 2, 28): datetime(2007, 3, 31),
+ datetime(2008, 3, 31): datetime(2007, 3, 31),
+ datetime(2006, 3, 29): datetime(2005, 3, 31),
+ datetime(2006, 3, 30): datetime(2005, 3, 31),
+ datetime(2007, 3, 1): datetime(2006, 3, 31)}))
+
+ offset_cases.append((YearEnd(-2, month=3),
+ {datetime(2007, 1, 1): datetime(2005, 3, 31),
+ datetime(2008, 6, 30): datetime(2007, 3, 31),
+ datetime(2008, 3, 31): datetime(2006, 3, 31)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ on_offset_cases = [(YearEnd(month=3), datetime(2007, 3, 31), True),
+ (YearEnd(month=3), datetime(2008, 1, 1), False),
+ (YearEnd(month=3), datetime(2006, 3, 31), True),
+ (YearEnd(month=3), datetime(2006, 3, 29), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+
+class TestBYearBegin(Base):
+ _offset = BYearBegin
+
+ def test_misspecified(self):
+ pytest.raises(ValueError, BYearBegin, month=13)
+ pytest.raises(ValueError, BYearEnd, month=13)
+
+ offset_cases = []
+ offset_cases.append((BYearBegin(), {
+ datetime(2008, 1, 1): datetime(2009, 1, 1),
+ datetime(2008, 6, 30): datetime(2009, 1, 1),
+ datetime(2008, 12, 31): datetime(2009, 1, 1),
+ datetime(2011, 1, 1): datetime(2011, 1, 3),
+ datetime(2011, 1, 3): datetime(2012, 1, 2),
+ datetime(2005, 12, 30): datetime(2006, 1, 2),
+ datetime(2005, 12, 31): datetime(2006, 1, 2)}))
+
+ offset_cases.append((BYearBegin(0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2008, 6, 30): datetime(2009, 1, 1),
+ datetime(2008, 12, 31): datetime(2009, 1, 1),
+ datetime(2005, 12, 30): datetime(2006, 1, 2),
+ datetime(2005, 12, 31): datetime(2006, 1, 2)}))
+
+ offset_cases.append((BYearBegin(-1), {
+ datetime(2007, 1, 1): datetime(2006, 1, 2),
+ datetime(2009, 1, 4): datetime(2009, 1, 1),
+ datetime(2009, 1, 1): datetime(2008, 1, 1),
+ datetime(2008, 6, 30): datetime(2008, 1, 1),
+ datetime(2008, 12, 31): datetime(2008, 1, 1),
+ datetime(2006, 12, 29): datetime(2006, 1, 2),
+ datetime(2006, 12, 30): datetime(2006, 1, 2),
+ datetime(2006, 1, 1): datetime(2005, 1, 3)}))
+
+ offset_cases.append((BYearBegin(-2), {
+ datetime(2007, 1, 1): datetime(2005, 1, 3),
+ datetime(2007, 6, 30): datetime(2006, 1, 2),
+ datetime(2008, 12, 31): datetime(2007, 1, 1)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+
+class TestBYearEnd(Base):
+ _offset = BYearEnd
+
+ offset_cases = []
+ offset_cases.append((BYearEnd(), {
+ datetime(2008, 1, 1): datetime(2008, 12, 31),
+ datetime(2008, 6, 30): datetime(2008, 12, 31),
+ datetime(2008, 12, 31): datetime(2009, 12, 31),
+ datetime(2005, 12, 30): datetime(2006, 12, 29),
+ datetime(2005, 12, 31): datetime(2006, 12, 29)}))
+
+ offset_cases.append((BYearEnd(0), {
+ datetime(2008, 1, 1): datetime(2008, 12, 31),
+ datetime(2008, 6, 30): datetime(2008, 12, 31),
+ datetime(2008, 12, 31): datetime(2008, 12, 31),
+ datetime(2005, 12, 31): datetime(2006, 12, 29)}))
+
+ offset_cases.append((BYearEnd(-1), {
+ datetime(2007, 1, 1): datetime(2006, 12, 29),
+ datetime(2008, 6, 30): datetime(2007, 12, 31),
+ datetime(2008, 12, 31): datetime(2007, 12, 31),
+ datetime(2006, 12, 29): datetime(2005, 12, 30),
+ datetime(2006, 12, 30): datetime(2006, 12, 29),
+ datetime(2007, 1, 1): datetime(2006, 12, 29)}))
+
+ offset_cases.append((BYearEnd(-2), {
+ datetime(2007, 1, 1): datetime(2005, 12, 30),
+ datetime(2008, 6, 30): datetime(2006, 12, 29),
+ datetime(2008, 12, 31): datetime(2006, 12, 29)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ on_offset_cases = [(BYearEnd(), datetime(2007, 12, 31), True),
+ (BYearEnd(), datetime(2008, 1, 1), False),
+ (BYearEnd(), datetime(2006, 12, 31), False),
+ (BYearEnd(), datetime(2006, 12, 29), True)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+
+class TestBYearEndLagged(Base):
+ _offset = BYearEnd
+
+ def test_bad_month_fail(self):
+ pytest.raises(Exception, BYearEnd, month=13)
+ pytest.raises(Exception, BYearEnd, month=0)
+
+ offset_cases = []
+ offset_cases.append((BYearEnd(month=6), {
+ datetime(2008, 1, 1): datetime(2008, 6, 30),
+ datetime(2007, 6, 30): datetime(2008, 6, 30)}))
+
+ offset_cases.append((BYearEnd(n=-1, month=6), {
+ datetime(2008, 1, 1): datetime(2007, 6, 29),
+ datetime(2007, 6, 30): datetime(2007, 6, 29)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+ def test_roll(self):
+ offset = BYearEnd(month=6)
+ date = datetime(2009, 11, 30)
+
+ assert offset.rollforward(date) == datetime(2010, 6, 30)
+ assert offset.rollback(date) == datetime(2009, 6, 30)
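+ # rollforward/rollback leave an on-offset date unchanged and
+ # otherwise move to the next/previous anchor, which for
+ # BYearEnd(month=6) is the last business day of June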
+
+ on_offset_cases = [(BYearEnd(month=2), datetime(2007, 2, 28), True),
+ (BYearEnd(month=6), datetime(2007, 6, 30), False)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/tseries/test_frequencies.py b/contrib/python/pandas/py2/pandas/tests/tseries/test_frequencies.py
new file mode 100644
index 00000000000..eb4e63654b4
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tseries/test_frequencies.py
@@ -0,0 +1,793 @@
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import frequencies as libfrequencies, resolution
+from pandas._libs.tslibs.ccalendar import MONTHS
+from pandas._libs.tslibs.frequencies import (
+ INVALID_FREQ_ERR_MSG, FreqGroup, _period_code_map, get_freq, get_freq_code)
+import pandas.compat as compat
+from pandas.compat import is_platform_windows, range
+
+from pandas import (
+ DatetimeIndex, Index, Series, Timedelta, Timestamp, date_range,
+ period_range)
+from pandas.core.tools.datetimes import to_datetime
+import pandas.util.testing as tm
+
+import pandas.tseries.frequencies as frequencies
+import pandas.tseries.offsets as offsets
+
+
+class TestToOffset(object):
+
+ def test_to_offset_multiple(self):
+ freqstr = '2h30min'
+ freqstr2 = '2h 30min'
+
+ result = frequencies.to_offset(freqstr)
+ assert (result == frequencies.to_offset(freqstr2))
+ expected = offsets.Minute(150)
+ assert (result == expected)
+
+ freqstr = '2h30min15s'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.Second(150 * 60 + 15)
+ assert (result == expected)
+
+ freqstr = '2h 60min'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.Hour(3)
+ assert (result == expected)
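+ # components of a compound string are summed and, when the total is
+ # an exact multiple of a coarser unit, normalized to it:
+ # '2h 60min' collapses to Hour(3), while '2h30min' stays Minute(150)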
+
+ freqstr = '2h 20.5min'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.Second(8430)
+ assert (result == expected)
+
+ freqstr = '1.5min'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.Second(90)
+ assert (result == expected)
+
+ freqstr = '0.5S'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.Milli(500)
+ assert (result == expected)
+
+ freqstr = '15l500u'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.Micro(15500)
+ assert (result == expected)
+
+ freqstr = '10s75L'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.Milli(10075)
+ assert (result == expected)
+
+ freqstr = '1s0.25ms'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.Micro(1000250)
+ assert (result == expected)
+
+ freqstr = '1s0.25L'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.Micro(1000250)
+ assert (result == expected)
+
+ freqstr = '2800N'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.Nano(2800)
+ assert (result == expected)
+
+ freqstr = '2SM'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.SemiMonthEnd(2)
+ assert (result == expected)
+
+ freqstr = '2SM-16'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.SemiMonthEnd(2, day_of_month=16)
+ assert (result == expected)
+
+ freqstr = '2SMS-14'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.SemiMonthBegin(2, day_of_month=14)
+ assert (result == expected)
+
+ freqstr = '2SMS-15'
+ result = frequencies.to_offset(freqstr)
+ expected = offsets.SemiMonthBegin(2)
+ assert (result == expected)
+
+ # malformed
+ with pytest.raises(ValueError, match='Invalid frequency: 2h20m'):
+ frequencies.to_offset('2h20m')
+
+ def test_to_offset_negative(self):
+ freqstr = '-1S'
+ result = frequencies.to_offset(freqstr)
+ assert (result.n == -1)
+
+ freqstr = '-5min10s'
+ result = frequencies.to_offset(freqstr)
+ assert (result.n == -310)
+
+ freqstr = '-2SM'
+ result = frequencies.to_offset(freqstr)
+ assert (result.n == -2)
+
+ freqstr = '-1SMS'
+ result = frequencies.to_offset(freqstr)
+ assert (result.n == -1)
+
+ def test_to_offset_invalid(self):
+ # GH 13930
+ with pytest.raises(ValueError, match='Invalid frequency: U1'):
+ frequencies.to_offset('U1')
+ with pytest.raises(ValueError, match='Invalid frequency: -U'):
+ frequencies.to_offset('-U')
+ with pytest.raises(ValueError, match='Invalid frequency: 3U1'):
+ frequencies.to_offset('3U1')
+ with pytest.raises(ValueError, match='Invalid frequency: -2-3U'):
+ frequencies.to_offset('-2-3U')
+ with pytest.raises(ValueError, match='Invalid frequency: -2D:3H'):
+ frequencies.to_offset('-2D:3H')
+ with pytest.raises(ValueError, match='Invalid frequency: 1.5.0S'):
+ frequencies.to_offset('1.5.0S')
+
+ # split offsets with spaces are valid
+ assert frequencies.to_offset('2D 3H') == offsets.Hour(51)
+ assert frequencies.to_offset('2 D3 H') == offsets.Hour(51)
+ assert frequencies.to_offset('2 D 3 H') == offsets.Hour(51)
+ assert frequencies.to_offset(' 2 D 3 H ') == offsets.Hour(51)
+ assert frequencies.to_offset(' H ') == offsets.Hour()
+ assert frequencies.to_offset(' 3 H ') == offsets.Hour(3)
+
+ # special cases
+ assert frequencies.to_offset('2SMS-15') == offsets.SemiMonthBegin(2)
+ with pytest.raises(ValueError, match='Invalid frequency: 2SMS-15-15'):
+ frequencies.to_offset('2SMS-15-15')
+ with pytest.raises(ValueError, match='Invalid frequency: 2SMS-15D'):
+ frequencies.to_offset('2SMS-15D')
+
+ def test_to_offset_leading_zero(self):
+ freqstr = '00H 00T 01S'
+ result = frequencies.to_offset(freqstr)
+ assert (result.n == 1)
+
+ freqstr = '-00H 03T 14S'
+ result = frequencies.to_offset(freqstr)
+ assert (result.n == -194)
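+ # the leading sign applies to the summed total: 3 minutes 14 seconds
+ # is 3 * 60 + 14 == 194 seconds, hence n == -194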
+
+ def test_to_offset_leading_plus(self):
+ freqstr = '+1d'
+ result = frequencies.to_offset(freqstr)
+ assert (result.n == 1)
+
+ freqstr = '+2h30min'
+ result = frequencies.to_offset(freqstr)
+ assert (result.n == 150)
+
+ for bad_freq in ['+-1d', '-+1h', '+1', '-7', '+d', '-m']:
+ with pytest.raises(ValueError, match='Invalid frequency:'):
+ frequencies.to_offset(bad_freq)
+
+ def test_to_offset_pd_timedelta(self):
+ # Tests for #9064
+ td = Timedelta(days=1, seconds=1)
+ result = frequencies.to_offset(td)
+ expected = offsets.Second(86401)
+ assert (expected == result)
+
+ td = Timedelta(days=-1, seconds=1)
+ result = frequencies.to_offset(td)
+ expected = offsets.Second(-86399)
+ assert (expected == result)
+
+ td = Timedelta(hours=1, minutes=10)
+ result = frequencies.to_offset(td)
+ expected = offsets.Minute(70)
+ assert (expected == result)
+
+ td = Timedelta(hours=1, minutes=-10)
+ result = frequencies.to_offset(td)
+ expected = offsets.Minute(50)
+ assert (expected == result)
+
+ td = Timedelta(weeks=1)
+ result = frequencies.to_offset(td)
+ expected = offsets.Day(7)
+ assert (expected == result)
+
+ td1 = Timedelta(hours=1)
+ result1 = frequencies.to_offset(td1)
+ result2 = frequencies.to_offset('60min')
+ assert (result1 == result2)
+
+ td = Timedelta(microseconds=1)
+ result = frequencies.to_offset(td)
+ expected = offsets.Micro(1)
+ assert (expected == result)
+
+ td = Timedelta(microseconds=0)
+ pytest.raises(ValueError, lambda: frequencies.to_offset(td))
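+ # a Timedelta maps to the coarsest offset that represents it exactly
+ # (days=1, seconds=1 -> Second(86401)); a zero Timedelta has no
+ # well-defined frequency, hence the ValueError above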
+
+ def test_anchored_shortcuts(self):
+ result = frequencies.to_offset('W')
+ expected = frequencies.to_offset('W-SUN')
+ assert (result == expected)
+
+ result1 = frequencies.to_offset('Q')
+ result2 = frequencies.to_offset('Q-DEC')
+ expected = offsets.QuarterEnd(startingMonth=12)
+ assert (result1 == expected)
+ assert (result2 == expected)
+
+ result1 = frequencies.to_offset('Q-MAY')
+ expected = offsets.QuarterEnd(startingMonth=5)
+ assert (result1 == expected)
+
+ result1 = frequencies.to_offset('SM')
+ result2 = frequencies.to_offset('SM-15')
+ expected = offsets.SemiMonthEnd(day_of_month=15)
+ assert (result1 == expected)
+ assert (result2 == expected)
+
+ result = frequencies.to_offset('SM-1')
+ expected = offsets.SemiMonthEnd(day_of_month=1)
+ assert (result == expected)
+
+ result = frequencies.to_offset('SM-27')
+ expected = offsets.SemiMonthEnd(day_of_month=27)
+ assert (result == expected)
+
+ result = frequencies.to_offset('SMS-2')
+ expected = offsets.SemiMonthBegin(day_of_month=2)
+ assert (result == expected)
+
+ result = frequencies.to_offset('SMS-27')
+ expected = offsets.SemiMonthBegin(day_of_month=27)
+ assert (result == expected)
+
+ # ensure invalid cases fail as expected
+ invalid_anchors = ['SM-0', 'SM-28', 'SM-29',
+ 'SM-FOO', 'BSM', 'SM--1',
+ 'SMS-1', 'SMS-28', 'SMS-30',
+ 'SMS-BAR', 'SMS-BYR', 'BSMS',
+ 'SMS--2']
+ for invalid_anchor in invalid_anchors:
+ with pytest.raises(ValueError, match='Invalid frequency: '):
+ frequencies.to_offset(invalid_anchor)
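+ # taken together, the valid and invalid anchors above pin down the
+ # supported day_of_month ranges: 1-27 for SM (SemiMonthEnd) and
+ # 2-27 for SMS (SemiMonthBegin)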
+
+
+def test_ms_vs_MS():
+ left = frequencies.get_offset('ms')
+ right = frequencies.get_offset('MS')
+ assert left == offsets.Milli()
+ assert right == offsets.MonthBegin()
+
+
+def test_rule_aliases():
+ rule = frequencies.to_offset('10us')
+ assert rule == offsets.Micro(10)
+
+
+class TestFrequencyCode(object):
+
+ def test_freq_code(self):
+ assert get_freq('A') == 1000
+ assert get_freq('3A') == 1000
+ assert get_freq('-1A') == 1000
+
+ assert get_freq('Y') == 1000
+ assert get_freq('3Y') == 1000
+ assert get_freq('-1Y') == 1000
+
+ assert get_freq('W') == 4000
+ assert get_freq('W-MON') == 4001
+ assert get_freq('W-FRI') == 4005
+
+ for freqstr, code in compat.iteritems(_period_code_map):
+ result = get_freq(freqstr)
+ assert result == code
+
+ result = resolution.get_freq_group(freqstr)
+ assert result == code // 1000 * 1000
+
+ result = resolution.get_freq_group(code)
+ assert result == code // 1000 * 1000
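+ # period codes come in blocks of 1000, so ``code // 1000 * 1000``
+ # recovers the frequency group; anchored variants occupy offsets
+ # within the block (W == 4000, W-MON == 4001, W-FRI == 4005)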
+
+ def test_freq_group(self):
+ assert resolution.get_freq_group('A') == 1000
+ assert resolution.get_freq_group('3A') == 1000
+ assert resolution.get_freq_group('-1A') == 1000
+ assert resolution.get_freq_group('A-JAN') == 1000
+ assert resolution.get_freq_group('A-MAY') == 1000
+
+ assert resolution.get_freq_group('Y') == 1000
+ assert resolution.get_freq_group('3Y') == 1000
+ assert resolution.get_freq_group('-1Y') == 1000
+ assert resolution.get_freq_group('Y-JAN') == 1000
+ assert resolution.get_freq_group('Y-MAY') == 1000
+
+ assert resolution.get_freq_group(offsets.YearEnd()) == 1000
+ assert resolution.get_freq_group(offsets.YearEnd(month=1)) == 1000
+ assert resolution.get_freq_group(offsets.YearEnd(month=5)) == 1000
+
+ assert resolution.get_freq_group('W') == 4000
+ assert resolution.get_freq_group('W-MON') == 4000
+ assert resolution.get_freq_group('W-FRI') == 4000
+ assert resolution.get_freq_group(offsets.Week()) == 4000
+ assert resolution.get_freq_group(offsets.Week(weekday=1)) == 4000
+ assert resolution.get_freq_group(offsets.Week(weekday=5)) == 4000
+
+ def test_get_to_timestamp_base(self):
+ tsb = libfrequencies.get_to_timestamp_base
+
+ assert (tsb(get_freq_code('D')[0]) ==
+ get_freq_code('D')[0])
+ assert (tsb(get_freq_code('W')[0]) ==
+ get_freq_code('D')[0])
+ assert (tsb(get_freq_code('M')[0]) ==
+ get_freq_code('D')[0])
+
+ assert (tsb(get_freq_code('S')[0]) ==
+ get_freq_code('S')[0])
+ assert (tsb(get_freq_code('T')[0]) ==
+ get_freq_code('S')[0])
+ assert (tsb(get_freq_code('H')[0]) ==
+ get_freq_code('S')[0])
+
+ def test_freq_to_reso(self):
+ Reso = resolution.Resolution
+
+ assert Reso.get_str_from_freq('A') == 'year'
+ assert Reso.get_str_from_freq('Q') == 'quarter'
+ assert Reso.get_str_from_freq('M') == 'month'
+ assert Reso.get_str_from_freq('D') == 'day'
+ assert Reso.get_str_from_freq('H') == 'hour'
+ assert Reso.get_str_from_freq('T') == 'minute'
+ assert Reso.get_str_from_freq('S') == 'second'
+ assert Reso.get_str_from_freq('L') == 'millisecond'
+ assert Reso.get_str_from_freq('U') == 'microsecond'
+ assert Reso.get_str_from_freq('N') == 'nanosecond'
+
+ for freq in ['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U', 'N']:
+ # check roundtrip
+ result = Reso.get_freq(Reso.get_str_from_freq(freq))
+ assert freq == result
+
+ for freq in ['D', 'H', 'T', 'S', 'L', 'U']:
+ result = Reso.get_freq(Reso.get_str(Reso.get_reso_from_freq(freq)))
+ assert freq == result
+
+ def test_resolution_bumping(self):
+ # see gh-14378
+ Reso = resolution.Resolution
+
+ assert Reso.get_stride_from_decimal(1.5, 'T') == (90, 'S')
+ assert Reso.get_stride_from_decimal(62.4, 'T') == (3744, 'S')
+ assert Reso.get_stride_from_decimal(1.04, 'H') == (3744, 'S')
+ assert Reso.get_stride_from_decimal(1, 'D') == (1, 'D')
+ assert (Reso.get_stride_from_decimal(0.342931, 'H') ==
+ (1234551600, 'U'))
+ assert Reso.get_stride_from_decimal(1.2345, 'D') == (106660800, 'L')
+
+ with pytest.raises(ValueError):
+ Reso.get_stride_from_decimal(0.5, 'N')
+
+ # too much precision in the input can prevent conversion to an
+ # integer stride at any resolution
+ with pytest.raises(ValueError):
+ Reso.get_stride_from_decimal(0.3429324798798269273987982, 'H')
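+ # fractional strides are bumped to successively finer resolutions
+ # until the stride is integral (1.5 'T' -> (90, 'S')); 0.5 'N' fails
+ # because nothing is finer than nanoseconds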
+
+ def test_get_freq_code(self):
+ # frequency str
+ assert (get_freq_code('A') ==
+ (get_freq('A'), 1))
+ assert (get_freq_code('3D') ==
+ (get_freq('D'), 3))
+ assert (get_freq_code('-2M') ==
+ (get_freq('M'), -2))
+
+ # tuple
+ assert (get_freq_code(('D', 1)) ==
+ (get_freq('D'), 1))
+ assert (get_freq_code(('A', 3)) ==
+ (get_freq('A'), 3))
+ assert (get_freq_code(('M', -2)) ==
+ (get_freq('M'), -2))
+
+ # numeric tuple
+ assert get_freq_code((1000, 1)) == (1000, 1)
+
+ # offsets
+ assert (get_freq_code(offsets.Day()) ==
+ (get_freq('D'), 1))
+ assert (get_freq_code(offsets.Day(3)) ==
+ (get_freq('D'), 3))
+ assert (get_freq_code(offsets.Day(-2)) ==
+ (get_freq('D'), -2))
+
+ assert (get_freq_code(offsets.MonthEnd()) ==
+ (get_freq('M'), 1))
+ assert (get_freq_code(offsets.MonthEnd(3)) ==
+ (get_freq('M'), 3))
+ assert (get_freq_code(offsets.MonthEnd(-2)) ==
+ (get_freq('M'), -2))
+
+ assert (get_freq_code(offsets.Week()) ==
+ (get_freq('W'), 1))
+ assert (get_freq_code(offsets.Week(3)) ==
+ (get_freq('W'), 3))
+ assert (get_freq_code(offsets.Week(-2)) ==
+ (get_freq('W'), -2))
+
+ # Monday is weekday=0
+ assert (get_freq_code(offsets.Week(weekday=1)) ==
+ (get_freq('W-TUE'), 1))
+ assert (get_freq_code(offsets.Week(3, weekday=0)) ==
+ (get_freq('W-MON'), 3))
+ assert (get_freq_code(offsets.Week(-2, weekday=4)) ==
+ (get_freq('W-FRI'), -2))
+
+ def test_frequency_misc(self):
+ assert (resolution.get_freq_group('T') ==
+ FreqGroup.FR_MIN)
+
+ code, stride = get_freq_code(offsets.Hour())
+ assert code == FreqGroup.FR_HR
+
+ code, stride = get_freq_code((5, 'T'))
+ assert code == FreqGroup.FR_MIN
+ assert stride == 5
+
+ offset = offsets.Hour()
+ result = frequencies.to_offset(offset)
+ assert result == offset
+
+ result = frequencies.to_offset((5, 'T'))
+ expected = offsets.Minute(5)
+ assert result == expected
+
+ with pytest.raises(ValueError, match='Invalid frequency'):
+ get_freq_code((5, 'baz'))
+
+ with pytest.raises(ValueError, match='Invalid frequency'):
+ frequencies.to_offset('100foo')
+
+ with pytest.raises(ValueError, match='Could not evaluate'):
+ frequencies.to_offset(('', ''))
+
+
+_dti = DatetimeIndex
+
+
+class TestFrequencyInference(object):
+
+ def test_raise_if_period_index(self):
+ index = period_range(start="1/1/1990", periods=20, freq="M")
+ pytest.raises(TypeError, frequencies.infer_freq, index)
+
+ def test_raise_if_too_few(self):
+ index = _dti(['12/31/1998', '1/3/1999'])
+ pytest.raises(ValueError, frequencies.infer_freq, index)
+
+ def test_business_daily(self):
+ index = _dti(['01/01/1999', '1/4/1999', '1/5/1999'])
+ assert frequencies.infer_freq(index) == 'B'
+
+ def test_business_daily_look_alike(self):
+ # GH 16624, do not infer 'B' when the 'weekend' (2-day gap) is in
+ # the wrong place
+ index = _dti(['12/31/1998', '1/3/1999', '1/4/1999'])
+ assert frequencies.infer_freq(index) is None
+
+ def test_day(self):
+ self._check_tick(timedelta(1), 'D')
+
+ def test_day_corner(self):
+ index = _dti(['1/1/2000', '1/2/2000', '1/3/2000'])
+ assert frequencies.infer_freq(index) == 'D'
+
+ def test_non_datetimeindex(self):
+ dates = to_datetime(['1/1/2000', '1/2/2000', '1/3/2000'])
+ assert frequencies.infer_freq(dates) == 'D'
+
+ def test_hour(self):
+ self._check_tick(timedelta(hours=1), 'H')
+
+ def test_minute(self):
+ self._check_tick(timedelta(minutes=1), 'T')
+
+ def test_second(self):
+ self._check_tick(timedelta(seconds=1), 'S')
+
+ def test_millisecond(self):
+ self._check_tick(timedelta(microseconds=1000), 'L')
+
+ def test_microsecond(self):
+ self._check_tick(timedelta(microseconds=1), 'U')
+
+ def test_nanosecond(self):
+ self._check_tick(np.timedelta64(1, 'ns'), 'N')
+
+ def _check_tick(self, base_delta, code):
+ b = Timestamp(datetime.now())
+ for i in range(1, 5):
+ inc = base_delta * i
+ index = _dti([b + inc * j for j in range(3)])
+ if i > 1:
+ exp_freq = '%d%s' % (i, code)
+ else:
+ exp_freq = code
+ assert frequencies.infer_freq(index) == exp_freq
+
+ index = _dti([b + base_delta * 7] + [b + base_delta * j for j in range(3)])
+ assert frequencies.infer_freq(index) is None
+
+ index = _dti([b + base_delta * j for j in range(3)] + [b + base_delta * 7])
+
+ assert frequencies.infer_freq(index) is None
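+ # inference requires a single uniform delta: the stride prefix is
+ # emitted only for i > 1 ('2H' vs 'H'), and one outlier element
+ # (base_delta * 7) is enough for infer_freq to return None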
+
+ def test_weekly(self):
+ days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']
+
+ for day in days:
+ self._check_generated_range('1/1/2000', 'W-%s' % day)
+
+ def test_week_of_month(self):
+ days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']
+
+ for day in days:
+ for i in range(1, 5):
+ self._check_generated_range('1/1/2000', 'WOM-%d%s' % (i, day))
+
+ def test_fifth_week_of_month(self):
+ # Only supports freq up to WOM-4. See #9425
+ func = lambda: date_range('2014-01-01', freq='WOM-5MON')
+ pytest.raises(ValueError, func)
+
+ def test_fifth_week_of_month_infer(self):
+ # Only attempts to infer up to WOM-4. See #9425
+ index = DatetimeIndex(["2014-03-31", "2014-06-30", "2015-03-30"])
+ assert frequencies.infer_freq(index) is None
+
+ def test_week_of_month_fake(self):
+ # All of these dates are on the same day of the week, 4 or 5 weeks apart
+ index = DatetimeIndex(["2013-08-27", "2013-10-01", "2013-10-29",
+ "2013-11-26"])
+ assert frequencies.infer_freq(index) != 'WOM-4TUE'
+
+ def test_monthly(self):
+ self._check_generated_range('1/1/2000', 'M')
+
+ def test_monthly_ambiguous(self):
+ rng = _dti(['1/31/2000', '2/29/2000', '3/31/2000'])
+ assert rng.inferred_freq == 'M'
+
+ def test_business_monthly(self):
+ self._check_generated_range('1/1/2000', 'BM')
+
+ def test_business_start_monthly(self):
+ self._check_generated_range('1/1/2000', 'BMS')
+
+ def test_quarterly(self):
+ for month in ['JAN', 'FEB', 'MAR']:
+ self._check_generated_range('1/1/2000', 'Q-%s' % month)
+
+ def test_annual(self):
+ for month in MONTHS:
+ self._check_generated_range('1/1/2000', 'A-%s' % month)
+
+ def test_business_annual(self):
+ for month in MONTHS:
+ self._check_generated_range('1/1/2000', 'BA-%s' % month)
+
+ def test_annual_ambiguous(self):
+ rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002'])
+ assert rng.inferred_freq == 'A-JAN'
+
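+ # Quarterly aliases whose anchor months are three apart (e.g. Q-DEC,
+ # Q-SEP, Q-JUN and Q-MAR) generate identical quarter-end dates, so
+ # inference can only recover the canonical member of each group; the
+ # helper below therefore accepts any alias from the matching group.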
+ def _check_generated_range(self, start, freq):
+ freq = freq.upper()
+
+ gen = date_range(start, periods=7, freq=freq)
+ index = _dti(gen.values)
+ if not freq.startswith('Q-'):
+ assert frequencies.infer_freq(index) == gen.freqstr
+ else:
+ inf_freq = frequencies.infer_freq(index)
+ is_dec_range = inf_freq == 'Q-DEC' and gen.freqstr in (
+ 'Q', 'Q-DEC', 'Q-SEP', 'Q-JUN', 'Q-MAR')
+ is_nov_range = inf_freq == 'Q-NOV' and gen.freqstr in (
+ 'Q-NOV', 'Q-AUG', 'Q-MAY', 'Q-FEB')
+ is_oct_range = inf_freq == 'Q-OCT' and gen.freqstr in (
+ 'Q-OCT', 'Q-JUL', 'Q-APR', 'Q-JAN')
+ assert is_dec_range or is_nov_range or is_oct_range
+
+ gen = date_range(start, periods=5, freq=freq)
+ index = _dti(gen.values)
+
+ if not freq.startswith('Q-'):
+ assert frequencies.infer_freq(index) == gen.freqstr
+ else:
+ inf_freq = frequencies.infer_freq(index)
+ is_dec_range = inf_freq == 'Q-DEC' and gen.freqstr in (
+ 'Q', 'Q-DEC', 'Q-SEP', 'Q-JUN', 'Q-MAR')
+ is_nov_range = inf_freq == 'Q-NOV' and gen.freqstr in (
+ 'Q-NOV', 'Q-AUG', 'Q-MAY', 'Q-FEB')
+ is_oct_range = inf_freq == 'Q-OCT' and gen.freqstr in (
+ 'Q-OCT', 'Q-JUL', 'Q-APR', 'Q-JAN')
+
+ assert is_dec_range or is_nov_range or is_oct_range
+
+ def test_infer_freq(self):
+ rng = period_range('1959Q2', '2009Q3', freq='Q')
+ rng = Index(rng.to_timestamp('D', how='e').astype(object))
+ assert rng.inferred_freq == 'Q-DEC'
+
+ rng = period_range('1959Q2', '2009Q3', freq='Q-NOV')
+ rng = Index(rng.to_timestamp('D', how='e').astype(object))
+ assert rng.inferred_freq == 'Q-NOV'
+
+ rng = period_range('1959Q2', '2009Q3', freq='Q-OCT')
+ rng = Index(rng.to_timestamp('D', how='e').astype(object))
+ assert rng.inferred_freq == 'Q-OCT'
+
+ def test_infer_freq_tz(self):
+
+ freqs = {'AS-JAN':
+ ['2009-01-01', '2010-01-01', '2011-01-01', '2012-01-01'],
+ 'Q-OCT':
+ ['2009-01-31', '2009-04-30', '2009-07-31', '2009-10-31'],
+ 'M': ['2010-11-30', '2010-12-31', '2011-01-31', '2011-02-28'],
+ 'W-SAT':
+ ['2010-12-25', '2011-01-01', '2011-01-08', '2011-01-15'],
+ 'D': ['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04'],
+ 'H': ['2011-12-31 22:00', '2011-12-31 23:00',
+ '2012-01-01 00:00', '2012-01-01 01:00']}
+
+ # GH 7310
+ for tz in [None, 'Australia/Sydney', 'Asia/Tokyo', 'Europe/Paris',
+ 'US/Pacific', 'US/Eastern']:
+ for expected, dates in compat.iteritems(freqs):
+ idx = DatetimeIndex(dates, tz=tz)
+ assert idx.inferred_freq == expected
+
+ def test_infer_freq_tz_transition(self):
+ # Tests for #8772
+ date_pairs = [['2013-11-02', '2013-11-5'], # Fall DST
+ ['2014-03-08', '2014-03-11'], # Spring DST
+ ['2014-01-01', '2014-01-03']] # Regular Time
+ freqs = ['3H', '10T', '3601S', '3600001L', '3600000001U',
+ '3600000000001N']
+
+ for tz in [None, 'Australia/Sydney', 'Asia/Tokyo', 'Europe/Paris',
+ 'US/Pacific', 'US/Eastern']:
+ for date_pair in date_pairs:
+ for freq in freqs:
+ idx = date_range(date_pair[0], date_pair[1],
+ freq=freq, tz=tz)
+ assert idx.inferred_freq == freq
+
+ index = date_range("2013-11-03", periods=5,
+ freq="3H").tz_localize("America/Chicago")
+ assert index.inferred_freq is None
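+ # localizing after generation keeps the 3-hour wall-clock spacing,
+ # which is irregular in absolute time across the Nov 3, 2013 DST
+ # fall-back, so no frequency can be inferred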
+
+ def test_infer_freq_businesshour(self):
+ # GH 7905
+ idx = DatetimeIndex(
+ ['2014-07-01 09:00', '2014-07-01 10:00', '2014-07-01 11:00',
+ '2014-07-01 12:00', '2014-07-01 13:00', '2014-07-01 14:00'])
+ # hourly freq in a day must result in 'H'
+ assert idx.inferred_freq == 'H'
+
+ idx = DatetimeIndex(
+ ['2014-07-01 09:00', '2014-07-01 10:00', '2014-07-01 11:00',
+ '2014-07-01 12:00', '2014-07-01 13:00', '2014-07-01 14:00',
+ '2014-07-01 15:00', '2014-07-01 16:00', '2014-07-02 09:00',
+ '2014-07-02 10:00', '2014-07-02 11:00'])
+ assert idx.inferred_freq == 'BH'
+
+ idx = DatetimeIndex(
+ ['2014-07-04 09:00', '2014-07-04 10:00', '2014-07-04 11:00',
+ '2014-07-04 12:00', '2014-07-04 13:00', '2014-07-04 14:00',
+ '2014-07-04 15:00', '2014-07-04 16:00', '2014-07-07 09:00',
+ '2014-07-07 10:00', '2014-07-07 11:00'])
+ assert idx.inferred_freq == 'BH'
+
+ idx = DatetimeIndex(
+ ['2014-07-04 09:00', '2014-07-04 10:00', '2014-07-04 11:00',
+ '2014-07-04 12:00', '2014-07-04 13:00', '2014-07-04 14:00',
+ '2014-07-04 15:00', '2014-07-04 16:00', '2014-07-07 09:00',
+ '2014-07-07 10:00', '2014-07-07 11:00', '2014-07-07 12:00',
+ '2014-07-07 13:00', '2014-07-07 14:00', '2014-07-07 15:00',
+ '2014-07-07 16:00', '2014-07-08 09:00', '2014-07-08 10:00',
+ '2014-07-08 11:00', '2014-07-08 12:00', '2014-07-08 13:00',
+ '2014-07-08 14:00', '2014-07-08 15:00', '2014-07-08 16:00'])
+ assert idx.inferred_freq == 'BH'
+
+ def test_not_monotonic(self):
+ rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002'])
+ rng = rng[::-1]
+ assert rng.inferred_freq == '-1A-JAN'
+
+ def test_non_datetimeindex2(self):
+ rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002'])
+
+ vals = rng.to_pydatetime()
+
+ result = frequencies.infer_freq(vals)
+ assert result == rng.inferred_freq
+
+ def test_invalid_index_types(self):
+
+ # test all index types
+ for i in [tm.makeIntIndex(10), tm.makeFloatIndex(10),
+ tm.makePeriodIndex(10)]:
+ pytest.raises(TypeError, lambda: frequencies.infer_freq(i))
+
+ # GH 10822
+ # odd error message on conversions to datetime for unicode
+ if not is_platform_windows():
+ for i in [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)]:
+ pytest.raises(ValueError, lambda: frequencies.infer_freq(i))
+
+ def test_string_datetimelike_compat(self):
+
+ # GH 6463
+ expected = frequencies.infer_freq(['2004-01', '2004-02', '2004-03',
+ '2004-04'])
+ result = frequencies.infer_freq(Index(['2004-01', '2004-02', '2004-03',
+ '2004-04']))
+ assert result == expected
+
+ def test_series(self):
+
+ # GH6407
+ # inferring series
+
+ # invalid type of Series
+ for s in [Series(np.arange(10)), Series(np.arange(10.))]:
+ pytest.raises(TypeError, lambda: frequencies.infer_freq(s))
+
+ # a non-convertible string
+ pytest.raises(ValueError, lambda: frequencies.infer_freq(
+ Series(['foo', 'bar'])))
+
+ # cannot infer on PeriodIndex
+ for freq in [None, 'L']:
+ s = Series(period_range('2013', periods=10, freq=freq))
+ pytest.raises(TypeError, lambda: frequencies.infer_freq(s))
+
+ # DateTimeIndex
+ for freq in ['M', 'L', 'S']:
+ s = Series(date_range('20130101', periods=10, freq=freq))
+ inferred = frequencies.infer_freq(s)
+ assert inferred == freq
+
+ s = Series(date_range('20130101', '20130110'))
+ inferred = frequencies.infer_freq(s)
+ assert inferred == 'D'
+
+ def test_legacy_offset_warnings(self):
+ freqs = ['WEEKDAY', 'EOM', 'W@MON', 'W@TUE', 'W@WED', 'W@THU',
+ 'W@FRI', 'W@SAT', 'W@SUN', 'Q@JAN', 'Q@FEB', 'Q@MAR',
+ 'A@JAN', 'A@FEB', 'A@MAR', 'A@APR', 'A@MAY', 'A@JUN',
+ 'A@JUL', 'A@AUG', 'A@SEP', 'A@OCT', 'A@NOV', 'A@DEC',
+ 'Y@JAN', 'WOM@1MON', 'WOM@2MON', 'WOM@3MON',
+ 'WOM@4MON', 'WOM@1TUE', 'WOM@2TUE', 'WOM@3TUE',
+ 'WOM@4TUE', 'WOM@1WED', 'WOM@2WED', 'WOM@3WED',
+ 'WOM@4WED', 'WOM@1THU', 'WOM@2THU', 'WOM@3THU',
+ 'WOM@4THU', 'WOM@1FRI', 'WOM@2FRI', 'WOM@3FRI',
+ 'WOM@4FRI']
+
+ msg = INVALID_FREQ_ERR_MSG
+ for freq in freqs:
+ with pytest.raises(ValueError, match=msg):
+ frequencies.get_offset(freq)
+
+ with pytest.raises(ValueError, match=msg):
+ date_range('2011-01-01', periods=5, freq=freq)
diff --git a/contrib/python/pandas/py2/pandas/tests/tseries/test_holiday.py b/contrib/python/pandas/py2/pandas/tests/tseries/test_holiday.py
new file mode 100644
index 00000000000..86f154ed1ac
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tseries/test_holiday.py
@@ -0,0 +1,382 @@
+from datetime import datetime
+
+import pytest
+from pytz import utc
+
+from pandas import DatetimeIndex, compat
+import pandas.util.testing as tm
+
+from pandas.tseries.holiday import (
+ MO, SA, AbstractHolidayCalendar, DateOffset, EasterMonday, GoodFriday,
+ Holiday, HolidayCalendarFactory, Timestamp, USColumbusDay,
+ USFederalHolidayCalendar, USLaborDay, USMartinLutherKingJr, USMemorialDay,
+ USPresidentsDay, USThanksgivingDay, after_nearest_workday,
+ before_nearest_workday, get_calendar, nearest_workday, next_monday,
+ next_monday_or_tuesday, next_workday, previous_friday, previous_workday,
+ sunday_to_monday, weekend_to_monday)
+
+
+class TestCalendar(object):
+
+ def setup_method(self, method):
+ self.holiday_list = [
+ datetime(2012, 1, 2),
+ datetime(2012, 1, 16),
+ datetime(2012, 2, 20),
+ datetime(2012, 5, 28),
+ datetime(2012, 7, 4),
+ datetime(2012, 9, 3),
+ datetime(2012, 10, 8),
+ datetime(2012, 11, 12),
+ datetime(2012, 11, 22),
+ datetime(2012, 12, 25)]
+
+ self.start_date = datetime(2012, 1, 1)
+ self.end_date = datetime(2012, 12, 31)
+
+ def test_calendar(self):
+
+ calendar = USFederalHolidayCalendar()
+ holidays = calendar.holidays(self.start_date, self.end_date)
+
+ holidays_1 = calendar.holidays(
+ self.start_date.strftime('%Y-%m-%d'),
+ self.end_date.strftime('%Y-%m-%d'))
+ holidays_2 = calendar.holidays(
+ Timestamp(self.start_date),
+ Timestamp(self.end_date))
+
+ assert list(holidays.to_pydatetime()) == self.holiday_list
+ assert list(holidays_1.to_pydatetime()) == self.holiday_list
+ assert list(holidays_2.to_pydatetime()) == self.holiday_list
+
+ def test_calendar_caching(self):
+ # Test for issue #9552
+
+ class TestCalendar(AbstractHolidayCalendar):
+
+ def __init__(self, name=None, rules=None):
+ super(TestCalendar, self).__init__(name=name, rules=rules)
+
+ jan1 = TestCalendar(rules=[Holiday('jan1', year=2015, month=1, day=1)])
+ jan2 = TestCalendar(rules=[Holiday('jan2', year=2015, month=1, day=2)])
+
+ tm.assert_index_equal(jan1.holidays(), DatetimeIndex(['01-Jan-2015']))
+ tm.assert_index_equal(jan2.holidays(), DatetimeIndex(['02-Jan-2015']))
+
+ def test_calendar_observance_dates(self):
+ # Test for issue 11477
+ USFedCal = get_calendar('USFederalHolidayCalendar')
+ # same start and end dates
+ holidays0 = USFedCal.holidays(datetime(2015, 7, 3), datetime(2015, 7, 3))
+ # different start and end dates
+ holidays1 = USFedCal.holidays(datetime(2015, 7, 3), datetime(2015, 7, 6))
+ # same start and end dates
+ holidays2 = USFedCal.holidays(datetime(2015, 7, 3), datetime(2015, 7, 3))
+
+ tm.assert_index_equal(holidays0, holidays1)
+ tm.assert_index_equal(holidays0, holidays2)
+
+ def test_rule_from_name(self):
+ USFedCal = get_calendar('USFederalHolidayCalendar')
+ assert USFedCal.rule_from_name('Thanksgiving') == USThanksgivingDay
+
+
+class TestHoliday(object):
+
+ def setup_method(self, method):
+ self.start_date = datetime(2011, 1, 1)
+ self.end_date = datetime(2020, 12, 31)
+
+ def check_results(self, holiday, start, end, expected):
+ assert list(holiday.dates(start, end)) == expected
+
+ # Verify that timezone info is preserved.
+ assert (list(holiday.dates(utc.localize(Timestamp(start)),
+ utc.localize(Timestamp(end)))) ==
+ [utc.localize(dt) for dt in expected])
+
+ def test_usmemorialday(self):
+ self.check_results(holiday=USMemorialDay,
+ start=self.start_date,
+ end=self.end_date,
+ expected=[
+ datetime(2011, 5, 30),
+ datetime(2012, 5, 28),
+ datetime(2013, 5, 27),
+ datetime(2014, 5, 26),
+ datetime(2015, 5, 25),
+ datetime(2016, 5, 30),
+ datetime(2017, 5, 29),
+ datetime(2018, 5, 28),
+ datetime(2019, 5, 27),
+ datetime(2020, 5, 25),
+ ], )
+
+ def test_non_observed_holiday(self):
+
+ self.check_results(
+ Holiday('July 4th Eve', month=7, day=3),
+ start="2001-01-01",
+ end="2003-03-03",
+ expected=[
+ Timestamp('2001-07-03 00:00:00'),
+ Timestamp('2002-07-03 00:00:00')
+ ]
+ )
+
+ self.check_results(
+ Holiday('July 4th Eve', month=7, day=3, days_of_week=(0, 1, 2, 3)),
+ start="2001-01-01",
+ end="2008-03-03",
+ expected=[
+ Timestamp('2001-07-03 00:00:00'),
+ Timestamp('2002-07-03 00:00:00'),
+ Timestamp('2003-07-03 00:00:00'),
+ Timestamp('2006-07-03 00:00:00'),
+ Timestamp('2007-07-03 00:00:00'),
+ ]
+ )
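+ # days_of_week=(0, 1, 2, 3) keeps only Mon-Thu occurrences: July 3
+ # fell on a weekend in 2004 and 2005, which is why those years are
+ # missing from the expected dates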
+
+ def test_easter(self):
+
+ self.check_results(EasterMonday,
+ start=self.start_date,
+ end=self.end_date,
+ expected=[
+ Timestamp('2011-04-25 00:00:00'),
+ Timestamp('2012-04-09 00:00:00'),
+ Timestamp('2013-04-01 00:00:00'),
+ Timestamp('2014-04-21 00:00:00'),
+ Timestamp('2015-04-06 00:00:00'),
+ Timestamp('2016-03-28 00:00:00'),
+ Timestamp('2017-04-17 00:00:00'),
+ Timestamp('2018-04-02 00:00:00'),
+ Timestamp('2019-04-22 00:00:00'),
+ Timestamp('2020-04-13 00:00:00'),
+ ], )
+ self.check_results(GoodFriday,
+ start=self.start_date,
+ end=self.end_date,
+ expected=[
+ Timestamp('2011-04-22 00:00:00'),
+ Timestamp('2012-04-06 00:00:00'),
+ Timestamp('2013-03-29 00:00:00'),
+ Timestamp('2014-04-18 00:00:00'),
+ Timestamp('2015-04-03 00:00:00'),
+ Timestamp('2016-03-25 00:00:00'),
+ Timestamp('2017-04-14 00:00:00'),
+ Timestamp('2018-03-30 00:00:00'),
+ Timestamp('2019-04-19 00:00:00'),
+ Timestamp('2020-04-10 00:00:00'),
+ ], )
+
+ def test_usthanksgivingday(self):
+
+ self.check_results(USThanksgivingDay,
+ start=self.start_date,
+ end=self.end_date,
+ expected=[
+ datetime(2011, 11, 24),
+ datetime(2012, 11, 22),
+ datetime(2013, 11, 28),
+ datetime(2014, 11, 27),
+ datetime(2015, 11, 26),
+ datetime(2016, 11, 24),
+ datetime(2017, 11, 23),
+ datetime(2018, 11, 22),
+ datetime(2019, 11, 28),
+ datetime(2020, 11, 26),
+ ], )
+
+ def test_holidays_within_dates(self):
+ # Check the holiday behavior fixed in #11477, where holiday.dates
+ # returned dates outside the start/end range, or observance rules
+ # could not be applied because the holiday was not in the original
+ # date range (e.g., 7/4/2015 -> 7/3/2015).
+ start_date = datetime(2015, 7, 1)
+ end_date = datetime(2015, 7, 1)
+
+ calendar = get_calendar('USFederalHolidayCalendar')
+ new_years = calendar.rule_from_name('New Years Day')
+ july_4th = calendar.rule_from_name('July 4th')
+ veterans_day = calendar.rule_from_name('Veterans Day')
+ christmas = calendar.rule_from_name('Christmas')
+
+ # Holiday: (start/end date, holiday)
+ holidays = {USMemorialDay: ("2015-05-25", "2015-05-25"),
+ USLaborDay: ("2015-09-07", "2015-09-07"),
+ USColumbusDay: ("2015-10-12", "2015-10-12"),
+ USThanksgivingDay: ("2015-11-26", "2015-11-26"),
+ USMartinLutherKingJr: ("2015-01-19", "2015-01-19"),
+ USPresidentsDay: ("2015-02-16", "2015-02-16"),
+ GoodFriday: ("2015-04-03", "2015-04-03"),
+ EasterMonday: [("2015-04-06", "2015-04-06"),
+ ("2015-04-05", [])],
+ new_years: [("2015-01-01", "2015-01-01"),
+ ("2011-01-01", []),
+ ("2010-12-31", "2010-12-31")],
+ july_4th: [("2015-07-03", "2015-07-03"),
+ ("2015-07-04", [])],
+ veterans_day: [("2012-11-11", []),
+ ("2012-11-12", "2012-11-12")],
+ christmas: [("2011-12-25", []),
+ ("2011-12-26", "2011-12-26")]}
+
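+ # Each rule maps to one (start, expected) pair or a list of them;
+ # an empty expected list means the holiday should not occur in
+ # that one-day window.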
+ for rule, dates in compat.iteritems(holidays):
+ empty_dates = rule.dates(start_date, end_date)
+ assert empty_dates.tolist() == []
+
+ if isinstance(dates, tuple):
+ dates = [dates]
+
+ for start, expected in dates:
+ if len(expected):
+ expected = [Timestamp(expected)]
+ self.check_results(rule, start, start, expected)
+
+ def test_argument_types(self):
+ holidays = USThanksgivingDay.dates(self.start_date, self.end_date)
+
+ holidays_1 = USThanksgivingDay.dates(
+ self.start_date.strftime('%Y-%m-%d'),
+ self.end_date.strftime('%Y-%m-%d'))
+
+ holidays_2 = USThanksgivingDay.dates(
+ Timestamp(self.start_date),
+ Timestamp(self.end_date))
+
+ tm.assert_index_equal(holidays, holidays_1)
+ tm.assert_index_equal(holidays, holidays_2)
+
+ def test_special_holidays(self):
+ base_date = [datetime(2012, 5, 28)]
+ holiday_1 = Holiday('One-Time', year=2012, month=5, day=28)
+ holiday_2 = Holiday('Range', month=5, day=28,
+ start_date=datetime(2012, 1, 1),
+ end_date=datetime(2012, 12, 31),
+ offset=DateOffset(weekday=MO(1)))
+
+ assert base_date == holiday_1.dates(self.start_date, self.end_date)
+ assert base_date == holiday_2.dates(self.start_date, self.end_date)
+
+ def test_get_calendar(self):
+ class TestCalendar(AbstractHolidayCalendar):
+ rules = []
+
+ calendar = get_calendar('TestCalendar')
+ assert TestCalendar == calendar.__class__
+
+ def test_factory(self):
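+ # The factory attaches the given rules to a new calendar class;
+ # combining two such classes merges their rule sets.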
+ class_1 = HolidayCalendarFactory('MemorialDay',
+ AbstractHolidayCalendar,
+ USMemorialDay)
+ class_2 = HolidayCalendarFactory('Thanksgiving',
+ AbstractHolidayCalendar,
+ USThanksgivingDay)
+ class_3 = HolidayCalendarFactory('Combined', class_1, class_2)
+
+ assert len(class_1.rules) == 1
+ assert len(class_2.rules) == 1
+ assert len(class_3.rules) == 2
+
+
+class TestObservanceRules(object):
+
+ def setup_method(self, method):
+ self.we = datetime(2014, 4, 9)
+ self.th = datetime(2014, 4, 10)
+ self.fr = datetime(2014, 4, 11)
+ self.sa = datetime(2014, 4, 12)
+ self.su = datetime(2014, 4, 13)
+ self.mo = datetime(2014, 4, 14)
+ self.tu = datetime(2014, 4, 15)
+
+ def test_next_monday(self):
+ assert next_monday(self.sa) == self.mo
+ assert next_monday(self.su) == self.mo
+
+ def test_next_monday_or_tuesday(self):
+ assert next_monday_or_tuesday(self.sa) == self.mo
+ assert next_monday_or_tuesday(self.su) == self.tu
+ assert next_monday_or_tuesday(self.mo) == self.tu
+
+ def test_previous_friday(self):
+ assert previous_friday(self.sa) == self.fr
+ assert previous_friday(self.su) == self.fr
+
+ def test_sunday_to_monday(self):
+ assert sunday_to_monday(self.su) == self.mo
+
+ def test_nearest_workday(self):
+ assert nearest_workday(self.sa) == self.fr
+ assert nearest_workday(self.su) == self.mo
+ assert nearest_workday(self.mo) == self.mo
+
+ def test_weekend_to_monday(self):
+ assert weekend_to_monday(self.sa) == self.mo
+ assert weekend_to_monday(self.su) == self.mo
+ assert weekend_to_monday(self.mo) == self.mo
+
+ def test_next_workday(self):
+ assert next_workday(self.sa) == self.mo
+ assert next_workday(self.su) == self.mo
+ assert next_workday(self.mo) == self.tu
+
+ def test_previous_workday(self):
+ assert previous_workday(self.sa) == self.fr
+ assert previous_workday(self.su) == self.fr
+ assert previous_workday(self.tu) == self.mo
+
+ def test_before_nearest_workday(self):
+ assert before_nearest_workday(self.sa) == self.th
+ assert before_nearest_workday(self.su) == self.fr
+ assert before_nearest_workday(self.tu) == self.mo
+
+ def test_after_nearest_workday(self):
+ assert after_nearest_workday(self.sa) == self.mo
+ assert after_nearest_workday(self.su) == self.tu
+ assert after_nearest_workday(self.fr) == self.mo
+
+
+class TestFederalHolidayCalendar(object):
+
+ def test_no_mlk_before_1986(self):
+ # see gh-10278
+ class MLKCalendar(AbstractHolidayCalendar):
+ rules = [USMartinLutherKingJr]
+
+ holidays = MLKCalendar().holidays(start='1984',
+ end='1988').to_pydatetime().tolist()
+
+ # Testing to make sure holiday is not incorrectly observed before 1986
+ assert holidays == [datetime(1986, 1, 20, 0, 0),
+ datetime(1987, 1, 19, 0, 0)]
+
+ def test_memorial_day(self):
+ class MemorialDay(AbstractHolidayCalendar):
+ rules = [USMemorialDay]
+
+ holidays = MemorialDay().holidays(start='1971',
+ end='1980').to_pydatetime().tolist()
+
+ # Fixes the 5/31 error; dates checked manually against Wikipedia
+ assert holidays == [datetime(1971, 5, 31, 0, 0),
+ datetime(1972, 5, 29, 0, 0),
+ datetime(1973, 5, 28, 0, 0),
+ datetime(1974, 5, 27, 0, 0),
+ datetime(1975, 5, 26, 0, 0),
+ datetime(1976, 5, 31, 0, 0),
+ datetime(1977, 5, 30, 0, 0),
+ datetime(1978, 5, 29, 0, 0),
+ datetime(1979, 5, 28, 0, 0)]
+
+
+class TestHolidayConflictingArguments(object):
+
+ def test_both_offset_observance_raises(self):
+ # see gh-10217
+ with pytest.raises(NotImplementedError):
+ Holiday("Cyber Monday", month=11, day=1,
+ offset=[DateOffset(weekday=SA(4))],
+ observance=next_monday)
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/__init__.py b/contrib/python/pandas/py2/pandas/tests/tslibs/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_api.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_api.py
new file mode 100644
index 00000000000..de937d1a4c5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_api.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+"""Tests that the tslibs API is locked down"""
+
+from pandas._libs import tslibs
+
+
+def test_namespace():
+
+ submodules = ['ccalendar',
+ 'conversion',
+ 'fields',
+ 'frequencies',
+ 'nattype',
+ 'np_datetime',
+ 'offsets',
+ 'parsing',
+ 'period',
+ 'resolution',
+ 'strptime',
+ 'timedeltas',
+ 'timestamps',
+ 'timezones']
+
+ api = ['NaT',
+ 'iNaT',
+ 'is_null_datetimelike',
+ 'OutOfBoundsDatetime',
+ 'Period',
+ 'IncompatibleFrequency',
+ 'Timedelta',
+ 'Timestamp',
+ 'delta_to_nanoseconds',
+ 'ints_to_pytimedelta',
+ 'localize_pydatetime',
+ 'normalize_date',
+ 'tz_convert_single']
+
+ expected = set(submodules + api)
+ names = [x for x in dir(tslibs) if not x.startswith('__')]
+ assert set(names) == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_array_to_datetime.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_array_to_datetime.py
new file mode 100644
index 00000000000..f5b036dde20
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_array_to_datetime.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+from datetime import date, datetime
+
+from dateutil.tz.tz import tzoffset
+import numpy as np
+import pytest
+import pytz
+
+from pandas._libs import iNaT, tslib
+from pandas.compat.numpy import np_array_datetime64_compat
+
+import pandas.util.testing as tm
+
+
[email protected]("data,expected", [
+ (["01-01-2013", "01-02-2013"],
+ ["2013-01-01T00:00:00.000000000-0000",
+ "2013-01-02T00:00:00.000000000-0000"]),
+ (["Mon Sep 16 2013", "Tue Sep 17 2013"],
+ ["2013-09-16T00:00:00.000000000-0000",
+ "2013-09-17T00:00:00.000000000-0000"])
+])
+def test_parsing_valid_dates(data, expected):
+ arr = np.array(data, dtype=object)
+ result, _ = tslib.array_to_datetime(arr)
+
+ expected = np_array_datetime64_compat(expected, dtype="M8[ns]")
+ tm.assert_numpy_array_equal(result, expected)
+
+
[email protected]("dt_string, expected_tz", [
+ ["01-01-2013 08:00:00+08:00", 480],
+ ["2013-01-01T08:00:00.000000000+0800", 480],
+ ["2012-12-31T16:00:00.000000000-0800", -480],
+ ["12-31-2012 23:00:00-01:00", -60]
+])
+def test_parsing_timezone_offsets(dt_string, expected_tz):
+ # All of these datetime strings with offsets are equivalent
+ # to the same datetime after the timezone offset is added.
+ arr = np.array(["01-01-2013 00:00:00"], dtype=object)
+ expected, _ = tslib.array_to_datetime(arr)
+
+ arr = np.array([dt_string], dtype=object)
+ result, result_tz = tslib.array_to_datetime(arr)
+
+ tm.assert_numpy_array_equal(result, expected)
+ assert result_tz is pytz.FixedOffset(expected_tz)
+
+
+def test_parsing_non_iso_timezone_offset():
+ dt_string = "01-01-2013T00:00:00.000000000+0000"
+ arr = np.array([dt_string], dtype=object)
+
+ result, result_tz = tslib.array_to_datetime(arr)
+ expected = np.array([np.datetime64("2013-01-01 00:00:00.000000000")])
+
+ tm.assert_numpy_array_equal(result, expected)
+ assert result_tz is pytz.FixedOffset(0)
+
+
+def test_parsing_different_timezone_offsets():
+ # see gh-17697
+ data = ["2015-11-18 15:30:00+05:30", "2015-11-18 15:30:00+06:30"]
+ data = np.array(data, dtype=object)
+
+ result, result_tz = tslib.array_to_datetime(data)
+ expected = np.array([datetime(2015, 11, 18, 15, 30,
+ tzinfo=tzoffset(None, 19800)),
+ datetime(2015, 11, 18, 15, 30,
+ tzinfo=tzoffset(None, 23400))],
+ dtype=object)
+
+ tm.assert_numpy_array_equal(result, expected)
+ assert result_tz is None
+
+
+ ["-352.737091", "183.575577"],
+ ["1", "2", "3", "4", "5"]
+])
+def test_number_looking_strings_not_into_datetime(data):
+ # see gh-4601
+ #
+ # These strings don't look like datetimes, so
+ # they shouldn't be attempted to be converted.
+ arr = np.array(data, dtype=object)
+ result, _ = tslib.array_to_datetime(arr, errors="ignore")
+
+ tm.assert_numpy_array_equal(result, arr)
+
+
[email protected]("invalid_date", [
+ date(1000, 1, 1),
+ datetime(1000, 1, 1),
+ "1000-01-01",
+ "Jan 1, 1000",
+ np.datetime64("1000-01-01")])
[email protected]("errors", ["coerce", "raise"])
+def test_coerce_outside_ns_bounds(invalid_date, errors):
+ arr = np.array([invalid_date], dtype="object")
+ kwargs = dict(values=arr, errors=errors)
+
+ if errors == "raise":
+ msg = "Out of bounds nanosecond timestamp"
+
+ with pytest.raises(ValueError, match=msg):
+ tslib.array_to_datetime(**kwargs)
+ else: # coerce.
+ result, _ = tslib.array_to_datetime(**kwargs)
+ expected = np.array([iNaT], dtype="M8[ns]")
+
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_coerce_outside_ns_bounds_one_valid():
+ arr = np.array(["1/1/1000", "1/1/2000"], dtype=object)
+ result, _ = tslib.array_to_datetime(arr, errors="coerce")
+
+ expected = [iNaT, "2000-01-01T00:00:00.000000000-0000"]
+ expected = np_array_datetime64_compat(expected, dtype="M8[ns]")
+
+ tm.assert_numpy_array_equal(result, expected)
+
+
[email protected]("errors", ["ignore", "coerce"])
+def test_coerce_of_invalid_datetimes(errors):
+ arr = np.array(["01-01-2013", "not_a_date", "1"], dtype=object)
+ kwargs = dict(values=arr, errors=errors)
+
+ if errors == "ignore":
+ # Without coercing, the presence of any invalid
+ # dates prevents any values from being converted.
+ result, _ = tslib.array_to_datetime(**kwargs)
+ tm.assert_numpy_array_equal(result, arr)
+ else: # coerce.
+ # With coercing, the invalid dates become iNaT.
+ result, _ = tslib.array_to_datetime(arr, errors="coerce")
+ expected = ["2013-01-01T00:00:00.000000000-0000",
+ iNaT,
+ iNaT]
+
+ tm.assert_numpy_array_equal(
+ result,
+ np_array_datetime64_compat(expected, dtype="M8[ns]"))
+
+
+def test_to_datetime_barely_out_of_bounds():
+ # see gh-19382, gh-19529
+ #
+ # Close enough to bounds that dropping nanos
+ # would result in an in-bounds datetime.
+ arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object)
+ msg = "Out of bounds nanosecond timestamp: 2262-04-11 23:47:16"
+
+ with pytest.raises(tslib.OutOfBoundsDatetime, match=msg):
+ tslib.array_to_datetime(arr)
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_ccalendar.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_ccalendar.py
new file mode 100644
index 00000000000..255558a8001
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_ccalendar.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import ccalendar
+
+
[email protected]("date_tuple,expected", [
+ ((2001, 3, 1), 60),
+ ((2004, 3, 1), 61),
+ ((1907, 12, 31), 365), # End-of-year, non-leap year.
+ ((2004, 12, 31), 366), # End-of-year, leap year.
+])
+def test_get_day_of_year_numeric(date_tuple, expected):
+ assert ccalendar.get_day_of_year(*date_tuple) == expected
+
+
+def test_get_day_of_year_dt():
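+ # Pick a random date within roughly the first 4000 years and
+ # compare against a pure-Python day-of-year computation.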
+ dt = datetime.fromordinal(1 + np.random.randint(365 * 4000))
+ result = ccalendar.get_day_of_year(dt.year, dt.month, dt.day)
+
+ expected = (dt - dt.replace(month=1, day=1)).days + 1
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_conversion.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_conversion.py
new file mode 100644
index 00000000000..13398a69b49
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_conversion.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+from pytz import UTC
+
+from pandas._libs.tslib import iNaT
+from pandas._libs.tslibs import conversion, timezones
+
+from pandas import date_range
+import pandas.util.testing as tm
+
+
+def _compare_utc_to_local(tz_didx):
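+ # The vectorized tz_convert should match tz_convert_single
+ # applied elementwise.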
+ def f(x):
+ return conversion.tz_convert_single(x, UTC, tz_didx.tz)
+
+ result = conversion.tz_convert(tz_didx.asi8, UTC, tz_didx.tz)
+ expected = np.vectorize(f)(tz_didx.asi8)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def _compare_local_to_utc(tz_didx, utc_didx):
+ def f(x):
+ return conversion.tz_convert_single(x, tz_didx.tz, UTC)
+
+ result = conversion.tz_convert(utc_didx.asi8, tz_didx.tz, UTC)
+ expected = np.vectorize(f)(utc_didx.asi8)
+
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_tz_convert_single_matches_tz_convert_hourly(tz_aware_fixture):
+ tz = tz_aware_fixture
+ tz_didx = date_range("2014-03-01", "2015-01-10", freq="H", tz=tz)
+ utc_didx = date_range("2014-03-01", "2015-01-10", freq="H")
+
+ _compare_utc_to_local(tz_didx)
+ _compare_local_to_utc(tz_didx, utc_didx)
+
+
[email protected]("freq", ["D", "A"])
+def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq):
+ tz = tz_aware_fixture
+ tz_didx = date_range("2000-01-01", "2020-01-01", freq=freq, tz=tz)
+ utc_didx = date_range("2000-01-01", "2020-01-01", freq=freq)
+
+ _compare_utc_to_local(tz_didx)
+ _compare_local_to_utc(tz_didx, utc_didx)
+
+
[email protected]("arr", [
+ pytest.param(np.array([], dtype=np.int64), id="empty"),
+ pytest.param(np.array([iNaT], dtype=np.int64), id="all_nat")])
+def test_tz_convert_corner(arr):
+ result = conversion.tz_convert(arr,
+ timezones.maybe_get_tz("US/Eastern"),
+ timezones.maybe_get_tz("Asia/Tokyo"))
+ tm.assert_numpy_array_equal(result, arr)
+
+
[email protected]("copy", [True, False])
[email protected]("dtype", ["M8[ns]", "M8[s]"])
+def test_length_zero_copy(dtype, copy):
+ arr = np.array([], dtype=dtype)
+ result = conversion.ensure_datetime64ns(arr, copy=copy)
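+ # With copy=True a new array owning its data is returned (base is
+ # None); with copy=False the result is a view of the input array.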
+ assert result.base is (None if copy else arr)
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_libfrequencies.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_libfrequencies.py
new file mode 100644
index 00000000000..b9b1c72dbf2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_libfrequencies.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+from pandas._libs.tslibs.frequencies import (
+ INVALID_FREQ_ERR_MSG, _period_str_to_code, get_rule_month, is_subperiod,
+ is_superperiod)
+
+from pandas.tseries import offsets
+
+
[email protected]("obj,expected", [
+ ("W", "DEC"),
+ (offsets.Week(), "DEC"),
+
+ ("D", "DEC"),
+ (offsets.Day(), "DEC"),
+
+ ("Q", "DEC"),
+ (offsets.QuarterEnd(startingMonth=12), "DEC"),
+
+ ("Q-JAN", "JAN"),
+ (offsets.QuarterEnd(startingMonth=1), "JAN"),
+
+ ("A-DEC", "DEC"),
+ ("Y-DEC", "DEC"),
+ (offsets.YearEnd(), "DEC"),
+
+ ("A-MAY", "MAY"),
+ ("Y-MAY", "MAY"),
+ (offsets.YearEnd(month=5), "MAY")
+])
+def test_get_rule_month(obj, expected):
+ result = get_rule_month(obj)
+ assert result == expected
+
+
[email protected]("obj,expected", [
+ ("A", 1000),
+ ("A-DEC", 1000),
+ ("A-JAN", 1001),
+
+ ("Y", 1000),
+ ("Y-DEC", 1000),
+ ("Y-JAN", 1001),
+
+ ("Q", 2000),
+ ("Q-DEC", 2000),
+ ("Q-FEB", 2002),
+
+ ("W", 4000),
+ ("W-SUN", 4000),
+ ("W-FRI", 4005),
+
+ ("Min", 8000),
+ ("ms", 10000),
+ ("US", 11000),
+ ("NS", 12000)
+])
+def test_period_str_to_code(obj, expected):
+ assert _period_str_to_code(obj) == expected
+
+
[email protected]("p1,p2,expected", [
+ # Input validation.
+ (offsets.MonthEnd(), None, False),
+ (offsets.YearEnd(), None, False),
+ (None, offsets.YearEnd(), False),
+ (None, offsets.MonthEnd(), False),
+ (None, None, False),
+
+ (offsets.YearEnd(), offsets.MonthEnd(), True),
+ (offsets.Hour(), offsets.Minute(), True),
+ (offsets.Second(), offsets.Milli(), True),
+ (offsets.Milli(), offsets.Micro(), True),
+ (offsets.Micro(), offsets.Nano(), True)
+])
+def test_super_sub_symmetry(p1, p2, expected):
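+ # is_superperiod(p1, p2) and is_subperiod(p2, p1) must agree.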
+ assert is_superperiod(p1, p2) is expected
+ assert is_subperiod(p2, p1) is expected
+
+
[email protected]("freq,expected,aliases", [
+ ("D", 6000, ["DAY", "DLY", "DAILY"]),
+ ("M", 3000, ["MTH", "MONTH", "MONTHLY"]),
+ ("N", 12000, ["NANOSECOND", "NANOSECONDLY"]),
+ ("H", 7000, ["HR", "HOUR", "HRLY", "HOURLY"]),
+ ("T", 8000, ["minute", "MINUTE", "MINUTELY"]),
+ ("L", 10000, ["MILLISECOND", "MILLISECONDLY"]),
+ ("U", 11000, ["MICROSECOND", "MICROSECONDLY"]),
+ ("S", 9000, ["sec", "SEC", "SECOND", "SECONDLY"]),
+ ("B", 5000, ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY"]),
+])
+def test_assert_aliases_deprecated(freq, expected, aliases):
+ assert isinstance(aliases, list)
+ assert _period_str_to_code(freq) == expected
+
+ for alias in aliases:
+ with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG):
+ _period_str_to_code(alias)
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_liboffsets.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_liboffsets.py
new file mode 100644
index 00000000000..cb699278595
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_liboffsets.py
@@ -0,0 +1,174 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for helper functions in the cython tslibs.offsets
+"""
+from datetime import datetime
+
+import pytest
+
+import pandas._libs.tslibs.offsets as liboffsets
+from pandas._libs.tslibs.offsets import roll_qtrday
+
+from pandas import Timestamp
+
+
[email protected](params=["start", "end", "business_start", "business_end"])
+def day_opt(request):
+ return request.param
+
+
[email protected]("dt,exp_week_day,exp_last_day", [
+ (datetime(2017, 11, 30), 3, 30), # Business day.
+ (datetime(1993, 10, 31), 6, 29) # Non-business day.
+])
+def test_get_last_bday(dt, exp_week_day, exp_last_day):
+ assert dt.weekday() == exp_week_day
+ assert liboffsets.get_lastbday(dt.year, dt.month) == exp_last_day
+
+
[email protected]("dt,exp_week_day,exp_first_day", [
+ (datetime(2017, 4, 1), 5, 3), # Non-weekday.
+ (datetime(1993, 10, 1), 4, 1) # Business day.
+])
+def test_get_first_bday(dt, exp_week_day, exp_first_day):
+ assert dt.weekday() == exp_week_day
+ assert liboffsets.get_firstbday(dt.year, dt.month) == exp_first_day
+
+
[email protected]("months,day_opt,expected", [
+ (0, 15, datetime(2017, 11, 15)),
+ (0, None, datetime(2017, 11, 30)),
+ (1, "start", datetime(2017, 12, 1)),
+ (-145, "end", datetime(2005, 10, 31)),
+ (0, "business_end", datetime(2017, 11, 30)),
+ (0, "business_start", datetime(2017, 11, 1))
+])
+def test_shift_month_dt(months, day_opt, expected):
+ dt = datetime(2017, 11, 30)
+ assert liboffsets.shift_month(dt, months, day_opt=day_opt) == expected
+
+
[email protected]("months,day_opt,expected", [
+ (1, "start", Timestamp("1929-06-01")),
+ (-3, "end", Timestamp("1929-02-28")),
+ (25, None, Timestamp("1931-06-5")),
+ (-1, 31, Timestamp("1929-04-30"))
+])
+def test_shift_month_ts(months, day_opt, expected):
+ ts = Timestamp("1929-05-05")
+ assert liboffsets.shift_month(ts, months, day_opt=day_opt) == expected
+
+
+def test_shift_month_error():
+ dt = datetime(2017, 11, 15)
+ day_opt = "this should raise"
+
+ with pytest.raises(ValueError, match=day_opt):
+ liboffsets.shift_month(dt, 3, day_opt=day_opt)
+
+
[email protected]("other,expected", [
+ # Before March 1.
+ (datetime(2017, 2, 10), {2: 1, -7: -7, 0: 0}),
+
+ # After March 1.
+ (Timestamp("2014-03-15", tz="US/Eastern"), {2: 2, -7: -6, 0: 1})
+])
[email protected]("n", [2, -7, 0])
+def test_roll_yearday(other, expected, n):
+ month = 3
+ day_opt = "start" # `other` will be compared to March 1.
+
+ assert liboffsets.roll_yearday(other, n, month, day_opt) == expected[n]
+
+
[email protected]("other,expected", [
+ # Before June 30.
+ (datetime(1999, 6, 29), {5: 4, -7: -7, 0: 0}),
+
+ # After June 30.
+ (Timestamp(2072, 8, 24, 6, 17, 18), {5: 5, -7: -6, 0: 1})
+])
[email protected]("n", [5, -7, 0])
+def test_roll_yearday2(other, expected, n):
+ month = 6
+ day_opt = "end" # `other` will be compared to June 30.
+
+ assert liboffsets.roll_yearday(other, n, month, day_opt) == expected[n]
+
+
+def test_get_day_of_month_error():
+ # get_day_of_month is not directly exposed.
+ # We test it via roll_yearday.
+ dt = datetime(2017, 11, 15)
+ day_opt = "foo"
+
+ with pytest.raises(ValueError, match=day_opt):
+ # To hit the raising case we need month == dt.month and n > 0.
+ liboffsets.roll_yearday(dt, n=3, month=11, day_opt=day_opt)
+
+
[email protected]("month", [
+ 3, # (other.month % 3) < (month % 3)
+ 5 # (other.month % 3) > (month % 3)
+])
[email protected]("n", [4, -3])
+def test_roll_qtr_day_not_mod_unequal(day_opt, month, n):
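+ # expected maps month -> n -> the adjusted n returned by
+ # roll_qtrday.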
+ expected = {
+ 3: {
+ -3: -2,
+ 4: 4
+ },
+ 5: {
+ -3: -3,
+ 4: 3
+ }
+ }
+
+ other = Timestamp(2072, 10, 1, 6, 17, 18) # Saturday.
+ assert roll_qtrday(other, n, month, day_opt, modby=3) == expected[month][n]
+
+
[email protected]("other,month,exp_dict", [
+ # Monday.
+ (datetime(1999, 5, 31), 2, {
+ -1: {
+ "start": 0,
+ "business_start": 0
+ }
+ }),
+
+ # Saturday.
+ (Timestamp(2072, 10, 1, 6, 17, 18), 4, {
+ 2: {
+ "end": 1,
+ "business_end": 1,
+ "business_start": 1
+ }
+ }),
+
+ # First business day.
+ (Timestamp(2072, 10, 3, 6, 17, 18), 4, {
+ 2: {
+ "end": 1,
+ "business_end": 1
+ },
+ -1: {
+ "start": 0
+ }
+ })
+])
[email protected]("n", [2, -1])
+def test_roll_qtr_day_mod_equal(other, month, exp_dict, n, day_opt):
+ # All cases have (other.month % 3) == (month % 3).
+ expected = exp_dict.get(n, {}).get(day_opt, n)
+ assert roll_qtrday(other, n, month, day_opt, modby=3) == expected
+
+
[email protected]("n,expected", [
+ (42, {29: 42, 1: 42, 31: 41}),
+ (-4, {29: -4, 1: -3, 31: -4})
+])
[email protected]("compare", [29, 1, 31])
+def test_roll_convention(n, expected, compare):
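+ # Day 29 is rolled against `compare`: positive n is reduced when
+ # 29 falls before the compare day, and negative n is increased
+ # when it falls after.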
+ assert liboffsets.roll_convention(29, n, compare) == expected[compare]
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_normalize_date.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_normalize_date.py
new file mode 100644
index 00000000000..6124121b971
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_normalize_date.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+"""Tests for functions from pandas._libs.tslibs"""
+
+from datetime import date, datetime
+
+import pytest
+
+from pandas._libs import tslibs
+
+
[email protected]("value,expected", [
+ (date(2012, 9, 7), datetime(2012, 9, 7)),
+ (datetime(2012, 9, 7, 12), datetime(2012, 9, 7)),
+ (datetime(2007, 10, 1, 1, 12, 5, 10), datetime(2007, 10, 1))
+])
+def test_normalize_date(value, expected):
+ result = tslibs.normalize_date(value)
+ assert result == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_parse_iso8601.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_parse_iso8601.py
new file mode 100644
index 00000000000..d1b3dee948a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_parse_iso8601.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime
+
+import pytest
+
+from pandas._libs import tslib
+
+
[email protected]("date_str, exp", [
+ ("2011-01-02", datetime(2011, 1, 2)),
+ ("2011-1-2", datetime(2011, 1, 2)),
+ ("2011-01", datetime(2011, 1, 1)),
+ ("2011-1", datetime(2011, 1, 1)),
+ ("2011 01 02", datetime(2011, 1, 2)),
+ ("2011.01.02", datetime(2011, 1, 2)),
+ ("2011/01/02", datetime(2011, 1, 2)),
+ ("2011\\01\\02", datetime(2011, 1, 2)),
+ ("2013-01-01 05:30:00", datetime(2013, 1, 1, 5, 30)),
+ ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30))])
+def test_parsers_iso8601(date_str, exp):
+ # see gh-12060
+ #
+ # Test only the ISO parser - flexibility with respect to
+ # different separators and missing leading zeros.
+ actual = tslib._test_parse_iso8601(date_str)
+ assert actual == exp
+
+
[email protected]("date_str", [
+ "2011-01/02",
+ "2011=11=11",
+ "201401",
+ "201111",
+ "200101",
+
+ # Mixed separated and unseparated.
+ "2005-0101",
+ "200501-01",
+ "20010101 12:3456",
+ "20010101 1234:56",
+
+ # HHMMSS must have two digits in
+ # each component if unseparated.
+ "20010101 1",
+ "20010101 123",
+ "20010101 12345",
+ "20010101 12345Z",
+])
+def test_parsers_iso8601_invalid(date_str):
+ msg = "Error parsing datetime string \"{s}\"".format(s=date_str)
+
+ with pytest.raises(ValueError, match=msg):
+ tslib._test_parse_iso8601(date_str)
+
+
+def test_parsers_iso8601_invalid_offset_invalid():
+ date_str = "2001-01-01 12-34-56"
+ msg = ("Timezone hours offset out of range "
+ "in datetime string \"{s}\"".format(s=date_str))
+
+ with pytest.raises(ValueError, match=msg):
+ tslib._test_parse_iso8601(date_str)
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_parsing.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_parsing.py
new file mode 100644
index 00000000000..597ec6df738
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_parsing.py
@@ -0,0 +1,186 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for Timestamp parsing, aimed at pandas/_libs/tslibs/parsing.pyx
+"""
+from datetime import datetime
+
+from dateutil.parser import parse
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import parsing
+from pandas._libs.tslibs.parsing import parse_time_string
+import pandas.util._test_decorators as td
+
+from pandas.util import testing as tm
+
+
+def test_parse_time_string():
+ (date, parsed, reso) = parse_time_string("4Q1984")
+ (date_lower, parsed_lower, reso_lower) = parse_time_string("4q1984")
+
+ assert date == date_lower
+ assert reso == reso_lower
+ assert parsed == parsed_lower
+
+
[email protected]("dashed,normal", [
+ ("1988-Q2", "1988Q2"),
+ ("2Q-1988", "2Q1988")
+])
+def test_parse_time_quarter_with_dash(dashed, normal):
+ # see gh-9688
+ (date_dash, parsed_dash, reso_dash) = parse_time_string(dashed)
+ (date, parsed, reso) = parse_time_string(normal)
+
+ assert date_dash == date
+ assert parsed_dash == parsed
+ assert reso_dash == reso
+
+
[email protected]("dashed", [
+ "-2Q1992", "2-Q1992", "4-4Q1992"
+])
+def test_parse_time_quarter_with_dash_error(dashed):
+ msg = ("Unknown datetime string format, "
+ "unable to parse: {dashed}".format(dashed=dashed))
+
+ with pytest.raises(parsing.DateParseError, match=msg):
+ parse_time_string(dashed)
+
+
[email protected]("date_string,expected", [
+ ("123.1234", False),
+ ("-50000", False),
+ ("999", False),
+ ("m", False),
+ ("T", False),
+
+ ("Mon Sep 16, 2013", True),
+ ("2012-01-01", True),
+ ("01/01/2012", True),
+ ("01012012", True),
+ ("0101", True),
+ ("1-1", True)
+])
+def test_does_not_convert_mixed_integer(date_string, expected):
+ assert parsing._does_string_look_like_datetime(date_string) is expected
+
+
[email protected]("date_str,kwargs,msg", [
+ ("2013Q5", dict(),
+ ("Incorrect quarterly string is given, "
+ "quarter must be between 1 and 4: 2013Q5")),
+
+ # see gh-5418
+ ("2013Q1", dict(freq="INVLD-L-DEC-SAT"),
+ ("Unable to retrieve month information "
+ "from given freq: INVLD-L-DEC-SAT"))
+])
+def test_parsers_quarterly_with_freq_error(date_str, kwargs, msg):
+ with pytest.raises(parsing.DateParseError, match=msg):
+ parsing.parse_time_string(date_str, **kwargs)
+
+
[email protected]("date_str,freq,expected", [
+ ("2013Q2", None, datetime(2013, 4, 1)),
+ ("2013Q2", "A-APR", datetime(2012, 8, 1)),
+ ("2013-Q2", "A-DEC", datetime(2013, 4, 1))
+])
+def test_parsers_quarterly_with_freq(date_str, freq, expected):
+ result, _, _ = parsing.parse_time_string(date_str, freq=freq)
+ assert result == expected
+
+
[email protected]("date_str", [
+ "2Q 2005", "2Q-200A", "2Q-200",
+ "22Q2005", "2Q200.", "6Q-20"
+])
+def test_parsers_quarter_invalid(date_str):
+ if date_str == "6Q-20":
+ msg = ("Incorrect quarterly string is given, quarter "
+ "must be between 1 and 4: {date_str}".format(date_str=date_str))
+ else:
+ msg = ("Unknown datetime string format, unable "
+ "to parse: {date_str}".format(date_str=date_str))
+
+ with pytest.raises(ValueError, match=msg):
+ parsing.parse_time_string(date_str)
+
+
[email protected]("date_str,expected", [
+ ("201101", datetime(2011, 1, 1, 0, 0)),
+ ("200005", datetime(2000, 5, 1, 0, 0))
+])
+def test_parsers_month_freq(date_str, expected):
+ result, _, _ = parsing.parse_time_string(date_str, freq="M")
+ assert result == expected
+
+
[email protected]_if_not_us_locale
[email protected]("string,fmt", [
+ ("20111230", "%Y%m%d"),
+ ("2011-12-30", "%Y-%m-%d"),
+ ("30-12-2011", "%d-%m-%Y"),
+ ("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"),
+ ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f")
+])
+def test_guess_datetime_format_with_parseable_formats(string, fmt):
+ result = parsing._guess_datetime_format(string)
+ assert result == fmt
+
+
[email protected]("dayfirst,expected", [
+ (True, "%d/%m/%Y"),
+ (False, "%m/%d/%Y")
+])
+def test_guess_datetime_format_with_dayfirst(dayfirst, expected):
+ ambiguous_string = "01/01/2011"
+ result = parsing._guess_datetime_format(ambiguous_string,
+ dayfirst=dayfirst)
+ assert result == expected
+
+
[email protected]_if_has_locale
[email protected]("string,fmt", [
+ ("30/Dec/2011", "%d/%b/%Y"),
+ ("30/December/2011", "%d/%B/%Y"),
+ ("30/Dec/2011 00:00:00", "%d/%b/%Y %H:%M:%S")
+])
+def test_guess_datetime_format_with_locale_specific_formats(string, fmt):
+ result = parsing._guess_datetime_format(string)
+ assert result == fmt
+
+
[email protected]("invalid_dt", [
+ "2013", "01/2013", "12:00:00", "1/1/1/1",
+ "this_is_not_a_datetime", "51a", 9,
+ datetime(2011, 1, 1)
+])
+def test_guess_datetime_format_invalid_inputs(invalid_dt):
+ # A datetime string must include a year, month and a day for it to be
+ # guessable, in addition to being a string that looks like a datetime.
+ assert parsing._guess_datetime_format(invalid_dt) is None
+
+
[email protected]("string,fmt", [
+ ("2011-1-1", "%Y-%m-%d"),
+ ("1/1/2011", "%m/%d/%Y"),
+ ("30-1-2011", "%d-%m-%Y"),
+ ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"),
+ ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S"),
+ ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S")
+])
+def test_guess_datetime_format_no_padding(string, fmt):
+ # see gh-11142
+ result = parsing._guess_datetime_format(string)
+ assert result == fmt
+
+
+def test_try_parse_dates():
+ arr = np.array(["5/1/2000", "6/1/2000", "7/1/2000"], dtype=object)
+ result = parsing.try_parse_dates(arr, dayfirst=True)
+
+ expected = np.array([parse(d, dayfirst=True) for d in arr])
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_period_asfreq.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_period_asfreq.py
new file mode 100644
index 00000000000..6a9522e7053
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_period_asfreq.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+from pandas._libs.tslibs.frequencies import get_freq
+from pandas._libs.tslibs.period import period_asfreq, period_ordinal
+
+
[email protected]("freq1,freq2,expected", [
+ ("D", "H", 24),
+ ("D", "T", 1440),
+ ("D", "S", 86400),
+ ("D", "L", 86400000),
+ ("D", "U", 86400000000),
+ ("D", "N", 86400000000000),
+
+ ("H", "T", 60),
+ ("H", "S", 3600),
+ ("H", "L", 3600000),
+ ("H", "U", 3600000000),
+ ("H", "N", 3600000000000),
+
+ ("T", "S", 60),
+ ("T", "L", 60000),
+ ("T", "U", 60000000),
+ ("T", "N", 60000000000),
+
+ ("S", "L", 1000),
+ ("S", "U", 1000000),
+ ("S", "N", 1000000000),
+
+ ("L", "U", 1000),
+ ("L", "N", 1000000),
+
+ ("U", "N", 1000)
+])
+def test_intra_day_conversion_factors(freq1, freq2, expected):
+ assert period_asfreq(1, get_freq(freq1),
+ get_freq(freq2), False) == expected
+
+
[email protected]("freq,expected", [
+ ("A", 0),
+ ("M", 0),
+ ("W", 1),
+ ("D", 0),
+ ("B", 0)
+])
+def test_period_ordinal_start_values(freq, expected):
+ # information for Jan. 1, 1970.
+ assert period_ordinal(1970, 1, 1, 0, 0, 0,
+ 0, 0, get_freq(freq)) == expected
+
+
[email protected]("dt,expected", [
+ ((1970, 1, 4, 0, 0, 0, 0, 0), 1),
+ ((1970, 1, 5, 0, 0, 0, 0, 0), 2),
+ ((2013, 10, 6, 0, 0, 0, 0, 0), 2284),
+ ((2013, 10, 7, 0, 0, 0, 0, 0), 2285)
+])
+def test_period_ordinal_week(dt, expected):
+ args = dt + (get_freq("W"),)
+ assert period_ordinal(*args) == expected
+
+
[email protected]("day,expected", [
+ # Thursday (Oct. 3, 2013).
+ (3, 11415),
+
+ # Friday (Oct. 4, 2013).
+ (4, 11416),
+
+ # Saturday (Oct. 5, 2013).
+ (5, 11417),
+
+ # Sunday (Oct. 6, 2013).
+ (6, 11417),
+
+ # Monday (Oct. 7, 2013).
+ (7, 11417),
+
+ # Tuesday (Oct. 8, 2013).
+ (8, 11418)
+])
+def test_period_ordinal_business_day(day, expected):
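+ # Weekend ordinals collapse onto the following Monday.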
+ args = (2013, 10, day, 0, 0, 0, 0, 0, get_freq("B"))
+ assert period_ordinal(*args) == expected
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_timedeltas.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_timedeltas.py
new file mode 100644
index 00000000000..fdc8eff80ac
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_timedeltas.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds
+
+import pandas as pd
+from pandas import Timedelta
+
+
[email protected]("obj,expected", [
+ (np.timedelta64(14, "D"), 14 * 24 * 3600 * 1e9),
+ (Timedelta(minutes=-7), -7 * 60 * 1e9),
+ (Timedelta(minutes=-7).to_pytimedelta(), -7 * 60 * 1e9),
+ (pd.offsets.Nano(125), 125),
+ (1, 1),
+ (np.int64(2), 2),
+ (np.int32(3), 3)
+])
+def test_delta_to_nanoseconds(obj, expected):
+ result = delta_to_nanoseconds(obj)
+ assert result == expected
+
+
+def test_delta_to_nanoseconds_error():
+ obj = np.array([123456789], dtype="m8[ns]")
+
+ with pytest.raises(TypeError, match="<(class|type) 'numpy.ndarray'>"):
+ delta_to_nanoseconds(obj)
diff --git a/contrib/python/pandas/py2/pandas/tests/tslibs/test_timezones.py b/contrib/python/pandas/py2/pandas/tests/tslibs/test_timezones.py
new file mode 100644
index 00000000000..0255865dbdf
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/tslibs/test_timezones.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime
+
+import dateutil.tz
+import pytest
+import pytz
+
+from pandas._libs.tslibs import conversion, timezones
+
+from pandas import Timestamp
+
+
[email protected]("tz_name", list(pytz.common_timezones))
+def test_cache_keys_are_distinct_for_pytz_vs_dateutil(tz_name):
+ if tz_name == "UTC":
+ pytest.skip("UTC: special case in dateutil")
+
+ tz_p = timezones.maybe_get_tz(tz_name)
+ tz_d = timezones.maybe_get_tz("dateutil/" + tz_name)
+
+ if tz_d is None:
+ pytest.skip(tz_name + ": dateutil does not know about this one")
+
+ assert timezones._p_tz_cache_key(tz_p) != timezones._p_tz_cache_key(tz_d)
+
+
+def test_tzlocal_repr():
+ # see gh-13583
+ ts = Timestamp("2011-01-01", tz=dateutil.tz.tzlocal())
+ assert ts.tz == dateutil.tz.tzlocal()
+ assert "tz='tzlocal()')" in repr(ts)
+
+
+def test_tzlocal_maybe_get_tz():
+ # see gh-13583
+ tz = timezones.maybe_get_tz('tzlocal()')
+ assert tz == dateutil.tz.tzlocal()
+
+
+def test_tzlocal_offset():
+ # see gh-13583
+ #
+ # Get offset using normal datetime for test.
+ ts = Timestamp("2011-01-01", tz=dateutil.tz.tzlocal())
+
+ offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1))
+ offset = offset.total_seconds() * 1000000000
+
+ assert ts.value + offset == Timestamp("2011-01-01").value
+
+
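+# Each timezone implementation is paired with its localization idiom:
+# pytz needs tz.localize(dt), while dateutil uses dt.replace(tzinfo=tz).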
[email protected](params=[
+ (pytz.timezone("US/Eastern"), lambda tz, x: tz.localize(x)),
+ (dateutil.tz.gettz("US/Eastern"), lambda tz, x: x.replace(tzinfo=tz))
+])
+def infer_setup(request):
+ eastern, localize = request.param
+
+ start_naive = datetime(2001, 1, 1)
+ end_naive = datetime(2009, 1, 1)
+
+ start = localize(eastern, start_naive)
+ end = localize(eastern, end_naive)
+
+ return eastern, localize, start, end, start_naive, end_naive
+
+
+def test_infer_tz_compat(infer_setup):
+ eastern, _, start, end, start_naive, end_naive = infer_setup
+
+ assert (timezones.infer_tzinfo(start, end) is
+ conversion.localize_pydatetime(start_naive, eastern).tzinfo)
+ assert (timezones.infer_tzinfo(start, None) is
+ conversion.localize_pydatetime(start_naive, eastern).tzinfo)
+ assert (timezones.infer_tzinfo(None, end) is
+ conversion.localize_pydatetime(end_naive, eastern).tzinfo)
+
+
+def test_infer_tz_utc_localize(infer_setup):
+ _, _, start, end, start_naive, end_naive = infer_setup
+ utc = pytz.utc
+
+ start = utc.localize(start_naive)
+ end = utc.localize(end_naive)
+
+ assert timezones.infer_tzinfo(start, end) is utc
+
+
[email protected]("ordered", [True, False])
+def test_infer_tz_mismatch(infer_setup, ordered):
+ eastern, _, _, _, start_naive, end_naive = infer_setup
+ msg = "Inputs must both have the same timezone"
+
+ utc = pytz.utc
+ start = utc.localize(start_naive)
+ end = conversion.localize_pydatetime(end_naive, eastern)
+
+ args = (start, end) if ordered else (end, start)
+
+ with pytest.raises(AssertionError, match=msg):
+ timezones.infer_tzinfo(*args)
diff --git a/contrib/python/pandas/py2/pandas/tests/util/__init__.py b/contrib/python/pandas/py2/pandas/tests/util/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tests/util/conftest.py b/contrib/python/pandas/py2/pandas/tests/util/conftest.py
new file mode 100644
index 00000000000..5eff49ab774
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/conftest.py
@@ -0,0 +1,26 @@
+import pytest
+
+
[email protected](params=[True, False])
+def check_dtype(request):
+ return request.param
+
+
[email protected](params=[True, False])
+def check_exact(request):
+ return request.param
+
+
[email protected](params=[True, False])
+def check_index_type(request):
+ return request.param
+
+
[email protected](params=[True, False])
+def check_less_precise(request):
+ return request.param
+
+
[email protected](params=[True, False])
+def check_categorical(request):
+ return request.param
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_assert_almost_equal.py b/contrib/python/pandas/py2/pandas/tests/util/test_assert_almost_equal.py
new file mode 100644
index 00000000000..afee9c00829
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_assert_almost_equal.py
@@ -0,0 +1,350 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Index, Series, Timestamp
+from pandas.util.testing import assert_almost_equal
+
+
+def _assert_almost_equal_both(a, b, **kwargs):
+ """
+ Check that two objects are approximately equal.
+
+ This check is performed commutatively.
+
+ Parameters
+ ----------
+ a : object
+ The first object to compare.
+ b : object
+ The second object to compare.
+ kwargs : dict
+ The arguments passed to `assert_almost_equal`.
+ """
+ assert_almost_equal(a, b, **kwargs)
+ assert_almost_equal(b, a, **kwargs)
+
+
+def _assert_not_almost_equal(a, b, **kwargs):
+ """
+ Check that two objects are not approximately equal.
+
+ Parameters
+ ----------
+ a : object
+ The first object to compare.
+ b : object
+ The second object to compare.
+ kwargs : dict
+ The arguments passed to `assert_almost_equal`.
+ """
+ try:
+ assert_almost_equal(a, b, **kwargs)
+ msg = ("{a} and {b} were approximately equal "
+ "when they shouldn't have been").format(a=a, b=b)
+ pytest.fail(msg=msg)
+ except AssertionError:
+ pass
+
+
+def _assert_not_almost_equal_both(a, b, **kwargs):
+ """
+ Check that two objects are not approximately equal.
+
+ This check is performed commutatively.
+
+ Parameters
+ ----------
+ a : object
+ The first object to compare.
+ b : object
+ The second object to compare.
+ kwargs : dict
+ The arguments passed to `assert_almost_equal`.
+ """
+ _assert_not_almost_equal(a, b, **kwargs)
+ _assert_not_almost_equal(b, a, **kwargs)
+
+
[email protected]("a,b", [
+ (1.1, 1.1), (1.1, 1.100001), (np.int16(1), 1.000001),
+ (np.float64(1.1), 1.1), (np.uint32(5), 5),
+])
+def test_assert_almost_equal_numbers(a, b):
+ _assert_almost_equal_both(a, b)
+
+
[email protected]("a,b", [
+ (1.1, 1), (1.1, True), (1, 2), (1.0001, np.int16(1)),
+])
+def test_assert_not_almost_equal_numbers(a, b):
+ _assert_not_almost_equal_both(a, b)
+
+
[email protected]("a,b", [
+ (0, 0), (0, 0.0), (0, np.float64(0)), (0.000001, 0),
+])
+def test_assert_almost_equal_numbers_with_zeros(a, b):
+ _assert_almost_equal_both(a, b)
+
+
[email protected]("a,b", [
+ (0.001, 0), (1, 0),
+])
+def test_assert_not_almost_equal_numbers_with_zeros(a, b):
+ _assert_not_almost_equal_both(a, b)
+
+
+ (1, "abc"), (1, [1, ]), (1, object()),
+])
+def test_assert_not_almost_equal_numbers_with_mixed(a, b):
+ _assert_not_almost_equal_both(a, b)
+
+
+ "left_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"])
+ "right_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"])
+def test_assert_almost_equal_edge_case_ndarrays(left_dtype, right_dtype):
+ # Empty compare.
+ _assert_almost_equal_both(np.array([], dtype=left_dtype),
+ np.array([], dtype=right_dtype),
+ check_dtype=False)
+
+
+def test_assert_almost_equal_dicts():
+ _assert_almost_equal_both({"a": 1, "b": 2}, {"a": 1, "b": 2})
+
+
+ ({"a": 1, "b": 2}, {"a": 1, "b": 3}),
+ ({"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 3}),
+ ({"a": 1}, 1), ({"a": 1}, "abc"), ({"a": 1}, [1, ]),
+])
+def test_assert_not_almost_equal_dicts(a, b):
+ _assert_not_almost_equal_both(a, b)
+
+
[email protected]("val", [1, 2])
+def test_assert_almost_equal_dict_like_object(val):
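+ # Objects exposing keys() and __getitem__ are compared like dicts.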
+ dict_val = 1
+ real_dict = dict(a=val)
+
+ class DictLikeObj(object):
+ def keys(self):
+ return "a",
+
+ def __getitem__(self, item):
+ if item == "a":
+ return dict_val
+
+ func = (_assert_almost_equal_both if val == dict_val
+ else _assert_not_almost_equal_both)
+ func(real_dict, DictLikeObj(), check_dtype=False)
+
+
+def test_assert_almost_equal_strings():
+ _assert_almost_equal_both("abc", "abc")
+
+
+ ("abc", "abcd"), ("abc", "abd"), ("abc", 1), ("abc", [1, ]),
+])
+def test_assert_not_almost_equal_strings(a, b):
+ _assert_not_almost_equal_both(a, b)
+
+
[email protected]("a,b", [
+ ([1, 2, 3], [1, 2, 3]), (np.array([1, 2, 3]), np.array([1, 2, 3])),
+])
+def test_assert_almost_equal_iterables(a, b):
+ _assert_almost_equal_both(a, b)
+
+
[email protected]("a,b", [
+ # Class is different.
+ (np.array([1, 2, 3]), [1, 2, 3]),
+
+ # Dtype is different.
+ (np.array([1, 2, 3]), np.array([1., 2., 3.])),
+
+ # Can't compare generators.
+ (iter([1, 2, 3]), [1, 2, 3]), ([1, 2, 3], [1, 2, 4]),
+ ([1, 2, 3], [1, 2, 3, 4]), ([1, 2, 3], 1),
+])
+def test_assert_not_almost_equal_iterables(a, b):
+ _assert_not_almost_equal(a, b)
+
+
+def test_assert_almost_equal_null():
+ _assert_almost_equal_both(None, None)
+
+
[email protected]("a,b", [
+ (None, np.NaN), (None, 0), (np.NaN, 0),
+])
+def test_assert_not_almost_equal_null(a, b):
+ _assert_not_almost_equal(a, b)
+
+
[email protected]("a,b", [
+ (np.inf, np.inf), (np.inf, float("inf")),
+ (np.array([np.inf, np.nan, -np.inf]),
+ np.array([np.inf, np.nan, -np.inf])),
+ (np.array([np.inf, None, -np.inf], dtype=np.object_),
+ np.array([np.inf, np.nan, -np.inf], dtype=np.object_)),
+])
+def test_assert_almost_equal_inf(a, b):
+ _assert_almost_equal_both(a, b)
+
+
+def test_assert_not_almost_equal_inf():
+ _assert_not_almost_equal_both(np.inf, 0)
+
+
[email protected]("a,b", [
+ (Index([1., 1.1]), Index([1., 1.100001])),
+ (Series([1., 1.1]), Series([1., 1.100001])),
+ (np.array([1.1, 2.000001]), np.array([1.1, 2.0])),
+ (DataFrame({"a": [1., 1.1]}), DataFrame({"a": [1., 1.100001]}))
+])
+def test_assert_almost_equal_pandas(a, b):
+ _assert_almost_equal_both(a, b)
+
+
+def test_assert_almost_equal_object():
+ a = [Timestamp("2011-01-01"), Timestamp("2011-01-01")]
+ b = [Timestamp("2011-01-01"), Timestamp("2011-01-01")]
+ _assert_almost_equal_both(a, b)
+
+
+def test_assert_almost_equal_value_mismatch():
+ msg = "expected 2\\.00000 but got 1\\.00000, with decimal 5"
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_almost_equal(1, 2)
+
+
[email protected]("a,b,klass1,klass2", [
+ (np.array([1]), 1, "ndarray", "int"),
+ (1, np.array([1]), "int", "ndarray"),
+])
+def test_assert_almost_equal_class_mismatch(a, b, klass1, klass2):
+ msg = """numpy array are different
+
+numpy array classes are different
+\\[left\\]: {klass1}
+\\[right\\]: {klass2}""".format(klass1=klass1, klass2=klass2)
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_almost_equal(a, b)
+
+
+def test_assert_almost_equal_value_mismatch1():
+ msg = """numpy array are different
+
+numpy array values are different \\(66\\.66667 %\\)
+\\[left\\]: \\[nan, 2\\.0, 3\\.0\\]
+\\[right\\]: \\[1\\.0, nan, 3\\.0\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_almost_equal(np.array([np.nan, 2, 3]),
+ np.array([1, np.nan, 3]))
+
+
+def test_assert_almost_equal_value_mismatch2():
+ msg = """numpy array are different
+
+numpy array values are different \\(50\\.0 %\\)
+\\[left\\]: \\[1, 2\\]
+\\[right\\]: \\[1, 3\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_almost_equal(np.array([1, 2]), np.array([1, 3]))
+
+
+def test_assert_almost_equal_value_mismatch3():
+ msg = """numpy array are different
+
+numpy array values are different \\(16\\.66667 %\\)
+\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\]
+\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_almost_equal(np.array([[1, 2], [3, 4], [5, 6]]),
+ np.array([[1, 3], [3, 4], [5, 6]]))
+
+
+def test_assert_almost_equal_value_mismatch4():
+ msg = """numpy array are different
+
+numpy array values are different \\(25\\.0 %\\)
+\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\]\\]
+\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_almost_equal(np.array([[1, 2], [3, 4]]),
+ np.array([[1, 3], [3, 4]]))
+
+
+def test_assert_almost_equal_shape_mismatch_override():
+ msg = """Index are different
+
+Index shapes are different
+\\[left\\]: \\(2L*,\\)
+\\[right\\]: \\(3L*,\\)"""
+ with pytest.raises(AssertionError, match=msg):
+ assert_almost_equal(np.array([1, 2]),
+ np.array([3, 4, 5]),
+ obj="Index")
+
+
+def test_assert_almost_equal_unicode():
+ # see gh-20503
+ msg = """numpy array are different
+
+numpy array values are different \\(33\\.33333 %\\)
+\\[left\\]: \\[á, à, ä\\]
+\\[right\\]: \\[á, à, å\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_almost_equal(np.array([u"á", u"à", u"ä"]),
+ np.array([u"á", u"à", u"å"]))
+
+
+def test_assert_almost_equal_timestamp():
+ a = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-01")])
+ b = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")])
+
+ msg = """numpy array are different
+
+numpy array values are different \\(50\\.0 %\\)
+\\[left\\]: \\[2011-01-01 00:00:00, 2011-01-01 00:00:00\\]
+\\[right\\]: \\[2011-01-01 00:00:00, 2011-01-02 00:00:00\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_almost_equal(a, b)
+
+
+def test_assert_almost_equal_iterable_length_mismatch():
+ msg = """Iterable are different
+
+Iterable length are different
+\\[left\\]: 2
+\\[right\\]: 3"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_almost_equal([1, 2], [3, 4, 5])
+
+
+def test_assert_almost_equal_iterable_values_mismatch():
+ msg = """Iterable are different
+
+Iterable values are different \\(50\\.0 %\\)
+\\[left\\]: \\[1, 2\\]
+\\[right\\]: \\[1, 3\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_almost_equal([1, 2], [1, 3])
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_assert_categorical_equal.py b/contrib/python/pandas/py2/pandas/tests/util/test_assert_categorical_equal.py
new file mode 100644
index 00000000000..04c83010270
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_assert_categorical_equal.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+from pandas import Categorical
+from pandas.util.testing import assert_categorical_equal
+
+
[email protected]("c", [
+ Categorical([1, 2, 3, 4]),
+ Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4, 5]),
+])
+def test_categorical_equal(c):
+ assert_categorical_equal(c, c)
+
+
[email protected]("check_category_order", [True, False])
+def test_categorical_equal_order_mismatch(check_category_order):
+ c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
+ c2 = Categorical([1, 2, 3, 4], categories=[4, 3, 2, 1])
+ kwargs = dict(check_category_order=check_category_order)
+
+ if check_category_order:
+ msg = """Categorical\\.categories are different
+
+Categorical\\.categories values are different \\(100\\.0 %\\)
+\\[left\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)
+\\[right\\]: Int64Index\\(\\[4, 3, 2, 1\\], dtype='int64'\\)"""
+ with pytest.raises(AssertionError, match=msg):
+ assert_categorical_equal(c1, c2, **kwargs)
+ else:
+ assert_categorical_equal(c1, c2, **kwargs)
+
+
+def test_categorical_equal_categories_mismatch():
+ msg = """Categorical\\.categories are different
+
+Categorical\\.categories values are different \\(25\\.0 %\\)
+\\[left\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)
+\\[right\\]: Int64Index\\(\\[1, 2, 3, 5\\], dtype='int64'\\)"""
+
+ c1 = Categorical([1, 2, 3, 4])
+ c2 = Categorical([1, 2, 3, 5])
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_categorical_equal(c1, c2)
+
+
+def test_categorical_equal_codes_mismatch():
+ categories = [1, 2, 3, 4]
+ msg = """Categorical\\.codes are different
+
+Categorical\\.codes values are different \\(50\\.0 %\\)
+\\[left\\]: \\[0, 1, 3, 2\\]
+\\[right\\]: \\[0, 1, 2, 3\\]"""
+
+ c1 = Categorical([1, 2, 4, 3], categories=categories)
+ c2 = Categorical([1, 2, 3, 4], categories=categories)
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_categorical_equal(c1, c2)
+
+
+def test_categorical_equal_ordered_mismatch():
+ data = [1, 2, 3, 4]
+ msg = """Categorical are different
+
+Attribute "ordered" are different
+\\[left\\]: False
+\\[right\\]: True"""
+
+ c1 = Categorical(data, ordered=False)
+ c2 = Categorical(data, ordered=True)
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_categorical_equal(c1, c2)
+
+
[email protected]("obj", ["index", "foo", "pandas"])
+def test_categorical_equal_object_override(obj):
+ data = [1, 2, 3, 4]
+ msg = """{obj} are different
+
+Attribute "ordered" are different
+\\[left\\]: False
+\\[right\\]: True""".format(obj=obj)
+
+ c1 = Categorical(data, ordered=False)
+ c2 = Categorical(data, ordered=True)
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_categorical_equal(c1, c2, obj=obj)
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_assert_extension_array_equal.py b/contrib/python/pandas/py2/pandas/tests/util/test_assert_extension_array_equal.py
new file mode 100644
index 00000000000..3149078a567
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_assert_extension_array_equal.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas.core.arrays.sparse import SparseArray
+from pandas.util.testing import assert_extension_array_equal
+
+
[email protected]("kwargs", [
+ dict(), # Default is check_exact=False
+ dict(check_exact=False), dict(check_exact=True)
+])
+def test_assert_extension_array_equal_not_exact(kwargs):
+ # see gh-23709
+ arr1 = SparseArray([-0.17387645482451206, 0.3414148016424936])
+ arr2 = SparseArray([-0.17387645482451206, 0.3414148016424937])
+
+ if kwargs.get("check_exact", False):
+ msg = """\
+ExtensionArray are different
+
+ExtensionArray values are different \\(50\\.0 %\\)
+\\[left\\]: \\[-0\\.17387645482.*, 0\\.341414801642.*\\]
+\\[right\\]: \\[-0\\.17387645482.*, 0\\.341414801642.*\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_extension_array_equal(arr1, arr2, **kwargs)
+ else:
+ assert_extension_array_equal(arr1, arr2, **kwargs)
+
+
[email protected]("check_less_precise", [
+ True, False, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+])
+def test_assert_extension_array_equal_less_precise(check_less_precise):
+ arr1 = SparseArray([0.5, 0.123456])
+ arr2 = SparseArray([0.5, 0.123457])
+
+ kwargs = dict(check_less_precise=check_less_precise)
+
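+ # The difference is in the sixth decimal place, so only the more
+ # precise settings (False, or at least 5 digits) should detect it.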
+ if check_less_precise is False or check_less_precise >= 5:
+ msg = """\
+ExtensionArray are different
+
+ExtensionArray values are different \\(50\\.0 %\\)
+\\[left\\]: \\[0\\.5, 0\\.123456\\]
+\\[right\\]: \\[0\\.5, 0\\.123457\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_extension_array_equal(arr1, arr2, **kwargs)
+ else:
+ assert_extension_array_equal(arr1, arr2, **kwargs)
+
+
+def test_assert_extension_array_equal_dtype_mismatch(check_dtype):
+ end = 5
+ kwargs = dict(check_dtype=check_dtype)
+
+ arr1 = SparseArray(np.arange(end, dtype="int64"))
+ arr2 = SparseArray(np.arange(end, dtype="int32"))
+
+ if check_dtype:
+ msg = """\
+ExtensionArray are different
+
+Attribute "dtype" are different
+\\[left\\]: Sparse\\[int64, 0\\]
+\\[right\\]: Sparse\\[int32, 0\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_extension_array_equal(arr1, arr2, **kwargs)
+ else:
+ assert_extension_array_equal(arr1, arr2, **kwargs)
+
+
+def test_assert_extension_array_equal_missing_values():
+ arr1 = SparseArray([np.nan, 1, 2, np.nan])
+ arr2 = SparseArray([np.nan, 1, 2, 3])
+
+ msg = """\
+ExtensionArray NA mask are different
+
+ExtensionArray NA mask values are different \\(25\\.0 %\\)
+\\[left\\]: \\[True, False, False, True\\]
+\\[right\\]: \\[True, False, False, False\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_extension_array_equal(arr1, arr2)
+
+
[email protected]("side", ["left", "right"])
+def test_assert_extension_array_equal_non_extension_array(side):
+ numpy_array = np.arange(5)
+ extension_array = SparseArray(numpy_array)
+
+ msg = "{side} is not an ExtensionArray".format(side=side)
+ args = ((numpy_array, extension_array) if side == "left"
+ else (extension_array, numpy_array))
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_extension_array_equal(*args)
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_assert_frame_equal.py b/contrib/python/pandas/py2/pandas/tests/util/test_assert_frame_equal.py
new file mode 100644
index 00000000000..1a941c0f0c2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_assert_frame_equal.py
@@ -0,0 +1,209 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+from pandas import DataFrame
+from pandas.util.testing import assert_frame_equal
+
+
[email protected](params=[True, False])
+def by_blocks(request):
+ return request.param
+
+
+def _assert_frame_equal_both(a, b, **kwargs):
+ """
+ Check that two DataFrames are equal.
+
+ This check is performed commutatively.
+
+ Parameters
+ ----------
+ a : DataFrame
+ The first DataFrame to compare.
+ b : DataFrame
+ The second DataFrame to compare.
+ kwargs : dict
+ The arguments passed to `assert_frame_equal`.
+ """
+ assert_frame_equal(a, b, **kwargs)
+ assert_frame_equal(b, a, **kwargs)
+
+
+def _assert_not_frame_equal(a, b, **kwargs):
+ """
+ Check that two DataFrames are not equal.
+
+ Parameters
+ ----------
+ a : DataFrame
+ The first DataFrame to compare.
+ b : DataFrame
+ The second DataFrame to compare.
+ kwargs : dict
+ The arguments passed to `assert_frame_equal`.
+ """
+ try:
+ assert_frame_equal(a, b, **kwargs)
+ msg = "The two DataFrames were equal when they shouldn't have been"
+
+ pytest.fail(msg=msg)
+ except AssertionError:
+ pass
+
+
+def _assert_not_frame_equal_both(a, b, **kwargs):
+ """
+ Check that two DataFrames are not equal.
+
+ This check is performed commutatively.
+
+ Parameters
+ ----------
+ a : DataFrame
+ The first DataFrame to compare.
+ b : DataFrame
+ The second DataFrame to compare.
+ kwargs : dict
+ The arguments passed to `assert_frame_equal`.
+ """
+ _assert_not_frame_equal(a, b, **kwargs)
+ _assert_not_frame_equal(b, a, **kwargs)
+
+
[email protected]("check_like", [True, False])
+def test_frame_equal_row_order_mismatch(check_like):
+ df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]},
+ index=["a", "b", "c"])
+ df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]},
+ index=["c", "b", "a"])
+
+ if not check_like: # Do not ignore row-column orderings.
+ msg = "DataFrame.index are different"
+ with pytest.raises(AssertionError, match=msg):
+ assert_frame_equal(df1, df2, check_like=check_like)
+ else:
+ _assert_frame_equal_both(df1, df2, check_like=check_like)
+
+
[email protected]("df1,df2", [
+ (DataFrame({"A": [1, 2, 3]}), DataFrame({"A": [1, 2, 3, 4]})),
+ (DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), DataFrame({"A": [1, 2, 3]})),
+])
+def test_frame_equal_shape_mismatch(df1, df2):
+ msg = "DataFrame are different"
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_frame_equal(df1, df2)
+
+
[email protected]("df1,df2,msg", [
+ # Index
+ (DataFrame.from_records({"a": [1, 2],
+ "c": ["l1", "l2"]}, index=["a"]),
+ DataFrame.from_records({"a": [1.0, 2.0],
+ "c": ["l1", "l2"]}, index=["a"]),
+ "DataFrame\\.index are different"),
+
+ # MultiIndex
+ (DataFrame.from_records({"a": [1, 2], "b": [2.1, 1.5],
+ "c": ["l1", "l2"]}, index=["a", "b"]),
+ DataFrame.from_records({"a": [1.0, 2.0], "b": [2.1, 1.5],
+ "c": ["l1", "l2"]}, index=["a", "b"]),
+ "MultiIndex level \\[0\\] are different")
+])
+def test_frame_equal_index_dtype_mismatch(df1, df2, msg, check_index_type):
+ kwargs = dict(check_index_type=check_index_type)
+
+ if check_index_type:
+ with pytest.raises(AssertionError, match=msg):
+ assert_frame_equal(df1, df2, **kwargs)
+ else:
+ assert_frame_equal(df1, df2, **kwargs)
+
+
+def test_empty_dtypes(check_dtype):
+ columns = ["col1", "col2"]
+ df1 = DataFrame(columns=columns)
+ df2 = DataFrame(columns=columns)
+
+ kwargs = dict(check_dtype=check_dtype)
+ df1["col1"] = df1["col1"].astype("int64")
+
+ if check_dtype:
+ msg = "Attributes are different"
+ with pytest.raises(AssertionError, match=msg):
+ assert_frame_equal(df1, df2, **kwargs)
+ else:
+ assert_frame_equal(df1, df2, **kwargs)
+
+
+def test_frame_equal_index_mismatch():
+ msg = """DataFrame\\.index are different
+
+DataFrame\\.index values are different \\(33\\.33333 %\\)
+\\[left\\]: Index\\(\\[u?'a', u?'b', u?'c'\\], dtype='object'\\)
+\\[right\\]: Index\\(\\[u?'a', u?'b', u?'d'\\], dtype='object'\\)"""
+
+ df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]},
+ index=["a", "b", "c"])
+ df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]},
+ index=["a", "b", "d"])
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_frame_equal(df1, df2)
+
+
+def test_frame_equal_columns_mismatch():
+ msg = """DataFrame\\.columns are different
+
+DataFrame\\.columns values are different \\(50\\.0 %\\)
+\\[left\\]: Index\\(\\[u?'A', u?'B'\\], dtype='object'\\)
+\\[right\\]: Index\\(\\[u?'A', u?'b'\\], dtype='object'\\)"""
+
+ df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]},
+ index=["a", "b", "c"])
+ df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]},
+ index=["a", "b", "c"])
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_frame_equal(df1, df2)
+
+
+def test_frame_equal_block_mismatch(by_blocks):
+ msg = """DataFrame\\.iloc\\[:, 1\\] are different
+
+DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\)
+\\[left\\]: \\[4, 5, 6\\]
+\\[right\\]: \\[4, 5, 7\\]"""
+
+ df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+ df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]})
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_frame_equal(df1, df2, by_blocks=by_blocks)
+
+
[email protected]("df1,df2,msg", [
+ (DataFrame({"A": [u"á", u"à", u"ä"], "E": [u"é", u"è", u"ë"]}),
+ DataFrame({"A": [u"á", u"à", u"ä"], "E": [u"é", u"è", u"e̊"]}),
+ """DataFrame\\.iloc\\[:, 1\\] are different
+
+DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\)
+\\[left\\]: \\[é, è, ë\\]
+\\[right\\]: \\[é, è, e̊\\]"""),
+ (DataFrame({"A": [u"á", u"à", u"ä"], "E": [u"é", u"è", u"ë"]}),
+ DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}),
+ """DataFrame\\.iloc\\[:, 0\\] are different
+
+DataFrame\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\)
+\\[left\\]: \\[á, à, ä\\]
+\\[right\\]: \\[a, a, a\\]"""),
+])
+def test_frame_equal_unicode(df1, df2, msg, by_blocks):
+ # see gh-20503
+ #
+ # Test ensures that `assert_frame_equal` raises the right exception
+ # when comparing DataFrames containing differing unicode objects.
+ with pytest.raises(AssertionError, match=msg):
+ assert_frame_equal(df1, df2, by_blocks=by_blocks)
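
The check_like flag from test_frame_equal_row_order_mismatch is easiest to see standalone; a small sketch under the same 0.24-era API:

    from pandas import DataFrame
    from pandas.util.testing import assert_frame_equal

    df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"])
    df2 = df1.reindex(["c", "a", "b"])  # same data, rows permuted

    assert_frame_equal(df1, df2, check_like=True)  # passes: ordering ignored

    try:
        assert_frame_equal(df1, df2)  # default: index order matters
    except AssertionError as exc:
        print(exc)  # "DataFrame.index are different ..."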
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_assert_index_equal.py b/contrib/python/pandas/py2/pandas/tests/util/test_assert_index_equal.py
new file mode 100644
index 00000000000..b5409bf7cd2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_assert_index_equal.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas import Categorical, Index, MultiIndex, NaT
+from pandas.util.testing import assert_index_equal
+
+
+def test_index_equal_levels_mismatch():
+ msg = """Index are different
+
+Index levels are different
+\\[left\\]: 1, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\)
+\\[right\\]: 2, MultiIndex\\(levels=\\[\\[u?'A', u?'B'\\], \\[1, 2, 3, 4\\]\\],
+ codes=\\[\\[0, 0, 1, 1\\], \\[0, 1, 2, 3\\]\\]\\)"""
+
+ idx1 = Index([1, 2, 3])
+ idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2),
+ ("B", 3), ("B", 4)])
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_index_equal(idx1, idx2, exact=False)
+
+
+def test_index_equal_values_mismatch(check_exact):
+ msg = """MultiIndex level \\[1\\] are different
+
+MultiIndex level \\[1\\] values are different \\(25\\.0 %\\)
+\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\)
+\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""
+
+ idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2),
+ ("B", 3), ("B", 4)])
+ idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2),
+ ("B", 3), ("B", 4)])
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_index_equal(idx1, idx2, check_exact=check_exact)
+
+
+def test_index_equal_length_mismatch(check_exact):
+ msg = """Index are different
+
+Index length are different
+\\[left\\]: 3, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\)
+\\[right\\]: 4, Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""
+
+ idx1 = Index([1, 2, 3])
+ idx2 = Index([1, 2, 3, 4])
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_index_equal(idx1, idx2, check_exact=check_exact)
+
+
+def test_index_equal_class_mismatch(check_exact):
+ msg = """Index are different
+
+Index classes are different
+\\[left\\]: Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\)
+\\[right\\]: Float64Index\\(\\[1\\.0, 2\\.0, 3\\.0\\], dtype='float64'\\)"""
+
+ idx1 = Index([1, 2, 3])
+ idx2 = Index([1, 2, 3.0])
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_index_equal(idx1, idx2, exact=True, check_exact=check_exact)
+
+
+def test_index_equal_values_close(check_exact):
+ idx1 = Index([1, 2, 3.])
+ idx2 = Index([1, 2, 3.0000000001])
+
+ if check_exact:
+ msg = """Index are different
+
+Index values are different \\(33\\.33333 %\\)
+\\[left\\]: Float64Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\)
+\\[right\\]: Float64Index\\(\\[1.0, 2.0, 3.0000000001\\], dtype='float64'\\)"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_index_equal(idx1, idx2, check_exact=check_exact)
+ else:
+ assert_index_equal(idx1, idx2, check_exact=check_exact)
+
+
+def test_index_equal_values_less_close(check_exact, check_less_precise):
+ idx1 = Index([1, 2, 3.])
+ idx2 = Index([1, 2, 3.0001])
+ kwargs = dict(check_exact=check_exact,
+ check_less_precise=check_less_precise)
+
+ if check_exact or not check_less_precise:
+ msg = """Index are different
+
+Index values are different \\(33\\.33333 %\\)
+\\[left\\]: Float64Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\)
+\\[right\\]: Float64Index\\(\\[1.0, 2.0, 3.0001\\], dtype='float64'\\)"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_index_equal(idx1, idx2, **kwargs)
+ else:
+ assert_index_equal(idx1, idx2, **kwargs)
+
+
+def test_index_equal_values_too_far(check_exact, check_less_precise):
+ idx1 = Index([1, 2, 3])
+ idx2 = Index([1, 2, 4])
+ kwargs = dict(check_exact=check_exact,
+ check_less_precise=check_less_precise)
+
+ msg = """Index are different
+
+Index values are different \\(33\\.33333 %\\)
+\\[left\\]: Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\)
+\\[right\\]: Int64Index\\(\\[1, 2, 4\\], dtype='int64'\\)"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_index_equal(idx1, idx2, **kwargs)
+
+
+def test_index_equal_level_values_mismatch(check_exact, check_less_precise):
+ idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2),
+ ("B", 3), ("B", 4)])
+ idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2),
+ ("B", 3), ("B", 4)])
+ kwargs = dict(check_exact=check_exact,
+ check_less_precise=check_less_precise)
+
+ msg = """MultiIndex level \\[1\\] are different
+
+MultiIndex level \\[1\\] values are different \\(25\\.0 %\\)
+\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\)
+\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_index_equal(idx1, idx2, **kwargs)
+
+
[email protected]("name1,name2", [
+ (None, "x"), ("x", "x"), (np.nan, np.nan), (NaT, NaT), (np.nan, NaT)
+])
+def test_index_equal_names(name1, name2):
+ msg = """Index are different
+
+Attribute "names" are different
+\\[left\\]: \\[{name1}\\]
+\\[right\\]: \\[{name2}\\]"""
+
+ idx1 = Index([1, 2, 3], name=name1)
+ idx2 = Index([1, 2, 3], name=name2)
+
+ if name1 == name2 or name1 is name2:
+ assert_index_equal(idx1, idx2)
+ else:
+ name1 = "u?'x'" if name1 == "x" else name1
+ name2 = "u?'x'" if name2 == "x" else name2
+ msg = msg.format(name1=name1, name2=name2)
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_index_equal(idx1, idx2)
+
+
+def test_index_equal_category_mismatch(check_categorical):
+ msg = """Index are different
+
+Attribute "dtype" are different
+\\[left\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\)
+\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \
+ordered=False\\)"""
+
+ idx1 = Index(Categorical(["a", "b"]))
+ idx2 = Index(Categorical(["a", "b"], categories=["a", "b", "c"]))
+
+ if check_categorical:
+ with pytest.raises(AssertionError, match=msg):
+ assert_index_equal(idx1, idx2, check_categorical=check_categorical)
+ else:
+ assert_index_equal(idx1, idx2, check_categorical=check_categorical)
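
The exact and check_exact keywords are easy to conflate: exact governs Index class strictness, while check_exact governs how values are compared. A sketch, assuming the Int64Index/Float64Index split of this pandas generation:

    from pandas import Index
    from pandas.util.testing import assert_index_equal

    int_idx = Index([1, 2, 3])          # Int64Index
    float_idx = Index([1.0, 2.0, 3.0])  # Float64Index

    # Passes: exact=False skips the class check and the values match.
    assert_index_equal(int_idx, float_idx, exact=False)

    try:
        assert_index_equal(int_idx, float_idx, exact=True)
    except AssertionError as exc:
        print(exc)  # "Index classes are different ..."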
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_assert_interval_array_equal.py b/contrib/python/pandas/py2/pandas/tests/util/test_assert_interval_array_equal.py
new file mode 100644
index 00000000000..c81a27f9b3f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_assert_interval_array_equal.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+from pandas import interval_range
+from pandas.util.testing import assert_interval_array_equal
+
+
[email protected]("kwargs", [
+ dict(start=0, periods=4),
+ dict(start=1, periods=5),
+ dict(start=5, end=10, closed="left"),
+])
+def test_interval_array_equal(kwargs):
+ arr = interval_range(**kwargs).values
+ assert_interval_array_equal(arr, arr)
+
+
+def test_interval_array_equal_closed_mismatch():
+ kwargs = dict(start=0, periods=5)
+ arr1 = interval_range(closed="left", **kwargs).values
+ arr2 = interval_range(closed="right", **kwargs).values
+
+ msg = """\
+IntervalArray are different
+
+Attribute "closed" are different
+\\[left\\]: left
+\\[right\\]: right"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_interval_array_equal(arr1, arr2)
+
+
+def test_interval_array_equal_periods_mismatch():
+ kwargs = dict(start=0)
+ arr1 = interval_range(periods=5, **kwargs).values
+ arr2 = interval_range(periods=6, **kwargs).values
+
+ msg = """\
+IntervalArray.left are different
+
+IntervalArray.left length are different
+\\[left\\]: 5, Int64Index\\(\\[0, 1, 2, 3, 4\\], dtype='int64'\\)
+\\[right\\]: 6, Int64Index\\(\\[0, 1, 2, 3, 4, 5\\], dtype='int64'\\)"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_interval_array_equal(arr1, arr2)
+
+
+def test_interval_array_equal_end_mismatch():
+ kwargs = dict(start=0, periods=5)
+ arr1 = interval_range(end=10, **kwargs).values
+ arr2 = interval_range(end=20, **kwargs).values
+
+ msg = """\
+IntervalArray.left are different
+
+IntervalArray.left values are different \\(80.0 %\\)
+\\[left\\]: Int64Index\\(\\[0, 2, 4, 6, 8\\], dtype='int64'\\)
+\\[right\\]: Int64Index\\(\\[0, 4, 8, 12, 16\\], dtype='int64'\\)"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_interval_array_equal(arr1, arr2)
+
+
+def test_interval_array_equal_start_mismatch():
+ kwargs = dict(periods=4)
+ arr1 = interval_range(start=0, **kwargs).values
+ arr2 = interval_range(start=1, **kwargs).values
+
+ msg = """\
+IntervalArray.left are different
+
+IntervalArray.left values are different \\(100.0 %\\)
+\\[left\\]: Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\)
+\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_interval_array_equal(arr1, arr2)
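
These tests rely on interval_range(...).values yielding the underlying IntervalArray; a short sketch of the closed-side mismatch:

    from pandas import interval_range
    from pandas.util.testing import assert_interval_array_equal

    arr_right = interval_range(start=0, periods=4).values  # default closed="right"
    arr_left = interval_range(start=0, periods=4, closed="left").values

    try:
        assert_interval_array_equal(arr_right, arr_left)
    except AssertionError as exc:
        print(exc)  # 'Attribute "closed" are different'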
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_assert_numpy_array_equal.py b/contrib/python/pandas/py2/pandas/tests/util/test_assert_numpy_array_equal.py
new file mode 100644
index 00000000000..99037fcf961
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_assert_numpy_array_equal.py
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pytest
+
+from pandas import Timestamp
+from pandas.util.testing import assert_numpy_array_equal
+
+
+def test_assert_numpy_array_equal_shape_mismatch():
+ msg = """numpy array are different
+
+numpy array shapes are different
+\\[left\\]: \\(2L*,\\)
+\\[right\\]: \\(3L*,\\)"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5]))
+
+
+def test_assert_numpy_array_equal_bad_type():
+ expected = "Expected type"
+
+ with pytest.raises(AssertionError, match=expected):
+ assert_numpy_array_equal(1, 2)
+
+
[email protected]("a,b,klass1,klass2", [
+ (np.array([1]), 1, "ndarray", "int"),
+ (1, np.array([1]), "int", "ndarray"),
+])
+def test_assert_numpy_array_equal_class_mismatch(a, b, klass1, klass2):
+ msg = """numpy array are different
+
+numpy array classes are different
+\\[left\\]: {klass1}
+\\[right\\]: {klass2}""".format(klass1=klass1, klass2=klass2)
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(a, b)
+
+
+def test_assert_numpy_array_equal_value_mismatch1():
+ msg = """numpy array are different
+
+numpy array values are different \\(66\\.66667 %\\)
+\\[left\\]: \\[nan, 2\\.0, 3\\.0\\]
+\\[right\\]: \\[1\\.0, nan, 3\\.0\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(np.array([np.nan, 2, 3]),
+ np.array([1, np.nan, 3]))
+
+
+def test_assert_numpy_array_equal_value_mismatch2():
+ msg = """numpy array are different
+
+numpy array values are different \\(50\\.0 %\\)
+\\[left\\]: \\[1, 2\\]
+\\[right\\]: \\[1, 3\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(np.array([1, 2]), np.array([1, 3]))
+
+
+def test_assert_numpy_array_equal_value_mismatch3():
+ msg = """numpy array are different
+
+numpy array values are different \\(16\\.66667 %\\)
+\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\]
+\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(np.array([[1, 2], [3, 4], [5, 6]]),
+ np.array([[1, 3], [3, 4], [5, 6]]))
+
+
+def test_assert_numpy_array_equal_value_mismatch4():
+ msg = """numpy array are different
+
+numpy array values are different \\(50\\.0 %\\)
+\\[left\\]: \\[1\\.1, 2\\.000001\\]
+\\[right\\]: \\[1\\.1, 2.0\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(np.array([1.1, 2.000001]),
+ np.array([1.1, 2.0]))
+
+
+def test_assert_numpy_array_equal_value_mismatch5():
+ msg = """numpy array are different
+
+numpy array values are different \\(16\\.66667 %\\)
+\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\]
+\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(np.array([[1, 2], [3, 4], [5, 6]]),
+ np.array([[1, 3], [3, 4], [5, 6]]))
+
+
+def test_assert_numpy_array_equal_value_mismatch6():
+ msg = """numpy array are different
+
+numpy array values are different \\(25\\.0 %\\)
+\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\]\\]
+\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(np.array([[1, 2], [3, 4]]),
+ np.array([[1, 3], [3, 4]]))
+
+
+def test_assert_numpy_array_equal_shape_mismatch_override():
+ msg = """Index are different
+
+Index shapes are different
+\\[left\\]: \\(2L*,\\)
+\\[right\\]: \\(3L*,\\)"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(np.array([1, 2]),
+ np.array([3, 4, 5]),
+ obj="Index")
+
+
+def test_numpy_array_equal_unicode():
+ # see gh-20503
+ #
+ # Test ensures that `assert_numpy_array_equals` raises the right
+ # exception when comparing np.arrays containing differing unicode objects.
+ msg = """numpy array are different
+
+numpy array values are different \\(33\\.33333 %\\)
+\\[left\\]: \\[á, à, ä\\]
+\\[right\\]: \\[á, à, å\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(np.array([u"á", u"à", u"ä"]),
+ np.array([u"á", u"à", u"å"]))
+
+
+def test_numpy_array_equal_object():
+ a = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-01")])
+ b = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")])
+
+ msg = """numpy array are different
+
+numpy array values are different \\(50\\.0 %\\)
+\\[left\\]: \\[2011-01-01 00:00:00, 2011-01-01 00:00:00\\]
+\\[right\\]: \\[2011-01-01 00:00:00, 2011-01-02 00:00:00\\]"""
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(a, b)
+
+
[email protected]("other_type", ["same", "copy"])
[email protected]("check_same", ["same", "copy"])
+def test_numpy_array_equal_copy_flag(other_type, check_same):
+ a = np.array([1, 2, 3])
+ msg = None
+
+ if other_type == "same":
+ other = a.view()
+ else:
+ other = a.copy()
+
+ if check_same != other_type:
+ msg = (r"array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)"
+ if check_same == "same"
+ else r"array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)")
+
+ if msg is not None:
+ with pytest.raises(AssertionError, match=msg):
+ assert_numpy_array_equal(a, other, check_same=check_same)
+ else:
+ assert_numpy_array_equal(a, other, check_same=check_same)
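
check_same compares the underlying buffers (via ndarray.base), not the values; a rough sketch:

    import numpy as np
    from pandas.util.testing import assert_numpy_array_equal

    a = np.array([1, 2, 3])

    assert_numpy_array_equal(a, a.view(), check_same="same")  # shared memory
    assert_numpy_array_equal(a, a.copy(), check_same="copy")  # distinct memory

    try:
        assert_numpy_array_equal(a, a.copy(), check_same="same")
    except AssertionError as exc:
        print(exc)  # array([1, 2, 3]) is not array([1, 2, 3])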
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_assert_series_equal.py b/contrib/python/pandas/py2/pandas/tests/util/test_assert_series_equal.py
new file mode 100644
index 00000000000..537a0e01ff8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_assert_series_equal.py
@@ -0,0 +1,185 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+from pandas import Categorical, DataFrame, Series
+from pandas.util.testing import assert_series_equal
+
+
+def _assert_series_equal_both(a, b, **kwargs):
+ """
+ Check that two Series are equal.
+
+ This check is performed commutatively.
+
+ Parameters
+ ----------
+ a : Series
+ The first Series to compare.
+ b : Series
+ The second Series to compare.
+ kwargs : dict
+ The arguments passed to `assert_series_equal`.
+ """
+ assert_series_equal(a, b, **kwargs)
+ assert_series_equal(b, a, **kwargs)
+
+
+def _assert_not_series_equal(a, b, **kwargs):
+ """
+ Check that two Series are not equal.
+
+ Parameters
+ ----------
+ a : Series
+ The first Series to compare.
+ b : Series
+ The second Series to compare.
+ kwargs : dict
+ The arguments passed to `assert_series_equal`.
+ """
+ try:
+ assert_series_equal(a, b, **kwargs)
+ msg = "The two Series were equal when they shouldn't have been"
+
+ pytest.fail(msg=msg)
+ except AssertionError:
+ pass
+
+
+def _assert_not_series_equal_both(a, b, **kwargs):
+ """
+ Check that two Series are not equal.
+
+ This check is performed commutatively.
+
+ Parameters
+ ----------
+ a : Series
+ The first Series to compare.
+ b : Series
+ The second Series to compare.
+ kwargs : dict
+ The arguments passed to `assert_series_equal`.
+ """
+ _assert_not_series_equal(a, b, **kwargs)
+ _assert_not_series_equal(b, a, **kwargs)
+
+
[email protected]("data", [
+ range(3), list("abc"), list(u"áàä"),
+])
+def test_series_equal(data):
+ _assert_series_equal_both(Series(data), Series(data))
+
+
[email protected]("data1,data2", [
+ (range(3), range(1, 4)),
+ (list("abc"), list("xyz")),
+ (list(u"áàä"), list(u"éèë")),
+ (list(u"áàä"), list(b"aaa")),
+ (range(3), range(4)),
+])
+def test_series_not_equal_value_mismatch(data1, data2):
+ _assert_not_series_equal_both(Series(data1), Series(data2))
+
+
[email protected]("kwargs", [
+ dict(dtype="float64"), # dtype mismatch
+ dict(index=[1, 2, 4]), # index mismatch
+ dict(name="foo"), # name mismatch
+])
+def test_series_not_equal_metadata_mismatch(kwargs):
+ data = range(3)
+ s1 = Series(data)
+
+ s2 = Series(data, **kwargs)
+ _assert_not_series_equal_both(s1, s2)
+
+
[email protected]("data1,data2", [(0.12345, 0.12346), (0.1235, 0.1236)])
[email protected]("dtype", ["float32", "float64"])
[email protected]("check_less_precise", [False, True, 0, 1, 2, 3, 10])
+def test_less_precise(data1, data2, dtype, check_less_precise):
+ s1 = Series([data1], dtype=dtype)
+ s2 = Series([data2], dtype=dtype)
+
+ kwargs = dict(check_less_precise=check_less_precise)
+
+ if ((check_less_precise is False or check_less_precise == 10) or
+ ((check_less_precise is True or check_less_precise >= 3) and
+ abs(data1 - data2) >= 0.0001)):
+ msg = "Series values are different"
+ with pytest.raises(AssertionError, match=msg):
+ assert_series_equal(s1, s2, **kwargs)
+ else:
+ _assert_series_equal_both(s1, s2, **kwargs)
+
+
[email protected]("s1,s2,msg", [
+ # Index
+ (Series(["l1", "l2"], index=[1, 2]),
+ Series(["l1", "l2"], index=[1., 2.]),
+ "Series\\.index are different"),
+
+ # MultiIndex
+ (DataFrame.from_records({"a": [1, 2], "b": [2.1, 1.5],
+ "c": ["l1", "l2"]}, index=["a", "b"]).c,
+ DataFrame.from_records({"a": [1., 2.], "b": [2.1, 1.5],
+ "c": ["l1", "l2"]}, index=["a", "b"]).c,
+ "MultiIndex level \\[0\\] are different")
+])
+def test_series_equal_index_dtype(s1, s2, msg, check_index_type):
+ kwargs = dict(check_index_type=check_index_type)
+
+ if check_index_type:
+ with pytest.raises(AssertionError, match=msg):
+ assert_series_equal(s1, s2, **kwargs)
+ else:
+ assert_series_equal(s1, s2, **kwargs)
+
+
+def test_series_equal_length_mismatch(check_less_precise):
+ msg = """Series are different
+
+Series length are different
+\\[left\\]: 3, RangeIndex\\(start=0, stop=3, step=1\\)
+\\[right\\]: 4, RangeIndex\\(start=0, stop=4, step=1\\)"""
+
+ s1 = Series([1, 2, 3])
+ s2 = Series([1, 2, 3, 4])
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_series_equal(s1, s2, check_less_precise=check_less_precise)
+
+
+def test_series_equal_values_mismatch(check_less_precise):
+ msg = """Series are different
+
+Series values are different \\(33\\.33333 %\\)
+\\[left\\]: \\[1, 2, 3\\]
+\\[right\\]: \\[1, 2, 4\\]"""
+
+ s1 = Series([1, 2, 3])
+ s2 = Series([1, 2, 4])
+
+ with pytest.raises(AssertionError, match=msg):
+ assert_series_equal(s1, s2, check_less_precise=check_less_precise)
+
+
+def test_series_equal_categorical_mismatch(check_categorical):
+ msg = """Attributes are different
+
+Attribute "dtype" are different
+\\[left\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\)
+\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \
+ordered=False\\)"""
+
+ s1 = Series(Categorical(["a", "b"]))
+ s2 = Series(Categorical(["a", "b"], categories=list("abc")))
+
+ if check_categorical:
+ with pytest.raises(AssertionError, match=msg):
+ assert_series_equal(s1, s2, check_categorical=check_categorical)
+ else:
+ _assert_series_equal_both(s1, s2, check_categorical=check_categorical)
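
For test_less_precise, the rule of thumb in this pandas generation is: False compares about five decimal places, True about three, and an int names the digit count explicitly. A sketch:

    from pandas import Series
    from pandas.util.testing import assert_series_equal

    s1 = Series([0.12345])
    s2 = Series([0.12346])

    # Passes: ~3 decimal places tolerate the 1e-5 difference.
    assert_series_equal(s1, s2, check_less_precise=True)

    try:
        assert_series_equal(s1, s2)  # default precision flags it
    except AssertionError as exc:
        print(exc)  # "Series values are different ..."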
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_deprecate.py b/contrib/python/pandas/py2/pandas/tests/util/test_deprecate.py
new file mode 100644
index 00000000000..7fa7989eff6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_deprecate.py
@@ -0,0 +1,63 @@
+from textwrap import dedent
+
+import pytest
+
+from pandas.util._decorators import deprecate
+
+import pandas.util.testing as tm
+
+
+def new_func():
+ """
+ This is the summary. The deprecate directive goes next.
+
+ This is the extended summary. The deprecate directive goes before this.
+ """
+ return 'new_func called'
+
+
+def new_func_no_docstring():
+ return 'new_func_no_docstring called'
+
+
+def new_func_wrong_docstring():
+ """Summary should be in the next line."""
+ return 'new_func_wrong_docstring called'
+
+
+def new_func_with_deprecation():
+ """
+ This is the summary. The deprecate directive goes next.
+
+ .. deprecated:: 1.0
+ Use new_func instead.
+
+ This is the extended summary. The deprecate directive goes before this.
+ """
+ pass
+
+
+def test_deprecate_ok():
+ depr_func = deprecate('depr_func', new_func, '1.0',
+ msg='Use new_func instead.')
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = depr_func()
+
+ assert result == 'new_func called'
+ assert depr_func.__doc__ == dedent(new_func_with_deprecation.__doc__)
+
+
+def test_deprecate_no_docstring():
+ depr_func = deprecate('depr_func', new_func_no_docstring, '1.0',
+ msg='Use new_func instead.')
+ with tm.assert_produces_warning(FutureWarning):
+ result = depr_func()
+ assert result == 'new_func_no_docstring called'
+
+
+def test_deprecate_wrong_docstring():
+ with pytest.raises(AssertionError, match='deprecate needs a correctly '
+ 'formatted docstring'):
+ deprecate('depr_func', new_func_wrong_docstring, '1.0',
+ msg='Use new_func instead.')
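
deprecate returns a wrapper that warns on every call and splices a ".. deprecated::" directive into the alternative's docstring, which is why it insists on the docstring format checked above. A sketch with a hypothetical replacement function:

    from pandas.util._decorators import deprecate

    def replacement():  # hypothetical new API
        """
        Summary line.

        Extended description.
        """
        return 42

    old = deprecate("old", replacement, "1.0", msg="Use replacement instead.")
    old()               # emits FutureWarning, then returns 42
    print(old.__doc__)  # summary, ".. deprecated:: 1.0", extended description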
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_deprecate_kwarg.py b/contrib/python/pandas/py2/pandas/tests/util/test_deprecate_kwarg.py
new file mode 100644
index 00000000000..7287df9db8a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_deprecate_kwarg.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+from pandas.util._decorators import deprecate_kwarg
+
+import pandas.util.testing as tm
+
+
+@deprecate_kwarg("old", "new")
+def _f1(new=False):
+ return new
+
+
+_f2_mappings = {"yes": True, "no": False}
+
+
+@deprecate_kwarg("old", "new", _f2_mappings)
+def _f2(new=False):
+ return new
+
+
+def _f3_mapping(x):
+ return x + 1
+
+
+@deprecate_kwarg("old", "new", _f3_mapping)
+def _f3(new=0):
+ return new
+
+
[email protected]("key,klass", [
+ ("old", FutureWarning),
+ ("new", None)
+])
+def test_deprecate_kwarg(key, klass):
+ x = 78
+
+ with tm.assert_produces_warning(klass):
+ assert _f1(**{key: x}) == x
+
+
[email protected]("key", list(_f2_mappings.keys()))
+def test_dict_deprecate_kwarg(key):
+ with tm.assert_produces_warning(FutureWarning):
+ assert _f2(old=key) == _f2_mappings[key]
+
+
[email protected]("key", ["bogus", 12345, -1.23])
+def test_missing_deprecate_kwarg(key):
+ with tm.assert_produces_warning(FutureWarning):
+ assert _f2(old=key) == key
+
+
[email protected]("x", [1, -1.4, 0])
+def test_callable_deprecate_kwarg(x):
+ with tm.assert_produces_warning(FutureWarning):
+ assert _f3(old=x) == _f3_mapping(x)
+
+
+def test_callable_deprecate_kwarg_fail():
+ msg = "((can only|cannot) concatenate)|(must be str)|(Can't convert)"
+
+ with pytest.raises(TypeError, match=msg):
+ _f3(old="hello")
+
+
+def test_bad_deprecate_kwarg():
+ msg = "mapping from old to new argument values must be dict or callable!"
+
+ with pytest.raises(TypeError, match=msg):
+ @deprecate_kwarg("old", "new", 0)
+ def f4(new=None):
+ return new
+
+
+@deprecate_kwarg("old", None)
+def _f4(old=True, unchanged=True):
+ return old, unchanged
+
+
[email protected]("key", ["old", "unchanged"])
+def test_deprecate_keyword(key):
+ x = 9
+
+ if key == "old":
+ klass = FutureWarning
+ expected = (x, True)
+ else:
+ klass = None
+ expected = (True, x)
+
+ with tm.assert_produces_warning(klass):
+ assert _f4(**{key: x}) == expected
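
In application code the decorator reads like this (the select/cols names are hypothetical):

    from pandas.util._decorators import deprecate_kwarg

    @deprecate_kwarg(old_arg_name="cols", new_arg_name="columns")
    def select(columns=None):  # hypothetical function
        return columns

    select(cols=["a"])     # FutureWarning; "cols" is forwarded to "columns"
    select(columns=["a"])  # no warning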
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_hashing.py b/contrib/python/pandas/py2/pandas/tests/util/test_hashing.py
new file mode 100644
index 00000000000..d36de931e26
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_hashing.py
@@ -0,0 +1,328 @@
+import datetime
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series
+from pandas.core.util.hashing import _hash_scalar, hash_tuple, hash_tuples
+from pandas.util import hash_array, hash_pandas_object
+import pandas.util.testing as tm
+
+
[email protected](params=[
+ Series([1, 2, 3] * 3, dtype="int32"),
+ Series([None, 2.5, 3.5] * 3, dtype="float32"),
+ Series(["a", "b", "c"] * 3, dtype="category"),
+ Series(["d", "e", "f"] * 3),
+ Series([True, False, True] * 3),
+ Series(pd.date_range("20130101", periods=9)),
+ Series(pd.date_range("20130101", periods=9, tz="US/Eastern")),
+ Series(pd.timedelta_range("2000", periods=9))])
+def series(request):
+ return request.param
+
+
[email protected](params=[True, False])
+def index(request):
+ return request.param
+
+
+def _check_equal(obj, **kwargs):
+ """
+ Check that hashing an object produces the same value each time.
+
+ Parameters
+ ----------
+ obj : object
+ The object to hash.
+ kwargs : kwargs
+ Keyword arguments to pass to the hashing function.
+ """
+ a = hash_pandas_object(obj, **kwargs)
+ b = hash_pandas_object(obj, **kwargs)
+ tm.assert_series_equal(a, b)
+
+
+def _check_not_equal_with_index(obj):
+ """
+ Check that the hash of an object with and without its index is not the same.
+
+ Parameters
+ ----------
+ obj : object
+ The object to hash.
+ """
+ if not isinstance(obj, Index):
+ a = hash_pandas_object(obj, index=True)
+ b = hash_pandas_object(obj, index=False)
+
+ if len(obj):
+ assert not (a == b).all()
+
+
+def test_consistency():
+ # Check that our hash doesn't change because of a mistake
+ # in the actual code; this is the ground truth.
+ result = hash_pandas_object(Index(["foo", "bar", "baz"]))
+ expected = Series(np.array([3600424527151052760, 1374399572096150070,
+ 477881037637427054], dtype="uint64"),
+ index=["foo", "bar", "baz"])
+ tm.assert_series_equal(result, expected)
+
+
+def test_hash_array(series):
+ arr = series.values
+ tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr))
+
+
[email protected]("arr2", [
+ np.array([3, 4, "All"]),
+ np.array([3, 4, "All"], dtype=object),
+])
+def test_hash_array_mixed(arr2):
+ result1 = hash_array(np.array(["3", "4", "All"]))
+ result2 = hash_array(arr2)
+
+ tm.assert_numpy_array_equal(result1, result2)
+
+
[email protected]("val", [5, "foo", pd.Timestamp("20130101")])
+def test_hash_array_errors(val):
+ msg = "must pass a ndarray-like"
+ with pytest.raises(TypeError, match=msg):
+ hash_array(val)
+
+
+def test_hash_tuples():
+ tuples = [(1, "one"), (1, "two"), (2, "one")]
+ result = hash_tuples(tuples)
+
+ expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = hash_tuples(tuples[0])
+ assert result == expected[0]
+
+
+ (1, "one"), (1, np.nan), (1.0, pd.NaT, "A"),
+ ("A", pd.Timestamp("2012-01-01"))])
+def test_hash_tuple(tup):
+ # Test equivalence between
+ # hash_tuples and hash_tuple.
+ result = hash_tuple(tup)
+ expected = hash_tuples([tup])[0]
+
+ assert result == expected
+
+
[email protected]("val", [
+ 1, 1.4, "A", b"A", u"A", pd.Timestamp("2012-01-01"),
+ pd.Timestamp("2012-01-01", tz="Europe/Brussels"),
+ datetime.datetime(2012, 1, 1),
+ pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(),
+ pd.Timedelta("1 days"), datetime.timedelta(1),
+ pd.Period("2012-01-01", freq="D"), pd.Interval(0, 1),
+ np.nan, pd.NaT, None])
+def test_hash_scalar(val):
+ result = _hash_scalar(val)
+ expected = hash_array(np.array([val], dtype=object), categorize=True)
+
+ assert result[0] == expected[0]
+
+
[email protected]("val", [5, "foo", pd.Timestamp("20130101")])
+def test_hash_tuples_err(val):
+ msg = "must be convertible to a list-of-tuples"
+ with pytest.raises(TypeError, match=msg):
+ hash_tuples(val)
+
+
+def test_multiindex_unique():
+ mi = MultiIndex.from_tuples([(118, 472), (236, 118),
+ (51, 204), (102, 51)])
+ assert mi.is_unique is True
+
+ result = hash_pandas_object(mi)
+ assert result.is_unique is True
+
+
+def test_multiindex_objects():
+ mi = MultiIndex(levels=[["b", "d", "a"], [1, 2, 3]],
+ codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
+ names=["col1", "col2"])
+ recons = mi._sort_levels_monotonic()
+
+ # These are equal.
+ assert mi.equals(recons)
+ assert Index(mi.values).equals(Index(recons.values))
+
+ # _hashed_values and hash_pandas_object(..., index=False) equivalency.
+ expected = hash_pandas_object(mi, index=False).values
+ result = mi._hashed_values
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ expected = hash_pandas_object(recons, index=False).values
+ result = recons._hashed_values
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ expected = mi._hashed_values
+ result = recons._hashed_values
+
+ # Values should match, but in different order.
+ tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))
+
+
[email protected]("obj", [
+ Series([1, 2, 3]),
+ Series([1.0, 1.5, 3.2]),
+ Series([1.0, 1.5, np.nan]),
+ Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
+ Series(["a", "b", "c"]),
+ Series(["a", np.nan, "c"]),
+ Series(["a", None, "c"]),
+ Series([True, False, True]),
+ Series(),
+ Index([1, 2, 3]),
+ Index([True, False, True]),
+ DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
+ DataFrame(),
+ tm.makeMissingDataframe(),
+ tm.makeMixedDataFrame(),
+ tm.makeTimeDataFrame(),
+ tm.makeTimeSeries(),
+ tm.makeTimedeltaIndex(),
+ tm.makePeriodIndex(),
+ Series(tm.makePeriodIndex()),
+ Series(pd.date_range("20130101", periods=3, tz="US/Eastern")),
+ MultiIndex.from_product([range(5), ["foo", "bar", "baz"],
+ pd.date_range("20130101", periods=2)]),
+ MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)])
+])
+def test_hash_pandas_object(obj, index):
+ _check_equal(obj, index=index)
+ _check_not_equal_with_index(obj)
+
+
+def test_hash_pandas_object2(series, index):
+ _check_equal(series, index=index)
+ _check_not_equal_with_index(series)
+
+
+ Series([], dtype="float64"), Series([], dtype="object"), Index([])])
+def test_hash_pandas_empty_object(obj, index):
+ # These are by-definition the same with
+ # or without the index as the data is empty.
+ _check_equal(obj, index=index)
+
+
+ Series(["a", "b", "c", "d"]),
+ Series([1000, 2000, 3000, 4000]),
+ Series(pd.date_range(0, periods=4))])
[email protected]("categorize", [True, False])
+def test_categorical_consistency(s1, categorize):
+ # see gh-15143
+ #
+ # Check that categoricals hash consistent with their values,
+ # not codes. This should work for categoricals of any dtype.
+ s2 = s1.astype("category").cat.set_categories(s1)
+ s3 = s2.cat.set_categories(list(reversed(s1)))
+
+ # These should all hash identically.
+ h1 = hash_pandas_object(s1, categorize=categorize)
+ h2 = hash_pandas_object(s2, categorize=categorize)
+ h3 = hash_pandas_object(s3, categorize=categorize)
+
+ tm.assert_series_equal(h1, h2)
+ tm.assert_series_equal(h1, h3)
+
+
+def test_categorical_with_nan_consistency():
+ c = pd.Categorical.from_codes(
+ [-1, 0, 1, 2, 3, 4],
+ categories=pd.date_range("2012-01-01", periods=5, name="B"))
+ expected = hash_array(c, categorize=False)
+
+ c = pd.Categorical.from_codes(
+ [-1, 0],
+ categories=[pd.Timestamp("2012-01-01")])
+ result = hash_array(c, categorize=False)
+
+ assert result[0] in expected
+ assert result[1] in expected
+
+
[email protected]("ignore:\\nPanel:FutureWarning")
[email protected]("obj", [pd.Timestamp("20130101"), tm.makePanel()])
+def test_pandas_errors(obj):
+ msg = "Unexpected type for hashing"
+ with pytest.raises(TypeError, match=msg):
+ hash_pandas_object(obj)
+
+
+def test_hash_keys():
+ # Using different hash keys should produce
+ # different hashes for the same data.
+ #
+ # This only matters for object dtypes.
+ obj = Series(list("abc"))
+
+ a = hash_pandas_object(obj, hash_key="9876543210123456")
+ b = hash_pandas_object(obj, hash_key="9876543210123465")
+
+ assert (a != b).all()
+
+
+def test_invalid_key():
+ # This only matters for object dtypes.
+ msg = "key should be a 16-byte string encoded"
+
+ with pytest.raises(ValueError, match=msg):
+ hash_pandas_object(Series(list("abc")), hash_key="foo")
+
+
+def test_already_encoded(index):
+ # If already encoded, then ok.
+ obj = Series(list("abc")).str.encode("utf8")
+ _check_equal(obj, index=index)
+
+
+def test_alternate_encoding(index):
+ obj = Series(list("abc"))
+ _check_equal(obj, index=index, encoding="ascii")
+
+
[email protected]("l_exp", range(8))
[email protected]("l_add", [0, 1])
+def test_same_len_hash_collisions(l_exp, l_add):
+ length = 2**(l_exp + 8) + l_add
+ s = tm.rands_array(length, 2)
+
+ result = hash_array(s, "utf8")
+ assert not result[0] == result[1]
+
+
+def test_hash_collisions():
+ # Hash collisions are bad.
+ #
+ # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
+ hashes = ["Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa
+ "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe"] # noqa
+
+ # These should be different.
+ result1 = hash_array(np.asarray(hashes[0:1], dtype=object), "utf8")
+ expected1 = np.array([14963968704024874985], dtype=np.uint64)
+ tm.assert_numpy_array_equal(result1, expected1)
+
+ result2 = hash_array(np.asarray(hashes[1:2], dtype=object), "utf8")
+ expected2 = np.array([16428432627716348016], dtype=np.uint64)
+ tm.assert_numpy_array_equal(result2, expected2)
+
+ result = hash_array(np.asarray(hashes, dtype=object), "utf8")
+ tm.assert_numpy_array_equal(result, np.concatenate([expected1,
+ expected2], axis=0))
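
The two properties these tests pin down, determinism and index sensitivity, show up directly in a minimal sketch:

    import pandas as pd
    from pandas.util import hash_pandas_object

    s = pd.Series(["a", "b", "c"])

    h_with_index = hash_pandas_object(s)               # one uint64 per row
    h_values_only = hash_pandas_object(s, index=False)

    assert (h_with_index == hash_pandas_object(s)).all()  # deterministic
    assert not (h_with_index == h_values_only).all()      # index feeds the hash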
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_locale.py b/contrib/python/pandas/py2/pandas/tests/util/test_locale.py
new file mode 100644
index 00000000000..b848b22994e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_locale.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+import codecs
+import locale
+import os
+
+import pytest
+
+from pandas.compat import is_platform_windows
+
+import pandas.core.common as com
+import pandas.util.testing as tm
+
+_all_locales = tm.get_locales() or []
+_current_locale = locale.getlocale()
+
+# Don't run any of these tests if we are on Windows or have no locales.
+pytestmark = pytest.mark.skipif(is_platform_windows() or not _all_locales,
+ reason="Need non-Windows and locales")
+
+_skip_if_only_one_locale = pytest.mark.skipif(
+ len(_all_locales) <= 1, reason="Need multiple locales for meaningful test")
+
+
+def test_can_set_locale_valid_set():
+ # Can set the default locale.
+ assert tm.can_set_locale("")
+
+
+def test_can_set_locale_invalid_set():
+ # Cannot set an invalid locale.
+ assert not tm.can_set_locale("non-existent_locale")
+
+
+def test_can_set_locale_invalid_get(monkeypatch):
+ # see gh-22129
+ #
+ # In some cases, an invalid locale can be set,
+ # but a subsequent getlocale() raises a ValueError.
+
+ def mock_get_locale():
+ raise ValueError()
+
+ with monkeypatch.context() as m:
+ m.setattr(locale, "getlocale", mock_get_locale)
+ assert not tm.can_set_locale("")
+
+
+def test_get_locales_at_least_one():
+ # see gh-9744
+ assert len(_all_locales) > 0
+
+
+@_skip_if_only_one_locale
+def test_get_locales_prefix():
+ first_locale = _all_locales[0]
+ assert len(tm.get_locales(prefix=first_locale[:2])) > 0
+
+
+@_skip_if_only_one_locale
+def test_set_locale():
+ if com._all_none(_current_locale):
+ # Not sure why, but on some Travis runs with pytest,
+ # getlocale() returned (None, None).
+ pytest.skip("Current locale is not set.")
+
+ locale_override = os.environ.get("LOCALE_OVERRIDE", None)
+
+ if locale_override is None:
+ lang, enc = "it_CH", "UTF-8"
+ elif locale_override == "C":
+ lang, enc = "en_US", "ascii"
+ else:
+ lang, enc = locale_override.split(".")
+
+ enc = codecs.lookup(enc).name
+ new_locale = lang, enc
+
+ if not tm.can_set_locale(new_locale):
+ msg = "unsupported locale setting"
+
+ with pytest.raises(locale.Error, match=msg):
+ with tm.set_locale(new_locale):
+ pass
+ else:
+ with tm.set_locale(new_locale) as normalized_locale:
+ new_lang, new_enc = normalized_locale.split(".")
+ new_enc = codecs.lookup(enc).name
+
+ normalized_locale = new_lang, new_enc
+ assert normalized_locale == new_locale
+
+ # Once we exit the "with" statement, locale should be back to what it was.
+ current_locale = locale.getlocale()
+ assert current_locale == _current_locale
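
tm.set_locale is a context manager that restores the process locale on exit, which the final assertion above verifies. A hedged sketch (the locale name is hypothetical and platform-dependent):

    import pandas.util.testing as tm

    target = "it_CH.UTF-8"  # hypothetical; availability varies by platform

    if tm.can_set_locale(target):
        with tm.set_locale(target) as normalized:
            print(normalized)  # locale in effect only inside this block
    # Afterwards the previous locale is back in force.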
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_move.py b/contrib/python/pandas/py2/pandas/tests/util/test_move.py
new file mode 100644
index 00000000000..ef98f2032e6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_move.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+import sys
+from uuid import uuid4
+
+import pytest
+
+from pandas.compat import PY3, intern
+from pandas.util._move import BadMove, move_into_mutable_buffer, stolenbuf
+
+
+def test_cannot_create_instance_of_stolen_buffer():
+ # Stolen buffers need to be created through the smart constructor
+ # "move_into_mutable_buffer," which has a bunch of checks in it.
+
+ msg = "cannot create 'pandas.util._move.stolenbuf' instances"
+ with pytest.raises(TypeError, match=msg):
+ stolenbuf()
+
+
+def test_more_than_one_ref():
+ # Test case for when we try to use "move_into_mutable_buffer"
+ # when the object being moved has other references.
+
+ b = b"testing"
+
+ with pytest.raises(BadMove, match="testing") as e:
+ def handle_success(type_, value, tb):
+ assert value.args[0] is b
+ return type(e).handle_success(e, type_, value, tb) # super
+
+ e.handle_success = handle_success
+ move_into_mutable_buffer(b)
+
+
+def test_exactly_one_ref():
+ # Test case for when the object being moved has exactly one reference.
+
+ b = b"testing"
+
+ # We need to pass an expression on the stack to ensure that there are
+ # not extra references hanging around. We cannot rewrite this test as
+ # buf = b[:-3]
+ # as_stolen_buf = move_into_mutable_buffer(buf)
+ # because then we would have more than one reference to buf.
+ as_stolen_buf = move_into_mutable_buffer(b[:-3])
+
+ # Materialize as byte-array to show that it is mutable.
+ assert bytearray(as_stolen_buf) == b"test"
+
+
[email protected](PY3, reason="bytes objects cannot be interned in PY3")
+def test_interned():
+ salt = uuid4().hex
+
+ def make_string():
+ # We need to actually create a new string so that it has refcount
+ # one. We use a uuid so that we know the string could not already
+ # be in the intern table.
+ return "".join(("testing: ", salt))
+
+ # This should work, the string has one reference on the stack.
+ move_into_mutable_buffer(make_string())
+ refcount = [None] # nonlocal
+
+ def ref_capture(ob):
+ # Subtract two because those are the references owned by this frame:
+ # 1. The local variables of this stack frame.
+ # 2. The python data stack of this stack frame.
+ refcount[0] = sys.getrefcount(ob) - 2
+ return ob
+
+ with pytest.raises(BadMove, match="testing"):
+ # If we intern the string, it will still have one reference. Now,
+ # it is in the intern table, so if other people intern the same
+ # string while the mutable buffer holds the first string they will
+ # be the same instance.
+ move_into_mutable_buffer(ref_capture(intern(make_string()))) # noqa
+
+ assert refcount[0] == 1
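
The invariant under test: a buffer can be stolen only from an object whose refcount is exactly one; otherwise BadMove is raised carrying the offending object. A rough sketch:

    from pandas.util._move import BadMove, move_into_mutable_buffer

    # A fresh slice exists only on the stack, so its buffer can be stolen.
    buf = move_into_mutable_buffer(b"immutable"[:-5])
    print(bytearray(buf))  # bytearray(b'immu') -- now mutable

    b = b"immutable"       # binding a name keeps an extra reference
    try:
        move_into_mutable_buffer(b)
    except BadMove as exc:
        print(exc.args[0] is b)  # True: the exception carries the object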
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_safe_import.py b/contrib/python/pandas/py2/pandas/tests/util/test_safe_import.py
new file mode 100644
index 00000000000..a9c52ef7883
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_safe_import.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+import sys
+import types
+
+import pytest
+
+import pandas.util._test_decorators as td
+
+
[email protected]("name", ["foo", "hello123"])
+def test_safe_import_non_existent(name):
+ assert not td.safe_import(name)
+
+
+def test_safe_import_exists():
+ assert td.safe_import("pandas")
+
+
[email protected]("min_version,valid", [
+ ("0.0.0", True),
+ ("99.99.99", False)
+])
+def test_safe_import_versions(min_version, valid):
+ result = td.safe_import("pandas", min_version=min_version)
+ result = result if valid else not result
+ assert result
+
+
[email protected]("min_version,valid", [
+ (None, False),
+ ("1.0", True),
+ ("2.0", False)
+])
+def test_safe_import_dummy(monkeypatch, min_version, valid):
+ mod_name = "hello123"
+
+ mod = types.ModuleType(mod_name)
+ mod.__version__ = "1.5"
+
+ if min_version is not None:
+ monkeypatch.setitem(sys.modules, mod_name, mod)
+
+ result = td.safe_import(mod_name, min_version=min_version)
+ result = result if valid else not result
+ assert result
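
safe_import returns the module object when it imports cleanly (optionally gated on a minimum version) and False otherwise, so it doubles as a truthy guard:

    import pandas.util._test_decorators as td

    mod = td.safe_import("numpy")                             # module, or False
    new_enough = td.safe_import("numpy", min_version="1.0.0")

    print(bool(mod), bool(new_enough))  # True True on a working install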
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_util.py b/contrib/python/pandas/py2/pandas/tests/util/test_util.py
new file mode 100644
index 00000000000..e40784fd546
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_util.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+import os
+import sys
+
+import pytest
+
+import pandas.compat as compat
+from pandas.compat import raise_with_traceback
+from pandas.util._decorators import deprecate_kwarg, make_signature
+from pandas.util._validators import validate_kwargs
+
+import pandas.util.testing as tm
+
+
+def test_rands():
+ r = tm.rands(10)
+ assert len(r) == 10
+
+
+def test_rands_array_1d():
+ arr = tm.rands_array(5, size=10)
+ assert arr.shape == (10,)
+ assert len(arr[0]) == 5
+
+
+def test_rands_array_2d():
+ arr = tm.rands_array(7, size=(10, 10))
+ assert arr.shape == (10, 10)
+ assert len(arr[1, 1]) == 7
+
+
+def test_numpy_err_state_is_default():
+ expected = {"over": "warn", "divide": "warn",
+ "invalid": "warn", "under": "ignore"}
+ import numpy as np
+
+ # The error state should be unchanged after that import.
+ assert np.geterr() == expected
+
+
[email protected]("func,expected", [
+ # Case where the func does not have default kwargs.
+ (validate_kwargs, (["fname", "kwargs", "compat_args"],
+ ["fname", "kwargs", "compat_args"])),
+
+ # Case where the func does have default kwargs.
+ (deprecate_kwarg, (["old_arg_name", "new_arg_name",
+ "mapping=None", "stacklevel=2"],
+ ["old_arg_name", "new_arg_name",
+ "mapping", "stacklevel"]))
+])
+def test_make_signature(func, expected):
+ # see gh-17608
+ assert make_signature(func) == expected
+
+
+def test_raise_with_traceback():
+ with pytest.raises(LookupError, match="error_text"):
+ try:
+ raise ValueError("THIS IS AN ERROR")
+ except ValueError:
+ e = LookupError("error_text")
+ raise_with_traceback(e)
+
+ with pytest.raises(LookupError, match="error_text"):
+ try:
+ raise ValueError("This is another error")
+ except ValueError:
+ e = LookupError("error_text")
+ _, _, traceback = sys.exc_info()
+ raise_with_traceback(e, traceback)
+
+
+def test_convert_rows_list_to_csv_str():
+ rows_list = ["aaa", "bbb", "ccc"]
+ ret = tm.convert_rows_list_to_csv_str(rows_list)
+
+ if compat.is_platform_windows():
+ expected = "aaa\r\nbbb\r\nccc\r\n"
+ else:
+ expected = "aaa\nbbb\nccc\n"
+
+ assert ret == expected
+
+
+def test_create_temp_directory():
+ with tm.ensure_clean_dir() as path:
+ assert os.path.exists(path)
+ assert os.path.isdir(path)
+ assert not os.path.exists(path)
+
+
+def test_assert_raises_regex_deprecated():
+ # see gh-23592
+
+ with tm.assert_produces_warning(FutureWarning):
+ msg = "Not equal!"
+
+ with tm.assert_raises_regex(AssertionError, msg):
+ assert 1 == 2, msg
+
+
[email protected]('strict_data_files', [True, False])
+def test_datapath_missing(datapath):
+ with pytest.raises(ValueError, match="Could not find file"):
+ datapath("not_a_file")
+
+
+def test_datapath(datapath):
+ args = ("data", "iris.csv")
+
+ result = datapath(*args)
+ expected = os.path.join(os.path.dirname(os.path.dirname(__file__)), *args)
+
+ assert result == expected
+
+
+def test_rng_context():
+ import numpy as np
+
+ expected0 = 1.764052345967664
+ expected1 = 1.6243453636632417
+
+ with tm.RNGContext(0):
+ with tm.RNGContext(1):
+ assert np.random.randn() == expected1
+ assert np.random.randn() == expected0
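
RNGContext seeds numpy's global RNG on entry and restores the prior state on exit, which is why the nested draws above come out in that order. A sketch:

    import numpy as np
    import pandas.util.testing as tm

    with tm.RNGContext(42):
        first = np.random.randn()  # deterministic first draw under seed 42

    with tm.RNGContext(42):
        assert np.random.randn() == first  # same seed, same first draw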
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_validate_args.py b/contrib/python/pandas/py2/pandas/tests/util/test_validate_args.py
new file mode 100644
index 00000000000..ca71b0c9d25
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_validate_args.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+from collections import OrderedDict
+
+import pytest
+
+from pandas.util._validators import validate_args
+
+_fname = "func"
+
+
+def test_bad_min_fname_arg_count():
+ msg = "'max_fname_arg_count' must be non-negative"
+
+ with pytest.raises(ValueError, match=msg):
+ validate_args(_fname, (None,), -1, "foo")
+
+
+def test_bad_arg_length_max_value_single():
+ args = (None, None)
+ compat_args = ("foo",)
+
+ min_fname_arg_count = 0
+ max_length = len(compat_args) + min_fname_arg_count
+ actual_length = len(args) + min_fname_arg_count
+ msg = (r"{fname}\(\) takes at most {max_length} "
+ r"argument \({actual_length} given\)"
+ .format(fname=_fname, max_length=max_length,
+ actual_length=actual_length))
+
+ with pytest.raises(TypeError, match=msg):
+ validate_args(_fname, args, min_fname_arg_count, compat_args)
+
+
+def test_bad_arg_length_max_value_multiple():
+ args = (None, None)
+ compat_args = dict(foo=None)
+
+ min_fname_arg_count = 2
+ max_length = len(compat_args) + min_fname_arg_count
+ actual_length = len(args) + min_fname_arg_count
+ msg = (r"{fname}\(\) takes at most {max_length} "
+ r"arguments \({actual_length} given\)"
+ .format(fname=_fname, max_length=max_length,
+ actual_length=actual_length))
+
+ with pytest.raises(TypeError, match=msg):
+ validate_args(_fname, args, min_fname_arg_count, compat_args)
+
+
[email protected]("i", range(1, 3))
+def test_not_all_defaults(i):
+ bad_arg = "foo"
+ msg = ("the '{arg}' parameter is not supported "
+ r"in the pandas implementation of {func}\(\)".
+ format(arg=bad_arg, func=_fname))
+
+ compat_args = OrderedDict()
+ compat_args["foo"] = 2
+ compat_args["bar"] = -1
+ compat_args["baz"] = 3
+
+ arg_vals = (1, -1, 3)
+
+ with pytest.raises(ValueError, match=msg):
+ validate_args(_fname, arg_vals[:i], 2, compat_args)
+
+
+def test_validation():
+ # No exceptions should be raised.
+ validate_args(_fname, (None,), 2, dict(out=None))
+
+ compat_args = OrderedDict()
+ compat_args["axis"] = 1
+ compat_args["out"] = None
+
+ validate_args(_fname, (1, None), 2, compat_args)
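
The contract being exercised: positional arguments are accepted only when each equals its documented default in compat_args; anything else is reported as an unsupported parameter. A sketch:

    from collections import OrderedDict

    from pandas.util._validators import validate_args

    compat_args = OrderedDict([("axis", 1), ("out", None)])

    validate_args("func", (1, None), 2, compat_args)  # OK: all defaults

    try:
        validate_args("func", (1, "buf"), 2, compat_args)  # "out" overridden
    except ValueError as exc:
        print(exc)  # the 'out' parameter is not supported ...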
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_validate_args_and_kwargs.py b/contrib/python/pandas/py2/pandas/tests/util/test_validate_args_and_kwargs.py
new file mode 100644
index 00000000000..c3c0b3dedc0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_validate_args_and_kwargs.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+from collections import OrderedDict
+
+import pytest
+
+from pandas.util._validators import validate_args_and_kwargs
+
+_fname = "func"
+
+
+def test_invalid_total_length_max_length_one():
+ compat_args = ("foo",)
+ kwargs = {"foo": "FOO"}
+ args = ("FoO", "BaZ")
+
+ min_fname_arg_count = 0
+ max_length = len(compat_args) + min_fname_arg_count
+ actual_length = len(kwargs) + len(args) + min_fname_arg_count
+
+ msg = (r"{fname}\(\) takes at most {max_length} "
+ r"argument \({actual_length} given\)"
+ .format(fname=_fname, max_length=max_length,
+ actual_length=actual_length))
+
+ with pytest.raises(TypeError, match=msg):
+ validate_args_and_kwargs(_fname, args, kwargs,
+ min_fname_arg_count,
+ compat_args)
+
+
+def test_invalid_total_length_max_length_multiple():
+ compat_args = ("foo", "bar", "baz")
+ kwargs = {"foo": "FOO", "bar": "BAR"}
+ args = ("FoO", "BaZ")
+
+ min_fname_arg_count = 2
+ max_length = len(compat_args) + min_fname_arg_count
+ actual_length = len(kwargs) + len(args) + min_fname_arg_count
+
+ msg = (r"{fname}\(\) takes at most {max_length} "
+ r"arguments \({actual_length} given\)"
+ .format(fname=_fname, max_length=max_length,
+ actual_length=actual_length))
+
+ with pytest.raises(TypeError, match=msg):
+ validate_args_and_kwargs(_fname, args, kwargs,
+ min_fname_arg_count,
+ compat_args)
+
+
[email protected]("args,kwargs", [
+ ((), {"foo": -5, "bar": 2}),
+ ((-5, 2), {})
+])
+def test_missing_args_or_kwargs(args, kwargs):
+ bad_arg = "bar"
+ min_fname_arg_count = 2
+
+ compat_args = OrderedDict()
+ compat_args["foo"] = -5
+ compat_args[bad_arg] = 1
+
+ msg = (r"the '{arg}' parameter is not supported "
+ r"in the pandas implementation of {func}\(\)".
+ format(arg=bad_arg, func=_fname))
+
+ with pytest.raises(ValueError, match=msg):
+ validate_args_and_kwargs(_fname, args, kwargs,
+ min_fname_arg_count, compat_args)
+
+
+def test_duplicate_argument():
+ min_fname_arg_count = 2
+
+ compat_args = OrderedDict()
+ compat_args["foo"] = None
+ compat_args["bar"] = None
+ compat_args["baz"] = None
+
+ kwargs = {"foo": None, "bar": None}
+ args = (None,) # duplicate value for "foo"
+
+ msg = (r"{fname}\(\) got multiple values for keyword "
+ r"argument '{arg}'".format(fname=_fname, arg="foo"))
+
+ with pytest.raises(TypeError, match=msg):
+ validate_args_and_kwargs(_fname, args, kwargs,
+ min_fname_arg_count,
+ compat_args)
+
+
+def test_validation():
+ # No exceptions should be raised.
+ compat_args = OrderedDict()
+ compat_args["foo"] = 1
+ compat_args["bar"] = None
+ compat_args["baz"] = -2
+ kwargs = {"baz": -2}
+
+ args = (1, None)
+ min_fname_arg_count = 2
+
+ validate_args_and_kwargs(_fname, args, kwargs,
+ min_fname_arg_count,
+ compat_args)
diff --git a/contrib/python/pandas/py2/pandas/tests/util/test_validate_kwargs.py b/contrib/python/pandas/py2/pandas/tests/util/test_validate_kwargs.py
new file mode 100644
index 00000000000..f36818ddfc9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/util/test_validate_kwargs.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+from collections import OrderedDict
+
+import pytest
+
+from pandas.util._validators import validate_bool_kwarg, validate_kwargs
+
+_fname = "func"
+
+
+def test_bad_kwarg():
+ good_arg = "f"
+ bad_arg = good_arg + "o"
+
+ compat_args = OrderedDict()
+ compat_args[good_arg] = "foo"
+ compat_args[bad_arg + "o"] = "bar"
+ kwargs = {good_arg: "foo", bad_arg: "bar"}
+
+ msg = (r"{fname}\(\) got an unexpected "
+ r"keyword argument '{arg}'".format(fname=_fname, arg=bad_arg))
+
+ with pytest.raises(TypeError, match=msg):
+ validate_kwargs(_fname, kwargs, compat_args)
+
+
[email protected]("i", range(1, 3))
+def test_not_all_none(i):
+ bad_arg = "foo"
+ msg = (r"the '{arg}' parameter is not supported "
+ r"in the pandas implementation of {func}\(\)".
+ format(arg=bad_arg, func=_fname))
+
+ compat_args = OrderedDict()
+ compat_args["foo"] = 1
+ compat_args["bar"] = "s"
+ compat_args["baz"] = None
+
+ kwarg_keys = ("foo", "bar", "baz")
+ kwarg_vals = (2, "s", None)
+
+ kwargs = dict(zip(kwarg_keys[:i], kwarg_vals[:i]))
+
+ with pytest.raises(ValueError, match=msg):
+ validate_kwargs(_fname, kwargs, compat_args)
+
+
+def test_validation():
+ # No exceptions should be raised.
+ compat_args = OrderedDict()
+ compat_args["f"] = None
+ compat_args["b"] = 1
+ compat_args["ba"] = "s"
+
+ kwargs = dict(f=None, b=1)
+ validate_kwargs(_fname, kwargs, compat_args)
+
+
[email protected]("name", ["inplace", "copy"])
[email protected]("value", [1, "True", [1, 2, 3], 5.0])
+def test_validate_bool_kwarg_fail(name, value):
+ msg = ("For argument \"%s\" expected type bool, received type %s" %
+ (name, type(value).__name__))
+
+ with pytest.raises(ValueError, match=msg):
+ validate_bool_kwarg(value, name)
+
+
[email protected]("name", ["inplace", "copy"])
[email protected]("value", [True, False, None])
+def test_validate_bool_kwarg(name, value):
+ assert validate_bool_kwarg(value, name) == value
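
validate_bool_kwarg is the narrow workhorse here: it passes True, False, and None through unchanged and rejects everything else by type rather than truthiness:

    from pandas.util._validators import validate_bool_kwarg

    assert validate_bool_kwarg(True, "inplace") is True
    assert validate_bool_kwarg(None, "inplace") is None  # None is allowed

    try:
        validate_bool_kwarg(1, "inplace")  # ints are rejected
    except ValueError as exc:
        print(exc)  # For argument "inplace" expected type bool ...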